From ed198cb49750fd9ec564e9f1df66c10efea605f1 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 22 Apr 2006 02:38:50 +0100 Subject: [RBTREE] Update hrtimers to use rb_parent() accessor macro. Also switch it to use the same method of using off-tree nodes as everyone else now does -- set them to point to themselves. Signed-off-by: David Woodhouse --- kernel/hrtimer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d2a7296c8251..04ab27ddfd90 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -393,7 +393,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) if (base->first == &timer->node) base->first = rb_next(&timer->node); rb_erase(&timer->node, &base->active); - timer->node.rb_parent = HRTIMER_INACTIVE; + rb_set_parent(&timer->node, &timer->node); } /* @@ -578,7 +578,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, clock_id = CLOCK_MONOTONIC; timer->base = &bases[clock_id]; - timer->node.rb_parent = HRTIMER_INACTIVE; + rb_set_parent(&timer->node, &timer->node); } /** -- cgit v1.2.3 From 6f18a022fb311f07f3b32f2c0e1b5c9477dc4439 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 8 May 2006 22:40:05 +0100 Subject: Finally remove the obnoxious inter_module_xxx() This was already a bad plan when I argued against adding it in the first place. Good riddance. Signed-off-by: David Woodhouse --- include/linux/module.h | 9 --- init/Kconfig | 3 - kernel/Makefile | 1 - kernel/intermodule.c | 184 ------------------------------------------------- 4 files changed, 197 deletions(-) delete mode 100644 kernel/intermodule.c (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index eaec13ddd667..b0d44134f3c4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -557,13 +557,4 @@ static inline void module_remove_driver(struct device_driver *driver) #define __MODULE_STRING(x) __stringify(x) -/* Use symbol_get and symbol_put instead. You'll thank me. */ -#define HAVE_INTER_MODULE -extern void __deprecated inter_module_register(const char *, - struct module *, const void *); -extern void __deprecated inter_module_unregister(const char *); -extern const void * __deprecated inter_module_get_request(const char *, - const char *); -extern void __deprecated inter_module_put(const char *); - #endif /* _LINUX_MODULE_H */ diff --git a/init/Kconfig b/init/Kconfig index 3b36a1d53656..a7697787946a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -389,9 +389,6 @@ config SLOB default !SLAB bool -config OBSOLETE_INTERMODULE - tristate - menu "Loadable module support" config MODULES diff --git a/kernel/Makefile b/kernel/Makefile index 58908f9d156a..f6ef00f4f90f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o diff --git a/kernel/intermodule.c b/kernel/intermodule.c deleted file mode 100644 index 55b1e5b85db9..000000000000 --- a/kernel/intermodule.c +++ /dev/null @@ -1,184 +0,0 @@ -/* Deprecated, do not use. Moved from module.c to here. --RR */ - -/* Written by Keith Owens Oct 2000 */ -#include -#include -#include -#include -#include - -/* inter_module functions are always available, even when the kernel is - * compiled without modules. Consumers of inter_module_xxx routines - * will always work, even when both are built into the kernel, this - * approach removes lots of #ifdefs in mainline code. - */ - -static struct list_head ime_list = LIST_HEAD_INIT(ime_list); -static DEFINE_SPINLOCK(ime_lock); -static int kmalloc_failed; - -struct inter_module_entry { - struct list_head list; - const char *im_name; - struct module *owner; - const void *userdata; -}; - -/** - * inter_module_register - register a new set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * @owner: module that is registering the data, always use THIS_MODULE - * @userdata: pointer to arbitrary userdata to be registered - * - * Description: Check that the im_name has not already been registered, - * complain if it has. For new data, add it to the inter_module_entry - * list. - */ -void inter_module_register(const char *im_name, struct module *owner, const void *userdata) -{ - struct list_head *tmp; - struct inter_module_entry *ime, *ime_new; - - if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { - /* Overloaded kernel, not fatal */ - printk(KERN_ERR - "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", - im_name); - kmalloc_failed = 1; - return; - } - ime_new->im_name = im_name; - ime_new->owner = owner; - ime_new->userdata = userdata; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - spin_unlock(&ime_lock); - kfree(ime_new); - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); - BUG(); - } - } - list_add(&(ime_new->list), &ime_list); - spin_unlock(&ime_lock); -} - -/** - * inter_module_unregister - unregister a set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: Check that the im_name has been registered, complain if - * it has not. For existing data, remove it from the - * inter_module_entry list. - */ -void inter_module_unregister(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - list_del(&(ime->list)); - spin_unlock(&ime_lock); - kfree(ime); - return; - } - } - spin_unlock(&ime_lock); - if (kmalloc_failed) { - printk(KERN_ERR - "inter_module_unregister: no entry for '%s', " - "probably caused by previous kmalloc failure\n", - im_name); - return; - } - else { - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); - BUG(); - } -} - -/** - * inter_module_get - return arbitrary userdata from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, return NULL. - * Try to increment the use count on the owning module, if that fails - * then return NULL. Otherwise return the userdata. - */ -static const void *inter_module_get(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - const void *result = NULL; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (try_module_get(ime->owner)) - result = ime->userdata; - break; - } - } - spin_unlock(&ime_lock); - return(result); -} - -/** - * inter_module_get_request - im get with automatic request_module. - * @im_name: an arbitrary string to identify the data, must be unique - * @modname: module that is expected to register im_name - * - * Description: If inter_module_get fails, do request_module then retry. - */ -const void *inter_module_get_request(const char *im_name, const char *modname) -{ - const void *result = inter_module_get(im_name); - if (!result) { - request_module("%s", modname); - result = inter_module_get(im_name); - } - return(result); -} - -/** - * inter_module_put - release use of data from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, complain, - * otherwise decrement the use count on the owning module. - */ -void inter_module_put(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (ime->owner) - module_put(ime->owner); - spin_unlock(&ime_lock); - return; - } - } - spin_unlock(&ime_lock); - printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); - BUG(); -} - -EXPORT_SYMBOL(inter_module_register); -EXPORT_SYMBOL(inter_module_unregister); -EXPORT_SYMBOL(inter_module_get_request); -EXPORT_SYMBOL(inter_module_put); - -MODULE_LICENSE("GPL"); - -- cgit v1.2.3 From 651d765d0b2c72d33430487c8b6ef64c60cd2134 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 7 Jun 2006 16:10:19 +1000 Subject: [PATCH] Add a prctl to change the endianness of a process. This new prctl is intended for changing the execution mode of the processor, on processors that support both a little-endian mode and a big-endian mode. It is intended for use by programs such as instruction set emulators (for example an x86 emulator on PowerPC), which may find it convenient to use the processor in an alternate endianness mode when executing translated instructions. Note that this does not imply the existence of a fully-fledged ABI for both endiannesses, or of compatibility code for converting system calls done in the non-native endianness mode. The program is expected to arrange for all of its system call arguments to be presented in the native endianness. Switching between big and little-endian mode will require some care in constructing the instruction sequence for the switch. Generally the instructions up to the instruction that invokes the prctl system call will have to be in the old endianness, and subsequent instructions will have to be in the new endianness. Signed-off-by: Anton Blanchard Signed-off-by: Paul Mackerras --- include/linux/prctl.h | 7 +++++++ kernel/sys.c | 13 +++++++++++++ 2 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index bf022c43a18e..52a9be41250d 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -52,4 +52,11 @@ #define PR_SET_NAME 15 /* Set process name */ #define PR_GET_NAME 16 /* Get process name */ +/* Get/set process endian */ +#define PR_GET_ENDIAN 19 +#define PR_SET_ENDIAN 20 +# define PR_ENDIAN_BIG 0 +# define PR_ENDIAN_LITTLE 1 /* True little endian mode */ +# define PR_ENDIAN_PPC_LITTLE 2 /* "PowerPC" pseudo little endian */ + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936f..12d2d753dc3b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -57,6 +57,12 @@ #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a,b) (-EINVAL) #endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -2057,6 +2063,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, return -EFAULT; return 0; } + case PR_GET_ENDIAN: + error = GET_ENDIAN(current, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(current, arg2); + break; + default: error = -EINVAL; break; -- cgit v1.2.3 From b817f6feff4a565b08f0e699a5790b4008b8f494 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Fri, 9 Jun 2006 21:53:55 +0200 Subject: kbuild: check license compatibility when building modules Modules that uses GPL symbols can no longer be build with kbuild, the build will fail during the modpost step. When a GPL-incompatible module uses a EXPORT_SYMBOL_GPL_FUTURE symbol then warn during modpost so author are actually notified. The actual license compatibility check is shared with the kernel to make sure it is in sync. Patch originally from: Andreas Gruenbacher and Ram Pai Signed-off-by: Sam Ravnborg --- include/linux/license.h | 14 ++++++++++ kernel/module.c | 11 +------- scripts/mod/modpost.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++-- scripts/mod/modpost.h | 1 + 4 files changed, 85 insertions(+), 12 deletions(-) create mode 100644 include/linux/license.h (limited to 'kernel') diff --git a/include/linux/license.h b/include/linux/license.h new file mode 100644 index 000000000000..decdbf43cb5c --- /dev/null +++ b/include/linux/license.h @@ -0,0 +1,14 @@ +#ifndef __LICENSE_H +#define __LICENSE_H + +static inline int license_is_gpl_compatible(const char *license) +{ + return (strcmp(license, "GPL") == 0 + || strcmp(license, "GPL v2") == 0 + || strcmp(license, "GPL and additional rights") == 0 + || strcmp(license, "Dual BSD/GPL") == 0 + || strcmp(license, "Dual MIT/GPL") == 0 + || strcmp(license, "Dual MPL/GPL") == 0); +} + +#endif diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b0..690381508d09 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -43,6 +43,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -1248,16 +1249,6 @@ static void layout_sections(struct module *mod, } } -static inline int license_is_gpl_compatible(const char *license) -{ - return (strcmp(license, "GPL") == 0 - || strcmp(license, "GPL v2") == 0 - || strcmp(license, "GPL and additional rights") == 0 - || strcmp(license, "Dual BSD/GPL") == 0 - || strcmp(license, "Dual MIT/GPL") == 0 - || strcmp(license, "Dual MPL/GPL") == 0); -} - static void set_license(struct module *mod, const char *license) { if (!license) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index ba2e4fc2af20..baa4d83d29a8 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -13,6 +13,7 @@ #include #include "modpost.h" +#include "../../include/linux/license.h" /* Are we using CONFIG_MODVERSIONS? */ int modversions = 0; @@ -99,6 +100,7 @@ static struct module *new_module(char *modname) /* add to list */ mod->name = p; + mod->gpl_compatible = -1; mod->next = modules; modules = mod; @@ -493,13 +495,18 @@ static char *next_string(char *string, unsigned long *secsize) return string; } -static char *get_modinfo(void *modinfo, unsigned long modinfo_len, - const char *tag) +static char *get_next_modinfo(void *modinfo, unsigned long modinfo_len, + const char *tag, char *info) { char *p; unsigned int taglen = strlen(tag); unsigned long size = modinfo_len; + if (info) { + size -= info - (char *)modinfo; + modinfo = next_string(info, &size); + } + for (p = modinfo; p; p = next_string(p, &size)) { if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') return p + taglen + 1; @@ -507,6 +514,13 @@ static char *get_modinfo(void *modinfo, unsigned long modinfo_len, return NULL; } +static char *get_modinfo(void *modinfo, unsigned long modinfo_len, + const char *tag) + +{ + return get_next_modinfo(modinfo, modinfo_len, tag, NULL); +} + /** * Test if string s ends in string sub * return 0 if match @@ -981,6 +995,7 @@ static void read_symbols(char *modname) { const char *symname; char *version; + char *license; struct module *mod; struct elf_info info = { }; Elf_Sym *sym; @@ -996,6 +1011,18 @@ static void read_symbols(char *modname) mod->skip = 1; } + license = get_modinfo(info.modinfo, info.modinfo_len, "license"); + while (license) { + if (license_is_gpl_compatible(license)) + mod->gpl_compatible = 1; + else { + mod->gpl_compatible = 0; + break; + } + license = get_next_modinfo(info.modinfo, info.modinfo_len, + "license", license); + } + for (sym = info.symtab_start; sym < info.symtab_stop; sym++) { symname = info.strtab + sym->st_name; @@ -1052,6 +1079,40 @@ void buf_write(struct buffer *buf, const char *s, int len) buf->pos += len; } +void check_license(struct module *mod) +{ + struct symbol *s, *exp; + + for (s = mod->unres; s; s = s->next) { + if (mod->gpl_compatible == 1) { + /* GPL-compatible modules may use all symbols */ + continue; + } + exp = find_symbol(s->name); + if (!exp || exp->module == mod) + continue; + const char *basename = strrchr(mod->name, '/'); + if (basename) + basename++; + switch (exp->export) { + case export_gpl: + fatal("modpost: GPL-incompatible module %s " + "uses GPL-only symbol '%s'\n", + basename ? basename : mod->name, + exp->name); + break; + case export_gpl_future: + warn("modpost: GPL-incompatible module %s " + "uses future GPL-only symbol '%s'\n", + basename ? basename : mod->name, + exp->name); + break; + case export_plain: /* ignore */ break; + case export_unknown: /* ignore */ break; + } + } +} + /** * Header for the generated file **/ @@ -1325,6 +1386,12 @@ int main(int argc, char **argv) read_symbols(argv[optind++]); } + for (mod = modules; mod; mod = mod->next) { + if (mod->skip) + continue; + check_license(mod); + } + for (mod = modules; mod; mod = mod->next) { if (mod->skip) continue; diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index f7ee3a3fde14..2b00c6062844 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -100,6 +100,7 @@ buf_write(struct buffer *buf, const char *s, int len); struct module { struct module *next; const char *name; + int gpl_compatible; struct symbol *unres; int seen; int skip; -- cgit v1.2.3 From 2d9048e201bfb67ba21f05e647b1286b8a4a5667 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 1 Jun 2006 13:10:59 -0700 Subject: [PATCH] inotify (1/5): split kernel API from userspace support The following series of patches introduces a kernel API for inotify, making it possible for kernel modules to benefit from inotify's mechanism for watching inodes. With these patches, inotify will maintain for each caller a list of watches (via an embedded struct inotify_watch), where each inotify_watch is associated with a corresponding struct inode. The caller registers an event handler and specifies for which filesystem events their event handler should be called per inotify_watch. Signed-off-by: Amy Griffis Acked-by: Robert Love Acked-by: John McCutchan Signed-off-by: Al Viro --- fs/Kconfig | 24 +- fs/Makefile | 1 + fs/inotify.c | 941 ++++++++++++------------------------------------ fs/inotify_user.c | 717 ++++++++++++++++++++++++++++++++++++ include/linux/inotify.h | 76 ++++ include/linux/sched.h | 2 +- kernel/sysctl.c | 4 +- kernel/user.c | 2 +- 8 files changed, 1046 insertions(+), 721 deletions(-) create mode 100644 fs/inotify_user.c (limited to 'kernel') diff --git a/fs/Kconfig b/fs/Kconfig index f9b5842c8d2d..74f11a23622d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -393,18 +393,30 @@ config INOTIFY bool "Inotify file change notification support" default y ---help--- - Say Y here to enable inotify support and the associated system - calls. Inotify is a file change notification system and a - replacement for dnotify. Inotify fixes numerous shortcomings in - dnotify and introduces several new features. It allows monitoring - of both files and directories via a single open fd. Other features - include multiple file events, one-shot support, and unmount + Say Y here to enable inotify support. Inotify is a file change + notification system and a replacement for dnotify. Inotify fixes + numerous shortcomings in dnotify and introduces several new features + including multiple file events, one-shot support, and unmount notification. For more information, see Documentation/filesystems/inotify.txt If unsure, say Y. +config INOTIFY_USER + bool "Inotify support for userspace" + depends on INOTIFY + default y + ---help--- + Say Y here to enable inotify support for userspace, including the + associated system calls. Inotify allows monitoring of both files and + directories via a single open fd. Events are read from the file + descriptor, which is also select()- and poll()-able. + + For more information, see Documentation/filesystems/inotify.txt + + If unsure, say Y. + config QUOTA bool "Quota support" help diff --git a/fs/Makefile b/fs/Makefile index 078d3d1191a5..d0ea6bfccf29 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -13,6 +13,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ ioprio.o pnode.o drop_caches.o splice.o sync.o obj-$(CONFIG_INOTIFY) += inotify.o +obj-$(CONFIG_INOTIFY_USER) += inotify_user.o obj-$(CONFIG_EPOLL) += eventpoll.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o diff --git a/fs/inotify.c b/fs/inotify.c index 732ec4bd5774..a1bedf3975ca 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -5,7 +5,10 @@ * John McCutchan * Robert Love * + * Kernel API added by: Amy Griffis + * * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -20,35 +23,17 @@ #include #include -#include #include #include #include #include -#include -#include -#include -#include #include #include #include #include -#include - -#include static atomic_t inotify_cookie; -static kmem_cache_t *watch_cachep __read_mostly; -static kmem_cache_t *event_cachep __read_mostly; - -static struct vfsmount *inotify_mnt __read_mostly; - -/* these are configurable via /proc/sys/fs/inotify/ */ -int inotify_max_user_instances __read_mostly; -int inotify_max_user_watches __read_mostly; -int inotify_max_queued_events __read_mostly; - /* * Lock ordering: * @@ -56,327 +41,108 @@ int inotify_max_queued_events __read_mostly; * iprune_mutex (synchronize shrink_icache_memory()) * inode_lock (protects the super_block->s_inodes list) * inode->inotify_mutex (protects inode->inotify_watches and watches->i_list) - * inotify_dev->mutex (protects inotify_device and watches->d_list) + * inotify_handle->mutex (protects inotify_handle and watches->h_list) + * + * The inode->inotify_mutex and inotify_handle->mutex and held during execution + * of a caller's event handler. Thus, the caller must not hold any locks + * taken in their event handler while calling any of the published inotify + * interfaces. */ /* - * Lifetimes of the three main data structures--inotify_device, inode, and + * Lifetimes of the three main data structures--inotify_handle, inode, and * inotify_watch--are managed by reference count. * - * inotify_device: Lifetime is from inotify_init() until release. Additional - * references can bump the count via get_inotify_dev() and drop the count via - * put_inotify_dev(). + * inotify_handle: Lifetime is from inotify_init() to inotify_destroy(). + * Additional references can bump the count via get_inotify_handle() and drop + * the count via put_inotify_handle(). * - * inotify_watch: Lifetime is from create_watch() to destory_watch(). - * Additional references can bump the count via get_inotify_watch() and drop - * the count via put_inotify_watch(). + * inotify_watch: for inotify's purposes, lifetime is from inotify_add_watch() + * to remove_watch_no_event(). Additional references can bump the count via + * get_inotify_watch() and drop the count via put_inotify_watch(). The caller + * is reponsible for the final put after receiving IN_IGNORED, or when using + * IN_ONESHOT after receiving the first event. Inotify does the final put if + * inotify_destroy() is called. * * inode: Pinned so long as the inode is associated with a watch, from - * create_watch() to put_inotify_watch(). + * inotify_add_watch() to the final put_inotify_watch(). */ /* - * struct inotify_device - represents an inotify instance + * struct inotify_handle - represents an inotify instance * * This structure is protected by the mutex 'mutex'. */ -struct inotify_device { - wait_queue_head_t wq; /* wait queue for i/o */ +struct inotify_handle { struct idr idr; /* idr mapping wd -> watch */ struct mutex mutex; /* protects this bad boy */ - struct list_head events; /* list of queued events */ struct list_head watches; /* list of watches */ atomic_t count; /* reference count */ - struct user_struct *user; /* user who opened this dev */ - unsigned int queue_size; /* size of the queue (bytes) */ - unsigned int event_count; /* number of pending events */ - unsigned int max_events; /* maximum number of events */ u32 last_wd; /* the last wd allocated */ + const struct inotify_operations *in_ops; /* inotify caller operations */ }; -/* - * struct inotify_kernel_event - An inotify event, originating from a watch and - * queued for user-space. A list of these is attached to each instance of the - * device. In read(), this list is walked and all events that can fit in the - * buffer are returned. - * - * Protected by dev->mutex of the device in which we are queued. - */ -struct inotify_kernel_event { - struct inotify_event event; /* the user-space event */ - struct list_head list; /* entry in inotify_device's list */ - char *name; /* filename, if any */ -}; - -/* - * struct inotify_watch - represents a watch request on a specific inode - * - * d_list is protected by dev->mutex of the associated watch->dev. - * i_list and mask are protected by inode->inotify_mutex of the associated inode. - * dev, inode, and wd are never written to once the watch is created. - */ -struct inotify_watch { - struct list_head d_list; /* entry in inotify_device's list */ - struct list_head i_list; /* entry in inode's list */ - atomic_t count; /* reference count */ - struct inotify_device *dev; /* associated device */ - struct inode *inode; /* associated inode */ - s32 wd; /* watch descriptor */ - u32 mask; /* event mask for this watch */ -}; - -#ifdef CONFIG_SYSCTL - -#include - -static int zero; - -ctl_table inotify_table[] = { - { - .ctl_name = INOTIFY_MAX_USER_INSTANCES, - .procname = "max_user_instances", - .data = &inotify_max_user_instances, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, - { - .ctl_name = INOTIFY_MAX_USER_WATCHES, - .procname = "max_user_watches", - .data = &inotify_max_user_watches, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero, - }, - { - .ctl_name = INOTIFY_MAX_QUEUED_EVENTS, - .procname = "max_queued_events", - .data = &inotify_max_queued_events, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &zero - }, - { .ctl_name = 0 } -}; -#endif /* CONFIG_SYSCTL */ - -static inline void get_inotify_dev(struct inotify_device *dev) +static inline void get_inotify_handle(struct inotify_handle *ih) { - atomic_inc(&dev->count); + atomic_inc(&ih->count); } -static inline void put_inotify_dev(struct inotify_device *dev) +static inline void put_inotify_handle(struct inotify_handle *ih) { - if (atomic_dec_and_test(&dev->count)) { - atomic_dec(&dev->user->inotify_devs); - free_uid(dev->user); - idr_destroy(&dev->idr); - kfree(dev); + if (atomic_dec_and_test(&ih->count)) { + idr_destroy(&ih->idr); + kfree(ih); } } -static inline void get_inotify_watch(struct inotify_watch *watch) +/** + * get_inotify_watch - grab a reference to an inotify_watch + * @watch: watch to grab + */ +void get_inotify_watch(struct inotify_watch *watch) { atomic_inc(&watch->count); } +EXPORT_SYMBOL_GPL(get_inotify_watch); -/* +/** * put_inotify_watch - decrements the ref count on a given watch. cleans up - * the watch and its references if the count reaches zero. + * watch references if the count reaches zero. inotify_watch is freed by + * inotify callers via the destroy_watch() op. + * @watch: watch to release */ -static inline void put_inotify_watch(struct inotify_watch *watch) +void put_inotify_watch(struct inotify_watch *watch) { if (atomic_dec_and_test(&watch->count)) { - put_inotify_dev(watch->dev); - iput(watch->inode); - kmem_cache_free(watch_cachep, watch); - } -} - -/* - * kernel_event - create a new kernel event with the given parameters - * - * This function can sleep. - */ -static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, - const char *name) -{ - struct inotify_kernel_event *kevent; - - kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL); - if (unlikely(!kevent)) - return NULL; - - /* we hand this out to user-space, so zero it just in case */ - memset(&kevent->event, 0, sizeof(struct inotify_event)); - - kevent->event.wd = wd; - kevent->event.mask = mask; - kevent->event.cookie = cookie; - - INIT_LIST_HEAD(&kevent->list); - - if (name) { - size_t len, rem, event_size = sizeof(struct inotify_event); - - /* - * We need to pad the filename so as to properly align an - * array of inotify_event structures. Because the structure is - * small and the common case is a small filename, we just round - * up to the next multiple of the structure's sizeof. This is - * simple and safe for all architectures. - */ - len = strlen(name) + 1; - rem = event_size - len; - if (len > event_size) { - rem = event_size - (len % event_size); - if (len % event_size == 0) - rem = 0; - } - - kevent->name = kmalloc(len + rem, GFP_KERNEL); - if (unlikely(!kevent->name)) { - kmem_cache_free(event_cachep, kevent); - return NULL; - } - memcpy(kevent->name, name, len); - if (rem) - memset(kevent->name + len, 0, rem); - kevent->event.len = len + rem; - } else { - kevent->event.len = 0; - kevent->name = NULL; - } - - return kevent; -} - -/* - * inotify_dev_get_event - return the next event in the given dev's queue - * - * Caller must hold dev->mutex. - */ -static inline struct inotify_kernel_event * -inotify_dev_get_event(struct inotify_device *dev) -{ - return list_entry(dev->events.next, struct inotify_kernel_event, list); -} - -/* - * inotify_dev_queue_event - add a new event to the given device - * - * Caller must hold dev->mutex. Can sleep (calls kernel_event()). - */ -static void inotify_dev_queue_event(struct inotify_device *dev, - struct inotify_watch *watch, u32 mask, - u32 cookie, const char *name) -{ - struct inotify_kernel_event *kevent, *last; - - /* coalescing: drop this event if it is a dupe of the previous */ - last = inotify_dev_get_event(dev); - if (last && last->event.mask == mask && last->event.wd == watch->wd && - last->event.cookie == cookie) { - const char *lastname = last->name; - - if (!name && !lastname) - return; - if (name && lastname && !strcmp(lastname, name)) - return; - } - - /* the queue overflowed and we already sent the Q_OVERFLOW event */ - if (unlikely(dev->event_count > dev->max_events)) - return; - - /* if the queue overflows, we need to notify user space */ - if (unlikely(dev->event_count == dev->max_events)) - kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); - else - kevent = kernel_event(watch->wd, mask, cookie, name); - - if (unlikely(!kevent)) - return; - - /* queue the event and wake up anyone waiting */ - dev->event_count++; - dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; - list_add_tail(&kevent->list, &dev->events); - wake_up_interruptible(&dev->wq); -} - -/* - * remove_kevent - cleans up and ultimately frees the given kevent - * - * Caller must hold dev->mutex. - */ -static void remove_kevent(struct inotify_device *dev, - struct inotify_kernel_event *kevent) -{ - list_del(&kevent->list); + struct inotify_handle *ih = watch->ih; - dev->event_count--; - dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; - - kfree(kevent->name); - kmem_cache_free(event_cachep, kevent); -} - -/* - * inotify_dev_event_dequeue - destroy an event on the given device - * - * Caller must hold dev->mutex. - */ -static void inotify_dev_event_dequeue(struct inotify_device *dev) -{ - if (!list_empty(&dev->events)) { - struct inotify_kernel_event *kevent; - kevent = inotify_dev_get_event(dev); - remove_kevent(dev, kevent); + iput(watch->inode); + ih->in_ops->destroy_watch(watch); + put_inotify_handle(ih); } } +EXPORT_SYMBOL_GPL(put_inotify_watch); /* - * inotify_dev_get_wd - returns the next WD for use by the given dev + * inotify_handle_get_wd - returns the next WD for use by the given handle * - * Callers must hold dev->mutex. This function can sleep. + * Callers must hold ih->mutex. This function can sleep. */ -static int inotify_dev_get_wd(struct inotify_device *dev, - struct inotify_watch *watch) +static int inotify_handle_get_wd(struct inotify_handle *ih, + struct inotify_watch *watch) { int ret; do { - if (unlikely(!idr_pre_get(&dev->idr, GFP_KERNEL))) + if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL))) return -ENOSPC; - ret = idr_get_new_above(&dev->idr, watch, dev->last_wd+1, &watch->wd); + ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd); } while (ret == -EAGAIN); - return ret; -} + if (likely(!ret)) + ih->last_wd = watch->wd; -/* - * find_inode - resolve a user-given path to a specific inode and return a nd - */ -static int find_inode(const char __user *dirname, struct nameidata *nd, - unsigned flags) -{ - int error; - - error = __user_walk(dirname, flags, nd); - if (error) - return error; - /* you can only watch an inode if you have read permissions on it */ - error = vfs_permission(nd, MAY_READ); - if (error) - path_release(nd); - return error; + return ret; } /* @@ -422,67 +188,18 @@ static void set_dentry_child_flags(struct inode *inode, int watched) } /* - * create_watch - creates a watch on the given device. - * - * Callers must hold dev->mutex. Calls inotify_dev_get_wd() so may sleep. - * Both 'dev' and 'inode' (by way of nameidata) need to be pinned. - */ -static struct inotify_watch *create_watch(struct inotify_device *dev, - u32 mask, struct inode *inode) -{ - struct inotify_watch *watch; - int ret; - - if (atomic_read(&dev->user->inotify_watches) >= - inotify_max_user_watches) - return ERR_PTR(-ENOSPC); - - watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); - if (unlikely(!watch)) - return ERR_PTR(-ENOMEM); - - ret = inotify_dev_get_wd(dev, watch); - if (unlikely(ret)) { - kmem_cache_free(watch_cachep, watch); - return ERR_PTR(ret); - } - - dev->last_wd = watch->wd; - watch->mask = mask; - atomic_set(&watch->count, 0); - INIT_LIST_HEAD(&watch->d_list); - INIT_LIST_HEAD(&watch->i_list); - - /* save a reference to device and bump the count to make it official */ - get_inotify_dev(dev); - watch->dev = dev; - - /* - * Save a reference to the inode and bump the ref count to make it - * official. We hold a reference to nameidata, which makes this safe. - */ - watch->inode = igrab(inode); - - /* bump our own count, corresponding to our entry in dev->watches */ - get_inotify_watch(watch); - - atomic_inc(&dev->user->inotify_watches); - - return watch; -} - -/* - * inotify_find_dev - find the watch associated with the given inode and dev + * inotify_find_handle - find the watch associated with the given inode and + * handle * * Callers must hold inode->inotify_mutex. */ -static struct inotify_watch *inode_find_dev(struct inode *inode, - struct inotify_device *dev) +static struct inotify_watch *inode_find_handle(struct inode *inode, + struct inotify_handle *ih) { struct inotify_watch *watch; list_for_each_entry(watch, &inode->inotify_watches, i_list) { - if (watch->dev == dev) + if (watch->ih == ih) return watch; } @@ -491,39 +208,34 @@ static struct inotify_watch *inode_find_dev(struct inode *inode, /* * remove_watch_no_event - remove_watch() without the IN_IGNORED event. + * + * Callers must hold both inode->inotify_mutex and ih->mutex. */ static void remove_watch_no_event(struct inotify_watch *watch, - struct inotify_device *dev) + struct inotify_handle *ih) { list_del(&watch->i_list); - list_del(&watch->d_list); + list_del(&watch->h_list); if (!inotify_inode_watched(watch->inode)) set_dentry_child_flags(watch->inode, 0); - atomic_dec(&dev->user->inotify_watches); - idr_remove(&dev->idr, watch->wd); - put_inotify_watch(watch); + idr_remove(&ih->idr, watch->wd); } /* - * remove_watch - Remove a watch from both the device and the inode. Sends - * the IN_IGNORED event to the given device signifying that the inode is no - * longer watched. - * - * Callers must hold both inode->inotify_mutex and dev->mutex. We drop a - * reference to the inode before returning. + * remove_watch - Remove a watch from both the handle and the inode. Sends + * the IN_IGNORED event signifying that the inode is no longer watched. * - * The inode is not iput() so as to remain atomic. If the inode needs to be - * iput(), the call returns one. Otherwise, it returns zero. + * Callers must hold both inode->inotify_mutex and ih->mutex. */ -static void remove_watch(struct inotify_watch *watch,struct inotify_device *dev) +static void remove_watch(struct inotify_watch *watch, struct inotify_handle *ih) { - inotify_dev_queue_event(dev, watch, IN_IGNORED, 0, NULL); - remove_watch_no_event(watch, dev); + remove_watch_no_event(watch, ih); + ih->in_ops->handle_event(watch, watch->wd, IN_IGNORED, 0, NULL); } -/* Kernel API */ +/* Kernel API for producing events */ /* * inotify_d_instantiate - instantiate dcache entry for inode @@ -576,14 +288,12 @@ void inotify_inode_queue_event(struct inode *inode, u32 mask, u32 cookie, list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) { u32 watch_mask = watch->mask; if (watch_mask & mask) { - struct inotify_device *dev = watch->dev; - get_inotify_watch(watch); - mutex_lock(&dev->mutex); - inotify_dev_queue_event(dev, watch, mask, cookie, name); + struct inotify_handle *ih= watch->ih; + mutex_lock(&ih->mutex); if (watch_mask & IN_ONESHOT) - remove_watch_no_event(watch, dev); - mutex_unlock(&dev->mutex); - put_inotify_watch(watch); + remove_watch_no_event(watch, ih); + ih->in_ops->handle_event(watch, watch->wd, mask, cookie, name); + mutex_unlock(&ih->mutex); } } mutex_unlock(&inode->inotify_mutex); @@ -694,11 +404,12 @@ void inotify_unmount_inodes(struct list_head *list) mutex_lock(&inode->inotify_mutex); watches = &inode->inotify_watches; list_for_each_entry_safe(watch, next_w, watches, i_list) { - struct inotify_device *dev = watch->dev; - mutex_lock(&dev->mutex); - inotify_dev_queue_event(dev, watch, IN_UNMOUNT,0,NULL); - remove_watch(watch, dev); - mutex_unlock(&dev->mutex); + struct inotify_handle *ih= watch->ih; + mutex_lock(&ih->mutex); + ih->in_ops->handle_event(watch, watch->wd, IN_UNMOUNT, 0, + NULL); + remove_watch(watch, ih); + mutex_unlock(&ih->mutex); } mutex_unlock(&inode->inotify_mutex); iput(inode); @@ -718,432 +429,240 @@ void inotify_inode_is_dead(struct inode *inode) mutex_lock(&inode->inotify_mutex); list_for_each_entry_safe(watch, next, &inode->inotify_watches, i_list) { - struct inotify_device *dev = watch->dev; - mutex_lock(&dev->mutex); - remove_watch(watch, dev); - mutex_unlock(&dev->mutex); + struct inotify_handle *ih = watch->ih; + mutex_lock(&ih->mutex); + remove_watch(watch, ih); + mutex_unlock(&ih->mutex); } mutex_unlock(&inode->inotify_mutex); } EXPORT_SYMBOL_GPL(inotify_inode_is_dead); -/* Device Interface */ - -static unsigned int inotify_poll(struct file *file, poll_table *wait) -{ - struct inotify_device *dev = file->private_data; - int ret = 0; - - poll_wait(file, &dev->wq, wait); - mutex_lock(&dev->mutex); - if (!list_empty(&dev->events)) - ret = POLLIN | POLLRDNORM; - mutex_unlock(&dev->mutex); - - return ret; -} +/* Kernel Consumer API */ -static ssize_t inotify_read(struct file *file, char __user *buf, - size_t count, loff_t *pos) +/** + * inotify_init - allocate and initialize an inotify instance + * @ops: caller's inotify operations + */ +struct inotify_handle *inotify_init(const struct inotify_operations *ops) { - size_t event_size = sizeof (struct inotify_event); - struct inotify_device *dev; - char __user *start; - int ret; - DEFINE_WAIT(wait); - - start = buf; - dev = file->private_data; - - while (1) { - int events; - - prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); + struct inotify_handle *ih; - mutex_lock(&dev->mutex); - events = !list_empty(&dev->events); - mutex_unlock(&dev->mutex); - if (events) { - ret = 0; - break; - } - - if (file->f_flags & O_NONBLOCK) { - ret = -EAGAIN; - break; - } - - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - schedule(); - } - - finish_wait(&dev->wq, &wait); - if (ret) - return ret; - - mutex_lock(&dev->mutex); - while (1) { - struct inotify_kernel_event *kevent; - - ret = buf - start; - if (list_empty(&dev->events)) - break; - - kevent = inotify_dev_get_event(dev); - if (event_size + kevent->event.len > count) - break; - - if (copy_to_user(buf, &kevent->event, event_size)) { - ret = -EFAULT; - break; - } - buf += event_size; - count -= event_size; - - if (kevent->name) { - if (copy_to_user(buf, kevent->name, kevent->event.len)){ - ret = -EFAULT; - break; - } - buf += kevent->event.len; - count -= kevent->event.len; - } + ih = kmalloc(sizeof(struct inotify_handle), GFP_KERNEL); + if (unlikely(!ih)) + return ERR_PTR(-ENOMEM); - remove_kevent(dev, kevent); - } - mutex_unlock(&dev->mutex); + idr_init(&ih->idr); + INIT_LIST_HEAD(&ih->watches); + mutex_init(&ih->mutex); + ih->last_wd = 0; + ih->in_ops = ops; + atomic_set(&ih->count, 0); + get_inotify_handle(ih); - return ret; + return ih; } +EXPORT_SYMBOL_GPL(inotify_init); -static int inotify_release(struct inode *ignored, struct file *file) +/** + * inotify_destroy - clean up and destroy an inotify instance + * @ih: inotify handle + */ +void inotify_destroy(struct inotify_handle *ih) { - struct inotify_device *dev = file->private_data; - /* - * Destroy all of the watches on this device. Unfortunately, not very + * Destroy all of the watches for this handle. Unfortunately, not very * pretty. We cannot do a simple iteration over the list, because we * do not know the inode until we iterate to the watch. But we need to - * hold inode->inotify_mutex before dev->mutex. The following works. + * hold inode->inotify_mutex before ih->mutex. The following works. */ while (1) { struct inotify_watch *watch; struct list_head *watches; struct inode *inode; - mutex_lock(&dev->mutex); - watches = &dev->watches; + mutex_lock(&ih->mutex); + watches = &ih->watches; if (list_empty(watches)) { - mutex_unlock(&dev->mutex); + mutex_unlock(&ih->mutex); break; } - watch = list_entry(watches->next, struct inotify_watch, d_list); + watch = list_entry(watches->next, struct inotify_watch, h_list); get_inotify_watch(watch); - mutex_unlock(&dev->mutex); + mutex_unlock(&ih->mutex); inode = watch->inode; mutex_lock(&inode->inotify_mutex); - mutex_lock(&dev->mutex); + mutex_lock(&ih->mutex); /* make sure we didn't race with another list removal */ - if (likely(idr_find(&dev->idr, watch->wd))) - remove_watch_no_event(watch, dev); + if (likely(idr_find(&ih->idr, watch->wd))) { + remove_watch_no_event(watch, ih); + put_inotify_watch(watch); + } - mutex_unlock(&dev->mutex); + mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); put_inotify_watch(watch); } - /* destroy all of the events on this device */ - mutex_lock(&dev->mutex); - while (!list_empty(&dev->events)) - inotify_dev_event_dequeue(dev); - mutex_unlock(&dev->mutex); - - /* free this device: the put matching the get in inotify_init() */ - put_inotify_dev(dev); - - return 0; + /* free this handle: the put matching the get in inotify_init() */ + put_inotify_handle(ih); } +EXPORT_SYMBOL_GPL(inotify_destroy); -/* - * inotify_ignore - remove a given wd from this inotify instance. +/** + * inotify_find_update_watch - find and update the mask of an existing watch + * @ih: inotify handle + * @inode: inode's watch to update + * @mask: mask of events to watch * - * Can sleep. + * Caller must pin given inode (via nameidata). */ -static int inotify_ignore(struct inotify_device *dev, s32 wd) +s32 inotify_find_update_watch(struct inotify_handle *ih, struct inode *inode, + u32 mask) { - struct inotify_watch *watch; - struct inode *inode; + struct inotify_watch *old; + int mask_add = 0; + int ret; - mutex_lock(&dev->mutex); - watch = idr_find(&dev->idr, wd); - if (unlikely(!watch)) { - mutex_unlock(&dev->mutex); + if (mask & IN_MASK_ADD) + mask_add = 1; + + /* don't allow invalid bits: we don't want flags set */ + mask &= IN_ALL_EVENTS | IN_ONESHOT; + if (unlikely(!mask)) return -EINVAL; - } - get_inotify_watch(watch); - inode = watch->inode; - mutex_unlock(&dev->mutex); mutex_lock(&inode->inotify_mutex); - mutex_lock(&dev->mutex); - - /* make sure that we did not race */ - if (likely(idr_find(&dev->idr, wd) == watch)) - remove_watch(watch, dev); - - mutex_unlock(&dev->mutex); - mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(watch); - - return 0; -} - -static long inotify_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - struct inotify_device *dev; - void __user *p; - int ret = -ENOTTY; + mutex_lock(&ih->mutex); - dev = file->private_data; - p = (void __user *) arg; - - switch (cmd) { - case FIONREAD: - ret = put_user(dev->queue_size, (int __user *) p); - break; - } - - return ret; -} - -static const struct file_operations inotify_fops = { - .poll = inotify_poll, - .read = inotify_read, - .release = inotify_release, - .unlocked_ioctl = inotify_ioctl, - .compat_ioctl = inotify_ioctl, -}; - -asmlinkage long sys_inotify_init(void) -{ - struct inotify_device *dev; - struct user_struct *user; - struct file *filp; - int fd, ret; - - fd = get_unused_fd(); - if (fd < 0) - return fd; - - filp = get_empty_filp(); - if (!filp) { - ret = -ENFILE; - goto out_put_fd; - } - - user = get_uid(current->user); - if (unlikely(atomic_read(&user->inotify_devs) >= - inotify_max_user_instances)) { - ret = -EMFILE; - goto out_free_uid; - } - - dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); - if (unlikely(!dev)) { - ret = -ENOMEM; - goto out_free_uid; + /* + * Handle the case of re-adding a watch on an (inode,ih) pair that we + * are already watching. We just update the mask and return its wd. + */ + old = inode_find_handle(inode, ih); + if (unlikely(!old)) { + ret = -ENOENT; + goto out; } - filp->f_op = &inotify_fops; - filp->f_vfsmnt = mntget(inotify_mnt); - filp->f_dentry = dget(inotify_mnt->mnt_root); - filp->f_mapping = filp->f_dentry->d_inode->i_mapping; - filp->f_mode = FMODE_READ; - filp->f_flags = O_RDONLY; - filp->private_data = dev; - - idr_init(&dev->idr); - INIT_LIST_HEAD(&dev->events); - INIT_LIST_HEAD(&dev->watches); - init_waitqueue_head(&dev->wq); - mutex_init(&dev->mutex); - dev->event_count = 0; - dev->queue_size = 0; - dev->max_events = inotify_max_queued_events; - dev->user = user; - dev->last_wd = 0; - atomic_set(&dev->count, 0); - - get_inotify_dev(dev); - atomic_inc(&user->inotify_devs); - fd_install(fd, filp); - - return fd; -out_free_uid: - free_uid(user); - put_filp(filp); -out_put_fd: - put_unused_fd(fd); + if (mask_add) + old->mask |= mask; + else + old->mask = mask; + ret = old->wd; +out: + mutex_unlock(&ih->mutex); + mutex_unlock(&inode->inotify_mutex); return ret; } +EXPORT_SYMBOL_GPL(inotify_find_update_watch); -asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) +/** + * inotify_add_watch - add a watch to an inotify instance + * @ih: inotify handle + * @watch: caller allocated watch structure + * @inode: inode to watch + * @mask: mask of events to watch + * + * Caller must pin given inode (via nameidata). + * Caller must ensure it only calls inotify_add_watch() once per watch. + * Calls inotify_handle_get_wd() so may sleep. + */ +s32 inotify_add_watch(struct inotify_handle *ih, struct inotify_watch *watch, + struct inode *inode, u32 mask) { - struct inotify_watch *watch, *old; - struct inode *inode; - struct inotify_device *dev; - struct nameidata nd; - struct file *filp; - int ret, fput_needed; - int mask_add = 0; - unsigned flags = 0; - - filp = fget_light(fd, &fput_needed); - if (unlikely(!filp)) - return -EBADF; + int ret = 0; - /* verify that this is indeed an inotify instance */ - if (unlikely(filp->f_op != &inotify_fops)) { - ret = -EINVAL; - goto fput_and_out; - } + /* don't allow invalid bits: we don't want flags set */ + mask &= IN_ALL_EVENTS | IN_ONESHOT; + if (unlikely(!mask)) + return -EINVAL; + watch->mask = mask; - if (!(mask & IN_DONT_FOLLOW)) - flags |= LOOKUP_FOLLOW; - if (mask & IN_ONLYDIR) - flags |= LOOKUP_DIRECTORY; + mutex_lock(&inode->inotify_mutex); + mutex_lock(&ih->mutex); - ret = find_inode(path, &nd, flags); + /* Initialize a new watch */ + ret = inotify_handle_get_wd(ih, watch); if (unlikely(ret)) - goto fput_and_out; - - /* inode held in place by reference to nd; dev by fget on fd */ - inode = nd.dentry->d_inode; - dev = filp->private_data; - - mutex_lock(&inode->inotify_mutex); - mutex_lock(&dev->mutex); + goto out; + ret = watch->wd; - if (mask & IN_MASK_ADD) - mask_add = 1; + atomic_set(&watch->count, 0); + INIT_LIST_HEAD(&watch->h_list); + INIT_LIST_HEAD(&watch->i_list); - /* don't let user-space set invalid bits: we don't want flags set */ - mask &= IN_ALL_EVENTS | IN_ONESHOT; - if (unlikely(!mask)) { - ret = -EINVAL; - goto out; - } + /* save a reference to handle and bump the count to make it official */ + get_inotify_handle(ih); + watch->ih = ih; /* - * Handle the case of re-adding a watch on an (inode,dev) pair that we - * are already watching. We just update the mask and return its wd. + * Save a reference to the inode and bump the ref count to make it + * official. We hold a reference to nameidata, which makes this safe. */ - old = inode_find_dev(inode, dev); - if (unlikely(old)) { - if (mask_add) - old->mask |= mask; - else - old->mask = mask; - ret = old->wd; - goto out; - } + watch->inode = igrab(inode); - watch = create_watch(dev, mask, inode); - if (unlikely(IS_ERR(watch))) { - ret = PTR_ERR(watch); - goto out; - } + get_inotify_watch(watch); /* initial get */ if (!inotify_inode_watched(inode)) set_dentry_child_flags(inode, 1); - /* Add the watch to the device's and the inode's list */ - list_add(&watch->d_list, &dev->watches); + /* Add the watch to the handle's and the inode's list */ + list_add(&watch->h_list, &ih->watches); list_add(&watch->i_list, &inode->inotify_watches); - ret = watch->wd; out: - mutex_unlock(&dev->mutex); + mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - path_release(&nd); -fput_and_out: - fput_light(filp, fput_needed); return ret; } +EXPORT_SYMBOL_GPL(inotify_add_watch); -asmlinkage long sys_inotify_rm_watch(int fd, u32 wd) +/** + * inotify_rm_wd - remove a watch from an inotify instance + * @ih: inotify handle + * @wd: watch descriptor to remove + * + * Can sleep. + */ +int inotify_rm_wd(struct inotify_handle *ih, u32 wd) { - struct file *filp; - struct inotify_device *dev; - int ret, fput_needed; - - filp = fget_light(fd, &fput_needed); - if (unlikely(!filp)) - return -EBADF; + struct inotify_watch *watch; + struct inode *inode; - /* verify that this is indeed an inotify instance */ - if (unlikely(filp->f_op != &inotify_fops)) { - ret = -EINVAL; - goto out; + mutex_lock(&ih->mutex); + watch = idr_find(&ih->idr, wd); + if (unlikely(!watch)) { + mutex_unlock(&ih->mutex); + return -EINVAL; } + get_inotify_watch(watch); + inode = watch->inode; + mutex_unlock(&ih->mutex); - dev = filp->private_data; - ret = inotify_ignore(dev, wd); + mutex_lock(&inode->inotify_mutex); + mutex_lock(&ih->mutex); -out: - fput_light(filp, fput_needed); - return ret; -} + /* make sure that we did not race */ + if (likely(idr_find(&ih->idr, wd) == watch)) + remove_watch(watch, ih); -static struct super_block * -inotify_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA); -} + mutex_unlock(&ih->mutex); + mutex_unlock(&inode->inotify_mutex); + put_inotify_watch(watch); -static struct file_system_type inotify_fs_type = { - .name = "inotifyfs", - .get_sb = inotify_get_sb, - .kill_sb = kill_anon_super, -}; + return 0; +} +EXPORT_SYMBOL_GPL(inotify_rm_wd); /* - * inotify_setup - Our initialization function. Note that we cannnot return - * error because we have compiled-in VFS hooks. So an (unlikely) failure here - * must result in panic(). + * inotify_setup - core initialization function */ static int __init inotify_setup(void) { - int ret; - - ret = register_filesystem(&inotify_fs_type); - if (unlikely(ret)) - panic("inotify: register_filesystem returned %d!\n", ret); - - inotify_mnt = kern_mount(&inotify_fs_type); - if (IS_ERR(inotify_mnt)) - panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); - - inotify_max_queued_events = 16384; - inotify_max_user_instances = 128; - inotify_max_user_watches = 8192; - atomic_set(&inotify_cookie, 0); - watch_cachep = kmem_cache_create("inotify_watch_cache", - sizeof(struct inotify_watch), - 0, SLAB_PANIC, NULL, NULL); - event_cachep = kmem_cache_create("inotify_event_cache", - sizeof(struct inotify_kernel_event), - 0, SLAB_PANIC, NULL, NULL); - return 0; } diff --git a/fs/inotify_user.c b/fs/inotify_user.c new file mode 100644 index 000000000000..845dc79a4e9c --- /dev/null +++ b/fs/inotify_user.c @@ -0,0 +1,717 @@ +/* + * fs/inotify_user.c - inotify support for userspace + * + * Authors: + * John McCutchan + * Robert Love + * + * Copyright (C) 2005 John McCutchan + * Copyright 2006 Hewlett-Packard Development Company, L.P. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static kmem_cache_t *watch_cachep __read_mostly; +static kmem_cache_t *event_cachep __read_mostly; + +static struct vfsmount *inotify_mnt __read_mostly; + +/* these are configurable via /proc/sys/fs/inotify/ */ +int inotify_max_user_instances __read_mostly; +int inotify_max_user_watches __read_mostly; +int inotify_max_queued_events __read_mostly; + +/* + * Lock ordering: + * + * inotify_dev->up_mutex (ensures we don't re-add the same watch) + * inode->inotify_mutex (protects inode's watch list) + * inotify_handle->mutex (protects inotify_handle's watch list) + * inotify_dev->ev_mutex (protects device's event queue) + */ + +/* + * Lifetimes of the main data structures: + * + * inotify_device: Lifetime is managed by reference count, from + * sys_inotify_init() until release. Additional references can bump the count + * via get_inotify_dev() and drop the count via put_inotify_dev(). + * + * inotify_user_watch: Lifetime is from create_watch() to the receipt of an + * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the + * first event, or to inotify_destroy(). + */ + +/* + * struct inotify_device - represents an inotify instance + * + * This structure is protected by the mutex 'mutex'. + */ +struct inotify_device { + wait_queue_head_t wq; /* wait queue for i/o */ + struct mutex ev_mutex; /* protects event queue */ + struct mutex up_mutex; /* synchronizes watch updates */ + struct list_head events; /* list of queued events */ + atomic_t count; /* reference count */ + struct user_struct *user; /* user who opened this dev */ + struct inotify_handle *ih; /* inotify handle */ + unsigned int queue_size; /* size of the queue (bytes) */ + unsigned int event_count; /* number of pending events */ + unsigned int max_events; /* maximum number of events */ +}; + +/* + * struct inotify_kernel_event - An inotify event, originating from a watch and + * queued for user-space. A list of these is attached to each instance of the + * device. In read(), this list is walked and all events that can fit in the + * buffer are returned. + * + * Protected by dev->ev_mutex of the device in which we are queued. + */ +struct inotify_kernel_event { + struct inotify_event event; /* the user-space event */ + struct list_head list; /* entry in inotify_device's list */ + char *name; /* filename, if any */ +}; + +/* + * struct inotify_user_watch - our version of an inotify_watch, we add + * a reference to the associated inotify_device. + */ +struct inotify_user_watch { + struct inotify_device *dev; /* associated device */ + struct inotify_watch wdata; /* inotify watch data */ +}; + +#ifdef CONFIG_SYSCTL + +#include + +static int zero; + +ctl_table inotify_table[] = { + { + .ctl_name = INOTIFY_MAX_USER_INSTANCES, + .procname = "max_user_instances", + .data = &inotify_max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = INOTIFY_MAX_USER_WATCHES, + .procname = "max_user_watches", + .data = &inotify_max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + }, + { + .ctl_name = INOTIFY_MAX_QUEUED_EVENTS, + .procname = "max_queued_events", + .data = &inotify_max_queued_events, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero + }, + { .ctl_name = 0 } +}; +#endif /* CONFIG_SYSCTL */ + +static inline void get_inotify_dev(struct inotify_device *dev) +{ + atomic_inc(&dev->count); +} + +static inline void put_inotify_dev(struct inotify_device *dev) +{ + if (atomic_dec_and_test(&dev->count)) { + atomic_dec(&dev->user->inotify_devs); + free_uid(dev->user); + kfree(dev); + } +} + +/* + * free_inotify_user_watch - cleans up the watch and its references + */ +static void free_inotify_user_watch(struct inotify_watch *w) +{ + struct inotify_user_watch *watch; + struct inotify_device *dev; + + watch = container_of(w, struct inotify_user_watch, wdata); + dev = watch->dev; + + atomic_dec(&dev->user->inotify_watches); + put_inotify_dev(dev); + kmem_cache_free(watch_cachep, watch); +} + +/* + * kernel_event - create a new kernel event with the given parameters + * + * This function can sleep. + */ +static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie, + const char *name) +{ + struct inotify_kernel_event *kevent; + + kevent = kmem_cache_alloc(event_cachep, GFP_KERNEL); + if (unlikely(!kevent)) + return NULL; + + /* we hand this out to user-space, so zero it just in case */ + memset(&kevent->event, 0, sizeof(struct inotify_event)); + + kevent->event.wd = wd; + kevent->event.mask = mask; + kevent->event.cookie = cookie; + + INIT_LIST_HEAD(&kevent->list); + + if (name) { + size_t len, rem, event_size = sizeof(struct inotify_event); + + /* + * We need to pad the filename so as to properly align an + * array of inotify_event structures. Because the structure is + * small and the common case is a small filename, we just round + * up to the next multiple of the structure's sizeof. This is + * simple and safe for all architectures. + */ + len = strlen(name) + 1; + rem = event_size - len; + if (len > event_size) { + rem = event_size - (len % event_size); + if (len % event_size == 0) + rem = 0; + } + + kevent->name = kmalloc(len + rem, GFP_KERNEL); + if (unlikely(!kevent->name)) { + kmem_cache_free(event_cachep, kevent); + return NULL; + } + memcpy(kevent->name, name, len); + if (rem) + memset(kevent->name + len, 0, rem); + kevent->event.len = len + rem; + } else { + kevent->event.len = 0; + kevent->name = NULL; + } + + return kevent; +} + +/* + * inotify_dev_get_event - return the next event in the given dev's queue + * + * Caller must hold dev->ev_mutex. + */ +static inline struct inotify_kernel_event * +inotify_dev_get_event(struct inotify_device *dev) +{ + return list_entry(dev->events.next, struct inotify_kernel_event, list); +} + +/* + * inotify_dev_queue_event - event handler registered with core inotify, adds + * a new event to the given device + * + * Can sleep (calls kernel_event()). + */ +static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask, + u32 cookie, const char *name) +{ + struct inotify_user_watch *watch; + struct inotify_device *dev; + struct inotify_kernel_event *kevent, *last; + + watch = container_of(w, struct inotify_user_watch, wdata); + dev = watch->dev; + + mutex_lock(&dev->ev_mutex); + + /* we can safely put the watch as we don't reference it while + * generating the event + */ + if (mask & IN_IGNORED || mask & IN_ONESHOT) + put_inotify_watch(w); /* final put */ + + /* coalescing: drop this event if it is a dupe of the previous */ + last = inotify_dev_get_event(dev); + if (last && last->event.mask == mask && last->event.wd == wd && + last->event.cookie == cookie) { + const char *lastname = last->name; + + if (!name && !lastname) + goto out; + if (name && lastname && !strcmp(lastname, name)) + goto out; + } + + /* the queue overflowed and we already sent the Q_OVERFLOW event */ + if (unlikely(dev->event_count > dev->max_events)) + goto out; + + /* if the queue overflows, we need to notify user space */ + if (unlikely(dev->event_count == dev->max_events)) + kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL); + else + kevent = kernel_event(wd, mask, cookie, name); + + if (unlikely(!kevent)) + goto out; + + /* queue the event and wake up anyone waiting */ + dev->event_count++; + dev->queue_size += sizeof(struct inotify_event) + kevent->event.len; + list_add_tail(&kevent->list, &dev->events); + wake_up_interruptible(&dev->wq); + +out: + mutex_unlock(&dev->ev_mutex); +} + +/* + * remove_kevent - cleans up and ultimately frees the given kevent + * + * Caller must hold dev->ev_mutex. + */ +static void remove_kevent(struct inotify_device *dev, + struct inotify_kernel_event *kevent) +{ + list_del(&kevent->list); + + dev->event_count--; + dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; + + kfree(kevent->name); + kmem_cache_free(event_cachep, kevent); +} + +/* + * inotify_dev_event_dequeue - destroy an event on the given device + * + * Caller must hold dev->ev_mutex. + */ +static void inotify_dev_event_dequeue(struct inotify_device *dev) +{ + if (!list_empty(&dev->events)) { + struct inotify_kernel_event *kevent; + kevent = inotify_dev_get_event(dev); + remove_kevent(dev, kevent); + } +} + +/* + * find_inode - resolve a user-given path to a specific inode and return a nd + */ +static int find_inode(const char __user *dirname, struct nameidata *nd, + unsigned flags) +{ + int error; + + error = __user_walk(dirname, flags, nd); + if (error) + return error; + /* you can only watch an inode if you have read permissions on it */ + error = vfs_permission(nd, MAY_READ); + if (error) + path_release(nd); + return error; +} + +/* + * create_watch - creates a watch on the given device. + * + * Callers must hold dev->up_mutex. + */ +static int create_watch(struct inotify_device *dev, struct inode *inode, + u32 mask) +{ + struct inotify_user_watch *watch; + int ret; + + if (atomic_read(&dev->user->inotify_watches) >= + inotify_max_user_watches) + return -ENOSPC; + + watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL); + if (unlikely(!watch)) + return -ENOMEM; + + /* save a reference to device and bump the count to make it official */ + get_inotify_dev(dev); + watch->dev = dev; + + atomic_inc(&dev->user->inotify_watches); + + ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask); + if (ret < 0) + free_inotify_user_watch(&watch->wdata); + + return ret; +} + +/* Device Interface */ + +static unsigned int inotify_poll(struct file *file, poll_table *wait) +{ + struct inotify_device *dev = file->private_data; + int ret = 0; + + poll_wait(file, &dev->wq, wait); + mutex_lock(&dev->ev_mutex); + if (!list_empty(&dev->events)) + ret = POLLIN | POLLRDNORM; + mutex_unlock(&dev->ev_mutex); + + return ret; +} + +static ssize_t inotify_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + size_t event_size = sizeof (struct inotify_event); + struct inotify_device *dev; + char __user *start; + int ret; + DEFINE_WAIT(wait); + + start = buf; + dev = file->private_data; + + while (1) { + int events; + + prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); + + mutex_lock(&dev->ev_mutex); + events = !list_empty(&dev->events); + mutex_unlock(&dev->ev_mutex); + if (events) { + ret = 0; + break; + } + + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + ret = -EINTR; + break; + } + + schedule(); + } + + finish_wait(&dev->wq, &wait); + if (ret) + return ret; + + mutex_lock(&dev->ev_mutex); + while (1) { + struct inotify_kernel_event *kevent; + + ret = buf - start; + if (list_empty(&dev->events)) + break; + + kevent = inotify_dev_get_event(dev); + if (event_size + kevent->event.len > count) + break; + + if (copy_to_user(buf, &kevent->event, event_size)) { + ret = -EFAULT; + break; + } + buf += event_size; + count -= event_size; + + if (kevent->name) { + if (copy_to_user(buf, kevent->name, kevent->event.len)){ + ret = -EFAULT; + break; + } + buf += kevent->event.len; + count -= kevent->event.len; + } + + remove_kevent(dev, kevent); + } + mutex_unlock(&dev->ev_mutex); + + return ret; +} + +static int inotify_release(struct inode *ignored, struct file *file) +{ + struct inotify_device *dev = file->private_data; + + inotify_destroy(dev->ih); + + /* destroy all of the events on this device */ + mutex_lock(&dev->ev_mutex); + while (!list_empty(&dev->events)) + inotify_dev_event_dequeue(dev); + mutex_unlock(&dev->ev_mutex); + + /* free this device: the put matching the get in inotify_init() */ + put_inotify_dev(dev); + + return 0; +} + +static long inotify_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct inotify_device *dev; + void __user *p; + int ret = -ENOTTY; + + dev = file->private_data; + p = (void __user *) arg; + + switch (cmd) { + case FIONREAD: + ret = put_user(dev->queue_size, (int __user *) p); + break; + } + + return ret; +} + +static const struct file_operations inotify_fops = { + .poll = inotify_poll, + .read = inotify_read, + .release = inotify_release, + .unlocked_ioctl = inotify_ioctl, + .compat_ioctl = inotify_ioctl, +}; + +static const struct inotify_operations inotify_user_ops = { + .handle_event = inotify_dev_queue_event, + .destroy_watch = free_inotify_user_watch, +}; + +asmlinkage long sys_inotify_init(void) +{ + struct inotify_device *dev; + struct inotify_handle *ih; + struct user_struct *user; + struct file *filp; + int fd, ret; + + fd = get_unused_fd(); + if (fd < 0) + return fd; + + filp = get_empty_filp(); + if (!filp) { + ret = -ENFILE; + goto out_put_fd; + } + + user = get_uid(current->user); + if (unlikely(atomic_read(&user->inotify_devs) >= + inotify_max_user_instances)) { + ret = -EMFILE; + goto out_free_uid; + } + + dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL); + if (unlikely(!dev)) { + ret = -ENOMEM; + goto out_free_uid; + } + + ih = inotify_init(&inotify_user_ops); + if (unlikely(IS_ERR(ih))) { + ret = PTR_ERR(ih); + goto out_free_dev; + } + dev->ih = ih; + + filp->f_op = &inotify_fops; + filp->f_vfsmnt = mntget(inotify_mnt); + filp->f_dentry = dget(inotify_mnt->mnt_root); + filp->f_mapping = filp->f_dentry->d_inode->i_mapping; + filp->f_mode = FMODE_READ; + filp->f_flags = O_RDONLY; + filp->private_data = dev; + + INIT_LIST_HEAD(&dev->events); + init_waitqueue_head(&dev->wq); + mutex_init(&dev->ev_mutex); + mutex_init(&dev->up_mutex); + dev->event_count = 0; + dev->queue_size = 0; + dev->max_events = inotify_max_queued_events; + dev->user = user; + atomic_set(&dev->count, 0); + + get_inotify_dev(dev); + atomic_inc(&user->inotify_devs); + fd_install(fd, filp); + + return fd; +out_free_dev: + kfree(dev); +out_free_uid: + free_uid(user); + put_filp(filp); +out_put_fd: + put_unused_fd(fd); + return ret; +} + +asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) +{ + struct inode *inode; + struct inotify_device *dev; + struct nameidata nd; + struct file *filp; + int ret, fput_needed; + unsigned flags = 0; + + filp = fget_light(fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + if (unlikely(filp->f_op != &inotify_fops)) { + ret = -EINVAL; + goto fput_and_out; + } + + if (!(mask & IN_DONT_FOLLOW)) + flags |= LOOKUP_FOLLOW; + if (mask & IN_ONLYDIR) + flags |= LOOKUP_DIRECTORY; + + ret = find_inode(path, &nd, flags); + if (unlikely(ret)) + goto fput_and_out; + + /* inode held in place by reference to nd; dev by fget on fd */ + inode = nd.dentry->d_inode; + dev = filp->private_data; + + mutex_lock(&dev->up_mutex); + ret = inotify_find_update_watch(dev->ih, inode, mask); + if (ret == -ENOENT) + ret = create_watch(dev, inode, mask); + mutex_unlock(&dev->up_mutex); + + path_release(&nd); +fput_and_out: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_inotify_rm_watch(int fd, u32 wd) +{ + struct file *filp; + struct inotify_device *dev; + int ret, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(!filp)) + return -EBADF; + + /* verify that this is indeed an inotify instance */ + if (unlikely(filp->f_op != &inotify_fops)) { + ret = -EINVAL; + goto out; + } + + dev = filp->private_data; + + /* we free our watch data when we get IN_IGNORED */ + ret = inotify_rm_wd(dev->ih, wd); + +out: + fput_light(filp, fput_needed); + return ret; +} + +static struct super_block * +inotify_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA); +} + +static struct file_system_type inotify_fs_type = { + .name = "inotifyfs", + .get_sb = inotify_get_sb, + .kill_sb = kill_anon_super, +}; + +/* + * inotify_user_setup - Our initialization function. Note that we cannnot return + * error because we have compiled-in VFS hooks. So an (unlikely) failure here + * must result in panic(). + */ +static int __init inotify_user_setup(void) +{ + int ret; + + ret = register_filesystem(&inotify_fs_type); + if (unlikely(ret)) + panic("inotify: register_filesystem returned %d!\n", ret); + + inotify_mnt = kern_mount(&inotify_fs_type); + if (IS_ERR(inotify_mnt)) + panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt)); + + inotify_max_queued_events = 16384; + inotify_max_user_instances = 128; + inotify_max_user_watches = 8192; + + watch_cachep = kmem_cache_create("inotify_watch_cache", + sizeof(struct inotify_user_watch), + 0, SLAB_PANIC, NULL, NULL); + event_cachep = kmem_cache_create("inotify_event_cache", + sizeof(struct inotify_kernel_event), + 0, SLAB_PANIC, NULL, NULL); + + return 0; +} + +module_init(inotify_user_setup); diff --git a/include/linux/inotify.h b/include/linux/inotify.h index 71aa1553ef38..68b6e0127de4 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -68,8 +68,37 @@ struct inotify_event { #include #include +/* + * struct inotify_watch - represents a watch request on a specific inode + * + * h_list is protected by ih->mutex of the associated inotify_handle. + * i_list, mask are protected by inode->inotify_mutex of the associated inode. + * ih, inode, and wd are never written to once the watch is created. + * + * Callers must use the established inotify interfaces to access inotify_watch + * contents. The content of this structure is private to the inotify + * implementation. + */ +struct inotify_watch { + struct list_head h_list; /* entry in inotify_handle's list */ + struct list_head i_list; /* entry in inode's list */ + atomic_t count; /* reference count */ + struct inotify_handle *ih; /* associated inotify handle */ + struct inode *inode; /* associated inode */ + __s32 wd; /* watch descriptor */ + __u32 mask; /* event mask for this watch */ +}; + +struct inotify_operations { + void (*handle_event)(struct inotify_watch *, u32, u32, u32, + const char *); + void (*destroy_watch)(struct inotify_watch *); +}; + #ifdef CONFIG_INOTIFY +/* Kernel API for producing events */ + extern void inotify_d_instantiate(struct dentry *, struct inode *); extern void inotify_d_move(struct dentry *); extern void inotify_inode_queue_event(struct inode *, __u32, __u32, @@ -80,6 +109,18 @@ extern void inotify_unmount_inodes(struct list_head *); extern void inotify_inode_is_dead(struct inode *); extern u32 inotify_get_cookie(void); +/* Kernel Consumer API */ + +extern struct inotify_handle *inotify_init(const struct inotify_operations *); +extern void inotify_destroy(struct inotify_handle *); +extern __s32 inotify_find_update_watch(struct inotify_handle *, struct inode *, + u32); +extern __s32 inotify_add_watch(struct inotify_handle *, struct inotify_watch *, + struct inode *, __u32); +extern int inotify_rm_wd(struct inotify_handle *, __u32); +extern void get_inotify_watch(struct inotify_watch *); +extern void put_inotify_watch(struct inotify_watch *); + #else static inline void inotify_d_instantiate(struct dentry *dentry, @@ -116,6 +157,41 @@ static inline u32 inotify_get_cookie(void) return 0; } +static inline struct inotify_handle *inotify_init(const struct inotify_operations *ops) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void inotify_destroy(struct inotify_handle *ih) +{ +} + +static inline __s32 inotify_find_update_watch(struct inotify_handle *ih, + struct inode *inode, u32 mask) +{ + return -EOPNOTSUPP; +} + +static inline __s32 inotify_add_watch(struct inotify_handle *ih, + struct inotify_watch *watch, + struct inode *inode, __u32 mask) +{ + return -EOPNOTSUPP; +} + +static inline int inotify_rm_wd(struct inotify_handle *ih, __u32 wd) +{ + return -EOPNOTSUPP; +} + +static inline void get_inotify_watch(struct inotify_watch *watch) +{ +} + +static inline void put_inotify_watch(struct inotify_watch *watch) +{ +} + #endif /* CONFIG_INOTIFY */ #endif /* __KERNEL __ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 29b7d4f87d20..864e5a70ff65 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -494,7 +494,7 @@ struct user_struct { atomic_t processes; /* How many processes does this user have? */ atomic_t files; /* How many open files does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER atomic_t inotify_watches; /* How many inotify watches does this user have? */ atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e82726faeeff..0d656e61621d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -150,7 +150,7 @@ extern ctl_table random_table[]; #ifdef CONFIG_UNIX98_PTYS extern ctl_table pty_table[]; #endif -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER extern ctl_table inotify_table[]; #endif @@ -1028,7 +1028,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, .procname = "inotify", diff --git a/kernel/user.c b/kernel/user.c index 2116642f42c6..4b1eb745afa1 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid) atomic_set(&new->processes, 0); atomic_set(&new->files, 0); atomic_set(&new->sigpending, 0); -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER atomic_set(&new->inotify_watches, 0); atomic_set(&new->inotify_devs, 0); #endif -- cgit v1.2.3 From 9044e6bca5a4a575d3c068dfccb5651a2d6a13bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 22 May 2006 01:09:24 -0400 Subject: [PATCH] fix deadlocks in AUDIT_LIST/AUDIT_LIST_RULES We should not send a pile of replies while holding audit_netlink_mutex since we hold the same mutex when we receive commands. As the result, we can get blocked while sending and sit there holding the mutex while auditctl is unable to send the next command and get around to receiving what we'd sent. Solution: create skb and put them into a queue instead of sending; once we are done, send what we've got on the list. The former can be done synchronously while we are handling AUDIT_LIST or AUDIT_LIST_RULES; we are holding audit_netlink_mutex at that point. The latter is done asynchronously and without messing with audit_netlink_mutex. Signed-off-by: Al Viro --- kernel/audit.c | 62 ++++++++++++++++++++++++++++++++++++++-------------- kernel/audit.h | 11 ++++++++++ kernel/auditfilter.c | 60 +++++++++++++++++++++----------------------------- 3 files changed, 81 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index df57b493e1cb..bf74bf02aa4b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -366,6 +366,50 @@ static int kauditd_thread(void *dummy) return 0; } +int audit_send_list(void *_dest) +{ + struct audit_netlink_list *dest = _dest; + int pid = dest->pid; + struct sk_buff *skb; + + /* wait for parent to finish and send an ACK */ + mutex_lock(&audit_netlink_mutex); + mutex_unlock(&audit_netlink_mutex); + + while ((skb = __skb_dequeue(&dest->q)) != NULL) + netlink_unicast(audit_sock, skb, pid, 0); + + kfree(dest); + + return 0; +} + +struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, + int multi, void *payload, int size) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len = NLMSG_SPACE(size); + void *data; + int flags = multi ? NLM_F_MULTI : 0; + int t = done ? NLMSG_DONE : type; + + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + return NULL; + + nlh = NLMSG_PUT(skb, pid, seq, t, size); + nlh->nlmsg_flags = flags; + data = NLMSG_DATA(nlh); + memcpy(data, payload, size); + return skb; + +nlmsg_failure: /* Used by NLMSG_PUT */ + if (skb) + kfree_skb(skb); + return NULL; +} + /** * audit_send_reply - send an audit reply message via netlink * @pid: process id to send reply to @@ -383,29 +427,13 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { struct sk_buff *skb; - struct nlmsghdr *nlh; - int len = NLMSG_SPACE(size); - void *data; - int flags = multi ? NLM_F_MULTI : 0; - int t = done ? NLMSG_DONE : type; - - skb = alloc_skb(len, GFP_KERNEL); + skb = audit_make_reply(pid, seq, type, done, multi, payload, size); if (!skb) return; - - nlh = NLMSG_PUT(skb, pid, seq, t, size); - nlh->nlmsg_flags = flags; - data = NLMSG_DATA(nlh); - memcpy(data, payload, size); - /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_sock, skb, pid, 0); return; - -nlmsg_failure: /* Used by NLMSG_PUT */ - if (skb) - kfree_skb(skb); } /* diff --git a/kernel/audit.h b/kernel/audit.h index 6f733920fd32..8948fc1e9e54 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -22,6 +22,7 @@ #include #include #include +#include /* 0 = no checking 1 = put_count checking @@ -82,6 +83,9 @@ struct audit_entry { extern int audit_pid; extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern struct sk_buff * audit_make_reply(int pid, int seq, int type, + int done, int multi, + void *payload, int size); extern void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); @@ -89,4 +93,11 @@ extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); extern struct mutex audit_netlink_mutex; +struct audit_netlink_list { + int pid; + struct sk_buff_head q; +}; + +int audit_send_list(void *); + extern int selinux_audit_rule_update(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7c134906d689..ccfea6d82cc3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -510,19 +510,12 @@ static inline int audit_del_rule(struct audit_entry *entry, /* List rules using struct audit_rule. Exists for backward * compatibility with userspace. */ -static int audit_list(void *_dest) +static void audit_list(int pid, int seq, struct sk_buff_head *q) { - int pid, seq; - int *dest = _dest; + struct sk_buff *skb; struct audit_entry *entry; int i; - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - mutex_lock(&audit_netlink_mutex); - /* The *_rcu iterators not needed here because we are always called with audit_netlink_mutex held. */ for (i=0; irule); if (unlikely(!rule)) break; - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, rule, sizeof(*rule)); + if (skb) + skb_queue_tail(q, skb); kfree(rule); } } - audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - - mutex_unlock(&audit_netlink_mutex); - return 0; + skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); } /* List rules using struct audit_rule_data. */ -static int audit_list_rules(void *_dest) +static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { - int pid, seq; - int *dest = _dest; + struct sk_buff *skb; struct audit_entry *e; int i; - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - mutex_lock(&audit_netlink_mutex); - /* The *_rcu iterators not needed here because we are always called with audit_netlink_mutex held. */ for (i=0; irule); if (unlikely(!data)) break; - audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, data, sizeof(*data)); + if (skb) + skb_queue_tail(q, skb); kfree(data); } } - audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); - - mutex_unlock(&audit_netlink_mutex); - return 0; + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); } /** @@ -592,7 +580,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, size_t datasz, uid_t loginuid, u32 sid) { struct task_struct *tsk; - int *dest; + struct audit_netlink_list *dest; int err = 0; struct audit_entry *entry; @@ -605,18 +593,20 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(2 * sizeof(int), GFP_KERNEL); + dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest[0] = pid; - dest[1] = seq; + dest->pid = pid; + skb_queue_head_init(&dest->q); if (type == AUDIT_LIST) - tsk = kthread_run(audit_list, dest, "audit_list"); + audit_list(pid, seq, &dest->q); else - tsk = kthread_run(audit_list_rules, dest, - "audit_list_rules"); + audit_list_rules(pid, seq, &dest->q); + + tsk = kthread_run(audit_send_list, dest, "audit_send_list"); if (IS_ERR(tsk)) { + skb_queue_purge(&dest->q); kfree(dest); err = PTR_ERR(tsk); } -- cgit v1.2.3 From 473ae30bc7b1dda5c5791c773f95e9424ddfead9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Apr 2006 14:04:08 -0400 Subject: [PATCH] execve argument logging Signed-off-by: Al Viro --- fs/exec.c | 6 ++++++ include/linux/audit.h | 6 +++++- kernel/audit.c | 8 +++++--- kernel/auditsc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3a79d97ac234..d07858c0b7c4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1085,6 +1086,11 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) /* kernel module loader fixup */ /* so we don't try to load run modprobe in kernel space. */ set_fs(USER_DS); + + retval = audit_bprm(bprm); + if (retval) + return retval; + retval = -ENOENT; for (try=0; try<2; try++) { read_lock(&binfmt_lock); diff --git a/include/linux/audit.h b/include/linux/audit.h index e65399bf2710..1a221b65f7b7 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -83,6 +83,7 @@ #define AUDIT_CONFIG_CHANGE 1305 /* Audit system configuration change */ #define AUDIT_SOCKADDR 1306 /* sockaddr copied as syscall arg */ #define AUDIT_CWD 1307 /* Current working directory */ +#define AUDIT_EXECVE 1309 /* execve arguments */ #define AUDIT_IPC_SET_PERM 1311 /* IPC new permissions record type */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ @@ -283,6 +284,7 @@ struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; +struct linux_binprm; #define AUDITSC_INVALID 0 #define AUDITSC_SUCCESS 1 @@ -322,6 +324,7 @@ extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); extern uid_t audit_get_loginuid(struct audit_context *ctx); extern int audit_ipc_obj(struct kern_ipc_perm *ipcp); extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp); +extern int audit_bprm(struct linux_binprm *bprm); extern int audit_socketcall(int nargs, unsigned long *args); extern int audit_sockaddr(int len, void *addr); extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt); @@ -342,6 +345,7 @@ extern int audit_set_macxattr(const char *name); #define audit_get_loginuid(c) ({ -1; }) #define audit_ipc_obj(i) ({ 0; }) #define audit_ipc_set_perm(q,u,g,m,i) ({ 0; }) +#define audit_bprm(p) ({ 0; }) #define audit_socketcall(n,a) ({ 0; }) #define audit_sockaddr(len, addr) ({ 0; }) #define audit_avc_path(dentry, mnt) ({ 0; }) @@ -364,7 +368,7 @@ extern void audit_log_end(struct audit_buffer *ab); extern void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len); -extern void audit_log_untrustedstring(struct audit_buffer *ab, +extern const char * audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, diff --git a/kernel/audit.c b/kernel/audit.c index bf74bf02aa4b..d09f131b111a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1026,18 +1026,20 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). */ -void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { const unsigned char *p = string; + size_t len = strlen(string); while (*p) { if (*p == '"' || *p < 0x21 || *p > 0x7f) { - audit_log_hex(ab, string, strlen(string)); - return; + audit_log_hex(ab, string, len); + return string + len + 1; } p++; } audit_log_format(ab, "\"%s\"", string); + return p + 1; } /* This is a helper-function to print the escaped d_path */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c03a4ed1b27..114f921979ec 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "audit.h" @@ -110,6 +111,13 @@ struct audit_aux_data_ipcctl { u32 osid; }; +struct audit_aux_data_execve { + struct audit_aux_data d; + int argc; + int envc; + char mem[0]; +}; + struct audit_aux_data_socketcall { struct audit_aux_data d; int nargs; @@ -667,6 +675,16 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts kfree(ctx); } break; } + case AUDIT_EXECVE: { + struct audit_aux_data_execve *axi = (void *)aux; + int i; + const char *p; + for (i = 0, p = axi->mem; i < axi->argc; i++) { + audit_log_format(ab, "a%d=", i); + p = audit_log_untrustedstring(ab, p); + audit_log_format(ab, "\n"); + } + break; } case AUDIT_SOCKETCALL: { int i; @@ -1231,6 +1249,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, return 0; } +int audit_bprm(struct linux_binprm *bprm) +{ + struct audit_aux_data_execve *ax; + struct audit_context *context = current->audit_context; + unsigned long p, next; + void *to; + + if (likely(!audit_enabled || !context)) + return 0; + + ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, + GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->argc = bprm->argc; + ax->envc = bprm->envc; + for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { + struct page *page = bprm->page[p / PAGE_SIZE]; + void *kaddr = kmap(page); + next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); + memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); + to += next - p; + kunmap(page); + } + + ax->d.type = AUDIT_EXECVE; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + + /** * audit_socketcall - record audit data for sys_socketcall * @nargs: number of args -- cgit v1.2.3 From e1396065e0489f98b35021b97907ab4edbfb24e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 25 May 2006 10:19:47 -0400 Subject: [PATCH] collect sid of those who send signals to auditd Signed-off-by: Al Viro --- include/linux/audit.h | 3 +-- kernel/audit.c | 31 ++++++++++++++++++++----------- kernel/audit.h | 11 +++++++++++ kernel/auditsc.c | 23 ++++++++++++----------- kernel/signal.c | 2 +- 5 files changed, 45 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 1a221b65f7b7..1057e90bd3e3 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -278,6 +278,7 @@ struct audit_rule { /* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */ struct audit_sig_info { uid_t uid; pid_t pid; + char ctx[0]; }; struct audit_buffer; @@ -328,7 +329,6 @@ extern int audit_bprm(struct linux_binprm *bprm); extern int audit_socketcall(int nargs, unsigned long *args); extern int audit_sockaddr(int len, void *addr); extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt); -extern void audit_signal_info(int sig, struct task_struct *t); extern int audit_set_macxattr(const char *name); #else #define audit_alloc(t) ({ 0; }) @@ -349,7 +349,6 @@ extern int audit_set_macxattr(const char *name); #define audit_socketcall(n,a) ({ 0; }) #define audit_sockaddr(len, addr) ({ 0; }) #define audit_avc_path(dentry, mnt) ({ 0; }) -#define audit_signal_info(s,t) do { ; } while (0) #define audit_set_macxattr(n) do { ; } while (0) #endif diff --git a/kernel/audit.c b/kernel/audit.c index d09f131b111a..bb20922d08cc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -89,6 +89,7 @@ static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ uid_t audit_sig_uid = -1; pid_t audit_sig_pid = -1; +u32 audit_sig_sid = 0; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -479,7 +480,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; uid_t loginuid; /* loginuid of sender */ - struct audit_sig_info sig_data; + struct audit_sig_info *sig_data; + char *ctx; + u32 len; err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) @@ -531,12 +534,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; if (sid) { - char *ctx = NULL; - u32 len; - int rc; - if ((rc = selinux_ctxid_to_string( + if ((err = selinux_ctxid_to_string( sid, &ctx, &len))) - return rc; + return err; else audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, @@ -572,8 +572,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) "user pid=%d uid=%u auid=%u", pid, uid, loginuid); if (sid) { - char *ctx = NULL; - u32 len; if (selinux_ctxid_to_string( sid, &ctx, &len)) { audit_log_format(ab, @@ -612,10 +610,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_SIGNAL_INFO: - sig_data.uid = audit_sig_uid; - sig_data.pid = audit_sig_pid; + err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); + if (err) + return err; + sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); + if (!sig_data) { + kfree(ctx); + return -ENOMEM; + } + sig_data->uid = audit_sig_uid; + sig_data->pid = audit_sig_pid; + memcpy(sig_data->ctx, ctx, len); + kfree(ctx); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, - 0, 0, &sig_data, sizeof(sig_data)); + 0, 0, sig_data, sizeof(*sig_data) + len); + kfree(sig_data); break; default: err = -EINVAL; diff --git a/kernel/audit.h b/kernel/audit.h index 8948fc1e9e54..52cb1e31d522 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -101,3 +101,14 @@ struct audit_netlink_list { int audit_send_list(void *); extern int selinux_audit_rule_update(void); + +#ifdef CONFIG_AUDITSYSCALL +extern void __audit_signal_info(int sig, struct task_struct *t); +static inline void audit_signal_info(int sig, struct task_struct *t) +{ + if (unlikely(audit_pid && t->tgid == audit_pid)) + __audit_signal_info(sig, t); +} +#else +#define audit_signal_info(s,t) +#endif diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 114f921979ec..4ca913daa7da 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1376,19 +1376,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ -void audit_signal_info(int sig, struct task_struct *t) +void __audit_signal_info(int sig, struct task_struct *t) { extern pid_t audit_sig_pid; extern uid_t audit_sig_uid; - - if (unlikely(audit_pid && t->tgid == audit_pid)) { - if (sig == SIGTERM || sig == SIGHUP) { - struct audit_context *ctx = current->audit_context; - audit_sig_pid = current->pid; - if (ctx) - audit_sig_uid = ctx->loginuid; - else - audit_sig_uid = current->uid; - } + extern u32 audit_sig_sid; + + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + struct task_struct *tsk = current; + struct audit_context *ctx = tsk->audit_context; + audit_sig_pid = tsk->pid; + if (ctx) + audit_sig_uid = ctx->loginuid; + else + audit_sig_uid = tsk->uid; + selinux_get_task_sid(tsk, &audit_sig_sid); } } diff --git a/kernel/signal.c b/kernel/signal.c index e5f8aea78ffe..1b3c921737e2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -23,12 +23,12 @@ #include #include #include -#include #include #include #include #include #include +#include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. -- cgit v1.2.3 From f46038ff7d23ae092d61b366332c05aab8227b48 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 6 May 2006 08:22:52 -0400 Subject: [PATCH] log ppid Signed-off-by: Al Viro --- kernel/auditsc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4ca913daa7da..4fc3867fa25a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -60,6 +60,7 @@ #include #include #include +#include #include "audit.h" @@ -156,7 +157,7 @@ struct audit_context { struct audit_aux_data *aux; /* Save things to print about task_struct */ - pid_t pid; + pid_t pid, ppid; uid_t uid, euid, suid, fsuid; gid_t gid, egid, sgid, fsgid; unsigned long personality; @@ -379,6 +380,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, } context->pid = tsk->pid; + context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ context->uid = tsk->uid; context->gid = tsk->gid; context->euid = tsk->euid; @@ -614,7 +616,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts tty = "(none)"; audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " pid=%d auid=%u uid=%u gid=%u" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s", context->argv[0], @@ -622,6 +624,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->argv[2], context->argv[3], context->name_count, + context->ppid, context->pid, context->loginuid, context->uid, -- cgit v1.2.3 From 3c66251e573219a0532a5a07381b2f60a412d9eb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 6 May 2006 08:26:27 -0400 Subject: [PATCH] add filtering by ppid Signed-off-by: Al Viro --- include/linux/audit.h | 1 + kernel/auditsc.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 1057e90bd3e3..8f6424f2b604 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -152,6 +152,7 @@ #define AUDIT_SE_TYPE 15 /* security label type */ #define AUDIT_SE_SEN 16 /* security label sensitivity label */ #define AUDIT_SE_CLR 17 /* security label clearance label */ +#define AUDIT_PPID 18 /* These are ONLY useful when checking * at syscall exit time (AUDIT_AT_EXIT). */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4fc3867fa25a..e4551659ad79 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -188,6 +188,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PID: result = audit_comparator(tsk->pid, f->op, f->val); break; + case AUDIT_PPID: + if (ctx) + result = audit_comparator(ctx->ppid, f->op, f->val); + break; case AUDIT_UID: result = audit_comparator(tsk->uid, f->op, f->val); break; -- cgit v1.2.3 From 0a3b483e83edb6aa6d3c49db70eeb6f1cd9f6c6b Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Tue, 2 May 2006 15:06:01 -0400 Subject: [PATCH] fix audit_krule_to_{rule,data} return values Don't return -ENOMEM when callers of these functions are checking for a NULL return. Bug noticed by Serge Hallyn. Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditfilter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index ccfea6d82cc3..b3fccd6808f9 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -291,7 +291,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) rule = kmalloc(sizeof(*rule), GFP_KERNEL); if (unlikely(!rule)) - return ERR_PTR(-ENOMEM); + return NULL; memset(rule, 0, sizeof(*rule)); rule->flags = krule->flags | krule->listnr; @@ -322,7 +322,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); if (unlikely(!data)) - return ERR_PTR(-ENOMEM); + return NULL; memset(data, 0, sizeof(*data)); data->flags = krule->flags | krule->listnr; -- cgit v1.2.3 From 5d136a010de3bc16fe595987feb9ef8868f064c2 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Thu, 27 Apr 2006 16:45:14 -0500 Subject: [PATCH] minor audit updates Just a few minor proposed updates. Only the last one will actually affect behavior. The rest are just misleading code. Several AUDIT_SET functions return 'old' value, but only return value <0 is checked for. So just return 0. propagate audit_set_rate_limit and audit_set_backlog_limit error values In audit_buffer_free, the audit_freelist_count was being incremented even when we discard the return buffer, so audit_freelist_count can end up wrong. This could cause the actual freelist to shrink over time, eventually threatening to degrate audit performance. Signed-off-by: Serge E. Hallyn Signed-off-by: Al Viro --- kernel/audit.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index bb20922d08cc..0738a4b290e6 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -251,7 +251,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) "audit_rate_limit=%d old=%d by auid=%u", limit, old, loginuid); audit_rate_limit = limit; - return old; + return 0; } static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) @@ -274,7 +274,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) "audit_backlog_limit=%d old=%d by auid=%u", limit, old, loginuid); audit_backlog_limit = limit; - return old; + return 0; } static int audit_set_enabled(int state, uid_t loginuid, u32 sid) @@ -300,7 +300,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) "audit_enabled=%d old=%d by auid=%u", state, old, loginuid); audit_enabled = state; - return old; + return 0; } static int audit_set_failure(int state, uid_t loginuid, u32 sid) @@ -328,7 +328,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) "audit_failure=%d old=%d by auid=%u", state, old, loginuid); audit_failure = state; - return old; + return 0; } static int kauditd_thread(void *dummy) @@ -364,7 +364,6 @@ static int kauditd_thread(void *dummy) remove_wait_queue(&kauditd_wait, &wait); } } - return 0; } int audit_send_list(void *_dest) @@ -551,10 +550,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = status_get->pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit, + err = audit_set_rate_limit(status_get->rate_limit, loginuid, sid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - audit_set_backlog_limit(status_get->backlog_limit, + err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sid); break; case AUDIT_USER: @@ -727,10 +726,12 @@ static void audit_buffer_free(struct audit_buffer *ab) kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); - if (++audit_freelist_count > AUDIT_MAXFREE) + if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); - else + else { + audit_freelist_count++; list_add(&ab->list, &audit_freelist); + } spin_unlock_irqrestore(&audit_freelist_lock, flags); } -- cgit v1.2.3 From ac03221a4fdda9bfdabf99bcd129847f20fc1d80 Mon Sep 17 00:00:00 2001 From: Linda Knippers Date: Tue, 16 May 2006 22:03:48 -0400 Subject: [PATCH] update of IPC audit record cleanup The following patch addresses most of the issues with the IPC_SET_PERM records as described in: https://www.redhat.com/archives/linux-audit/2006-May/msg00010.html and addresses the comments I received on the record field names. To summarize, I made the following changes: 1. Changed sys_msgctl() and semctl_down() so that an IPC_SET_PERM record is emitted in the failure case as well as the success case. This matches the behavior in sys_shmctl(). I could simplify the code in sys_msgctl() and semctl_down() slightly but it would mean that in some error cases we could get an IPC_SET_PERM record without an IPC record and that seemed odd. 2. No change to the IPC record type, given no feedback on the backward compatibility question. 3. Removed the qbytes field from the IPC record. It wasn't being set and when audit_ipc_obj() is called from ipcperms(), the information isn't available. If we want the information in the IPC record, more extensive changes will be necessary. Since it only applies to message queues and it isn't really permission related, it doesn't seem worth it. 4. Removed the obj field from the IPC_SET_PERM record. This means that the kern_ipc_perm argument is no longer needed. 5. Removed the spaces and renamed the IPC_SET_PERM field names. Replaced iuid and igid fields with ouid and ogid in the IPC record. I tested this with the lspp.22 kernel on an x86_64 box. I believe it applies cleanly on the latest kernel. -- ljk Signed-off-by: Linda Knippers Signed-off-by: Al Viro --- include/linux/audit.h | 4 ++-- ipc/msg.c | 9 +++++---- ipc/sem.c | 8 +++++--- ipc/shm.c | 2 +- kernel/auditsc.c | 22 +++++----------------- 5 files changed, 18 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 8f6424f2b604..da5f521be04b 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -325,7 +325,7 @@ extern void auditsc_get_stamp(struct audit_context *ctx, extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); extern uid_t audit_get_loginuid(struct audit_context *ctx); extern int audit_ipc_obj(struct kern_ipc_perm *ipcp); -extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp); +extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode); extern int audit_bprm(struct linux_binprm *bprm); extern int audit_socketcall(int nargs, unsigned long *args); extern int audit_sockaddr(int len, void *addr); @@ -345,7 +345,7 @@ extern int audit_set_macxattr(const char *name); #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0) #define audit_get_loginuid(c) ({ -1; }) #define audit_ipc_obj(i) ({ 0; }) -#define audit_ipc_set_perm(q,u,g,m,i) ({ 0; }) +#define audit_ipc_set_perm(q,u,g,m) ({ 0; }) #define audit_bprm(p) ({ 0; }) #define audit_socketcall(n,a) ({ 0; }) #define audit_sockaddr(len, addr) ({ 0; }) diff --git a/ipc/msg.c b/ipc/msg.c index 7d1340ccb16b..00f015a092d2 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -454,6 +454,11 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf) err = audit_ipc_obj(ipcp); if (err) goto out_unlock_up; + if (cmd==IPC_SET) { + err = audit_ipc_set_perm(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode); + if (err) + goto out_unlock_up; + } err = -EPERM; if (current->euid != ipcp->cuid && @@ -468,10 +473,6 @@ asmlinkage long sys_msgctl (int msqid, int cmd, struct msqid_ds __user *buf) switch (cmd) { case IPC_SET: { - err = audit_ipc_set_perm(setbuf.qbytes, setbuf.uid, setbuf.gid, setbuf.mode, ipcp); - if (err) - goto out_unlock_up; - err = -EPERM; if (setbuf.qbytes > msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) goto out_unlock_up; diff --git a/ipc/sem.c b/ipc/sem.c index 7919f8ece6ba..fce0bc8b5ad6 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -828,6 +828,11 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun if (err) goto out_unlock; + if (cmd == IPC_SET) { + err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode); + if (err) + goto out_unlock; + } if (current->euid != ipcp->cuid && current->euid != ipcp->uid && !capable(CAP_SYS_ADMIN)) { err=-EPERM; @@ -844,9 +849,6 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun err = 0; break; case IPC_SET: - err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode, ipcp); - if (err) - goto out_unlock; ipcp->uid = setbuf.uid; ipcp->gid = setbuf.gid; ipcp->mode = (ipcp->mode & ~S_IRWXUGO) diff --git a/ipc/shm.c b/ipc/shm.c index 809896851902..4f133d24030f 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -643,7 +643,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) err = audit_ipc_obj(&(shp->shm_perm)); if (err) goto out_unlock_up; - err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode, &(shp->shm_perm)); + err = audit_ipc_set_perm(0, setbuf.uid, setbuf.gid, setbuf.mode); if (err) goto out_unlock_up; err=-EPERM; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e4551659ad79..fa4bf9625456 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -648,8 +648,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx iuid=%u igid=%u mode=%x", - axi->qbytes, axi->uid, axi->gid, axi->mode); + "ouid=%u ogid=%u mode=%x", + axi->uid, axi->gid, axi->mode); if (axi->osid != 0) { char *ctx = NULL; u32 len; @@ -667,21 +667,10 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", + "qbytes=%lx ouid=%u ogid=%u mode=%x", axi->qbytes, axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (selinux_ctxid_to_string( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else - audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); - } break; } + case AUDIT_EXECVE: { struct audit_aux_data_execve *axi = (void *)aux; int i; @@ -1232,7 +1221,7 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) +int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; @@ -1248,7 +1237,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, ax->uid = uid; ax->gid = gid; ax->mode = mode; - selinux_get_ipc_sid(ipcp, &ax->osid); ax->d.type = AUDIT_IPC_SET_PERM; ax->d.next = context->aux; -- cgit v1.2.3 From d8945bb51a2bb6623cfa36b9ff63594f46d513aa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 May 2006 16:01:30 -0400 Subject: [PATCH] inline more audit helpers pull checks for ->audit_context into inlined wrappers Signed-off-by: Al Viro --- include/linux/audit.h | 24 +++++++++++++++++++++--- kernel/auditsc.c | 14 ++++---------- 2 files changed, 25 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index da5f521be04b..4b62743b2e6d 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -301,11 +301,16 @@ extern void audit_syscall_entry(int arch, int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void audit_syscall_exit(int failed, long return_code); -extern void audit_getname(const char *name); +extern void __audit_getname(const char *name); extern void audit_putname(const char *name); extern void __audit_inode(const char *name, const struct inode *inode, unsigned flags); extern void __audit_inode_child(const char *dname, const struct inode *inode, unsigned long pino); +static inline void audit_getname(const char *name) +{ + if (unlikely(current->audit_context)) + __audit_getname(name); +} static inline void audit_inode(const char *name, const struct inode *inode, unsigned flags) { if (unlikely(current->audit_context)) @@ -324,13 +329,26 @@ extern void auditsc_get_stamp(struct audit_context *ctx, struct timespec *t, unsigned int *serial); extern int audit_set_loginuid(struct task_struct *task, uid_t loginuid); extern uid_t audit_get_loginuid(struct audit_context *ctx); -extern int audit_ipc_obj(struct kern_ipc_perm *ipcp); -extern int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode); +extern int __audit_ipc_obj(struct kern_ipc_perm *ipcp); +extern int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode); extern int audit_bprm(struct linux_binprm *bprm); extern int audit_socketcall(int nargs, unsigned long *args); extern int audit_sockaddr(int len, void *addr); extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt); extern int audit_set_macxattr(const char *name); + +static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) +{ + if (unlikely(current->audit_context)) + return __audit_ipc_obj(ipcp); + return 0; +} +static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +{ + if (unlikely(current->audit_context)) + return __audit_ipc_set_perm(qbytes, uid, gid, mode); + return 0; +} #else #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fa4bf9625456..05d31ee4f3dd 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -922,11 +922,11 @@ void audit_syscall_exit(int valid, long return_code) * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ -void audit_getname(const char *name) +void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - if (!context || IS_ERR(name) || !name) + if (IS_ERR(name) || !name) return; if (!context->in_syscall) { @@ -1189,14 +1189,11 @@ uid_t audit_get_loginuid(struct audit_context *ctx) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_obj(struct kern_ipc_perm *ipcp) +int __audit_ipc_obj(struct kern_ipc_perm *ipcp) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); if (!ax) return -ENOMEM; @@ -1221,14 +1218,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); if (!ax) return -ENOMEM; -- cgit v1.2.3 From 014149cce19c5acb19014e57a5b739b7f64e6fbf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 23 May 2006 01:36:13 -0400 Subject: [PATCH] deprecate AUDIT_POSSBILE Signed-off-by: Al Viro --- kernel/auditfilter.c | 8 +++++--- kernel/auditsc.c | 1 - 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b3fccd6808f9..df9503da40fb 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -128,8 +128,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) #endif ; } - if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && - rule->action != AUDIT_ALWAYS) + if (unlikely(rule->action == AUDIT_POSSIBLE)) { + printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + goto exit_err; + } + if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) goto exit_err; if (rule->field_count > AUDIT_MAX_FIELDS) goto exit_err; @@ -734,7 +737,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } return 1; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 05d31ee4f3dd..4503c4663cf8 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -307,7 +307,6 @@ static int audit_filter_rules(struct task_struct *tsk, } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } return 1; -- cgit v1.2.3 From 20ca73bc792be9625af184cbec36e1372611d1c3 Mon Sep 17 00:00:00 2001 From: "George C. Wilson" Date: Wed, 24 May 2006 16:09:55 -0500 Subject: [PATCH] Audit of POSIX Message Queue Syscalls v.2 This patch adds audit support to POSIX message queues. It applies cleanly to the lspp.b15 branch of Al Viro's git tree. There are new auxiliary data structures, and collection and emission routines in kernel/auditsc.c. New hooks in ipc/mqueue.c collect arguments from the syscalls. I tested the patch by building the examples from the POSIX MQ library tarball. Build them -lrt, not against the old MQ library in the tarball. Here's the URL: http://www.geocities.com/wronski12/posix_ipc/libmqueue-4.41.tar.gz Do auditctl -a exit,always -S for mq_open, mq_timedsend, mq_timedreceive, mq_notify, mq_getsetattr. mq_unlink has no new hooks. Please see the corresponding userspace patch to get correct output from auditd for the new record types. [fixes folded] Signed-off-by: George Wilson Signed-off-by: Al Viro --- include/linux/audit.h | 46 +++++++++ ipc/mqueue.c | 22 ++++ kernel/auditsc.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 341 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 4b62743b2e6d..7c8780b150e6 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -85,6 +85,10 @@ #define AUDIT_CWD 1307 /* Current working directory */ #define AUDIT_EXECVE 1309 /* execve arguments */ #define AUDIT_IPC_SET_PERM 1311 /* IPC new permissions record type */ +#define AUDIT_MQ_OPEN 1312 /* POSIX MQ open record type */ +#define AUDIT_MQ_SENDRECV 1313 /* POSIX MQ send/receive record type */ +#define AUDIT_MQ_NOTIFY 1314 /* POSIX MQ notify record type */ +#define AUDIT_MQ_GETSETATTR 1315 /* POSIX MQ get/set attribute record type */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ @@ -287,6 +291,8 @@ struct audit_context; struct inode; struct netlink_skb_parms; struct linux_binprm; +struct mq_attr; +struct mqstat; #define AUDITSC_INVALID 0 #define AUDITSC_SUCCESS 1 @@ -336,6 +342,11 @@ extern int audit_socketcall(int nargs, unsigned long *args); extern int audit_sockaddr(int len, void *addr); extern int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt); extern int audit_set_macxattr(const char *name); +extern int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr); +extern int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout); +extern int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout); +extern int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification); +extern int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); static inline int audit_ipc_obj(struct kern_ipc_perm *ipcp) { @@ -349,6 +360,36 @@ static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, return __audit_ipc_set_perm(qbytes, uid, gid, mode); return 0; } +static inline int audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) +{ + if (unlikely(current->audit_context)) + return __audit_mq_open(oflag, mode, u_attr); + return 0; +} +static inline int audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec __user *u_abs_timeout) +{ + if (unlikely(current->audit_context)) + return __audit_mq_timedsend(mqdes, msg_len, msg_prio, u_abs_timeout); + return 0; +} +static inline int audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, unsigned int __user *u_msg_prio, const struct timespec __user *u_abs_timeout) +{ + if (unlikely(current->audit_context)) + return __audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout); + return 0; +} +static inline int audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +{ + if (unlikely(current->audit_context)) + return __audit_mq_notify(mqdes, u_notification); + return 0; +} +static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +{ + if (unlikely(current->audit_context)) + return __audit_mq_getsetattr(mqdes, mqstat); + return 0; +} #else #define audit_alloc(t) ({ 0; }) #define audit_free(t) do { ; } while (0) @@ -369,6 +410,11 @@ static inline int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, #define audit_sockaddr(len, addr) ({ 0; }) #define audit_avc_path(dentry, mnt) ({ 0; }) #define audit_set_macxattr(n) do { ; } while (0) +#define audit_mq_open(o,m,a) ({ 0; }) +#define audit_mq_timedsend(d,l,p,t) ({ 0; }) +#define audit_mq_timedreceive(d,l,p,t) ({ 0; }) +#define audit_mq_notify(d,n) ({ 0; }) +#define audit_mq_getsetattr(d,s) ({ 0; }) #endif #ifdef CONFIG_AUDIT diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 41ecbd440fed..1511714a9585 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -8,6 +8,8 @@ * Lockless receive & send, fd based notify: * Manfred Spraul (manfred@colorfullife.com) * + * Audit: George Wilson (ltcgcw@us.ibm.com) + * * This file is released under the GPL. */ @@ -24,6 +26,7 @@ #include #include #include +#include #include #include @@ -657,6 +660,10 @@ asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode, char *name; int fd, error; + error = audit_mq_open(oflag, mode, u_attr); + if (error != 0) + return error; + if (IS_ERR(name = getname(u_name))) return PTR_ERR(name); @@ -814,6 +821,10 @@ asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, long timeout; int ret; + ret = audit_mq_timedsend(mqdes, msg_len, msg_prio, u_abs_timeout); + if (ret != 0) + return ret; + if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX)) return -EINVAL; @@ -896,6 +907,10 @@ asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, struct mqueue_inode_info *info; struct ext_wait_queue wait; + ret = audit_mq_timedreceive(mqdes, msg_len, u_msg_prio, u_abs_timeout); + if (ret != 0) + return ret; + timeout = prepare_timeout(u_abs_timeout); ret = -EBADF; @@ -975,6 +990,10 @@ asmlinkage long sys_mq_notify(mqd_t mqdes, struct mqueue_inode_info *info; struct sk_buff *nc; + ret = audit_mq_notify(mqdes, u_notification); + if (ret != 0) + return ret; + nc = NULL; sock = NULL; if (u_notification != NULL) { @@ -1115,6 +1134,9 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes, omqstat = info->attr; omqstat.mq_flags = filp->f_flags & O_NONBLOCK; if (u_mqstat) { + ret = audit_mq_getsetattr(mqdes, &mqstat); + if (ret != 0) + goto out; if (mqstat.mq_flags & O_NONBLOCK) filp->f_flags |= O_NONBLOCK; else diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4503c4663cf8..14e295a4121b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -3,7 +3,7 @@ * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright (C) 2005 IBM Corporation + * Copyright (C) 2005, 2006 IBM Corporation * All Rights Reserved. * * This program is free software; you can redistribute it and/or modify @@ -29,6 +29,9 @@ * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * + * POSIX message queue support added by George Wilson , + * 2006. + * * The support of additional filter rules compares (>, <, >=, <=) was * added by Dustin Kirkland , 2005. * @@ -49,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +106,33 @@ struct audit_aux_data { #define AUDIT_AUX_IPCPERM 0 +struct audit_aux_data_mq_open { + struct audit_aux_data d; + int oflag; + mode_t mode; + struct mq_attr attr; +}; + +struct audit_aux_data_mq_sendrecv { + struct audit_aux_data d; + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; +}; + +struct audit_aux_data_mq_notify { + struct audit_aux_data d; + mqd_t mqdes; + struct sigevent notification; +}; + +struct audit_aux_data_mq_getsetattr { + struct audit_aux_data d; + mqd_t mqdes; + struct mq_attr mqstat; +}; + struct audit_aux_data_ipcctl { struct audit_aux_data d; struct ipc_perm p; @@ -644,6 +675,43 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts continue; /* audit_panic has been called */ switch (aux->type) { + case AUDIT_MQ_OPEN: { + struct audit_aux_data_mq_open *axi = (void *)aux; + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + axi->oflag, axi->mode, axi->attr.mq_flags, + axi->attr.mq_maxmsg, axi->attr.mq_msgsize, + axi->attr.mq_curmsgs); + break; } + + case AUDIT_MQ_SENDRECV: { + struct audit_aux_data_mq_sendrecv *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + axi->mqdes, axi->msg_len, axi->msg_prio, + axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); + break; } + + case AUDIT_MQ_NOTIFY: { + struct audit_aux_data_mq_notify *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d sigev_signo=%d", + axi->mqdes, + axi->notification.sigev_signo); + break; } + + case AUDIT_MQ_GETSETATTR: { + struct audit_aux_data_mq_getsetattr *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + axi->mqdes, + axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, + axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); + break; } + case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, @@ -1182,6 +1250,210 @@ uid_t audit_get_loginuid(struct audit_context *ctx) return ctx ? ctx->loginuid : -1; } +/** + * __audit_mq_open - record audit data for a POSIX MQ open + * @oflag: open flag + * @mode: mode bits + * @u_attr: queue attributes + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) +{ + struct audit_aux_data_mq_open *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_attr != NULL) { + if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->attr, 0, sizeof(ax->attr)); + + ax->oflag = oflag; + ax->mode = mode; + + ax->d.type = AUDIT_MQ_OPEN; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_timedsend - record audit data for a POSIX MQ timed send + * @mqdes: MQ descriptor + * @msg_len: Message length + * @msg_prio: Message priority + * @abs_timeout: Message timeout in absolute time + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct audit_aux_data_mq_sendrecv *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_abs_timeout != NULL) { + if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + + ax->mqdes = mqdes; + ax->msg_len = msg_len; + ax->msg_prio = msg_prio; + + ax->d.type = AUDIT_MQ_SENDRECV; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive + * @mqdes: MQ descriptor + * @msg_len: Message length + * @msg_prio: Message priority + * @abs_timeout: Message timeout in absolute time + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, + unsigned int __user *u_msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct audit_aux_data_mq_sendrecv *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_msg_prio != NULL) { + if (get_user(ax->msg_prio, u_msg_prio)) { + kfree(ax); + return -EFAULT; + } + } else + ax->msg_prio = 0; + + if (u_abs_timeout != NULL) { + if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + + ax->mqdes = mqdes; + ax->msg_len = msg_len; + + ax->d.type = AUDIT_MQ_SENDRECV; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_notify - record audit data for a POSIX MQ notify + * @mqdes: MQ descriptor + * @u_notification: Notification event + * + * Returns 0 for success or NULL context or < 0 on error. + */ + +int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +{ + struct audit_aux_data_mq_notify *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_notification != NULL) { + if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->notification, 0, sizeof(ax->notification)); + + ax->mqdes = mqdes; + + ax->d.type = AUDIT_MQ_NOTIFY; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute + * @mqdes: MQ descriptor + * @mqstat: MQ flags + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +{ + struct audit_aux_data_mq_getsetattr *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + ax->mqdes = mqdes; + ax->mqstat = *mqstat; + + ax->d.type = AUDIT_MQ_GETSETATTR; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + /** * audit_ipc_obj - record audit data for ipc object * @ipcp: ipc permissions -- cgit v1.2.3 From f368c07d7214a7c41dfceb76c8db473b850f0229 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Fri, 7 Apr 2006 16:55:56 -0400 Subject: [PATCH] audit: path-based rules In this implementation, audit registers inotify watches on the parent directories of paths specified in audit rules. When audit's inotify event handler is called, it updates any affected rules based on the filesystem event. If the parent directory is renamed, removed, or its filesystem is unmounted, audit removes all rules referencing that inotify watch. To keep things simple, this implementation limits location-based auditing to the directory entries in an existing directory. Given a path-based rule for /foo/bar/passwd, the following table applies: passwd modified -- audit event logged passwd replaced -- audit event logged, rules list updated bar renamed -- rule removed foo renamed -- untracked, meaning that the rule now applies to the new location Audit users typically want to have many rules referencing filesystem objects, which can significantly impact filtering performance. This patch also adds an inode-number-based rule hash to mitigate this situation. The patch is relative to the audit git tree: http://kernel.org/git/?p=linux/kernel/git/viro/audit-current.git;a=summary and uses the inotify kernel API: http://lkml.org/lkml/2006/6/1/145 Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- include/linux/audit.h | 1 + init/Kconfig | 3 +- kernel/audit.c | 41 ++- kernel/audit.h | 38 ++- kernel/auditfilter.c | 785 +++++++++++++++++++++++++++++++++++++++++++++++--- kernel/auditsc.c | 124 +++++--- 6 files changed, 903 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 7c8780b150e6..c78327507f4e 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -165,6 +165,7 @@ #define AUDIT_INODE 102 #define AUDIT_EXIT 103 #define AUDIT_SUCCESS 104 /* exit >= 0; value ignored */ +#define AUDIT_WATCH 105 #define AUDIT_ARG0 200 #define AUDIT_ARG1 (AUDIT_ARG0+1) diff --git a/init/Kconfig b/init/Kconfig index 3b36a1d53656..c4d0fa655d5d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -182,7 +182,8 @@ config AUDITSYSCALL help Enable low-overhead system-call auditing infrastructure that can be used independently or with another kernel subsystem, - such as SELinux. + such as SELinux. To use audit's filesystem watch feature, please + ensure that INOTIFY is configured. config IKCONFIG bool "Kernel .config support" diff --git a/kernel/audit.c b/kernel/audit.c index 0738a4b290e6..0fbf1c116363 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "audit.h" @@ -103,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; +/* Inotify handle. */ +struct inotify_handle *audit_ih; + +/* Hash for inode-based rules */ +struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + /* The audit_freelist is a list of pre-allocated audit buffers (if more * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). */ @@ -115,10 +122,8 @@ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); -/* The netlink socket is only to be read by 1 CPU, which lets us assume - * that list additions and deletions never happen simultaneously in - * auditsc.c */ -DEFINE_MUTEX(audit_netlink_mutex); +/* Serialize requests from userspace. */ +static DEFINE_MUTEX(audit_cmd_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -373,8 +378,8 @@ int audit_send_list(void *_dest) struct sk_buff *skb; /* wait for parent to finish and send an ACK */ - mutex_lock(&audit_netlink_mutex); - mutex_unlock(&audit_netlink_mutex); + mutex_lock(&audit_cmd_mutex); + mutex_unlock(&audit_cmd_mutex); while ((skb = __skb_dequeue(&dest->q)) != NULL) netlink_unicast(audit_sock, skb, pid, 0); @@ -665,20 +670,30 @@ static void audit_receive(struct sock *sk, int length) struct sk_buff *skb; unsigned int qlen; - mutex_lock(&audit_netlink_mutex); + mutex_lock(&audit_cmd_mutex); for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); audit_receive_skb(skb); kfree_skb(skb); } - mutex_unlock(&audit_netlink_mutex); + mutex_unlock(&audit_cmd_mutex); } +#ifdef CONFIG_AUDITSYSCALL +static const struct inotify_operations audit_inotify_ops = { + .handle_event = audit_handle_ievent, + .destroy_watch = audit_free_parent, +}; +#endif /* Initialize audit support at boot time. */ static int __init audit_init(void) { +#ifdef CONFIG_AUDITSYSCALL + int i; +#endif + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, @@ -697,6 +712,16 @@ static int __init audit_init(void) selinux_audit_set_callback(&selinux_audit_rule_update); audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + +#ifdef CONFIG_AUDITSYSCALL + audit_ih = inotify_init(&audit_inotify_ops); + if (IS_ERR(audit_ih)) + audit_panic("cannot initialize inotify handle"); + + for (i = 0; i < AUDIT_INODE_BUCKETS; i++) + INIT_LIST_HEAD(&audit_inode_hash[i]); +#endif + return 0; } __initcall(audit_init); diff --git a/kernel/audit.h b/kernel/audit.h index 52cb1e31d522..58fa44cb8d01 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -19,7 +19,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include @@ -54,6 +53,18 @@ enum audit_state { }; /* Rule lists */ +struct audit_parent; + +struct audit_watch { + atomic_t count; /* reference count */ + char *path; /* insertion path */ + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + struct audit_parent *parent; /* associated parent */ + struct list_head wlist; /* entry in parent->watches list */ + struct list_head rules; /* associated rules */ +}; + struct audit_field { u32 type; u32 val; @@ -71,6 +82,9 @@ struct audit_krule { u32 buflen; /* for data alloc on list rules */ u32 field_count; struct audit_field *fields; + struct audit_field *inode_f; /* quick access to an inode field */ + struct audit_watch *watch; /* associated watch */ + struct list_head rlist; /* entry in audit_watch.rules list */ }; struct audit_entry { @@ -79,10 +93,18 @@ struct audit_entry { struct audit_krule rule; }; - extern int audit_pid; -extern int audit_comparator(const u32 left, const u32 op, const u32 right); +#define AUDIT_INODE_BUCKETS 32 +extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + +static inline int audit_hash_ino(u32 ino) +{ + return (ino & (AUDIT_INODE_BUCKETS-1)); +} + +extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern int audit_compare_dname_path(const char *dname, const char *path); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); @@ -91,7 +113,6 @@ extern void audit_send_reply(int pid, int seq, int type, void *payload, int size); extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); -extern struct mutex audit_netlink_mutex; struct audit_netlink_list { int pid; @@ -100,6 +121,10 @@ struct audit_netlink_list { int audit_send_list(void *); +struct inotify_watch; +extern void audit_free_parent(struct inotify_watch *); +extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, + const char *, struct inode *); extern int selinux_audit_rule_update(void); #ifdef CONFIG_AUDITSYSCALL @@ -109,6 +134,11 @@ static inline void audit_signal_info(int sig, struct task_struct *t) if (unlikely(audit_pid && t->tgid == audit_pid)) __audit_signal_info(sig, t); } +extern enum audit_state audit_filter_inodes(struct task_struct *, + struct audit_context *); +extern void audit_set_auditable(struct audit_context *); #else #define audit_signal_info(s,t) +#define audit_filter_inodes(t,c) AUDIT_DISABLED +#define audit_set_auditable(c) #endif diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index df9503da40fb..03a6919103d4 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -22,13 +22,59 @@ #include #include #include +#include +#include +#include #include +#include +#include #include #include "audit.h" -/* There are three lists of rules -- one to search at task creation - * time, one to search at syscall entry time, and another to search at - * syscall exit time. */ +/* + * Locking model: + * + * audit_filter_mutex: + * Synchronizes writes and blocking reads of audit's filterlist + * data. Rcu is used to traverse the filterlist and access + * contents of structs audit_entry, audit_watch and opaque + * selinux rules during filtering. If modified, these structures + * must be copied and replace their counterparts in the filterlist. + * An audit_parent struct is not accessed during filtering, so may + * be written directly provided audit_filter_mutex is held. + */ + +/* + * Reference counting: + * + * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * event. Each audit_watch holds a reference to its associated parent. + * + * audit_watch: if added to lists, lifetime is from audit_init_watch() to + * audit_remove_watch(). Additionally, an audit_watch may exist + * temporarily to assist in searching existing filter data. Each + * audit_krule holds a reference to its associated watch. + */ + +struct audit_parent { + struct list_head ilist; /* entry in inotify registration list */ + struct list_head watches; /* associated watches */ + struct inotify_watch wdata; /* inotify watch data */ + unsigned flags; /* status flags */ +}; + +/* + * audit_parent status flags: + * + * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to + * a filesystem event to ensure we're adding audit watches to a valid parent. + * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot + * receive them while we have nameidata, but must be used for IN_MOVE_SELF which + * we can receive while holding nameidata. + */ +#define AUDIT_PARENT_INVALID 0x001 + +/* Audit filter lists, defined in */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), LIST_HEAD_INIT(audit_filter_list[1]), @@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #endif }; +static DEFINE_MUTEX(audit_filter_mutex); + +/* Inotify handle */ +extern struct inotify_handle *audit_ih; + +/* Inotify events we care about. */ +#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF + +void audit_free_parent(struct inotify_watch *i_watch) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} + +static inline void audit_get_watch(struct audit_watch *watch) +{ + atomic_inc(&watch->count); +} + +static void audit_put_watch(struct audit_watch *watch) +{ + if (atomic_dec_and_test(&watch->count)) { + WARN_ON(watch->parent); + WARN_ON(!list_empty(&watch->rules)); + kfree(watch->path); + kfree(watch); + } +} + +static void audit_remove_watch(struct audit_watch *watch) +{ + list_del(&watch->wlist); + put_inotify_watch(&watch->parent->wdata); + watch->parent = NULL; + audit_put_watch(watch); /* match initial get */ +} + static inline void audit_free_rule(struct audit_entry *e) { int i; + + /* some rules don't have associated watches */ + if (e->rule.watch) + audit_put_watch(e->rule.watch); if (e->rule.fields) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; @@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } +/* Initialize a parent watch entry. */ +static struct audit_parent *audit_init_parent(struct nameidata *ndp) +{ + struct audit_parent *parent; + s32 wd; + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (unlikely(!parent)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&parent->watches); + parent->flags = 0; + + inotify_init_watch(&parent->wdata); + /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ + get_inotify_watch(&parent->wdata); + wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, + AUDIT_IN_WATCH); + if (wd < 0) { + audit_free_parent(&parent->wdata); + return ERR_PTR(wd); + } + + return parent; +} + +/* Initialize a watch entry. */ +static struct audit_watch *audit_init_watch(char *path) +{ + struct audit_watch *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (unlikely(!watch)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&watch->rules); + atomic_set(&watch->count, 1); + watch->path = path; + watch->dev = (dev_t)-1; + watch->ino = (unsigned long)-1; + + return watch; +} + /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { @@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } +/* Translate an inode field to kernel respresentation. */ +static inline int audit_to_inode(struct audit_krule *krule, + struct audit_field *f) +{ + if (krule->listnr != AUDIT_FILTER_EXIT || + krule->watch || krule->inode_f) + return -EINVAL; + + krule->inode_f = f; + return 0; +} + +/* Translate a watch string to kernel respresentation. */ +static int audit_to_watch(struct audit_krule *krule, char *path, int len, + u32 op) +{ + struct audit_watch *watch; + + if (!audit_ih) + return -EOPNOTSUPP; + + if (path[0] != '/' || path[len-1] == '/' || + krule->listnr != AUDIT_FILTER_EXIT || + op & ~AUDIT_EQUAL || + krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */ + return -EINVAL; + + watch = audit_init_watch(path); + if (unlikely(IS_ERR(watch))) + return PTR_ERR(watch); + + audit_get_watch(watch); + krule->watch = watch; + + return 0; +} + /* Common user-space to kernel rule translation. */ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { @@ -161,6 +332,7 @@ exit_err: static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; + struct audit_field *f; int err = 0; int i; @@ -175,14 +347,23 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; - if (f->type & AUDIT_UNUSED_BITS || - f->type == AUDIT_SE_USER || - f->type == AUDIT_SE_ROLE || - f->type == AUDIT_SE_TYPE || - f->type == AUDIT_SE_SEN || - f->type == AUDIT_SE_CLR) { - err = -EINVAL; + err = -EINVAL; + if (f->type & AUDIT_UNUSED_BITS) + goto exit_free; + + switch(f->type) { + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + case AUDIT_WATCH: goto exit_free; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; } entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; @@ -199,6 +380,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) } } + f = entry->rule.inode_f; + if (f) { + switch(f->op) { + case AUDIT_NOT_EQUAL: + entry->rule.inode_f = NULL; + case AUDIT_EQUAL: + break; + default: + goto exit_free; + } + } + exit_nofree: return entry; @@ -213,6 +406,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; + struct audit_field *f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -263,6 +457,35 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, } else f->se_str = str; break; + case AUDIT_WATCH: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = audit_to_watch(&entry->rule, str, f->val, f->op); + if (err) { + kfree(str); + goto exit_free; + } + break; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; + } + } + + f = entry->rule.inode_f; + if (f) { + switch(f->op) { + case AUDIT_NOT_EQUAL: + entry->rule.inode_f = NULL; + case AUDIT_EQUAL: + break; + default: + goto exit_free; } } @@ -346,6 +569,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, f->se_str); break; + case AUDIT_WATCH: + data->buflen += data->values[i] = + audit_pack_string(&bufp, krule->watch->path); + break; default: data->values[i] = f->val; } @@ -381,6 +608,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) return 1; break; + case AUDIT_WATCH: + if (strcmp(a->watch->path, b->watch->path)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -394,6 +625,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } +/* Duplicate the given audit watch. The new watch's rules list is initialized + * to an empty list and wlist is undefined. */ +static struct audit_watch *audit_dupe_watch(struct audit_watch *old) +{ + char *path; + struct audit_watch *new; + + path = kstrdup(old->path, GFP_KERNEL); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + + new = audit_init_watch(path); + if (unlikely(IS_ERR(new))) { + kfree(path); + goto out; + } + + new->dev = old->dev; + new->ino = old->ino; + get_inotify_watch(&old->parent->wdata); + new->parent = old->parent; + +out: + return new; +} + /* Duplicate selinux field information. The se_rule is opaque, so must be * re-initialized. */ static inline int audit_dupe_selinux_field(struct audit_field *df, @@ -425,8 +682,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, /* Duplicate an audit rule. This will be a deep copy with the exception * of the watch - that pointer is carried over. The selinux specific fields * will be updated in the copy. The point is to be able to replace the old - * rule with the new rule in the filterlist, then free the old rule. */ -static struct audit_entry *audit_dupe_rule(struct audit_krule *old) + * rule with the new rule in the filterlist, then free the old rule. + * The rlist element is undefined; list manipulations are handled apart from + * the initial copy. */ +static struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -445,6 +705,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; new->buflen = old->buflen; + new->inode_f = old->inode_f; + new->watch = NULL; new->field_count = old->field_count; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); @@ -466,21 +728,318 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) } } + if (watch) { + audit_get_watch(watch); + new->watch = watch; + } + return entry; } -/* Add rule to given filterlist if not a duplicate. Protected by - * audit_netlink_mutex. */ +/* Update inode info in audit rules based on filesystem event. */ +static void audit_update_watch(struct audit_parent *parent, + const char *dname, dev_t dev, + unsigned long ino, unsigned invalidating) +{ + struct audit_watch *owatch, *nwatch, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *oentry, *nentry; + struct audit_buffer *ab; + + mutex_lock(&audit_filter_mutex); + list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { + if (audit_compare_dname_path(dname, owatch->path)) + continue; + + /* If the update involves invalidating rules, do the inode-based + * filtering now, so we don't omit records. */ + if (invalidating && + audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) + audit_set_auditable(current->audit_context); + + nwatch = audit_dupe_watch(owatch); + if (unlikely(IS_ERR(nwatch))) { + mutex_unlock(&audit_filter_mutex); + audit_panic("error updating watch, skipping"); + return; + } + nwatch->dev = dev; + nwatch->ino = ino; + + list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { + + oentry = container_of(r, struct audit_entry, rule); + list_del(&oentry->rule.rlist); + list_del_rcu(&oentry->list); + + nentry = audit_dupe_rule(&oentry->rule, nwatch); + if (unlikely(IS_ERR(nentry))) + audit_panic("error updating watch, removing"); + else { + int h = audit_hash_ino((u32)ino); + list_add(&nentry->rule.rlist, &nwatch->rules); + list_add_rcu(&nentry->list, &audit_inode_hash[h]); + } + + call_rcu(&oentry->rcu, audit_free_rule_rcu); + } + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "audit updated rules specifying watch="); + audit_log_untrustedstring(ab, owatch->path); + audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); + audit_log_end(ab); + + audit_remove_watch(owatch); + goto add_watch_to_parent; /* event applies to a single watch */ + } + mutex_unlock(&audit_filter_mutex); + return; + +add_watch_to_parent: + list_add(&nwatch->wlist, &parent->watches); + mutex_unlock(&audit_filter_mutex); + return; +} + +/* Remove all watches & rules associated with a parent that is going away. */ +static void audit_remove_parent_watches(struct audit_parent *parent) +{ + struct audit_watch *w, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *e; + + mutex_lock(&audit_filter_mutex); + parent->flags |= AUDIT_PARENT_INVALID; + list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { + list_for_each_entry_safe(r, nextr, &w->rules, rlist) { + e = container_of(r, struct audit_entry, rule); + list_del(&r->rlist); + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit implicitly removed rule from list=%d\n", + AUDIT_FILTER_EXIT); + } + audit_remove_watch(w); + } + mutex_unlock(&audit_filter_mutex); +} + +/* Unregister inotify watches for parents on in_list. + * Generates an IN_IGNORED event. */ +static void audit_inotify_unregister(struct list_head *in_list) +{ + struct audit_parent *p, *n; + + list_for_each_entry_safe(p, n, in_list, ilist) { + list_del(&p->ilist); + inotify_rm_watch(audit_ih, &p->wdata); + /* the put matching the get in audit_do_del_rule() */ + put_inotify_watch(&p->wdata); + } +} + +/* Find an existing audit rule. + * Caller must hold audit_filter_mutex to prevent stale rule data. */ +static struct audit_entry *audit_find_rule(struct audit_entry *entry, + struct list_head *list) +{ + struct audit_entry *e, *found = NULL; + int h; + + if (entry->rule.watch) { + /* we don't know the inode number, so must walk entire hash */ + for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { + list = &audit_inode_hash[h]; + list_for_each_entry(e, list, list) + if (!audit_compare_rule(&entry->rule, &e->rule)) { + found = e; + goto out; + } + } + goto out; + } + + list_for_each_entry(e, list, list) + if (!audit_compare_rule(&entry->rule, &e->rule)) { + found = e; + goto out; + } + +out: + return found; +} + +/* Get path information necessary for adding watches. */ +static int audit_get_nd(char *path, struct nameidata **ndp, + struct nameidata **ndw) +{ + struct nameidata *ndparent, *ndwatch; + int err; + + ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); + if (unlikely(!ndparent)) + return -ENOMEM; + + ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); + if (unlikely(!ndwatch)) { + kfree(ndparent); + return -ENOMEM; + } + + err = path_lookup(path, LOOKUP_PARENT, ndparent); + if (err) { + kfree(ndparent); + kfree(ndwatch); + return err; + } + + err = path_lookup(path, 0, ndwatch); + if (err) { + kfree(ndwatch); + ndwatch = NULL; + } + + *ndp = ndparent; + *ndw = ndwatch; + + return 0; +} + +/* Release resources used for watch path information. */ +static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +{ + if (ndp) { + path_release(ndp); + kfree(ndp); + } + if (ndw) { + path_release(ndw); + kfree(ndw); + } +} + +/* Associate the given rule with an existing parent inotify_watch. + * Caller must hold audit_filter_mutex. */ +static void audit_add_to_parent(struct audit_krule *krule, + struct audit_parent *parent) +{ + struct audit_watch *w, *watch = krule->watch; + int watch_found = 0; + + list_for_each_entry(w, &parent->watches, wlist) { + if (strcmp(watch->path, w->path)) + continue; + + watch_found = 1; + + /* put krule's and initial refs to temporary watch */ + audit_put_watch(watch); + audit_put_watch(watch); + + audit_get_watch(w); + krule->watch = watch = w; + break; + } + + if (!watch_found) { + get_inotify_watch(&parent->wdata); + watch->parent = parent; + + list_add(&watch->wlist, &parent->watches); + } + list_add(&krule->rlist, &watch->rules); +} + +/* Find a matching watch entry, or add this one. + * Caller must hold audit_filter_mutex. */ +static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, + struct nameidata *ndw) +{ + struct audit_watch *watch = krule->watch; + struct inotify_watch *i_watch; + struct audit_parent *parent; + int ret = 0; + + /* update watch filter fields */ + if (ndw) { + watch->dev = ndw->dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->dentry->d_inode->i_ino; + } + + /* The audit_filter_mutex must not be held during inotify calls because + * we hold it during inotify event callback processing. If an existing + * inotify watch is found, inotify_find_watch() grabs a reference before + * returning. + */ + mutex_unlock(&audit_filter_mutex); + + if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { + parent = audit_init_parent(ndp); + if (IS_ERR(parent)) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + return PTR_ERR(parent); + } + } else + parent = container_of(i_watch, struct audit_parent, wdata); + + mutex_lock(&audit_filter_mutex); + + /* parent was moved before we took audit_filter_mutex */ + if (parent->flags & AUDIT_PARENT_INVALID) + ret = -ENOENT; + else + audit_add_to_parent(krule, parent); + + /* match get in audit_init_parent or inotify_find_watch */ + put_inotify_watch(&parent->wdata); + return ret; +} + +/* Add rule to given filterlist if not a duplicate. */ static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) + struct list_head *list) { struct audit_entry *e; + struct audit_field *inode_f = entry->rule.inode_f; + struct audit_watch *watch = entry->rule.watch; + struct nameidata *ndp, *ndw; + int h, err, putnd_needed = 0; + + if (inode_f) { + h = audit_hash_ino(inode_f->val); + list = &audit_inode_hash[h]; + } + + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, list); + mutex_unlock(&audit_filter_mutex); + if (e) { + err = -EEXIST; + goto error; + } - /* Do not use the _rcu iterator here, since this is the only - * addition routine. */ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(&entry->rule, &e->rule)) - return -EEXIST; + /* Avoid calling path_lookup under audit_filter_mutex. */ + if (watch) { + err = audit_get_nd(watch->path, &ndp, &ndw); + if (err) + goto error; + putnd_needed = 1; + } + + mutex_lock(&audit_filter_mutex); + if (watch) { + /* audit_filter_mutex is dropped and re-taken during this call */ + err = audit_add_watch(&entry->rule, ndp, ndw); + if (err) { + mutex_unlock(&audit_filter_mutex); + goto error; + } + h = audit_hash_ino((u32)watch->ino); + list = &audit_inode_hash[h]; } if (entry->rule.flags & AUDIT_FILTER_PREPEND) { @@ -488,27 +1047,77 @@ static inline int audit_add_rule(struct audit_entry *entry, } else { list_add_tail_rcu(&entry->list, list); } + mutex_unlock(&audit_filter_mutex); - return 0; + if (putnd_needed) + audit_put_nd(ndp, ndw); + + return 0; + +error: + if (putnd_needed) + audit_put_nd(ndp, ndw); + if (watch) + audit_put_watch(watch); /* tmp watch, matches initial get */ + return err; } -/* Remove an existing rule from filterlist. Protected by - * audit_netlink_mutex. */ +/* Remove an existing rule from filterlist. */ static inline int audit_del_rule(struct audit_entry *entry, struct list_head *list) { struct audit_entry *e; + struct audit_field *inode_f = entry->rule.inode_f; + struct audit_watch *watch, *tmp_watch = entry->rule.watch; + LIST_HEAD(inotify_list); + int h, ret = 0; + + if (inode_f) { + h = audit_hash_ino(inode_f->val); + list = &audit_inode_hash[h]; + } - /* Do not use the _rcu iterator here, since this is the only - * deletion routine. */ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(&entry->rule, &e->rule)) { - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - return 0; + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, list); + if (!e) { + mutex_unlock(&audit_filter_mutex); + ret = -ENOENT; + goto out; + } + + watch = e->rule.watch; + if (watch) { + struct audit_parent *parent = watch->parent; + + list_del(&e->rule.rlist); + + if (list_empty(&watch->rules)) { + audit_remove_watch(watch); + + if (list_empty(&parent->watches)) { + /* Put parent on the inotify un-registration + * list. Grab a reference before releasing + * audit_filter_mutex, to be released in + * audit_inotify_unregister(). */ + list_add(&parent->ilist, &inotify_list); + get_inotify_watch(&parent->wdata); + } } } - return -ENOENT; /* No matching rule */ + + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + + mutex_unlock(&audit_filter_mutex); + + if (!list_empty(&inotify_list)) + audit_inotify_unregister(&inotify_list); + +out: + if (tmp_watch) + audit_put_watch(tmp_watch); /* match initial get */ + + return ret; } /* List rules using struct audit_rule. Exists for backward @@ -519,8 +1128,8 @@ static void audit_list(int pid, int seq, struct sk_buff_head *q) struct audit_entry *entry; int i; - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_mutex held. */ + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. */ for (i=0; irule); + if (unlikely(!rule)) + break; + skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, + rule, sizeof(*rule)); + if (skb) + skb_queue_tail(q, skb); + kfree(rule); + } + } skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); if (skb) skb_queue_tail(q, skb); @@ -547,8 +1170,8 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) struct audit_entry *e; int i; - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_mutex held. */ + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. */ for (i=0; ibuflen); + if (skb) + skb_queue_tail(q, skb); + kfree(data); + } + } + for (i=0; i< AUDIT_INODE_BUCKETS; i++) { + list_for_each_entry(e, &audit_inode_hash[i], list) { + struct audit_rule_data *data; + + data = audit_krule_to_data(&e->rule); + if (unlikely(!data)) + break; + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); if (skb) skb_queue_tail(q, skb); kfree(data); @@ -602,10 +1239,12 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, dest->pid = pid; skb_queue_head_init(&dest->q); + mutex_lock(&audit_filter_mutex); if (type == AUDIT_LIST) audit_list(pid, seq, &dest->q); else audit_list_rules(pid, seq, &dest->q); + mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list, dest, "audit_send_list"); if (IS_ERR(tsk)) { @@ -625,6 +1264,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); + if (sid) { char *ctx = NULL; u32 len; @@ -705,7 +1345,39 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) return 0; } +/* Compare given dentry name with last component in given path, + * return of 0 indicates a match. */ +int audit_compare_dname_path(const char *dname, const char *path) +{ + int dlen, plen; + const char *p; + + if (!dname || !path) + return 1; + + dlen = strlen(dname); + plen = strlen(path); + if (plen < dlen) + return 1; + + /* disregard trailing slashes */ + p = path + plen - 1; + while ((*p == '/') && (p > path)) + p--; + + /* find last path component */ + p = p - dlen + 1; + if (p < path) + return 1; + else if (p > path) { + if (*--p != '/') + return 1; + else + p++; + } + return strncmp(p, dname, dlen); +} static int audit_filter_user_rules(struct netlink_skb_parms *cb, struct audit_krule *rule, @@ -818,32 +1490,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) int selinux_audit_rule_update(void) { struct audit_entry *entry, *n, *nentry; + struct audit_watch *watch; int i, err = 0; - /* audit_netlink_mutex synchronizes the writers */ - mutex_lock(&audit_netlink_mutex); + /* audit_filter_mutex synchronizes the writers */ + mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { if (!audit_rule_has_selinux(&entry->rule)) continue; - nentry = audit_dupe_rule(&entry->rule); + watch = entry->rule.watch; + nentry = audit_dupe_rule(&entry->rule, watch); if (unlikely(IS_ERR(nentry))) { /* save the first error encountered for the * return value */ if (!err) err = PTR_ERR(nentry); audit_panic("error updating selinux filters"); + if (watch) + list_del(&entry->rule.rlist); list_del_rcu(&entry->list); } else { + if (watch) { + list_add(&nentry->rule.rlist, + &watch->rules); + list_del(&entry->rule.rlist); + } list_replace_rcu(&entry->list, &nentry->list); } call_rcu(&entry->rcu, audit_free_rule_rcu); } } - mutex_unlock(&audit_netlink_mutex); + mutex_unlock(&audit_filter_mutex); return err; } + +/* Update watch data in audit rules based on inotify events. */ +void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, + u32 cookie, const char *dname, struct inode *inode) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + + if (mask & (IN_CREATE|IN_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, + inode->i_ino, 0); + else if (mask & (IN_DELETE|IN_MOVED_FROM)) + audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + /* inotify automatically removes the watch and sends IN_IGNORED */ + else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) + audit_remove_parent_watches(parent); + /* inotify does not remove the watch, so remove it manually */ + else if(mask & IN_MOVE_SELF) { + audit_remove_parent_watches(parent); + inotify_remove_watch_locked(audit_ih, i_watch); + } else if (mask & IN_IGNORED) + put_inotify_watch(i_watch); +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 14e295a4121b..174a3f624892 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -200,12 +200,13 @@ struct audit_context { #endif }; - +/* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. */ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, + struct audit_names *name, enum audit_state *state) { int i, j, need_sid = 1; @@ -268,7 +269,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (ctx) { + if (name) + result = audit_comparator(MAJOR(name->dev), + f->op, f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { ++result; @@ -278,7 +282,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (ctx) { + if (name) + result = audit_comparator(MINOR(name->dev), + f->op, f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { ++result; @@ -288,7 +295,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_INODE: - if (ctx) { + if (name) + result = (name->ino == f->val || + name->pino == f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(ctx->names[j].ino, f->op, f->val) || audit_comparator(ctx->names[j].pino, f->op, f->val)) { @@ -298,6 +308,12 @@ static int audit_filter_rules(struct task_struct *tsk, } } break; + case AUDIT_WATCH: + if (name && rule->watch->ino != (unsigned long)-1) + result = (name->dev == rule->watch->dev && + (name->ino == rule->watch->ino || + name->pino == rule->watch->ino)); + break; case AUDIT_LOGINUID: result = 0; if (ctx) @@ -354,7 +370,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { + if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { rcu_read_unlock(); return state; } @@ -384,8 +400,9 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, int bit = AUDIT_BIT(ctx->major); list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit - && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state)) { rcu_read_unlock(); return state; } @@ -395,6 +412,49 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } +/* At syscall exit time, this filter is called if any audit_names[] have been + * collected during syscall processing. We only check rules in sublists at hash + * buckets applicable to the inode numbers in audit_names[]. + * Regarding audit_state, same rules apply as for audit_filter_syscall(). + */ +enum audit_state audit_filter_inodes(struct task_struct *tsk, + struct audit_context *ctx) +{ + int i; + struct audit_entry *e; + enum audit_state state; + + if (audit_pid && tsk->tgid == audit_pid) + return AUDIT_DISABLED; + + rcu_read_lock(); + for (i = 0; i < ctx->name_count; i++) { + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + struct audit_names *n = &ctx->names[i]; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + + if (list_empty(list)) + continue; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { + rcu_read_unlock(); + return state; + } + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +void audit_set_auditable(struct audit_context *ctx) +{ + ctx->auditable = 1; +} + static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, int return_code) @@ -408,11 +468,20 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, if (context->in_syscall && !context->auditable) { enum audit_state state; + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); + if (state == AUDIT_RECORD_CONTEXT) { + context->auditable = 1; + goto get_context; + } + + state = audit_filter_inodes(tsk, context); if (state == AUDIT_RECORD_CONTEXT) context->auditable = 1; + } +get_context: context->pid = tsk->pid; context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ context->uid = tsk->uid; @@ -1142,37 +1211,20 @@ void __audit_inode_child(const char *dname, const struct inode *inode, return; /* determine matching parent */ - if (dname) - for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].pino == pino) { - const char *n; - const char *name = context->names[idx].name; - int dlen = strlen(dname); - int nlen = name ? strlen(name) : 0; - - if (nlen < dlen) - continue; - - /* disregard trailing slashes */ - n = name + nlen - 1; - while ((*n == '/') && (n > name)) - n--; - - /* find last path component */ - n = n - dlen + 1; - if (n < name) - continue; - else if (n > name) { - if (*--n != '/') - continue; - else - n++; - } + if (!dname) + goto no_match; + for (idx = 0; idx < context->name_count; idx++) + if (context->names[idx].pino == pino) { + const char *name = context->names[idx].name; - if (strncmp(n, dname, dlen) == 0) - goto update_context; - } + if (!name) + continue; + + if (audit_compare_dname_path(dname, name) == 0) + goto update_context; + } +no_match: /* catch-all in case match not found */ idx = context->name_count++; context->names[idx].name = NULL; -- cgit v1.2.3 From 0a73dccc4fd472e65887eae6fbf4afc030541709 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Jun 2006 08:15:59 -0400 Subject: [PATCH] validate rule fields' types Signed-off-by: Al Viro --- kernel/auditfilter.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 03a6919103d4..9f985dd5e937 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -348,17 +348,31 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->val = rule->values[i]; err = -EINVAL; - if (f->type & AUDIT_UNUSED_BITS) - goto exit_free; - switch(f->type) { - case AUDIT_SE_USER: - case AUDIT_SE_ROLE: - case AUDIT_SE_TYPE: - case AUDIT_SE_SEN: - case AUDIT_SE_CLR: - case AUDIT_WATCH: + default: goto exit_free; + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_ARCH: + case AUDIT_MSGTYPE: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; case AUDIT_INODE: err = audit_to_inode(&entry->rule, f); if (err) @@ -432,6 +446,29 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->se_str = NULL; f->se_rule = NULL; switch(f->type) { + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_ARCH: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; case AUDIT_SE_USER: case AUDIT_SE_ROLE: case AUDIT_SE_TYPE: @@ -474,6 +511,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, if (err) goto exit_free; break; + default: + goto exit_free; } } -- cgit v1.2.3 From 6a2bceec0ea7fdc47aef9a3f2f771c201eaabe5d Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Fri, 2 Jun 2006 13:16:01 -0400 Subject: [PATCH] fix AUDIT_FILTER_PREPEND handling Clear AUDIT_FILTER_PREPEND flag after adding rule to list. This fixes three problems when a rule is added with the -A syntax: - auditctl displays filter list as "(null)" - the rule cannot be removed using -d - a duplicate rule can be added with -a Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- kernel/auditfilter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9f985dd5e937..a536f7148bcd 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1083,6 +1083,7 @@ static inline int audit_add_rule(struct audit_entry *entry, if (entry->rule.flags & AUDIT_FILTER_PREPEND) { list_add_rcu(&entry->list, list); + entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { list_add_tail_rcu(&entry->list, list); } -- cgit v1.2.3 From 9c937dcc71021f2dbf78f904f03d962dd9bcc130 Mon Sep 17 00:00:00 2001 From: Amy Griffis Date: Thu, 8 Jun 2006 23:19:31 -0400 Subject: [PATCH] log more info for directory entry change events When an audit event involves changes to a directory entry, include a PATH record for the directory itself. A few other notable changes: - fixed audit_inode_child() hooks in fsnotify_move() - removed unused flags arg from audit_inode() - added audit log routines for logging a portion of a string Here's some sample output. before patch: type=SYSCALL msg=audit(1149821605.320:26): arch=40000003 syscall=39 success=yes exit=0 a0=bf8d3c7c a1=1ff a2=804e1b8 a3=bf8d3c7c items=1 ppid=739 pid=800 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 comm="mkdir" exe="/bin/mkdir" subj=root:system_r:unconfined_t:s0-s0:c0.c255 type=CWD msg=audit(1149821605.320:26): cwd="/root" type=PATH msg=audit(1149821605.320:26): item=0 name="foo" parent=164068 inode=164010 dev=03:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_t:s0 after patch: type=SYSCALL msg=audit(1149822032.332:24): arch=40000003 syscall=39 success=yes exit=0 a0=bfdd9c7c a1=1ff a2=804e1b8 a3=bfdd9c7c items=2 ppid=714 pid=777 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=ttyS0 comm="mkdir" exe="/bin/mkdir" subj=root:system_r:unconfined_t:s0-s0:c0.c255 type=CWD msg=audit(1149822032.332:24): cwd="/root" type=PATH msg=audit(1149822032.332:24): item=0 name="/root" inode=164068 dev=03:00 mode=040750 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_dir_t:s0 type=PATH msg=audit(1149822032.332:24): item=1 name="foo" inode=164010 dev=03:00 mode=040755 ouid=0 ogid=0 rdev=00:00 obj=root:object_r:user_home_t:s0 Signed-off-by: Amy Griffis Signed-off-by: Al Viro --- fs/namei.c | 2 +- fs/open.c | 4 +- fs/xattr.c | 4 +- include/linux/audit.h | 15 +++--- include/linux/fsnotify.h | 3 +- kernel/audit.c | 54 +++++++++++++++++++-- kernel/audit.h | 3 +- kernel/auditfilter.c | 8 ++- kernel/auditsc.c | 123 ++++++++++++++++++++++++++--------------------- 9 files changed, 142 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/fs/namei.c b/fs/namei.c index d6e2ee251736..184fe4acf824 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1127,7 +1127,7 @@ out: if (likely(retval == 0)) { if (unlikely(current->audit_context && nd && nd->dentry && nd->dentry->d_inode)) - audit_inode(name, nd->dentry->d_inode, flags); + audit_inode(name, nd->dentry->d_inode); } out_fail: return retval; diff --git a/fs/open.c b/fs/open.c index 317b7c7f38a7..4f178acd4c09 100644 --- a/fs/open.c +++ b/fs/open.c @@ -633,7 +633,7 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) dentry = file->f_dentry; inode = dentry->d_inode; - audit_inode(NULL, inode, 0); + audit_inode(NULL, inode); err = -EROFS; if (IS_RDONLY(inode)) @@ -786,7 +786,7 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) if (file) { struct dentry * dentry; dentry = file->f_dentry; - audit_inode(NULL, dentry->d_inode, 0); + audit_inode(NULL, dentry->d_inode); error = chown_common(dentry, user, group); fput(file); } diff --git a/fs/xattr.c b/fs/xattr.c index e416190f5e9c..c32f15b5f60f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -242,7 +242,7 @@ sys_fsetxattr(int fd, char __user *name, void __user *value, if (!f) return error; dentry = f->f_dentry; - audit_inode(NULL, dentry->d_inode, 0); + audit_inode(NULL, dentry->d_inode); error = setxattr(dentry, name, value, size, flags); fput(f); return error; @@ -469,7 +469,7 @@ sys_fremovexattr(int fd, char __user *name) if (!f) return error; dentry = f->f_dentry; - audit_inode(NULL, dentry->d_inode, 0); + audit_inode(NULL, dentry->d_inode); error = removexattr(dentry, name); fput(f); return error; diff --git a/include/linux/audit.h b/include/linux/audit.h index c78327507f4e..e1c1dbdf9efb 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -310,7 +310,7 @@ extern void audit_syscall_entry(int arch, extern void audit_syscall_exit(int failed, long return_code); extern void __audit_getname(const char *name); extern void audit_putname(const char *name); -extern void __audit_inode(const char *name, const struct inode *inode, unsigned flags); +extern void __audit_inode(const char *name, const struct inode *inode); extern void __audit_inode_child(const char *dname, const struct inode *inode, unsigned long pino); static inline void audit_getname(const char *name) @@ -318,10 +318,9 @@ static inline void audit_getname(const char *name) if (unlikely(current->audit_context)) __audit_getname(name); } -static inline void audit_inode(const char *name, const struct inode *inode, - unsigned flags) { +static inline void audit_inode(const char *name, const struct inode *inode) { if (unlikely(current->audit_context)) - __audit_inode(name, inode, flags); + __audit_inode(name, inode); } static inline void audit_inode_child(const char *dname, const struct inode *inode, @@ -398,9 +397,9 @@ static inline int audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) #define audit_syscall_exit(f,r) do { ; } while (0) #define audit_getname(n) do { ; } while (0) #define audit_putname(n) do { ; } while (0) -#define __audit_inode(n,i,f) do { ; } while (0) +#define __audit_inode(n,i) do { ; } while (0) #define __audit_inode_child(d,i,p) do { ; } while (0) -#define audit_inode(n,i,f) do { ; } while (0) +#define audit_inode(n,i) do { ; } while (0) #define audit_inode_child(d,i,p) do { ; } while (0) #define auditsc_get_stamp(c,t,s) do { BUG(); } while (0) #define audit_get_loginuid(c) ({ -1; }) @@ -435,6 +434,9 @@ extern void audit_log_hex(struct audit_buffer *ab, size_t len); extern const char * audit_log_untrustedstring(struct audit_buffer *ab, const char *string); +extern const char * audit_log_n_untrustedstring(struct audit_buffer *ab, + size_t n, + const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, struct dentry *dentry, @@ -452,6 +454,7 @@ extern int audit_receive_filter(int type, int pid, int uid, int seq, #define audit_log_end(b) do { ; } while (0) #define audit_log_hex(a,b,l) do { ; } while (0) #define audit_log_untrustedstring(a,s) do { ; } while (0) +#define audit_log_n_untrustedstring(a,n,s) do { ; } while (0) #define audit_log_d_path(b,p,d,v) do { ; } while (0) #endif #endif diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index a9d30442448f..cc5dec70c32c 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -67,8 +67,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir, if (source) { inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL); } - audit_inode_child(old_name, source, old_dir->i_ino); - audit_inode_child(new_name, target, new_dir->i_ino); + audit_inode_child(new_name, source, new_dir->i_ino); } /* diff --git a/kernel/audit.c b/kernel/audit.c index 0fbf1c116363..7dfac7031bd7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1051,20 +1051,53 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, skb_put(skb, len << 1); /* new string is twice the old string */ } +/* + * Format a string of no more than slen characters into the audit buffer, + * enclosed in quote marks. + */ +static void audit_log_n_string(struct audit_buffer *ab, size_t slen, + const char *string) +{ + int avail, new_len; + unsigned char *ptr; + struct sk_buff *skb; + + BUG_ON(!ab->skb); + skb = ab->skb; + avail = skb_tailroom(skb); + new_len = slen + 3; /* enclosing quotes + null terminator */ + if (new_len > avail) { + avail = audit_expand(ab, new_len); + if (!avail) + return; + } + ptr = skb->tail; + *ptr++ = '"'; + memcpy(ptr, string, slen); + ptr += slen; + *ptr++ = '"'; + *ptr = 0; + skb_put(skb, slen + 2); /* don't include null terminator */ +} + /** - * audit_log_unstrustedstring - log a string that may contain random characters + * audit_log_n_unstrustedstring - log a string that may contain random characters * @ab: audit_buffer + * @len: lenth of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). + * + * The caller specifies the number of characters in the string to log, which may + * or may not be the entire string. */ -const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, + const char *string) { const unsigned char *p = string; - size_t len = strlen(string); while (*p) { if (*p == '"' || *p < 0x21 || *p > 0x7f) { @@ -1073,10 +1106,23 @@ const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *strin } p++; } - audit_log_format(ab, "\"%s\"", string); + audit_log_n_string(ab, len, string); return p + 1; } +/** + * audit_log_unstrustedstring - log a string that may contain random characters + * @ab: audit_buffer + * @string: string to be logged + * + * Same as audit_log_n_unstrustedstring(), except that strlen is used to + * determine string length. + */ +const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +{ + return audit_log_n_untrustedstring(ab, strlen(string), string); +} + /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, struct dentry *dentry, struct vfsmount *vfsmnt) diff --git a/kernel/audit.h b/kernel/audit.h index 58fa44cb8d01..8323e4132a33 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -104,7 +104,8 @@ static inline int audit_hash_ino(u32 ino) } extern int audit_comparator(const u32 left, const u32 op, const u32 right); -extern int audit_compare_dname_path(const char *dname, const char *path); +extern int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen); extern struct sk_buff * audit_make_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index a536f7148bcd..4c99d2c586ed 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -787,7 +787,7 @@ static void audit_update_watch(struct audit_parent *parent, mutex_lock(&audit_filter_mutex); list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { - if (audit_compare_dname_path(dname, owatch->path)) + if (audit_compare_dname_path(dname, owatch->path, NULL)) continue; /* If the update involves invalidating rules, do the inode-based @@ -1387,7 +1387,8 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) /* Compare given dentry name with last component in given path, * return of 0 indicates a match. */ -int audit_compare_dname_path(const char *dname, const char *path) +int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen) { int dlen, plen; const char *p; @@ -1416,6 +1417,9 @@ int audit_compare_dname_path(const char *dname, const char *path) p++; } + /* return length of path's directory component */ + if (dirlen) + *dirlen = p - path; return strncmp(p, dname, dlen); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 174a3f624892..851ae0217e4b 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -82,6 +82,9 @@ extern int audit_enabled; * path_lookup. */ #define AUDIT_NAMES_RESERVED 7 +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -89,8 +92,9 @@ extern int audit_enabled; * Further, in fs/namei.c:path_lookup() we store the inode and device. */ struct audit_names { const char *name; + int name_len; /* number of name's characters to log */ + unsigned name_put; /* call __putname() for this name */ unsigned long ino; - unsigned long pino; dev_t dev; umode_t mode; uid_t uid; @@ -296,12 +300,10 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_INODE: if (name) - result = (name->ino == f->val || - name->pino == f->val); + result = (name->ino == f->val); else if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val) || - audit_comparator(ctx->names[j].pino, f->op, f->val)) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { ++result; break; } @@ -311,8 +313,7 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_WATCH: if (name && rule->watch->ino != (unsigned long)-1) result = (name->dev == rule->watch->dev && - (name->ino == rule->watch->ino || - name->pino == rule->watch->ino)); + name->ino == rule->watch->ino); break; case AUDIT_LOGINUID: result = 0; @@ -526,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context) #endif for (i = 0; i < context->name_count; i++) { - if (context->names[i].name) + if (context->names[i].name && context->names[i].name_put) __putname(context->names[i].name); } context->name_count = 0; @@ -850,8 +851,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } for (i = 0; i < context->name_count; i++) { - unsigned long ino = context->names[i].ino; - unsigned long pino = context->names[i].pino; + struct audit_names *n = &context->names[i]; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) @@ -859,33 +859,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, "item=%d", i); - audit_log_format(ab, " name="); - if (context->names[i].name) - audit_log_untrustedstring(ab, context->names[i].name); - else - audit_log_format(ab, "(null)"); - - if (pino != (unsigned long)-1) - audit_log_format(ab, " parent=%lu", pino); - if (ino != (unsigned long)-1) - audit_log_format(ab, " inode=%lu", ino); - if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) - audit_log_format(ab, " dev=%02x:%02x mode=%#o" - " ouid=%u ogid=%u rdev=%02x:%02x", - MAJOR(context->names[i].dev), - MINOR(context->names[i].dev), - context->names[i].mode, - context->names[i].uid, - context->names[i].gid, - MAJOR(context->names[i].rdev), - MINOR(context->names[i].rdev)); - if (context->names[i].osid != 0) { + if (n->name) { + switch(n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", context->pwd, + context->pwdmnt); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name_len, + n->name); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#o" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { char *ctx = NULL; u32 len; if (selinux_ctxid_to_string( - context->names[i].osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - context->names[i].osid); + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); call_panic = 2; } else audit_log_format(ab, " obj=%s", ctx); @@ -1075,6 +1089,8 @@ void __audit_getname(const char *name) } BUG_ON(context->name_count >= AUDIT_NAMES); context->names[context->name_count].name = name; + context->names[context->name_count].name_len = AUDIT_NAME_FULL; + context->names[context->name_count].name_put = 1; context->names[context->name_count].ino = (unsigned long)-1; ++context->name_count; if (!context->pwd) { @@ -1141,11 +1157,10 @@ static void audit_inode_context(int idx, const struct inode *inode) * audit_inode - store the inode and device from a lookup * @name: name being audited * @inode: inode being audited - * @flags: lookup flags (as used in path_lookup()) * * Called from fs/namei.c:path_lookup(). */ -void __audit_inode(const char *name, const struct inode *inode, unsigned flags) +void __audit_inode(const char *name, const struct inode *inode) { int idx; struct audit_context *context = current->audit_context; @@ -1171,20 +1186,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags) ++context->ino_count; #endif } + context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; context->names[idx].mode = inode->i_mode; context->names[idx].uid = inode->i_uid; context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; audit_inode_context(idx, inode); - if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && - (strcmp(name, ".") != 0)) { - context->names[idx].ino = (unsigned long)-1; - context->names[idx].pino = inode->i_ino; - } else { - context->names[idx].ino = inode->i_ino; - context->names[idx].pino = (unsigned long)-1; - } } /** @@ -1206,34 +1214,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode, { int idx; struct audit_context *context = current->audit_context; + const char *found_name = NULL; + int dirlen = 0; if (!context->in_syscall) return; /* determine matching parent */ if (!dname) - goto no_match; + goto update_context; for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].pino == pino) { + if (context->names[idx].ino == pino) { const char *name = context->names[idx].name; if (!name) continue; - if (audit_compare_dname_path(dname, name) == 0) - goto update_context; + if (audit_compare_dname_path(dname, name, &dirlen) == 0) { + context->names[idx].name_len = dirlen; + found_name = name; + break; + } } -no_match: - /* catch-all in case match not found */ +update_context: idx = context->name_count++; - context->names[idx].name = NULL; - context->names[idx].pino = pino; #if AUDIT_DEBUG context->ino_count++; #endif + /* Re-use the name belonging to the slot for a matching parent directory. + * All names for this context are relinquished in audit_free_names() */ + context->names[idx].name = found_name; + context->names[idx].name_len = AUDIT_NAME_FULL; + context->names[idx].name_put = 0; /* don't call __putname() */ -update_context: if (inode) { context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; @@ -1242,7 +1256,8 @@ update_context: context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; audit_inode_context(idx, inode); - } + } else + context->names[idx].ino = (unsigned long)-1; } /** -- cgit v1.2.3 From 41757106b9ca7867dafb2404d618f947b4786fd7 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Mon, 12 Jun 2006 07:48:28 -0400 Subject: [PATCH] make set_loginuid obey audit_enabled Hi, I was doing some testing and noticed that when the audit system was disabled, I was still getting messages about the loginuid being set. The following patch makes audit_set_loginuid look at in_syscall to determine if it should create an audit event. The loginuid will continue to be set as long as there is a context. Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/auditsc.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 851ae0217e4b..b097ccb4eb7e 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1290,18 +1290,23 @@ void auditsc_get_stamp(struct audit_context *ctx, */ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - if (task->audit_context) { - struct audit_buffer *ab; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u", - task->pid, task->uid, - task->audit_context->loginuid, loginuid); - audit_log_end(ab); + struct audit_context *context = task->audit_context; + + if (context) { + /* Only log if audit is enabled */ + if (context->in_syscall) { + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (ab) { + audit_log_format(ab, "login pid=%d uid=%u " + "old auid=%u new auid=%u", + task->pid, task->uid, + context->loginuid, loginuid); + audit_log_end(ab); + } } - task->audit_context->loginuid = loginuid; + context->loginuid = loginuid; } return 0; } -- cgit v1.2.3 From d720024e94de4e8b7f10ee83c532926f3ad5d708 Mon Sep 17 00:00:00 2001 From: Michael LeMay Date: Thu, 22 Jun 2006 14:47:17 -0700 Subject: [PATCH] selinux: add hooks for key subsystem Introduce SELinux hooks to support the access key retention subsystem within the kernel. Incorporate new flask headers from a modified version of the SELinux reference policy, with support for the new security class representing retained keys. Extend the "key_alloc" security hook with a task parameter representing the intended ownership context for the key being allocated. Attach security information to root's default keyrings within the SELinux initialization routine. Has passed David's testsuite. Signed-off-by: Michael LeMay Signed-off-by: David Howells Signed-off-by: James Morris Acked-by: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/keys.txt | 29 +++++++++++++ include/linux/key.h | 18 +++++--- include/linux/security.h | 10 +++-- kernel/user.c | 2 +- security/dummy.c | 2 +- security/keys/key.c | 8 ++-- security/keys/keyring.c | 5 ++- security/keys/process_keys.c | 15 ++++--- security/keys/request_key.c | 6 ++- security/keys/request_key_auth.c | 2 +- security/selinux/hooks.c | 64 ++++++++++++++++++++++++++++ security/selinux/include/av_perm_to_string.h | 6 +++ security/selinux/include/av_permissions.h | 8 ++++ security/selinux/include/class_to_string.h | 1 + security/selinux/include/flask.h | 1 + security/selinux/include/objsec.h | 5 +++ 16 files changed, 155 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/Documentation/keys.txt b/Documentation/keys.txt index aaa01b0e3ee9..703020012708 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -19,6 +19,7 @@ This document has the following sections: - Key overview - Key service overview - Key access permissions + - SELinux support - New procfs files - Userspace system call interface - Kernel services @@ -232,6 +233,34 @@ For changing the ownership, group ID or permissions mask, being the owner of the key or having the sysadmin capability is sufficient. +=============== +SELINUX SUPPORT +=============== + +The security class "key" has been added to SELinux so that mandatory access +controls can be applied to keys created within various contexts. This support +is preliminary, and is likely to change quite significantly in the near future. +Currently, all of the basic permissions explained above are provided in SELinux +as well; SE Linux is simply invoked after all basic permission checks have been +performed. + +Each key is labeled with the same context as the task to which it belongs. +Typically, this is the same task that was running when the key was created. +The default keyrings are handled differently, but in a way that is very +intuitive: + + (*) The user and user session keyrings that are created when the user logs in + are currently labeled with the context of the login manager. + + (*) The keyrings associated with new threads are each labeled with the context + of their associated thread, and both session and process keyrings are + handled similarly. + +Note, however, that the default keyrings associated with the root user are +labeled with the default kernel context, since they are created early in the +boot process, before root has a chance to log in. + + ================ NEW PROCFS FILES ================ diff --git a/include/linux/key.h b/include/linux/key.h index cbf464ad9589..8c275d12ef63 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -241,8 +241,9 @@ extern void unregister_key_type(struct key_type *ktype); extern struct key *key_alloc(struct key_type *type, const char *desc, - uid_t uid, gid_t gid, key_perm_t perm, - int not_in_quota); + uid_t uid, gid_t gid, + struct task_struct *ctx, + key_perm_t perm, int not_in_quota); extern int key_payload_reserve(struct key *key, size_t datalen); extern int key_instantiate_and_link(struct key *key, const void *data, @@ -292,7 +293,9 @@ extern int key_unlink(struct key *keyring, struct key *key); extern struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - int not_in_quota, struct key *dest); + struct task_struct *ctx, + int not_in_quota, + struct key *dest); extern int keyring_clear(struct key *keyring); @@ -313,7 +316,8 @@ extern void keyring_replace_payload(struct key *key, void *replacement); * the userspace interface */ extern struct key root_user_keyring, root_session_keyring; -extern int alloc_uid_keyring(struct user_struct *user); +extern int alloc_uid_keyring(struct user_struct *user, + struct task_struct *ctx); extern void switch_uid_keyring(struct user_struct *new_user); extern int copy_keys(unsigned long clone_flags, struct task_struct *tsk); extern int copy_thread_group_keys(struct task_struct *tsk); @@ -342,7 +346,7 @@ extern void key_init(void); #define make_key_ref(k) ({ NULL; }) #define key_ref_to_ptr(k) ({ NULL; }) #define is_key_possessed(k) 0 -#define alloc_uid_keyring(u) 0 +#define alloc_uid_keyring(u,c) 0 #define switch_uid_keyring(u) do { } while(0) #define __install_session_keyring(t, k) ({ NULL; }) #define copy_keys(f,t) 0 @@ -355,6 +359,10 @@ extern void key_init(void); #define key_fsgid_changed(t) do { } while(0) #define key_init() do { } while(0) +/* Initial keyrings */ +extern struct key root_user_keyring; +extern struct key root_session_keyring; + #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ #endif /* _LINUX_KEY_H */ diff --git a/include/linux/security.h b/include/linux/security.h index 4dfb1b84a9b3..47722d355532 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1313,7 +1313,7 @@ struct security_operations { /* key management security hooks */ #ifdef CONFIG_KEYS - int (*key_alloc)(struct key *key); + int (*key_alloc)(struct key *key, struct task_struct *tsk); void (*key_free)(struct key *key); int (*key_permission)(key_ref_t key_ref, struct task_struct *context, @@ -3008,9 +3008,10 @@ static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY -static inline int security_key_alloc(struct key *key) +static inline int security_key_alloc(struct key *key, + struct task_struct *tsk) { - return security_ops->key_alloc(key); + return security_ops->key_alloc(key, tsk); } static inline void security_key_free(struct key *key) @@ -3027,7 +3028,8 @@ static inline int security_key_permission(key_ref_t key_ref, #else -static inline int security_key_alloc(struct key *key) +static inline int security_key_alloc(struct key *key, + struct task_struct *tsk) { return 0; } diff --git a/kernel/user.c b/kernel/user.c index 4b1eb745afa1..6408c0424291 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) new->mq_bytes = 0; new->locked_shm = 0; - if (alloc_uid_keyring(new) < 0) { + if (alloc_uid_keyring(new, current) < 0) { kmem_cache_free(uid_cachep, new); return NULL; } diff --git a/security/dummy.c b/security/dummy.c index 64f6da0f422e..6de4a4a5eb13 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -860,7 +860,7 @@ static int dummy_setprocattr(struct task_struct *p, char *name, void *value, siz } #ifdef CONFIG_KEYS -static inline int dummy_key_alloc(struct key *key) +static inline int dummy_key_alloc(struct key *key, struct task_struct *ctx) { return 0; } diff --git a/security/keys/key.c b/security/keys/key.c index 3fdc49c6a02c..14a15abb7735 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -247,8 +247,8 @@ static inline void key_alloc_serial(struct key *key) * instantiate the key or discard it before returning */ struct key *key_alloc(struct key_type *type, const char *desc, - uid_t uid, gid_t gid, key_perm_t perm, - int not_in_quota) + uid_t uid, gid_t gid, struct task_struct *ctx, + key_perm_t perm, int not_in_quota) { struct key_user *user = NULL; struct key *key; @@ -318,7 +318,7 @@ struct key *key_alloc(struct key_type *type, const char *desc, #endif /* let the security module know about the key */ - ret = security_key_alloc(key); + ret = security_key_alloc(key, ctx); if (ret < 0) goto security_error; @@ -822,7 +822,7 @@ key_ref_t key_create_or_update(key_ref_t keyring_ref, /* allocate a new key */ key = key_alloc(ktype, description, current->fsuid, current->fsgid, - perm, not_in_quota); + current, perm, not_in_quota); if (IS_ERR(key)) { key_ref = ERR_PTR(PTR_ERR(key)); goto error_3; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index bffa924c1f88..1357207fc9df 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -240,13 +240,14 @@ static long keyring_read(const struct key *keyring, * allocate a keyring and link into the destination keyring */ struct key *keyring_alloc(const char *description, uid_t uid, gid_t gid, - int not_in_quota, struct key *dest) + struct task_struct *ctx, int not_in_quota, + struct key *dest) { struct key *keyring; int ret; keyring = key_alloc(&key_type_keyring, description, - uid, gid, + uid, gid, ctx, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL, not_in_quota); diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 217a0bef3c82..a50a91332fe1 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -67,7 +67,8 @@ struct key root_session_keyring = { /* * allocate the keyrings to be associated with a UID */ -int alloc_uid_keyring(struct user_struct *user) +int alloc_uid_keyring(struct user_struct *user, + struct task_struct *ctx) { struct key *uid_keyring, *session_keyring; char buf[20]; @@ -76,7 +77,7 @@ int alloc_uid_keyring(struct user_struct *user) /* concoct a default session keyring */ sprintf(buf, "_uid_ses.%u", user->uid); - session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, 0, NULL); + session_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error; @@ -86,7 +87,7 @@ int alloc_uid_keyring(struct user_struct *user) * keyring */ sprintf(buf, "_uid.%u", user->uid); - uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, 0, + uid_keyring = keyring_alloc(buf, user->uid, (gid_t) -1, ctx, 0, session_keyring); if (IS_ERR(uid_keyring)) { key_put(session_keyring); @@ -143,7 +144,7 @@ int install_thread_keyring(struct task_struct *tsk) sprintf(buf, "_tid.%u", tsk->pid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL); + keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error; @@ -177,7 +178,7 @@ int install_process_keyring(struct task_struct *tsk) if (!tsk->signal->process_keyring) { sprintf(buf, "_pid.%u", tsk->tgid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL); + keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error; @@ -217,7 +218,7 @@ static int install_session_keyring(struct task_struct *tsk, if (!keyring) { sprintf(buf, "_ses.%u", tsk->tgid); - keyring = keyring_alloc(buf, tsk->uid, tsk->gid, 1, NULL); + keyring = keyring_alloc(buf, tsk->uid, tsk->gid, tsk, 1, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } @@ -717,7 +718,7 @@ long join_session_keyring(const char *name) keyring = find_keyring_by_name(name, 0); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ - keyring = keyring_alloc(name, tsk->uid, tsk->gid, 0, NULL); + keyring = keyring_alloc(name, tsk->uid, tsk->gid, tsk, 0, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index f030a0ccbb93..eab66a06ca53 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -48,7 +48,8 @@ static int call_sbin_request_key(struct key *key, /* allocate a new session keyring */ sprintf(desc, "_req.%u", key->serial); - keyring = keyring_alloc(desc, current->fsuid, current->fsgid, 1, NULL); + keyring = keyring_alloc(desc, current->fsuid, current->fsgid, + current, 1, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error_alloc; @@ -137,7 +138,8 @@ static struct key *__request_key_construction(struct key_type *type, /* create a key and add it to the queue */ key = key_alloc(type, description, - current->fsuid, current->fsgid, KEY_POS_ALL, 0); + current->fsuid, current->fsgid, + current, KEY_POS_ALL, 0); if (IS_ERR(key)) goto alloc_failed; diff --git a/security/keys/request_key_auth.c b/security/keys/request_key_auth.c index cce6ba6b0323..0ecc2e8d2bd0 100644 --- a/security/keys/request_key_auth.c +++ b/security/keys/request_key_auth.c @@ -148,7 +148,7 @@ struct key *request_key_auth_new(struct key *target, const char *callout_info) sprintf(desc, "%x", target->serial); authkey = key_alloc(&key_type_request_key_auth, desc, - current->fsuid, current->fsgid, + current->fsuid, current->fsgid, current, KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, 1); if (IS_ERR(authkey)) { diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 54adc9d31e92..524915dfda64 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4252,6 +4252,57 @@ static int selinux_setprocattr(struct task_struct *p, return size; } +#ifdef CONFIG_KEYS + +static int selinux_key_alloc(struct key *k, struct task_struct *tsk) +{ + struct task_security_struct *tsec = tsk->security; + struct key_security_struct *ksec; + + ksec = kzalloc(sizeof(struct key_security_struct), GFP_KERNEL); + if (!ksec) + return -ENOMEM; + + ksec->obj = k; + ksec->sid = tsec->sid; + k->security = ksec; + + return 0; +} + +static void selinux_key_free(struct key *k) +{ + struct key_security_struct *ksec = k->security; + + k->security = NULL; + kfree(ksec); +} + +static int selinux_key_permission(key_ref_t key_ref, + struct task_struct *ctx, + key_perm_t perm) +{ + struct key *key; + struct task_security_struct *tsec; + struct key_security_struct *ksec; + + key = key_ref_to_ptr(key_ref); + + tsec = ctx->security; + ksec = key->security; + + /* if no specific permissions are requested, we skip the + permission check. No serious, additional covert channels + appear to be created. */ + if (perm == 0) + return 0; + + return avc_has_perm(tsec->sid, ksec->sid, + SECCLASS_KEY, perm, NULL); +} + +#endif + static struct security_operations selinux_ops = { .ptrace = selinux_ptrace, .capget = selinux_capget, @@ -4406,6 +4457,12 @@ static struct security_operations selinux_ops = { .xfrm_state_delete_security = selinux_xfrm_state_delete, .xfrm_policy_lookup = selinux_xfrm_policy_lookup, #endif + +#ifdef CONFIG_KEYS + .key_alloc = selinux_key_alloc, + .key_free = selinux_key_free, + .key_permission = selinux_key_permission, +#endif }; static __init int selinux_init(void) @@ -4441,6 +4498,13 @@ static __init int selinux_init(void) } else { printk(KERN_INFO "SELinux: Starting in permissive mode\n"); } + +#ifdef CONFIG_KEYS + /* Add security information to initial keyrings */ + security_key_alloc(&root_user_keyring, current); + security_key_alloc(&root_session_keyring, current); +#endif + return 0; } diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index 70ee65a58817..bc020bde6c86 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h @@ -242,3 +242,9 @@ S_(SECCLASS_PACKET, PACKET__SEND, "send") S_(SECCLASS_PACKET, PACKET__RECV, "recv") S_(SECCLASS_PACKET, PACKET__RELABELTO, "relabelto") + S_(SECCLASS_KEY, KEY__VIEW, "view") + S_(SECCLASS_KEY, KEY__READ, "read") + S_(SECCLASS_KEY, KEY__WRITE, "write") + S_(SECCLASS_KEY, KEY__SEARCH, "search") + S_(SECCLASS_KEY, KEY__LINK, "link") + S_(SECCLASS_KEY, KEY__SETATTR, "setattr") diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index 1d9cf3d306bc..1205227a3a33 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h @@ -959,3 +959,11 @@ #define PACKET__SEND 0x00000001UL #define PACKET__RECV 0x00000002UL #define PACKET__RELABELTO 0x00000004UL + +#define KEY__VIEW 0x00000001UL +#define KEY__READ 0x00000002UL +#define KEY__WRITE 0x00000004UL +#define KEY__SEARCH 0x00000008UL +#define KEY__LINK 0x00000010UL +#define KEY__SETATTR 0x00000020UL + diff --git a/security/selinux/include/class_to_string.h b/security/selinux/include/class_to_string.h index 3aec75fee4f7..24303b61309f 100644 --- a/security/selinux/include/class_to_string.h +++ b/security/selinux/include/class_to_string.h @@ -60,3 +60,4 @@ S_("netlink_kobject_uevent_socket") S_("appletalk_socket") S_("packet") + S_("key") diff --git a/security/selinux/include/flask.h b/security/selinux/include/flask.h index a0eb9e281d18..95887aed2a68 100644 --- a/security/selinux/include/flask.h +++ b/security/selinux/include/flask.h @@ -62,6 +62,7 @@ #define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 55 #define SECCLASS_APPLETALK_SOCKET 56 #define SECCLASS_PACKET 57 +#define SECCLASS_KEY 58 /* * Security identifier indices for initial entities diff --git a/security/selinux/include/objsec.h b/security/selinux/include/objsec.h index 54c030778882..8f5547ad1856 100644 --- a/security/selinux/include/objsec.h +++ b/security/selinux/include/objsec.h @@ -99,6 +99,11 @@ struct sk_security_struct { u32 peer_sid; /* SID of peer */ }; +struct key_security_struct { + struct key *obj; /* back pointer */ + u32 sid; /* SID of key */ +}; + extern unsigned int selinux_checkreqprot; #endif /* _SELINUX_OBJSEC_H_ */ -- cgit v1.2.3 From 6cc0719181a7aa8883855140541e7892250e66af Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 22 Jun 2006 14:47:18 -0700 Subject: [PATCH] suspend_console() warning fix kernel/power/main.c: In function 'suspend_prepare': kernel/power/main.c:89: warning: implicit declaration of function 'suspend_console' kernel/power/main.c: In function 'suspend_finish': kernel/power/main.c:137: warning: implicit declaration of function 'resume_console' Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 0a907f0dc56b..cdf0f07af92f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,7 +15,7 @@ #include #include #include - +#include #include "power.h" -- cgit v1.2.3 From de047c1bcd7f7bcfbdc29eb5b439fb332594da3f Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Thu, 22 Jun 2006 14:47:26 -0700 Subject: [PATCH] avoid tasklist_lock at getrusage for multithreaded case too Avoid taking tasklist_lock for at getrusage for the multithreaded case too. We don't need to take the tasklist lock for thread traversal of a process since Oleg's do-__unhash_process-under-siglock.patch and related work. Signed-off-by: Ravikiran Thirumalai Cc: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 56 ++++++++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936f..fc9ebbbaba0c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1860,23 +1860,20 @@ out: * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * - * tasklist_lock locking optimisation: - * If we are current and single threaded, we do not need to take the tasklist - * lock or the siglock. No one else can take our signal_struct away, - * no one else can reap the children to update signal->c* counters, and - * no one else can race with the signal-> fields. - * If we do not take the tasklist_lock, the signal-> fields could be read - * out of order while another thread was just exiting. So we place a - * read memory barrier when we avoid the lock. On the writer side, - * write memory barrier is implied in __exit_signal as __exit_signal releases - * the siglock spinlock after updating the signal-> fields. - * - * We don't really need the siglock when we access the non c* fields - * of the signal_struct (for RUSAGE_SELF) even in multithreaded - * case, since we take the tasklist lock for read and the non c* signal-> - * fields are updated only in __exit_signal, which is called with - * tasklist_lock taken for write, hence these two threads cannot execute - * concurrently. + * Locking: + * We need to take the siglock for CHILDEREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded + * non-current multithreaded. Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we donot need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. + * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. * */ @@ -1885,35 +1882,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; - int need_lock = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; - if (p != current || !thread_group_empty(p)) - need_lock = 1; - - if (need_lock) { - read_lock(&tasklist_lock); - if (unlikely(!p->signal)) { - read_unlock(&tasklist_lock); - return; - } - } else - /* See locking comments above */ - smp_rmb(); + rcu_read_lock(); + if (!lock_task_sighand(p, &flags)) { + rcu_read_unlock(); + return; + } switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: - spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; stime = p->signal->cstime; r->ru_nvcsw = p->signal->cnvcsw; r->ru_nivcsw = p->signal->cnivcsw; r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; - spin_unlock_irqrestore(&p->sighand->siglock, flags); if (who == RUSAGE_CHILDREN) break; @@ -1941,8 +1928,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) BUG(); } - if (need_lock) - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); + rcu_read_unlock(); + cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } -- cgit v1.2.3 From 454e2398be9b9fa30433fccc548db34d19aa9958 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:57 -0700 Subject: [PATCH] VFS: Permit filesystem to override root dentry on mount Extend the get_sb() filesystem operation to take an extra argument that permits the VFS to pass in the target vfsmount that defines the mountpoint. The filesystem is then required to manually set the superblock and root dentry pointers. For most filesystems, this should be done with simple_set_mnt() which will set the superblock pointer and then set the root dentry to the superblock's s_root (as per the old default behaviour). The get_sb() op now returns an integer as there's now no need to return the superblock pointer. This patch permits a superblock to be implicitly shared amongst several mount points, such as can be done with NFS to avoid potential inode aliasing. In such a case, simple_set_mnt() would not be called, and instead the mnt_root and mnt_sb would be set directly. The patch also makes the following changes: (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount pointer argument and return an integer, so most filesystems have to change very little. (*) If one of the convenience function is not used, then get_sb() should normally call simple_set_mnt() to instantiate the vfsmount. This will always return 0, and so can be tail-called from get_sb(). (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the dcache upon superblock destruction rather than shrink_dcache_anon(). This is required because the superblock may now have multiple trees that aren't actually bound to s_root, but that still need to be cleaned up. The currently called functions assume that the whole tree is rooted at s_root, and that anonymous dentries are not the roots of trees which results in dentries being left unculled. However, with the way NFS superblock sharing are currently set to be implemented, these assumptions are violated: the root of the filesystem is simply a dummy dentry and inode (the real inode for '/' may well be inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries with child trees. [*] Anonymous until discovered from another tree. (*) The documentation has been adjusted, including the additional bit of changing ext2_* into foo_* in the documentation. [akpm@osdl.org: convert ipath_fs, do other stuff] Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Cc: Roland Dreier Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 7 +- Documentation/filesystems/porting | 7 +- Documentation/filesystems/vfs.txt | 4 +- arch/ia64/kernel/perfmon.c | 7 +- arch/powerpc/platforms/cell/spufs/inode.c | 6 +- drivers/infiniband/core/uverbs_main.c | 7 +- drivers/infiniband/hw/ipath/ipath_fs.c | 13 ++-- drivers/isdn/capi/capifs.c | 6 +- drivers/misc/ibmasm/ibmasmfs.c | 7 +- drivers/oprofile/oprofilefs.c | 6 +- drivers/usb/core/inode.c | 6 +- drivers/usb/gadget/inode.c | 6 +- fs/9p/vfs_super.c | 21 +++--- fs/adfs/super.c | 7 +- fs/affs/super.c | 7 +- fs/afs/super.c | 24 ++++--- fs/autofs/init.c | 6 +- fs/autofs4/init.c | 6 +- fs/befs/linuxvfs.c | 7 +- fs/bfs/inode.c | 6 +- fs/binfmt_misc.c | 6 +- fs/block_dev.c | 6 +- fs/cifs/cifsfs.c | 10 +-- fs/coda/inode.c | 6 +- fs/configfs/mount.c | 6 +- fs/cramfs/inode.c | 7 +- fs/dcache.c | 40 ----------- fs/debugfs/inode.c | 8 +-- fs/devfs/base.c | 8 +-- fs/devpts/inode.c | 6 +- fs/efs/super.c | 6 +- fs/eventpoll.c | 13 ++-- fs/ext2/super.c | 6 +- fs/ext3/super.c | 6 +- fs/freevxfs/vxfs_super.c | 7 +- fs/fuse/inode.c | 8 +-- fs/hfs/super.c | 7 +- fs/hfsplus/super.c | 8 ++- fs/hostfs/hostfs_kern.c | 8 +-- fs/hpfs/super.c | 7 +- fs/hppfs/hppfs_kern.c | 8 +-- fs/hugetlbfs/inode.c | 6 +- fs/inotify_user.c | 6 +- fs/isofs/inode.c | 7 +- fs/jffs/inode-v23.c | 7 +- fs/jffs2/super.c | 49 ++++++++------ fs/jfs/super.c | 7 +- fs/libfs.c | 12 ++-- fs/minix/inode.c | 7 +- fs/msdos/namei.c | 9 +-- fs/namespace.c | 9 +++ fs/ncpfs/inode.c | 6 +- fs/nfs/inode.c | 96 +++++++++++++++----------- fs/nfsd/nfsctl.c | 6 +- fs/ntfs/super.c | 7 +- fs/ocfs2/dlm/dlmfs.c | 6 +- fs/ocfs2/super.c | 12 ++-- fs/openpromfs/inode.c | 6 +- fs/pipe.c | 9 ++- fs/proc/root.c | 6 +- fs/qnx4/inode.c | 7 +- fs/ramfs/inode.c | 13 ++-- fs/reiserfs/super.c | 9 +-- fs/romfs/inode.c | 7 +- fs/smbfs/inode.c | 6 +- fs/super.c | 109 +++++++++++++++++------------- fs/sysfs/mount.c | 6 +- fs/sysv/super.c | 13 ++-- fs/udf/super.c | 6 +- fs/ufs/super.c | 6 +- fs/vfat/namei.c | 9 +-- fs/xfs/linux-2.6/xfs_super.c | 8 ++- include/linux/dcache.h | 1 - include/linux/fs.h | 25 ++++--- include/linux/ramfs.h | 4 +- ipc/mqueue.c | 8 +-- kernel/cpuset.c | 8 +-- kernel/futex.c | 8 +-- mm/shmem.c | 6 +- net/socket.c | 7 +- net/sunrpc/rpc_pipe.c | 6 +- security/inode.c | 8 +-- security/selinux/selinuxfs.c | 7 +- 83 files changed, 482 insertions(+), 431 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1045da582b9b..3abf08f1b14a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -142,15 +142,16 @@ see also dquot_operations section. --------------------------- file_system_type --------------------------- prototypes: - struct super_block *(*get_sb) (struct file_system_type *, int, - const char *, void *); + struct int (*get_sb) (struct file_system_type *, int, + const char *, void *, struct vfsmount *); void (*kill_sb) (struct super_block *); locking rules: may block BKL get_sb yes yes kill_sb yes yes -->get_sb() returns error or a locked superblock (exclusive on ->s_umount). +->get_sb() returns error or 0 with locked superblock attached to the vfsmount +(exclusive on ->s_umount). ->kill_sb() takes a write-locked superblock, does all shutdown work on it, unlocks and drops the reference. diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 2f388460cbe7..5531694059ab 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -50,10 +50,11 @@ Turn your foo_read_super() into a function that would return 0 in case of success and negative number in case of error (-EINVAL unless you have more informative error value to report). Call it foo_fill_super(). Now declare -struct super_block foo_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +int foo_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super, + mnt); } (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 3a2e5520c1e3..dd7d0dcedc87 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -113,8 +113,8 @@ members are defined: struct file_system_type { const char *name; int fs_flags; - struct super_block *(*get_sb) (struct file_system_type *, int, - const char *, void *); + struct int (*get_sb) (struct file_system_type *, int, + const char *, void *, struct vfsmount *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 077f21216b65..2359e2809f50 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -595,10 +595,11 @@ pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, } -static struct super_block * -pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) +static int +pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC); + return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); } static struct file_system_type pfm_fs_type = { diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index 1987697b23a0..7b4572805db9 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -436,11 +436,11 @@ spufs_fill_super(struct super_block *sb, void *data, int silent) return spufs_create_root(sb, data); } -static struct super_block * +static int spufs_get_sb(struct file_system_type *fstype, int flags, - const char *name, void *data) + const char *name, void *data, struct vfsmount *mnt) { - return get_sb_single(fstype, flags, data, spufs_fill_super); + return get_sb_single(fstype, flags, data, spufs_fill_super, mnt); } static struct file_system_type spufs_type = { diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 5ec2d49e9bb6..e57d3c50f75f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -821,11 +821,12 @@ static void ib_uverbs_remove_one(struct ib_device *device) kref_put(&uverbs_dev->ref, ib_uverbs_release_dev); } -static struct super_block *uverbs_event_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int uverbs_event_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + struct vfsmount *mnt) { return get_sb_pseudo(fs_type, "infinibandevent:", NULL, - INFINIBANDEVENTFS_MAGIC); + INFINIBANDEVENTFS_MAGIC, mnt); } static struct file_system_type uverbs_event_fs = { diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c index e274120567e1..63de3046aff3 100644 --- a/drivers/infiniband/hw/ipath/ipath_fs.c +++ b/drivers/infiniband/hw/ipath/ipath_fs.c @@ -542,13 +542,14 @@ bail: return ret; } -static struct super_block *ipathfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int ipathfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) { - ipath_super = get_sb_single(fs_type, flags, data, - ipathfs_fill_super); - return ipath_super; + int ret = get_sb_single(fs_type, flags, data, + ipathfs_fill_super, mnt); + if (ret >= 0) + ipath_super = mnt->mnt_sb; + return ret; } static void ipathfs_kill_super(struct super_block *s) diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c index 0a37aded4b54..9ea6bd0ddc35 100644 --- a/drivers/isdn/capi/capifs.c +++ b/drivers/isdn/capi/capifs.c @@ -121,10 +121,10 @@ fail: return -ENOMEM; } -static struct super_block *capifs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int capifs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, capifs_fill_super); + return get_sb_single(fs_type, flags, data, capifs_fill_super, mnt); } static struct file_system_type capifs_fs_type = { diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index 26a230b6ff80..4a35caff5d02 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -90,10 +90,11 @@ static void ibmasmfs_create_files (struct super_block *sb, struct dentry *root); static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent); -static struct super_block *ibmasmfs_get_super(struct file_system_type *fst, - int flags, const char *name, void *data) +static int ibmasmfs_get_super(struct file_system_type *fst, + int flags, const char *name, void *data, + struct vfsmount *mnt) { - return get_sb_single(fst, flags, data, ibmasmfs_fill_super); + return get_sb_single(fst, flags, data, ibmasmfs_fill_super, mnt); } static struct super_operations ibmasmfs_s_ops = { diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index b62da9b0cbf0..71c2da277d6e 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -272,10 +272,10 @@ static int oprofilefs_fill_super(struct super_block * sb, void * data, int silen } -static struct super_block *oprofilefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int oprofilefs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, oprofilefs_fill_super); + return get_sb_single(fs_type, flags, data, oprofilefs_fill_super, mnt); } diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 3cf945cc5b9a..95f5ad923b0f 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -543,10 +543,10 @@ static void fs_remove_file (struct dentry *dentry) /* --------------------------------------------------------------------- */ -static struct super_block *usb_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int usb_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, usbfs_fill_super); + return get_sb_single(fs_type, flags, data, usbfs_fill_super, mnt); } static struct file_system_type usb_fs_type = { diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index aef0722b8f17..3bdc5e3ba234 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -2070,11 +2070,11 @@ enomem0: } /* "mount -t gadgetfs path /dev/gadget" ends up here */ -static struct super_block * +static int gadgetfs_get_sb (struct file_system_type *t, int flags, - const char *path, void *opts) + const char *path, void *opts, struct vfsmount *mnt) { - return get_sb_single (t, flags, opts, gadgetfs_fill_super); + return get_sb_single (t, flags, opts, gadgetfs_fill_super, mnt); } static void diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 61c599b4a1e3..872943004e59 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -99,12 +99,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, * @flags: mount flags * @dev_name: device name that was mounted * @data: mount options + * @mnt: mountpoint record to be instantiated * */ -static struct super_block *v9fs_get_sb(struct file_system_type - *fs_type, int flags, - const char *dev_name, void *data) +static int v9fs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + struct vfsmount *mnt) { struct super_block *sb = NULL; struct v9fs_fcall *fcall = NULL; @@ -123,17 +124,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) - return ERR_PTR(-ENOMEM); + return -ENOMEM; if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); - sb = ERR_PTR(newfid); + retval = newfid; goto out_free_session; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); - if (IS_ERR(sb)) + if (IS_ERR(sb)) { + retval = PTR_ERR(sb); goto out_close_session; + } v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); @@ -184,19 +187,19 @@ static struct super_block *v9fs_get_sb(struct file_system_type goto put_back_sb; } - return sb; + return simple_set_mnt(mnt, sb); out_close_session: v9fs_session_close(v9ses); out_free_session: kfree(v9ses); - return sb; + return retval; put_back_sb: /* deactivate_super calls v9fs_kill_super which will frees the rest */ up_write(&sb->s_umount); deactivate_super(sb); - return ERR_PTR(retval); + return retval; } /** diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 252abda0d200..1b58a9b7f6aa 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -470,10 +470,11 @@ error: return -EINVAL; } -static struct super_block *adfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int adfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, adfs_fill_super, + mnt); } static struct file_system_type adfs_fs_type = { diff --git a/fs/affs/super.c b/fs/affs/super.c index 4d7e5b19e5cd..6a52e7875403 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -524,10 +524,11 @@ affs_statfs(struct super_block *sb, struct kstatfs *buf) return 0; } -static struct super_block *affs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int affs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, affs_fill_super, + mnt); } static struct file_system_type affs_fs_type = { diff --git a/fs/afs/super.c b/fs/afs/super.c index 53c56e7231ab..82468df0ba54 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -38,9 +38,9 @@ struct afs_mount_params { static void afs_i_init_once(void *foo, kmem_cache_t *cachep, unsigned long flags); -static struct super_block *afs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data); +static int afs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt); static struct inode *afs_alloc_inode(struct super_block *sb); @@ -294,10 +294,11 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent) * get an AFS superblock * - TODO: don't use get_sb_nodev(), but rather call sget() directly */ -static struct super_block *afs_get_sb(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *options) +static int afs_get_sb(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *options, + struct vfsmount *mnt) { struct afs_mount_params params; struct super_block *sb; @@ -311,7 +312,7 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type, ret = afscm_start(); if (ret < 0) { _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; } /* parse the options */ @@ -348,18 +349,19 @@ static struct super_block *afs_get_sb(struct file_system_type *fs_type, goto error; } sb->s_flags |= MS_ACTIVE; + simple_set_mnt(mnt, sb); afs_put_volume(params.volume); afs_put_cell(params.default_cell); - _leave(" = %p", sb); - return sb; + _leave(" = 0 [%p]", 0, sb); + return 0; error: afs_put_volume(params.volume); afs_put_cell(params.default_cell); afscm_stop(); _leave(" = %d", ret); - return ERR_PTR(ret); + return ret; } /* end afs_get_sb() */ /*****************************************************************************/ diff --git a/fs/autofs/init.c b/fs/autofs/init.c index b977ece69f0c..aca123752406 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -14,10 +14,10 @@ #include #include "autofs_i.h" -static struct super_block *autofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int autofs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, autofs_fill_super); + return get_sb_nodev(fs_type, flags, data, autofs_fill_super, mnt); } static struct file_system_type autofs_fs_type = { diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index acecec8578ce..5d9193332bef 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -14,10 +14,10 @@ #include #include "autofs_i.h" -static struct super_block *autofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int autofs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, autofs4_fill_super); + return get_sb_nodev(fs_type, flags, data, autofs4_fill_super, mnt); } static struct file_system_type autofs_fs_type = { diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 68ebd10f345d..6ed07a5f10c6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -899,11 +899,12 @@ befs_statfs(struct super_block *sb, struct kstatfs *buf) return 0; } -static struct super_block * +static int befs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, - void *data) + void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, befs_fill_super, + mnt); } static struct file_system_type befs_fs_type = { diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 55a7a78332f8..e7da03f63a5a 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -410,10 +410,10 @@ out: return -EINVAL; } -static struct super_block *bfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, bfs_fill_super, mnt); } static struct file_system_type bfs_fs_type = { diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 599f36fd0f67..07a4996cca3f 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -739,10 +739,10 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent) return err; } -static struct super_block *bm_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bm_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, bm_fill_super); + return get_sb_single(fs_type, flags, data, bm_fill_super, mnt); } static struct linux_binfmt misc_format = { diff --git a/fs/block_dev.c b/fs/block_dev.c index 44aaba202f78..028d9fb9c2d5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -300,10 +300,10 @@ static struct super_operations bdev_sops = { .clear_inode = bdev_clear_inode, }; -static struct super_block *bd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bd_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576); + return get_sb_pseudo(fs_type, "bdev:", &bdev_sops, 0x62646576, mnt); } static struct file_system_type bd_type = { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c262d8874ce9..08b35801dfed 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -460,9 +460,9 @@ struct super_operations cifs_super_ops = { .remount_fs = cifs_remount, }; -static struct super_block * +static int cifs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { int rc; struct super_block *sb = sget(fs_type, NULL, set_anon_super, NULL); @@ -470,7 +470,7 @@ cifs_get_sb(struct file_system_type *fs_type, cFYI(1, ("Devname: %s flags: %d ", dev_name, flags)); if (IS_ERR(sb)) - return sb; + return PTR_ERR(sb); sb->s_flags = flags; @@ -478,10 +478,10 @@ cifs_get_sb(struct file_system_type *fs_type, if (rc) { up_write(&sb->s_umount); deactivate_super(sb); - return ERR_PTR(rc); + return rc; } sb->s_flags |= MS_ACTIVE; - return sb; + return simple_set_mnt(mnt, sb); } static ssize_t cifs_file_writev(struct file *file, const struct iovec *iov, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index ada1a81df6bd..cba70201567d 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -307,10 +307,10 @@ static int coda_statfs(struct super_block *sb, struct kstatfs *buf) /* init_coda: used by filesystems.c to register coda */ -static struct super_block *coda_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int coda_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, coda_fill_super); + return get_sb_nodev(fs_type, flags, data, coda_fill_super, mnt); } struct file_system_type coda_fs_type = { diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index f920d30478e5..94dab7bdd851 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -103,10 +103,10 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *configfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int configfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, configfs_fill_super); + return get_sb_single(fs_type, flags, data, configfs_fill_super, mnt); } static struct file_system_type configfs_fs_type = { diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 9efcc3a164e8..37a91a153aa5 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -528,10 +528,11 @@ static struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; -static struct super_block *cramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int cramfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, cramfs_fill_super, + mnt); } static struct file_system_type cramfs_fs_type = { diff --git a/fs/dcache.c b/fs/dcache.c index 59dbc92c2079..313b54b2b8f2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -687,46 +687,6 @@ void shrink_dcache_parent(struct dentry * parent) prune_dcache(found, parent->d_sb); } -/** - * shrink_dcache_anon - further prune the cache - * @head: head of d_hash list of dentries to prune - * - * Prune the dentries that are anonymous - * - * parsing d_hash list does not hlist_for_each_entry_rcu() as it - * done under dcache_lock. - * - */ -void shrink_dcache_anon(struct super_block *sb) -{ - struct hlist_node *lp; - struct hlist_head *head = &sb->s_anon; - int found; - do { - found = 0; - spin_lock(&dcache_lock); - hlist_for_each(lp, head) { - struct dentry *this = hlist_entry(lp, struct dentry, d_hash); - if (!list_empty(&this->d_lru)) { - dentry_stat.nr_unused--; - list_del_init(&this->d_lru); - } - - /* - * move only zero ref count dentries to the end - * of the unused list for prune_dcache - */ - if (!atomic_read(&this->d_count)) { - list_add_tail(&this->d_lru, &dentry_unused); - dentry_stat.nr_unused++; - found++; - } - } - spin_unlock(&dcache_lock); - prune_dcache(found, sb); - } while(found); -} - /* * Scan `nr' dentries and return the number which remain. * diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b55b4ea9a676..440128ebef3b 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -111,11 +111,11 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent) return simple_fill_super(sb, DEBUGFS_MAGIC, debug_files); } -static struct super_block *debug_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int debug_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, debug_fill_super); + return get_sb_single(fs_type, flags, data, debug_fill_super, mnt); } static struct file_system_type debug_fs_type = { diff --git a/fs/devfs/base.c b/fs/devfs/base.c index 52f5059c4f31..51a97f132745 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -2549,11 +2549,11 @@ static int devfs_fill_super(struct super_block *sb, void *data, int silent) return -EINVAL; } /* End Function devfs_fill_super */ -static struct super_block *devfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int devfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, devfs_fill_super); + return get_sb_single(fs_type, flags, data, devfs_fill_super, mnt); } static struct file_system_type devfs_fs_type = { diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 14c5620b5cab..f7aef5bb584a 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -130,10 +130,10 @@ fail: return -ENOMEM; } -static struct super_block *devpts_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int devpts_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, devpts_fill_super); + return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt); } static struct file_system_type devpts_fs_type = { diff --git a/fs/efs/super.c b/fs/efs/super.c index dff623e3ddbf..1ba5e14f879f 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -18,10 +18,10 @@ static int efs_statfs(struct super_block *s, struct kstatfs *buf); static int efs_fill_super(struct super_block *s, void *d, int silent); -static struct super_block *efs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int efs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, efs_fill_super, mnt); } static struct file_system_type efs_fs_type = { diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2695337d4d64..08e7e6a555ca 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -268,9 +268,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout); static int eventpollfs_delete_dentry(struct dentry *dentry); static struct inode *ep_eventpoll_inode(void); -static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data); +static int eventpollfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt); /* * This semaphore is used to serialize ep_free() and eventpoll_release_file(). @@ -1595,11 +1595,12 @@ eexit_1: } -static struct super_block * +static int eventpollfs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) + const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC); + return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC, + mnt); } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7e30bae174ed..a4dfffac5967 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1087,10 +1087,10 @@ static int ext2_statfs (struct super_block * sb, struct kstatfs * buf) return 0; } -static struct super_block *ext2_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ext2_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super, mnt); } #ifdef CONFIG_QUOTA diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f8a5266ea1ff..657f8e73b62f 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2646,10 +2646,10 @@ out: #endif -static struct super_block *ext3_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ext3_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ext3_fill_super, mnt); } static struct file_system_type ext3_fs_type = { diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index b44c916d24a1..d76eeaafbde2 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -241,10 +241,11 @@ out: /* * The usual module blurb. */ -static struct super_block *vxfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int vxfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, vxfs_fill_super, + mnt); } static struct file_system_type vxfs_fs_type = { diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7627022446b2..c91f0a50aadb 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -569,11 +569,11 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return err; } -static struct super_block *fuse_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) +static int fuse_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *raw_data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super); + return get_sb_nodev(fs_type, flags, raw_data, fuse_fill_super, mnt); } static struct file_system_type fuse_fs_type = { diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 1181d116117d..ee5b80a409e8 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -413,10 +413,11 @@ bail: return res; } -static struct super_block *hfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, hfs_fill_super, mnt); } static struct file_system_type hfs_fs_type = { diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7843f792a4b7..0ed8b7e8e87f 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -450,10 +450,12 @@ static void hfsplus_destroy_inode(struct inode *inode) #define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) -static struct super_block *hfsplus_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hfsplus_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super, + mnt); } static struct file_system_type hfsplus_fs_type = { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index bf0f8e16e433..04035e08f5c1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -993,11 +993,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) return(err); } -static struct super_block *hostfs_read_sb(struct file_system_type *type, - int flags, const char *dev_name, - void *data) +static int hostfs_read_sb(struct file_system_type *type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return(get_sb_nodev(type, flags, data, hostfs_fill_sb_common)); + return get_sb_nodev(type, flags, data, hostfs_fill_sb_common, mnt); } static struct file_system_type hostfs_type = { diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index d72d8c87c996..3b25cf3e2e65 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -662,10 +662,11 @@ bail0: return -EINVAL; } -static struct super_block *hpfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hpfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, hpfs_fill_super, + mnt); } static struct file_system_type hpfs_fs_type = { diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index 5e6363be246f..ec43c22bc9c0 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -769,11 +769,11 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) return(err); } -static struct super_block *hppfs_read_super(struct file_system_type *type, - int flags, const char *dev_name, - void *data) +static int hppfs_read_super(struct file_system_type *type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return(get_sb_nodev(type, flags, data, hppfs_fill_super)); + return get_sb_nodev(type, flags, data, hppfs_fill_super, mnt); } static struct file_system_type hppfs_type = { diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3a5b4e923455..4665c26171f7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -723,10 +723,10 @@ void hugetlb_put_quota(struct address_space *mapping) } } -static struct super_block *hugetlbfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int hugetlbfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super); + return get_sb_nodev(fs_type, flags, data, hugetlbfs_fill_super, mnt); } static struct file_system_type hugetlbfs_fs_type = { diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 9e9931e2badd..f2386442adee 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -672,11 +672,11 @@ out: return ret; } -static struct super_block * +static int inotify_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) + const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "inotify", NULL, 0xBAD1DEA, mnt); } static struct file_system_type inotify_fs_type = { diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 70adbb98bad1..17268da63a49 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1399,10 +1399,11 @@ struct inode *isofs_iget(struct super_block *sb, return inode; } -static struct super_block *isofs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int isofs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, isofs_fill_super, + mnt); } static struct file_system_type iso9660_fs_type = { diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index 020cc097c539..dd93a091ad67 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -1785,10 +1785,11 @@ static struct super_operations jffs_ops = .remount_fs = jffs_remount, }; -static struct super_block *jffs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int jffs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, jffs_fill_super, + mnt); } static struct file_system_type jffs_fs_type = { diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 9d0521451f59..2378a662c256 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -111,9 +111,10 @@ static int jffs2_sb_set(struct super_block *sb, void *data) return 0; } -static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, struct mtd_info *mtd) +static int jffs2_get_sb_mtd(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct mtd_info *mtd, + struct vfsmount *mnt) { struct super_block *sb; struct jffs2_sb_info *c; @@ -121,19 +122,20 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, c = kmalloc(sizeof(*c), GFP_KERNEL); if (!c) - return ERR_PTR(-ENOMEM); + return -ENOMEM; memset(c, 0, sizeof(*c)); c->mtd = mtd; sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c); if (IS_ERR(sb)) - goto out_put; + goto out_error; if (sb->s_root) { /* New mountpoint for JFFS2 which is already mounted */ D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): Device %d (\"%s\") is already mounted\n", mtd->index, mtd->name)); + ret = simple_set_mnt(mnt, sb); goto out_put; } @@ -161,44 +163,47 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, /* Failure case... */ up_write(&sb->s_umount); deactivate_super(sb); - return ERR_PTR(ret); + return ret; } sb->s_flags |= MS_ACTIVE; - return sb; + return simple_set_mnt(mnt, sb); +out_error: + ret = PTR_ERR(sb); out_put: kfree(c); put_mtd_device(mtd); - return sb; + return ret; } -static struct super_block *jffs2_get_sb_mtdnr(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data, int mtdnr) +static int jffs2_get_sb_mtdnr(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, int mtdnr, + struct vfsmount *mnt) { struct mtd_info *mtd; mtd = get_mtd_device(NULL, mtdnr); if (!mtd) { D1(printk(KERN_DEBUG "jffs2: MTD device #%u doesn't appear to exist\n", mtdnr)); - return ERR_PTR(-EINVAL); + return -EINVAL; } - return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd); + return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt); } -static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int jffs2_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { int err; struct nameidata nd; int mtdnr; if (!dev_name) - return ERR_PTR(-EINVAL); + return -EINVAL; D1(printk(KERN_DEBUG "jffs2_get_sb(): dev_name \"%s\"\n", dev_name)); @@ -220,7 +225,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, mtd = get_mtd_device(NULL, mtdnr); if (mtd) { if (!strcmp(mtd->name, dev_name+4)) - return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd); + return jffs2_get_sb_mtd(fs_type, flags, dev_name, data, mtd, mnt); put_mtd_device(mtd); } } @@ -233,7 +238,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, if (!*endptr) { /* It was a valid number */ D1(printk(KERN_DEBUG "jffs2_get_sb(): mtd%%d, mtdnr %d\n", mtdnr)); - return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr); + return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt); } } } @@ -247,7 +252,7 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, err, nd.dentry->d_inode)); if (err) - return ERR_PTR(err); + return err; err = -EINVAL; @@ -269,11 +274,11 @@ static struct super_block *jffs2_get_sb(struct file_system_type *fs_type, mtdnr = iminor(nd.dentry->d_inode); path_release(&nd); - return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr); + return jffs2_get_sb_mtdnr(fs_type, flags, dev_name, data, mtdnr, mnt); out: path_release(&nd); - return ERR_PTR(err); + return err; } static void jffs2_put_super (struct super_block *sb) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index db6f41d6dd60..18a28137b90e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -565,10 +565,11 @@ static void jfs_unlockfs(struct super_block *sb) } } -static struct super_block *jfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int jfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, jfs_fill_super, + mnt); } static int jfs_sync_fs(struct super_block *sb, int wait) diff --git a/fs/libfs.c b/fs/libfs.c index 7145ba7a48d0..7d70efa46da9 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -196,9 +196,9 @@ struct inode_operations simple_dir_inode_operations = { * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ -struct super_block * -get_sb_pseudo(struct file_system_type *fs_type, char *name, - struct super_operations *ops, unsigned long magic) +int get_sb_pseudo(struct file_system_type *fs_type, char *name, + struct super_operations *ops, unsigned long magic, + struct vfsmount *mnt) { struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); static struct super_operations default_ops = {.statfs = simple_statfs}; @@ -207,7 +207,7 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name, struct qstr d_name = {.name = name, .len = strlen(name)}; if (IS_ERR(s)) - return s; + return PTR_ERR(s); s->s_flags = MS_NOUSER; s->s_maxbytes = ~0ULL; @@ -232,12 +232,12 @@ get_sb_pseudo(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_flags |= MS_ACTIVE; - return s; + return simple_set_mnt(mnt, s); Enomem: up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(-ENOMEM); + return -ENOMEM; } int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 2dcccf1d1b7f..14f24dfbfe30 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -559,10 +559,11 @@ void minix_truncate(struct inode * inode) V2_minix_truncate(inode); } -static struct super_block *minix_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int minix_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, minix_fill_super, + mnt); } static struct file_system_type minix_fs_type = { diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 5b76ccd19e3f..9e44158a7540 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -661,11 +661,12 @@ static int msdos_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *msdos_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int msdos_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, msdos_fill_super, + mnt); } static struct file_system_type msdos_fs_type = { diff --git a/fs/namespace.c b/fs/namespace.c index bf478addb852..c13072a5f1ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -86,6 +86,15 @@ struct vfsmount *alloc_vfsmnt(const char *name) return mnt; } +int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) +{ + mnt->mnt_sb = sb; + mnt->mnt_root = dget(sb->s_root); + return 0; +} + +EXPORT_SYMBOL(simple_set_mnt); + void free_vfsmnt(struct vfsmount *mnt) { kfree(mnt->mnt_devname); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index a1f3e972c6ef..8db033fab3fd 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -957,10 +957,10 @@ out: return result; } -static struct super_block *ncp_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ncp_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, ncp_fill_super); + return get_sb_nodev(fs_type, flags, data, ncp_fill_super, mnt); } static struct file_system_type ncp_fs_type = { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d0b991a92327..ff645a961bc8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1690,8 +1690,8 @@ static int nfs_compare_super(struct super_block *sb, void *data) return !nfs_compare_fh(&old->fh, &server->fh); } -static struct super_block *nfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) +static int nfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { int error; struct nfs_server *server = NULL; @@ -1699,14 +1699,14 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, struct nfs_fh *root; struct nfs_mount_data *data = raw_data; - s = ERR_PTR(-EINVAL); + error = -EINVAL; if (data == NULL) { dprintk("%s: missing data argument\n", __FUNCTION__); - goto out_err; + goto out_err_noserver; } if (data->version <= 0 || data->version > NFS_MOUNT_VERSION) { dprintk("%s: bad mount version\n", __FUNCTION__); - goto out_err; + goto out_err_noserver; } switch (data->version) { case 1: @@ -1718,7 +1718,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, dprintk("%s: mount structure version %d does not support NFSv3\n", __FUNCTION__, data->version); - goto out_err; + goto out_err_noserver; } data->root.size = NFS2_FHSIZE; memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); @@ -1727,24 +1727,24 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, dprintk("%s: mount structure version %d does not support strong security\n", __FUNCTION__, data->version); - goto out_err; + goto out_err_noserver; } case 5: memset(data->context, 0, sizeof(data->context)); } #ifndef CONFIG_NFS_V3 /* If NFSv3 is not compiled in, return -EPROTONOSUPPORT */ - s = ERR_PTR(-EPROTONOSUPPORT); + error = -EPROTONOSUPPORT; if (data->flags & NFS_MOUNT_VER3) { dprintk("%s: NFSv3 not compiled into kernel\n", __FUNCTION__); - goto out_err; + goto out_err_noserver; } #endif /* CONFIG_NFS_V3 */ - s = ERR_PTR(-ENOMEM); + error = -ENOMEM; server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) - goto out_err; + goto out_err_noserver; /* Zero out the NFS state stuff */ init_nfsv4_state(server); server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); @@ -1754,7 +1754,7 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, root->size = data->root.size; else root->size = NFS2_FHSIZE; - s = ERR_PTR(-EINVAL); + error = -EINVAL; if (root->size > sizeof(root->data)) { dprintk("%s: invalid root filehandle\n", __FUNCTION__); goto out_err; @@ -1770,15 +1770,20 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, } /* Fire up rpciod if not yet running */ - s = ERR_PTR(rpciod_up()); - if (IS_ERR(s)) { - dprintk("%s: couldn't start rpciod! Error = %ld\n", - __FUNCTION__, PTR_ERR(s)); + error = rpciod_up(); + if (error < 0) { + dprintk("%s: couldn't start rpciod! Error = %d\n", + __FUNCTION__, error); goto out_err; } s = sget(fs_type, nfs_compare_super, nfs_set_super, server); - if (IS_ERR(s) || s->s_root) + if (IS_ERR(s)) { + error = PTR_ERR(s); + goto out_err_rpciod; + } + + if (s->s_root) goto out_rpciod_down; s->s_flags = flags; @@ -1787,15 +1792,22 @@ static struct super_block *nfs_get_sb(struct file_system_type *fs_type, if (error) { up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(error); + return error; } s->s_flags |= MS_ACTIVE; - return s; + return simple_set_mnt(mnt, s); + out_rpciod_down: rpciod_down(); + kfree(server); + return simple_set_mnt(mnt, s); + +out_err_rpciod: + rpciod_down(); out_err: kfree(server); - return s; +out_err_noserver: + return error; } static void nfs_kill_super(struct super_block *s) @@ -2032,8 +2044,8 @@ nfs_copy_user_string(char *dst, struct nfs_string *src, int maxlen) return dst; } -static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *raw_data) +static int nfs4_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { int error; struct nfs_server *server; @@ -2043,16 +2055,16 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, if (data == NULL) { dprintk("%s: missing data argument\n", __FUNCTION__); - return ERR_PTR(-EINVAL); + return -EINVAL; } if (data->version <= 0 || data->version > NFS4_MOUNT_VERSION) { dprintk("%s: bad mount version\n", __FUNCTION__); - return ERR_PTR(-EINVAL); + return -EINVAL; } server = kzalloc(sizeof(struct nfs_server), GFP_KERNEL); if (!server) - return ERR_PTR(-ENOMEM); + return -ENOMEM; /* Zero out the NFS state stuff */ init_nfsv4_state(server); server->client = server->client_sys = server->client_acl = ERR_PTR(-EINVAL); @@ -2074,33 +2086,41 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, /* We now require that the mount process passes the remote address */ if (data->host_addrlen != sizeof(server->addr)) { - s = ERR_PTR(-EINVAL); + error = -EINVAL; goto out_free; } if (copy_from_user(&server->addr, data->host_addr, sizeof(server->addr))) { - s = ERR_PTR(-EFAULT); + error = -EFAULT; goto out_free; } if (server->addr.sin_family != AF_INET || server->addr.sin_addr.s_addr == INADDR_ANY) { dprintk("%s: mount program didn't pass remote IP address!\n", __FUNCTION__); - s = ERR_PTR(-EINVAL); + error = -EINVAL; goto out_free; } /* Fire up rpciod if not yet running */ - s = ERR_PTR(rpciod_up()); - if (IS_ERR(s)) { - dprintk("%s: couldn't start rpciod! Error = %ld\n", - __FUNCTION__, PTR_ERR(s)); + error = rpciod_up(); + if (error < 0) { + dprintk("%s: couldn't start rpciod! Error = %d\n", + __FUNCTION__, error); goto out_free; } s = sget(fs_type, nfs4_compare_super, nfs_set_super, server); - - if (IS_ERR(s) || s->s_root) + if (IS_ERR(s)) { + error = PTR_ERR(s); goto out_free; + } + + if (s->s_root) { + kfree(server->mnt_path); + kfree(server->hostname); + kfree(server); + return simple_set_mnt(mnt, s); + } s->s_flags = flags; @@ -2108,17 +2128,17 @@ static struct super_block *nfs4_get_sb(struct file_system_type *fs_type, if (error) { up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(error); + return error; } s->s_flags |= MS_ACTIVE; - return s; + return simple_set_mnt(mnt, s); out_err: - s = (struct super_block *)p; + error = PTR_ERR(p); out_free: kfree(server->mnt_path); kfree(server->hostname); kfree(server); - return s; + return error; } static void nfs4_kill_super(struct super_block *sb) diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 3ef017b3b5bd..a1810e6a93e5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -494,10 +494,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) return simple_fill_super(sb, 0x6e667364, nfsd_files); } -static struct super_block *nfsd_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int nfsd_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, nfsd_fill_super); + return get_sb_single(fs_type, flags, data, nfsd_fill_super, mnt); } static struct file_system_type nfsd_fs_type = { diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 27833f6df49f..d5d5e969294f 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3093,10 +3093,11 @@ struct kmem_cache *ntfs_index_ctx_cache; /* Driver wide mutex. */ DEFINE_MUTEX(ntfs_lock); -static struct super_block *ntfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ntfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ntfs_fill_super, + mnt); } static struct file_system_type ntfs_fs_type = { diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 7e88e24b3471..7273d9fa6bab 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -574,10 +574,10 @@ static struct inode_operations dlmfs_file_inode_operations = { .getattr = simple_getattr, }; -static struct super_block *dlmfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int dlmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super); + return get_sb_nodev(fs_type, flags, data, dlmfs_fill_super, mnt); } static struct file_system_type dlmfs_fs_type = { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 949b3dac30f1..788b8b50dc4c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -672,12 +672,14 @@ read_super_error: return status; } -static struct super_block *ocfs2_get_sb(struct file_system_type *fs_type, - int flags, - const char *dev_name, - void *data) +static int ocfs2_get_sb(struct file_system_type *fs_type, + int flags, + const char *dev_name, + void *data, + struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super, + mnt); } static struct file_system_type ocfs2_fs_type = { diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 0f14276a2e51..464e2bce0203 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -1054,10 +1054,10 @@ out_no_root: return -ENOMEM; } -static struct super_block *openprom_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int openprom_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, openprom_fill_super); + return get_sb_single(fs_type, flags, data, openprom_fill_super, mnt); } static struct file_system_type openprom_fs_type = { diff --git a/fs/pipe.c b/fs/pipe.c index 5acd8954aaa0..20352573e025 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -979,12 +979,11 @@ no_files: * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ - -static struct super_block * -pipefs_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int pipefs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); + return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt); } static struct file_system_type pipe_fs_type = { diff --git a/fs/proc/root.c b/fs/proc/root.c index c3fd3611112f..9995356ce73e 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -26,10 +26,10 @@ struct proc_dir_entry *proc_net, *proc_net_stat, *proc_bus, *proc_root_fs, *proc struct proc_dir_entry *proc_sys_root; #endif -static struct super_block *proc_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int proc_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, proc_fill_super); + return get_sb_single(fs_type, flags, data, proc_fill_super, mnt); } static struct file_system_type proc_fs_type = { diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 2ecd46f85e9f..e6cca5cd4b44 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -561,10 +561,11 @@ static void destroy_inodecache(void) "qnx4_inode_cache: not all structures were freed\n"); } -static struct super_block *qnx4_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int qnx4_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, qnx4_fill_super, + mnt); } static struct file_system_type qnx4_fs_type = { diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 14bd2246fb6d..b9677335cc8d 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -185,16 +185,17 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent) return 0; } -struct super_block *ramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +int ramfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, ramfs_fill_super); + return get_sb_nodev(fs_type, flags, data, ramfs_fill_super, mnt); } -static struct super_block *rootfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int rootfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); + return get_sb_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super, + mnt); } static struct file_system_type ramfs_fs_type = { diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index cae2abbc0c71..f3ff41d33989 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2249,11 +2249,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, #endif -static struct super_block *get_super_block(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int get_super_block(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, reiserfs_fill_super, + mnt); } static int __init init_reiserfs_fs(void) diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 9b9eda7b335c..4d6cd6621062 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -607,10 +607,11 @@ static struct super_operations romfs_ops = { .remount_fs = romfs_remount, }; -static struct super_block *romfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int romfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super, + mnt); } static struct file_system_type romfs_fs_type = { diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index fdeabc0a34f7..4a37c2bbfa3f 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -782,10 +782,10 @@ out: return error; } -static struct super_block *smb_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int smb_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, smb_fill_super); + return get_sb_nodev(fs_type, flags, data, smb_fill_super, mnt); } static struct file_system_type smb_fs_type = { diff --git a/fs/super.c b/fs/super.c index 9d5c2add7228..324c2d232f54 100644 --- a/fs/super.c +++ b/fs/super.c @@ -231,7 +231,7 @@ void generic_shutdown_super(struct super_block *sb) if (root) { sb->s_root = NULL; shrink_dcache_parent(root); - shrink_dcache_anon(sb); + shrink_dcache_sb(sb); dput(root); fsync_super(sb); lock_super(sb); @@ -676,9 +676,10 @@ static void bdev_uevent(struct block_device *bdev, enum kobject_action action) } } -struct super_block *get_sb_bdev(struct file_system_type *fs_type, +int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)) + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) { struct block_device *bdev; struct super_block *s; @@ -686,7 +687,7 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, bdev = open_bdev_excl(dev_name, flags, fs_type); if (IS_ERR(bdev)) - return (struct super_block *)bdev; + return PTR_ERR(bdev); /* * once the super is inserted into the list by sget, s_umount @@ -697,15 +698,17 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, s = sget(fs_type, test_bdev_super, set_bdev_super, bdev); mutex_unlock(&bdev->bd_mount_mutex); if (IS_ERR(s)) - goto out; + goto error_s; if (s->s_root) { if ((flags ^ s->s_flags) & MS_RDONLY) { up_write(&s->s_umount); deactivate_super(s); - s = ERR_PTR(-EBUSY); + error = -EBUSY; + goto error_bdev; } - goto out; + + close_bdev_excl(bdev); } else { char b[BDEVNAME_SIZE]; @@ -716,18 +719,21 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, if (error) { up_write(&s->s_umount); deactivate_super(s); - s = ERR_PTR(error); - } else { - s->s_flags |= MS_ACTIVE; - bdev_uevent(bdev, KOBJ_MOUNT); + goto error; } + + s->s_flags |= MS_ACTIVE; + bdev_uevent(bdev, KOBJ_MOUNT); } - return s; + return simple_set_mnt(mnt, s); -out: +error_s: + error = PTR_ERR(s); +error_bdev: close_bdev_excl(bdev); - return s; +error: + return error; } EXPORT_SYMBOL(get_sb_bdev); @@ -744,15 +750,16 @@ void kill_block_super(struct super_block *sb) EXPORT_SYMBOL(kill_block_super); -struct super_block *get_sb_nodev(struct file_system_type *fs_type, +int get_sb_nodev(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, NULL); if (IS_ERR(s)) - return s; + return PTR_ERR(s); s->s_flags = flags; @@ -760,10 +767,10 @@ struct super_block *get_sb_nodev(struct file_system_type *fs_type, if (error) { up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(error); + return error; } s->s_flags |= MS_ACTIVE; - return s; + return simple_set_mnt(mnt, s); } EXPORT_SYMBOL(get_sb_nodev); @@ -773,94 +780,102 @@ static int compare_single(struct super_block *s, void *p) return 1; } -struct super_block *get_sb_single(struct file_system_type *fs_type, +int get_sb_single(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, NULL); if (IS_ERR(s)) - return s; + return PTR_ERR(s); if (!s->s_root) { s->s_flags = flags; error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { up_write(&s->s_umount); deactivate_super(s); - return ERR_PTR(error); + return error; } s->s_flags |= MS_ACTIVE; } do_remount_sb(s, flags, data, 0); - return s; + return simple_set_mnt(mnt, s); } EXPORT_SYMBOL(get_sb_single); struct vfsmount * -do_kern_mount(const char *fstype, int flags, const char *name, void *data) +vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { - struct file_system_type *type = get_fs_type(fstype); - struct super_block *sb = ERR_PTR(-ENOMEM); struct vfsmount *mnt; - int error; char *secdata = NULL; + int error; if (!type) return ERR_PTR(-ENODEV); + error = -ENOMEM; mnt = alloc_vfsmnt(name); if (!mnt) goto out; if (data) { secdata = alloc_secdata(); - if (!secdata) { - sb = ERR_PTR(-ENOMEM); + if (!secdata) goto out_mnt; - } error = security_sb_copy_data(type, data, secdata); - if (error) { - sb = ERR_PTR(error); + if (error) goto out_free_secdata; - } } - sb = type->get_sb(type, flags, name, data); - if (IS_ERR(sb)) + error = type->get_sb(type, flags, name, data, mnt); + if (error < 0) goto out_free_secdata; - error = security_sb_kern_mount(sb, secdata); + + error = security_sb_kern_mount(mnt->mnt_sb, secdata); if (error) goto out_sb; - mnt->mnt_sb = sb; - mnt->mnt_root = dget(sb->s_root); - mnt->mnt_mountpoint = sb->s_root; + + mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; - up_write(&sb->s_umount); + up_write(&mnt->mnt_sb->s_umount); free_secdata(secdata); - put_filesystem(type); return mnt; out_sb: - up_write(&sb->s_umount); - deactivate_super(sb); - sb = ERR_PTR(error); + dput(mnt->mnt_root); + up_write(&mnt->mnt_sb->s_umount); + deactivate_super(mnt->mnt_sb); out_free_secdata: free_secdata(secdata); out_mnt: free_vfsmnt(mnt); out: + return ERR_PTR(error); +} + +EXPORT_SYMBOL_GPL(vfs_kern_mount); + +struct vfsmount * +do_kern_mount(const char *fstype, int flags, const char *name, void *data) +{ + struct file_system_type *type = get_fs_type(fstype); + struct vfsmount *mnt; + if (!type) + return ERR_PTR(-ENODEV); + mnt = vfs_kern_mount(type, flags, name, data); put_filesystem(type); - return (struct vfsmount *)sb; + return mnt; } EXPORT_SYMBOL_GPL(do_kern_mount); struct vfsmount *kern_mount(struct file_system_type *type) { - return do_kern_mount(type->name, 0, type->name, NULL); + return vfs_kern_mount(type, 0, type->name, NULL); } EXPORT_SYMBOL(kern_mount); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index f1117e885bd6..40190c489271 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -66,10 +66,10 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *sysfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, sysfs_fill_super); + return get_sb_single(fs_type, flags, data, sysfs_fill_super, mnt); } static struct file_system_type sysfs_fs_type = { diff --git a/fs/sysv/super.c b/fs/sysv/super.c index e92b991e6dda..876639b93321 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -506,16 +506,17 @@ failed: /* Every kernel module contains stuff like this. */ -static struct super_block *sysv_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sysv_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, sysv_fill_super, + mnt); } -static struct super_block *v7_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int v7_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, v7_fill_super, mnt); } static struct file_system_type sysv_fs_type = { diff --git a/fs/udf/super.c b/fs/udf/super.c index e45789fe38e8..2250774a831d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -94,10 +94,10 @@ static unsigned int udf_count_free(struct super_block *); static int udf_statfs(struct super_block *, struct kstatfs *); /* UDF filesystem type */ -static struct super_block *udf_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int udf_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, udf_fill_super, mnt); } static struct file_system_type udf_fstype = { diff --git a/fs/ufs/super.c b/fs/ufs/super.c index db98a4c71e63..768fb8d9e67a 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1311,10 +1311,10 @@ out: #endif -static struct super_block *ufs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ufs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, ufs_fill_super, mnt); } static struct file_system_type ufs_fs_type = { diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index a56cec3be5f0..9a8f48bae956 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -1023,11 +1023,12 @@ static int vfat_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *vfat_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int vfat_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, vfat_fill_super, + mnt); } static struct file_system_type vfat_fs_type = { diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f2a0778536f4..d03c89a36655 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -853,14 +853,16 @@ fail_vfsop: return -error; } -STATIC struct super_block * +STATIC int xfs_fs_get_sb( struct file_system_type *fs_type, int flags, const char *dev_name, - void *data) + void *data, + struct vfsmount *mnt) { - return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super); + return get_sb_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super, + mnt); } STATIC struct super_operations xfs_super_operations = { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 46d0e079735d..0dd1610a94a9 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -217,7 +217,6 @@ extern struct dentry * d_alloc_anon(struct inode *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); -extern void shrink_dcache_anon(struct super_block *); extern int d_invalidate(struct dentry *); /* only used at mount-time */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 73c7d6f04b31..3e50dd24af87 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1269,23 +1269,26 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, struct file_system_type { const char *name; int fs_flags; - struct super_block *(*get_sb) (struct file_system_type *, int, - const char *, void *); + int (*get_sb) (struct file_system_type *, int, + const char *, void *, struct vfsmount *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct list_head fs_supers; }; -struct super_block *get_sb_bdev(struct file_system_type *fs_type, +extern int get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - int (*fill_super)(struct super_block *, void *, int)); -struct super_block *get_sb_single(struct file_system_type *fs_type, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); +extern int get_sb_single(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); -struct super_block *get_sb_nodev(struct file_system_type *fs_type, + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); +extern int get_sb_nodev(struct file_system_type *fs_type, int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)); + int (*fill_super)(struct super_block *, void *, int), + struct vfsmount *mnt); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); @@ -1296,8 +1299,10 @@ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), void *data); -struct super_block *get_sb_pseudo(struct file_system_type *, char *, - struct super_operations *ops, unsigned long); +extern int get_sb_pseudo(struct file_system_type *, char *, + struct super_operations *ops, unsigned long, + struct vfsmount *mnt); +extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb); int __put_super(struct super_block *sb); int __put_super_and_need_restart(struct super_block *sb); void unnamed_dev_init(void); diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 78ecfa28b1c2..00b340ba6612 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -2,8 +2,8 @@ #define _LINUX_RAMFS_H struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev); -struct super_block *ramfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data); +extern int ramfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt); #ifndef CONFIG_MMU extern unsigned long ramfs_nommu_get_unmapped_area(struct file *file, diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 1511714a9585..0a2a24b6ebe4 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -205,11 +205,11 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct super_block *mqueue_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int mqueue_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, mqueue_fill_super); + return get_sb_single(fs_type, flags, data, mqueue_fill_super, mnt); } static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572b..77f45ffd5ea1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, return 0; } -static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int cpuset_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, cpuset_fill_super); + return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); } static struct file_system_type cpuset_fs_type = { diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057b..e1a380c77a5a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, (unsigned long)uaddr2, val2, val3); } -static struct super_block * -futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int futexfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); } static struct file_system_type futex_fs_type = { diff --git a/mm/shmem.c b/mm/shmem.c index 1e43c8a865ba..7617bb1c6bf7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2233,10 +2233,10 @@ static struct vm_operations_struct shmem_vm_ops = { }; -static struct super_block *shmem_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int shmem_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_nodev(fs_type, flags, data, shmem_fill_super); + return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); } static struct file_system_type tmpfs_fs_type = { diff --git a/net/socket.c b/net/socket.c index 02948b622bd2..565f5e8d1191 100644 --- a/net/socket.c +++ b/net/socket.c @@ -335,10 +335,11 @@ static struct super_operations sockfs_ops = { .statfs = simple_statfs, }; -static struct super_block *sockfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sockfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); + return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC, + mnt); } static struct vfsmount *sock_mnt __read_mostly; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index cc673dd8433f..8241fa726803 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -815,11 +815,11 @@ out: return -ENOMEM; } -static struct super_block * +static int rpc_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) + int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, rpc_fill_super); + return get_sb_single(fs_type, flags, data, rpc_fill_super, mnt); } static struct file_system_type rpc_pipe_fs_type = { diff --git a/security/inode.c b/security/inode.c index 0f77b0223662..e6fc29ac8564 100644 --- a/security/inode.c +++ b/security/inode.c @@ -135,11 +135,11 @@ static int fill_super(struct super_block *sb, void *data, int silent) return simple_fill_super(sb, SECURITYFS_MAGIC, files); } -static struct super_block *get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *data) +static int get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, fill_super); + return get_sb_single(fs_type, flags, data, fill_super, mnt); } static struct file_system_type fs_type = { diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 2e73d3279f2d..7029bbc9bef8 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -1345,10 +1345,11 @@ err: goto out; } -static struct super_block *sel_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int sel_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, sel_fill_super); + return get_sb_single(fs_type, flags, data, sel_fill_super, mnt); } static struct file_system_type sel_fs_type = { -- cgit v1.2.3 From 726c334223180e3c0197cc980a432681370d4baf Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 23 Jun 2006 02:02:58 -0700 Subject: [PATCH] VFS: Permit filesystem to perform statfs with a known root dentry Give the statfs superblock operation a dentry pointer rather than a superblock pointer. This complements the get_sb() patch. That reduced the significance of sb->s_root, allowing NFS to place a fake root there. However, NFS does require a dentry to use as a target for the statfs operation. This permits the root in the vfsmount to be used instead. linux/mount.h has been added where necessary to make allyesconfig build successfully. Interest has also been expressed for use with the FUSE and XFS filesystems. Signed-off-by: David Howells Acked-by: Al Viro Cc: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- arch/alpha/kernel/osf_sys.c | 2 +- arch/mips/kernel/sysirix.c | 12 ++++++------ arch/parisc/hpux/sys_hpux.c | 10 +++++----- arch/sparc64/solaris/fs.c | 4 ++-- fs/adfs/super.c | 8 ++++---- fs/affs/super.c | 5 +++-- fs/befs/linuxvfs.c | 5 +++-- fs/bfs/inode.c | 3 ++- fs/cifs/cifsfs.c | 3 ++- fs/coda/inode.c | 6 +++--- fs/coda/upcall.c | 4 ++-- fs/compat.c | 8 ++++---- fs/cramfs/inode.c | 4 +++- fs/efs/super.c | 6 +++--- fs/ext2/super.c | 5 +++-- fs/ext3/super.c | 5 +++-- fs/fat/inode.c | 8 ++++---- fs/freevxfs/vxfs_super.c | 13 +++++++------ fs/fuse/inode.c | 3 ++- fs/hfs/super.c | 4 +++- fs/hfsplus/super.c | 4 +++- fs/hostfs/hostfs_kern.c | 4 ++-- fs/hpfs/super.c | 3 ++- fs/hppfs/hppfs_kern.c | 2 +- fs/hugetlbfs/inode.c | 4 ++-- fs/isofs/inode.c | 6 ++++-- fs/jffs/inode-v23.c | 4 ++-- fs/jffs2/fs.c | 4 ++-- fs/jffs2/os-linux.h | 2 +- fs/jfs/super.c | 4 ++-- fs/libfs.c | 4 ++-- fs/minix/inode.c | 10 +++++----- fs/ncpfs/inode.c | 5 +++-- fs/nfs/inode.c | 5 +++-- fs/nfsd/nfs4xdr.c | 2 +- fs/nfsd/vfs.c | 2 +- fs/ntfs/super.c | 7 ++++--- fs/ocfs2/super.c | 10 +++++----- fs/open.c | 26 +++++++++++++------------- fs/qnx4/inode.c | 6 ++++-- fs/reiserfs/super.c | 8 ++++---- fs/romfs/inode.c | 4 ++-- fs/smbfs/inode.c | 6 +++--- fs/smbfs/proc.c | 4 ++-- fs/smbfs/proto.h | 2 +- fs/super.c | 2 +- fs/sysv/inode.c | 3 ++- fs/udf/super.c | 6 ++++-- fs/ufs/super.c | 3 ++- fs/xfs/linux-2.6/xfs_super.c | 4 ++-- include/linux/coda_psdev.h | 2 +- include/linux/fs.h | 6 +++--- include/linux/mount.h | 5 +++++ include/linux/security.h | 14 +++++++------- kernel/acct.c | 2 +- mm/shmem.c | 4 ++-- security/dummy.c | 2 +- security/selinux/hooks.c | 6 +++--- 60 files changed, 175 insertions(+), 144 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 3abf08f1b14a..d31efbbdfe50 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -99,7 +99,7 @@ prototypes: int (*sync_fs)(struct super_block *sb, int wait); void (*write_super_lockfs) (struct super_block *); void (*unlockfs) (struct super_block *); - int (*statfs) (struct super_block *, struct kstatfs *); + int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index dd7d0dcedc87..9d3aed628bc1 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -211,7 +211,7 @@ struct super_operations { int (*sync_fs)(struct super_block *sb, int wait); void (*write_super_lockfs) (struct super_block *); void (*unlockfs) (struct super_block *); - int (*statfs) (struct super_block *, struct kstatfs *); + int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 31afe3d91ac6..e15dcf4f3dcd 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -244,7 +244,7 @@ do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer, unsigned long bufsiz) { struct kstatfs linux_stat; - int error = vfs_statfs(dentry->d_inode->i_sb, &linux_stat); + int error = vfs_statfs(dentry, &linux_stat); if (!error) error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); return error; diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index 5407b784cd01..19e1ef43eb4b 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -694,7 +694,7 @@ asmlinkage int irix_statfs(const char __user *path, if (error) goto out; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(nd.dentry, &kbuf); if (error) goto dput_and_out; @@ -732,7 +732,7 @@ asmlinkage int irix_fstatfs(unsigned int fd, struct irix_statfs __user *buf) goto out; } - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(file->f_dentry, &kbuf); if (error) goto out_f; @@ -1360,7 +1360,7 @@ asmlinkage int irix_statvfs(char __user *fname, struct irix_statvfs __user *buf) error = user_path_walk(fname, &nd); if (error) goto out; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(nd.dentry, &kbuf); if (error) goto dput_and_out; @@ -1406,7 +1406,7 @@ asmlinkage int irix_fstatvfs(int fd, struct irix_statvfs __user *buf) error = -EBADF; goto out; } - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(file->f_dentry, &kbuf); if (error) goto out_f; @@ -1611,7 +1611,7 @@ asmlinkage int irix_statvfs64(char __user *fname, struct irix_statvfs64 __user * error = user_path_walk(fname, &nd); if (error) goto out; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(nd.dentry, &kbuf); if (error) goto dput_and_out; @@ -1658,7 +1658,7 @@ asmlinkage int irix_fstatvfs64(int fd, struct irix_statvfs __user *buf) error = -EBADF; goto out; } - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &kbuf); + error = vfs_statfs(file->f_dentry, &kbuf); if (error) goto out_f; diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c index 05273ccced0e..cb69727027ae 100644 --- a/arch/parisc/hpux/sys_hpux.c +++ b/arch/parisc/hpux/sys_hpux.c @@ -145,7 +145,7 @@ static int hpux_ustat(dev_t dev, struct hpux_ustat __user *ubuf) s = user_get_super(dev); if (s == NULL) goto out; - err = vfs_statfs(s, &sbuf); + err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) goto out; @@ -186,12 +186,12 @@ struct hpux_statfs { int16_t f_pad; }; -static int vfs_statfs_hpux(struct super_block *sb, struct hpux_statfs *buf) +static int vfs_statfs_hpux(struct dentry *dentry, struct hpux_statfs *buf) { struct kstatfs st; int retval; - retval = vfs_statfs(sb, &st); + retval = vfs_statfs(dentry, &st); if (retval) return retval; @@ -219,7 +219,7 @@ asmlinkage long hpux_statfs(const char __user *path, error = user_path_walk(path, &nd); if (!error) { struct hpux_statfs tmp; - error = vfs_statfs_hpux(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs_hpux(nd.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -237,7 +237,7 @@ asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf) file = fget(fd); if (!file) goto out; - error = vfs_statfs_hpux(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs_hpux(file->f_dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); diff --git a/arch/sparc64/solaris/fs.c b/arch/sparc64/solaris/fs.c index 4885ca6cbc77..0f0eb6aa1c40 100644 --- a/arch/sparc64/solaris/fs.c +++ b/arch/sparc64/solaris/fs.c @@ -356,7 +356,7 @@ static int report_statvfs(struct vfsmount *mnt, struct inode *inode, u32 buf) int error; struct sol_statvfs __user *ss = A(buf); - error = vfs_statfs(mnt->mnt_sb, &s); + error = vfs_statfs(mnt->mnt_root, &s); if (!error) { const char *p = mnt->mnt_sb->s_type->name; int i = 0; @@ -392,7 +392,7 @@ static int report_statvfs64(struct vfsmount *mnt, struct inode *inode, u32 buf) int error; struct sol_statvfs64 __user *ss = A(buf); - error = vfs_statfs(mnt->mnt_sb, &s); + error = vfs_statfs(mnt->mnt_root, &s); if (!error) { const char *p = mnt->mnt_sb->s_type->name; int i = 0; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 1b58a9b7f6aa..ba1c88af49fe 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -196,17 +196,17 @@ static int adfs_remount(struct super_block *sb, int *flags, char *data) return parse_options(sb, data); } -static int adfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_sb_info *asb = ADFS_SB(dentry->d_sb); buf->f_type = ADFS_SUPER_MAGIC; buf->f_namelen = asb->s_namelen; - buf->f_bsize = sb->s_blocksize; + buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_blocks = asb->s_size; buf->f_files = asb->s_ids_per_zone * asb->s_map_size; buf->f_bavail = - buf->f_bfree = adfs_map_free(sb); + buf->f_bfree = adfs_map_free(dentry->d_sb); buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; return 0; diff --git a/fs/affs/super.c b/fs/affs/super.c index 6a52e7875403..8765cba35bb9 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -18,7 +18,7 @@ extern struct timezone sys_tz; -static int affs_statfs(struct super_block *sb, struct kstatfs *buf); +static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_remount (struct super_block *sb, int *flags, char *data); static void @@ -508,8 +508,9 @@ affs_remount(struct super_block *sb, int *flags, char *data) } static int -affs_statfs(struct super_block *sb, struct kstatfs *buf) +affs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; int free; pr_debug("AFFS: statfs() partsize=%d, reserved=%d\n",AFFS_SB(sb)->s_partition_size, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 6ed07a5f10c6..08201fab26cd 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -49,7 +49,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static void befs_put_super(struct super_block *); static int befs_remount(struct super_block *, int *, char *); -static int befs_statfs(struct super_block *, struct kstatfs *); +static int befs_statfs(struct dentry *, struct kstatfs *); static int parse_options(char *, befs_mount_options *); static const struct super_operations befs_sops = { @@ -880,8 +880,9 @@ befs_remount(struct super_block *sb, int *flags, char *data) } static int -befs_statfs(struct super_block *sb, struct kstatfs *buf) +befs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; befs_debug(sb, "---> befs_statfs()"); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index e7da03f63a5a..cf74f3d4d966 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -203,8 +203,9 @@ static void bfs_put_super(struct super_block *s) s->s_fs_info = NULL; } -static int bfs_statfs(struct super_block *s, struct kstatfs *buf) +static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *s = dentry->d_sb; struct bfs_sb_info *info = BFS_SB(s); u64 id = huge_encode_dev(s->s_bdev->bd_dev); buf->f_type = BFS_MAGIC; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 08b35801dfed..7520f4687158 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -166,8 +166,9 @@ cifs_put_super(struct super_block *sb) } static int -cifs_statfs(struct super_block *sb, struct kstatfs *buf) +cifs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; int xid; int rc = -EOPNOTSUPP; struct cifs_sb_info *cifs_sb; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index cba70201567d..87f1dc8aa24b 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -36,7 +36,7 @@ /* VFS super_block ops */ static void coda_clear_inode(struct inode *); static void coda_put_super(struct super_block *); -static int coda_statfs(struct super_block *sb, struct kstatfs *buf); +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf); static kmem_cache_t * coda_inode_cachep; @@ -278,13 +278,13 @@ struct inode_operations coda_file_inode_operations = { .setattr = coda_setattr, }; -static int coda_statfs(struct super_block *sb, struct kstatfs *buf) +static int coda_statfs(struct dentry *dentry, struct kstatfs *buf) { int error; lock_kernel(); - error = venus_statfs(sb, buf); + error = venus_statfs(dentry, buf); unlock_kernel(); diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 1bae99650a91..b040eba13a7d 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -611,7 +611,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, return error; } -int venus_statfs(struct super_block *sb, struct kstatfs *sfs) +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs) { union inputArgs *inp; union outputArgs *outp; @@ -620,7 +620,7 @@ int venus_statfs(struct super_block *sb, struct kstatfs *sfs) insize = max_t(unsigned int, INSIZE(statfs), OUTSIZE(statfs)); UPARG(CODA_STATFS); - error = coda_upcall(coda_sbp(sb), insize, &outsize, inp); + error = coda_upcall(coda_sbp(dentry->d_sb), insize, &outsize, inp); if (!error) { sfs->f_blocks = outp->coda_statfs.stat.f_blocks; diff --git a/fs/compat.c b/fs/compat.c index b1f64786a613..7e7e5bc4f3cf 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -197,7 +197,7 @@ asmlinkage long compat_sys_statfs(const char __user *path, struct compat_statfs error = user_path_walk(path, &nd); if (!error) { struct kstatfs tmp; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs(nd.dentry, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); path_release(&nd); @@ -215,7 +215,7 @@ asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user file = fget(fd); if (!file) goto out; - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs(file->f_dentry, &tmp); if (!error) error = put_compat_statfs(buf, &tmp); fput(file); @@ -265,7 +265,7 @@ asmlinkage long compat_sys_statfs64(const char __user *path, compat_size_t sz, s error = user_path_walk(path, &nd); if (!error) { struct kstatfs tmp; - error = vfs_statfs(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs(nd.dentry, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); path_release(&nd); @@ -286,7 +286,7 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c file = fget(fd); if (!file) goto out; - error = vfs_statfs(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs(file->f_dentry, &tmp); if (!error) error = put_compat_statfs64(buf, &tmp); fput(file); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 37a91a153aa5..8a9d5d3b3262 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -322,8 +322,10 @@ out: return -EINVAL; } -static int cramfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_blocks = CRAMFS_SB(sb)->blocks; diff --git a/fs/efs/super.c b/fs/efs/super.c index 1ba5e14f879f..8ac2462ae5dd 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -15,7 +15,7 @@ #include #include -static int efs_statfs(struct super_block *s, struct kstatfs *buf); +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); static int efs_fill_super(struct super_block *s, void *d, int silent); static int efs_get_sb(struct file_system_type *fs_type, @@ -322,8 +322,8 @@ out_no_fs: return -EINVAL; } -static int efs_statfs(struct super_block *s, struct kstatfs *buf) { - struct efs_sb_info *sb = SUPER_INFO(s); +static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct efs_sb_info *sb = SUPER_INFO(dentry->d_sb); buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index a4dfffac5967..a6c4d6e02324 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -39,7 +39,7 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es); static int ext2_remount (struct super_block * sb, int * flags, char * data); -static int ext2_statfs (struct super_block * sb, struct kstatfs * buf); +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); void ext2_error (struct super_block * sb, const char * function, const char * fmt, ...) @@ -1038,8 +1038,9 @@ restore_opts: return err; } -static int ext2_statfs (struct super_block * sb, struct kstatfs * buf) +static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) { + struct super_block *sb = dentry->d_sb; struct ext2_sb_info *sbi = EXT2_SB(sb); unsigned long overhead; int i; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 657f8e73b62f..e08b6439617c 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -58,7 +58,7 @@ static int ext3_sync_fs(struct super_block *sb, int wait); static const char *ext3_decode_error(struct super_block * sb, int errno, char nbuf[16]); static int ext3_remount (struct super_block * sb, int * flags, char * data); -static int ext3_statfs (struct super_block * sb, struct kstatfs * buf); +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf); static void ext3_unlockfs(struct super_block *sb); static void ext3_write_super (struct super_block * sb); static void ext3_write_super_lockfs(struct super_block *sb); @@ -2318,8 +2318,9 @@ restore_opts: return err; } -static int ext3_statfs (struct super_block * sb, struct kstatfs * buf) +static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf) { + struct super_block *sb = dentry->d_sb; struct ext3_sb_info *sbi = EXT3_SB(sb); struct ext3_super_block *es = sbi->s_es; unsigned long overhead; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c1ce284f8a94..7c35d582ec10 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -539,18 +539,18 @@ static int fat_remount(struct super_block *sb, int *flags, char *data) return 0; } -static int fat_statfs(struct super_block *sb, struct kstatfs *buf) +static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); /* If the count of free cluster is still unknown, counts it here. */ if (sbi->free_clusters == -1) { - int err = fat_count_free_clusters(sb); + int err = fat_count_free_clusters(dentry->d_sb); if (err) return err; } - buf->f_type = sb->s_magic; + buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = sbi->max_cluster - FAT_START_ENT; buf->f_bfree = sbi->free_clusters; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index d76eeaafbde2..b74b791fc23b 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "vxfs.h" #include "vxfs_extern.h" @@ -55,7 +56,7 @@ MODULE_ALIAS("vxfs"); /* makes mount -t vxfs autoload the module */ static void vxfs_put_super(struct super_block *); -static int vxfs_statfs(struct super_block *, struct kstatfs *); +static int vxfs_statfs(struct dentry *, struct kstatfs *); static int vxfs_remount(struct super_block *, int *, char *); static struct super_operations vxfs_super_ops = { @@ -90,12 +91,12 @@ vxfs_put_super(struct super_block *sbp) /** * vxfs_statfs - get filesystem information - * @sbp: VFS superblock + * @dentry: VFS dentry to locate superblock * @bufp: output buffer * * Description: * vxfs_statfs fills the statfs buffer @bufp with information - * about the filesystem described by @sbp. + * about the filesystem described by @dentry. * * Returns: * Zero. @@ -107,12 +108,12 @@ vxfs_put_super(struct super_block *sbp) * This is everything but complete... */ static int -vxfs_statfs(struct super_block *sbp, struct kstatfs *bufp) +vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) { - struct vxfs_sb_info *infp = VXFS_SBI(sbp); + struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); bufp->f_type = VXFS_SUPER_MAGIC; - bufp->f_bsize = sbp->s_blocksize; + bufp->f_bsize = dentry->d_sb->s_blocksize; bufp->f_blocks = infp->vsi_raw->vs_dsize; bufp->f_bfree = infp->vsi_raw->vs_free; bufp->f_bavail = 0; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c91f0a50aadb..a13c0f529058 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -236,8 +236,9 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr /* fsid is left zero */ } -static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) +static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_req *req; struct fuse_statfs_out outarg; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index ee5b80a409e8..d9227bf14e86 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -80,8 +80,10 @@ static void hfs_put_super(struct super_block *sb) * * changed f_files/f_ffree to reflect the fs_ablock/free_ablocks. */ -static int hfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = HFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (u32)HFS_SB(sb)->fs_ablocks * HFS_SB(sb)->fs_div; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 0ed8b7e8e87f..0a92fa2336a2 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -212,8 +212,10 @@ static void hfsplus_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static int hfsplus_statfs(struct super_block *sb, struct kstatfs *buf) +static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = HFSPLUS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 04035e08f5c1..8e0d37743e7c 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -239,7 +239,7 @@ static int read_inode(struct inode *ino) return(err); } -int hostfs_statfs(struct super_block *sb, struct kstatfs *sf) +int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf) { /* do_statfs uses struct statfs64 internally, but the linux kernel * struct statfs still has 32-bit versions for most of these fields, @@ -252,7 +252,7 @@ int hostfs_statfs(struct super_block *sb, struct kstatfs *sf) long long f_files; long long f_ffree; - err = do_statfs(HOSTFS_I(sb->s_root->d_inode)->host_filename, + err = do_statfs(HOSTFS_I(dentry->d_sb->s_root->d_inode)->host_filename, &sf->f_bsize, &f_blocks, &f_bfree, &f_bavail, &f_files, &f_ffree, &sf->f_fsid, sizeof(sf->f_fsid), &sf->f_namelen, sf->f_spare); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 3b25cf3e2e65..f798480a363f 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -135,8 +135,9 @@ static unsigned count_bitmaps(struct super_block *s) return count; } -static int hpfs_statfs(struct super_block *s, struct kstatfs *buf) +static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *s = dentry->d_sb; struct hpfs_sb_info *sbi = hpfs_sb(s); lock_kernel(); diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index ec43c22bc9c0..3a9bdf58166f 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -616,7 +616,7 @@ static const struct file_operations hppfs_dir_fops = { .fsync = hppfs_fsync, }; -static int hppfs_statfs(struct super_block *sb, struct kstatfs *sf) +static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) { sf->f_blocks = 0; sf->f_bfree = 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4665c26171f7..678fc72c3646 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -467,9 +467,9 @@ static int hugetlbfs_set_page_dirty(struct page *page) return 0; } -static int hugetlbfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb); + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = HPAGE_SIZE; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 17268da63a49..3f9c8ba1fa1f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -56,7 +56,7 @@ static void isofs_put_super(struct super_block *sb) } static void isofs_read_inode(struct inode *); -static int isofs_statfs (struct super_block *, struct kstatfs *); +static int isofs_statfs (struct dentry *, struct kstatfs *); static kmem_cache_t *isofs_inode_cachep; @@ -901,8 +901,10 @@ out_freesbi: return -EINVAL; } -static int isofs_statfs (struct super_block *sb, struct kstatfs *buf) +static int isofs_statfs (struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = ISOFS_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = (ISOFS_SB(sb)->s_nzones diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index dd93a091ad67..9e46ea6da752 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -377,9 +377,9 @@ jffs_new_inode(const struct inode * dir, struct jffs_raw_inode *raw_inode, /* Get statistics of the file system. */ static int -jffs_statfs(struct super_block *sb, struct kstatfs *buf) +jffs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct jffs_control *c = (struct jffs_control *) sb->s_fs_info; + struct jffs_control *c = (struct jffs_control *) dentry->d_sb->s_fs_info; struct jffs_fmcontrol *fmc; lock_kernel(); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 7b6c24b14f85..2900ec3ec3af 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -192,9 +192,9 @@ int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) return rc; } -int jffs2_statfs(struct super_block *sb, struct kstatfs *buf) +int jffs2_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + struct jffs2_sb_info *c = JFFS2_SB_INFO(dentry->d_sb); unsigned long avail; buf->f_type = JFFS2_SUPER_MAGIC; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index cd4021bcb944..6b5223565405 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -175,7 +175,7 @@ void jffs2_clear_inode (struct inode *); void jffs2_dirty_inode(struct inode *inode); struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri); -int jffs2_statfs (struct super_block *, struct kstatfs *); +int jffs2_statfs (struct dentry *, struct kstatfs *); void jffs2_write_super (struct super_block *); int jffs2_remount_fs (struct super_block *, int *, char *); int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 18a28137b90e..73d2aba084c6 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -139,9 +139,9 @@ static void jfs_destroy_inode(struct inode *inode) kmem_cache_free(jfs_inode_cachep, ji); } -static int jfs_statfs(struct super_block *sb, struct kstatfs *buf) +static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_sb_info *sbi = JFS_SBI(dentry->d_sb); s64 maxinodes; struct inomap *imap = JFS_IP(sbi->ipimap)->i_imap; diff --git a/fs/libfs.c b/fs/libfs.c index 7d70efa46da9..1b1156381787 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,9 +20,9 @@ int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } -int simple_statfs(struct super_block *sb, struct kstatfs *buf) +int simple_statfs(struct dentry *dentry, struct kstatfs *buf) { - buf->f_type = sb->s_magic; + buf->f_type = dentry->d_sb->s_magic; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; return 0; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 14f24dfbfe30..a6fb509b7341 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -19,7 +19,7 @@ static void minix_read_inode(struct inode * inode); static int minix_write_inode(struct inode * inode, int wait); -static int minix_statfs(struct super_block *sb, struct kstatfs *buf); +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf); static int minix_remount (struct super_block * sb, int * flags, char * data); static void minix_delete_inode(struct inode *inode) @@ -296,11 +296,11 @@ out_bad_sb: return -EINVAL; } -static int minix_statfs(struct super_block *sb, struct kstatfs *buf) +static int minix_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct minix_sb_info *sbi = minix_sb(sb); - buf->f_type = sb->s_magic; - buf->f_bsize = sb->s_blocksize; + struct minix_sb_info *sbi = minix_sb(dentry->d_sb); + buf->f_type = dentry->d_sb->s_magic; + buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_blocks = (sbi->s_nzones - sbi->s_firstdatazone) << sbi->s_log_zone_size; buf->f_bfree = minix_count_free_blocks(sbi); buf->f_bavail = buf->f_bfree; diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 8db033fab3fd..90d2ea28f333 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -39,7 +39,7 @@ static void ncp_delete_inode(struct inode *); static void ncp_put_super(struct super_block *); -static int ncp_statfs(struct super_block *, struct kstatfs *); +static int ncp_statfs(struct dentry *, struct kstatfs *); static kmem_cache_t * ncp_inode_cachep; @@ -724,13 +724,14 @@ static void ncp_put_super(struct super_block *sb) kfree(server); } -static int ncp_statfs(struct super_block *sb, struct kstatfs *buf) +static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) { struct dentry* d; struct inode* i; struct ncp_inode_info* ni; struct ncp_server* s; struct ncp_volume_info vi; + struct super_block *sb = dentry->d_sb; int err; __u8 dh; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ff645a961bc8..937fbfc381bb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -65,7 +65,7 @@ static int nfs_write_inode(struct inode *,int); static void nfs_delete_inode(struct inode *); static void nfs_clear_inode(struct inode *); static void nfs_umount_begin(struct super_block *); -static int nfs_statfs(struct super_block *, struct kstatfs *); +static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); static int nfs_show_stats(struct seq_file *, struct vfsmount *); static void nfs_zap_acl_cache(struct inode *); @@ -534,8 +534,9 @@ nfs_fill_super(struct super_block *sb, struct nfs_mount_data *data, int silent) } static int -nfs_statfs(struct super_block *sb, struct kstatfs *buf) +nfs_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; struct nfs_server *server = NFS_SB(sb); unsigned char blockbits; unsigned long blockres; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index de3998f15f10..5446a0861d1d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1310,7 +1310,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, if ((bmval0 & (FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL)) || (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { - status = vfs_statfs(dentry->d_inode->i_sb, &statfs); + status = vfs_statfs(dentry, &statfs); if (status) goto out_nfserr; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 1d65f13f458c..245eaa1fb59b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1737,7 +1737,7 @@ int nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) { int err = fh_verify(rqstp, fhp, 0, MAY_NOP); - if (!err && vfs_statfs(fhp->fh_dentry->d_inode->i_sb,stat)) + if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d5d5e969294f..0e14acea3f8b 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2601,10 +2601,10 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, /** * ntfs_statfs - return information about mounted NTFS volume - * @sb: super block of mounted volume + * @dentry: dentry from mounted volume * @sfs: statfs structure in which to return the information * - * Return information about the mounted NTFS volume @sb in the statfs structure + * Return information about the mounted NTFS volume @dentry in the statfs structure * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is * called). We interpret the values to be correct of the moment in time at * which we are called. Most values are variable otherwise and this isn't just @@ -2617,8 +2617,9 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, * * Return 0 on success or -errno on error. */ -static int ntfs_statfs(struct super_block *sb, struct kstatfs *sfs) +static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs) { + struct super_block *sb = dentry->d_sb; s64 size; ntfs_volume *vol = NTFS_SB(sb); ntfs_inode *mft_ni = NTFS_I(vol->mft_ino); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 788b8b50dc4c..cdf73393f094 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -100,7 +100,7 @@ static int ocfs2_initialize_mem_caches(void); static void ocfs2_free_mem_caches(void); static void ocfs2_delete_osb(struct ocfs2_super *osb); -static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf); +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf); static int ocfs2_sync_fs(struct super_block *sb, int wait); @@ -857,7 +857,7 @@ static void ocfs2_put_super(struct super_block *sb) mlog_exit_void(); } -static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) +static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ocfs2_super *osb; u32 numbits, freebits; @@ -866,9 +866,9 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) struct buffer_head *bh = NULL; struct inode *inode = NULL; - mlog_entry("(%p, %p)\n", sb, buf); + mlog_entry("(%p, %p)\n", dentry->d_sb, buf); - osb = OCFS2_SB(sb); + osb = OCFS2_SB(dentry->d_sb); inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, @@ -891,7 +891,7 @@ static int ocfs2_statfs(struct super_block *sb, struct kstatfs *buf) freebits = numbits - le32_to_cpu(bm_lock->id1.bitmap1.i_used); buf->f_type = OCFS2_SUPER_MAGIC; - buf->f_bsize = sb->s_blocksize; + buf->f_bsize = dentry->d_sb->s_blocksize; buf->f_namelen = OCFS2_MAX_FILENAME_LEN; buf->f_blocks = ((sector_t) numbits) * (osb->s_clustersize >> osb->sb->s_blocksize_bits); diff --git a/fs/open.c b/fs/open.c index 4f178acd4c09..a37ff861108f 100644 --- a/fs/open.c +++ b/fs/open.c @@ -31,18 +31,18 @@ #include -int vfs_statfs(struct super_block *sb, struct kstatfs *buf) +int vfs_statfs(struct dentry *dentry, struct kstatfs *buf) { int retval = -ENODEV; - if (sb) { + if (dentry) { retval = -ENOSYS; - if (sb->s_op->statfs) { + if (dentry->d_sb->s_op->statfs) { memset(buf, 0, sizeof(*buf)); - retval = security_sb_statfs(sb); + retval = security_sb_statfs(dentry); if (retval) return retval; - retval = sb->s_op->statfs(sb, buf); + retval = dentry->d_sb->s_op->statfs(dentry, buf); if (retval == 0 && buf->f_frsize == 0) buf->f_frsize = buf->f_bsize; } @@ -52,12 +52,12 @@ int vfs_statfs(struct super_block *sb, struct kstatfs *buf) EXPORT_SYMBOL(vfs_statfs); -static int vfs_statfs_native(struct super_block *sb, struct statfs *buf) +static int vfs_statfs_native(struct dentry *dentry, struct statfs *buf) { struct kstatfs st; int retval; - retval = vfs_statfs(sb, &st); + retval = vfs_statfs(dentry, &st); if (retval) return retval; @@ -95,12 +95,12 @@ static int vfs_statfs_native(struct super_block *sb, struct statfs *buf) return 0; } -static int vfs_statfs64(struct super_block *sb, struct statfs64 *buf) +static int vfs_statfs64(struct dentry *dentry, struct statfs64 *buf) { struct kstatfs st; int retval; - retval = vfs_statfs(sb, &st); + retval = vfs_statfs(dentry, &st); if (retval) return retval; @@ -130,7 +130,7 @@ asmlinkage long sys_statfs(const char __user * path, struct statfs __user * buf) error = user_path_walk(path, &nd); if (!error) { struct statfs tmp; - error = vfs_statfs_native(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs_native(nd.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -149,7 +149,7 @@ asmlinkage long sys_statfs64(const char __user *path, size_t sz, struct statfs64 error = user_path_walk(path, &nd); if (!error) { struct statfs64 tmp; - error = vfs_statfs64(nd.dentry->d_inode->i_sb, &tmp); + error = vfs_statfs64(nd.dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_release(&nd); @@ -168,7 +168,7 @@ asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user * buf) file = fget(fd); if (!file) goto out; - error = vfs_statfs_native(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs_native(file->f_dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); @@ -189,7 +189,7 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz, struct statfs64 __user file = fget(fd); if (!file) goto out; - error = vfs_statfs64(file->f_dentry->d_inode->i_sb, &tmp); + error = vfs_statfs64(file->f_dentry, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index e6cca5cd4b44..2f24c46f72a1 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -128,7 +128,7 @@ static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_destroy_inode(struct inode *inode); static void qnx4_read_inode(struct inode *); static int qnx4_remount(struct super_block *sb, int *flags, char *data); -static int qnx4_statfs(struct super_block *, struct kstatfs *); +static int qnx4_statfs(struct dentry *, struct kstatfs *); static struct super_operations qnx4_sops = { @@ -282,8 +282,10 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock ) return block; } -static int qnx4_statfs(struct super_block *sb, struct kstatfs *buf) +static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + lock_kernel(); buf->f_type = sb->s_magic; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index f3ff41d33989..00f1321e9209 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -60,7 +60,7 @@ static int is_any_reiserfs_magic_string(struct reiserfs_super_block *rs) } static int reiserfs_remount(struct super_block *s, int *flags, char *data); -static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf); +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf); static int reiserfs_sync_fs(struct super_block *s, int wait) { @@ -1938,15 +1938,15 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return errval; } -static int reiserfs_statfs(struct super_block *s, struct kstatfs *buf) +static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(s); + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK(dentry->d_sb); buf->f_namelen = (REISERFS_MAX_NAME(s->s_blocksize)); buf->f_bfree = sb_free_blocks(rs); buf->f_bavail = buf->f_bfree; buf->f_blocks = sb_block_count(rs) - sb_bmap_nr(rs) - 1; - buf->f_bsize = s->s_blocksize; + buf->f_bsize = dentry->d_sb->s_blocksize; /* changed to accommodate gcc folks. */ buf->f_type = REISERFS_SUPER_MAGIC; return 0; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 4d6cd6621062..283fbc6b8eea 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -179,12 +179,12 @@ outnobh: /* That's simple too. */ static int -romfs_statfs(struct super_block *sb, struct kstatfs *buf) +romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { buf->f_type = ROMFS_MAGIC; buf->f_bsize = ROMBSIZE; buf->f_bfree = buf->f_bavail = buf->f_ffree; - buf->f_blocks = (romfs_maxsize(sb)+ROMBSIZE-1)>>ROMBSBITS; + buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS; buf->f_namelen = ROMFS_MAXFN; return 0; } diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 4a37c2bbfa3f..506ff87c1d4b 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -48,7 +48,7 @@ static void smb_delete_inode(struct inode *); static void smb_put_super(struct super_block *); -static int smb_statfs(struct super_block *, struct kstatfs *); +static int smb_statfs(struct dentry *, struct kstatfs *); static int smb_show_options(struct seq_file *, struct vfsmount *); static kmem_cache_t *smb_inode_cachep; @@ -641,13 +641,13 @@ out_no_server: } static int -smb_statfs(struct super_block *sb, struct kstatfs *buf) +smb_statfs(struct dentry *dentry, struct kstatfs *buf) { int result; lock_kernel(); - result = smb_proc_dskattr(sb, buf); + result = smb_proc_dskattr(dentry, buf); unlock_kernel(); diff --git a/fs/smbfs/proc.c b/fs/smbfs/proc.c index b1b878b81730..c3495059889d 100644 --- a/fs/smbfs/proc.c +++ b/fs/smbfs/proc.c @@ -3226,9 +3226,9 @@ smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr) } int -smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr) +smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr) { - struct smb_sb_info *server = SMB_SB(sb); + struct smb_sb_info *server = SMB_SB(dentry->d_sb); int result; char *p; long unit; diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h index 47664597e6b1..972ed7dad388 100644 --- a/fs/smbfs/proto.h +++ b/fs/smbfs/proto.h @@ -29,7 +29,7 @@ extern int smb_proc_getattr(struct dentry *dir, struct smb_fattr *fattr); extern int smb_proc_setattr(struct dentry *dir, struct smb_fattr *fattr); extern int smb_proc_setattr_unix(struct dentry *d, struct iattr *attr, unsigned int major, unsigned int minor); extern int smb_proc_settime(struct dentry *dentry, struct smb_fattr *fattr); -extern int smb_proc_dskattr(struct super_block *sb, struct kstatfs *attr); +extern int smb_proc_dskattr(struct dentry *dentry, struct kstatfs *attr); extern int smb_proc_read_link(struct smb_sb_info *server, struct dentry *d, char *buffer, int len); extern int smb_proc_symlink(struct smb_sb_info *server, struct dentry *d, const char *oldpath); extern int smb_proc_link(struct smb_sb_info *server, struct dentry *dentry, struct dentry *new_dentry); diff --git a/fs/super.c b/fs/super.c index 324c2d232f54..057b5325b7ef 100644 --- a/fs/super.c +++ b/fs/super.c @@ -486,7 +486,7 @@ asmlinkage long sys_ustat(unsigned dev, struct ustat __user * ubuf) s = user_get_super(new_decode_dev(dev)); if (s == NULL) goto out; - err = vfs_statfs(s, &sbuf); + err = vfs_statfs(s->s_root, &sbuf); drop_super(s); if (err) goto out; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3ff89cc5833a..58b2d22142ba 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -85,8 +85,9 @@ static void sysv_put_super(struct super_block *sb) kfree(sbi); } -static int sysv_statfs(struct super_block *sb, struct kstatfs *buf) +static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; struct sysv_sb_info *sbi = SYSV_SB(sb); buf->f_type = sb->s_magic; diff --git a/fs/udf/super.c b/fs/udf/super.c index 2250774a831d..44fe2cb0bbb2 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -91,7 +91,7 @@ static void udf_load_partdesc(struct super_block *, struct buffer_head *); static void udf_open_lvid(struct super_block *); static void udf_close_lvid(struct super_block *); static unsigned int udf_count_free(struct super_block *); -static int udf_statfs(struct super_block *, struct kstatfs *); +static int udf_statfs(struct dentry *, struct kstatfs *); /* UDF filesystem type */ static int udf_get_sb(struct file_system_type *fs_type, @@ -1779,8 +1779,10 @@ udf_put_super(struct super_block *sb) * Written, tested, and released. */ static int -udf_statfs(struct super_block *sb, struct kstatfs *buf) +udf_statfs(struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; + buf->f_type = UDF_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = UDF_SB_PARTLEN(sb, UDF_SB_PARTITION(sb)); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 768fb8d9e67a..fe5ab2aa2899 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1113,8 +1113,9 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return 0; } -static int ufs_statfs (struct super_block *sb, struct kstatfs *buf) +static int ufs_statfs (struct dentry *dentry, struct kstatfs *buf) { + struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block * usb; diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index d03c89a36655..4fb0fc65af34 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -684,10 +684,10 @@ xfs_fs_sync_super( STATIC int xfs_fs_statfs( - struct super_block *sb, + struct dentry *dentry, struct kstatfs *statp) { - return -bhv_vfs_statvfs(vfs_from_sb(sb), statp, NULL); + return -bhv_vfs_statvfs(vfs_from_sb(dentry->d_sb), statp, NULL); } STATIC int diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index d539262a8f89..98f6c52c152b 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -70,7 +70,7 @@ int venus_pioctl(struct super_block *sb, struct CodaFid *fid, unsigned int cmd, struct PioctlData *data); int coda_downcall(int opcode, union outputArgs *out, struct super_block *sb); int venus_fsync(struct super_block *sb, struct CodaFid *fid); -int venus_statfs(struct super_block *sb, struct kstatfs *sfs); +int venus_statfs(struct dentry *dentry, struct kstatfs *sfs); /* messages between coda filesystem in kernel and Venus */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 3e50dd24af87..c823a3815e24 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1096,7 +1096,7 @@ struct super_operations { int (*sync_fs)(struct super_block *sb, int wait); void (*write_super_lockfs) (struct super_block *); void (*unlockfs) (struct super_block *); - int (*statfs) (struct super_block *, struct kstatfs *); + int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); @@ -1325,7 +1325,7 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *, struct vfsmount *); -extern int vfs_statfs(struct super_block *, struct kstatfs *); +extern int vfs_statfs(struct dentry *, struct kstatfs *); /* /sys/fs */ extern struct subsystem fs_subsys; @@ -1746,7 +1746,7 @@ extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, void *, filldir_t); extern int simple_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int simple_statfs(struct super_block *, struct kstatfs *); +extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); diff --git a/include/linux/mount.h b/include/linux/mount.h index b7472ae91fa4..60718f12caa9 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -17,6 +17,11 @@ #include #include +struct super_block; +struct vfsmount; +struct dentry; +struct namespace; + #define MNT_NOSUID 0x01 #define MNT_NODEV 0x02 #define MNT_NOEXEC 0x04 diff --git a/include/linux/security.h b/include/linux/security.h index 47722d355532..383c320fc834 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -171,9 +171,9 @@ struct swap_info_struct; * Deallocate and clear the sb->s_security field. * @sb contains the super_block structure to be modified. * @sb_statfs: - * Check permission before obtaining filesystem statistics for the @sb - * filesystem. - * @sb contains the super_block structure for the filesystem. + * Check permission before obtaining filesystem statistics for the @mnt + * mountpoint. + * @dentry is a handle on the superblock for the filesystem. * Return 0 if permission is granted. * @sb_mount: * Check permission before an object specified by @dev_name is mounted on @@ -1127,7 +1127,7 @@ struct security_operations { int (*sb_copy_data)(struct file_system_type *type, void *orig, void *copy); int (*sb_kern_mount) (struct super_block *sb, void *data); - int (*sb_statfs) (struct super_block * sb); + int (*sb_statfs) (struct dentry *dentry); int (*sb_mount) (char *dev_name, struct nameidata * nd, char *type, unsigned long flags, void *data); int (*sb_check_sb) (struct vfsmount * mnt, struct nameidata * nd); @@ -1450,9 +1450,9 @@ static inline int security_sb_kern_mount (struct super_block *sb, void *data) return security_ops->sb_kern_mount (sb, data); } -static inline int security_sb_statfs (struct super_block *sb) +static inline int security_sb_statfs (struct dentry *dentry) { - return security_ops->sb_statfs (sb); + return security_ops->sb_statfs (dentry); } static inline int security_sb_mount (char *dev_name, struct nameidata *nd, @@ -2162,7 +2162,7 @@ static inline int security_sb_kern_mount (struct super_block *sb, void *data) return 0; } -static inline int security_sb_statfs (struct super_block *sb) +static inline int security_sb_statfs (struct dentry *dentry) { return 0; } diff --git a/kernel/acct.c b/kernel/acct.c index b327f4d20104..6802020e0ceb 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + if (vfs_statfs(file->f_dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; diff --git a/mm/shmem.c b/mm/shmem.c index 7617bb1c6bf7..10020d8b4073 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1654,9 +1654,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, return desc.error; } -static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) +static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; diff --git a/security/dummy.c b/security/dummy.c index 6de4a4a5eb13..c98d553984ec 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -191,7 +191,7 @@ static int dummy_sb_kern_mount (struct super_block *sb, void *data) return 0; } -static int dummy_sb_statfs (struct super_block *sb) +static int dummy_sb_statfs (struct dentry *dentry) { return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 524915dfda64..093efba4d9b6 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1903,13 +1903,13 @@ static int selinux_sb_kern_mount(struct super_block *sb, void *data) return superblock_has_perm(current, sb, FILESYSTEM__MOUNT, &ad); } -static int selinux_sb_statfs(struct super_block *sb) +static int selinux_sb_statfs(struct dentry *dentry) { struct avc_audit_data ad; AVC_AUDIT_DATA_INIT(&ad,FS); - ad.u.fs.dentry = sb->s_root; - return superblock_has_perm(current, sb, FILESYSTEM__GETATTR, &ad); + ad.u.fs.dentry = dentry->d_sb->s_root; + return superblock_has_perm(current, dentry->d_sb, FILESYSTEM__GETATTR, &ad); } static int selinux_mount(char * dev_name, -- cgit v1.2.3 From fadd8fbd153c12963f8fe3c9ef7f8967f286f98b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 23 Jun 2006 02:03:13 -0700 Subject: [PATCH] support for panic at OOM This patch adds panic_on_oom sysctl under sys.vm. When sysctl vm.panic_on_oom = 1, the kernel panics intead of killing rogue processes. And if vm.panic_on_oom is 0 the kernel will do oom_kill() in the same way as it does today. Of course, the default value is 0 and only root can modifies it. In general, oom_killer works well and kill rogue processes. So the whole system can survive. But there are environments where panic is preferable rather than kill some processes. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 13 +++++++++++++ include/linux/sysctl.h | 1 + kernel/sysctl.c | 9 +++++++++ mm/oom_kill.c | 3 +++ 4 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index a46c10fcddfc..2dc246af4885 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -29,6 +29,7 @@ Currently, these files are in /proc/sys/vm: - drop-caches - zone_reclaim_mode - zone_reclaim_interval +- panic_on_oom ============================================================== @@ -178,3 +179,15 @@ Time is set in seconds and set by default to 30 seconds. Reduce the interval if undesired off node allocations occur. However, too frequent scans will have a negative impact onoff node allocation performance. +============================================================= + +panic_on_oom + +This enables or disables panic on out-of-memory feature. If this is set to 1, +the kernel panics when out-of-memory happens. If this is set to 0, the kernel +will kill some rogue process, called oom_killer. Usually, oom_killer can kill +rogue processes and system will survive. If you want to panic the system +rather than killing rogue processes, set this to 1. + +The default value is 0. + diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index cee944dbdcd4..c7132029af0f 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -186,6 +186,7 @@ enum VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */ VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */ + VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0d656e61621d..072ac446810a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern int sysctl_panic_on_oom; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; @@ -701,6 +702,14 @@ static ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = VM_PANIC_ON_OOM, + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 042e6436c3ee..f9bb3cf32384 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -22,6 +22,7 @@ #include #include +int sysctl_panic_on_oom; /* #define DEBUG */ /** @@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) break; case CONSTRAINT_NONE: + if (sysctl_panic_on_oom) + panic("out of memory. panic_on_oom is selected\n"); retry: /* * Rambo mode: Shoot down a process and hope it solves whatever -- cgit v1.2.3 From d6277db4ab271862ed599da08d78961c70f00002 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jun 2006 02:03:18 -0700 Subject: [PATCH] swsusp: rework memory shrinker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rework the swsusp's memory shrinker in the following way: - Simplify balance_pgdat() by removing all of the swsusp-related code from it. - Make shrink_all_memory() use shrink_slab() and a new function shrink_all_zones() which calls shrink_active_list() and shrink_inactive_list() directly for each zone in a way that's optimized for suspend. In shrink_all_memory() we try to free exactly as many pages as the caller asks for, preferably in one shot, starting from easier targets.  If slab caches are huge, they are most likely to have enough pages to reclaim.  The inactive lists are next (the zones with more inactive pages go first) etc. Each time shrink_all_memory() attempts to shrink the active and inactive lists for each zone in 5 passes.  In the first pass, only the inactive lists are taken into consideration.  In the next two passes the active lists are also shrunk, but mapped pages are not reclaimed.  In the last two passes the active and inactive lists are shrunk and mapped pages are reclaimed as well. The aim of this is to alter the reclaim logic to choose the best pages to keep on resume and improve the responsiveness of the resumed system. Signed-off-by: Rafael J. Wysocki Signed-off-by: Con Kolivas Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 10 ++- mm/vmscan.c | 219 +++++++++++++++++++++++++++++++++++++------------- 2 files changed, 172 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c4016cbbd3e0..f9238faf76e4 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) */ #define SHRINK_BITE 10000 +static inline unsigned long __shrink_memory(long tmp) +{ + if (tmp > SHRINK_BITE) + tmp = SHRINK_BITE; + return shrink_all_memory(tmp); +} int swsusp_shrink_memory(void) { @@ -195,12 +201,12 @@ int swsusp_shrink_memory(void) if (!is_highmem(zone)) tmp -= zone->free_pages; if (tmp > 0) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(tmp); if (!tmp) return -ENOMEM; pages += tmp; } else if (size > image_size / PAGE_SIZE) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); pages += tmp; } printk("\b%c", p[i++%4]); diff --git a/mm/vmscan.c b/mm/vmscan.c index 440a733fe2e9..46be8a02280e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -61,6 +61,8 @@ struct scan_control { * In this context, it doesn't matter that we scan the * whole list at once. */ int swap_cluster_max; + + int swappiness; }; /* @@ -741,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * A 100% value of vm_swappiness overrides this algorithm * altogether. */ - swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; + swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; /* * Now use this metric to decide whether to start moving mapped @@ -957,6 +959,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, .may_swap = 1, + .swappiness = vm_swappiness, }; inc_page_state(allocstall); @@ -1021,10 +1024,6 @@ out: * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at pages_high. * - * If `nr_pages' is non-zero then it is the number of pages which are to be - * reclaimed, regardless of the zone occupancies. This is a software suspend - * special. - * * Returns the number of pages which were actually freed. * * There is special handling here for zones which are full of pinned pages. @@ -1042,10 +1041,8 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, - int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { - unsigned long to_free = nr_pages; int all_zones_ok; int priority; int i; @@ -1055,7 +1052,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_swap = 1, - .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = vm_swappiness, }; loop_again: @@ -1082,31 +1080,26 @@ loop_again: all_zones_ok = 1; - if (nr_pages == 0) { - /* - * Scan in the highmem->dma direction for the highest - * zone which needs scanning - */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; + /* + * Scan in the highmem->dma direction for the highest + * zone which needs scanning + */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { + struct zone *zone = pgdat->node_zones + i; - if (!populated_zone(zone)) - continue; + if (!populated_zone(zone)) + continue; - if (zone->all_unreclaimable && - priority != DEF_PRIORITY) - continue; + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; - if (!zone_watermark_ok(zone, order, - zone->pages_high, 0, 0)) { - end_zone = i; - goto scan; - } + if (!zone_watermark_ok(zone, order, zone->pages_high, + 0, 0)) { + end_zone = i; + goto scan; } - goto out; - } else { - end_zone = pgdat->nr_zones - 1; } + goto out; scan: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; @@ -1133,11 +1126,9 @@ scan: if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - if (nr_pages == 0) { /* Not software suspend */ - if (!zone_watermark_ok(zone, order, - zone->pages_high, end_zone, 0)) - all_zones_ok = 0; - } + if (!zone_watermark_ok(zone, order, zone->pages_high, + end_zone, 0)) + all_zones_ok = 0; zone->temp_priority = priority; if (zone->prev_priority > priority) zone->prev_priority = priority; @@ -1162,8 +1153,6 @@ scan: total_scanned > nr_reclaimed + nr_reclaimed / 2) sc.may_writepage = 1; } - if (nr_pages && to_free > nr_reclaimed) - continue; /* swsusp: need to do more work */ if (all_zones_ok) break; /* kswapd: all done */ /* @@ -1179,7 +1168,7 @@ scan: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) + if (nr_reclaimed >= SWAP_CLUSTER_MAX) break; } out: @@ -1261,7 +1250,7 @@ static int kswapd(void *p) } finish_wait(&pgdat->kswapd_wait, &wait); - balance_pgdat(pgdat, 0, order); + balance_pgdat(pgdat, order); } return 0; } @@ -1290,35 +1279,154 @@ void wakeup_kswapd(struct zone *zone, int order) #ifdef CONFIG_PM /* - * Try to free `nr_pages' of memory, system-wide. Returns the number of freed - * pages. + * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages + * from LRU lists system-wide, for given pass and priority, and returns the + * number of reclaimed pages + * + * For pass > 3 we also try to shrink the LRU lists that contain a few pages + */ +static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, + int prio, struct scan_control *sc) +{ + struct zone *zone; + unsigned long nr_to_scan, ret = 0; + + for_each_zone(zone) { + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && prio != DEF_PRIORITY) + continue; + + /* For pass = 0 we don't shrink the active list */ + if (pass > 0) { + zone->nr_scan_active += (zone->nr_active >> prio) + 1; + if (zone->nr_scan_active >= nr_pages || pass > 3) { + zone->nr_scan_active = 0; + nr_to_scan = min(nr_pages, zone->nr_active); + shrink_active_list(nr_to_scan, zone, sc); + } + } + + zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; + if (zone->nr_scan_inactive >= nr_pages || pass > 3) { + zone->nr_scan_inactive = 0; + nr_to_scan = min(nr_pages, zone->nr_inactive); + ret += shrink_inactive_list(nr_to_scan, zone, sc); + if (ret >= nr_pages) + return ret; + } + } + + return ret; +} + +/* + * Try to free `nr_pages' of memory, system-wide, and return the number of + * freed pages. + * + * Rather than trying to age LRUs the aim is to preserve the overall + * LRU order by reclaiming preferentially + * inactive > active > active referenced > active mapped */ unsigned long shrink_all_memory(unsigned long nr_pages) { - pg_data_t *pgdat; - unsigned long nr_to_free = nr_pages; + unsigned long lru_pages, nr_slab; unsigned long ret = 0; - unsigned retry = 2; - struct reclaim_state reclaim_state = { - .reclaimed_slab = 0, + int pass; + struct reclaim_state reclaim_state; + struct zone *zone; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 0, + .swap_cluster_max = nr_pages, + .may_writepage = 1, + .swappiness = vm_swappiness, }; current->reclaim_state = &reclaim_state; -repeat: - for_each_online_pgdat(pgdat) { - unsigned long freed; - freed = balance_pgdat(pgdat, nr_to_free, 0); - ret += freed; - nr_to_free -= freed; - if ((long)nr_to_free <= 0) + lru_pages = 0; + for_each_zone(zone) + lru_pages += zone->nr_active + zone->nr_inactive; + + nr_slab = read_page_state(nr_slab); + /* If slab caches are huge, it's better to hit them first */ + while (nr_slab >= lru_pages) { + reclaim_state.reclaimed_slab = 0; + shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + if (!reclaim_state.reclaimed_slab) break; + + ret += reclaim_state.reclaimed_slab; + if (ret >= nr_pages) + goto out; + + nr_slab -= reclaim_state.reclaimed_slab; } - if (retry-- && ret < nr_pages) { - blk_congestion_wait(WRITE, HZ/5); - goto repeat; + + /* + * We try to shrink LRUs in 5 passes: + * 0 = Reclaim from inactive_list only + * 1 = Reclaim from active list but don't reclaim mapped + * 2 = 2nd pass of type 1 + * 3 = Reclaim mapped (normal reclaim) + * 4 = 2nd pass of type 3 + */ + for (pass = 0; pass < 5; pass++) { + int prio; + + /* Needed for shrinking slab caches later on */ + if (!lru_pages) + for_each_zone(zone) { + lru_pages += zone->nr_active; + lru_pages += zone->nr_inactive; + } + + /* Force reclaiming mapped pages in the passes #3 and #4 */ + if (pass > 2) { + sc.may_swap = 1; + sc.swappiness = 100; + } + + for (prio = DEF_PRIORITY; prio >= 0; prio--) { + unsigned long nr_to_scan = nr_pages - ret; + + sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_scanned = 0; + + ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); + if (ret >= nr_pages) + goto out; + + reclaim_state.reclaimed_slab = 0; + shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); + ret += reclaim_state.reclaimed_slab; + if (ret >= nr_pages) + goto out; + + if (sc.nr_scanned && prio < DEF_PRIORITY - 2) + blk_congestion_wait(WRITE, HZ / 10); + } + + lru_pages = 0; } + + /* + * If ret = 0, we could not shrink LRUs, but there may be something + * in slab caches + */ + if (!ret) + do { + reclaim_state.reclaimed_slab = 0; + shrink_slab(nr_pages, sc.gfp_mask, lru_pages); + ret += reclaim_state.reclaimed_slab; + } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + +out: current->reclaim_state = NULL; + return ret; } #endif @@ -1416,6 +1524,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, + .swappiness = vm_swappiness, }; disable_swap_token(); -- cgit v1.2.3 From 742755a1d8ce2b548428f7aacf1758b4bba50080 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:55 -0700 Subject: [PATCH] page migration: sys_move_pages(): support moving of individual pages move_pages() is used to move individual pages of a process. The function can be used to determine the location of pages and to move them onto the desired node. move_pages() returns status information for each page. long move_pages(pid, number_of_pages_to_move, addresses_of_pages[], nodes[] or NULL, status[], flags); The addresses of pages is an array of void * pointing to the pages to be moved. The nodes array contains the node numbers that the pages should be moved to. If a NULL is passed instead of an array then no pages are moved but the status array is updated. The status request may be used to determine the page state before issuing another move_pages() to move pages. The status array will contain the state of all individual page migration attempts when the function terminates. The status array is only valid if move_pages() completed successfullly. Possible page states in status[]: 0..MAX_NUMNODES The page is now on the indicated node. -ENOENT Page is not present -EACCES Page is mapped by multiple processes and can only be moved if MPOL_MF_MOVE_ALL is specified. -EPERM The page has been mlocked by a process/driver and cannot be moved. -EBUSY Page is busy and cannot be moved. Try again later. -EFAULT Invalid address (no VMA or zero page). -ENOMEM Unable to allocate memory on target node. -EIO Unable to write back page. The page must be written back in order to move it since the page is dirty and the filesystem does not provide a migration function that would allow the moving of dirty pages. -EINVAL A dirty page cannot be moved. The filesystem does not provide a migration function and has no ability to write back pages. The flags parameter indicates what types of pages to move: MPOL_MF_MOVE Move pages that are only mapped by the process. MPOL_MF_MOVE_ALL Also move pages that are mapped by multiple processes. Requires sufficient capabilities. Possible return codes from move_pages() -ENOENT No pages found that would require moving. All pages are either already on the target node, not present, had an invalid address or could not be moved because they were mapped by multiple processes. -EINVAL Flags other than MPOL_MF_MOVE(_ALL) specified or an attempt to migrate pages in a kernel thread. -EPERM MPOL_MF_MOVE_ALL specified without sufficient priviledges. or an attempt to move a process belonging to another user. -EACCES One of the target nodes is not allowed by the current cpuset. -ENODEV One of the target nodes is not online. -ESRCH Process does not exist. -E2BIG Too many pages to move. -ENOMEM Not enough memory to allocate control array. -EFAULT Parameters could not be accessed. A test program for move_pages() may be found with the patches on ftp.kernel.org:/pub/linux/kernel/people/christoph/pmig/patches-2.6.17-rc4-mm3 From: Christoph Lameter Detailed results for sys_move_pages() Pass a pointer to an integer to get_new_page() that may be used to indicate where the completion status of a migration operation should be placed. This allows sys_move_pags() to report back exactly what happened to each page. Wish there would be a better way to do this. Looks a bit hacky. Signed-off-by: Christoph Lameter Cc: Hugh Dickins Cc: Jes Sorensen Cc: KAMEZAWA Hiroyuki Cc: Lee Schermerhorn Cc: Andi Kleen Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/page_migration | 29 ++--- arch/ia64/kernel/entry.S | 2 +- include/asm-ia64/unistd.h | 2 +- include/linux/migrate.h | 2 +- include/linux/syscalls.h | 5 + kernel/sys_ni.c | 1 + mm/mempolicy.c | 4 +- mm/migrate.c | 268 +++++++++++++++++++++++++++++++++++++++- 8 files changed, 288 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/Documentation/vm/page_migration b/Documentation/vm/page_migration index 0a5d5fb18854..99f89aa10169 100644 --- a/Documentation/vm/page_migration +++ b/Documentation/vm/page_migration @@ -26,8 +26,13 @@ a process are located. See also the numa_maps manpage in the numactl package. Manual migration is useful if for example the scheduler has relocated a process to a processor on a distant node. A batch scheduler or an administrator may detect the situation and move the pages of the process -nearer to the new processor. At some point in the future we may have -some mechanism in the scheduler that will automatically move the pages. +nearer to the new processor. The kernel itself does only provide +manual page migration support. Automatic page migration may be implemented +through user space processes that move pages. A special function call +"move_pages" allows the moving of individual pages within a process. +A NUMA profiler may f.e. obtain a log showing frequent off node +accesses and may use the result to move pages to more advantageous +locations. Larger installations usually partition the system using cpusets into sections of nodes. Paul Jackson has equipped cpusets with the ability to @@ -62,22 +67,14 @@ A. In kernel use of migrate_pages() It also prevents the swapper or other scans to encounter the page. -2. Generate a list of newly allocates pages. These pages will contain the - contents of the pages from the first list after page migration is - complete. +2. We need to have a function of type new_page_t that can be + passed to migrate_pages(). This function should figure out + how to allocate the correct new page given the old page. 3. The migrate_pages() function is called which attempts - to do the migration. It returns the moved pages in the - list specified as the third parameter and the failed - migrations in the fourth parameter. When the function - returns the first list will contain the pages that could still be retried. - -4. The leftover pages of various types are returned - to the LRU using putback_to_lru_pages() or otherwise - disposed of. The pages will still have the refcount as - increased by isolate_lru_pages() if putback_to_lru_pages() is not - used! The kernel may want to handle the various cases of failures in - different ways. + to do the migration. It will call the function to allocate + the new page for each page that is considered for + moving. B. How migrate_pages() works ---------------------------- diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index bcb80ca5cf40..32c999f58d12 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1584,7 +1584,7 @@ sys_call_table: data8 sys_keyctl data8 sys_ioprio_set data8 sys_ioprio_get // 1275 - data8 sys_ni_syscall + data8 sys_move_pages data8 sys_inotify_init data8 sys_inotify_add_watch data8 sys_inotify_rm_watch diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index 632f2eedf72c..bb0eb727dcd0 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -265,7 +265,7 @@ #define __NR_keyctl 1273 #define __NR_ioprio_set 1274 #define __NR_ioprio_get 1275 -/* 1276 is available for reuse (was briefly sys_set_zone_reclaim) */ +#define __NR_move_pages 1276 #define __NR_inotify_init 1277 #define __NR_inotify_add_watch 1278 #define __NR_inotify_rm_watch 1279 diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 5b95d6568dc4..5dba23a1c0d0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -3,7 +3,7 @@ #include -typedef struct page *new_page_t(struct page *, unsigned long private); +typedef struct page *new_page_t(struct page *, unsigned long private, int **); #ifdef CONFIG_MIGRATION extern int isolate_lru_page(struct page *p, struct list_head *pagelist); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index bd67a4413df7..7e3f23490918 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -516,6 +516,11 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *from, const unsigned long __user *to); +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, + int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f1..597229749dec 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f432642e9e66..05b84acf0bb3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -588,7 +588,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, isolate_lru_page(page, pagelist); } -static struct page *new_node_page(struct page *page, unsigned long node) +static struct page *new_node_page(struct page *page, unsigned long node, int **x) { return alloc_pages_node(node, GFP_HIGHUSER, 0); } @@ -698,7 +698,7 @@ int do_migrate_pages(struct mm_struct *mm, } -static struct page *new_vma_page(struct page *page, unsigned long private) +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) { struct vm_area_struct *vma = (struct vm_area_struct *)private; diff --git a/mm/migrate.c b/mm/migrate.c index 251a8d158257..033a12f4c949 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -62,9 +64,8 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) } /* - * migrate_prep() needs to be called after we have compiled the list of pages - * to be migrated using isolate_lru_page() but before we begin a series of calls - * to migrate_pages(). + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). */ int migrate_prep(void) { @@ -588,7 +589,8 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, struct page *page, int force) { int rc = 0; - struct page *newpage = get_new_page(page, private); + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); if (!newpage) return -ENOMEM; @@ -642,6 +644,12 @@ move_newpage: * then this will free the page. */ move_to_lru(newpage); + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(newpage); + } return rc; } @@ -710,3 +718,255 @@ out: return nr_failed + retry; } +#ifdef CONFIG_NUMA +/* + * Move a list of individual pages + */ +struct page_to_node { + unsigned long addr; + struct page *page; + int node; + int status; +}; + +static struct page *new_page_node(struct page *p, unsigned long private, + int **result) +{ + struct page_to_node *pm = (struct page_to_node *)private; + + while (pm->node != MAX_NUMNODES && pm->page != p) + pm++; + + if (pm->node == MAX_NUMNODES) + return NULL; + + *result = &pm->status; + + return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); +} + +/* + * Move a set of pages as indicated in the pm array. The addr + * field must be set to the virtual address of the page to be moved + * and the node number must contain a valid target node. + */ +static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, + int migrate_all) +{ + int err; + struct page_to_node *pp; + LIST_HEAD(pagelist); + + down_read(&mm->mmap_sem); + + /* + * Build a list of pages to migrate + */ + migrate_prep(); + for (pp = pm; pp->node != MAX_NUMNODES; pp++) { + struct vm_area_struct *vma; + struct page *page; + + /* + * A valid page pointer that will not match any of the + * pages that will be moved. + */ + pp->page = ZERO_PAGE(0); + + err = -EFAULT; + vma = find_vma(mm, pp->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pp->addr, FOLL_GET); + err = -ENOENT; + if (!page) + goto set_status; + + if (PageReserved(page)) /* Check for zero page */ + goto put_and_set; + + pp->page = page; + err = page_to_nid(page); + + if (err == pp->node) + /* + * Node already in the right place + */ + goto put_and_set; + + err = -EACCES; + if (page_mapcount(page) > 1 && + !migrate_all) + goto put_and_set; + + err = isolate_lru_page(page, &pagelist); +put_and_set: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +set_status: + pp->status = err; + } + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_page_node, + (unsigned long)pm); + else + err = -ENOENT; + + up_read(&mm->mmap_sem); + return err; +} + +/* + * Determine the nodes of a list of pages. The addr in the pm array + * must have been set to the virtual address of which we want to determine + * the node number. + */ +static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +{ + down_read(&mm->mmap_sem); + + for ( ; pm->node != MAX_NUMNODES; pm++) { + struct vm_area_struct *vma; + struct page *page; + int err; + + err = -EFAULT; + vma = find_vma(mm, pm->addr); + if (!vma) + goto set_status; + + page = follow_page(vma, pm->addr, 0); + err = -ENOENT; + /* Use PageReserved to check for zero page */ + if (!page || PageReserved(page)) + goto set_status; + + err = page_to_nid(page); +set_status: + pm->status = err; + } + + up_read(&mm->mmap_sem); + return 0; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) +{ + int err = 0; + int i; + struct task_struct *task; + nodemask_t task_nodes; + struct mm_struct *mm; + struct page_to_node *pm = NULL; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + return -EINVAL; + + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out2; + } + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out2; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out2; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out; + + err = -ENODEV; + if (!node_online(node)) + goto out; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out; + + pm[i].node = node; + } + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + if (nodes) + err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); + else + err = do_pages_stat(mm, pm); + + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out: + vfree(pm); +out2: + mmput(mm); + return err; +} +#endif + -- cgit v1.2.3 From 1b2db9fb7adc4d67d9ce7d16ce79c41ee84730fe Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:56 -0700 Subject: [PATCH] sys_move_pages: 32bit support (i386, x86_64) sys_move_pages() support for 32bit (i386 plus x86_64 compat layer) Add support for move_pages() on i386 and also add the compat functions necessary to run 32 bit binaries on x86_64. Add compat_sys_move_pages to the x86_64 32bit binary layer. Note that it is not up to date so I added the missing pieces. Not sure if this is done the right way. [akpm@osdl.org: compile fix] Signed-off-by: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/syscall_table.S | 1 + arch/x86_64/ia32/ia32entry.S | 1 + include/asm-i386/unistd.h | 3 ++- include/linux/syscalls.h | 5 +++++ kernel/compat.c | 23 +++++++++++++++++++++++ kernel/sys_ni.c | 1 + 6 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index af56987f69b0..dd63d4775398 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -316,3 +316,4 @@ ENTRY(sys_call_table) .long sys_sync_file_range .long sys_tee /* 315 */ .long sys_vmsplice + .long sys_move_pages diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S index 5a92fed2d1d5..4ec594ab1a98 100644 --- a/arch/x86_64/ia32/ia32entry.S +++ b/arch/x86_64/ia32/ia32entry.S @@ -696,4 +696,5 @@ ia32_sys_call_table: .quad sys_sync_file_range .quad sys_tee .quad compat_sys_vmsplice + .quad compat_sys_move_pages ia32_syscall_end: diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index de2ccc149e34..fc1c8ddae149 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -322,10 +322,11 @@ #define __NR_sync_file_range 314 #define __NR_tee 315 #define __NR_vmsplice 316 +#define __NR_move_pages 317 #ifdef __KERNEL__ -#define NR_syscalls 317 +#define NR_syscalls 318 /* * user-visible error numbers are in the range -1 - -128: see diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7e3f23490918..e42738c69166 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -521,6 +521,11 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags); +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, + void __user *pages, + const int __user *nodes, + int __user *status, + int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, unsigned long __user *nmask, diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d8..ccea93e28954 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -934,3 +935,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) return ret; } + +#ifdef CONFIG_NUMA +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, + void __user *pages32, + const int __user *nodes, + int __user *status, + int flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, (compat_uptr_t *)(pages32 + i)) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 597229749dec..6991bece67e8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -133,3 +133,4 @@ cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); +cond_syscall(compat_sys_move_pages); -- cgit v1.2.3 From 9216dfad4fc97ab639ef0885efc713f3d7a20d5b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 23 Jun 2006 02:03:57 -0700 Subject: [PATCH] move_pages: fix 32 -> 64 bit compat function The definition of the third parameter is a pointer to an array of virtual addresses which give us some trouble. The existing code calculated the wrong address in the array since I used void to avoid having to specify a type. I now use the correct type "compat_uptr_t __user *" in the definition of the function in kernel/compat.c. However, I used __u32 in syscalls.h. Would have to include compat.h there in order to provide the same definition which would generate an ugly include situation. On both ia64 and x86_64 compat_uptr_t is u32. So this works although parameter declarations differ. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/syscalls.h | 2 +- kernel/compat.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index e42738c69166..33785b79d548 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -522,7 +522,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, int __user *status, int flags); asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_page, - void __user *pages, + __u32 __user *pages, const int __user *nodes, int __user *status, int flags); diff --git a/kernel/compat.c b/kernel/compat.c index ccea93e28954..2f672332430f 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -938,7 +938,7 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) #ifdef CONFIG_NUMA asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, - void __user *pages32, + compat_uptr_t __user *pages32, const int __user *nodes, int __user *status, int flags) @@ -950,7 +950,7 @@ asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, for (i = 0; i < nr_pages; i++) { compat_uptr_t p; - if (get_user(p, (compat_uptr_t *)(pages32 + i)) || + if (get_user(p, pages32 + i) || put_user(compat_ptr(p), pages + i)) return -EFAULT; } -- cgit v1.2.3 From e7834f8fccd791225a1cf91c2c3e740ad8e2e145 Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 23 Jun 2006 02:03:59 -0700 Subject: [PATCH] SELinux: add security hooks to {get,set}affinity This patch adds LSM hooks into the setaffinity and getaffinity functions to enable security modules to control these operations between tasks with task_setscheduler and task_getscheduler LSM hooks. Signed-off-by: David Quigley Acked-by: Stephen Smalley Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c13f1bd2df7d..87665132cec1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3886,6 +3886,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) !capable(CAP_SYS_NICE)) goto out_unlock; + retval = security_task_setscheduler(p, 0, NULL); + if (retval) + goto out_unlock; + cpus_allowed = cpuset_cpus_allowed(p); cpus_and(new_mask, new_mask, cpus_allowed); retval = set_cpus_allowed(p, new_mask); @@ -3954,7 +3958,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (!p) goto out_unlock; - retval = 0; + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + cpus_and(*mask, p->cpus_allowed, cpu_online_map); out_unlock: -- cgit v1.2.3 From 22fb52dd736a62e24c44c50739007496265dc38c Mon Sep 17 00:00:00 2001 From: David Quigley Date: Fri, 23 Jun 2006 02:04:00 -0700 Subject: [PATCH] SELinux: add security hook call to mediate attach_task (kernel/cpuset.c) Add a security hook call to enable security modules to control the ability to attach a task to a cpuset. While limited control over this operation is possible via permission checks on the pseudo fs interface, those checks are not sufficient to control access to the target task, which is looked up in this function. The existing task_setscheduler hook is re-used for this operation since this falls under the same class of operations. Signed-off-by: David Quigley Acked-by: Stephen Smalley Signed-off-by: James Morris Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 77f45ffd5ea1..b602f73fb38d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; + int retval; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + retval = security_task_setscheduler(tsk, 0, NULL); + if (retval) { + put_task_struct(tsk); + return retval; + } + mutex_lock(&callback_mutex); task_lock(tsk); -- cgit v1.2.3 From 1b61b910e99059abdd54c93aa70e84e076e33d16 Mon Sep 17 00:00:00 2001 From: Zhang Yanmin Date: Fri, 23 Jun 2006 02:04:22 -0700 Subject: [PATCH] x86: kernel irq balance doesn't work On i386, kernel irq balance doesn't work. 1) In function do_irq_balance, after kernel finds the min_loaded cpu but before calling set_pending_irq to really pin the selected_irq to the target cpu, kernel does a cpus_and with irq_affinity[selected_irq]. Later on, when the irq is acked, kernel would calls move_native_irq=>desc->handler->set_affinity to change the irq affinity. However, every function pointed by hw_interrupt_type->set_affinity(unsigned int irq, cpumask_t cpumask) always changes irq_affinity[irq] to cpumask. Next time when recalling do_irq_balance, it has to do cpu_ands again with irq_affinity[selected_irq], but irq_affinity[selected_irq] already becomes one cpu selected by the first irq balance. 2) Function balance_irq in file arch/i386/kernel/io_apic.c has the same issue. [akpm@osdl.org: cleanups] Signed-off-by: Zhang Yanmin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/io_apic.c | 31 ++++++++++++++++++++----------- include/linux/irq.h | 8 ++++++++ kernel/irq/proc.c | 3 +++ 3 files changed, 31 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index d70f2ade5cde..a62df3e764c5 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -267,7 +267,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # include /* kmalloc() */ # include /* time_after() */ -# ifdef CONFIG_BALANCED_IRQ_DEBUG +#ifdef CONFIG_BALANCED_IRQ_DEBUG # define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0) # define Dprintk(x...) do { TDprintk(x); } while (0) # else @@ -275,10 +275,15 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) # define Dprintk(x...) # endif - #define IRQBALANCE_CHECK_ARCH -999 -static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH; -static int physical_balance = 0; +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) +#define BALANCED_IRQ_MORE_DELTA (HZ/10) +#define BALANCED_IRQ_LESS_DELTA (HZ) + +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; +static int physical_balance __read_mostly; +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; static struct irq_cpu_info { unsigned long * last_irq; @@ -297,12 +302,14 @@ static struct irq_cpu_info { #define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) +static cpumask_t balance_irq_affinity[NR_IRQS] = { + [0 ... NR_IRQS-1] = CPU_MASK_ALL +}; -static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL; +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) +{ + balance_irq_affinity[irq] = mask; +} static unsigned long move(int curr_cpu, cpumask_t allowed_mask, unsigned long now, int direction) @@ -340,7 +347,7 @@ static inline void balance_irq(int cpu, int irq) if (irqbalance_disabled) return; - cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]); + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); new_cpu = move(cpu, allowed_mask, now, 1); if (cpu != new_cpu) { set_pending_irq(irq, cpumask_of_cpu(new_cpu)); @@ -529,7 +536,9 @@ tryanotherirq: } } - cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]); + cpus_and(allowed_mask, + cpu_online_map, + balance_irq_affinity[selected_irq]); target_cpu_mask = cpumask_of_cpu(min_loaded); cpus_and(tmp, target_cpu_mask, allowed_mask); diff --git a/include/linux/irq.h b/include/linux/irq.h index 42c9cd562860..e8a07e75e4fb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -164,6 +164,14 @@ static inline void set_irq_info(int irq, cpumask_t mask) #endif // CONFIG_SMP +#ifdef CONFIG_IRQBALANCE +extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask); +#else +static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) +{ +} +#endif + extern int no_irq_affinity; extern int noirqdebug_setup(char *str); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce0..afacd6f585fa 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; #ifdef CONFIG_GENERIC_PENDING_IRQ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); + /* * Save these away for later use. Re-progam when the * interrupt is pending @@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) #else void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } -- cgit v1.2.3 From ce4ab0012b32c1a4a1d6e934aeb73bf3151c48d9 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 23 Jun 2006 02:04:44 -0700 Subject: [PATCH] swsusp: add architecture special saveable pages support 1. Add architecture specific pages save/restore support. Next two patches will use this to save/restore 'ACPI NVS' pages. 2. Allow reserved pages 'nosave'. This could avoid save/restore BIOS reserved pages. Signed-off-by: Shaohua Li Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/suspend.h | 1 + kernel/power/power.h | 4 ++ kernel/power/snapshot.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++- kernel/power/swsusp.c | 18 ++------ 4 files changed, 118 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 96e31aa64cc7..e82cb10fb3ea 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -71,6 +71,7 @@ struct saved_context; void __save_processor_state(struct saved_context *ctxt); void __restore_processor_state(struct saved_context *ctxt); unsigned long get_safe_page(gfp_t gfp_mask); +int swsusp_add_arch_pages(unsigned long start, unsigned long end); /* * XXX: We try to keep some more pages free so that I/O operations succeed diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f21767..c81f0ed3eeba 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -105,6 +105,10 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); +extern unsigned int count_special_pages(void); +extern int save_special_mem(void); +extern int restore_special_mem(void); + extern int swsusp_check(void); extern int swsusp_shrink_memory(void); extern void swsusp_free(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b78..7f511d89c667 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -39,6 +39,88 @@ static unsigned int nr_copy_pages; static unsigned int nr_meta_pages; static unsigned long *buffer; +struct arch_saveable_page { + unsigned long start; + unsigned long end; + char *data; + struct arch_saveable_page *next; +}; +static struct arch_saveable_page *arch_pages; + +int swsusp_add_arch_pages(unsigned long start, unsigned long end) +{ + struct arch_saveable_page *tmp; + + while (start < end) { + tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + tmp->start = start; + tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT; + if (tmp->end > end) + tmp->end = end; + tmp->next = arch_pages; + start = tmp->end; + arch_pages = tmp; + } + return 0; +} + +static unsigned int count_arch_pages(void) +{ + unsigned int count = 0; + struct arch_saveable_page *tmp = arch_pages; + while (tmp) { + count++; + tmp = tmp->next; + } + return count; +} + +static int save_arch_mem(void) +{ + char *kaddr; + struct arch_saveable_page *tmp = arch_pages; + int offset; + + pr_debug("swsusp: Saving arch specific memory"); + while (tmp) { + tmp->data = (char *)__get_free_page(GFP_ATOMIC); + if (!tmp->data) + return -ENOMEM; + offset = tmp->start - (tmp->start & PAGE_MASK); + /* arch pages might haven't a 'struct page' */ + kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); + memcpy(tmp->data + offset, kaddr + offset, + tmp->end - tmp->start); + kunmap_atomic(kaddr, KM_USER0); + + tmp = tmp->next; + } + return 0; +} + +static int restore_arch_mem(void) +{ + char *kaddr; + struct arch_saveable_page *tmp = arch_pages; + int offset; + + while (tmp) { + if (!tmp->data) + continue; + offset = tmp->start - (tmp->start & PAGE_MASK); + kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0); + memcpy(kaddr + offset, tmp->data + offset, + tmp->end - tmp->start); + kunmap_atomic(kaddr, KM_USER0); + free_page((long)tmp->data); + tmp->data = NULL; + tmp = tmp->next; + } + return 0; +} + #ifdef CONFIG_HIGHMEM unsigned int count_highmem_pages(void) { @@ -150,8 +232,35 @@ int restore_highmem(void) } return 0; } +#else +static unsigned int count_highmem_pages(void) {return 0;} +static int save_highmem(void) {return 0;} +static int restore_highmem(void) {return 0;} #endif +unsigned int count_special_pages(void) +{ + return count_arch_pages() + count_highmem_pages(); +} + +int save_special_mem(void) +{ + int ret; + ret = save_arch_mem(); + if (!ret) + ret = save_highmem(); + return ret; +} + +int restore_special_mem(void) +{ + int ret; + ret = restore_arch_mem(); + if (!ret) + ret = restore_highmem(); + return ret; +} + static int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; @@ -177,7 +286,6 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) return 0; page = pfn_to_page(pfn); - BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; if (PageReserved(page) && pfn_is_nosave(pfn)) diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f9238faf76e4..78b6e71b0813 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -62,16 +62,6 @@ unsigned long image_size = 500 * 1024 * 1024; int in_suspend __nosavedata = 0; -#ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void); -int save_highmem(void); -int restore_highmem(void); -#else -static int save_highmem(void) { return 0; } -static int restore_highmem(void) { return 0; } -static unsigned int count_highmem_pages(void) { return 0; } -#endif - /** * The following functions are used for tracing the allocated * swap pages, so that they can be freed in case of an error. @@ -192,7 +182,7 @@ int swsusp_shrink_memory(void) printk("Shrinking memory... "); do { - size = 2 * count_highmem_pages(); + size = 2 * count_special_pages(); size += size / 50 + count_data_pages(); size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + PAGES_FOR_IO; @@ -234,7 +224,7 @@ int swsusp_suspend(void) goto Enable_irqs; } - if ((error = save_highmem())) { + if ((error = save_special_mem())) { printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); goto Restore_highmem; } @@ -245,7 +235,7 @@ int swsusp_suspend(void) /* Restore control flow magically appears here */ restore_processor_state(); Restore_highmem: - restore_highmem(); + restore_special_mem(); device_power_up(); Enable_irqs: local_irq_enable(); @@ -271,7 +261,7 @@ int swsusp_resume(void) */ swsusp_free(); restore_processor_state(); - restore_highmem(); + restore_special_mem(); touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); -- cgit v1.2.3 From a938c356d5b007fe6d28251c0ddbf6c11d0d92b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jun 2006 02:04:46 -0700 Subject: [PATCH] swsusp: take lowmem reserves into account swsusp allocates memory from the normal zone, so it cannot use lowmem reserve pages from the lower zones. Therefore it should not count these pages as available to it. Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 78b6e71b0813..f0ee4e7780d6 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -188,8 +188,10 @@ int swsusp_shrink_memory(void) PAGES_FOR_IO; tmp = size; for_each_zone (zone) - if (!is_highmem(zone)) + if (!is_highmem(zone) && populated_zone(zone)) { tmp -= zone->free_pages; + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + } if (tmp > 0) { tmp = __shrink_memory(tmp); if (!tmp) -- cgit v1.2.3 From 7bff24e255ee11ecbc304315a252fcbd84f9ffce Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 23 Jun 2006 02:04:47 -0700 Subject: [PATCH] kernel/power/snapshot.c: cleanups - make needlessly global functions static - make dummy functions static inline Signed-off-by: Adrian Bunk Acked-by: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 7f511d89c667..513eef3391a0 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -122,7 +122,7 @@ static int restore_arch_mem(void) } #ifdef CONFIG_HIGHMEM -unsigned int count_highmem_pages(void) +static unsigned int count_highmem_pages(void) { struct zone *zone; unsigned long zone_pfn; @@ -199,7 +199,7 @@ static int save_highmem_zone(struct zone *zone) return 0; } -int save_highmem(void) +static int save_highmem(void) { struct zone *zone; int res = 0; @@ -216,7 +216,7 @@ int save_highmem(void) return 0; } -int restore_highmem(void) +static int restore_highmem(void) { printk("swsusp: Restoring Highmem\n"); while (highmem_copy) { @@ -233,9 +233,9 @@ int restore_highmem(void) return 0; } #else -static unsigned int count_highmem_pages(void) {return 0;} -static int save_highmem(void) {return 0;} -static int restore_highmem(void) {return 0;} +static inline unsigned int count_highmem_pages(void) {return 0;} +static inline int save_highmem(void) {return 0;} +static inline int restore_highmem(void) {return 0;} #endif unsigned int count_special_pages(void) @@ -482,7 +482,8 @@ unsigned long get_safe_page(gfp_t gfp_mask) * On each page we set up a list of struct_pbe elements. */ -struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) +static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, + int safe_needed) { unsigned int num; struct pbe *pblist, *pbe; -- cgit v1.2.3 From 968808b8956e332e556b1eae9b4f7df77518f53b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 23 Jun 2006 02:04:48 -0700 Subject: [PATCH] swsusp: use less memory during resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make swsusp allocate only as much memory as needed to store the image data and metadata during resume. Without this patch swsusp additionally allocates many page frames that will conflict with the "original" locations of the image data and are considered as "unsafe", treating them as "eaten" pages (ie. allocated but unusable). The patch makes swsusp allocate as many pages as it'll need to store the data read from the image in one shot, creating a list of allocated "safe" pages, and use the observation that all pages allocated by it are marked with the PG_nosave and PG_nosave_free flags set.  Namely, when it's about to load an image page, swsusp can check whether the page frame corresponding to the "original" location of this page has been allocated (ie. if the page frame has the PG_nosave and PG_nosave_free flags set) and if so, it can load the page directly into this page frame.  Otherwise it uses an allocated "safe" page from the list to store the data that will be copied to their "original" location later on. This allows us to save many page copyings and page allocations during resume and in the future it may allow us to load images greater than 50% of the normal zone. Signed-off-by: Rafael J. Wysocki Acked-by: "Pavel Machek" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 141 ++++++++++++++++++++++++++++-------------------- 2 files changed, 85 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index c81f0ed3eeba..98c41423f3b1 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -55,7 +55,7 @@ struct snapshot_handle { unsigned int page; unsigned int page_offset; unsigned int prev; - struct pbe *pbe; + struct pbe *pbe, *last_pbe; void *buffer; unsigned int buf_offset; }; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 513eef3391a0..3d9284100b22 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -401,62 +401,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } -/** - * On resume it is necessary to trace and eventually free the unsafe - * pages that have been allocated, because they are needed for I/O - * (on x86-64 we likely will "eat" these pages once again while - * creating the temporary page translation tables) - */ - -struct eaten_page { - struct eaten_page *next; - char padding[PAGE_SIZE - sizeof(void *)]; -}; - -static struct eaten_page *eaten_pages = NULL; - -static void release_eaten_pages(void) -{ - struct eaten_page *p, *q; - - p = eaten_pages; - while (p) { - q = p->next; - /* We don't want swsusp_free() to free this page again */ - ClearPageNosave(virt_to_page(p)); - free_page((unsigned long)p); - p = q; - } - eaten_pages = NULL; -} +static unsigned int unsafe_pages; /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * which had been used before suspend. + * used before suspend. * * The unsafe pages are marked with the PG_nosave_free flag - * - * Allocated but unusable (ie eaten) memory pages should be marked - * so that swsusp_free() can release them + * and we count them using unsafe_pages */ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) { void *res; + res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - do { + while (res && PageNosaveFree(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) { - /* This is for swsusp_free() */ - SetPageNosave(virt_to_page(res)); - ((struct eaten_page *)res)->next = eaten_pages; - eaten_pages = res; - } - } while (res && PageNosaveFree(virt_to_page(res))); - else - res = (void *)get_zeroed_page(gfp_mask); + } if (res) { SetPageNosave(virt_to_page(res)); SetPageNosaveFree(virt_to_page(res)); @@ -751,6 +718,8 @@ static int mark_unsafe_pages(struct pbe *pblist) return -EFAULT; } + unsafe_pages = 0; + return 0; } @@ -828,42 +797,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, } /** - * create_image - use metadata contained in the PBE list + * prepare_image - use metadata contained in the PBE list * pointed to by pagedir_nosave to mark the pages that will * be overwritten in the process of restoring the system - * memory state from the image and allocate memory for - * the image avoiding these pages + * memory state from the image ("unsafe" pages) and allocate + * memory for the image + * + * The idea is to allocate the PBE list first and then + * allocate as many pages as it's needed for the image data, + * but not to assign these pages to the PBEs initially. + * Instead, we just mark them as allocated and create a list + * of "safe" which will be used later */ -static int create_image(struct snapshot_handle *handle) +struct safe_page { + struct safe_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct safe_page *safe_pages; + +static int prepare_image(struct snapshot_handle *handle) { int error = 0; - struct pbe *p, *pblist; + unsigned int nr_pages = nr_copy_pages; + struct pbe *p, *pblist = NULL; p = pagedir_nosave; error = mark_unsafe_pages(p); if (!error) { - pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); if (pblist) copy_page_backup_list(pblist, p); free_pagedir(p, 0); if (!pblist) error = -ENOMEM; } - if (!error) - error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + safe_pages = NULL; + if (!error && nr_pages > unsafe_pages) { + nr_pages -= unsafe_pages; + while (nr_pages--) { + struct safe_page *ptr; + + ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); + if (!ptr) { + error = -ENOMEM; + break; + } + if (!PageNosaveFree(virt_to_page(ptr))) { + /* The page is "safe", add it to the list */ + ptr->next = safe_pages; + safe_pages = ptr; + } + /* Mark the page as allocated */ + SetPageNosave(virt_to_page(ptr)); + SetPageNosaveFree(virt_to_page(ptr)); + } + } if (!error) { - release_eaten_pages(); pagedir_nosave = pblist; } else { - pagedir_nosave = NULL; handle->pbe = NULL; - nr_copy_pages = 0; - nr_meta_pages = 0; + swsusp_free(); } return error; } +static void *get_buffer(struct snapshot_handle *handle) +{ + struct pbe *pbe = handle->pbe, *last = handle->last_pbe; + struct page *page = virt_to_page(pbe->orig_address); + + if (PageNosave(page) && PageNosaveFree(page)) { + /* + * We have allocated the "original" page frame and we can + * use it directly to store the read page + */ + pbe->address = 0; + if (last && last->next) + last->next = NULL; + return (void *)pbe->orig_address; + } + /* + * The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the read page + */ + pbe->address = (unsigned long)safe_pages; + safe_pages = safe_pages->next; + if (last) + last->next = pbe; + handle->last_pbe = pbe; + return (void *)pbe->address; +} + /** * snapshot_write_next - used for writing the system memory snapshot. * @@ -908,15 +934,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) } else if (handle->prev <= nr_meta_pages) { handle->pbe = unpack_orig_addresses(buffer, handle->pbe); if (!handle->pbe) { - error = create_image(handle); + error = prepare_image(handle); if (error) return error; handle->pbe = pagedir_nosave; - handle->buffer = (void *)handle->pbe->address; + handle->last_pbe = NULL; + handle->buffer = get_buffer(handle); } } else { handle->pbe = handle->pbe->next; - handle->buffer = (void *)handle->pbe->address; + handle->buffer = get_buffer(handle); } handle->prev = handle->page; } -- cgit v1.2.3 From c330dda908b5a46469a997eea90b66f2f9f02b34 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Jun 2006 02:05:07 -0700 Subject: [PATCH] Add a sysfs file to determine if a kexec kernel is loaded Create two files in /sys/kernel, kexec_loaded and kexec_crash_loaded. Each file contains a simple boolean value indicating whether the relevant kernel has been loaded into memory. The motivation for this is geared around support. Signed-off-by: Jeff Moyer Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kexec.h | 1 + kernel/kexec.c | 6 +++--- kernel/ksysfs.c | 19 +++++++++++++++++++ 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/kexec.h b/include/linux/kexec.h index cfb3410e32b1..6427949ddf99 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -106,6 +106,7 @@ extern struct page *kimage_alloc_control_pages(struct kimage *image, extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); extern struct kimage *kexec_image; +extern struct kimage *kexec_crash_image; #define KEXEC_ON_CRASH 0x00000001 #define KEXEC_ARCH_MASK 0xffff0000 diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0e..58f0f382597c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image = NULL; -static struct kimage *kexec_crash_image = NULL; +struct kimage *kexec_image; +struct kimage *kexec_crash_image; /* * A home grown binary mutex. * Nothing can wait so this mutex is safe to use * in interrupt context :) */ -static int kexec_lock = 0; +static int kexec_lock; asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67b..9e28478a17a5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include #include #include +#include #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_KEXEC +static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_image); +} +KERNEL_ATTR_RO(kexec_loaded); + +static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_crash_image); +} +KERNEL_ATTR_RO(kexec_crash_loaded); +#endif /* CONFIG_KEXEC */ + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -55,6 +70,10 @@ static struct attribute * kernel_attrs[] = { #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, +#endif +#ifdef CONFIG_KEXEC + &kexec_loaded_attr.attr, + &kexec_crash_loaded_attr.attr, #endif NULL }; -- cgit v1.2.3 From 0ae26f1b3159f3acb21ae1e866c3c7e16edd450f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Jun 2006 02:05:15 -0700 Subject: [PATCH] mmput() might sleep exit_aio() and exit_mmap() can sleep. But it's easy to accidentally call mmput() from inside locks. Cc: Dave Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ac8100e3088a..195958a3a4d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm) */ void mmput(struct mm_struct *mm) { + might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); -- cgit v1.2.3 From 6e6672604773b9bae44d88d38afdf0763c104b1c Mon Sep 17 00:00:00 2001 From: Prasanna Meda Date: Fri, 23 Jun 2006 02:05:23 -0700 Subject: [PATCH] dup fd error fix Set errorp in dup_fd, it will be used in sys_unshare also. Signed-off-by: Prasanna Meda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 195958a3a4d6..49adc0e8d47c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -625,6 +625,7 @@ out: /* * Allocate a new files structure and copy contents from the * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. */ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { @@ -633,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) int open_files, size, i, expand; struct fdtable *old_fdt, *new_fdt; + *errorp = -ENOMEM; newf = alloc_files(); if (!newf) goto out; @@ -746,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * break this. */ tsk->files = NULL; - error = -ENOMEM; newf = dup_fd(oldf, &error); if (!newf) goto out; -- cgit v1.2.3 From 8e0a43d8fa953179505869ec28de78550246e795 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 23 Jun 2006 02:05:23 -0700 Subject: [PATCH] cond_resched() might_sleep() fix add the __might_sleep() check back to cond_resched(). Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 87665132cec1..5dbc42694477 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4053,6 +4053,9 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + __might_sleep(__FILE__, __LINE__); +#endif /* * The BKS might be reacquired before we have dropped * PREEMPT_ACTIVE, which could trigger a second -- cgit v1.2.3 From 89d0cf01c0aa9e8241cc3703a359ecd6abf3c28a Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Fri, 23 Jun 2006 02:05:29 -0700 Subject: [PATCH] invert irq/migration.c brach prediction If you get to that point in the code it means that desc->move_irq is set, pending_irq_cpumask[irq] and cpu_online_map should have a value. Still pretty good chance anding those two you'll still have a value. So these two branch predictors should be inverted. Signed-off-by: Daniel Walker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/migration.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 134f9f2e0e39..a12d00eb5e7c 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -30,7 +30,7 @@ void move_native_irq(int irq) desc->move_irq = 0; - if (likely(cpus_empty(pending_irq_cpumask[irq]))) + if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) return; if (!desc->handler->set_affinity) @@ -49,7 +49,7 @@ void move_native_irq(int irq) * cause some ioapics to mal-function. * Being paranoid i guess! */ - if (unlikely(!cpus_empty(tmp))) { + if (likely(!cpus_empty(tmp))) { if (likely(!(desc->status & IRQ_DISABLED))) desc->handler->disable(irq); -- cgit v1.2.3 From 83d4e6e7fba0b2a01092f0cf14ba2e33bd1253e9 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Fri, 23 Jun 2006 02:05:32 -0700 Subject: [PATCH] make noirqdebug/irqfixup __read_mostly, add (un)likely() Signed-off-by: Andreas Mohr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/irq.c | 4 ++-- kernel/irq/spurious.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index bcc19fbb32df..ec20f8935e8b 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -52,7 +52,7 @@ */ #define MAX_IRQ_CNT 100000 -static int noirqdebug; +static int noirqdebug __read_mostly; static volatile unsigned long irq_err_count; static DEFINE_SPINLOCK(irq_controller_lock); static LIST_HEAD(irq_pending); @@ -81,7 +81,7 @@ irqreturn_t no_action(int irq, void *dev_id, struct pt_regs *regs) void do_bad_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs) { - irq_err_count += 1; + irq_err_count++; printk(KERN_ERR "IRQ: spurious interrupt %d\n", irq); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7df9abd5ec86..b2fb3c18d06b 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -11,7 +11,7 @@ #include #include -static int irqfixup; +static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. @@ -136,9 +136,9 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, struct pt_regs *regs) { - if (action_ret != IRQ_HANDLED) { + if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; - if (action_ret != IRQ_NONE) + if (unlikely(action_ret != IRQ_NONE)) report_bad_irq(irq, desc, action_ret); } @@ -152,11 +152,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, } desc->irq_count++; - if (desc->irq_count < 100000) + if (likely(desc->irq_count < 100000)) return; desc->irq_count = 0; - if (desc->irqs_unhandled > 99900) { + if (unlikely(desc->irqs_unhandled > 99900)) { /* * The interrupt is stuck */ @@ -171,7 +171,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, desc->irqs_unhandled = 0; } -int noirqdebug; +int noirqdebug __read_mostly; int __init noirqdebug_setup(char *str) { -- cgit v1.2.3 From 57ae2508610d50893cb3e3bbb869ff70ff724a2a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Jun 2006 02:05:47 -0700 Subject: [PATCH] CONFIG_NET=n build fix Cc: Greg KH Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 072ac446810a..eb8bd214e7d7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -399,7 +399,7 @@ static ctl_table kern_table[] = { .strategy = &sysctl_string, }, #endif -#ifdef CONFIG_HOTPLUG +#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", -- cgit v1.2.3 From 55f4e8d156d23709739029afb108932ef94cac94 Mon Sep 17 00:00:00 2001 From: Jes Sorensen Date: Fri, 23 Jun 2006 02:05:50 -0700 Subject: [PATCH] kernel/sys.c doesn't need init.h kernel/sys.c doesn't have anything in it relying on linux/init.h - remove the include. Signed-off-by: Jes Sorensen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a57a00597ce0..90930b28d2ca 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From d83015b8f62ee3fcd338f6f009051ed57f77a531 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jun 2006 02:05:51 -0700 Subject: [PATCH] Make RCU API inaccessible to non-GPL Linux kernel modules Remove synchronize_kernel() (deprecated 2-APR-2005 in http://lkml.org/lkml/2005/4/3/11) and makes the RCU API inaccessible to non-GPL Linux kernel modules (as was announced more than one year ago in http://lkml.org/lkml/2005/4/3/8). Tested on x86 and ppc64. Signed-off-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/RCU/whatisRCU.txt | 1 - Documentation/feature-removal-schedule.txt | 15 --------------- include/linux/rcupdate.h | 3 +-- kernel/rcupdate.c | 13 ++----------- 4 files changed, 3 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 07cb93b82ba9..6e459420ee9f 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -790,7 +790,6 @@ RCU pointer update: RCU grace period: - synchronize_kernel (deprecated) synchronize_net synchronize_sched synchronize_rcu diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index f7293297f326..027285d0c26c 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -33,21 +33,6 @@ Who: Adrian Bunk --------------------------- -What: RCU API moves to EXPORT_SYMBOL_GPL -When: April 2006 -Files: include/linux/rcupdate.h, kernel/rcupdate.c -Why: Outside of Linux, the only implementations of anything even - vaguely resembling RCU that I am aware of are in DYNIX/ptx, - VM/XA, Tornado, and K42. I do not expect anyone to port binary - drivers or kernel modules from any of these, since the first two - are owned by IBM and the last two are open-source research OSes. - So these will move to GPL after a grace period to allow - people, who might be using implementations that I am not aware - of, to adjust to this upcoming change. -Who: Paul E. McKenney - ---------------------------- - What: raw1394: requests of type RAW1394_REQ_ISO_SEND, RAW1394_REQ_ISO_LISTEN When: November 2006 Why: Deprecated in favour of the new ioctl-based rawiso interface, which is diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 970284f571a6..6312758393b6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -246,7 +246,7 @@ extern int rcu_needs_cpu(int cpu); * softirq handlers will have completed, since in some kernels, these * handlers can run in process context, and can block. * - * This primitive provides the guarantees made by the (deprecated) + * This primitive provides the guarantees made by the (now removed) * synchronize_kernel() API. In contrast, synchronize_rcu() only * guarantees that rcu_read_lock() sections will have completed. * In "classic RCU", these two guarantees happen to be one and @@ -264,7 +264,6 @@ extern void FASTCALL(call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *head))); extern void FASTCALL(call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *head))); -extern __deprecated_for_modules void synchronize_kernel(void); extern void synchronize_rcu(void); void synchronize_idle(void); extern void rcu_barrier(void); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bbb..20e9710fc21c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -612,14 +612,6 @@ void synchronize_rcu(void) wait_for_completion(&rcu.completion); } -/* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. - */ -void synchronize_kernel(void) -{ - synchronize_rcu(); -} - module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); @@ -627,7 +619,6 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ -EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); -EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ -- cgit v1.2.3 From 862f5f0133f1c8a179dd93adc03d43f8f7e8bac5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 23 Jun 2006 02:05:52 -0700 Subject: [PATCH] Doc: add audit & acct to DocBook Fix one audit kernel-doc description (one parameter was missing). Add audit*.c interfaces to DocBook. Add BSD accounting interfaces to DocBook. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DocBook/kernel-api.tmpl | 12 ++++++++++++ kernel/auditsc.c | 1 + 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index 6dab3dd36995..31b727ceb127 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -332,6 +332,18 @@ X!Earch/i386/kernel/mca.c !Esecurity/security.c + + Audit Interfaces +!Ekernel/audit.c +!Ikernel/auditsc.c +!Ikernel/auditfilter.c + + + + Accounting Framework +!Ikernel/acct.c + + Power Management !Ekernel/power/pm.c diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b097ccb4eb7e..9ebd96fda295 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1558,6 +1558,7 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) + * @ipcp: in-kernel IPC permissions * * Returns 0 for success or NULL context or < 0 on error. */ -- cgit v1.2.3 From 626ab0e69d376fa07599af669af8ba92d58e87c1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Jun 2006 02:05:55 -0700 Subject: [PATCH] list: use list_replace_init() instead of list_splice_init() list_splice_init(list, head) does unneeded job if it is known that list_empty(head) == 1. We can use list_replace_init() instead. Signed-off-by: Oleg Nesterov Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/pageattr.c | 8 ++++---- block/ll_rw_blk.c | 5 ++--- fs/aio.c | 4 ++-- kernel/timer.c | 8 ++++---- kernel/workqueue.c | 4 ++-- net/core/dev.c | 6 +++--- net/core/link_watch.c | 5 ++--- 7 files changed, 19 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index 92c3d9f0e731..0887b34bc59b 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -209,19 +209,19 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot) } void global_flush_tlb(void) -{ - LIST_HEAD(l); +{ + struct list_head l; struct page *pg, *next; BUG_ON(irqs_disabled()); spin_lock_irq(&cpa_lock); - list_splice_init(&df_list, &l); + list_replace_init(&df_list, &l); spin_unlock_irq(&cpa_lock); flush_map(); list_for_each_entry_safe(pg, next, &l, lru) __free_page(pg); -} +} #ifdef CONFIG_DEBUG_PAGEALLOC void kernel_map_pages(struct page *page, int numpages, int enable) diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c index 7eb36c53f4b7..465b54312c59 100644 --- a/block/ll_rw_blk.c +++ b/block/ll_rw_blk.c @@ -3359,12 +3359,11 @@ EXPORT_SYMBOL(end_that_request_chunk); */ static void blk_done_softirq(struct softirq_action *h) { - struct list_head *cpu_list; - LIST_HEAD(local_list); + struct list_head *cpu_list, local_list; local_irq_disable(); cpu_list = &__get_cpu_var(blk_cpu_done); - list_splice_init(cpu_list, &local_list); + list_replace_init(cpu_list, &local_list); local_irq_enable(); while (!list_empty(&local_list)) { diff --git a/fs/aio.c b/fs/aio.c index e41e932ba489..8c34a62df7d7 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -777,11 +777,11 @@ out: static int __aio_run_iocbs(struct kioctx *ctx) { struct kiocb *iocb; - LIST_HEAD(run_list); + struct list_head run_list; assert_spin_locked(&ctx->ctx_lock); - list_splice_init(&ctx->run_list, &run_list); + list_replace_init(&ctx->run_list, &run_list); while (!list_empty(&run_list)) { iocb = list_entry(run_list.next, struct kiocb, ki_run_list); diff --git a/kernel/timer.c b/kernel/timer.c index 9e49deed468c..3bf0e9ed2dbe 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -419,10 +419,10 @@ static inline void __run_timers(tvec_base_t *base) spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list = LIST_HEAD_INIT(work_list); + struct list_head work_list; struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + /* * Cascade timers: */ @@ -431,8 +431,8 @@ static inline void __run_timers(tvec_base_t *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_splice_init(base->tv1.vec + index, &work_list); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 880fb415a8f6..740c5abceb07 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -531,11 +531,11 @@ int current_is_keventd(void) static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - LIST_HEAD(list); + struct list_head list; struct work_struct *work; spin_lock_irq(&cwq->lock); - list_splice_init(&cwq->worklist, &list); + list_replace_init(&cwq->worklist, &list); while (!list_empty(&list)) { printk("Taking work for %s\n", wq->name); diff --git a/net/core/dev.c b/net/core/dev.c index ab39fe17cb58..195a5e96b2d1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2980,7 +2980,7 @@ static void netdev_wait_allrefs(struct net_device *dev) static DEFINE_MUTEX(net_todo_run_mutex); void netdev_run_todo(void) { - struct list_head list = LIST_HEAD_INIT(list); + struct list_head list; /* Need to guard against multiple cpu's getting out of order. */ mutex_lock(&net_todo_run_mutex); @@ -2995,9 +2995,9 @@ void netdev_run_todo(void) /* Snapshot list, allow later requests */ spin_lock(&net_todo_list_lock); - list_splice_init(&net_todo_list, &list); + list_replace_init(&net_todo_list, &list); spin_unlock(&net_todo_list_lock); - + while (!list_empty(&list)) { struct net_device *dev = list_entry(list.next, struct net_device, todo_list); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 646937cc2d84..0f37266411b5 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -91,11 +91,10 @@ static void rfc2863_policy(struct net_device *dev) /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) { - LIST_HEAD(head); - struct list_head *n, *next; + struct list_head head, *n, *next; spin_lock_irq(&lweventlist_lock); - list_splice_init(&lweventlist, &head); + list_replace_init(&lweventlist, &head); spin_unlock_irq(&lweventlist_lock); list_for_each_safe(n, next, &head) { -- cgit v1.2.3 From 3439dd86e34580384d3b58cf8d54a9283cd7a342 Mon Sep 17 00:00:00 2001 From: Porpoise Date: Fri, 23 Jun 2006 02:05:56 -0700 Subject: [PATCH] When CONFIG_BASE_SMALL=1, cascade() may enter an infinite loop When CONFIG_BASE_SAMLL=1, cascade() in may enter the infinite loop. Because of CONFIG_BASE_SMALL=1(TVR_BITS=6 and TVN_BITS=4), the list base->tv5 may cascade into base->tv5. So, the kernel enters the infinite loop in the function cascade(). I created a test module to verify this bug, and a patch to fix it. #include #include #include #include #if 0 #include #else #define kdb_printf printk #endif #define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) #define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8) #define TVN_SIZE (1 << TVN_BITS) #define TVR_SIZE (1 << TVR_BITS) #define TVN_MASK (TVN_SIZE - 1) #define TVR_MASK (TVR_SIZE - 1) #define TV_SIZE(N) (N*TVN_BITS + TVR_BITS) struct timer_list timer0; struct timer_list dummy_timer1; struct timer_list dummy_timer2; void dummy_timer_fun(unsigned long data) { } unsigned long j=0; void check_timer_base(unsigned long data) { kdb_printf("check_timer_base %08x\n",jiffies); mod_timer(&timer0,(jiffies & (~0xFFF)) + 0x1FFF); } int init_module(void) { init_timer(&timer0); timer0.data = (unsigned long)0; timer0.function = check_timer_base; mod_timer(&timer0,jiffies+1); init_timer(&dummy_timer1); dummy_timer1.data = (unsigned long)0; dummy_timer1.function = dummy_timer_fun; init_timer(&dummy_timer2); dummy_timer2.data = (unsigned long)0; dummy_timer2.function = dummy_timer_fun; j=jiffies; j&=(~((1< Cc: Matt Mackall Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index 3bf0e9ed2dbe..f35b3939e937 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(tvec_base_t *base, tvec_t *tv, int index) { /* cascade all the timers from tv up one level */ - struct list_head *head, *curr; + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); - head = tv->vec + index; - curr = head->next; /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. */ - while (curr != head) { - struct timer_list *tmp; - - tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); - curr = curr->next; - internal_add_timer(base, tmp); + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(timer->base != base); + internal_add_timer(base, timer); } - INIT_LIST_HEAD(head); return index; } -- cgit v1.2.3 From 908dcecda1d18803b5823f30e6c47d2882dc0cf1 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 23 Jun 2006 02:06:00 -0700 Subject: [PATCH] adjust handle_IRR_event() return type Correct the return type of handle_IRQ_event() (inconsistency noticed during Xen development), and remove redundant declarations. The return type adjustment required breaking out the definition of irqreturn_t into a separate header, in order to satisfy current include order dependencies. Signed-off-by: Jan Beulich Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Russell King Cc: Ian Molton Cc: Mikael Starvik Cc: Yoshinori Sato Cc: Hirokazu Takata Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: William Lee Irwin III Cc: "David S. Miller" Cc: Miles Bader Cc: Geert Uytterhoeven Cc: Roman Zippel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-alpha/irq.h | 4 ---- include/asm-arm/irq.h | 4 ---- include/asm-arm26/irq.h | 4 ---- include/asm-h8300/irq.h | 4 ---- include/asm-m68k/irq.h | 4 ---- include/asm-m68knommu/irq.h | 4 ---- include/asm-s390/irq.h | 4 ---- include/asm-sparc/irq.h | 4 ---- include/asm-v850/irq.h | 2 -- include/linux/interrupt.h | 21 +-------------------- include/linux/irq.h | 3 ++- include/linux/irqreturn.h | 25 +++++++++++++++++++++++++ kernel/irq/handle.c | 5 +++-- 13 files changed, 31 insertions(+), 57 deletions(-) create mode 100644 include/linux/irqreturn.h (limited to 'kernel') diff --git a/include/asm-alpha/irq.h b/include/asm-alpha/irq.h index f6de033718a0..917b9fe372cf 100644 --- a/include/asm-alpha/irq.h +++ b/include/asm-alpha/irq.h @@ -92,8 +92,4 @@ extern void enable_irq(unsigned int); struct pt_regs; extern void (*perf_irq)(unsigned long, struct pt_regs *); -struct irqaction; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - - #endif /* _ALPHA_IRQ_H */ diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h index 60b5105c9c93..66e67e60bc56 100644 --- a/include/asm-arm/irq.h +++ b/include/asm-arm/irq.h @@ -47,10 +47,6 @@ void disable_irq_wake(unsigned int irq); void enable_irq_wake(unsigned int irq); int setup_irq(unsigned int, struct irqaction *); -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - extern void migrate_irqs(void); #endif diff --git a/include/asm-arm26/irq.h b/include/asm-arm26/irq.h index 06bd5a543d13..9aaac87efba9 100644 --- a/include/asm-arm26/irq.h +++ b/include/asm-arm26/irq.h @@ -44,9 +44,5 @@ extern void enable_irq(unsigned int); int set_irq_type(unsigned int irq, unsigned int type); -int setup_irq(unsigned int, struct irqaction *); -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h index 73065f5bda0e..42a3ac424a9e 100644 --- a/include/asm-h8300/irq.h +++ b/include/asm-h8300/irq.h @@ -63,8 +63,4 @@ extern void enable_irq(unsigned int); extern void disable_irq(unsigned int); #define disable_irq_nosync(x) disable_irq(x) -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* _H8300_IRQ_H_ */ diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h index b4f48b2a6a57..9727ca9d9f26 100644 --- a/include/asm-m68k/irq.h +++ b/include/asm-m68k/irq.h @@ -130,8 +130,4 @@ extern volatile unsigned int num_spurious; */ extern irq_node_t *new_irq_node(void); -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* _M68K_IRQ_H_ */ diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h index 2b408842a30e..c5247516fcfe 100644 --- a/include/asm-m68knommu/irq.h +++ b/include/asm-m68knommu/irq.h @@ -87,8 +87,4 @@ extern void (*mach_disable_irq)(unsigned int); #define disable_irq(x) do { } while (0) #define disable_irq_nosync(x) disable_irq(x) -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* _M68K_IRQ_H_ */ diff --git a/include/asm-s390/irq.h b/include/asm-s390/irq.h index 916a1aa0b073..bd1a721f7aa2 100644 --- a/include/asm-s390/irq.h +++ b/include/asm-s390/irq.h @@ -21,10 +21,6 @@ enum interruption_class { #define touch_nmi_watchdog() do { } while(0) -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* __KERNEL__ */ #endif diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h index f2d64537e29d..3141ddfea97d 100644 --- a/include/asm-sparc/irq.h +++ b/include/asm-sparc/irq.h @@ -181,8 +181,4 @@ extern struct sun4m_intregs *sun4m_interrupts; #define SUN4M_INT_SBUS(x) (1 << (x+7)) #define SUN4M_INT_VME(x) (1 << (x)) -struct irqaction; -struct pt_regs; -int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif diff --git a/include/asm-v850/irq.h b/include/asm-v850/irq.h index 44431152b36d..1bf096db8f4c 100644 --- a/include/asm-v850/irq.h +++ b/include/asm-v850/irq.h @@ -62,8 +62,6 @@ extern void disable_irq (unsigned int irq); /* Disable an irq without waiting. */ extern void disable_irq_nosync (unsigned int irq); -extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *); - #endif /* !__ASSEMBLY__ */ #endif /* __V850_IRQ_H__ */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9e0fefd7884a..70741e170114 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -7,32 +7,13 @@ #include #include #include +#include #include #include #include #include #include -/* - * For 2.4.x compatibility, 2.4.x can use - * - * typedef void irqreturn_t; - * #define IRQ_NONE - * #define IRQ_HANDLED - * #define IRQ_RETVAL(x) - * - * To mix old-style and new-style irq handler returns. - * - * IRQ_NONE means we didn't handle it. - * IRQ_HANDLED means that we did have a valid interrupt and handled it. - * IRQ_RETVAL(x) selects on the two depending on x being non-zero (for handled) - */ -typedef int irqreturn_t; - -#define IRQ_NONE (0) -#define IRQ_HANDLED (1) -#define IRQ_RETVAL(x) ((x) != 0) - struct irqaction { irqreturn_t (*handler)(int, void *, struct pt_regs *); unsigned long flags; diff --git a/include/linux/irq.h b/include/linux/irq.h index e8a07e75e4fb..676e00dfb21a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -175,7 +176,7 @@ static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) extern int no_irq_affinity; extern int noirqdebug_setup(char *str); -extern fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, +extern fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action); extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs); extern void note_interrupt(unsigned int irq, irq_desc_t *desc, diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h new file mode 100644 index 000000000000..881883c2009d --- /dev/null +++ b/include/linux/irqreturn.h @@ -0,0 +1,25 @@ +/* irqreturn.h */ +#ifndef _LINUX_IRQRETURN_H +#define _LINUX_IRQRETURN_H + +/* + * For 2.4.x compatibility, 2.4.x can use + * + * typedef void irqreturn_t; + * #define IRQ_NONE + * #define IRQ_HANDLED + * #define IRQ_RETVAL(x) + * + * To mix old-style and new-style irq handler returns. + * + * IRQ_NONE means we didn't handle it. + * IRQ_HANDLED means that we did have a valid interrupt and handled it. + * IRQ_RETVAL(x) selects on the two depending on x being non-zero (for handled) + */ +typedef int irqreturn_t; + +#define IRQ_NONE (0) +#define IRQ_HANDLED (1) +#define IRQ_RETVAL(x) ((x) != 0) + +#endif diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37db..0f6530117105 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /* * Have got an event to handle: */ -fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, +fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action) { - int ret, retval = 0, status = 0; + irqreturn_t ret, retval = IRQ_NONE; + unsigned int status = 0; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); -- cgit v1.2.3 From 125e18745f16685f69a34fd6130d47598fc4bf54 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Fri, 23 Jun 2006 02:06:06 -0700 Subject: [PATCH] More BUG_ON conversion Signed-off-by: Eric Sesterhenn Signed-off-by: Alexey Dobriyan Cc: Bartlomiej Zolnierkiewicz Cc: Alan Cox Cc: James Bottomley Acked-by: "Salyzyn, Mark" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/kernel/setup.c | 2 +- drivers/ide/ide-cd.c | 3 +-- drivers/ide/ide-dma.c | 6 ++---- drivers/ide/ide-floppy.c | 12 ++++-------- drivers/ide/ide-iops.c | 12 ++++-------- drivers/ide/ide-lib.c | 3 +-- drivers/ide/ide-taskfile.c | 3 +-- drivers/ide/ide.c | 3 +-- drivers/ide/pci/trm290.c | 3 +-- drivers/scsi/53c700.h | 6 ++---- drivers/scsi/aacraid/aachba.c | 15 +++++---------- drivers/scsi/aacraid/commsup.c | 6 ++---- drivers/scsi/arm/queue.c | 6 ++---- drivers/scsi/ibmmca.c | 3 +-- drivers/scsi/ide-scsi.c | 6 ++---- drivers/scsi/megaraid.c | 2 +- kernel/exit.c | 5 ++--- mm/pdflush.c | 3 +-- 18 files changed, 34 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c index 5908deae9607..1f7d65f29e78 100644 --- a/arch/frv/kernel/setup.c +++ b/arch/frv/kernel/setup.c @@ -814,7 +814,7 @@ void __init setup_arch(char **cmdline_p) * - by now the stack is part of the init task */ printk("Memory %08lx-%08lx\n", memory_start, memory_end); - if (memory_start == memory_end) BUG(); + BUG_ON(memory_start == memory_end); init_mm.start_code = (unsigned long) &_stext; init_mm.end_code = (unsigned long) &_etext; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index abb83d95df57..6de3cd3d6e8e 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1725,8 +1725,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) } } - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); ide_set_handler(drive, cdrom_newpc_intr, rq->timeout, NULL); return ide_started; diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index c481be8b807f..783a2475ee8b 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -206,8 +206,7 @@ int ide_build_sglist(ide_drive_t *drive, struct request *rq) ide_hwif_t *hwif = HWIF(drive); struct scatterlist *sg = hwif->sg_table; - if ((rq->flags & REQ_DRIVE_TASKFILE) && rq->nr_sectors > 256) - BUG(); + BUG_ON((rq->flags & REQ_DRIVE_TASKFILE) && rq->nr_sectors > 256); ide_map_sg(drive, rq); @@ -947,8 +946,7 @@ void ide_setup_dma (ide_hwif_t *hwif, unsigned long dma_base, unsigned int num_p } printk("\n"); - if (!(hwif->dma_master)) - BUG(); + BUG_ON(!hwif->dma_master); } EXPORT_SYMBOL_GPL(ide_setup_dma); diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index a53e3ce4a142..a1179e924962 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -898,8 +898,7 @@ static ide_startstop_t idefloppy_pc_intr (ide_drive_t *drive) "to send us more data than expected " "- discarding data\n"); idefloppy_discard_data(drive,bcount.all); - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); ide_set_handler(drive, &idefloppy_pc_intr, IDEFLOPPY_WAIT_CMD, @@ -932,8 +931,7 @@ static ide_startstop_t idefloppy_pc_intr (ide_drive_t *drive) pc->actually_transferred += bcount.all; pc->current_position += bcount.all; - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); ide_set_handler(drive, &idefloppy_pc_intr, IDEFLOPPY_WAIT_CMD, NULL); /* And set the interrupt handler again */ return ide_started; } @@ -960,8 +958,7 @@ static ide_startstop_t idefloppy_transfer_pc (ide_drive_t *drive) "issuing a packet command\n"); return ide_do_reset(drive); } - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); /* Set the interrupt routine */ ide_set_handler(drive, &idefloppy_pc_intr, IDEFLOPPY_WAIT_CMD, NULL); /* Send the actual packet */ @@ -1017,8 +1014,7 @@ static ide_startstop_t idefloppy_transfer_pc1 (ide_drive_t *drive) * 40 and 50msec work well. idefloppy_pc_intr will not be actually * used until after the packet is moved in about 50 msec. */ - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); ide_set_handler(drive, &idefloppy_pc_intr, /* service routine for packet command */ floppy->ticks, /* wait this long before "failing" */ diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index b72dde70840a..97a49e77a8f1 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -939,8 +939,7 @@ void ide_execute_command(ide_drive_t *drive, task_ioreg_t cmd, ide_handler_t *ha spin_lock_irqsave(&ide_lock, flags); - if(hwgroup->handler) - BUG(); + BUG_ON(hwgroup->handler); hwgroup->handler = handler; hwgroup->expiry = expiry; hwgroup->timer.expires = jiffies + timeout; @@ -981,8 +980,7 @@ static ide_startstop_t atapi_reset_pollfunc (ide_drive_t *drive) printk("%s: ATAPI reset complete\n", drive->name); } else { if (time_before(jiffies, hwgroup->poll_timeout)) { - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); ide_set_handler(drive, &atapi_reset_pollfunc, HZ/20, NULL); /* continue polling */ return ide_started; @@ -1021,8 +1019,7 @@ static ide_startstop_t reset_pollfunc (ide_drive_t *drive) if (!OK_STAT(tmp = hwif->INB(IDE_STATUS_REG), 0, BUSY_STAT)) { if (time_before(jiffies, hwgroup->poll_timeout)) { - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); ide_set_handler(drive, &reset_pollfunc, HZ/20, NULL); /* continue polling */ return ide_started; @@ -1138,8 +1135,7 @@ static ide_startstop_t do_reset1 (ide_drive_t *drive, int do_not_try_atapi) hwgroup = HWGROUP(drive); /* We must not reset with running handlers */ - if(hwgroup->handler != NULL) - BUG(); + BUG_ON(hwgroup->handler != NULL); /* For an ATAPI device, first try an ATAPI SRST. */ if (drive->media != ide_disk && !do_not_try_atapi) { diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index 41d46dbe6c24..16a143133f93 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -164,8 +164,7 @@ u8 ide_rate_filter (u8 mode, u8 speed) // printk("%s: mode 0x%02x, speed 0x%02x\n", __FUNCTION__, mode, speed); /* So that we remember to update this if new modes appear */ - if (mode > 4) - BUG(); + BUG_ON(mode > 4); return min(speed, speed_max[mode]); #else /* !CONFIG_BLK_DEV_IDEDMA */ return min(speed, (u8)XFER_PIO_4); diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 9233b8109a0f..a839b2a8f6f4 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -196,8 +196,7 @@ ide_startstop_t set_geometry_intr (ide_drive_t *drive) if (stat & (ERR_STAT|DRQ_STAT)) return ide_error(drive, "set_geometry_intr", stat); - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); ide_set_handler(drive, &set_geometry_intr, WAIT_WORSTCASE, NULL); return ide_started; } diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index f5a60912c271..9799aed772e1 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -1367,8 +1367,7 @@ int generic_ide_ioctl(ide_drive_t *drive, struct file *file, struct block_device ide_abort(drive, "drive reset"); - if(HWGROUP(drive)->handler) - BUG(); + BUG_ON(HWGROUP(drive)->handler); /* Ensure nothing gets queued after we drop the lock. Reset will clear the busy */ diff --git a/drivers/ide/pci/trm290.c b/drivers/ide/pci/trm290.c index c26c8ca90dd4..fe80295974e1 100644 --- a/drivers/ide/pci/trm290.c +++ b/drivers/ide/pci/trm290.c @@ -183,8 +183,7 @@ static void trm290_ide_dma_exec_cmd(ide_drive_t *drive, u8 command) { ide_hwif_t *hwif = HWIF(drive); - if (HWGROUP(drive)->handler != NULL) /* paranoia check */ - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); /* paranoia check */ ide_set_handler(drive, &ide_dma_intr, WAIT_CMD, NULL); /* issue cmd to drive */ hwif->OUTB(command, IDE_COMMAND_REG); diff --git a/drivers/scsi/53c700.h b/drivers/scsi/53c700.h index d8041952b1b5..7f22a06fe5ec 100644 --- a/drivers/scsi/53c700.h +++ b/drivers/scsi/53c700.h @@ -474,8 +474,7 @@ NCR_700_readl(struct Scsi_Host *host, __u32 reg) ioread32(hostdata->base + reg); #if 1 /* sanity check the register */ - if((reg & 0x3) != 0) - BUG(); + BUG_ON((reg & 0x3) != 0); #endif return value; @@ -498,8 +497,7 @@ NCR_700_writel(__u32 value, struct Scsi_Host *host, __u32 reg) #if 1 /* sanity check the register */ - if((reg & 0x3) != 0) - BUG(); + BUG_ON((reg & 0x3) != 0); #endif bEBus ? iowrite32be(value, hostdata->base + reg): diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index ea9e038813eb..83b5c7d085f2 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -396,8 +396,7 @@ static void get_container_name_callback(void *context, struct fib * fibptr) scsicmd->SCp.phase = AAC_OWNER_MIDLEVEL; dprintk((KERN_DEBUG "get_container_name_callback[cpu %d]: t = %ld.\n", smp_processor_id(), jiffies)); - if (fibptr == NULL) - BUG(); + BUG_ON(fibptr == NULL); get_name_reply = (struct aac_get_name_resp *) fib_data(fibptr); /* Failure is irrelevant, using default value instead */ @@ -956,8 +955,7 @@ static void io_callback(void *context, struct fib * fibptr) smp_processor_id(), (unsigned long long)lba, jiffies); } - if (fibptr == NULL) - BUG(); + BUG_ON(fibptr == NULL); if(scsicmd->use_sg) pci_unmap_sg(dev->pdev, @@ -1092,8 +1090,7 @@ static int aac_read(struct scsi_cmnd * scsicmd, int cid) aac_build_sgraw(scsicmd, &readcmd->sg); fibsize = sizeof(struct aac_raw_io) + ((le32_to_cpu(readcmd->sg.count) - 1) * sizeof (struct sgentryraw)); - if (fibsize > (dev->max_fib_size - sizeof(struct aac_fibhdr))) - BUG(); + BUG_ON(fibsize > (dev->max_fib_size - sizeof(struct aac_fibhdr))); /* * Now send the Fib to the adapter */ @@ -1261,8 +1258,7 @@ static int aac_write(struct scsi_cmnd * scsicmd, int cid) aac_build_sgraw(scsicmd, &writecmd->sg); fibsize = sizeof(struct aac_raw_io) + ((le32_to_cpu(writecmd->sg.count) - 1) * sizeof (struct sgentryraw)); - if (fibsize > (dev->max_fib_size - sizeof(struct aac_fibhdr))) - BUG(); + BUG_ON(fibsize > (dev->max_fib_size - sizeof(struct aac_fibhdr))); /* * Now send the Fib to the adapter */ @@ -1904,8 +1900,7 @@ static void aac_srb_callback(void *context, struct fib * fibptr) scsicmd->SCp.phase = AAC_OWNER_MIDLEVEL; dev = (struct aac_dev *)scsicmd->device->host->hostdata; - if (fibptr == NULL) - BUG(); + BUG_ON(fibptr == NULL); srbreply = (struct aac_srb_reply *) fib_data(fibptr); diff --git a/drivers/scsi/aacraid/commsup.c b/drivers/scsi/aacraid/commsup.c index d2ef17ea44fa..3f27419c66af 100644 --- a/drivers/scsi/aacraid/commsup.c +++ b/drivers/scsi/aacraid/commsup.c @@ -229,8 +229,7 @@ void aac_fib_init(struct fib *fibptr) static void fib_dealloc(struct fib * fibptr) { struct hw_fib *hw_fib = fibptr->hw_fib; - if(hw_fib->header.StructType != FIB_MAGIC) - BUG(); + BUG_ON(hw_fib->header.StructType != FIB_MAGIC); hw_fib->header.XferState = 0; } @@ -530,8 +529,7 @@ int aac_fib_send(u16 command, struct fib *fibptr, unsigned long size, } } else down(&fibptr->event_wait); - if(fibptr->done == 0) - BUG(); + BUG_ON(fibptr->done == 0); if((fibptr->flags & FIB_CONTEXT_FLAG_TIMED_OUT)){ return -ETIMEDOUT; diff --git a/drivers/scsi/arm/queue.c b/drivers/scsi/arm/queue.c index b10750bb5c09..8caa5903ce38 100644 --- a/drivers/scsi/arm/queue.c +++ b/drivers/scsi/arm/queue.c @@ -118,8 +118,7 @@ int __queue_add(Queue_t *queue, Scsi_Cmnd *SCpnt, int head) list_del(l); q = list_entry(l, QE_t, list); - if (BAD_MAGIC(q, QUEUE_MAGIC_FREE)) - BUG(); + BUG_ON(BAD_MAGIC(q, QUEUE_MAGIC_FREE)); SET_MAGIC(q, QUEUE_MAGIC_USED); q->SCpnt = SCpnt; @@ -144,8 +143,7 @@ static Scsi_Cmnd *__queue_remove(Queue_t *queue, struct list_head *ent) */ list_del(ent); q = list_entry(ent, QE_t, list); - if (BAD_MAGIC(q, QUEUE_MAGIC_USED)) - BUG(); + BUG_ON(BAD_MAGIC(q, QUEUE_MAGIC_USED)); SET_MAGIC(q, QUEUE_MAGIC_FREE); list_add(ent, &queue->free); diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c index 0c6dc31bb14d..115f55471ed3 100644 --- a/drivers/scsi/ibmmca.c +++ b/drivers/scsi/ibmmca.c @@ -2241,8 +2241,7 @@ static int __ibmmca_host_reset(Scsi_Cmnd * cmd) int host_index; unsigned long imm_command; - if (cmd == NULL) - BUG(); + BUG_ON(cmd == NULL); ticks = IM_RESET_DELAY * HZ; shpnt = cmd->device->host; diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index 39b760a24241..988e6f7af01a 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -600,8 +600,7 @@ static ide_startstop_t idescsi_transfer_pc(ide_drive_t *drive) "issuing a packet command\n"); return ide_do_reset (drive); } - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); /* Set the interrupt routine */ ide_set_handler(drive, &idescsi_pc_intr, get_timeout(pc), idescsi_expiry); /* Send the actual packet */ @@ -691,8 +690,7 @@ static ide_startstop_t idescsi_issue_pc (ide_drive_t *drive, idescsi_pc_t *pc) set_bit(PC_DMA_OK, &pc->flags); if (test_bit(IDESCSI_DRQ_INTERRUPT, &scsi->flags)) { - if (HWGROUP(drive)->handler != NULL) - BUG(); + BUG_ON(HWGROUP(drive)->handler != NULL); ide_set_handler(drive, &idescsi_transfer_pc, get_timeout(pc), idescsi_expiry); /* Issue the packet command */ diff --git a/drivers/scsi/megaraid.c b/drivers/scsi/megaraid.c index c33857e7b33b..5d2cefb5e52d 100644 --- a/drivers/scsi/megaraid.c +++ b/drivers/scsi/megaraid.c @@ -1828,7 +1828,7 @@ mega_build_sglist(adapter_t *adapter, scb_t *scb, u32 *buf, u32 *len) scb->dma_type = MEGA_SGLIST; - if( sgcnt > adapter->sglen ) BUG(); + BUG_ON(sgcnt > adapter->sglen); *len = 0; diff --git a/kernel/exit.c b/kernel/exit.c index e06d0c10a24e..a3baf92462bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -579,7 +579,7 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - if (mm != tsk->active_mm) BUG(); + BUG_ON(mm != tsk->active_mm); /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; @@ -1530,8 +1530,7 @@ check_continued: if (options & __WNOTHREAD) break; tsk = next_thread(tsk); - if (tsk->signal != current->signal) - BUG(); + BUG_ON(tsk->signal != current->signal); } while (tsk != current); read_unlock(&tasklist_lock); diff --git a/mm/pdflush.c b/mm/pdflush.c index c4b6d0afd736..df7e50b8f70c 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -202,8 +202,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) unsigned long flags; int ret = 0; - if (fn == NULL) - BUG(); /* Hard to diagnose if it's deferred */ + BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ spin_lock_irqsave(&pdflush_lock, flags); if (list_empty(&pdflush_list)) { -- cgit v1.2.3 From eb71c87a492b7090ff9e8ac46912c480a1687e38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 24 Jun 2006 14:27:42 -0700 Subject: Add some basic resume trace facilities Considering that there isn't a lot of hw we can depend on during resume, this is about as good as it gets. This is x86-only for now, although the basic concept (and most of the code) will certainly work on almost any platform. Signed-off-by: Linus Torvalds --- arch/i386/kernel/vmlinux.lds.S | 7 ++ drivers/base/power/Makefile | 1 + drivers/base/power/trace.c | 228 +++++++++++++++++++++++++++++++++++++++++ include/asm-generic/rtc.h | 7 +- include/linux/resume-trace.h | 30 ++++++ kernel/power/Kconfig | 9 ++ 6 files changed, 279 insertions(+), 3 deletions(-) create mode 100644 drivers/base/power/trace.c create mode 100644 include/linux/resume-trace.h (limited to 'kernel') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 8831303a473f..7512f39c9f25 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -37,6 +37,13 @@ SECTIONS RODATA + . = ALIGN(4); + __tracedata_start = .; + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { + *(.tracedata) + } + __tracedata_end = .; + /* writeable */ .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) diff --git a/drivers/base/power/Makefile b/drivers/base/power/Makefile index ceeeba2c56c7..91f230939c1e 100644 --- a/drivers/base/power/Makefile +++ b/drivers/base/power/Makefile @@ -1,5 +1,6 @@ obj-y := shutdown.o obj-$(CONFIG_PM) += main.o suspend.o resume.o runtime.o sysfs.o +obj-$(CONFIG_PM_TRACE) += trace.o ifeq ($(CONFIG_DEBUG_DRIVER),y) EXTRA_CFLAGS += -DDEBUG diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c new file mode 100644 index 000000000000..a9ab30fefffc --- /dev/null +++ b/drivers/base/power/trace.c @@ -0,0 +1,228 @@ +/* + * drivers/base/power/trace.c + * + * Copyright (C) 2006 Linus Torvalds + * + * Trace facility for suspend/resume problems, when none of the + * devices may be working. + */ + +#include +#include + +#include + +#include "power.h" + +/* + * Horrid, horrid, horrid. + * + * It turns out that the _only_ piece of hardware that actually + * keeps its value across a hard boot (and, more importantly, the + * POST init sequence) is literally the realtime clock. + * + * Never mind that an RTC chip has 114 bytes (and often a whole + * other bank of an additional 128 bytes) of nice SRAM that is + * _designed_ to keep data - the POST will clear it. So we literally + * can just use the few bytes of actual time data, which means that + * we're really limited. + * + * It means, for example, that we can't use the seconds at all + * (since the time between the hang and the boot might be more + * than a minute), and we'd better not depend on the low bits of + * the minutes either. + * + * There are the wday fields etc, but I wouldn't guarantee those + * are dependable either. And if the date isn't valid, either the + * hw or POST will do strange things. + * + * So we're left with: + * - year: 0-99 + * - month: 0-11 + * - day-of-month: 1-28 + * - hour: 0-23 + * - min: (0-30)*2 + * + * Giving us a total range of 0-16128000 (0xf61800), ie less + * than 24 bits of actual data we can save across reboots. + * + * And if your box can't boot in less than three minutes, + * you're screwed. + * + * Now, almost 24 bits of data is pitifully small, so we need + * to be pretty dense if we want to use it for anything nice. + * What we do is that instead of saving off nice readable info, + * we save off _hashes_ of information that we can hopefully + * regenerate after the reboot. + * + * In particular, this means that we might be unlucky, and hit + * a case where we have a hash collision, and we end up not + * being able to tell for certain exactly which case happened. + * But that's hopefully unlikely. + * + * What we do is to take the bits we can fit, and split them + * into three parts (16*997*1009 = 16095568), and use the values + * for: + * - 0-15: user-settable + * - 0-996: file + line number + * - 0-1008: device + */ +#define USERHASH (16) +#define FILEHASH (997) +#define DEVHASH (1009) + +#define DEVSEED (7919) + +static unsigned int dev_hash_value; + +static int set_magic_time(unsigned int user, unsigned int file, unsigned int device) +{ + unsigned int n = user + USERHASH*(file + FILEHASH*device); + + // June 7th, 2006 + static struct rtc_time time = { + .tm_sec = 0, + .tm_min = 0, + .tm_hour = 0, + .tm_mday = 7, + .tm_mon = 5, // June - counting from zero + .tm_year = 106, + .tm_wday = 3, + .tm_yday = 160, + .tm_isdst = 1 + }; + + time.tm_year = (n % 100); + n /= 100; + time.tm_mon = (n % 12); + n /= 12; + time.tm_mday = (n % 28) + 1; + n /= 28; + time.tm_hour = (n % 24); + n /= 24; + time.tm_min = (n % 20) * 3; + n /= 20; + set_rtc_time(&time); + return n ? -1 : 0; +} + +static unsigned int read_magic_time(void) +{ + struct rtc_time time; + unsigned int val; + + get_rtc_time(&time); + printk("Time: %2d:%02d:%02d Date: %02d/%02d/%02d\n", + time.tm_hour, time.tm_min, time.tm_sec, + time.tm_mon, time.tm_mday, time.tm_year); + val = time.tm_year; /* 100 years */ + if (val > 100) + val -= 100; + val += time.tm_mon * 100; /* 12 months */ + val += (time.tm_mday-1) * 100 * 12; /* 28 month-days */ + val += time.tm_hour * 100 * 12 * 28; /* 24 hours */ + val += (time.tm_min / 3) * 100 * 12 * 28 * 24; /* 20 3-minute intervals */ + return val; +} + +/* + * This is just the sdbm hash function with a user-supplied + * seed and final size parameter. + */ +static unsigned int hash_string(unsigned int seed, const char *data, unsigned int mod) +{ + unsigned char c; + while ((c = *data++) != 0) { + seed = (seed << 16) + (seed << 6) - seed + c; + } + return seed % mod; +} + +void set_trace_device(struct device *dev) +{ + dev_hash_value = hash_string(DEVSEED, dev->bus_id, DEVHASH); +} + +/* + * We could just take the "tracedata" index into the .tracedata + * section instead. Generating a hash of the data gives us a + * chance to work across kernel versions, and perhaps more + * importantly it also gives us valid/invalid check (ie we will + * likely not give totally bogus reports - if the hash matches, + * it's not any guarantee, but it's a high _likelihood_ that + * the match is valid). + */ +void generate_resume_trace(void *tracedata, unsigned int user) +{ + unsigned short lineno = *(unsigned short *)tracedata; + const char *file = *(const char **)(tracedata + 2); + unsigned int user_hash_value, file_hash_value; + + user_hash_value = user % USERHASH; + file_hash_value = hash_string(lineno, file, FILEHASH); + set_magic_time(user_hash_value, file_hash_value, dev_hash_value); +} + +extern char __tracedata_start, __tracedata_end; +static int show_file_hash(unsigned int value) +{ + int match; + char *tracedata; + + match = 0; + for (tracedata = &__tracedata_start ; tracedata < &__tracedata_end ; tracedata += 6) { + unsigned short lineno = *(unsigned short *)tracedata; + const char *file = *(const char **)(tracedata + 2); + unsigned int hash = hash_string(lineno, file, FILEHASH); + if (hash != value) + continue; + printk(" hash matches %s:%u\n", file, lineno); + match++; + } + return match; +} + +static int show_dev_hash(unsigned int value) +{ + int match = 0; + struct list_head * entry = dpm_active.prev; + + while (entry != &dpm_active) { + struct device * dev = to_device(entry); + unsigned int hash = hash_string(DEVSEED, dev->bus_id, DEVHASH); + if (hash == value) { + printk(" hash matches device %s\n", dev->bus_id); + match++; + } + entry = entry->prev; + } + return match; +} + +static unsigned int hash_value_early_read; + +static int early_resume_init(void) +{ + hash_value_early_read = read_magic_time(); + return 0; +} + +static int late_resume_init(void) +{ + unsigned int val = hash_value_early_read; + unsigned int user, file, dev; + + user = val % USERHASH; + val = val / USERHASH; + file = val % FILEHASH; + val = val / FILEHASH; + dev = val /* % DEVHASH */; + + printk(" Magic number: %d:%d:%d\n", user, file, dev); + show_file_hash(file); + show_dev_hash(dev); + return 0; +} + +core_initcall(early_resume_init); +late_initcall(late_resume_init); diff --git a/include/asm-generic/rtc.h b/include/asm-generic/rtc.h index cef08db34ada..4087037a4225 100644 --- a/include/asm-generic/rtc.h +++ b/include/asm-generic/rtc.h @@ -114,6 +114,7 @@ static inline unsigned int get_rtc_time(struct rtc_time *time) /* Set the current date and time in the real time clock. */ static inline int set_rtc_time(struct rtc_time *time) { + unsigned long flags; unsigned char mon, day, hrs, min, sec; unsigned char save_control, save_freq_select; unsigned int yrs; @@ -131,7 +132,7 @@ static inline int set_rtc_time(struct rtc_time *time) if (yrs > 255) /* They are unsigned */ return -EINVAL; - spin_lock_irq(&rtc_lock); + spin_lock_irqsave(&rtc_lock, flags); #ifdef CONFIG_MACH_DECSTATION real_yrs = yrs; leap_yr = ((!((yrs + 1900) % 4) && ((yrs + 1900) % 100)) || @@ -152,7 +153,7 @@ static inline int set_rtc_time(struct rtc_time *time) * whether the chip is in binary mode or not. */ if (yrs > 169) { - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return -EINVAL; } @@ -187,7 +188,7 @@ static inline int set_rtc_time(struct rtc_time *time) CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return 0; } diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h new file mode 100644 index 000000000000..a376bd4ade39 --- /dev/null +++ b/include/linux/resume-trace.h @@ -0,0 +1,30 @@ +#ifndef RESUME_TRACE_H +#define RESUME_TRACE_H + +#ifdef CONFIG_PM_TRACE + +struct device; +extern void set_trace_device(struct device *); +extern void generate_resume_trace(void *tracedata, unsigned int user); + +#define TRACE_DEVICE(dev) set_trace_device(dev) +#define TRACE_RESUME(user) do { \ + void *tracedata; \ + asm volatile("movl $1f,%0\n" \ + ".section .tracedata,\"a\"\n" \ + "1:\t.word %c1\n" \ + "\t.long %c2\n" \ + ".previous" \ + :"=r" (tracedata) \ + : "i" (__LINE__), "i" (__FILE__)); \ + generate_resume_trace(tracedata, user); \ +} while (0) + +#else + +#define TRACE_DEVICE(dev) do { } while (0) +#define TRACE_RESUME(dev) do { } while (0) + +#endif + +#endif diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ce0dfb8f4a4e..cdf315e794ff 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -36,6 +36,15 @@ config PM_DEBUG code. This is helpful when debugging and reporting various PM bugs, like suspend support. +config PM_TRACE + bool "Suspend/resume event tracing" + depends on PM && PM_DEBUG && X86 + default y + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + config SOFTWARE_SUSPEND bool "Software Suspend" depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) -- cgit v1.2.3 From f867d2a2e5f3f0ce6356f452cc27b70d577de7c7 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sun, 25 Jun 2006 05:47:09 -0700 Subject: [PATCH] ensure NULL deref can't possibly happen in is_exported() If CONFIG_KALLSYMS is defined and if it should happen that is_exported() is given a NULL 'mod' and lookup_symbol(name, __start___ksymtab, __stop___ksymtab) returns 0, then we'll end up dereferencing a NULL pointer. Signed-off-by: Jesper Juhl Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index bbe04862e1b0..d75275de1c28 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1326,7 +1326,7 @@ int is_exported(const char *name, const struct module *mod) if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; else - if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) + if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) return 1; else return 0; -- cgit v1.2.3 From bfe5d834195b3089b8846577311340376cc0f450 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sun, 25 Jun 2006 05:47:14 -0700 Subject: [PATCH] Define __raw_get_cpu_var and use it There are several instances of per_cpu(foo, raw_smp_processor_id()), which is semantically equivalent to __get_cpu_var(foo) but without the warning that smp_processor_id() can give if CONFIG_DEBUG_PREEMPT is enabled. For those architectures with optimized per-cpu implementations, namely ia64, powerpc, s390, sparc64 and x86_64, per_cpu() turns into more and slower code than __get_cpu_var(), so it would be preferable to use __get_cpu_var on those platforms. This defines a __raw_get_cpu_var(x) macro which turns into per_cpu(x, raw_smp_processor_id()) on architectures that use the generic per-cpu implementation, and turns into __get_cpu_var(x) on the architectures that have an optimized per-cpu implementation. Signed-off-by: Paul Mackerras Acked-by: David S. Miller Acked-by: Ingo Molnar Acked-by: Martin Schwidefsky Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/percpu.h | 2 ++ include/asm-ia64/percpu.h | 2 ++ include/asm-powerpc/percpu.h | 2 ++ include/asm-s390/percpu.h | 2 ++ include/asm-sparc64/percpu.h | 2 ++ include/asm-x86_64/percpu.h | 2 ++ kernel/hrtimer.c | 4 ++-- kernel/sched.c | 4 ++-- kernel/softlockup.c | 2 +- kernel/timer.c | 2 +- net/ipv4/route.c | 2 +- 11 files changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h index c0caf433a7d7..c74521157461 100644 --- a/include/asm-generic/percpu.h +++ b/include/asm-generic/percpu.h @@ -14,6 +14,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) per_cpu(var, smp_processor_id()) +#define __raw_get_cpu_var(var) per_cpu(var, raw_smp_processor_id()) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -30,6 +31,7 @@ do { \ #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-ia64/percpu.h b/include/asm-ia64/percpu.h index ae357d504fba..24d898b650c5 100644 --- a/include/asm-ia64/percpu.h +++ b/include/asm-ia64/percpu.h @@ -42,6 +42,7 @@ DECLARE_PER_CPU(unsigned long, local_per_cpu_offset); #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu])) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __ia64_per_cpu_var(local_per_cpu_offset))) extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size); extern void setup_per_cpu_areas (void); @@ -51,6 +52,7 @@ extern void *per_cpu_init(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #define per_cpu_init() (__phys_per_cpu_start) #endif /* SMP */ diff --git a/include/asm-powerpc/percpu.h b/include/asm-powerpc/percpu.h index 184a7a4d2fdf..faa1fc703053 100644 --- a/include/asm-powerpc/percpu.h +++ b/include/asm-powerpc/percpu.h @@ -22,6 +22,7 @@ /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -41,6 +42,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-s390/percpu.h b/include/asm-s390/percpu.h index 436d216601e5..d9a8cca9b653 100644 --- a/include/asm-s390/percpu.h +++ b/include/asm-s390/percpu.h @@ -40,6 +40,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS]; __typeof__(type) per_cpu__##name #define __get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) +#define __raw_get_cpu_var(var) __reloc_hide(var,S390_lowcore.percpu_offset) #define per_cpu(var,cpu) __reloc_hide(var,__per_cpu_offset[cpu]) /* A macro to avoid #include hell... */ @@ -57,6 +58,7 @@ do { \ __typeof__(type) per_cpu__##name #define __get_cpu_var(var) __reloc_hide(var,0) +#define __raw_get_cpu_var(var) __reloc_hide(var,0) #define per_cpu(var,cpu) __reloc_hide(var,0) #endif /* SMP */ diff --git a/include/asm-sparc64/percpu.h b/include/asm-sparc64/percpu.h index baef13b58952..a6ece06b83db 100644 --- a/include/asm-sparc64/percpu.h +++ b/include/asm-sparc64/percpu.h @@ -21,6 +21,7 @@ register unsigned long __local_per_cpu_offset asm("g5"); /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __local_per_cpu_offset)) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -37,6 +38,7 @@ do { \ #define per_cpu(var, cpu) (*((void)cpu, &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/include/asm-x86_64/percpu.h b/include/asm-x86_64/percpu.h index 7f33aaf9f7b1..549eb929b2c0 100644 --- a/include/asm-x86_64/percpu.h +++ b/include/asm-x86_64/percpu.h @@ -21,6 +21,7 @@ /* var is in discarded region: offset to particular copy we want */ #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu))) #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) +#define __raw_get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset())) /* A macro to avoid #include hell... */ #define percpu_modcopy(pcpudst, src, size) \ @@ -40,6 +41,7 @@ extern void setup_per_cpu_areas(void); #define per_cpu(var, cpu) (*((void)(cpu), &per_cpu__##var)) #define __get_cpu_var(var) per_cpu__##var +#define __raw_get_cpu_var(var) per_cpu__##var #endif /* SMP */ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 18324305724a..9587aac72f4d 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -576,7 +576,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) clock_id = CLOCK_MONOTONIC; @@ -599,7 +599,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_base *bases; - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); *tp = ktime_to_timespec(bases[which_clock].resolution); return 0; diff --git a/kernel/sched.c b/kernel/sched.c index 5dbc42694477..f8d540b324ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4152,7 +4152,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4163,7 +4163,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf02909..2c1be1163edc 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -36,7 +36,7 @@ static struct notifier_block panic_block = { void touch_softlockup_watchdog(void) { - per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; + __raw_get_cpu_var(touch_timestamp) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); diff --git a/kernel/timer.c b/kernel/timer.c index f35b3939e937..eb97371b87d8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); + timer->base = __raw_get_cpu_var(tvec_bases); } EXPORT_SYMBOL(init_timer); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cc9423de7311..60b11aece5c3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -244,7 +244,7 @@ static unsigned int rt_hash_rnd; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ - (per_cpu(rt_cache_stat, raw_smp_processor_id()).field++) + (__raw_get_cpu_var(rt_cache_stat).field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res); -- cgit v1.2.3 From bbb1747d4e44ce49acc73daa8d66e5f6bd546f1b Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Sun, 25 Jun 2006 05:47:15 -0700 Subject: [PATCH] Allow raw_notifier callouts to unregister themselves Since raw_notifier chains don't benefit from any centralized locking protections, they shouldn't suffer from the associated limitations. Under some circumstances it might make sense for a raw_notifier callout routine to unregister itself from the notifier chain. This patch (as678) changes the notifier core to allow for such things. Signed-off-by: Alan Stern Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 90930b28d2ca..7e0927bad713 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -137,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v) { int ret = NOTIFY_DONE; - struct notifier_block *nb; + struct notifier_block *nb, *next_nb; nb = rcu_dereference(*nl); while (nb) { + next_nb = rcu_dereference(nb->next); ret = nb->notifier_call(nb, val, v); if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) break; - nb = rcu_dereference(nb->next); + nb = next_nb; } return ret; } -- cgit v1.2.3 From 76a8ad293912cd2f01eca075d80cd0ddec30c627 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sun, 25 Jun 2006 05:47:40 -0700 Subject: [PATCH] Make printk work for really early debugging Currently printk is no use for early debugging because it refuses to actually print anything to the console unless cpu_online(smp_processor_id()) is true. The stated explanation is that console drivers may require per-cpu resources, or otherwise barf, because the system is not yet setup correctly. Fair enough. However some console drivers might be quite happy running early during boot, in fact we have one, and so it'd be nice if printk understood that. So I added a flag (which I would have called CON_BOOT, but that's taken) called CON_ANYTIME, which indicates that a console is happy to be called anytime, even if the cpu is not yet online. Tested on a Power 5 machine, with both a CON_ANYTIME driver and a bogus console driver that BUG()s if called while offline. No problems AFAICT. Built for i386 UP & SMP. Signed-off-by: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/console.h | 1 + kernel/printk.c | 50 ++++++++++++++++++++++++++++++++----------------- 2 files changed, 34 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/console.h b/include/linux/console.h index 08734e660d41..d0f8a8009490 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -87,6 +87,7 @@ void give_up_console(const struct consw *sw); #define CON_CONSDEV (2) /* Last on the command line */ #define CON_ENABLED (4) #define CON_BOOT (8) +#define CON_ANYTIME (16) /* Safe to call when cpu is offline */ struct console { diff --git a/kernel/printk.c b/kernel/printk.c index 19a955619294..6b89dd9d11b6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -327,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end) struct console *con; for (con = console_drivers; con; con = con->next) { - if ((con->flags & CON_ENABLED) && con->write) + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) con->write(con, &LOG_BUF(start), end - start); } } @@ -453,6 +455,18 @@ __attribute__((weak)) unsigned long long printk_clock(void) return sched_clock(); } +/* Check if we have any console registered that can be called early in boot. */ +static int have_callable_console(void) +{ + struct console *con; + + for (con = console_drivers; con; con = con->next) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + /** * printk - print a kernel message * @fmt: format string @@ -566,27 +580,29 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id())) { + if (!down_trylock(&console_sem)) { /* - * Some console drivers may assume that per-cpu resources have - * been allocated. So don't allow them to be called by this - * CPU until it is officially up. We shouldn't be calling into - * random console drivers on a CPU which doesn't exist yet.. + * We own the drivers. We can drop the spinlock and + * let release_console_sem() print the text, maybe ... */ + console_locked = 1; printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); - goto out; - } - if (!down_trylock(&console_sem)) { - console_locked = 1; + /* - * We own the drivers. We can drop the spinlock and let - * release_console_sem() print the text + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. */ - printk_cpu = UINT_MAX; - spin_unlock_irqrestore(&logbuf_lock, flags); - console_may_schedule = 0; - release_console_sem(); + if (cpu_online(smp_processor_id()) || have_callable_console()) { + console_may_schedule = 0; + release_console_sem(); + } else { + /* Release by hand to avoid flushing the buffer. */ + console_locked = 0; + up(&console_sem); + } } else { /* * Someone else owns the drivers. We drop the spinlock, which @@ -596,7 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } -out: + preempt_enable(); return printed_len; } -- cgit v1.2.3 From 83cc5ed3c4c65fc4c3729a5cec2111ede1ebf85e Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 25 Jun 2006 05:47:41 -0700 Subject: [PATCH] kernel/sys.c: cleanups - proper prototypes for the following functions: - ctrl_alt_del() (in include/linux/reboot.h) - getrusage() (in include/linux/resource.h) - make the following needlessly global functions static: - kernel_restart_prepare() - kernel_kexec() [akpm@osdl.org: compile fix] Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-ixp4xx/nas100d-power.c | 3 +-- arch/arm/mach-ixp4xx/nslu2-power.c | 3 +-- arch/mips/kernel/irixsig.c | 3 +-- arch/mips/kernel/sysirix.c | 2 +- arch/um/drivers/mconsole_kern.c | 2 -- drivers/char/keyboard.c | 1 + drivers/s390/char/sclp_quiesce.c | 3 +-- include/linux/reboot.h | 4 ++-- include/linux/resource.h | 4 ++++ kernel/exit.c | 3 +-- kernel/sys.c | 5 ++--- 11 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-ixp4xx/nas100d-power.c b/arch/arm/mach-ixp4xx/nas100d-power.c index 99d333d7ebdd..a3745ed37f9f 100644 --- a/arch/arm/mach-ixp4xx/nas100d-power.c +++ b/arch/arm/mach-ixp4xx/nas100d-power.c @@ -20,11 +20,10 @@ #include #include #include +#include #include -extern void ctrl_alt_del(void); - static irqreturn_t nas100d_reset_handler(int irq, void *dev_id, struct pt_regs *regs) { /* Signal init to do the ctrlaltdel action, this will bypass init if diff --git a/arch/arm/mach-ixp4xx/nslu2-power.c b/arch/arm/mach-ixp4xx/nslu2-power.c index d80c362bc539..6d38e97142cc 100644 --- a/arch/arm/mach-ixp4xx/nslu2-power.c +++ b/arch/arm/mach-ixp4xx/nslu2-power.c @@ -20,11 +20,10 @@ #include #include #include +#include #include -extern void ctrl_alt_del(void); - static irqreturn_t nslu2_power_handler(int irq, void *dev_id, struct pt_regs *regs) { /* Signal init to do the ctrlaltdel action, this will bypass init if diff --git a/arch/mips/kernel/irixsig.c b/arch/mips/kernel/irixsig.c index a9bf6cc3abd1..676e868d26fb 100644 --- a/arch/mips/kernel/irixsig.c +++ b/arch/mips/kernel/irixsig.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -540,8 +541,6 @@ out: #define IRIX_P_PGID 2 #define IRIX_P_ALL 7 -extern int getrusage(struct task_struct *, int, struct rusage __user *); - #define W_EXITED 1 #define W_TRAPPED 2 #define W_STOPPED 4 diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index 19e1ef43eb4b..1137dd6ea7aa 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -235,7 +236,6 @@ asmlinkage int irix_prctl(unsigned option, ...) #undef DEBUG_PROCGRPS extern unsigned long irix_mapelf(int fd, struct elf_phdr __user *user_phdrp, int cnt); -extern int getrusage(struct task_struct *p, int who, struct rusage __user *ru); extern char *prom_getenv(char *name); extern long prom_setenv(char *name, char *value); diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 6d7173fc55a3..79149314ed04 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -300,8 +300,6 @@ void mconsole_reboot(struct mc_request *req) machine_restart(NULL); } -extern void ctrl_alt_del(void); - void mconsole_cad(struct mc_request *req) { mconsole_reply(req, "", 0, 0); diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index 5755b7e5f187..edd996f6fb87 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -39,6 +39,7 @@ #include #include #include +#include static void kbd_disconnect(struct input_handle *handle); extern void ctrl_alt_del(void); diff --git a/drivers/s390/char/sclp_quiesce.c b/drivers/s390/char/sclp_quiesce.c index 56fa69168898..a4c53c172db6 100644 --- a/drivers/s390/char/sclp_quiesce.c +++ b/drivers/s390/char/sclp_quiesce.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -66,8 +67,6 @@ do_machine_quiesce(void) } #endif -extern void ctrl_alt_del(void); - /* Handler for quiesce event. Start shutdown procedure. */ static void sclp_quiesce_handler(struct evbuf_header *evbuf) diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 015297ff73fa..1dd1c707311f 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -59,13 +59,13 @@ extern void machine_crash_shutdown(struct pt_regs *); * Architecture independent implemenations of sys_reboot commands. */ -extern void kernel_restart_prepare(char *cmd); extern void kernel_shutdown_prepare(enum system_states state); extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); -extern void kernel_kexec(void); + +void ctrl_alt_del(void); /* * Emergency restart, callable from an interrupt handler. diff --git a/include/linux/resource.h b/include/linux/resource.h index 21a86cb6acdb..ae13db714742 100644 --- a/include/linux/resource.h +++ b/include/linux/resource.h @@ -3,6 +3,8 @@ #include +struct task_struct; + /* * Resource control/accounting header file for linux */ @@ -67,4 +69,6 @@ struct rlimit { */ #include +int getrusage(struct task_struct *p, int who, struct rusage __user *ru); + #endif diff --git a/kernel/exit.c b/kernel/exit.c index a3baf92462bd..b12a4706f73f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -36,6 +36,7 @@ #include #include #include /* for audit_free() */ +#include #include #include @@ -45,8 +46,6 @@ extern void sem_exit (void); extern struct task_struct *child_reaper; -int getrusage(struct task_struct *, int, struct rusage __user *); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) diff --git a/kernel/sys.c b/kernel/sys.c index 7e0927bad713..2d5179c67cec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -589,7 +589,7 @@ void emergency_restart(void) } EXPORT_SYMBOL_GPL(emergency_restart); -void kernel_restart_prepare(char *cmd) +static void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; @@ -623,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ -void kernel_kexec(void) +static void kernel_kexec(void) { #ifdef CONFIG_KEXEC struct kimage *image; @@ -637,7 +637,6 @@ void kernel_kexec(void) machine_kexec(image); #endif } -EXPORT_SYMBOL_GPL(kernel_kexec); void kernel_shutdown_prepare(enum system_states state) { -- cgit v1.2.3 From b61367732fc273977cc3fb85c272ce1a7bb1f533 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 25 Jun 2006 05:47:49 -0700 Subject: [PATCH] schedule_on_each_cpu(): reduce kmalloc() size schedule_on_each_cpu() presently does a large kmalloc - 96 kbytes on 1024 CPU 64-bit. Rework it so that we do one 8192-byte allocation and then a pile of tiny ones, via alloc_percpu(). This has a much higher chance of success (100% in the current VM). This also has the effect of reducing the memory requirements from NR_CPUS*n to num_possible_cpus()*n. Cc: Christoph Lameter Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 740c5abceb07..f869aff6bc0c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu, return ret; } -int schedule_on_each_cpu(void (*func) (void *info), void *info) +/** + * schedule_on_each_cpu - call a function on each online CPU from keventd + * @func: the function to call + * @info: a pointer to pass to func() + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * Appears to be racy against CPU hotplug. + * + * schedule_on_each_cpu() is very slow. + */ +int schedule_on_each_cpu(void (*func)(void *info), void *info) { int cpu; - struct work_struct *work; + struct work_struct *works; - work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); - - if (!work) + works = alloc_percpu(struct work_struct); + if (!works) return -ENOMEM; + for_each_online_cpu(cpu) { - INIT_WORK(work + cpu, func, info); + INIT_WORK(per_cpu_ptr(works, cpu), func, info); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), - work + cpu); + per_cpu_ptr(works, cpu)); } flush_workqueue(keventd_wq); - kfree(work); + free_percpu(works); return 0; } -- cgit v1.2.3 From 3b364b8d584b94777f8446a943b3c65e75e758f8 Mon Sep 17 00:00:00 2001 From: Andreas Mohr Date: Sun, 25 Jun 2006 05:47:56 -0700 Subject: [PATCH] constify parts of kernel/power/ Signed-off-by: Andreas Mohr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 2 +- kernel/power/main.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 81d4d982f3f0..e13e74067845 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -231,7 +231,7 @@ static int software_resume(void) late_initcall(software_resume); -static char * pm_disk_modes[] = { +static const char * const pm_disk_modes[] = { [PM_DISK_FIRMWARE] = "firmware", [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", diff --git a/kernel/power/main.c b/kernel/power/main.c index cdf0f07af92f..6d295c776794 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -145,7 +145,7 @@ static void suspend_finish(suspend_state_t state) -static char *pm_states[PM_SUSPEND_MAX] = { +static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", #ifdef CONFIG_SOFTWARE_SUSPEND @@ -262,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) { suspend_state_t state = PM_SUSPEND_STANDBY; - char ** s; + const char * const *s; char *p; int error; int len; -- cgit v1.2.3 From 11e64757f9fb32f13f51596bbf01988f42fca764 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sun, 25 Jun 2006 05:48:03 -0700 Subject: [PATCH] Remove unecessary NULL check in kernel/acct.c copy_process() appears to be the only caller of acct_clear_integrals() and does not pass in NULL task pointers. Remove the unecessary check. Signed-off-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 6802020e0ceb..44dd6bd63517 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -599,9 +599,7 @@ void acct_update_integrals(struct task_struct *tsk) */ void acct_clear_integrals(struct task_struct *tsk) { - if (tsk) { - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; - } + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; } -- cgit v1.2.3 From 3b9c04106b70e46803c69d13d5da32f6129fa76d Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Sun, 25 Jun 2006 05:48:15 -0700 Subject: [PATCH] printk time parameter Currently, enabling/disabling printk timestamps is only possible through reboot (bootparam) or recompile. I normally do not run with timestamps (since syslog handles that in a good manner), but for measuring small kernel delays (e.g. irq probing - see parport thread) I needed subsecond precision, but then again, just for some minutes rather than all kernel messages to come. The following patch adds a module_param() with which the timestamps can be en-/disabled in a live system through /sys/modules/printk/parameters/printk_time. Signed-off-by: Jan Engelhardt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 6b89dd9d11b6..95b7fe17f124 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* For in_interrupt() */ #include #include @@ -439,6 +440,7 @@ static int printk_time = 1; #else static int printk_time = 0; #endif +module_param(printk_time, int, S_IRUGO | S_IWUSR); static int __init printk_time_setup(char *str) { -- cgit v1.2.3 From eab03ac7bd3e0da99eb9dc068772a85a5e3f3577 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 25 Jun 2006 05:48:31 -0700 Subject: [PATCH] Get rid of /proc/sys/proc The table is empty, why does it still exist? Signed-off-by: Stephen Hemminger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysctl.h | 4 +--- kernel/sysctl.c | 11 ----------- 2 files changed, 1 insertion(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index c7132029af0f..6a60770984e9 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -55,7 +55,7 @@ enum CTL_KERN=1, /* General kernel info and control */ CTL_VM=2, /* VM management */ CTL_NET=3, /* Networking */ - CTL_PROC=4, /* Process info */ + /* was CTL_PROC */ CTL_FS=5, /* Filesystems */ CTL_DEBUG=6, /* Debugging */ CTL_DEV=7, /* Devices */ @@ -767,8 +767,6 @@ enum { NET_BRIDGE_NF_FILTER_VLAN_TAGGED = 4, }; -/* CTL_PROC names: */ - /* CTL_FS names: */ enum { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index eb8bd214e7d7..2c0e65819448 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -143,7 +143,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; static ctl_table dev_table[]; @@ -202,12 +201,6 @@ static ctl_table root_table[] = { .child = net_table, }, #endif - { - .ctl_name = CTL_PROC, - .procname = "proc", - .mode = 0555, - .child = proc_table, - }, { .ctl_name = CTL_FS, .procname = "fs", @@ -927,10 +920,6 @@ static ctl_table vm_table[] = { { .ctl_name = 0 } }; -static ctl_table proc_table[] = { - { .ctl_name = 0 } -}; - static ctl_table fs_table[] = { { .ctl_name = FS_NRINODE, -- cgit v1.2.3 From 838cd153a5250a79a302f6c5d68a4794b70c4ccb Mon Sep 17 00:00:00 2001 From: "akpm@osdl.org" Date: Sun, 25 Jun 2006 05:48:35 -0700 Subject: [PATCH] N32 sigset and __COMPAT_ENDIAN_SWAP__ I'm testing glibc on MIPS64, little-endian, N32, O32 and N64 multilibs. Among the NPTL test failures seen are some arising from sigsuspend problems for N32: it blocks the wrong signals, so SIGCANCEL (SIGRTMIN) is blocked despite glibc's carefully excluding it from sets of signals to block. Specifically, testing suggests it blocks signal N^32 instead of signal N, so (in the example tested) blocking SIGUSR1 (17) blocks signal 49 instead. glibc's sigset_t uses an array of unsigned long, as does the kernel. In both cases, signal N+1 is represented as (1UL << (N % (8 * sizeof (unsigned long)))) in word number (N / (8 * sizeof (unsigned long))). Thus the N32 glibc uses an array of 32-bit words and the N64 kernel uses an array of 64-bit words. For little-endian, the layout is the same, with signals 1-32 in the first 4 bytes, signals 33-64 in the second, etc.; for big-endian, userspace has that layout while in the kernel each 8 bytes have the two halves swapped from the userspace layout. The N32 sigsuspend syscall uses sigset_from_compat to convert the userspace sigset to kernel format. If __COMPAT_ENDIAN_SWAP__ is *not* set, this uses logic of the form set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ) to convert the userspace sigset to a kernel one. This looks correct to me for both big and little endian, given that in userspace compat->sig[1] will represent signals 33-64, and so will the high 32 bits of set->sig[0] in the kernel. If however __COMPAT_ENDIAN_SWAP__ *is* set, as it is for __MIPSEL__, it uses set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); which seems incorrect for both big and little endian, and would explain the observed symptoms. This code is the only use of __COMPAT_ENDIAN_SWAP__, so if incorrect then that macro serves no purpose, in which case something like the following patch would seem appropriate to remove it. Signed-off-by: Joseph Myers Signed-off-by: Ralf Baechle Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-mips/compat.h | 3 --- kernel/compat.c | 7 ------- 2 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/include/asm-mips/compat.h b/include/asm-mips/compat.h index 986511db54a6..900f472fdd2b 100644 --- a/include/asm-mips/compat.h +++ b/include/asm-mips/compat.h @@ -145,8 +145,5 @@ static inline void __user *compat_alloc_user_space(long len) return (void __user *) (regs->regs[29] - len); } -#if defined (__MIPSEL__) -#define __COMPAT_ENDIAN_SWAP__ 1 -#endif #endif /* _ASM_COMPAT_H */ diff --git a/kernel/compat.c b/kernel/compat.c index 2f672332430f..126dee9530aa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -730,17 +730,10 @@ void sigset_from_compat (sigset_t *set, compat_sigset_t *compat) { switch (_NSIG_WORDS) { -#if defined (__COMPAT_ENDIAN_SWAP__) - case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); - case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); - case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); - case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); -#else case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); -#endif } } -- cgit v1.2.3 From 2aa92581fb13e04e1440e5041b412cc06c782e0e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 25 Jun 2006 05:48:44 -0700 Subject: [PATCH] Link error when futexes are disabled on 64bit architectures If futexes are disabled we fail to link on ppc64. Signed-off-by: Anton Blanchard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index b12a4706f73f..601263c0806f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -898,7 +898,7 @@ fastcall NORET_TYPE void do_exit(long code) } if (unlikely(tsk->robust_list)) exit_robust_list(tsk); -#ifdef CONFIG_COMPAT +#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif -- cgit v1.2.3 From 8bdd1d1250d55afe403ac4affa6ccc5f9e60468f Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Sun, 25 Jun 2006 05:49:08 -0700 Subject: [PATCH] kthread: convert stop_machine into a kthread - Update stop_machine.c to spawn stop_machine as kthreads rather than the deprecated kernel_threads. - Update stop_machine to use the more efficient kthread_bind() before running task in place of set_cpus_allowed() after. [akpm@osdl.org: remove now-wrong set_cpus_allowed()] Signed-off-by: Serge E. Hallyn Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d731466..2c0aacc37c55 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; static DECLARE_MUTEX(stopmachine_mutex); -static int stopmachine(void *cpu) +static int stopmachine(void *unused) { int irqs_disabled = 0; int prepared = 0; - set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); - /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ atomic_inc(&stopmachine_thread_ack); @@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state) static int stop_machine(void) { - int i, ret = 0; + int ret = 0; + unsigned int i; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. */ @@ -96,11 +96,16 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { + struct task_struct *tsk; if (i == raw_smp_processor_id()) continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) + tsk = kthread_create(stopmachine, NULL, "stopmachine"); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); break; + } + kthread_bind(tsk, i); + wake_up_process(tsk); stopmachine_num_threads++; } -- cgit v1.2.3 From fc75cdfa5b43ac4d3232b490800cd35063adafd3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 25 Jun 2006 05:49:10 -0700 Subject: [PATCH] cpu hotplug: fix CPU_UP_CANCEL handling If a cpu hotplug callback fails on CPU_UP_PREPARE, all callbacks will be called with CPU_UP_CANCELED. A few of these callbacks assume that on CPU_UP_PREPARE a pointer to task has been stored in a percpu array. This assumption is not true if CPU_UP_PREPARE fails and the following calls to kthread_bind() in CPU_UP_CANCELED will cause an addressing exception because of passing a NULL pointer. Signed-off-by: Heiko Carstens Cc: Ashok Raj Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 2 ++ kernel/softirq.c | 2 ++ kernel/softlockup.c | 2 ++ kernel/workqueue.c | 2 ++ 4 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f8d540b324ca..f06d059edef5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4756,6 +4756,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!cpu_rq(cpu)->migration_thread) + break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, any_online_cpu(cpu_online_map)); diff --git a/kernel/softirq.c b/kernel/softirq.c index 336f92d64e2e..9e2f1c6e73d7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(ksoftirqd, hotcpu)) + break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 2c1be1163edc..b5c3b94e01ce 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(watchdog_task, hotcpu)) + break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f869aff6bc0c..565cf7a1febd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -590,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { + if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) + continue; /* Unbind so it can run. */ kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, any_online_cpu(cpu_online_map)); -- cgit v1.2.3 From fa9799e33d362aeca4555cd6318735bab1c04d16 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 25 Jun 2006 05:49:15 -0700 Subject: [PATCH] ktime/hrtimer: fix kernel-doc comments Fix kernel-doc formatting in ktime.h and hrtimer.[ch] files. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hrtimer.h | 3 --- include/linux/ktime.h | 8 -------- kernel/hrtimer.c | 11 +---------- 3 files changed, 1 insertion(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 7d2a1b974c5e..07d7305f131e 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -40,7 +40,6 @@ struct hrtimer_base; /** * struct hrtimer - the basic hrtimer structure - * * @node: red black tree node for time ordered insertion * @expires: the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on @@ -59,7 +58,6 @@ struct hrtimer { /** * struct hrtimer_sleeper - simple sleeper structure - * * @timer: embedded timer structure * @task: task to wake up * @@ -72,7 +70,6 @@ struct hrtimer_sleeper { /** * struct hrtimer_base - the timer base for a specific clock - * * @index: clock type index for per_cpu support when moving a timer * to a base on another cpu. * @lock: lock protecting the base and associated timers diff --git a/include/linux/ktime.h b/include/linux/ktime.h index 62bc57580707..ed3396dcc4f7 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -66,7 +66,6 @@ typedef union { /** * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value - * * @secs: seconds to set * @nsecs: nanoseconds to set * @@ -138,7 +137,6 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) /** * ktime_sub - subtract two ktime_t variables - * * @lhs: minuend * @rhs: subtrahend * @@ -157,7 +155,6 @@ static inline ktime_t ktime_sub(const ktime_t lhs, const ktime_t rhs) /** * ktime_add - add two ktime_t variables - * * @add1: addend1 * @add2: addend2 * @@ -184,7 +181,6 @@ static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2) /** * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * * @kt: addend * @nsec: the scalar nsec value to add * @@ -194,7 +190,6 @@ extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec); /** * timespec_to_ktime - convert a timespec to ktime_t format - * * @ts: the timespec variable to convert * * Returns a ktime_t variable with the converted timespec value @@ -207,7 +202,6 @@ static inline ktime_t timespec_to_ktime(const struct timespec ts) /** * timeval_to_ktime - convert a timeval to ktime_t format - * * @tv: the timeval variable to convert * * Returns a ktime_t variable with the converted timeval value @@ -220,7 +214,6 @@ static inline ktime_t timeval_to_ktime(const struct timeval tv) /** * ktime_to_timespec - convert a ktime_t variable to timespec format - * * @kt: the ktime_t variable to convert * * Returns the timespec representation of the ktime value @@ -233,7 +226,6 @@ static inline struct timespec ktime_to_timespec(const ktime_t kt) /** * ktime_to_timeval - convert a ktime_t variable to timeval format - * * @kt: the ktime_t variable to convert * * Returns the timeval representation of the ktime value diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 9587aac72f4d..55601b3ce60e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = /** * ktime_get_ts - get the monotonic clock in timespec format - * * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime @@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) # ifndef CONFIG_KTIME_SCALAR /** * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * * @kt: addend * @nsec: the scalar nsec value to add * @@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /** * hrtimer_forward - forward the timer expiry - * * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward @@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) /** * hrtimer_start - (re)start an relative timer on the current CPU - * * @timer: the timer to be added * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) @@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); /** * hrtimer_try_to_cancel - try to deactivate a timer - * * @timer: hrtimer to stop * * Returns: * 0 when the timer was not active * 1 when the timer was active * -1 when the timer is currently excuting the callback function and - * can not be stopped + * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { @@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. - * * @timer: the timer to be cancelled * * Returns: @@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer - * * @timer: the timer to read */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) @@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void) /** * hrtimer_init - initialize a timer to the given clock - * * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel @@ -588,7 +580,6 @@ EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_get_res - get the timer resolution for a clock - * * @which_clock: which clock to query * @tp: pointer to timespec variable to store the resolution * -- cgit v1.2.3 From 9e37bd301ee130598fa1406c1281caa159473bf8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 25 Jun 2006 05:49:19 -0700 Subject: [PATCH] kthread: move kernel-doc and put it into DocBook Move kthread API kernel-doc from kthread.h to kthread.c & fix it. Add kthread API to kernel-api DocBook. Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/DocBook/kernel-api.tmpl | 2 ++ include/linux/kthread.h | 65 ++--------------------------------- kernel/kthread.c | 61 +++++++++++++++++++++++++++++++- 3 files changed, 65 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl index 5a4abe0d5165..8305eb7b8c15 100644 --- a/Documentation/DocBook/kernel-api.tmpl +++ b/Documentation/DocBook/kernel-api.tmpl @@ -62,6 +62,8 @@ Internal Functions !Ikernel/exit.c !Ikernel/signal.c +!Iinclude/linux/kthread.h +!Ekernel/kthread.c Kernel objects manipulation diff --git a/include/linux/kthread.h b/include/linux/kthread.h index ebdd41fd1082..7cce5dfa092f 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -4,37 +4,19 @@ #include #include -/** - * kthread_create: create a kthread. - * @threadfn: the function to run until signal_pending(current). - * @data: data ptr for @threadfn. - * @namefmt: printf-style name for the thread. - * - * Description: This helper function creates and names a kernel - * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(), kthread_create_on_cpu(). - * - * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn can either call do_exit() directly if it is a - * standalone thread for which noone will call kthread_stop(), or - * return when 'kthread_should_stop()' is true (which means - * kthread_stop() has been called). The return value should be zero - * or a negative error number: it will be passed to kthread_stop(). - * - * Returns a task_struct or ERR_PTR(-ENOMEM). - */ struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], ...); /** - * kthread_run: create and wake a thread. + * kthread_run - create and wake a thread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @namefmt: printf-style name for the thread. * * Description: Convenient wrapper for kthread_create() followed by - * wake_up_process(). Returns the kthread, or ERR_PTR(-ENOMEM). */ + * wake_up_process(). Returns the kthread or ERR_PTR(-ENOMEM). + */ #define kthread_run(threadfn, data, namefmt, ...) \ ({ \ struct task_struct *__k \ @@ -44,50 +26,9 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), __k; \ }) -/** - * kthread_bind: bind a just-created kthread to a cpu. - * @k: thread created by kthread_create(). - * @cpu: cpu (might not be online, must be possible) for @k to run on. - * - * Description: This function is equivalent to set_cpus_allowed(), - * except that @cpu doesn't need to be online, and the thread must be - * stopped (ie. just returned from kthread_create(). - */ void kthread_bind(struct task_struct *k, unsigned int cpu); - -/** - * kthread_stop: stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * - * Sets kthread_should_stop() for @k to return true, wakes it, and - * waits for it to exit. Your threadfn() must not call do_exit() - * itself if you use this function! This can also be called after - * kthread_create() instead of calling wake_up_process(): the thread - * will exit without calling threadfn(). - * - * Returns the result of threadfn(), or -EINTR if wake_up_process() - * was never called. */ int kthread_stop(struct task_struct *k); - -/** - * kthread_stop_sem: stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * @s: semaphore that @k waits on while idle. - * - * Does essentially the same thing as kthread_stop() above, but wakes - * @k by calling up(@s). - * - * Returns the result of threadfn(), or -EINTR if wake_up_process() - * was never called. */ int kthread_stop_sem(struct task_struct *k, struct semaphore *s); - -/** - * kthread_should_stop: should this kthread return now? - * - * When someone calls kthread_stop on your kthread, it will be woken - * and this will return true. You should then return, and your return - * value will be passed through to kthread_stop(). - */ int kthread_should_stop(void); #endif /* _LINUX_KTHREAD_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index c5f3c6613b6d..24be714b04c7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -45,6 +45,13 @@ struct kthread_stop_info static DEFINE_MUTEX(kthread_stop_lock); static struct kthread_stop_info kthread_stop_info; +/** + * kthread_should_stop - should this kthread return now? + * + * When someone calls kthread_stop on your kthread, it will be woken + * and this will return true. You should then return, and your return + * value will be passed through to kthread_stop(). + */ int kthread_should_stop(void) { return (kthread_stop_info.k == current); @@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create) complete(&create->done); } +/** + * kthread_create - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(), kthread_create_on_cpu(). + * + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn can either call do_exit() directly if it is a + * standalone thread for which noone will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM). + */ struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], @@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @k: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create(). + */ void kthread_bind(struct task_struct *k, unsigned int cpu) { BUG_ON(k->state != TASK_INTERRUPTIBLE); @@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) } EXPORT_SYMBOL(kthread_bind); +/** + * kthread_stop - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_stop() for @k to return true, wakes it, and + * waits for it to exit. Your threadfn() must not call do_exit() + * itself if you use this function! This can also be called after + * kthread_create() instead of calling wake_up_process(): the thread + * will exit without calling threadfn(). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop(struct task_struct *k) { return kthread_stop_sem(k, NULL); } EXPORT_SYMBOL(kthread_stop); +/** + * kthread_stop_sem - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * @s: semaphore that @k waits on while idle. + * + * Does essentially the same thing as kthread_stop() above, but wakes + * @k by calling up(@s). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -210,5 +269,5 @@ static __init int helper_init(void) return 0; } -core_initcall(helper_init); +core_initcall(helper_init); -- cgit v1.2.3 From 0e4648141af02331f21aabcd34940c70f09a2d04 Mon Sep 17 00:00:00 2001 From: KaiGai Kohei Date: Sun, 25 Jun 2006 05:49:24 -0700 Subject: [PATCH] pacct: add pacct_struct to fix some pacct bugs. The pacct facility need an i/o operation when an accounting record is generated. There is a possibility to wake OOM killer up. If OOM killer is activated, it kills some processes to make them release process memory regions. But acct_process() is called in the killed processes context before calling exit_mm(), so those processes cannot release own memory. In the results, any processes stop in this point and it finally cause a system stall. --- include/linux/acct.h | 4 ++++ include/linux/sched.h | 7 +++++++ kernel/acct.c | 51 ++++++++++++++++++++++++++++++++++++--------------- kernel/exit.c | 4 +++- kernel/fork.c | 1 + 5 files changed, 51 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/acct.h b/include/linux/acct.h index 3d54fbcf969e..5bca9b3ef2d7 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -121,12 +121,16 @@ struct vfsmount; struct super_block; extern void acct_auto_close_mnt(struct vfsmount *m); extern void acct_auto_close(struct super_block *sb); +extern void acct_init_pacct(struct pacct_struct *pacct); +extern void acct_collect(); extern void acct_process(long exitcode); extern void acct_update_integrals(struct task_struct *tsk); extern void acct_clear_integrals(struct task_struct *tsk); #else #define acct_auto_close_mnt(x) do { } while (0) #define acct_auto_close(x) do { } while (0) +#define acct_init_pacct(x) do { } while (0) +#define acct_collect() do { } while (0) #define acct_process(x) do { } while (0) #define acct_update_integrals(x) do { } while (0) #define acct_clear_integrals(task) do { } while (0) diff --git a/include/linux/sched.h b/include/linux/sched.h index 38b4791e6a5d..abada7c1d5e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -358,6 +358,10 @@ struct sighand_struct { spinlock_t siglock; }; +struct pacct_struct { + unsigned long ac_mem; +}; + /* * NOTE! "signal_struct" does not have it's own * locking, because a shared signal_struct always @@ -449,6 +453,9 @@ struct signal_struct { struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ #endif +#ifdef CONFIG_BSD_PROCESS_ACCT + struct pacct_struct pacct; /* per-process accounting information */ +#endif }; /* Context switch must be unlocked if interrupts are to be enabled */ diff --git a/kernel/acct.c b/kernel/acct.c index 44dd6bd63517..b35263137824 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -421,9 +421,9 @@ static u32 encode_float(u64 value) */ static void do_acct_process(long exitcode, struct file *file) { + struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; mm_segment_t fs; - unsigned long vsize; unsigned long flim; u64 elapsed; u64 run_time; @@ -505,20 +505,9 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_flag |= ACORE; if (current->flags & PF_SIGNALED) ac.ac_flag |= AXSIG; - - vsize = 0; - if (current->mm) { - struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); - vma = current->mm->mmap; - while (vma) { - vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - up_read(¤t->mm->mmap_sem); - } - vsize = vsize / 1024; - ac.ac_mem = encode_comp_t(vsize); + spin_lock(¤t->sighand->siglock); + ac.ac_mem = encode_comp_t(pacct->ac_mem); + spin_unlock(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_minflt = encode_comp_t(current->signal->min_flt + @@ -545,6 +534,38 @@ static void do_acct_process(long exitcode, struct file *file) set_fs(fs); } +/** + * acct_init_pacct - initialize a new pacct_struct + */ +void acct_init_pacct(struct pacct_struct *pacct) +{ + memset(pacct, 0, sizeof(struct pacct_struct)); +} + +/** + * acct_collect - collect accounting information into pacct_struct + */ +void acct_collect(void) +{ + struct pacct_struct *pacct = ¤t->signal->pacct; + unsigned long vsize = 0; + + if (current->mm) { + struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); + vma = current->mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + up_read(¤t->mm->mmap_sem); + } + + spin_lock(¤t->sighand->siglock); + pacct->ac_mem = vsize / 1024; + spin_unlock(¤t->sighand->siglock); +} + /** * acct_process - now just a wrapper around do_acct_process * @exitcode: task exit code diff --git a/kernel/exit.c b/kernel/exit.c index 601263c0806f..819d82c2efba 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -894,7 +894,7 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); - acct_process(code); + acct_collect(); } if (unlikely(tsk->robust_list)) exit_robust_list(tsk); @@ -906,6 +906,8 @@ fastcall NORET_TYPE void do_exit(long code) audit_free(tsk); exit_mm(tsk); + if (group_dead) + acct_process(code); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 49adc0e8d47c..dfd10cb370c3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -874,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts tsk->it_prof_expires = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); } + acct_init_pacct(&sig->pacct); return 0; } -- cgit v1.2.3 From f6ec29a42d7ac3b309a9cef179b686d23986ab98 Mon Sep 17 00:00:00 2001 From: KaiGai Kohei Date: Sun, 25 Jun 2006 05:49:25 -0700 Subject: [PATCH] pacct: avoidance to refer the last thread as a representation of the process When pacct facility generate an 'ac_flag' field in accounting record, it refers a task_struct of the thread which died last in the process. But any other task_structs are ignored. Therefore, pacct facility drops ASU flag even if root-privilege operations are used by any other threads except the last one. In addition, AFORK flag is always set when the thread of group-leader didn't die last, although this process has called execve() after fork(). We have a same matter in ac_exitcode. The recorded ac_exitcode is an exit code of the last thread in the process. There is a possibility this exitcode is not the group leader's one. --- include/linux/acct.h | 8 ++++---- include/linux/sched.h | 2 ++ kernel/acct.c | 42 ++++++++++++++++++++++++------------------ kernel/exit.c | 4 ++-- 4 files changed, 32 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/acct.h b/include/linux/acct.h index 5bca9b3ef2d7..e86bae7324d2 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -122,16 +122,16 @@ struct super_block; extern void acct_auto_close_mnt(struct vfsmount *m); extern void acct_auto_close(struct super_block *sb); extern void acct_init_pacct(struct pacct_struct *pacct); -extern void acct_collect(); -extern void acct_process(long exitcode); +extern void acct_collect(long exitcode, int group_dead); +extern void acct_process(void); extern void acct_update_integrals(struct task_struct *tsk); extern void acct_clear_integrals(struct task_struct *tsk); #else #define acct_auto_close_mnt(x) do { } while (0) #define acct_auto_close(x) do { } while (0) #define acct_init_pacct(x) do { } while (0) -#define acct_collect() do { } while (0) -#define acct_process(x) do { } while (0) +#define acct_collect(x,y) do { } while (0) +#define acct_process() do { } while (0) #define acct_update_integrals(x) do { } while (0) #define acct_clear_integrals(task) do { } while (0) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index abada7c1d5e4..d8429dc250f0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -359,6 +359,8 @@ struct sighand_struct { }; struct pacct_struct { + int ac_flag; + long ac_exitcode; unsigned long ac_mem; }; diff --git a/kernel/acct.c b/kernel/acct.c index b35263137824..4c85fdf615da 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(long, struct file *); +static void do_acct_process(struct file *); /* * This structure is used so that all the data protected by lock @@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_vfsmnt); spin_unlock(&acct_globals.lock); - do_acct_process(0, old_acct); + do_acct_process(old_acct); filp_close(old_acct, NULL); spin_lock(&acct_globals.lock); } @@ -419,7 +419,7 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(long exitcode, struct file *file) +static void do_acct_process(struct file *file) { struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; @@ -496,17 +496,10 @@ static void do_acct_process(long exitcode, struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - ac.ac_flag = 0; - if (current->flags & PF_FORKNOEXEC) - ac.ac_flag |= AFORK; - if (current->flags & PF_SUPERPRIV) - ac.ac_flag |= ASU; - if (current->flags & PF_DUMPCORE) - ac.ac_flag |= ACORE; - if (current->flags & PF_SIGNALED) - ac.ac_flag |= AXSIG; spin_lock(¤t->sighand->siglock); + ac.ac_flag = pacct->ac_flag; ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_exitcode = pacct->ac_exitcode; spin_unlock(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); @@ -515,7 +508,6 @@ static void do_acct_process(long exitcode, struct file *file) ac.ac_majflt = encode_comp_t(current->signal->maj_flt + current->maj_flt); ac.ac_swaps = encode_comp_t(0); - ac.ac_exitcode = exitcode; /* * Kernel segment override to datasegment and write it @@ -544,13 +536,15 @@ void acct_init_pacct(struct pacct_struct *pacct) /** * acct_collect - collect accounting information into pacct_struct + * @exitcode: task exit code + * @group_dead: not 0, if this thread is the last one in the process. */ -void acct_collect(void) +void acct_collect(long exitcode, int group_dead) { struct pacct_struct *pacct = ¤t->signal->pacct; unsigned long vsize = 0; - if (current->mm) { + if (group_dead && current->mm) { struct vm_area_struct *vma; down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; @@ -562,7 +556,19 @@ void acct_collect(void) } spin_lock(¤t->sighand->siglock); - pacct->ac_mem = vsize / 1024; + if (group_dead) + pacct->ac_mem = vsize / 1024; + if (thread_group_leader(current)) { + pacct->ac_exitcode = exitcode; + if (current->flags & PF_FORKNOEXEC) + pacct->ac_flag |= AFORK; + } + if (current->flags & PF_SUPERPRIV) + pacct->ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + pacct->ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + pacct->ac_flag |= AXSIG; spin_unlock(¤t->sighand->siglock); } @@ -572,7 +578,7 @@ void acct_collect(void) * * handles process accounting for an exiting task */ -void acct_process(long exitcode) +void acct_process() { struct file *file = NULL; @@ -591,7 +597,7 @@ void acct_process(long exitcode) get_file(file); spin_unlock(&acct_globals.lock); - do_acct_process(exitcode, file); + do_acct_process(file); fput(file); } diff --git a/kernel/exit.c b/kernel/exit.c index 819d82c2efba..e76bd02e930e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -894,8 +894,8 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); - acct_collect(); } + acct_collect(code, group_dead); if (unlikely(tsk->robust_list)) exit_robust_list(tsk); #if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) @@ -907,7 +907,7 @@ fastcall NORET_TYPE void do_exit(long code) exit_mm(tsk); if (group_dead) - acct_process(code); + acct_process(); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); -- cgit v1.2.3 From 77787bfb44da6e6166af088226707aeccee27968 Mon Sep 17 00:00:00 2001 From: KaiGai Kohei Date: Sun, 25 Jun 2006 05:49:26 -0700 Subject: [PATCH] pacct: none-delayed process accounting accumulation In current 2.6.17 implementation, signal_struct refered from task_struct is used for per-process data structure. The pacct facility also uses it as a per-process data structure to store stime, utime, minflt, majflt. But those members are saved in __exit_signal(). It's too late. For example, if some threads exits at same time, pacct facility has a possibility to drop accountings for a part of those threads. (see, the following 'The results of original 2.6.17 kernel') I think accounting information should be completely collected into the per-process data structure before writing out an accounting record. This patch fixes this matter. Accumulation of stime, utime, minflt and majflt are done before generating accounting record. [mingo@elte.hu: fix acct_collect() siglock bug found by lockdep] Signed-off-by: KaiGai Kohei Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 ++ kernel/acct.c | 24 +++++++++++------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d8429dc250f0..8d11d9310db0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -362,6 +362,8 @@ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; + cputime_t ac_utime, ac_stime; + unsigned long ac_minflt, ac_majflt; }; /* diff --git a/kernel/acct.c b/kernel/acct.c index 4c85fdf615da..368c4f03fe0e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -428,7 +428,6 @@ static void do_acct_process(struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; - unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -469,12 +468,6 @@ static void do_acct_process(struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - jiffies = cputime_to_jiffies(cputime_add(current->utime, - current->signal->utime)); - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); - jiffies = cputime_to_jiffies(cputime_add(current->stime, - current->signal->stime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -497,16 +490,16 @@ static void do_acct_process(struct file *file) read_unlock(&tasklist_lock); spin_lock(¤t->sighand->siglock); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_minflt = encode_comp_t(pacct->ac_minflt); + ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; spin_unlock(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_minflt = encode_comp_t(current->signal->min_flt + - current->min_flt); - ac.ac_majflt = encode_comp_t(current->signal->maj_flt + - current->maj_flt); ac.ac_swaps = encode_comp_t(0); /* @@ -532,6 +525,7 @@ static void do_acct_process(struct file *file) void acct_init_pacct(struct pacct_struct *pacct) { memset(pacct, 0, sizeof(struct pacct_struct)); + pacct->ac_utime = pacct->ac_stime = cputime_zero; } /** @@ -555,7 +549,7 @@ void acct_collect(long exitcode, int group_dead) up_read(¤t->mm->mmap_sem); } - spin_lock(¤t->sighand->siglock); + spin_lock_irq(¤t->sighand->siglock); if (group_dead) pacct->ac_mem = vsize / 1024; if (thread_group_leader(current)) { @@ -569,7 +563,11 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - spin_unlock(¤t->sighand->siglock); + pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); + pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_minflt += current->min_flt; + pacct->ac_majflt += current->maj_flt; + spin_unlock_irq(¤t->sighand->siglock); } /** -- cgit v1.2.3 From 72cf2709bf8e0410800f118c4298bfbf8715b303 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Jun 2006 10:04:15 -0700 Subject: Fix PM_TRACE dependency: works only on 32-bit x86 for now Not that x86-64 and other architecture support should be difficult to add (trivial fixups to the data format and add the proper linker script entry). Signed-off-by: Linus Torvalds --- kernel/power/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index cdf315e794ff..fc311a4673a2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -38,7 +38,7 @@ config PM_DEBUG config PM_TRACE bool "Suspend/resume event tracing" - depends on PM && PM_DEBUG && X86 + depends on PM && PM_DEBUG && X86_32 default y ---help--- This enables some cheesy code to save the last PM event point in the -- cgit v1.2.3 From 3448097fccdce4ea8f0fcad4f37f502a8cd72e68 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 25 Jun 2006 18:41:00 -0700 Subject: Revert "swsusp special saveable pages support" commits This reverts commits 3e3318dee0878d42ed62a19c292a2ac284135db3 [PATCH] swsusp: x86_64 mark special saveable/unsaveable pages b6370d96e09944c6e3ae8d5743ca8a8ab1f79f6c [PATCH] swsusp: i386 mark special saveable/unsaveable pages ce4ab0012b32c1a4a1d6e934aeb73bf3151c48d9 [PATCH] swsusp: add architecture special saveable pages support because not only do they apparently cause page faults on x86, the infrastructure doesn't compile on powerpc. Signed-off-by: Linus Torvalds --- arch/i386/kernel/setup.c | 106 ------------------------------------------ arch/x86_64/kernel/setup.c | 95 -------------------------------------- include/linux/suspend.h | 1 - kernel/power/power.h | 4 -- kernel/power/snapshot.c | 112 ++------------------------------------------- kernel/power/swsusp.c | 18 ++++++-- 6 files changed, 18 insertions(+), 318 deletions(-) (limited to 'kernel') diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 6c1639836e06..6bef9273733e 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -48,7 +48,6 @@ #include #include #include -#include #include