Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 1
-rw-r--r-- | kernel/audit.c | 160
-rw-r--r-- | kernel/audit.h | 10
-rw-r--r-- | kernel/auditfilter.c | 289
-rw-r--r-- | kernel/auditsc.c | 269
-rw-r--r-- | kernel/cpuset.c | 25
-rw-r--r-- | kernel/exit.c | 11
-rw-r--r-- | kernel/extable.c | 2
-rw-r--r-- | kernel/fork.c | 2
-rw-r--r-- | kernel/hrtimer.c | 10
-rw-r--r-- | kernel/intermodule.c | 184
-rw-r--r-- | kernel/irq/manage.c | 6
-rw-r--r-- | kernel/module.c | 12
-rw-r--r-- | kernel/posix-cpu-timers.c | 48
-rw-r--r-- | kernel/power/main.c | 4
-rw-r--r-- | kernel/printk.c | 28
-rw-r--r-- | kernel/profile.c | 2
-rw-r--r-- | kernel/ptrace.c | 57
-rw-r--r-- | kernel/rcupdate.c | 23
-rw-r--r-- | kernel/sched.c | 64
-rw-r--r-- | kernel/softirq.c | 4
-rw-r--r-- | kernel/softlockup.c | 4
-rw-r--r-- | kernel/timer.c | 20
-rw-r--r-- | kernel/workqueue.c | 2
24 files changed, 726 insertions, 511 deletions
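The audit changes below repeat one pattern: when the netlink sender carries an SELinux sid, convert it to a security context string with selinux_ctxid_to_string() and log it as subj=, refusing the configuration change if the conversion fails; otherwise keep the old record format. A minimal sketch of that pattern as one helper (the helper itself is illustrative; the patch open-codes this in audit_set_rate_limit(), audit_set_backlog_limit(), audit_set_enabled() and audit_set_failure()):

static int audit_log_config_change(const char *name, int new, int old,
				   uid_t loginuid, u32 sid)
{
	if (sid) {
		char *ctx = NULL;
		u32 len;
		/* resolve the sender's sid to a printable context */
		int rc = selinux_ctxid_to_string(sid, &ctx, &len);
		if (rc)
			return rc;	/* unresolvable sid: reject the change */
		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
			  "%s=%d old=%d by auid=%u subj=%s",
			  name, new, old, loginuid, ctx);
		kfree(ctx);
	} else
		audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
			  "%s=%d old=%d by auid=%u",
			  name, new, old, loginuid);
	return 0;
}

Each audit_set_*() below also moves the assignment of the new value to after the logging, so a failed sid-to-string conversion leaves the old setting in place.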
diff --git a/kernel/Makefile b/kernel/Makefile index 58908f9d156a..f6ef00f4f90f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o diff --git a/kernel/audit.c b/kernel/audit.c index c8ccbd09048f..df57b493e1cb 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -55,6 +55,9 @@ #include <net/netlink.h> #include <linux/skbuff.h> #include <linux/netlink.h> +#include <linux/selinux.h> + +#include "audit.h" /* No auditing will take place until audit_initialized != 0. * (Initialization happens after skb_init is called.) */ @@ -227,49 +230,103 @@ void audit_log_lost(const char *message) } } -static int audit_set_rate_limit(int limit, uid_t loginuid) +static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) { - int old = audit_rate_limit; - audit_rate_limit = limit; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + int old = audit_rate_limit; + + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_rate_limit=%d old=%d by auid=%u subj=%s", + limit, old, loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_rate_limit=%d old=%d by auid=%u", - audit_rate_limit, old, loginuid); + limit, old, loginuid); + audit_rate_limit = limit; return old; } -static int audit_set_backlog_limit(int limit, uid_t loginuid) +static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) { - int old = audit_backlog_limit; - audit_backlog_limit = limit; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + int old = audit_backlog_limit; + + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_backlog_limit=%d old=%d by auid=%u subj=%s", + limit, old, loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_backlog_limit=%d old=%d by auid=%u", - audit_backlog_limit, old, loginuid); + limit, old, loginuid); + audit_backlog_limit = limit; return old; } -static int audit_set_enabled(int state, uid_t loginuid) +static int audit_set_enabled(int state, uid_t loginuid, u32 sid) { - int old = audit_enabled; + int old = audit_enabled; + if (state != 0 && state != 1) return -EINVAL; - audit_enabled = state; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_enabled=%d old=%d by auid=%u subj=%s", + state, old, loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_enabled=%d old=%d by auid=%u", - audit_enabled, old, loginuid); + state, old, loginuid); + audit_enabled = state; return old; } -static int audit_set_failure(int state, uid_t loginuid) +static int audit_set_failure(int state, uid_t loginuid, u32 sid) { - int old = audit_failure; + int old = audit_failure; + if (state != AUDIT_FAIL_SILENT && state != AUDIT_FAIL_PRINTK && state != AUDIT_FAIL_PANIC) return -EINVAL; - audit_failure = state; - audit_log(NULL, GFP_KERNEL, 
AUDIT_CONFIG_CHANGE, + + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_failure=%d old=%d by auid=%u subj=%s", + state, old, loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, "audit_failure=%d old=%d by auid=%u", - audit_failure, old, loginuid); + state, old, loginuid); + audit_failure = state; return old; } @@ -387,7 +444,7 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { - u32 uid, pid, seq; + u32 uid, pid, seq, sid; void *data; struct audit_status *status_get, status_set; int err; @@ -413,6 +470,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) pid = NETLINK_CREDS(skb)->pid; uid = NETLINK_CREDS(skb)->uid; loginuid = NETLINK_CB(skb).loginuid; + sid = NETLINK_CB(skb).sid; seq = nlh->nlmsg_seq; data = NLMSG_DATA(nlh); @@ -433,25 +491,43 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; status_get = (struct audit_status *)data; if (status_get->mask & AUDIT_STATUS_ENABLED) { - err = audit_set_enabled(status_get->enabled, loginuid); + err = audit_set_enabled(status_get->enabled, + loginuid, sid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { - err = audit_set_failure(status_get->failure, loginuid); + err = audit_set_failure(status_get->failure, + loginuid, sid); if (err < 0) return err; } if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; + if (sid) { + char *ctx = NULL; + u32 len; + int rc; + if ((rc = selinux_ctxid_to_string( + sid, &ctx, &len))) + return rc; + else + audit_log(NULL, GFP_KERNEL, + AUDIT_CONFIG_CHANGE, + "audit_pid=%d old=%d by auid=%u subj=%s", + status_get->pid, old, + loginuid, ctx); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit_pid=%d old=%d by auid=%u", + status_get->pid, old, loginuid); audit_pid = status_get->pid; - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "audit_pid=%d old=%d by auid=%u", - audit_pid, old, loginuid); } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit, loginuid); + audit_set_rate_limit(status_get->rate_limit, + loginuid, sid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) audit_set_backlog_limit(status_get->backlog_limit, - loginuid); + loginuid, sid); break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: @@ -465,8 +541,23 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ab = audit_log_start(NULL, GFP_KERNEL, msg_type); if (ab) { audit_log_format(ab, - "user pid=%d uid=%u auid=%u msg='%.1024s'", - pid, uid, loginuid, (char *)data); + "user pid=%d uid=%u auid=%u", + pid, uid, loginuid); + if (sid) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string( + sid, &ctx, &len)) { + audit_log_format(ab, + " ssid=%u", sid); + /* Maybe call audit_panic? 
*/ + } else + audit_log_format(ab, + " subj=%s", ctx); + kfree(ctx); + } + audit_log_format(ab, " msg='%.1024s'", + (char *)data); audit_set_pid(ab, pid); audit_log_end(ab); } @@ -480,7 +571,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), - loginuid); + loginuid, sid); break; case AUDIT_ADD_RULE: case AUDIT_DEL_RULE: @@ -490,7 +581,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) case AUDIT_LIST_RULES: err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid, uid, seq, data, nlmsg_len(nlh), - loginuid); + loginuid, sid); break; case AUDIT_SIGNAL_INFO: sig_data.uid = audit_sig_uid; @@ -564,6 +655,11 @@ static int __init audit_init(void) skb_queue_head_init(&audit_skb_queue); audit_initialized = 1; audit_enabled = audit_default; + + /* Register the callback with selinux. This callback will be invoked + * when a new policy is loaded. */ + selinux_audit_set_callback(&selinux_audit_rule_update); + audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); return 0; } diff --git a/kernel/audit.h b/kernel/audit.h index bc5392076e2b..6f733920fd32 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -54,9 +54,11 @@ enum audit_state { /* Rule lists */ struct audit_field { - u32 type; - u32 val; - u32 op; + u32 type; + u32 val; + u32 op; + char *se_str; + struct selinux_audit_rule *se_rule; }; struct audit_krule { @@ -86,3 +88,5 @@ extern void audit_send_reply(int pid, int seq, int type, extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); extern struct mutex audit_netlink_mutex; + +extern int selinux_audit_rule_update(void); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index d3a8539f3a83..7c134906d689 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -23,6 +23,7 @@ #include <linux/audit.h> #include <linux/kthread.h> #include <linux/netlink.h> +#include <linux/selinux.h> #include "audit.h" /* There are three lists of rules -- one to search at task creation @@ -42,6 +43,13 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { static inline void audit_free_rule(struct audit_entry *e) { + int i; + if (e->rule.fields) + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + kfree(f->se_str); + selinux_audit_rule_free(f->se_rule); + } kfree(e->rule.fields); kfree(e); } @@ -52,9 +60,29 @@ static inline void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } +/* Initialize an audit filterlist entry. */ +static inline struct audit_entry *audit_init_entry(u32 field_count) +{ + struct audit_entry *entry; + struct audit_field *fields; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (unlikely(!entry)) + return NULL; + + fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + if (unlikely(!fields)) { + kfree(entry); + return NULL; + } + entry->rule.fields = fields; + + return entry; +} + /* Unpack a filter field's string representation from user-space * buffer. 
*/ -static __attribute__((unused)) char *audit_unpack_string(void **bufp, size_t *remain, size_t len) +static char *audit_unpack_string(void **bufp, size_t *remain, size_t len) { char *str; @@ -84,7 +112,6 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { unsigned listnr; struct audit_entry *entry; - struct audit_field *fields; int i, err; err = -EINVAL; @@ -108,23 +135,14 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) goto exit_err; err = -ENOMEM; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (unlikely(!entry)) - goto exit_err; - fields = kmalloc(sizeof(*fields) * rule->field_count, GFP_KERNEL); - if (unlikely(!fields)) { - kfree(entry); + entry = audit_init_entry(rule->field_count); + if (!entry) goto exit_err; - } - - memset(&entry->rule, 0, sizeof(struct audit_krule)); - memset(fields, 0, sizeof(struct audit_field)); entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; entry->rule.listnr = listnr; entry->rule.action = rule->action; entry->rule.field_count = rule->field_count; - entry->rule.fields = fields; for (i = 0; i < AUDIT_BITMASK_SIZE; i++) entry->rule.mask[i] = rule->mask[i]; @@ -150,15 +168,20 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &entry->rule.fields[i]; - if (rule->fields[i] & AUDIT_UNUSED_BITS) { - err = -EINVAL; - goto exit_free; - } - f->op = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; + if (f->type & AUDIT_UNUSED_BITS || + f->type == AUDIT_SE_USER || + f->type == AUDIT_SE_ROLE || + f->type == AUDIT_SE_TYPE || + f->type == AUDIT_SE_SEN || + f->type == AUDIT_SE_CLR) { + err = -EINVAL; + goto exit_free; + } + entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; /* Support for legacy operators where @@ -188,8 +211,9 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, int err = 0; struct audit_entry *entry; void *bufp; - /* size_t remain = datasz - sizeof(struct audit_rule_data); */ + size_t remain = datasz - sizeof(struct audit_rule_data); int i; + char *str; entry = audit_to_entry_common((struct audit_rule *)data); if (IS_ERR(entry)) @@ -207,10 +231,35 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->op = data->fieldflags[i] & AUDIT_OPERATORS; f->type = data->fields[i]; + f->val = data->values[i]; + f->se_str = NULL; + f->se_rule = NULL; switch(f->type) { - /* call type-specific conversion routines here */ - default: - f->val = data->values[i]; + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = selinux_audit_rule_init(f->type, f->op, str, + &f->se_rule); + /* Keep currently invalid fields around in case they + * become valid after a policy reload. 
*/ + if (err == -EINVAL) { + printk(KERN_WARNING "audit rule for selinux " + "\'%s\' is invalid\n", str); + err = 0; + } + if (err) { + kfree(str); + goto exit_free; + } else + f->se_str = str; + break; } } @@ -286,7 +335,14 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->fields[i] = f->type; data->fieldflags[i] = f->op; switch(f->type) { - /* call type-specific conversion routines here */ + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + data->buflen += data->values[i] = + audit_pack_string(&bufp, f->se_str); + break; default: data->values[i] = f->val; } @@ -314,7 +370,14 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 1; switch(a->fields[i].type) { - /* call type-specific comparison routines here */ + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -328,6 +391,81 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } +/* Duplicate selinux field information. The se_rule is opaque, so must be + * re-initialized. */ +static inline int audit_dupe_selinux_field(struct audit_field *df, + struct audit_field *sf) +{ + int ret = 0; + char *se_str; + + /* our own copy of se_str */ + se_str = kstrdup(sf->se_str, GFP_KERNEL); + if (unlikely(IS_ERR(se_str))) + return -ENOMEM; + df->se_str = se_str; + + /* our own (refreshed) copy of se_rule */ + ret = selinux_audit_rule_init(df->type, df->op, df->se_str, + &df->se_rule); + /* Keep currently invalid fields around in case they + * become valid after a policy reload. */ + if (ret == -EINVAL) { + printk(KERN_WARNING "audit rule for selinux \'%s\' is " + "invalid\n", df->se_str); + ret = 0; + } + + return ret; +} + +/* Duplicate an audit rule. This will be a deep copy with the exception + * of the watch - that pointer is carried over. The selinux specific fields + * will be updated in the copy. The point is to be able to replace the old + * rule with the new rule in the filterlist, then free the old rule. */ +static struct audit_entry *audit_dupe_rule(struct audit_krule *old) +{ + u32 fcount = old->field_count; + struct audit_entry *entry; + struct audit_krule *new; + int i, err = 0; + + entry = audit_init_entry(fcount); + if (unlikely(!entry)) + return ERR_PTR(-ENOMEM); + + new = &entry->rule; + new->vers_ops = old->vers_ops; + new->flags = old->flags; + new->listnr = old->listnr; + new->action = old->action; + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + new->mask[i] = old->mask[i]; + new->buflen = old->buflen; + new->field_count = old->field_count; + memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); + + /* deep copy this information, updating the se_rule fields, because + * the originals will all be freed when the old rule is freed. */ + for (i = 0; i < fcount; i++) { + switch (new->fields[i].type) { + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + err = audit_dupe_selinux_field(&new->fields[i], + &old->fields[i]); + } + if (err) { + audit_free_rule(entry); + return ERR_PTR(err); + } + } + + return entry; +} + /* Add rule to given filterlist if not a duplicate. Protected by * audit_netlink_mutex. 
*/ static inline int audit_add_rule(struct audit_entry *entry, @@ -448,9 +586,10 @@ static int audit_list_rules(void *_dest) * @data: payload data * @datasz: size of payload data * @loginuid: loginuid of sender + * @sid: SE Linux Security ID of sender */ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, - size_t datasz, uid_t loginuid) + size_t datasz, uid_t loginuid, u32 sid) { struct task_struct *tsk; int *dest; @@ -493,9 +632,23 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u add rule to list=%d res=%d\n", - loginuid, entry->rule.listnr, !err); + if (sid) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string(sid, &ctx, &len)) { + /* Maybe call audit_panic? */ + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u ssid=%u add rule to list=%d res=%d", + loginuid, sid, entry->rule.listnr, !err); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u subj=%s add rule to list=%d res=%d", + loginuid, ctx, entry->rule.listnr, !err); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u add rule to list=%d res=%d", + loginuid, entry->rule.listnr, !err); if (err) audit_free_rule(entry); @@ -511,9 +664,24 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_del_rule(entry, &audit_filter_list[entry->rule.listnr]); - audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, - "auid=%u remove rule from list=%d res=%d\n", - loginuid, entry->rule.listnr, !err); + + if (sid) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string(sid, &ctx, &len)) { + /* Maybe call audit_panic? */ + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u ssid=%u remove rule from list=%d res=%d", + loginuid, sid, entry->rule.listnr, !err); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u subj=%s remove rule from list=%d res=%d", + loginuid, ctx, entry->rule.listnr, !err); + kfree(ctx); + } else + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "auid=%u remove rule from list=%d res=%d", + loginuid, entry->rule.listnr, !err); audit_free_rule(entry); break; @@ -628,3 +796,62 @@ unlock_and_return: rcu_read_unlock(); return result; } + +/* Check to see if the rule contains any selinux fields. Returns 1 if there + are selinux fields specified in the rule, 0 otherwise. */ +static inline int audit_rule_has_selinux(struct audit_krule *rule) +{ + int i; + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &rule->fields[i]; + switch (f->type) { + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + return 1; + } + } + + return 0; +} + +/* This function will re-initialize the se_rule field of all applicable rules. + * It will traverse the filter lists serarching for rules that contain selinux + * specific filter fields. When such a rule is found, it is copied, the + * selinux field is re-initialized, and the old rule is replaced with the + * updated rule. 
*/ +int selinux_audit_rule_update(void) +{ + struct audit_entry *entry, *n, *nentry; + int i, err = 0; + + /* audit_netlink_mutex synchronizes the writers */ + mutex_lock(&audit_netlink_mutex); + + for (i = 0; i < AUDIT_NR_FILTERS; i++) { + list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { + if (!audit_rule_has_selinux(&entry->rule)) + continue; + + nentry = audit_dupe_rule(&entry->rule); + if (unlikely(IS_ERR(nentry))) { + /* save the first error encountered for the + * return value */ + if (!err) + err = PTR_ERR(nentry); + audit_panic("error updating selinux filters"); + list_del_rcu(&entry->list); + } else { + list_replace_rcu(&entry->list, &nentry->list); + } + call_rcu(&entry->rcu, audit_free_rule_rcu); + } + } + + mutex_unlock(&audit_netlink_mutex); + + return err; +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 7f160df21a23..1c03a4ed1b27 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -58,6 +58,7 @@ #include <linux/security.h> #include <linux/list.h> #include <linux/tty.h> +#include <linux/selinux.h> #include "audit.h" @@ -89,7 +90,7 @@ struct audit_names { uid_t uid; gid_t gid; dev_t rdev; - char *ctx; + u32 osid; }; struct audit_aux_data { @@ -106,7 +107,7 @@ struct audit_aux_data_ipcctl { uid_t uid; gid_t gid; mode_t mode; - char *ctx; + u32 osid; }; struct audit_aux_data_socketcall { @@ -167,7 +168,8 @@ static int audit_filter_rules(struct task_struct *tsk, struct audit_context *ctx, enum audit_state *state) { - int i, j; + int i, j, need_sid = 1; + u32 sid; for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; @@ -257,6 +259,27 @@ static int audit_filter_rules(struct task_struct *tsk, if (ctx) result = audit_comparator(ctx->loginuid, f->op, f->val); break; + case AUDIT_SE_USER: + case AUDIT_SE_ROLE: + case AUDIT_SE_TYPE: + case AUDIT_SE_SEN: + case AUDIT_SE_CLR: + /* NOTE: this may return negative values indicating + a temporary error. We simply treat this as a + match for now to avoid losing information that + may be wanted. An error message will also be + logged upon error */ + if (f->se_rule) { + if (need_sid) { + selinux_task_ctxid(tsk, &sid); + need_sid = 0; + } + result = selinux_audit_rule_match(sid, f->type, + f->op, + f->se_rule, + ctx); + } + break; case AUDIT_ARG0: case AUDIT_ARG1: case AUDIT_ARG2: @@ -329,7 +352,6 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } -/* This should be called with task_lock() held. 
*/ static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, int return_code) @@ -391,9 +413,6 @@ static inline void audit_free_names(struct audit_context *context) #endif for (i = 0; i < context->name_count; i++) { - char *p = context->names[i].ctx; - context->names[i].ctx = NULL; - kfree(p); if (context->names[i].name) __putname(context->names[i].name); } @@ -416,11 +435,6 @@ static inline void audit_free_aux(struct audit_context *context) dput(axi->dentry); mntput(axi->mnt); } - if ( aux->type == AUDIT_IPC ) { - struct audit_aux_data_ipcctl *axi = (void *)aux; - if (axi->ctx) - kfree(axi->ctx); - } context->aux = aux->next; kfree(aux); @@ -506,7 +520,7 @@ static inline void audit_free_context(struct audit_context *context) printk(KERN_ERR "audit: freed %d contexts\n", count); } -static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) +static void audit_log_task_context(struct audit_buffer *ab) { char *ctx = NULL; ssize_t len = 0; @@ -518,7 +532,7 @@ static void audit_log_task_context(struct audit_buffer *ab, gfp_t gfp_mask) return; } - ctx = kmalloc(len, gfp_mask); + ctx = kmalloc(len, GFP_KERNEL); if (!ctx) goto error_path; @@ -536,47 +550,46 @@ error_path: return; } -static void audit_log_task_info(struct audit_buffer *ab, gfp_t gfp_mask) +static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) { - char name[sizeof(current->comm)]; - struct mm_struct *mm = current->mm; + char name[sizeof(tsk->comm)]; + struct mm_struct *mm = tsk->mm; struct vm_area_struct *vma; - get_task_comm(name, current); + /* tsk == current */ + + get_task_comm(name, tsk); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, name); - if (!mm) - return; - - /* - * this is brittle; all callers that pass GFP_ATOMIC will have - * NULL current->mm and we won't get here. 
- */ - down_read(&mm->mmap_sem); - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && - vma->vm_file) { - audit_log_d_path(ab, "exe=", - vma->vm_file->f_dentry, - vma->vm_file->f_vfsmnt); - break; + if (mm) { + down_read(&mm->mmap_sem); + vma = mm->mmap; + while (vma) { + if ((vma->vm_flags & VM_EXECUTABLE) && + vma->vm_file) { + audit_log_d_path(ab, "exe=", + vma->vm_file->f_dentry, + vma->vm_file->f_vfsmnt); + break; + } + vma = vma->vm_next; } - vma = vma->vm_next; + up_read(&mm->mmap_sem); } - up_read(&mm->mmap_sem); - audit_log_task_context(ab, gfp_mask); + audit_log_task_context(ab); } -static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) +static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) { - int i; + int i, call_panic = 0; struct audit_buffer *ab; struct audit_aux_data *aux; const char *tty; - ab = audit_log_start(context, gfp_mask, AUDIT_SYSCALL); + /* tsk == current */ + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); if (!ab) return; /* audit_panic has been called */ audit_log_format(ab, "arch=%x syscall=%d", @@ -587,8 +600,8 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); - if (current->signal->tty && current->signal->tty->name) - tty = current->signal->tty->name; + if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + tty = tsk->signal->tty->name; else tty = "(none)"; audit_log_format(ab, @@ -607,12 +620,12 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) context->gid, context->euid, context->suid, context->fsuid, context->egid, context->sgid, context->fsgid, tty); - audit_log_task_info(ab, gfp_mask); + audit_log_task_info(ab, tsk); audit_log_end(ab); for (aux = context->aux; aux; aux = aux->next) { - ab = audit_log_start(context, gfp_mask, aux->type); + ab = audit_log_start(context, GFP_KERNEL, aux->type); if (!ab) continue; /* audit_panic has been called */ @@ -620,8 +633,39 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx iuid=%u igid=%u mode=%x obj=%s", - axi->qbytes, axi->uid, axi->gid, axi->mode, axi->ctx); + " qbytes=%lx iuid=%u igid=%u mode=%x", + axi->qbytes, axi->uid, axi->gid, axi->mode); + if (axi->osid != 0) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string( + axi->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", + axi->osid); + call_panic = 1; + } else + audit_log_format(ab, " obj=%s", ctx); + kfree(ctx); + } + break; } + + case AUDIT_IPC_SET_PERM: { + struct audit_aux_data_ipcctl *axi = (void *)aux; + audit_log_format(ab, + " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", + axi->qbytes, axi->uid, axi->gid, axi->mode); + if (axi->osid != 0) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string( + axi->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", + axi->osid); + call_panic = 1; + } else + audit_log_format(ab, " obj=%s", ctx); + kfree(ctx); + } break; } case AUDIT_SOCKETCALL: { @@ -649,7 +693,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) } if (context->pwd && context->pwdmnt) { - ab = audit_log_start(context, gfp_mask, AUDIT_CWD); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); audit_log_end(ab); @@ -659,7 
+703,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) unsigned long ino = context->names[i].ino; unsigned long pino = context->names[i].pino; - ab = audit_log_start(context, gfp_mask, AUDIT_PATH); + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ @@ -685,32 +729,35 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) context->names[i].gid, MAJOR(context->names[i].rdev), MINOR(context->names[i].rdev)); - if (context->names[i].ctx) { - audit_log_format(ab, " obj=%s", - context->names[i].ctx); + if (context->names[i].osid != 0) { + char *ctx = NULL; + u32 len; + if (selinux_ctxid_to_string( + context->names[i].osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", + context->names[i].osid); + call_panic = 2; + } else + audit_log_format(ab, " obj=%s", ctx); + kfree(ctx); } audit_log_end(ab); } + if (call_panic) + audit_panic("error converting sid to string"); } /** * audit_free - free a per-task audit context * @tsk: task whose audit context block to free * - * Called from copy_process and __put_task_struct. + * Called from copy_process and do_exit */ void audit_free(struct task_struct *tsk) { struct audit_context *context; - /* - * No need to lock the task - when we execute audit_free() - * then the task has no external references anymore, and - * we are tearing it down. (The locking also confuses - * DEBUG_LOCKDEP - this freeing may occur in softirq - * contexts as well, via RCU.) - */ context = audit_get_context(tsk, 0, 0); if (likely(!context)) return; @@ -719,8 +766,9 @@ void audit_free(struct task_struct *tsk) * function (e.g., exit_group), then free context block. * We use GFP_ATOMIC here because we might be doing this * in the context of the idle thread */ + /* that can happen only if we are called from do_exit() */ if (context->in_syscall && context->auditable) - audit_log_exit(context, GFP_ATOMIC); + audit_log_exit(context, tsk); audit_free_context(context); } @@ -743,10 +791,11 @@ void audit_free(struct task_struct *tsk) * will only be written if another part of the kernel requests that it * be written). */ -void audit_syscall_entry(struct task_struct *tsk, int arch, int major, +void audit_syscall_entry(int arch, int major, unsigned long a1, unsigned long a2, unsigned long a3, unsigned long a4) { + struct task_struct *tsk = current; struct audit_context *context = tsk->audit_context; enum audit_state state; @@ -824,22 +873,18 @@ void audit_syscall_entry(struct task_struct *tsk, int arch, int major, * message), then write out the syscall information. In call cases, * free the names stored from getname(). */ -void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) +void audit_syscall_exit(int valid, long return_code) { + struct task_struct *tsk = current; struct audit_context *context; - get_task_struct(tsk); - task_lock(tsk); context = audit_get_context(tsk, valid, return_code); - task_unlock(tsk); - /* Not having a context here is ok, since the parent may have - * called __put_task_struct. 
*/ if (likely(!context)) - goto out; + return; if (context->in_syscall && context->auditable) - audit_log_exit(context, GFP_KERNEL); + audit_log_exit(context, tsk); context->in_syscall = 0; context->auditable = 0; @@ -854,8 +899,6 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code) audit_free_aux(context); tsk->audit_context = context; } - out: - put_task_struct(tsk); } /** @@ -936,40 +979,11 @@ void audit_putname(const char *name) #endif } -void audit_inode_context(int idx, const struct inode *inode) +static void audit_inode_context(int idx, const struct inode *inode) { struct audit_context *context = current->audit_context; - const char *suffix = security_inode_xattr_getsuffix(); - char *ctx = NULL; - int len = 0; - - if (!suffix) - goto ret; - - len = security_inode_getsecurity(inode, suffix, NULL, 0, 0); - if (len == -EOPNOTSUPP) - goto ret; - if (len < 0) - goto error_path; - - ctx = kmalloc(len, GFP_KERNEL); - if (!ctx) - goto error_path; - len = security_inode_getsecurity(inode, suffix, ctx, len, 0); - if (len < 0) - goto error_path; - - kfree(context->names[idx].ctx); - context->names[idx].ctx = ctx; - goto ret; - -error_path: - if (ctx) - kfree(ctx); - audit_panic("error in audit_inode_context"); -ret: - return; + selinux_get_inode_sid(inode, &context->names[idx].osid); } @@ -1155,40 +1169,37 @@ uid_t audit_get_loginuid(struct audit_context *ctx) return ctx ? ctx->loginuid : -1; } -static char *audit_ipc_context(struct kern_ipc_perm *ipcp) +/** + * audit_ipc_obj - record audit data for ipc object + * @ipcp: ipc permissions + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int audit_ipc_obj(struct kern_ipc_perm *ipcp) { + struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - char *ctx = NULL; - int len = 0; if (likely(!context)) - return NULL; - - len = security_ipc_getsecurity(ipcp, NULL, 0); - if (len == -EOPNOTSUPP) - goto ret; - if (len < 0) - goto error_path; - - ctx = kmalloc(len, GFP_ATOMIC); - if (!ctx) - goto error_path; + return 0; - len = security_ipc_getsecurity(ipcp, ctx, len); - if (len < 0) - goto error_path; + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; - return ctx; + ax->uid = ipcp->uid; + ax->gid = ipcp->gid; + ax->mode = ipcp->mode; + selinux_get_ipc_sid(ipcp, &ax->osid); -error_path: - kfree(ctx); - audit_panic("error in audit_ipc_context"); -ret: - return NULL; + ax->d.type = AUDIT_IPC; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; } /** - * audit_ipc_perms - record audit data for ipc + * audit_ipc_set_perm - record audit data for new ipc permissions * @qbytes: msgq bytes * @uid: msgq user id * @gid: msgq group id @@ -1196,7 +1207,7 @@ ret: * * Returns 0 for success or NULL context or < 0 on error. 
*/ -int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) +int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; @@ -1212,9 +1223,9 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, str ax->uid = uid; ax->gid = gid; ax->mode = mode; - ax->ctx = audit_ipc_context(ipcp); + selinux_get_ipc_sid(ipcp, &ax->osid); - ax->d.type = AUDIT_IPC; + ax->d.type = AUDIT_IPC_SET_PERM; ax->d.next = context->aux; context->aux = (void *)ax; return 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 72248d1b9e3f..ab81fdd4572b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2231,19 +2231,25 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * So only GFP_KERNEL allocations, if all nodes in the cpuset are * short of memory, might require taking the callback_mutex mutex. * - * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() - * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing - * hardwall cpusets - no allocation on a node outside the cpuset is - * allowed (unless in interrupt, of course). - * - * The second loop doesn't even call here for GFP_ATOMIC requests - * (if the __alloc_pages() local variable 'wait' is set). That check - * and the checks below have the combined affect in the second loop of - * the __alloc_pages() routine that: + * The first call here from mm/page_alloc:get_page_from_freelist() + * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so + * no allocation on a node outside the cpuset is allowed (unless in + * interrupt, of course). + * + * The second pass through get_page_from_freelist() doesn't even call + * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() + * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set + * in alloc_flags. That logic and the checks below have the combined + * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. + * + * Rule: + * Don't call cpuset_zone_allowed() if you can't sleep, unless you + * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables + * the code that might scan up ancestor cpusets and sleep. **/ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) @@ -2255,6 +2261,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) if (in_interrupt()) return 1; node = z->zone_pgdat->node_id; + might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); if (node_isset(node, current->mems_allowed)) return 1; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ diff --git a/kernel/exit.c b/kernel/exit.c index f86434d7b3d1..e06d0c10a24e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -35,6 +35,7 @@ #include <linux/futex.h> #include <linux/compat.h> #include <linux/pipe_fs_i.h> +#include <linux/audit.h> /* for audit_free() */ #include <asm/uaccess.h> #include <asm/unistd.h> @@ -880,14 +881,6 @@ fastcall NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITING; - /* - * Make sure we don't try to process any timer firings - * while we are already exiting. 
- */ - tsk->it_virt_expires = cputime_zero; - tsk->it_prof_expires = cputime_zero; - tsk->it_sched_expires = 0; - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, current->pid, @@ -910,6 +903,8 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif + if (unlikely(tsk->audit_context)) + audit_free(tsk); exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/extable.c b/kernel/extable.c index 7501b531ceed..7fe262855317 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -40,7 +40,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -static int core_kernel_text(unsigned long addr) +int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) diff --git a/kernel/fork.c b/kernel/fork.c index d2fa57d480d4..ac8100e3088a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -114,8 +114,6 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); - if (unlikely(tsk->audit_context)) - audit_free(tsk); security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 04ab27ddfd90..18324305724a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -456,6 +456,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) return ret; } +EXPORT_SYMBOL_GPL(hrtimer_start); /** * hrtimer_try_to_cancel - try to deactivate a timer @@ -484,6 +485,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) return ret; } +EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. @@ -504,6 +506,7 @@ int hrtimer_cancel(struct hrtimer *timer) cpu_relax(); } } +EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer @@ -522,6 +525,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) return rem; } +EXPORT_SYMBOL_GPL(hrtimer_get_remaining); #ifdef CONFIG_NO_IDLE_HZ /** @@ -580,6 +584,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, timer->base = &bases[clock_id]; rb_set_parent(&timer->node, &timer->node); } +EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_get_res - get the timer resolution for a clock @@ -599,6 +604,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) return 0; } +EXPORT_SYMBOL_GPL(hrtimer_get_res); /* * Expire the per base hrtimer-queue: @@ -836,7 +842,7 @@ static void migrate_hrtimers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit hrtimer_cpu_notify(struct notifier_block *self, +static int hrtimer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -860,7 +866,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata hrtimers_nb = { +static struct notifier_block hrtimers_nb = { .notifier_call = hrtimer_cpu_notify, }; diff --git a/kernel/intermodule.c b/kernel/intermodule.c deleted file mode 100644 index 55b1e5b85db9..000000000000 --- a/kernel/intermodule.c +++ /dev/null @@ -1,184 +0,0 @@ -/* Deprecated, do not use. Moved from module.c to here. 
--RR */ - -/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */ -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/spinlock.h> -#include <linux/list.h> -#include <linux/slab.h> - -/* inter_module functions are always available, even when the kernel is - * compiled without modules. Consumers of inter_module_xxx routines - * will always work, even when both are built into the kernel, this - * approach removes lots of #ifdefs in mainline code. - */ - -static struct list_head ime_list = LIST_HEAD_INIT(ime_list); -static DEFINE_SPINLOCK(ime_lock); -static int kmalloc_failed; - -struct inter_module_entry { - struct list_head list; - const char *im_name; - struct module *owner; - const void *userdata; -}; - -/** - * inter_module_register - register a new set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * @owner: module that is registering the data, always use THIS_MODULE - * @userdata: pointer to arbitrary userdata to be registered - * - * Description: Check that the im_name has not already been registered, - * complain if it has. For new data, add it to the inter_module_entry - * list. - */ -void inter_module_register(const char *im_name, struct module *owner, const void *userdata) -{ - struct list_head *tmp; - struct inter_module_entry *ime, *ime_new; - - if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { - /* Overloaded kernel, not fatal */ - printk(KERN_ERR - "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", - im_name); - kmalloc_failed = 1; - return; - } - ime_new->im_name = im_name; - ime_new->owner = owner; - ime_new->userdata = userdata; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - spin_unlock(&ime_lock); - kfree(ime_new); - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); - BUG(); - } - } - list_add(&(ime_new->list), &ime_list); - spin_unlock(&ime_lock); -} - -/** - * inter_module_unregister - unregister a set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: Check that the im_name has been registered, complain if - * it has not. For existing data, remove it from the - * inter_module_entry list. - */ -void inter_module_unregister(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - list_del(&(ime->list)); - spin_unlock(&ime_lock); - kfree(ime); - return; - } - } - spin_unlock(&ime_lock); - if (kmalloc_failed) { - printk(KERN_ERR - "inter_module_unregister: no entry for '%s', " - "probably caused by previous kmalloc failure\n", - im_name); - return; - } - else { - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); - BUG(); - } -} - -/** - * inter_module_get - return arbitrary userdata from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, return NULL. - * Try to increment the use count on the owning module, if that fails - * then return NULL. Otherwise return the userdata. 
- */ -static const void *inter_module_get(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - const void *result = NULL; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (try_module_get(ime->owner)) - result = ime->userdata; - break; - } - } - spin_unlock(&ime_lock); - return(result); -} - -/** - * inter_module_get_request - im get with automatic request_module. - * @im_name: an arbitrary string to identify the data, must be unique - * @modname: module that is expected to register im_name - * - * Description: If inter_module_get fails, do request_module then retry. - */ -const void *inter_module_get_request(const char *im_name, const char *modname) -{ - const void *result = inter_module_get(im_name); - if (!result) { - request_module("%s", modname); - result = inter_module_get(im_name); - } - return(result); -} - -/** - * inter_module_put - release use of data from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, complain, - * otherwise decrement the use count on the owning module. - */ -void inter_module_put(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (ime->owner) - module_put(ime->owner); - spin_unlock(&ime_lock); - return; - } - } - spin_unlock(&ime_lock); - printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); - BUG(); -} - -EXPORT_SYMBOL(inter_module_register); -EXPORT_SYMBOL(inter_module_unregister); -EXPORT_SYMBOL(inter_module_get_request); -EXPORT_SYMBOL(inter_module_put); - -MODULE_LICENSE("GPL"); - diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac766ad573e8..1279e3499534 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -246,8 +246,10 @@ int setup_irq(unsigned int irq, struct irqaction * new) mismatch: spin_unlock_irqrestore(&desc->lock, flags); - printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); - dump_stack(); + if (!(new->flags & SA_PROBEIRQ)) { + printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); + dump_stack(); + } return -EBUSY; } diff --git a/kernel/module.c b/kernel/module.c index d24deb0dbbc9..bbe04862e1b0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -705,14 +705,14 @@ EXPORT_SYMBOL(__symbol_put); void symbol_put_addr(void *addr) { - unsigned long flags; + struct module *modaddr; - spin_lock_irqsave(&modlist_lock, flags); - if (!kernel_text_address((unsigned long)addr)) - BUG(); + if (core_kernel_text((unsigned long)addr)) + return; - module_put(module_text_address((unsigned long)addr)); - spin_unlock_irqrestore(&modlist_lock, flags); + if (!(modaddr = module_text_address((unsigned long)addr))) + BUG(); + module_put(modaddr); } EXPORT_SYMBOL_GPL(symbol_put_addr); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 520f6c59948d..d38d9ec3276c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -555,9 +555,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) struct cpu_timer_list *next; unsigned long i; - if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) - return; - head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 
p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); @@ -1173,6 +1170,9 @@ static void check_process_timers(struct task_struct *tsk, } t = tsk; do { + if (unlikely(t->flags & PF_EXITING)) + continue; + ticks = cputime_add(cputime_add(t->utime, t->stime), prof_left); if (!cputime_eq(prof_expires, cputime_zero) && @@ -1193,11 +1193,7 @@ static void check_process_timers(struct task_struct *tsk, t->it_sched_expires > sched)) { t->it_sched_expires = sched; } - - do { - t = next_thread(t); - } while (unlikely(t->flags & PF_EXITING)); - } while (t != tsk); + } while ((t = next_thread(t)) != tsk); } } @@ -1289,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk) #undef UNEXPIRED - BUG_ON(tsk->exit_state); - /* * Double-check with locks held. */ read_lock(&tasklist_lock); - spin_lock(&tsk->sighand->siglock); + if (likely(tsk->signal != NULL)) { + spin_lock(&tsk->sighand->siglock); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); + } read_unlock(&tasklist_lock); /* diff --git a/kernel/power/main.c b/kernel/power/main.c index ee371f50ccaa..0a907f0dc56b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } + suspend_console(); if ((error = device_suspend(PMSG_SUSPEND))) { printk(KERN_ERR "Some devices failed to suspend\n"); goto Finish; @@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state) static void suspend_finish(suspend_state_t state) { device_resume(); + resume_console(); thaw_processes(); enable_nonboot_cpus(); if (pm_ops && pm_ops->finish) @@ -272,7 +274,7 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n if (*s && !strncmp(buf, *s, len)) break; } - if (*s) + if (state < PM_SUSPEND_MAX && *s) error = enter_state(state); else error = -EINVAL; diff --git a/kernel/printk.c b/kernel/printk.c index c056f3324432..19a955619294 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -67,6 +67,7 @@ EXPORT_SYMBOL(oops_in_progress); * driver system. 
*/ static DECLARE_MUTEX(console_sem); +static DECLARE_MUTEX(secondary_console_sem); struct console *console_drivers; /* * This is used for debugging the mess that is the VT code by @@ -76,7 +77,7 @@ struct console *console_drivers; * path in the console code where we end up in places I want * locked without the console sempahore held */ -static int console_locked; +static int console_locked, console_suspended; /* * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars @@ -698,6 +699,23 @@ int __init add_preferred_console(char *name, int idx, char *options) } /** + * suspend_console - suspend the console subsystem + * + * This disables printk() while we go into suspend states + */ +void suspend_console(void) +{ + acquire_console_sem(); + console_suspended = 1; +} + +void resume_console(void) +{ + console_suspended = 0; + release_console_sem(); +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has @@ -708,6 +726,10 @@ int __init add_preferred_console(char *name, int idx, char *options) void acquire_console_sem(void) { BUG_ON(in_interrupt()); + if (console_suspended) { + down(&secondary_console_sem); + return; + } down(&console_sem); console_locked = 1; console_may_schedule = 1; @@ -750,6 +772,10 @@ void release_console_sem(void) unsigned long _con_start, _log_end; unsigned long wake_klogd = 0; + if (console_suspended) { + up(&secondary_console_sem); + return; + } for ( ; ; ) { spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; diff --git a/kernel/profile.c b/kernel/profile.c index 5a730fdb1a2c..68afe121e507 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -299,7 +299,7 @@ out: } #ifdef CONFIG_HOTPLUG_CPU -static int __devinit profile_cpu_callback(struct notifier_block *info, +static int profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4e0f0ec003f7..921c22ad16e4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -148,12 +148,34 @@ int ptrace_may_attach(struct task_struct *task) int ptrace_attach(struct task_struct *task) { int retval; - task_lock(task); + retval = -EPERM; if (task->pid <= 1) - goto bad; + goto out; if (task->tgid == current->tgid) - goto bad; + goto out; + +repeat: + /* + * Nasty, nasty. + * + * We want to hold both the task-lock and the + * tasklist_lock for writing at the same time. + * But that's against the rules (tasklist_lock + * is taken for reading by interrupts on other + * cpu's that may have task_lock). + */ + task_lock(task); + local_irq_disable(); + if (!write_trylock(&tasklist_lock)) { + local_irq_enable(); + task_unlock(task); + do { + cpu_relax(); + } while (!write_can_lock(&tasklist_lock)); + goto repeat; + } + /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; @@ -166,17 +188,15 @@ int ptrace_attach(struct task_struct *task) ? PT_ATTACHED : 0); if (capable(CAP_SYS_PTRACE)) task->ptrace |= PT_PTRACE_CAP; - task_unlock(task); - write_lock_irq(&tasklist_lock); __ptrace_link(task, current); - write_unlock_irq(&tasklist_lock); force_sig_specific(SIGSTOP, task); - return 0; bad: + write_unlock_irq(&tasklist_lock); task_unlock(task); +out: return retval; } @@ -417,21 +437,22 @@ int ptrace_request(struct task_struct *child, long request, */ int ptrace_traceme(void) { - int ret; + int ret = -EPERM; /* * Are we already being traced? 
*/ - if (current->ptrace & PT_PTRACED) - return -EPERM; - ret = security_ptrace(current->parent, current); - if (ret) - return -EPERM; - /* - * Set the ptrace bit in the process ptrace flags. - */ - current->ptrace |= PT_PTRACED; - return 0; + task_lock(current); + if (!(current->ptrace & PT_PTRACED)) { + ret = security_ptrace(current->parent, current); + /* + * Set the ptrace bit in the process ptrace flags. + */ + if (!ret) + current->ptrace |= PT_PTRACED; + } + task_unlock(current); + return ret; } /** diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 13458bbaa1be..2058f88c7bbb 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -479,12 +479,31 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) return 0; } +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. + */ int rcu_pending(int cpu) { return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); } +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + */ +int rcu_needs_cpu(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); + + return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); +} + void rcu_check_callbacks(int cpu, int user) { if (user || @@ -520,7 +539,7 @@ static void __devinit rcu_online_cpu(int cpu) tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); } -static int __devinit rcu_cpu_notify(struct notifier_block *self, +static int rcu_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -537,7 +556,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata rcu_nb = { +static struct notifier_block rcu_nb = { .notifier_call = rcu_cpu_notify, }; diff --git a/kernel/sched.c b/kernel/sched.c index 365f0b90b4de..c13f1bd2df7d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -665,55 +665,13 @@ static int effective_prio(task_t *p) } /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired, and switch periodically - * regardless, to ensure that highly interactive tasks do not starve - * the less fortunate for unreasonably long periods. 
- */ -static inline int expired_starving(runqueue_t *rq) -{ - int limit; - - /* - * Arrays were recently switched, all is well - */ - if (!rq->expired_timestamp) - return 0; - - limit = STARVATION_LIMIT * rq->nr_running; - - /* - * It's time to switch arrays - */ - if (jiffies - rq->expired_timestamp >= limit) - return 1; - - /* - * There's a better selection in the expired array - */ - if (rq->curr->static_prio > rq->best_expired_prio) - return 1; - - /* - * All is well - */ - return 0; -} - -/* * __activate_task - move a task to the runqueue. */ static void __activate_task(task_t *p, runqueue_t *rq) { prio_array_t *target = rq->active; - if (unlikely(batch_task(p) || (expired_starving(rq) && !rt_task(p)))) + if (batch_task(p)) target = rq->expired; enqueue_task(p, target); rq->nr_running++; @@ -2532,6 +2490,22 @@ unsigned long long current_sched_time(const task_t *tsk) } /* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +#define EXPIRED_STARVING(rq) \ + ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) + +/* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() @@ -2666,7 +2640,7 @@ void scheduler_tick(void) if (!rq->expired_timestamp) rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || expired_starving(rq)) { + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { enqueue_task(p, rq->expired); if (p->static_prio < rq->best_expired_prio) rq->best_expired_prio = p->static_prio; @@ -4814,7 +4788,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, /* Register at highest priority so that task migration (migrate_all_tasks) * happens before everything else. 
*/ -static struct notifier_block __devinitdata migration_notifier = { +static struct notifier_block migration_notifier = { .notifier_call = migration_call, .priority = 10 }; diff --git a/kernel/softirq.c b/kernel/softirq.c index ec8fed42a86f..336f92d64e2e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit cpu_callback(struct notifier_block *nfb, +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -484,7 +484,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/softlockup.c b/kernel/softlockup.c index ced91e1ff564..14c7faf02909 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) /* * Create/destroy watchdog threads as CPUs come and go: */ -static int __devinit +static int cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -140,7 +140,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_OK; } -static struct notifier_block __devinitdata cpu_nfb = { +static struct notifier_block cpu_nfb = { .notifier_call = cpu_callback }; diff --git a/kernel/timer.c b/kernel/timer.c index 883773788836..9e49deed468c 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -541,6 +541,22 @@ found: } spin_unlock(&base->lock); + /* + * It can happen that other CPUs service timer IRQs and increment + * jiffies, but we have not yet got a local timer tick to process + * the timer wheels. In that case, the expiry time can be before + * jiffies, but since the high-resolution timer here is relative to + * jiffies, the default expression when high-resolution timers are + * not active, + * + * time_before(MAX_JIFFY_OFFSET + jiffies, expires) + * + * would falsely evaluate to true. If that is the case, just + * return jiffies so that we can immediately fire the local timer + */ + if (time_before(expires, jiffies)) + return jiffies; + if (time_before(hr_expires, expires)) return hr_expires; @@ -1314,7 +1330,7 @@ static void __devinit migrate_timers(int cpu) } #endif /* CONFIG_HOTPLUG_CPU */ -static int __devinit timer_cpu_notify(struct notifier_block *self, +static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1334,7 +1350,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block __devinitdata timers_nb = { +static struct notifier_block timers_nb = { .notifier_call = timer_cpu_notify, }; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e9e464a90376..880fb415a8f6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -547,7 +547,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) } /* We're holding the cpucontrol mutex here */ -static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, +static int workqueue_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { |