From a2000572ad511f5f43091ed7bd2cc3b913104a1e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 10 Feb 2006 01:51:02 -0800 Subject: [PATCH] sched: remove smpnice I don't think the code is quite ready, which is why I asked for Peter's additions to also be merged before I acked it (although it turned out that it still isn't quite ready with his additions either). Basically I have had similar observations to Suresh in that it does not play nicely with the rest of the balancing infrastructure (and raised similar concerns in my review). The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2 SMP+HT Xeon are as follows:

           | Following boot | hackbench 20        | hackbench 40
-----------+----------------+---------------------+---------------------
2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
+nosmpnice |  3, 2, 4, 2    |  28, 150, 294, 132  |  348, 348, 294, 347

Hackbench raw performance is down around 15% with smpnice (but that in itself isn't a huge deal because it is just a benchmark). However, the samples show that the imbalance passed into move_tasks is increased by about a factor of 10-30. I think this would also go some way to explaining latency blips turning up in the balancing code (though I haven't actually measured that). We'll probably have to revert this in the SUSE kernel. Cc: "Siddha, Suresh B" Acked-by: Ingo Molnar Cc: Peter Williams Cc: "Martin J. Bligh" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 129 ++++++++------------------------------------------------- 1 file changed, 18 insertions(+), 111 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bc38804e40dd..87d93be336a1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -215,7 +215,6 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long prio_bias; unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -669,68 +668,13 @@ static int effective_prio(task_t *p) return prio; } -#ifdef CONFIG_SMP -static inline void inc_prio_bias(runqueue_t *rq, int prio) -{ - rq->prio_bias += MAX_PRIO - prio; -} - -static inline void dec_prio_bias(runqueue_t *rq, int prio) -{ - rq->prio_bias -= MAX_PRIO - prio; -} - -static inline void inc_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running++; - if (rt_task(p)) { - if (p != rq->migration_thread) - /* - * The migration thread does the actual balancing. Do - * not bias by its priority as the ultra high priority - * will skew balancing adversely. - */ - inc_prio_bias(rq, p->prio); - } else - inc_prio_bias(rq, p->static_prio); -} - -static inline void dec_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running--; - if (rt_task(p)) { - if (p != rq->migration_thread) - dec_prio_bias(rq, p->prio); - } else - dec_prio_bias(rq, p->static_prio); -} -#else -static inline void inc_prio_bias(runqueue_t *rq, int prio) -{ -} - -static inline void dec_prio_bias(runqueue_t *rq, int prio) -{ -} - -static inline void inc_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running++; -} - -static inline void dec_nr_running(task_t *p, runqueue_t *rq) -{ - rq->nr_running--; -} -#endif - /* * __activate_task - move a task to the runqueue.
*/ static inline void __activate_task(task_t *p, runqueue_t *rq) { enqueue_task(p, rq->active); - inc_nr_running(p, rq); + rq->nr_running++; } /* @@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq) static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { enqueue_task_head(p, rq->active); - inc_nr_running(p, rq); + rq->nr_running++; } static int recalc_task_prio(task_t *p, unsigned long long now) @@ -863,7 +807,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { - dec_nr_running(p, rq); + rq->nr_running--; dequeue_task(p, p->array); p->array = NULL; } @@ -1007,61 +951,27 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static unsigned long __source_load(int cpu, int type, enum idle_type idle) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long running = rq->nr_running; - unsigned long source_load, cpu_load = rq->cpu_load[type-1], - load_now = running * SCHED_LOAD_SCALE; - + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; if (type == 0) - source_load = load_now; - else - source_load = min(cpu_load, load_now); - - if (running > 1 || (idle == NOT_IDLE && running)) - /* - * If we are busy rebalancing the load is biased by - * priority to create 'nice' support across cpus. When - * idle rebalancing we should only bias the source_load if - * there is more than one task running on that queue to - * prevent idle rebalance from trying to pull tasks from a - * queue with only one running task. - */ - source_load = source_load * rq->prio_bias / running; + return load_now; - return source_load; -} - -static inline unsigned long source_load(int cpu, int type) -{ - return __source_load(cpu, type, NOT_IDLE); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long running = rq->nr_running; - unsigned long target_load, cpu_load = rq->cpu_load[type-1], - load_now = running * SCHED_LOAD_SCALE; - + unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; if (type == 0) - target_load = load_now; - else - target_load = max(cpu_load, load_now); + return load_now; - if (running > 1 || (idle == NOT_IDLE && running)) - target_load = target_load * rq->prio_bias / running; - - return target_load; -} - -static inline unsigned long target_load(int cpu, int type) -{ - return __target_load(cpu, type, NOT_IDLE); + return max(rq->cpu_load[type-1], load_now); } /* @@ -1530,7 +1440,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) list_add_tail(&p->run_list, &current->run_list); p->array = current->array; p->array->nr_active++; - inc_nr_running(p, rq); + rq->nr_running++; } set_need_resched(); } else @@ -1875,9 +1785,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) { dequeue_task(p, src_array); - dec_nr_running(p, src_rq); + src_rq->nr_running--; set_task_cpu(p, this_cpu); - inc_nr_running(p, this_rq); + this_rq->nr_running++; enqueue_task(p, this_array); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; @@ -2056,9 +1966,9 @@ find_busiest_group(struct sched_domain
*sd, int this_cpu, /* Bias balancing toward cpus of our domain */ if (local_group) - load = __target_load(i, load_idx, idle); + load = target_load(i, load_idx); else - load = __source_load(i, load_idx, idle); + load = source_load(i, load_idx); avg_load += load; } @@ -2171,7 +2081,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, int i; for_each_cpu_mask(i, group->cpumask) { - load = __source_load(i, 0, idle); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -3571,10 +3481,8 @@ void set_user_nice(task_t *p, long nice) goto out_unlock; } array = p->array; - if (array) { + if (array) dequeue_task(p, array); - dec_prio_bias(rq, p->static_prio); - } old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); @@ -3584,7 +3492,6 @@ void set_user_nice(task_t *p, long nice) if (array) { enqueue_task(p, array); - inc_prio_bias(rq, p->static_prio); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: -- cgit v1.2.3 From c22db9412736204b25aeba19d18e5ea922f7d632 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 10 Feb 2006 01:51:11 -0800 Subject: [PATCH] prevent recursive panic from softlockup watchdog When panic_timeout is zero, suppress triggering a nested panic due to soft lockup detection. Signed-off-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index c5c4ab255834..126dc43f1c74 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -130,6 +130,7 @@ NORET_TYPE void panic(const char * fmt, ...) #endif local_irq_enable(); for (i = 0;;) { + touch_softlockup_watchdog(); i += panic_blink(i); mdelay(1); i++; -- cgit v1.2.3 From 16bf134840da3920ded1290973c56ec214636f12 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 14 Feb 2006 13:52:59 -0800 Subject: [PATCH] compound page: no access_process_vm check The PageCompound check before access_process_vm's set_page_dirty_lock is no longer necessary, so remove it. But leave the PageCompound checks in bio_set_pages_dirty, dio_bio_complete and nfs_free_user_pages: at least some of those were introduced as a little optimization on hugetlb pages. Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 5f33cdb6fff5..d2cf144d0af5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -242,8 +242,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - if (!PageCompound(page)) - set_page_dirty_lock(page); + set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); -- cgit v1.2.3 From d6077cb80cde4506720f9165eba99ee07438513f Mon Sep 17 00:00:00 2001 From: "Chen, Kenneth W" Date: Tue, 14 Feb 2006 13:53:10 -0800 Subject: [PATCH] sched: revert "filter affine wakeups" Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6: [PATCH] sched: filter affine wakeups Apparently caused more than 10% performance regression for aim7 benchmark. The setup in use is 16-cpu HP rx8620, 64Gb of memory and 12 MSA1000s with 144 disks. Each disk is 72Gb with a single ext3 filesystem (courtesy of HP, who supplied benchmark results). 
The problem is, for aim7, the wake-up pattern is random, but it still needs load balancing action in the wake-up path to achieve best performance. With the above commit, lack of load balancing hurts that workload. However, for workloads like database transaction processing, the requirement is exactly the opposite. In the wake-up path, best performance is achieved with absolutely zero load balancing. We simply wake up the process on the CPU on which it previously ran. Worst performance is obtained when we do load balancing at wake up. There isn't an easy way to auto-detect the workload characteristics. Ingo's earlier patch, which detects an idle CPU and decides whether to load balance or not, doesn't perform well with aim7 either, since all CPUs are busy (it causes an even bigger performance regression). Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6, which causes more than 10% performance regression with aim7. Signed-off-by: Ken Chen Acked-by: Ingo Molnar Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 +---- kernel/sched.c | 10 +--------- 2 files changed, 2 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c1da0269a18..b6f51e3a38ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -697,11 +697,8 @@ struct task_struct { int lock_depth; /* BKL lock depth */ -#if defined(CONFIG_SMP) - int last_waker_cpu; /* CPU that last woke this task up */ -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) int oncpu; -#endif #endif int prio, static_prio; struct list_head run_list; diff --git a/kernel/sched.c b/kernel/sched.c index 87d93be336a1..66d957227de9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1204,9 +1204,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) } } - if (p->last_waker_cpu != this_cpu) - goto out_set_cpu; - if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; @@ -1277,8 +1274,6 @@ out_set_cpu: cpu = task_cpu(p); } - p->last_waker_cpu = this_cpu; - out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) { @@ -1360,12 +1355,9 @@ void fastcall sched_fork(task_t *p, int clone_flags) #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) - p->last_waker_cpu = cpu; -#if defined(__ARCH_WANT_UNLOCKED_CTXSW) +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; #endif -#endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; -- cgit v1.2.3 From 06027bdd278a32a84b273e41db68a5db8ffd2bb6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 14 Feb 2006 13:53:15 -0800 Subject: [PATCH] hrtimer: round up relative start time on low-res arches CONFIG_TIME_LOW_RES is a temporary way for architectures to signal that they simply return xtime in do_gettimeoffset(). In this corner-case we want to round up by resolution when starting a relative timer, to avoid short timeouts. This will go away with the GTOD framework.
Signed-off-by: Ingo Molnar Cc: Roman Zippel Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/Kconfig | 4 ++++ arch/h8300/Kconfig | 4 ++++ arch/m68k/Kconfig | 4 ++++ arch/m68knommu/Kconfig | 4 ++++ arch/parisc/Kconfig | 5 +++++ arch/v850/Kconfig | 4 ++++ kernel/hrtimer.c | 13 ++++++++++++- 7 files changed, 37 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 60a617aff8ba..e08383712370 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -25,6 +25,10 @@ config GENERIC_HARDIRQS bool default n +config TIME_LOW_RES + bool + default y + mainmenu "Fujitsu FR-V Kernel Configuration" source "init/Kconfig" diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 80940d712acf..98308b018a35 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -33,6 +33,10 @@ config GENERIC_CALIBRATE_DELAY bool default y +config TIME_LOW_RES + bool + default y + config ISA bool default y diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 96b919828053..8849439e88dd 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -21,6 +21,10 @@ config GENERIC_CALIBRATE_DELAY bool default y +config TIME_LOW_RES + bool + default y + config ARCH_MAY_HAVE_PC_FDC bool depends on Q40 || (BROKEN && SUN3X) diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index e2a6e8648960..e50858dbc237 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -29,6 +29,10 @@ config GENERIC_CALIBRATE_DELAY bool default y +config TIME_LOW_RES + bool + default y + source "init/Kconfig" menu "Processor type and features" diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 7c914a4c67c3..eca33cfa8a4c 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -29,6 +29,11 @@ config GENERIC_CALIBRATE_DELAY bool default y +config TIME_LOW_RES + bool + depends on SMP + default y + config GENERIC_ISA_DMA bool diff --git a/arch/v850/Kconfig b/arch/v850/Kconfig index 04494638b963..e7fc3e500342 100644 --- a/arch/v850/Kconfig +++ b/arch/v850/Kconfig @@ -28,6 +28,10 @@ config GENERIC_IRQ_PROBE bool default y +config TIME_LOW_RES + bool + default y + # Turn off some random 386 crap that can affect device config config ISA bool diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 2b6e1757aedd..5ae51f1bc7c8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -418,8 +418,19 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base); - if (mode == HRTIMER_REL) + if (mode == HRTIMER_REL) { tim = ktime_add(tim, new_base->get_time()); + /* + * CONFIG_TIME_LOW_RES is a temporary way for architectures + * to signal that they simply return xtime in + * do_gettimeoffset(). In this case we want to round up by + * resolution when starting a relative timer, to avoid short + * timeouts. This will go away with the GTOD framework. + */ +#ifdef CONFIG_TIME_LOW_RES + tim = ktime_add(tim, base->resolution); +#endif + } timer->expires = tim; enqueue_hrtimer(timer, new_base); -- cgit v1.2.3 From 3f17da699431ec48540beabc55c54d4b5e66c8e7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Feb 2006 22:13:24 +0300 Subject: [PATCH] fix kill_proc_info() vs CLONE_THREAD race There is a window after copy_process() unlocks ->sighand.siglock and before it adds the new thread to the thread list. 
In that window __group_complete_signal(SIGKILL) will not see the new thread yet, so this thread will start running while the whole thread group was supposed to exit. I believe we have another good reason to place attach_pid(PID/TGID) under ->sighand.siglock. We can do the same for release_task()->__unhash_process() and de_thread()->switch_exec_pids(). After that we don't need tasklist_lock to iterate over the thread list, and we can simplify things, see for example do_sigaction() or sys_times(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8e88b374cee9..3683ce10f4a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1123,8 +1123,8 @@ static task_t *copy_process(unsigned long clone_flags, p->real_parent = current; p->parent = p->real_parent; + spin_lock(&current->sighand->siglock); if (clone_flags & CLONE_THREAD) { - spin_lock(&current->sighand->siglock); /* * Important: if an exit-all has been started then * do not create this new thread - the whole thread @@ -1162,8 +1162,6 @@ static task_t *copy_process(unsigned long clone_flags, */ p->it_prof_expires = jiffies_to_cputime(1); } - - spin_unlock(&current->sighand->siglock); } /* @@ -1189,6 +1187,7 @@ static task_t *copy_process(unsigned long clone_flags, nr_threads++; total_forks++; + spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); proc_fork_connector(p); return p; -- cgit v1.2.3 From dadac81b1b86196fcc48fb87620403c4a7174f06 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Feb 2006 22:13:26 +0300 Subject: [PATCH] fix kill_proc_info() vs fork() theoretical race copy_process: attach_pid(p, PIDTYPE_PID, p->pid); attach_pid(p, PIDTYPE_TGID, p->tgid); What if kill_proc_info(p->pid) happens in between? copy_process() holds current->sighand.siglock, so we are safe in CLONE_THREAD case, because current->sighand == p->sighand. Otherwise, p->sighand is unlocked, the new process is already visible to the find_task_by_pid(), but has a copy of parent's 'struct pid' in ->pids[PIDTYPE_TGID]. This means that __group_complete_signal() may hang while doing do ... while (next_thread() != p) We can solve this problem if we reverse these 2 attach_pid()s: attach_pid() does wmb() group_send_sig_info() calls spin_lock(), which provides a read barrier. // Yes ? I don't think we can hit this race in practice, but still.
Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3683ce10f4a9..fbea12d7a943 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1173,8 +1173,6 @@ static task_t *copy_process(unsigned long clone_flags, if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); - attach_pid(p, PIDTYPE_PID, p->pid); - attach_pid(p, PIDTYPE_TGID, p->tgid); if (thread_group_leader(p)) { p->signal->tty = current->signal->tty; p->signal->pgrp = process_group(current); @@ -1184,6 +1182,8 @@ static task_t *copy_process(unsigned long clone_flags, if (p->pid) __get_cpu_var(process_counts)++; } + attach_pid(p, PIDTYPE_TGID, p->tgid); + attach_pid(p, PIDTYPE_PID, p->pid); nr_threads++; total_forks++; -- cgit v1.2.3 From 5ecfbae093f0c37311e89b29bfc0c9d586eace87 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 15 Feb 2006 22:50:10 +0300 Subject: [PATCH] fix zap_thread's ptrace related problems 1. The tracee can go from ptrace_stop() to do_signal_stop() after __ptrace_unlink(p). 2. It is unsafe to __ptrace_unlink(p) while p->parent may wait for tasklist_lock in ptrace_detach(). Signed-off-by: Oleg Nesterov Cc: Roland McGrath Cc: Ingo Molnar Cc: Christoph Hellwig Cc: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/ptrace.h | 1 + kernel/ptrace.c | 25 +++++++++++++++---------- 3 files changed, 17 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 055378d2513e..0e1c95074d42 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1403,7 +1403,7 @@ static void zap_threads (struct mm_struct *mm) do_each_thread(g,p) { if (mm == p->mm && p != tsk && p->ptrace && p->parent->mm == mm) { - __ptrace_unlink(p); + __ptrace_detach(p, 0); } } while_each_thread(g,p); write_unlock_irq(&tasklist_lock); diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 9d5cd106b344..0d36750fc0f1 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -84,6 +84,7 @@ extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __us extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern int ptrace_attach(struct task_struct *tsk); extern int ptrace_detach(struct task_struct *, unsigned int); +extern void __ptrace_detach(struct task_struct *, unsigned int); extern void ptrace_disable(struct task_struct *); extern int ptrace_check_attach(struct task_struct *task, int kill); extern int ptrace_request(struct task_struct *child, long request, long addr, long data); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index d2cf144d0af5..d95a72c9279d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -72,8 +72,8 @@ void ptrace_untrace(task_t *child) */ void __ptrace_unlink(task_t *child) { - if (!child->ptrace) - BUG(); + BUG_ON(!child->ptrace); + child->ptrace = 0; if (!list_empty(&child->ptrace_list)) { list_del_init(&child->ptrace_list); @@ -184,22 +184,27 @@ bad: return retval; } +void __ptrace_detach(struct task_struct *child, unsigned int data) +{ + child->exit_code = data; + /* .. re-parent .. */ + __ptrace_unlink(child); + /* .. and wake it up. */ + if (child->exit_state != EXIT_ZOMBIE) + wake_up_process(child); +} + int ptrace_detach(struct task_struct *child, unsigned int data) { if (!valid_signal(data)) - return -EIO; + return -EIO; /* Architecture-specific hardware disable .. 
*/ ptrace_disable(child); - /* .. re-parent .. */ - child->exit_code = data; - write_lock_irq(&tasklist_lock); - __ptrace_unlink(child); - /* .. and wake it up. */ - if (child->exit_state != EXIT_ZOMBIE) - wake_up_process(child); + if (child->ptrace) + __ptrace_detach(child, data); write_unlock_irq(&tasklist_lock); return 0; -- cgit v1.2.3 From 06fed33849c13af637c4d09e9ba27828fac9edd5 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Wed, 15 Feb 2006 15:17:38 -0800 Subject: [PATCH] cpuset: oops in exit on null cpuset fix Fix a latent bug in cpuset_exit() handling. If a task tried to allocate memory after calling cpuset_exit(), it oops'd in cpuset_update_task_memory_state() on a NULL cpuset pointer. So set the exiting tasks cpuset to the root cpuset instead of to NULL. A distro kernel hit this with an added kernel package that had just such a hook (allocating memory) in the exit code path. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba42b0a76961..12815d3f1a05 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1977,6 +1977,39 @@ void cpuset_fork(struct task_struct *child) * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't * mess with it, or task is a failed fork, never visible to attach_task. + * + * Hack: + * + * Set the exiting tasks cpuset to the root cpuset (top_cpuset). + * + * Don't leave a task unable to allocate memory, as that is an + * accident waiting to happen should someone add a callout in + * do_exit() after the cpuset_exit() call that might allocate. + * If a task tries to allocate memory with an invalid cpuset, + * it will oops in cpuset_update_task_memory_state(). + * + * We call cpuset_exit() while the task is still competent to + * handle notify_on_release(), then leave the task attached to + * the root cpuset (top_cpuset) for the remainder of its exit. + * + * To do this properly, we would increment the reference count on + * top_cpuset, and near the very end of the kernel/exit.c do_exit() + * code we would add a second cpuset function call, to drop that + * reference. This would just create an unnecessary hot spot on + * the top_cpuset reference count, to no avail. + * + * Normally, holding a reference to a cpuset without bumping its + * count is unsafe. The cpuset could go away, or someone could + * attach us to a different cpuset, decrementing the count on + * the first cpuset that we never incremented. But in this case, + * top_cpuset isn't going away, and either task has PF_EXITING set, + * which wards off any attach_task() attempts, or task is a failed + * fork, never visible to attach_task. + * + * Another way to do this would be to set the cpuset pointer + * to NULL here, and check in cpuset_update_task_memory_state() + * for a NULL pointer. This hack avoids that NULL check, for no + * cost (other than this way too long comment ;). 
**/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; cs = tsk->cpuset; - tsk->cpuset = NULL; + tsk->cpuset = &top_cpuset; /* Hack - see comment above */ if (notify_on_release(cs)) { char *pathbuf = NULL; -- cgit v1.2.3 From c8adb494a6df6b2be8e50a8dafd5bab231df3505 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 15 Feb 2006 15:17:42 -0800 Subject: [PATCH] swsusp: nuke noisy message I get about 88 squillion of these when suspending an old ad450nx server. Cc: Pavel Roskin Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 41f66365f0d8..8d5a5986d621 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -91,10 +91,8 @@ static int save_highmem_zone(struct zone *zone) * corrected eventually when the cases giving rise to this * are better understood. */ - if (PageReserved(page)) { - printk("highmem reserved page?!\n"); + if (PageReserved(page)) continue; - } BUG_ON(PageNosave(page)); if (PageNosaveFree(page)) continue; -- cgit v1.2.3 From a62eaf151d9cb478d127cfbc2e93c498869785b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 16 Feb 2006 23:41:58 +0100 Subject: [PATCH] x86_64: Add boot option to disable randomized mappings and cleanup AMD SimNow!'s JIT doesn't like them at all in the guest. For distribution installation it's easiest if it's a boot time option. Also I moved the variable to a more appropriate place and made it independent from sysctl, and marked it __read_mostly, which it is. Signed-off-by: Andi Kleen Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 3 +++ arch/i386/kernel/cpu/transmeta.c | 1 + include/linux/kernel.h | 6 ------ include/linux/mm.h | 2 ++ kernel/sysctl.c | 2 -- mm/memory.c | 10 ++++++++++ 6 files changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ac75b57edf2e..b874771385cd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1638,6 +1638,9 @@ running once the system is up.
Format: ,,,,,[,[,[,]]] + norandmaps Don't use address space randomization + Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space + ______________________________________________________________________ Changelog: diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index bdbeb77f4e22..7214c9b577ab 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -1,4 +1,5 @@ #include +#include #include #include #include diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b49affa0ac5a..3b507bf05d09 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -326,12 +326,6 @@ struct sysinfo { /* Force a compilation error if condition is true */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) -#ifdef CONFIG_SYSCTL -extern int randomize_va_space; -#else -#define randomize_va_space 1 -#endif - /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) diff --git a/include/linux/mm.h b/include/linux/mm.h index 75e9f0724997..26e1663a5cbe 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1051,5 +1051,7 @@ int shrink_slab(unsigned long scanned, gfp_t gfp_mask, void drop_pagecache(void); void drop_slab(void); +extern int randomize_va_space; + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 71dd6f62efec..7654d55c47f5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,8 +126,6 @@ extern int sysctl_hz_timer; extern int acct_parm[]; #endif -int randomize_va_space = 1; - static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); static int proc_doutsstring(ctl_table *table, int write, struct file *filp, diff --git a/mm/memory.c b/mm/memory.c index 2bee1f21aa8a..9abc6008544b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); EXPORT_SYMBOL(vmalloc_earlyreserve); +int randomize_va_space __read_mostly = 1; + +static int __init disable_randmaps(char *s) +{ + randomize_va_space = 0; + return 0; +} +__setup("norandmaps", disable_randmaps); + + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but -- cgit v1.2.3 From 726c14bf499e91e7ede4f1728830aba05c675061 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 17 Feb 2006 10:30:23 +1100 Subject: [PATCH] Provide an interface for getting the current tick length This provides an interface for arch code to find out how many nanoseconds are going to be added on to xtime by the next call to do_timer. The value returned is a fixed-point number in 52.12 format in nanoseconds. The reason for this format is that it gives the full precision that the timekeeping code is using internally. The motivation for this is to fix a problem that has arisen on 32-bit powerpc in that the value returned by do_gettimeofday drifts apart from xtime if NTP is being used. PowerPC is now using a lockless do_gettimeofday based on reading the timebase register and performing some simple arithmetic. (This method of getting the time is also exported to userspace via the VDSO.) However, the factor and offset it uses were calculated based on the nominal tick length and weren't being adjusted when NTP varied the tick length. Note that 64-bit powerpc has had the lockless do_gettimeofday for a long time now. 
It also had an extremely hairy routine that got called from the 32-bit compat routine for adjtimex, which adjusted the factor and offset according to what it thought the timekeeping code was going to do. Not only was this only called if a 32-bit task did adjtimex (i.e. not if a 64-bit task did adjtimex), it was also duplicating computations from kernel/timer.c and it wasn't clear that it was (still) correct. The simple solution is to ask the timekeeping code how long the current jiffy will be on each timer interrupt, after calling do_timer. If this jiffy will be a different length from the last one, we then need to compute new values for the factor and offset used in the lockless do_gettimeofday. In this way we can keep xtime and do_gettimeofday in sync, even when NTP is varying the tick length. Note that when adjtimex varies the tick length, it almost always introduces the variation from the next tick on. The only case I could see where adjtimex would vary the length of the current tick is when an old-style adjtime adjustment is being cancelled. (It's not clear to me why the adjustment has to be cancelled immediately rather than from the next tick on.) Thus I don't see any real need for a hook in adjtimex; the rare case of an old-style adjustment being cancelled can be fixed up at the next tick. Signed-off-by: Paul Mackerras Acked-by: john stultz Signed-off-by: Linus Torvalds --- include/linux/timex.h | 3 +++ kernel/timer.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index 04a4a8cb4ed3..b7ca1204e42a 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -345,6 +345,9 @@ time_interpolator_reset(void) #endif /* !CONFIG_TIME_INTERPOLATION */ +/* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */ +extern u64 current_tick_length(void); + #endif /* KERNEL */ #endif /* LINUX_TIMEX_H */ diff --git a/kernel/timer.c b/kernel/timer.c index b9dad3994676..fe3a9a9f8328 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -717,12 +717,16 @@ static void second_overflow(void) #endif } -/* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +/* + * Returns how many microseconds we need to add to xtime this tick + * in doing an adjustment requested with adjtime. + */ +static long adjtime_adjustment(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; - if ((time_adjust_step = time_adjust) != 0 ) { + time_adjust_step = time_adjust; + if (time_adjust_step) { /* * We are doing an adjtime thing. Prepare time_adjust_step to * be within bounds. 
Note that a positive time_adjust means we @@ -733,10 +737,19 @@ static void update_wall_time_one_tick(void) */ time_adjust_step = min(time_adjust_step, (long)tickadj); time_adjust_step = max(time_adjust_step, (long)-tickadj); + } + return time_adjust_step; +} +/* in the NTP reference this is called "hardclock()" */ +static void update_wall_time_one_tick(void) +{ + long time_adjust_step, delta_nsec; + + time_adjust_step = adjtime_adjustment(); + if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - } delta_nsec = tick_nsec + time_adjust_step * 1000; /* * Advance the phase, once it gets to one microsecond, then @@ -758,6 +771,22 @@ static void update_wall_time_one_tick(void) } } +/* + * Return how long ticks are at the moment, that is, how much time + * update_wall_time_one_tick will add to xtime next time we call it + * (assuming no calls to do_adjtimex in the meantime). + * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 + * bits to the right of the binary point. + * This function has no side-effects. + */ +u64 current_tick_length(void) +{ + long delta_nsec; + + delta_nsec = tick_nsec + adjtime_adjustment() * 1000; + return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; +} + /* * Using a loop looks inefficient, but "ticks" is * usually just one (we shouldn't be losing ticks, -- cgit v1.2.3 From 4bbf39c29bc3409d6454faf0dfa1b3b0aa2ac2af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Feb 2006 13:52:44 -0800 Subject: [PATCH] Introduce CONFIG_DEFAULT_MIGRATION_COST Heiko Carstens wrote: The boot sequence on s390 sometimes takes ages and we spend a very long time (up to one or two minutes) in calibrate_migration_costs. The time spent there differs from boot to boot. Also the calculated costs differ a lot. I've seen differences by up to a factor of 15 (yes, factor not percent). Also I doubt that making these measurements make much sense on a completely virtualized architecture where you cannot tell how much cpu time you will get anyway. So introduce the CONFIG_DEFAULT_MIGRATION_COST method for an architecture to set the scheduler migration costs. This turns off automatic detection of migration costs. Makes sense on virtual platforms, where migration costs are hard to measure accurately. Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 4 ++++ kernel/sched.c | 13 ++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b66602ad7b33..b7ca5bf9acfc 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -80,6 +80,10 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. +config DEFAULT_MIGRATION_COST + int + default "1000000" + config MATHEMU bool "IEEE FPU emulation" depends on MARCH_G5 diff --git a/kernel/sched.c b/kernel/sched.c index 66d957227de9..12d291bf3379 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5058,7 +5058,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, #define MAX_DOMAIN_DISTANCE 32 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = - { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL }; + { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = +/* + * Architectures may override the migration cost and thus avoid + * boot-time calibration. Unit is nanoseconds. 
Mostly useful for + * virtualized hardware: + */ +#ifdef CONFIG_DEFAULT_MIGRATION_COST + CONFIG_DEFAULT_MIGRATION_COST +#else + -1LL +#endif +}; /* * Allow override of migration cost - in units of microseconds. -- cgit v1.2.3 From a8534adb74e23374889b84b3d97eb18da542a1b5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 17 Feb 2006 13:52:51 -0800 Subject: [PATCH] swsusp: fix breakage with swap on LVM Restore the compatibility with the older code and make it possible to suspend if the kernel command line doesn't contain the "resume=" argument Signed-off-by: Rafael J. Wysocki Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/swsusp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 4e90905f0e87..2d9d08f72f76 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -153,13 +153,11 @@ static int swsusp_swap_check(void) /* This is called before saving image */ { int i; - if (!swsusp_resume_device) - return -ENODEV; spin_lock(&swap_lock); for (i = 0; i < MAX_SWAPFILES; i++) { if (!(swap_info[i].flags & SWP_WRITEOK)) continue; - if (is_resume_device(swap_info + i)) { + if (!swsusp_resume_device || is_resume_device(swap_info + i)) { spin_unlock(&swap_lock); root_swap = i; return 0; -- cgit v1.2.3 From ef20c8c197df9b8d5bd4af0679123826da028861 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Feb 2006 15:41:50 -0500 Subject: [PATCH] GFP_KERNEL allocations in atomic (auditsc) audit_log_exit() is called from atomic contexts and gets explicit gfp_mask argument; it should use it for all allocations rather than doing some with gfp_mask and some with GFP_KERNEL. Signed-off-by: Al Viro --- kernel/auditsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 685c25175d96..d7e7e637b92a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -841,7 +841,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) for (aux = context->aux; aux; aux = aux->next) { - ab = audit_log_start(context, GFP_KERNEL, aux->type); + ab = audit_log_start(context, gfp_mask, aux->type); if (!ab) continue; /* audit_panic has been called */ @@ -878,14 +878,14 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask) } if (context->pwd && context->pwdmnt) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); + ab = audit_log_start(context, gfp_mask, AUDIT_CWD); if (ab) { audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt); audit_log_end(ab); } } for (i = 0; i < context->name_count; i++) { - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + ab = audit_log_start(context, gfp_mask, AUDIT_PATH); if (!ab) continue; /* audit_panic has been called */ -- cgit v1.2.3 From c255d844dd73616f23e4b4733edcc2e5fa4042b2 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 20 Feb 2006 18:27:58 -0800 Subject: [PATCH] suspend-to-ram: allow video options to be set at runtime Currently, acpi video options can only be set on kernel command line. That's little inflexible; I'd like userland s2ram application that just works, and modifying kernel command line according to whitelist is not fun. It is better to just allow s2ram application to set video options just before suspend (according to the whitelist). This implements sysctl to allow setting suspend video options without reboot. (akpm: Documentation updates for this new sysctl are pending..) 
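As a rough illustration of the intended consumer, a userland s2ram-style helper could set the flag through the new sysctl just before initiating suspend. The sketch below is hypothetical and not part of this patch: the flag value is a placeholder (real values would come from the per-machine whitelist described in Documentation/power/video.txt), and the write to /sys/power/state is the usual suspend-to-RAM trigger assumed here for completeness.

/*
 * Hypothetical userland helper (illustration only): write a whitelist
 * video-quirk value to the new /proc/sys/kernel/acpi_video_flags sysctl,
 * then suspend to RAM via /sys/power/state.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");
	int ret = 0;

	if (!f)
		return -1;
	if (fputs(val, f) == EOF)
		ret = -1;
	if (fclose(f) == EOF)
		ret = -1;
	return ret;
}

int main(void)
{
	const char *flags = "0";	/* placeholder; taken from the machine whitelist */

	if (write_str("/proc/sys/kernel/acpi_video_flags", flags))
		perror("acpi_video_flags");
	if (write_str("/sys/power/state", "mem"))
		perror("suspend");
	return 0;
}
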
Signed-off-by: Pavel Machek Cc: "Brown, Len" Cc: "Antonino A. Daplas" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 10 ++++++++++ include/linux/acpi.h | 3 ++- include/linux/sysctl.h | 1 + kernel/sysctl.c | 16 ++++++++++++---- 4 files changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 9f11d36a8c10..b0c7ab93dcb9 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -16,6 +16,7 @@ before actually making adjustments. Currently, these files might (depending on your configuration) show up in /proc/sys/kernel: +- acpi_video_flags - acct - core_pattern - core_uses_pid @@ -57,6 +58,15 @@ show up in /proc/sys/kernel: ============================================================== +acpi_video_flags: + +flags + +See Doc*/kernel/power/video.txt, it allows mode of video boot to be +set during run time. + +============================================================== + acct: highwater lowwater frequency diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 84d3d9f034ce..d3bc25e6d27d 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -427,7 +427,8 @@ extern int acpi_mp_config; extern struct acpi_table_mcfg_config *pci_mmcfg_config; extern int pci_mmcfg_config_num; -extern int sbf_port ; +extern int sbf_port; +extern unsigned long acpi_video_flags; #else /* !CONFIG_ACPI */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 32a4139c4ad8..0e92bf7ec28e 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -146,6 +146,7 @@ enum KERN_RANDOMIZE=68, /* int: randomize virtual address space */ KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */ KERN_SPIN_RETRY=70, /* int: number of spinlock retries */ + KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7654d55c47f5..ebc41bf22f1e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -44,14 +44,12 @@ #include #include #include +#include +#include #include #include -#ifdef CONFIG_ROOT_NFS -#include -#endif - #if defined(CONFIG_SYSCTL) /* External variables not in a header file. 
*/ @@ -655,6 +653,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif +#ifdef CONFIG_ACPI_SLEEP + { + .ctl_name = KERN_ACPI_VIDEO_FLAGS, + .procname = "acpi_video_flags", + .data = &acpi_video_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif { .ctl_name = 0 } }; -- cgit v1.2.3 From 7a9166e3b037296366cea6f3c97f705d33e209e6 Mon Sep 17 00:00:00 2001 From: Luke Yang Date: Mon, 20 Feb 2006 18:28:07 -0800 Subject: [PATCH] Fix undefined symbols for nommu architecture Signed-off-by: Luke Yang Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++++ kernel/sysctl.c | 2 ++ mm/nommu.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 26e1663a5cbe..498ff8778fb6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1051,7 +1051,11 @@ int shrink_slab(unsigned long scanned, gfp_t gfp_mask, void drop_pagecache(void); void drop_slab(void); +#ifndef CONFIG_MMU +#define randomize_va_space 0 +#else extern int randomize_va_space; +#endif #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ebc41bf22f1e..c05a2b7125e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -636,6 +636,7 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#if defined(CONFIG_MMU) { .ctl_name = KERN_RANDOMIZE, .procname = "randomize_va_space", @@ -644,6 +645,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, +#endif #if defined(CONFIG_S390) && defined(CONFIG_SMP) { .ctl_name = KERN_SPIN_RETRY, diff --git a/mm/nommu.c b/mm/nommu.c index c10262d68232..99d21020ec9d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc); EXPORT_SYMBOL(vfree); EXPORT_SYMBOL(vmalloc_to_page); EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmap); +EXPORT_SYMBOL(vunmap); /* * Handle all mappings that got truncated by a "truncate()" -- cgit v1.2.3 From 7fd105e758c8d746d57ab7e77f100e096bf153c8 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 20 Feb 2006 18:28:08 -0800 Subject: [PATCH] Fix compile for CONFIG_SYSVIPC=n or CONFIG_SYSCTL=n The compat syscalls are added to sys_ni.c since they are not defined if the above CONFIG options are off. Also, nfs would not build with CONFIG_SYSCTL off. Noticed by Arthur Othieno. Signed-off-by: Stephen Rothwell Cc: "David S. 
Miller" Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nfs_fs.h | 2 +- kernel/sys_ni.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 547d649b274e..b4dc6e2e10c9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -398,7 +398,7 @@ extern struct inode_operations nfs_symlink_inode_operations; extern int nfs_register_sysctl(void); extern void nfs_unregister_sysctl(void); #else -#define nfs_register_sysctl() do { } while(0) +#define nfs_register_sysctl() 0 #define nfs_unregister_sysctl() do { } while(0) #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 17313b99e53d..1067090db6b1 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -104,6 +104,8 @@ cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(compat_sys_ipc); +cond_syscall(compat_sys_sysctl); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); -- cgit v1.2.3 From 5914811acf36c3ff091f860a6964808f668f27d0 Mon Sep 17 00:00:00 2001 From: Björn Steinbrink Date: Sat, 18 Feb 2006 18:12:43 +0100 Subject: [PATCH] kjournald keeps reference to namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In daemonize() a new thread gets cleaned up and 'merged' with init_task. The current fs_struct is handled there, but not the current namespace. This adds the namespace part. [ Eric Biederman pointed out the namespace wrappers, and also notes that we can't ever count on using our parents namespace because we already have called exit_fs(), which is the only way to the namespace from a process. ] Signed-off-by: Björn Steinbrink Acked-by: Eric Biederman Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 93cee3671332..531aadca5530 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -360,6 +360,9 @@ void daemonize(const char *name, ...) fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); + exit_namespace(current); + current->namespace = init_task.namespace; + get_namespace(current->namespace); exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); -- cgit v1.2.3