From 16882c1e962b4be5122fc05aaf2afc10fd9e2d15 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 8 Jun 2008 21:20:41 +0400 Subject: sched: fix TASK_WAKEKILL vs SIGKILL race schedule() has the special "TASK_INTERRUPTIBLE && signal_pending()" case, this allows us to do current->state = TASK_INTERRUPTIBLE; schedule(); without fear to sleep with pending signal. However, the code like current->state = TASK_KILLABLE; schedule(); is not right, schedule() doesn't take TASK_WAKEKILL into account. This means that mutex_lock_killable(), wait_for_completion_killable(), down_killable(), schedule_timeout_killable() can miss SIGKILL (and btw the second SIGKILL has no effect). Introduce the new helper, signal_pending_state(), and change schedule() to use it. Hopefully it will have more users, that is why the task's state is passed separately. Note this "__TASK_STOPPED | __TASK_TRACED" check in signal_pending_state(). This is needed to preserve the current behaviour (ptrace_notify). I hope this check will be removed soon, but this (afaics good) change needs the separate discussion. The fast path is "(state & (INTERRUPTIBLE | WAKEKILL)) + signal_pending(p)", basically the same that schedule() does now. However, this patch of course bloats schedule(). Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index bfb8ad8ed171..2c65bf29d133 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4159,12 +4159,10 @@ need_resched_nonpreemptible: clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - signal_pending(prev))) { + if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - } else { + else deactivate_task(rq, prev, 1); - } switch_count = &prev->nvcsw; } -- cgit v1.2.3 From 2e084786f6fe052274f1dfa7c675fe4a02cacd6e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 12 Jun 2008 16:42:58 +0800 Subject: sched: fair group: fix overflow(was: fix divide by zero) I found a bug which can be reproduced by this way:(linux-2.6.26-rc5, x86-64) (use 2^32, 2^33, ...., 2^63 as shares value) # mkdir /dev/cpuctl # mount -t cgroup -o cpu cpuctl /dev/cpuctl # cd /dev/cpuctl # mkdir sub # echo 0x8000000000000000 > sub/cpu.shares # echo $$ > sub/tasks oops here! divide by zero. This is because do_div() expects the 2th parameter to be 32 bits, but unsigned long is 64 bits in x86_64. Peter Zijstra pointed it out that the sane thing to do is limit the shares value to something smaller instead of using an even more expensive divide. Also, I found another bug about "the shares value is too large": pid1 and pid2 are set affinity to cpu#0 pid1 is attached to cg1 and pid2 is attached to cg2 if cg1/cpu.shares = 1024 cg2/cpu.shares = 2000000000 then pid2 got 100% usage of cpu, and pid1 0% if cg1/cpu.shares = 1024 cg2/cpu.shares = 20000000000 then pid2 got 0% usage of cpu, and pid1 100% And a weight of a cfs_rq is the sum of weights of which entities are queued on this cfs_rq, so the shares value should be limited to a smaller value. I think that (1UL << 18) is a good limited value: 1) it's not too large, we can create a lot of group before overflow 2) it's several times the weight value for nice=-19 (not too small) Signed-off-by: Lai Jiangshan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2c65bf29d133..6c1ecbdc0db9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -312,12 +312,15 @@ static DEFINE_SPINLOCK(task_group_lock); #endif /* - * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems. + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. * (The default weight is 1024 - so there's no practical * limitation from this.) */ #define MIN_SHARES 2 -#define MAX_SHARES (ULONG_MAX - 1) +#define MAX_SHARES (1UL << 18) static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif -- cgit v1.2.3 From 7a232e0350940d2664f4de5cc3f0f443bae5062d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 12 Jun 2008 16:43:07 +0800 Subject: sched: 64-bit: fix arithmetics overflow (overflow means weight >= 2^32 here, because inv_weigh = 2^32/weight) A weight of a cfs_rq is the sum of weights of which entities are queued on this cfs_rq, so it will overflow when there are too many entities. Although, overflow occurs very rarely, but it break fairness when it occurs. 64-bits systems have more memory than 32-bit systems and 64-bit systems can create more process usually, so overflow may occur more frequently. This patch guarantees fairness when overflow happens on 64-bit systems. Thanks to the optimization of compiler, it changes nothing on 32-bit. Signed-off-by: Lai Jiangshan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c1ecbdc0db9..eaf6751e7612 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1340,8 +1340,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - if (!lw->inv_weight) - lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } tmp = (u64)delta_exec * weight; /* -- cgit v1.2.3