diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-12 02:20:12 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-11-12 02:20:12 +0100 |
commit | 39cf275a1a18ba3c7eb9b986c5c9b35b57332798 (patch) | |
tree | 40b119ca9d2fbaf8128d3fa25f4c64669002b0c0 /kernel/sched/core.c | |
parent | Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kern... (diff) | |
parent | ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED (diff) | |
download | linux-39cf275a1a18ba3c7eb9b986c5c9b35b57332798.tar.xz linux-39cf275a1a18ba3c7eb9b986c5c9b35b57332798.zip |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"The main changes in this cycle are:
- (much) improved CONFIG_NUMA_BALANCING support from Mel Gorman, Rik
van Riel, Peter Zijlstra et al. Yay!
- optimize preemption counter handling: merge the NEED_RESCHED flag
into the preempt_count variable, by Peter Zijlstra.
- wait.h fixes and code reorganization from Peter Zijlstra
- cfs_bandwidth fixes from Ben Segall
- SMP load-balancer cleanups from Peter Zijstra
- idle balancer improvements from Jason Low
- other fixes and cleanups"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (129 commits)
ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
stop_machine: Fix race between stop_two_cpus() and stop_cpus()
sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus
sched: Fix asymmetric scheduling for POWER7
sched: Move completion code from core.c to completion.c
sched: Move wait code from core.c to wait.c
sched: Move wait.c into kernel/sched/
sched/wait: Fix __wait_event_interruptible_lock_irq_timeout()
sched: Avoid throttle_cfs_rq() racing with period_timer stopping
sched: Guarantee new group-entities always have weight
sched: Fix hrtimer_cancel()/rq->lock deadlock
sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining
sched: Fix race on toggling cfs_bandwidth_used
sched: Remove extra put_online_cpus() inside sched_setaffinity()
sched/rt: Fix task_tick_rt() comment
sched/wait: Fix build breakage
sched/wait: Introduce prepare_to_wait_event()
sched/wait: Add ___wait_cond_timeout() to wait_event*_timeout() too
sched: Remove get_online_cpus() usage
sched: Fix race in migrate_swap_stop()
...
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r-- | kernel/sched/core.c | 683 |
1 files changed, 234 insertions, 449 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5ac63c9a995a..1deccd78be98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -513,12 +513,11 @@ static inline void init_hrtick(void) * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ -#ifdef CONFIG_SMP void resched_task(struct task_struct *p) { int cpu; - assert_raw_spin_locked(&task_rq(p)->lock); + lockdep_assert_held(&task_rq(p)->lock); if (test_tsk_need_resched(p)) return; @@ -526,8 +525,10 @@ void resched_task(struct task_struct *p) set_tsk_need_resched(p); cpu = task_cpu(p); - if (cpu == smp_processor_id()) + if (cpu == smp_processor_id()) { + set_preempt_need_resched(); return; + } /* NEED_RESCHED must be visible before we test polling */ smp_mb(); @@ -546,6 +547,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } +#ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers @@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq) } } -#else /* !CONFIG_SMP */ -void resched_task(struct task_struct *p) -{ - assert_raw_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} #endif /* CONFIG_SMP */ #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ @@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p) static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_queued(p); + sched_info_queued(rq, p); p->sched_class->enqueue_task(rq, p, flags); } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { update_rq_clock(rq); - sched_info_dequeued(p); + sched_info_dequeued(rq, p); p->sched_class->dequeue_task(rq, p, flags); } @@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * ttwu() will sort out the placement. */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && - !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + !(task_preempt_count(p) & PREEMPT_ACTIVE)); #ifdef CONFIG_LOCKDEP /* @@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } +static void __migrate_swap_task(struct task_struct *p, int cpu) +{ + if (p->on_rq) { + struct rq *src_rq, *dst_rq; + + src_rq = task_rq(p); + dst_rq = cpu_rq(cpu); + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, cpu); + activate_task(dst_rq, p, 0); + check_preempt_curr(dst_rq, p, 0); + } else { + /* + * Task isn't running anymore; make it appear like we migrated + * it before it went to sleep. This means on wakeup we make the + * previous cpu our targer instead of where it really is. + */ + p->wake_cpu = cpu; + } +} + +struct migration_swap_arg { + struct task_struct *src_task, *dst_task; + int src_cpu, dst_cpu; +}; + +static int migrate_swap_stop(void *data) +{ + struct migration_swap_arg *arg = data; + struct rq *src_rq, *dst_rq; + int ret = -EAGAIN; + + src_rq = cpu_rq(arg->src_cpu); + dst_rq = cpu_rq(arg->dst_cpu); + + double_raw_lock(&arg->src_task->pi_lock, + &arg->dst_task->pi_lock); + double_rq_lock(src_rq, dst_rq); + if (task_cpu(arg->dst_task) != arg->dst_cpu) + goto unlock; + + if (task_cpu(arg->src_task) != arg->src_cpu) + goto unlock; + + if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task))) + goto unlock; + + if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task))) + goto unlock; + + __migrate_swap_task(arg->src_task, arg->dst_cpu); + __migrate_swap_task(arg->dst_task, arg->src_cpu); + + ret = 0; + +unlock: + double_rq_unlock(src_rq, dst_rq); + raw_spin_unlock(&arg->dst_task->pi_lock); + raw_spin_unlock(&arg->src_task->pi_lock); + + return ret; +} + +/* + * Cross migrate two tasks + */ +int migrate_swap(struct task_struct *cur, struct task_struct *p) +{ + struct migration_swap_arg arg; + int ret = -EINVAL; + + arg = (struct migration_swap_arg){ + .src_task = cur, + .src_cpu = task_cpu(cur), + .dst_task = p, + .dst_cpu = task_cpu(p), + }; + + if (arg.src_cpu == arg.dst_cpu) + goto out; + + /* + * These three tests are all lockless; this is OK since all of them + * will be re-checked with proper locks held further down the line. + */ + if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) + goto out; + + if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task))) + goto out; + + if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task))) + goto out; + + ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg); + +out: + return ret; +} + struct migration_arg { struct task_struct *task; int dest_cpu; @@ -1236,9 +1333,9 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) if (rq->idle_stamp) { u64 delta = rq_clock(rq) - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; + u64 max = 2*rq->max_idle_balance_cost; + + update_avg(&rq->avg_idle, delta); - if (delta > max) + if (rq->avg_idle > max) rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; } #endif @@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { + /* + * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting + * TIF_NEED_RESCHED remotely (for the first time) will also send + * this IPI. + */ + if (tif_need_resched()) + set_preempt_need_resched(); + if (llist_empty(&this_rq()->wake_list) && !tick_nohz_full_cpu(smp_processor_id()) && !got_nohz_idle_kick()) @@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (p->sched_class->task_waking) p->sched_class->task_waking(p); - cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); @@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * * __sched_fork() is basic setup used by init_idle() too: */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p) #ifdef CONFIG_NUMA_BALANCING if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies; - p->mm->numa_next_reset = jiffies; + p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); p->mm->numa_scan_seq = 0; } + if (clone_flags & CLONE_VM) + p->numa_preferred_nid = current->numa_preferred_nid; + else + p->numa_preferred_nid = -1; + p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + p->numa_faults_buffer = NULL; + + INIT_LIST_HEAD(&p->numa_entry); + p->numa_group = NULL; #endif /* CONFIG_NUMA_BALANCING */ } @@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p) +void sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); - __sched_fork(p); + __sched_fork(clone_flags, p); /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external @@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT_COUNT - /* Want to start with kernel preemption disabled. */ - task_thread_info(p)->preempt_count = 1; -#endif + init_task_preempt_count(p); #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); #endif @@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p) * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug */ - set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); + set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif /* Initialize new task's runnable average */ @@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { trace_sched_switch(prev, next); - sched_info_switch(prev, next); + sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); @@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { + task_numa_free(prev); + /* * Remove function-return probe instances associated with this * task and put them back on the free list. @@ -2073,7 +2186,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -2215,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr) #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) -void __kprobes add_preempt_count(int val) +void __kprobes preempt_count_add(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2224,7 +2337,7 @@ void __kprobes add_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; #endif - preempt_count() += val; + __preempt_count_add(val); #ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? @@ -2235,9 +2348,9 @@ void __kprobes add_preempt_count(int val) if (preempt_count() == val) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } -EXPORT_SYMBOL(add_preempt_count); +EXPORT_SYMBOL(preempt_count_add); -void __kprobes sub_preempt_count(int val) +void __kprobes preempt_count_sub(int val) { #ifdef CONFIG_DEBUG_PREEMPT /* @@ -2255,9 +2368,9 @@ void __kprobes sub_preempt_count(int val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); - preempt_count() -= val; + __preempt_count_sub(val); } -EXPORT_SYMBOL(sub_preempt_count); +EXPORT_SYMBOL(preempt_count_sub); #endif @@ -2430,6 +2543,7 @@ need_resched: put_prev_task(rq, prev); next = pick_next_task(rq); clear_tsk_need_resched(prev); + clear_preempt_need_resched(); rq->skip_clock_update = 0; if (likely(prev != next)) { @@ -2520,9 +2634,9 @@ asmlinkage void __sched notrace preempt_schedule(void) return; do { - add_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count_notrace(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule); */ asmlinkage void __sched preempt_schedule_irq(void) { - struct thread_info *ti = current_thread_info(); enum ctx_state prev_state; /* Catch callers which need to be fixed */ - BUG_ON(ti->preempt_count || !irqs_disabled()); + BUG_ON(preempt_count() || !irqs_disabled()); prev_state = exception_enter(); do { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); local_irq_enable(); __schedule(); local_irq_disable(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); /* * Check again in case we missed a preemption opportunity @@ -2575,393 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, } EXPORT_SYMBOL(default_wake_function); -/* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just - * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. - * - * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns - * zero in this (rare) case, and we handle it by continuing to scan the queue. - */ -static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int wake_flags, void *key) -{ - wait_queue_t *curr, *next; - - list_for_each_entry_safe(curr, next, &q->task_list, task_list) { - unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) - break; - } -} - -/** - * __wake_up - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: is directly passed to the wakeup function - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, 0, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL(__wake_up); - -/* - * Same as __wake_up but called with the spinlock in wait_queue_head_t held. - */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) -{ - __wake_up_common(q, mode, nr, 0, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_locked); - -void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) -{ - __wake_up_common(q, mode, 1, 0, key); -} -EXPORT_SYMBOL_GPL(__wake_up_locked_key); - -/** - * __wake_up_sync_key - wake up threads blocked on a waitqueue. - * @q: the waitqueue - * @mode: which threads - * @nr_exclusive: how many wake-one or wake-many threads to wake up - * @key: opaque value to be passed to wakeup targets - * - * The sync wakeup differs that the waker knows that it will schedule - * away soon, so while the target thread will be woken up, it will not - * be migrated to another CPU - ie. the two threads are 'synchronized' - * with each other. This can prevent needless bouncing between CPUs. - * - * On UP it can prevent extra preemption. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, void *key) -{ - unsigned long flags; - int wake_flags = WF_SYNC; - - if (unlikely(!q)) - return; - - if (unlikely(nr_exclusive != 1)) - wake_flags = 0; - - spin_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr_exclusive, wake_flags, key); - spin_unlock_irqrestore(&q->lock, flags); -} -EXPORT_SYMBOL_GPL(__wake_up_sync_key); - -/* - * __wake_up_sync - see __wake_up_sync_key() - */ -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) -{ - __wake_up_sync_key(q, mode, nr_exclusive, NULL); -} -EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ - -/** - * complete: - signals a single thread waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up a single thread waiting on this completion. Threads will be - * awakened in the same order in which they were queued. - * - * See also complete_all(), wait_for_completion() and related routines. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done++; - __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete); - -/** - * complete_all: - signals all threads waiting on this completion - * @x: holds the state of this particular completion - * - * This will wake up all threads waiting on this particular completion event. - * - * It may be assumed that this function implies a write memory barrier before - * changing the task state if and only if any tasks are woken up. - */ -void complete_all(struct completion *x) -{ - unsigned long flags; - - spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; - __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); - spin_unlock_irqrestore(&x->wait.lock, flags); -} -EXPORT_SYMBOL(complete_all); - -static inline long __sched -do_wait_for_common(struct completion *x, - long (*action)(long), long timeout, int state) -{ - if (!x->done) { - DECLARE_WAITQUEUE(wait, current); - - __add_wait_queue_tail_exclusive(&x->wait, &wait); - do { - if (signal_pending_state(state, current)) { - timeout = -ERESTARTSYS; - break; - } - __set_current_state(state); - spin_unlock_irq(&x->wait.lock); - timeout = action(timeout); - spin_lock_irq(&x->wait.lock); - } while (!x->done && timeout); - __remove_wait_queue(&x->wait, &wait); - if (!x->done) - return timeout; - } - x->done--; - return timeout ?: 1; -} - -static inline long __sched -__wait_for_common(struct completion *x, - long (*action)(long), long timeout, int state) -{ - might_sleep(); - - spin_lock_irq(&x->wait.lock); - timeout = do_wait_for_common(x, action, timeout, state); - spin_unlock_irq(&x->wait.lock); - return timeout; -} - -static long __sched -wait_for_common(struct completion *x, long timeout, int state) -{ - return __wait_for_common(x, schedule_timeout, timeout, state); -} - -static long __sched -wait_for_common_io(struct completion *x, long timeout, int state) -{ - return __wait_for_common(x, io_schedule_timeout, timeout, state); -} - -/** - * wait_for_completion: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. - * - * See also similar routines (i.e. wait_for_completion_timeout()) with timeout - * and interrupt capability. Also see complete(). - */ -void __sched wait_for_completion(struct completion *x) -{ - wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion); - -/** - * wait_for_completion_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_timeout); - -/** - * wait_for_completion_io: - waits for completion of a task - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It is NOT - * interruptible and there is no timeout. The caller is accounted as waiting - * for IO. - */ -void __sched wait_for_completion_io(struct completion *x) -{ - wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io); - -/** - * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. The timeout is in jiffies. It is not - * interruptible. The caller is accounted as waiting for IO. - * - * Return: 0 if timed out, and positive (at least 1, or number of jiffies left - * till timeout) if completed. - */ -unsigned long __sched -wait_for_completion_io_timeout(struct completion *x, unsigned long timeout) -{ - return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_io_timeout); - -/** - * wait_for_completion_interruptible: - waits for completion of a task (w/intr) - * @x: holds the state of this particular completion - * - * This waits for completion of a specific task to be signaled. It is - * interruptible. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_interruptible(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_interruptible); - -/** - * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be signaled or for a - * specified timeout to expire. It is interruptible. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_interruptible_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_INTERRUPTIBLE); -} -EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); - -/** - * wait_for_completion_killable: - waits for completion of a task (killable) - * @x: holds the state of this particular completion - * - * This waits to be signaled for completion of a specific task. It can be - * interrupted by a kill signal. - * - * Return: -ERESTARTSYS if interrupted, 0 if completed. - */ -int __sched wait_for_completion_killable(struct completion *x) -{ - long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); - if (t == -ERESTARTSYS) - return t; - return 0; -} -EXPORT_SYMBOL(wait_for_completion_killable); - -/** - * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) - * @x: holds the state of this particular completion - * @timeout: timeout value in jiffies - * - * This waits for either a completion of a specific task to be - * signaled or for a specified timeout to expire. It can be - * interrupted by a kill signal. The timeout is in jiffies. - * - * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1, - * or number of jiffies left till timeout) if completed. - */ -long __sched -wait_for_completion_killable_timeout(struct completion *x, - unsigned long timeout) -{ - return wait_for_common(x, timeout, TASK_KILLABLE); -} -EXPORT_SYMBOL(wait_for_completion_killable_timeout); - -/** - * try_wait_for_completion - try to decrement a completion without blocking - * @x: completion structure - * - * Return: 0 if a decrement cannot be done without blocking - * 1 if a decrement succeeded. - * - * If a completion is being used as a counting completion, - * attempt to decrement the counter without blocking. This - * enables us to avoid waiting if the resource the completion - * is protecting is not available. - */ -bool try_wait_for_completion(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - else - x->done--; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(try_wait_for_completion); - -/** - * completion_done - Test to see if a completion has any waiters - * @x: completion structure - * - * Return: 0 if there are waiters (wait_for_completion() in progress) - * 1 if there are no waiters. - * - */ -bool completion_done(struct completion *x) -{ - unsigned long flags; - int ret = 1; - - spin_lock_irqsave(&x->wait.lock, flags); - if (!x->done) - ret = 0; - spin_unlock_irqrestore(&x->wait.lock, flags); - return ret; -} -EXPORT_SYMBOL(completion_done); - static long __sched sleep_on_common(wait_queue_head_t *q, int state, long timeout) { @@ -3598,13 +3324,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) struct task_struct *p; int retval; - get_online_cpus(); rcu_read_lock(); p = find_process_by_pid(pid); if (!p) { rcu_read_unlock(); - put_online_cpus(); return -ESRCH; } @@ -3661,7 +3385,6 @@ out_free_cpus_allowed: free_cpumask_var(cpus_allowed); out_put_task: put_task_struct(p); - put_online_cpus(); return retval; } @@ -3706,7 +3429,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) unsigned long flags; int retval; - get_online_cpus(); rcu_read_lock(); retval = -ESRCH; @@ -3719,12 +3441,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) goto out_unlock; raw_spin_lock_irqsave(&p->pi_lock, flags); - cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); + cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); - put_online_cpus(); return retval; } @@ -3794,16 +3515,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -static inline int should_resched(void) -{ - return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); -} - static void __cond_resched(void) { - add_preempt_count(PREEMPT_ACTIVE); + __preempt_count_add(PREEMPT_ACTIVE); __schedule(); - sub_preempt_count(PREEMPT_ACTIVE); + __preempt_count_sub(PREEMPT_ACTIVE); } int __sched _cond_resched(void) @@ -4186,7 +3902,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); - __sched_fork(idle); + __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -4212,7 +3928,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ - task_thread_info(idle)->preempt_count = 0; + init_idle_preempt_count(idle, cpu); /* * The idle tasks have their own, simple scheduling class: @@ -4346,6 +4062,53 @@ fail: return ret; } +#ifdef CONFIG_NUMA_BALANCING +/* Migrate current task p to target_cpu */ +int migrate_task_to(struct task_struct *p, int target_cpu) +{ + struct migration_arg arg = { p, target_cpu }; + int curr_cpu = task_cpu(p); + + if (curr_cpu == target_cpu) + return 0; + + if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) + return -EINVAL; + + /* TODO: This is not properly updating schedstats */ + + return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); +} + +/* + * Requeue a task on a given node and accurately track the number of NUMA + * tasks on the runqueues + */ +void sched_setnuma(struct task_struct *p, int nid) +{ + struct rq *rq; + unsigned long flags; + bool on_rq, running; + + rq = task_rq_lock(p, &flags); + on_rq = p->on_rq; + running = task_current(rq, p); + + if (on_rq) + dequeue_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + p->numa_preferred_nid = nid; + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, p, 0); + task_rq_unlock(rq, p, &flags); +} +#endif + /* * migration_cpu_stop - this will be executed by a highprio stopper thread * and performs thread migration by bumping thread off CPU then @@ -5119,6 +4882,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain *, sd_numa); +DEFINE_PER_CPU(struct sched_domain *, sd_busy); +DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { @@ -5130,11 +4896,18 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); + rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent); } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + + sd = lowest_flag_domain(cpu, SD_NUMA); + rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); + + sd = highest_flag_domain(cpu, SD_ASYM_PACKING); + rcu_assign_pointer(per_cpu(sd_asym, cpu), sd); } /* @@ -5654,6 +5427,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | 0*SD_SHARE_PKG_RESOURCES | 1*SD_SERIALIZE | 0*SD_PREFER_SIBLING + | 1*SD_NUMA | sd_local_flags(level) , .last_balance = jiffies, @@ -6335,14 +6109,17 @@ void __init sched_init_smp(void) sched_init_numa(); - get_online_cpus(); + /* + * There's no userspace yet to cause hotplug operations; hence all the + * cpu masks are stable and all blatant races in the below code cannot + * happen. + */ mutex_lock(&sched_domains_mutex); init_sched_domains(cpu_active_mask); cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); if (cpumask_empty(non_isolated_cpus)) cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); - put_online_cpus(); hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); @@ -6505,6 +6282,7 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -7277,7 +7055,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; - account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); + /* + * If we need to toggle cfs_bandwidth_used, off->on must occur + * before making related changes, and on->off must occur afterwards + */ + if (runtime_enabled && !runtime_was_enabled) + cfs_bandwidth_usage_inc(); raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -7303,6 +7086,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } + if (runtime_was_enabled && !runtime_enabled) + cfs_bandwidth_usage_dec(); out_unlock: mutex_unlock(&cfs_constraints_mutex); |