author    Ingo Molnar <mingo@kernel.org>  2014-05-07 13:15:46 +0200
committer Ingo Molnar <mingo@kernel.org>  2014-05-07 13:15:46 +0200
commit    2fe5de9ce7d57498abc14b375cad2fcf8c3ee6cc (patch)
tree      9478e8cf470c1d5bdb2d89b57a7e35919ab95e72 /kernel/sched
parent    sched: Remove set_need_resched() (diff)
parent    sched/numa: Initialize newidle balance stats in sd_numa_init() (diff)
Merge branch 'sched/urgent' into sched/core, to avoid conflicts
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/clock.c        |   3
-rw-r--r--  kernel/sched/core.c         |  74
-rw-r--r--  kernel/sched/cpuacct.c      |   6
-rw-r--r--  kernel/sched/cpudeadline.c  |   4
-rw-r--r--  kernel/sched/cpupri.c       |   3
-rw-r--r--  kernel/sched/cputime.c      |  32
-rw-r--r--  kernel/sched/deadline.c     |   5
-rw-r--r--  kernel/sched/debug.c        |   3
-rw-r--r--  kernel/sched/fair.c         |  16
-rw-r--r--  kernel/sched/idle.c         | 150
-rw-r--r--  kernel/sched/stats.c        |   2
11 files changed, 188 insertions(+), 110 deletions(-)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index b30a2924ef14..3ef6451e972e 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -60,13 +60,14 @@
#include <linux/sched.h>
#include <linux/static_key.h>
#include <linux/workqueue.h>
+#include <linux/compiler.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is default implementation.
* Architectures and sub-architectures can override this.
*/
-unsigned long long __attribute__((weak)) sched_clock(void)
+unsigned long long __weak sched_clock(void)
{
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
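
The change above is purely notational: __weak is the kernel's shorthand for __attribute__((weak)), defined in <linux/compiler.h> (hence the new include). A weak definition acts as a link-time default that any strong definition of the same symbol overrides. A minimal, runnable user-space sketch of the mechanism (the demo_clock name is illustrative):

#include <stdio.h>

/* default implementation; a non-weak definition of the same symbol in
 * another object file silently replaces it at link time */
unsigned long long __attribute__((weak)) demo_clock(void)
{
        return 0;
}

int main(void)
{
        /* prints 0 unless an overriding strong demo_clock() is linked in */
        printf("demo_clock() = %llu\n", demo_clock());
        return 0;
}
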
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9cae286824bb..13584f1cccfc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
+#include <linux/compiler.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -2591,8 +2592,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
- if (likely(p && p != RETRY_TASK))
- return p;
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
again:
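
For context, the fast path above short-circuits the class walk that lives under the again: label, and the new code encodes the assumption that idle_sched_class follows fair_sched_class directly. A runnable user-space analogue of that walk (the class names and the "idle never fails" guarantee mirror the scheduler; everything else is illustrative):

#include <stdio.h>
#include <stddef.h>

struct task { const char *comm; };
typedef struct task *(*pick_fn)(void);

static struct task the_idle_task = { "swapper" };

static struct task *fair_pick(void) { return NULL; }           /* CFS runqueue empty */
static struct task *idle_pick(void) { return &the_idle_task; } /* never returns NULL */

int main(void)
{
        /* classes in priority order; idle is last and always succeeds,
         * which is why an empty CFS runqueue can jump straight to it */
        pick_fn classes[] = { fair_pick, idle_pick };
        struct task *p = NULL;

        for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                p = classes[i]();
                if (p)
                        break;
        }
        printf("picked: %s\n", p->comm);
        return 0;
}
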
@@ -2845,52 +2852,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
}
EXPORT_SYMBOL(default_wake_function);
-static long __sched
-sleep_on_common(wait_queue_head_t *q, int state, long timeout)
-{
- unsigned long flags;
- wait_queue_t wait;
-
- init_waitqueue_entry(&wait, current);
-
- __set_current_state(state);
-
- spin_lock_irqsave(&q->lock, flags);
- __add_wait_queue(q, &wait);
- spin_unlock(&q->lock);
- timeout = schedule_timeout(timeout);
- spin_lock_irq(&q->lock);
- __remove_wait_queue(q, &wait);
- spin_unlock_irqrestore(&q->lock, flags);
-
- return timeout;
-}
-
-void __sched interruptible_sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(interruptible_sleep_on);
-
-long __sched
-interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-
-void __sched sleep_on(wait_queue_head_t *q)
-{
- sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-}
-EXPORT_SYMBOL(sleep_on);
-
-long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
-{
- return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
-}
-EXPORT_SYMBOL(sleep_on_timeout);
-
#ifdef CONFIG_RT_MUTEXES
/*
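
The sleep_on() family removed above had long been deprecated because it is inherently racy: the wakeup condition can change between the caller's check and the sleep. The established replacement is the wait_event() family, which re-tests the condition as part of going to sleep. A sketch of the replacement pattern (the demo_ waitqueue and condition are invented for illustration; the APIs are the real ones):

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_ready;  /* illustrative wakeup condition */

/* sleeper: unlike sleep_on(), the condition is re-checked under the
 * waitqueue machinery, so a wakeup between check and sleep is not lost */
static void demo_wait(void)
{
        wait_event(demo_wq, demo_ready);
}

/* waker: set the condition first, then wake */
static void demo_wake(void)
{
        demo_ready = 1;
        wake_up(&demo_wq);
}
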
@@ -3169,6 +3130,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
static void __setscheduler_params(struct task_struct *p,
@@ -3684,6 +3646,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
* sys_sched_setattr - same as above, but with extended sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
@@ -3828,6 +3791,7 @@ err_size:
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, size, unsigned int, flags)
@@ -6062,6 +6026,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
,
.last_balance = jiffies,
.balance_interval = sd_weight,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
};
SD_INIT_NAME(sd, NUMA);
sd->private = &tl->data;
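
This hunk is one of the two merged head commits: sd_numa_init() previously left next_decay_max_lb_cost at its zeroed default, so the first decay check compared against jiffies 0. Note that in a C designated initializer every field not named is zero-initialized anyway, so the load-bearing line is the jiffies assignment; the explicit zero mostly serves as documentation. A runnable illustration of the zeroing rule:

#include <stdio.h>

struct lb_stats {
        unsigned long max_newidle_lb_cost;
        unsigned long next_decay_max_lb_cost;
};

int main(void)
{
        unsigned long fake_jiffies = 3000000UL;  /* illustrative value */

        /* fields not named in a designated initializer are zeroed,
         * exactly as if assigned 0 explicitly */
        struct lb_stats s = { .next_decay_max_lb_cost = fake_jiffies };

        printf("max_cost=%lu next_decay=%lu\n",
               s.max_newidle_lb_cost, s.next_decay_max_lb_cost);
        return 0;
}
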
@@ -6498,7 +6464,7 @@ static cpumask_var_t fallback_doms;
* cpu core maps. It is supposed to return 1 if the topology changed
* or 0 if it stayed the same.
*/
-int __attribute__((weak)) arch_update_cpu_topology(void)
+int __weak arch_update_cpu_topology(void)
{
return 0;
}
@@ -7230,7 +7196,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id,
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id,
lockdep_is_held(&tsk->sighand->siglock)),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
@@ -7657,7 +7623,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
{
struct task_struct *task;
- cgroup_taskset_for_each(task, css, tset) {
+ cgroup_taskset_for_each(task, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
@@ -7675,7 +7641,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
{
struct task_struct *task;
- cgroup_taskset_for_each(task, css, tset)
+ cgroup_taskset_for_each(task, tset)
sched_move_task(task);
}
@@ -8014,8 +7980,7 @@ static struct cftype cpu_files[] = {
{ } /* terminate */
};
-struct cgroup_subsys cpu_cgroup_subsys = {
- .name = "cpu",
+struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
@@ -8023,7 +7988,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .subsys_id = cpu_cgroup_subsys_id,
.base_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 622e0818f905..c143ee380e3a 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
/* return cpu accounting group to which this task belongs */
static inline struct cpuacct *task_ca(struct task_struct *tsk)
{
- return css_ca(task_css(tsk, cpuacct_subsys_id));
+ return css_ca(task_css(tsk, cpuacct_cgrp_id));
}
static inline struct cpuacct *parent_ca(struct cpuacct *ca)
@@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
rcu_read_unlock();
}
-struct cgroup_subsys cpuacct_subsys = {
- .name = "cpuacct",
+struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
- .subsys_id = cpuacct_subsys_id,
.base_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..ab001b5d5048 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -210,7 +210,5 @@ int cpudl_init(struct cpudl *cp)
*/
void cpudl_cleanup(struct cpudl *cp)
{
- /*
- * nothing to do for the moment
- */
+ free_cpumask_var(cp->free_cpus);
}
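
The cleanup stub becomes real here: cpudl_init() allocates cp->free_cpus with a cpumask allocator, and until this change cpudl_cleanup() leaked it. The rule being applied is the usual alloc/free pairing, sketched below (the demo_ names are illustrative; the cpumask APIs are the real ones):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static cpumask_var_t demo_mask;

static int demo_init(void)
{
        /* with CONFIG_CPUMASK_OFFSTACK this genuinely allocates memory,
         * so a teardown path that skips the free leaks it */
        if (!zalloc_cpumask_var(&demo_mask, GFP_KERNEL))
                return -ENOMEM;
        return 0;
}

static void demo_cleanup(void)
{
        free_cpumask_var(demo_mask);  /* pairs with the allocation above */
}
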
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..3031bac8aa3e 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -70,8 +70,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
int idx = 0;
int task_pri = convert_prio(p->prio);
- if (task_pri >= MAX_RT_PRIO)
- return 0;
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq, int ticks)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
+ cputime *= ticks;
+ scaled *= ticks;
+
if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_IRQ] += cputime;
} else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
} else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_guest_time(p, cputime, scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
}
}
static void irqtime_account_idle_ticks(int ticks)
{
- int i;
struct rq *rq = this_rq();
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
+ irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
+ struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
+ irqtime_account_process_tick(p, user_tick, rq, 1);
return;
}
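
The batching above rests on simple arithmetic: every lost tick accounts the same cputime_one_jiffy, so N identical additions collapse into one multiplication and irqtime_account_idle_ticks() no longer needs its loop. A runnable sketch of the equivalence (the tick rate is picked for illustration):

#include <stdio.h>

int main(void)
{
        const unsigned long long nsec_per_sec = 1000000000ULL;
        const unsigned long long hz = 250;  /* illustrative tick rate */
        unsigned long long one_jiffy = nsec_per_sec / hz;
        int ticks = 7;

        /* old shape: accumulate once per tick */
        unsigned long long looped = 0;
        for (int i = 0; i < ticks; i++)
                looped += one_jiffy;

        /* new shape: scale once */
        unsigned long long batched = one_jiffy * (unsigned long long)ticks;

        printf("looped=%llu batched=%llu equal=%d\n",
               looped, batched, looped == batched);
        return 0;
}
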
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b08095786cb8..800e99b99075 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
if (p->on_rq) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
@@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq)
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
- * new scheduling parameters (thanks to dl_new=1).
+ * new scheduling parameters (thanks to dl_yielded=1).
*/
if (p->dl.runtime > 0) {
- rq->curr->dl.dl_new = 1;
+ rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
update_curr_dl(rq);
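
Taken together, the two deadline.c hunks give yield its own dl_yielded flag instead of overloading dl_new, and dl_task_timer() now clears it alongside dl_throttled when it replenishes. A condensed, comment-only digest of a SCHED_DEADLINE yield under this scheme (names taken from the hunks above):

/*
 * sched_yield() on a deadline task:
 *   yield_task_dl():   dl_yielded = 1; runtime = 0;
 *   update_curr_dl():  runtime exhausted, task throttled until its
 *                      current deadline, replenishment timer armed
 *   dl_task_timer():   dl_throttled = 0; dl_yielded = 0;
 *                      runtime replenished, task enqueued again
 */
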
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f3344c31632a..695f9773bb60 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg)
if (autogroup_path(tg, group_path, PATH_MAX))
return group_path;
- cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
- return group_path;
+ return cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
}
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 43232b8bacde..5d859ec975c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6658,6 +6658,7 @@ static int idle_balance(struct rq *this_rq)
int this_cpu = this_rq->cpu;
idle_enter_fair(this_rq);
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -6710,14 +6711,16 @@ static int idle_balance(struct rq *this_rq)
raw_spin_lock(&this_rq->lock);
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
/*
- * While browsing the domains, we released the rq lock.
- * A task could have be enqueued in the meantime
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task) {
+ if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
- goto out;
- }
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
/*
@@ -6727,9 +6730,6 @@ static int idle_balance(struct rq *this_rq)
this_rq->next_balance = next_balance;
}
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
out:
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
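
The reshuffle above closes an accounting hole: the max_idle_balance_cost update used to sit after the "task appeared while the lock was dropped" bailout, so that path skipped it. The general shape of the fix is to commit a measurement before any early exit, as in this runnable sketch (names are illustrative):

#include <stdio.h>

static unsigned long max_cost;

static int balance_step(unsigned long curr_cost, int work_appeared)
{
        /* record the measurement first, so every exit path below
         * sees an up-to-date maximum */
        if (curr_cost > max_cost)
                max_cost = curr_cost;

        if (work_appeared)
                return 1;  /* the early exit no longer loses the update */

        return 0;
}

int main(void)
{
        balance_step(120, 1);  /* early-exit case */
        balance_step(80, 0);
        printf("max_cost=%lu\n", max_cost);  /* 120, not 80 */
        return 0;
}
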
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index b7976a127178..8f4390a079c7 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -63,6 +63,136 @@ void __weak arch_cpu_idle(void)
local_irq_enable();
}
+/**
+ * cpuidle_idle_call - the main idle function
+ *
+ * NOTE: no locks or semaphores should be used here
+ * return non-zero on failure
+ */
+static int cpuidle_idle_call(void)
+{
+ struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ int next_state, entered_state, ret;
+ bool broadcast;
+
+ /*
+ * Check if the idle task must be rescheduled. If it is the
+ * case, exit the function after re-enabling the local irq and
+ * set again the polling flag
+ */
+ if (current_clr_polling_and_test()) {
+ local_irq_enable();
+ __current_set_polling();
+ return 0;
+ }
+
+ /*
+ * During the idle period, stop measuring the disabled irqs
+ * critical sections latencies
+ */
+ stop_critical_timings();
+
+ /*
+ * Tell the RCU framework we are entering an idle section,
+ * so no more rcu read side critical sections and one more
+ * step to the grace period
+ */
+ rcu_idle_enter();
+
+ /*
+ * Check if the cpuidle framework is ready, otherwise fallback
+ * to the default arch specific idle method
+ */
+ ret = cpuidle_enabled(drv, dev);
+
+ if (!ret) {
+ /*
+ * Ask the governor to choose an idle state it thinks
+ * it is convenient to go to. There is *always* a
+ * convenient idle state
+ */
+ next_state = cpuidle_select(drv, dev);
+
+ /*
+ * The idle task must be scheduled, it is pointless to
+ * go to idle, just update no idle residency and get
+ * out of this function
+ */
+ if (current_clr_polling_and_test()) {
+ dev->last_residency = 0;
+ entered_state = next_state;
+ local_irq_enable();
+ } else {
+ broadcast = !!(drv->states[next_state].flags &
+ CPUIDLE_FLAG_TIMER_STOP);
+
+ if (broadcast)
+ /*
+ * Tell the time framework to switch
+ * to a broadcast timer because our
+ * local timer will be shutdown. If a
+ * local timer is used from another
+ * cpu as a broadcast timer, this call
+ * may fail if it is not available
+ */
+ ret = clockevents_notify(
+ CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
+ &dev->cpu);
+
+ if (!ret) {
+ trace_cpu_idle_rcuidle(next_state, dev->cpu);
+
+ /*
+ * Enter the idle state previously
+ * returned by the governor
+ * decision. This function will block
+ * until an interrupt occurs and will
+ * take care of re-enabling the local
+ * interrupts
+ */
+ entered_state = cpuidle_enter(drv, dev,
+ next_state);
+
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT,
+ dev->cpu);
+
+ if (broadcast)
+ clockevents_notify(
+ CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
+ &dev->cpu);
+
+ /*
+ * Give the governor an opportunity to reflect on the
+ * outcome
+ */
+ cpuidle_reflect(dev, entered_state);
+ }
+ }
+ }
+
+ /*
+ * We can't use the cpuidle framework, let's use the default
+ * idle routine
+ */
+ if (ret)
+ arch_cpu_idle();
+
+ __current_set_polling();
+
+ /*
+ * It is up to the idle functions to enable back the local
+ * interrupt
+ */
+ if (WARN_ON_ONCE(irqs_disabled()))
+ local_irq_enable();
+
+ rcu_idle_exit();
+ start_critical_timings();
+
+ return 0;
+}
+
/*
* Generic idle loop implementation
*/
@@ -90,23 +220,11 @@ static void cpu_idle_loop(void)
* know that the IPI is going to arrive right
* away
*/
- if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ if (cpu_idle_force_poll || tick_check_broadcast_expired())
cpu_idle_poll();
- } else {
- if (!current_clr_polling_and_test()) {
- stop_critical_timings();
- rcu_idle_enter();
- if (cpuidle_idle_call())
- arch_cpu_idle();
- if (WARN_ON_ONCE(irqs_disabled()))
- local_irq_enable();
- rcu_idle_exit();
- start_critical_timings();
- } else {
- local_irq_enable();
- }
- __current_set_polling();
- }
+ else
+ cpuidle_idle_call();
+
arch_cpu_idle_exit();
}
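
After this consolidation the generic idle loop simply alternates between cpu_idle_poll() and cpuidle_idle_call(), with all cpuidle, RCU and broadcast-timer handling moved into the new function. A comment-only digest of the call order implemented above:

/*
 * cpuidle_idle_call() flow:
 *   current_clr_polling_and_test()?  reschedule pending, bail out
 *   stop_critical_timings(); rcu_idle_enter();
 *   cpuidle usable?
 *     yes: cpuidle_select(), optional broadcast-timer ENTER,
 *          cpuidle_enter() blocks until an interrupt,
 *          broadcast-timer EXIT, cpuidle_reflect()
 *     no:  arch_cpu_idle()
 *   __current_set_polling();
 *   rcu_idle_exit(); start_critical_timings();
 */
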
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index da98af347e8b..a476bea17fbc 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -142,4 +142,4 @@ static int __init proc_schedstat_init(void)
proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
return 0;
}
-module_init(proc_schedstat_init);
+subsys_initcall(proc_schedstat_init);
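
For built-in code, module_init() expands to device_initcall(), boot initcall level 6; subsys_initcall() runs at level 4, so /proc/schedstat is now registered earlier in boot, before device probing. Skeleton of the idiom (the demo name is illustrative):

#include <linux/init.h>

static int __init demo_init(void)
{
        /* registration work goes here */
        return 0;
}
/* level 4: runs before fs_initcall (5) and device_initcall (6) */
subsys_initcall(demo_init);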