diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 603 |
1 files changed, 380 insertions, 223 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2048138ce54b..533547e3c90a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -551,7 +551,11 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) static inline bool entity_before(const struct sched_entity *a, const struct sched_entity *b) { - return (s64)(a->vruntime - b->vruntime) < 0; + /* + * Tiebreak on vruntime seems unnecessary since it can + * hardly happen. + */ + return (s64)(a->deadline - b->deadline) < 0; } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -720,7 +724,7 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) * Note: using 'avg_vruntime() > se->vruntime' is inacurate due * to the loss in precision caused by the division. */ -int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { struct sched_entity *curr = cfs_rq->curr; s64 avg = cfs_rq->avg_vruntime; @@ -733,7 +737,12 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) load += weight; } - return avg >= entity_key(cfs_rq, se) * load; + return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; +} + +int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return vruntime_eligible(cfs_rq, se->vruntime); } static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) @@ -752,9 +761,8 @@ static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) static void update_min_vruntime(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *se = __pick_root_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; if (curr) { @@ -766,9 +774,9 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) if (se) { if (!curr) - vruntime = se->vruntime; + vruntime = se->min_vruntime; else - vruntime = min_vruntime(vruntime, se->vruntime); + vruntime = min_vruntime(vruntime, se->min_vruntime); } /* ensure we never gain time by being placed backwards. */ @@ -781,34 +789,34 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) return entity_before(__node_2_se(a), __node_2_se(b)); } -#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) +#define vruntime_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) -static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) +static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { if (node) { struct sched_entity *rse = __node_2_se(node); - if (deadline_gt(min_deadline, se, rse)) - se->min_deadline = rse->min_deadline; + if (vruntime_gt(min_vruntime, se, rse)) + se->min_vruntime = rse->min_vruntime; } } /* - * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ -static inline bool min_deadline_update(struct sched_entity *se, bool exit) +static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { - u64 old_min_deadline = se->min_deadline; + u64 old_min_vruntime = se->min_vruntime; struct rb_node *node = &se->run_node; - se->min_deadline = se->deadline; - __update_min_deadline(se, node->rb_right); - __update_min_deadline(se, node->rb_left); + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); - return se->min_deadline == old_min_deadline; + return se->min_vruntime == old_min_vruntime; } -RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, - run_node, min_deadline, min_deadline_update); +RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, + run_node, min_vruntime, min_vruntime_update); /* * Enqueue an entity into the rb-tree: @@ -816,18 +824,28 @@ RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); - se->min_deadline = se->deadline; + se->min_vruntime = se->vruntime; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - __entity_less, &min_deadline_cb); + __entity_less, &min_vruntime_cb); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, - &min_deadline_cb); + &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); } +struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *root = cfs_rq->tasks_timeline.rb_root.rb_node; + + if (!root) + return NULL; + + return __node_2_se(root); +} + struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); @@ -850,23 +868,29 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) * with the earliest virtual deadline. * * We can do this in O(log n) time due to an augmented RB-tree. The - * tree keeps the entries sorted on service, but also functions as a - * heap based on the deadline by keeping: + * tree keeps the entries sorted on deadline, but also functions as a + * heap based on the vruntime by keeping: * - * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) + * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) * - * Which allows an EDF like search on (sub)trees. + * Which allows tree pruning through eligibility. */ -static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq) +static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *curr = cfs_rq->curr; struct sched_entity *best = NULL; - struct sched_entity *best_left = NULL; + + /* + * We can safely skip eligibility check if there is only one entity + * in this cfs_rq, saving some cycles. + */ + if (cfs_rq->nr_running == 1) + return curr && curr->on_rq ? curr : se; if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; - best = curr; /* * Once selected, run a task until it either becomes non-eligible or @@ -875,95 +899,45 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq) if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) return curr; + /* Pick the leftmost entity if it's eligible */ + if (se && entity_eligible(cfs_rq, se)) { + best = se; + goto found; + } + + /* Heap search for the EEVD entity */ while (node) { - struct sched_entity *se = __node_2_se(node); + struct rb_node *left = node->rb_left; /* - * If this entity is not eligible, try the left subtree. + * Eligible entities in left subtree are always better + * choices, since they have earlier deadlines. */ - if (!entity_eligible(cfs_rq, se)) { - node = node->rb_left; + if (left && vruntime_eligible(cfs_rq, + __node_2_se(left)->min_vruntime)) { + node = left; continue; } - /* - * Now we heap search eligible trees for the best (min_)deadline - */ - if (!best || deadline_gt(deadline, best, se)) - best = se; + se = __node_2_se(node); /* - * Every se in a left branch is eligible, keep track of the - * branch with the best min_deadline + * The left subtree either is empty or has no eligible + * entity, so check the current node since it is the one + * with earliest deadline that might be eligible. */ - if (node->rb_left) { - struct sched_entity *left = __node_2_se(node->rb_left); - - if (!best_left || deadline_gt(min_deadline, best_left, left)) - best_left = left; - - /* - * min_deadline is in the left branch. rb_left and all - * descendants are eligible, so immediately switch to the second - * loop. - */ - if (left->min_deadline == se->min_deadline) - break; - } - - /* min_deadline is at this node, no need to look right */ - if (se->deadline == se->min_deadline) + if (entity_eligible(cfs_rq, se)) { + best = se; break; - - /* else min_deadline is in the right branch. */ - node = node->rb_right; - } - - /* - * We ran into an eligible node which is itself the best. - * (Or nr_running == 0 and both are NULL) - */ - if (!best_left || (s64)(best_left->min_deadline - best->deadline) > 0) - return best; - - /* - * Now best_left and all of its children are eligible, and we are just - * looking for deadline == min_deadline - */ - node = &best_left->run_node; - while (node) { - struct sched_entity *se = __node_2_se(node); - - /* min_deadline is the current node */ - if (se->deadline == se->min_deadline) - return se; - - /* min_deadline is in the left branch */ - if (node->rb_left && - __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { - node = node->rb_left; - continue; } - /* else min_deadline is in the right branch */ node = node->rb_right; } - return NULL; -} - -static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = __pick_eevdf(cfs_rq); - - if (!se) { - struct sched_entity *left = __pick_first_entity(cfs_rq); - if (left) { - pr_err("EEVDF scheduling fail, picking leftmost\n"); - return left; - } - } +found: + if (!best || (curr && entity_before(curr, best))) + best = curr; - return se; + return best; } #ifdef CONFIG_SCHED_DEBUG @@ -1129,23 +1103,17 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ -/* - * Update the current task's runtime statistics. - */ -static void update_curr(struct cfs_rq *cfs_rq) +static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { - struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_clock_task(rq_of(cfs_rq)); - u64 delta_exec; - - if (unlikely(!curr)) - return; + u64 now = rq_clock_task(rq); + s64 delta_exec; delta_exec = now - curr->exec_start; - if (unlikely((s64)delta_exec <= 0)) - return; + if (unlikely(delta_exec <= 0)) + return delta_exec; curr->exec_start = now; + curr->sum_exec_runtime += delta_exec; if (schedstat_enabled()) { struct sched_statistics *stats; @@ -1155,20 +1123,54 @@ static void update_curr(struct cfs_rq *cfs_rq) max(delta_exec, stats->exec_max)); } - curr->sum_exec_runtime += delta_exec; - schedstat_add(cfs_rq->exec_clock, delta_exec); + return delta_exec; +} + +static inline void update_curr_task(struct task_struct *p, s64 delta_exec) +{ + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); + cgroup_account_cputime(p, delta_exec); + if (p->dl_server) + dl_server_update(p->dl_server, delta_exec); +} + +/* + * Used by other classes to account runtime. + */ +s64 update_curr_common(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + s64 delta_exec; + + delta_exec = update_curr_se(rq, &curr->se); + if (likely(delta_exec > 0)) + update_curr_task(curr, delta_exec); + + return delta_exec; +} + +/* + * Update the current task's runtime statistics. + */ +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + s64 delta_exec; + + if (unlikely(!curr)) + return; + + delta_exec = update_curr_se(rq_of(cfs_rq), curr); + if (unlikely(delta_exec <= 0)) + return; curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); - - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cgroup_account_cputime(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); - } + if (entity_is_task(curr)) + update_curr_task(task_of(curr), delta_exec); account_cfs_rq_runtime(cfs_rq, delta_exec); } @@ -3164,7 +3166,7 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) * This is also done to avoid any side effect of task scanning * amplifying the unfairness of disjoint set of VMAs' access. */ - if (READ_ONCE(current->mm->numa_scan_seq) < 2) + if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2) return true; pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; @@ -3307,6 +3309,8 @@ retry_pids: if (!vma->numab_state) continue; + vma->numab_state->start_scan_seq = mm->numa_scan_seq; + vma->numab_state->next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); @@ -3666,41 +3670,140 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + unsigned long old_weight = se->load.weight; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag, vslice; + + /* + * VRUNTIME + * ======== + * + * COROLLARY #1: The virtual runtime of the entity needs to be + * adjusted if re-weight at !0-lag point. + * + * Proof: For contradiction assume this is not true, so we can + * re-weight without changing vruntime at !0-lag point. + * + * Weight VRuntime Avg-VRuntime + * before w v V + * after w' v' V' + * + * Since lag needs to be preserved through re-weight: + * + * lag = (V - v)*w = (V'- v')*w', where v = v' + * ==> V' = (V - v)*w/w' + v (1) + * + * Let W be the total weight of the entities before reweight, + * since V' is the new weighted average of entities: + * + * V' = (WV + w'v - wv) / (W + w' - w) (2) + * + * by using (1) & (2) we obtain: + * + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) + * + * Since we are doing at !0-lag point which means V != v, we + * can simplify (3): + * + * ==> W / (W + w' - w) = w / w' + * ==> Ww' = Ww + ww' - ww + * ==> W * (w' - w) = w * (w' - w) + * ==> W = w (re-weight indicates w' != w) + * + * So the cfs_rq contains only one entity, hence vruntime of + * the entity @v should always equal to the cfs_rq's weighted + * average vruntime @V, which means we will always re-weight + * at 0-lag point, thus breach assumption. Proof completed. + * + * + * COROLLARY #2: Re-weight does NOT affect weighted average + * vruntime of all the entities. + * + * Proof: According to corollary #1, Eq. (1) should be: + * + * (V - v)*w = (V' - v')*w' + * ==> v' = V' - (V - v)*w/w' (4) + * + * According to the weighted average formula, we have: + * + * V' = (WV - wv + w'v') / (W - w + w') + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') + * = (WV + w'V' - Vw) / (W - w + w') + * + * ==> V'*(W - w + w') = WV + w'V' - Vw + * ==> V' * (W - w) = (W - w) * V (5) + * + * If the entity is the only one in the cfs_rq, then reweight + * always occurs at 0-lag point, so V won't change. Or else + * there are other entities, hence W != w, then Eq. (5) turns + * into V' = V. So V won't change in either case, proof done. + * + * + * So according to corollary #1 & #2, the effect of re-weight + * on vruntime should be: + * + * v' = V' - (V - v) * w / w' (4) + * = V - (V - v) * w / w' + * = V - vl * w / w' + * = V - vl' + */ + if (avruntime != se->vruntime) { + vlag = (s64)(avruntime - se->vruntime); + vlag = div_s64(vlag * old_weight, weight); + se->vruntime = avruntime - vlag; + } + + /* + * DEADLINE + * ======== + * + * When the weight changes, the virtual time slope changes and + * we should adjust the relative virtual deadline accordingly. + * + * d' = v' + (d - v)*w/w' + * = V' - (V - v)*w/w' + (d - v)*w/w' + * = V - (V - v)*w/w' + (d - v)*w/w' + * = V + (d - V)*w/w' + */ + vslice = (s64)(se->deadline - avruntime); + vslice = div_s64(vslice * old_weight, weight); + se->deadline = avruntime + vslice; +} + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { - unsigned long old_weight = se->load.weight; + bool curr = cfs_rq->curr == se; if (se->on_rq) { /* commit outstanding execution time */ - if (cfs_rq->curr == se) + if (curr) update_curr(cfs_rq); else - avg_vruntime_sub(cfs_rq, se); + __dequeue_entity(cfs_rq, se); update_load_sub(&cfs_rq->load, se->load.weight); } dequeue_load_avg(cfs_rq, se); - update_load_set(&se->load, weight); - if (!se->on_rq) { /* * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), * we need to scale se->vlag when w_i changes. */ - se->vlag = div_s64(se->vlag * old_weight, weight); + se->vlag = div_s64(se->vlag * se->load.weight, weight); } else { - s64 deadline = se->deadline - se->vruntime; - /* - * When the weight changes, the virtual time slope changes and - * we should adjust the relative virtual deadline accordingly. - */ - deadline = div_s64(deadline * old_weight, weight); - se->deadline = se->vruntime + deadline; - if (se != cfs_rq->curr) - min_deadline_cb_propagate(&se->run_node, NULL); + reweight_eevdf(cfs_rq, se, weight); } + update_load_set(&se->load, weight); + #ifdef CONFIG_SMP do { u32 divider = get_pelt_divider(&se->avg); @@ -3712,8 +3815,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, enqueue_load_avg(cfs_rq, se); if (se->on_rq) { update_load_add(&cfs_rq->load, se->load.weight); - if (cfs_rq->curr != se) - avg_vruntime_add(cfs_rq, se); + if (!curr) + __enqueue_entity(cfs_rq, se); + + /* + * The entity's vruntime has been adjusted, so let's check + * whether the rq-wide min_vruntime needs updated too. Since + * the calculations above require stable min_vruntime rather + * than up-to-date one, we do the update at the end of the + * reweight process. + */ + update_min_vruntime(cfs_rq); } } @@ -3857,14 +3969,11 @@ static void update_cfs_group(struct sched_entity *se) #ifndef CONFIG_SMP shares = READ_ONCE(gcfs_rq->tg->shares); - - if (likely(se->load.weight == shares)) - return; #else - shares = calc_group_shares(gcfs_rq); + shares = calc_group_shares(gcfs_rq); #endif - - reweight_entity(cfs_rq_of(se), se, shares); + if (unlikely(se->load.weight != shares)) + reweight_entity(cfs_rq_of(se), se, shares); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -3991,6 +4100,10 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; + /* rq has been offline and doesn't contribute to the share anymore: */ + if (!cpu_active(cpu_of(rq_of(cfs_rq)))) + return; + /* * For migration heavy workloads, access to tg->load_avg can be * unbound. Limit the update rate to at most once per ms. @@ -4007,6 +4120,49 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) } } +static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) +{ + long delta; + u64 now; + + /* + * No need to update load_avg for root_task_group, as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + delta = 0 - cfs_rq->tg_load_avg_contrib; + atomic_long_add(delta, &cfs_rq->tg->load_avg); + cfs_rq->tg_load_avg_contrib = 0; + cfs_rq->last_update_tg_load_avg = now; +} + +/* CPU offline callback: */ +static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) +{ + struct task_group *tg; + + lockdep_assert_rq_held(rq); + + /* + * The rq clock has already been updated in + * set_rq_offline(), so we should skip updating + * the rq clock again in unthrottle_cfs_rq(). + */ + rq_clock_start_loop_update(rq); + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + clear_tg_load_avg(cfs_rq); + } + rcu_read_unlock(); + + rq_clock_stop_loop_update(rq); +} + /* * Called within set_task_rq() right before setting a task's CPU. The * caller only guarantees p->pi_lock is held; no other assumptions, @@ -4303,6 +4459,8 @@ static inline bool skip_blocked_update(struct sched_entity *se) static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} +static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {} + static inline int propagate_entity_load_avg(struct sched_entity *se) { return 0; @@ -4665,11 +4823,14 @@ static inline unsigned long task_util(struct task_struct *p) return READ_ONCE(p->se.avg.util_avg); } -static inline unsigned long _task_util_est(struct task_struct *p) +static inline unsigned long task_runnable(struct task_struct *p) { - struct util_est ue = READ_ONCE(p->se.avg.util_est); + return READ_ONCE(p->se.avg.runnable_avg); +} - return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); +static inline unsigned long _task_util_est(struct task_struct *p) +{ + return READ_ONCE(p->se.avg.util_est) & ~UTIL_AVG_UNCHANGED; } static inline unsigned long task_util_est(struct task_struct *p) @@ -4686,9 +4847,9 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, return; /* Update root cfs_rq's estimated utilization */ - enqueued = cfs_rq->avg.util_est.enqueued; + enqueued = cfs_rq->avg.util_est; enqueued += _task_util_est(p); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); trace_sched_util_est_cfs_tp(cfs_rq); } @@ -4702,34 +4863,20 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq, return; /* Update root cfs_rq's estimated utilization */ - enqueued = cfs_rq->avg.util_est.enqueued; + enqueued = cfs_rq->avg.util_est; enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + WRITE_ONCE(cfs_rq->avg.util_est, enqueued); trace_sched_util_est_cfs_tp(cfs_rq); } #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100) -/* - * Check if a (signed) value is within a specified (unsigned) margin, - * based on the observation that: - * - * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1) - * - * NOTE: this only works when value + margin < INT_MAX. - */ -static inline bool within_margin(int value, int margin) -{ - return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); -} - static inline void util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { - long last_ewma_diff, last_enqueued_diff; - struct util_est ue; + unsigned int ewma, dequeued, last_ewma_diff; if (!sched_feat(UTIL_EST)) return; @@ -4741,71 +4888,73 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, if (!task_sleep) return; + /* Get current estimate of utilization */ + ewma = READ_ONCE(p->se.avg.util_est); + /* * If the PELT values haven't changed since enqueue time, * skip the util_est update. */ - ue = p->se.avg.util_est; - if (ue.enqueued & UTIL_AVG_UNCHANGED) + if (ewma & UTIL_AVG_UNCHANGED) return; - last_enqueued_diff = ue.enqueued; + /* Get utilization at dequeue */ + dequeued = task_util(p); /* * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. */ - ue.enqueued = task_util(p); - if (sched_feat(UTIL_EST_FASTUP)) { - if (ue.ewma < ue.enqueued) { - ue.ewma = ue.enqueued; - goto done; - } + if (ewma <= dequeued) { + ewma = dequeued; + goto done; } /* * Skip update of task's estimated utilization when its members are * already ~1% close to its last activation value. */ - last_ewma_diff = ue.enqueued - ue.ewma; - last_enqueued_diff -= ue.enqueued; - if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) { - if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN)) - goto done; - - return; - } + last_ewma_diff = ewma - dequeued; + if (last_ewma_diff < UTIL_EST_MARGIN) + goto done; /* * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) + if (dequeued > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; /* + * To avoid underestimate of task utilization, skip updates of EWMA if + * we cannot grant that thread got all CPU time it wanted. + */ + if ((dequeued + UTIL_EST_MARGIN) < task_runnable(p)) + goto done; + + + /* * Update Task's estimated utilization * * When *p completes an activation we can consolidate another sample - * of the task size. This is done by storing the current PELT value - * as ue.enqueued and by using this value to update the Exponential - * Weighted Moving Average (EWMA): + * of the task size. This is done by using this value to update the + * Exponential Weighted Moving Average (EWMA): * * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1) * = w * task_util(p) + ewma(t-1) - w * ewma(t-1) * = w * (task_util(p) - ewma(t-1)) + ewma(t-1) - * = w * ( last_ewma_diff ) + ewma(t-1) - * = w * (last_ewma_diff + ewma(t-1) / w) + * = w * ( -last_ewma_diff ) + ewma(t-1) + * = w * (-last_ewma_diff + ewma(t-1) / w) * * Where 'w' is the weight of new samples, which is configured to be * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT) */ - ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; - ue.ewma += last_ewma_diff; - ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; + ewma <<= UTIL_EST_WEIGHT_SHIFT; + ewma -= last_ewma_diff; + ewma >>= UTIL_EST_WEIGHT_SHIFT; done: - ue.enqueued |= UTIL_AVG_UNCHANGED; - WRITE_ONCE(p->se.avg.util_est, ue); + ewma |= UTIL_AVG_UNCHANGED; + WRITE_ONCE(p->se.avg.util_est, ewma); trace_sched_util_est_se_tp(&p->se); } @@ -7533,16 +7682,16 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) if (sched_feat(UTIL_EST)) { unsigned long util_est; - util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); + util_est = READ_ONCE(cfs_rq->avg.util_est); /* * During wake-up @p isn't enqueued yet and doesn't contribute - * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued. + * to any cpu_rq(cpu)->cfs.avg.util_est. * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p * has been enqueued. * * During exec (@dst_cpu = -1) @p is enqueued and does - * contribute to cpu_rq(cpu)->cfs.util_est.enqueued. + * contribute to cpu_rq(cpu)->cfs.util_est. * Remove it to "simulate" cpu_util without @p's contribution. * * Despite the task_on_rq_queued(@p) check there is still a @@ -7671,7 +7820,7 @@ static inline void eenv_pd_busy_time(struct energy_env *eenv, for_each_cpu(cpu, pd_cpus) { unsigned long util = cpu_util(cpu, p, -1, 0); - busy_time += effective_cpu_util(cpu, util, ENERGY_UTIL, NULL); + busy_time += effective_cpu_util(cpu, util, NULL, NULL); } eenv->pd_busy_time = min(eenv->pd_cap, busy_time); @@ -7694,7 +7843,7 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, for_each_cpu(cpu, pd_cpus) { struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL; unsigned long util = cpu_util(cpu, p, dst_cpu, 1); - unsigned long eff_util; + unsigned long eff_util, min, max; /* * Performance domain frequency: utilization clamping @@ -7703,7 +7852,23 @@ eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - eff_util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk); + eff_util = effective_cpu_util(cpu, util, &min, &max); + + /* Task's uclamp can modify min and max value */ + if (tsk && uclamp_is_used()) { + min = max(min, uclamp_eff_value(p, UCLAMP_MIN)); + + /* + * If there is no active max uclamp constraint, + * directly use task's one, otherwise keep max. + */ + if (uclamp_rq_is_idle(cpu_rq(cpu))) + max = uclamp_eff_value(p, UCLAMP_MAX); + else + max = max(max, uclamp_eff_value(p, UCLAMP_MAX)); + } + + eff_util = sugov_effective_cpu_perf(cpu, eff_util, min, max); max_util = max(max_util, eff_util); } @@ -8105,7 +8270,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); - int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; if (unlikely(se == pse)) @@ -8122,7 +8286,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { set_next_buddy(pse); - next_buddy_marked = 1; } /* @@ -8955,7 +9118,7 @@ static int detach_tasks(struct lb_env *env) case migrate_util: util = task_util_est(p); - if (util > env->imbalance) + if (shr_bound(util, env->sd->nr_balance_failed) > env->imbalance) goto next; env->imbalance -= util; @@ -11079,12 +11242,16 @@ static int should_we_balance(struct lb_env *env) continue; } - /* Are we the first idle CPU? */ + /* + * Are we the first idle core in a non-SMT domain or higher, + * or the first idle CPU in a SMT domain? + */ return cpu == env->dst_cpu; } - if (idle_smt == env->dst_cpu) - return true; + /* Are we the first idle CPU with busy siblings? */ + if (idle_smt != -1) + return idle_smt == env->dst_cpu; /* Are we the first CPU of this group ? */ return group_balance_cpu(sg) == env->dst_cpu; @@ -12304,6 +12471,9 @@ static void rq_offline_fair(struct rq *rq) /* Ensure any throttled groups are reachable by pick_next_task */ unthrottle_offline_cfs_rqs(rq); + + /* Ensure that we remove rq contribution to group share: */ + clear_tg_offline_cfs_rqs(rq); } #endif /* CONFIG_SMP */ @@ -12927,19 +13097,6 @@ next_cpu: return 0; } -#else /* CONFIG_FAIR_GROUP_SCHED */ - -void free_fair_sched_group(struct task_group *tg) { } - -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -void online_fair_sched_group(struct task_group *tg) { } - -void unregister_fair_sched_group(struct task_group *tg) { } - #endif /* CONFIG_FAIR_GROUP_SCHED */ |