diff options
Diffstat (limited to 'kernel/sched/topology.c')
-rw-r--r-- | kernel/sched/topology.c | 99 |
1 files changed, 90 insertions, 9 deletions
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b77ad49dc14f..30169c7685b6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -526,7 +526,7 @@ static int init_rootdomain(struct root_domain *rd) #ifdef HAVE_RT_PUSH_IPI rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); - init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); + rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); #endif rd->visit_gen = 0; @@ -688,7 +688,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; - int numa_distance = 0; /* Remove the sched domains which do not contribute to scheduling. */ for (tmp = sd; tmp; ) { @@ -716,13 +715,22 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = sd; sd = sd->parent; destroy_sched_domain(tmp); - if (sd) + if (sd) { + struct sched_group *sg = sd->groups; + + /* + * sched groups hold the flags of the child sched + * domain for convenience. Clear such flags since + * the child is being destroyed. + */ + do { + sg->flags = 0; + } while (sg != sd->groups); + sd->child = NULL; + } } - for (tmp = sd; tmp; tmp = tmp->parent) - numa_distance += !!(tmp->flags & SD_NUMA); - sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); @@ -916,10 +924,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) return NULL; sg_span = sched_group_span(sg); - if (sd->child) + if (sd->child) { cpumask_copy(sg_span, sched_domain_span(sd->child)); - else + sg->flags = sd->child->flags; + } else { cpumask_copy(sg_span, sched_domain_span(sd)); + } atomic_inc(&sg->ref); return sg; @@ -1169,6 +1179,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd) if (child) { cpumask_copy(sched_group_span(sg), sched_domain_span(child)); cpumask_copy(group_balance_mask(sg), sched_group_span(sg)); + sg->flags = child->flags; } else { cpumask_set_cpu(cpu, sched_group_span(sg)); cpumask_set_cpu(cpu, group_balance_mask(sg)); @@ -1482,6 +1493,8 @@ int sched_max_numa_distance; static int *sched_domains_numa_distance; static struct cpumask ***sched_domains_numa_masks; int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; + +static unsigned long __read_mostly *sched_numa_onlined_nodes; #endif /* @@ -1555,7 +1568,7 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, .max_newidle_lb_cost = 0, - .next_decay_max_lb_cost = jiffies, + .last_decay_max_lb_cost = jiffies, .child = child, #ifdef CONFIG_SCHED_DEBUG .name = tl->name, @@ -1625,6 +1638,11 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif + +#ifdef CONFIG_SCHED_CLUSTER + { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, +#endif + #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif @@ -1833,6 +1851,16 @@ void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + /* + * Distance information can be unreliable for + * offline nodes, defer building the node + * masks to its bringup. + * This relies on all unique distance values + * still being visible at init time. + */ + if (!node_online(j)) + continue; + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); @@ -1886,6 +1914,53 @@ void sched_init_numa(void) sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); + + sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL); + if (!sched_numa_onlined_nodes) + return; + + bitmap_zero(sched_numa_onlined_nodes, nr_node_ids); + for_each_online_node(i) + bitmap_set(sched_numa_onlined_nodes, i, 1); +} + +static void __sched_domains_numa_masks_set(unsigned int node) +{ + int i, j; + + /* + * NUMA masks are not built for offline nodes in sched_init_numa(). + * Thus, when a CPU of a never-onlined-before node gets plugged in, + * adding that new CPU to the right NUMA masks is not sufficient: the + * masks of that CPU's node must also be updated. + */ + if (test_bit(node, sched_numa_onlined_nodes)) + return; + + bitmap_set(sched_numa_onlined_nodes, node, 1); + + for (i = 0; i < sched_domains_numa_levels; i++) { + for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j) || node == j) + continue; + + if (node_distance(j, node) > sched_domains_numa_distance[i]) + continue; + + /* Add remote nodes in our masks */ + cpumask_or(sched_domains_numa_masks[i][node], + sched_domains_numa_masks[i][node], + sched_domains_numa_masks[0][j]); + } + } + + /* + * A new node has been brought up, potentially changing the topology + * classification. + * + * Note that this is racy vs any use of sched_numa_topology_type :/ + */ + init_numa_topology_type(); } void sched_domains_numa_masks_set(unsigned int cpu) @@ -1893,8 +1968,14 @@ void sched_domains_numa_masks_set(unsigned int cpu) int node = cpu_to_node(cpu); int i, j; + __sched_domains_numa_masks_set(node); + for (i = 0; i < sched_domains_numa_levels; i++) { for (j = 0; j < nr_node_ids; j++) { + if (!node_online(j)) + continue; + + /* Set ourselves in the remote node's masks */ if (node_distance(j, node) <= sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } |