summaryrefslogtreecommitdiffstats
path: root/kernel/sched/topology.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/topology.c')
-rw-r--r--kernel/sched/topology.c99
1 files changed, 90 insertions, 9 deletions
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b77ad49dc14f..30169c7685b6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -526,7 +526,7 @@ static int init_rootdomain(struct root_domain *rd)
#ifdef HAVE_RT_PUSH_IPI
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func);
#endif
rd->visit_gen = 0;
@@ -688,7 +688,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
- int numa_distance = 0;
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
@@ -716,13 +715,22 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp = sd;
sd = sd->parent;
destroy_sched_domain(tmp);
- if (sd)
+ if (sd) {
+ struct sched_group *sg = sd->groups;
+
+ /*
+ * sched groups hold the flags of the child sched
+ * domain for convenience. Clear such flags since
+ * the child is being destroyed.
+ */
+ do {
+ sg->flags = 0;
+ } while (sg != sd->groups);
+
sd->child = NULL;
+ }
}
- for (tmp = sd; tmp; tmp = tmp->parent)
- numa_distance += !!(tmp->flags & SD_NUMA);
-
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
@@ -916,10 +924,12 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
return NULL;
sg_span = sched_group_span(sg);
- if (sd->child)
+ if (sd->child) {
cpumask_copy(sg_span, sched_domain_span(sd->child));
- else
+ sg->flags = sd->child->flags;
+ } else {
cpumask_copy(sg_span, sched_domain_span(sd));
+ }
atomic_inc(&sg->ref);
return sg;
@@ -1169,6 +1179,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
if (child) {
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ sg->flags = child->flags;
} else {
cpumask_set_cpu(cpu, sched_group_span(sg));
cpumask_set_cpu(cpu, group_balance_mask(sg));
@@ -1482,6 +1493,8 @@ int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
+static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
/*
@@ -1555,7 +1568,7 @@ sd_init(struct sched_domain_topology_level *tl,
.last_balance = jiffies,
.balance_interval = sd_weight,
.max_newidle_lb_cost = 0,
- .next_decay_max_lb_cost = jiffies,
+ .last_decay_max_lb_cost = jiffies,
.child = child,
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
@@ -1625,6 +1638,11 @@ static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
+
+#ifdef CONFIG_SCHED_CLUSTER
+ { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
+#endif
+
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
@@ -1833,6 +1851,16 @@ void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;
for_each_node(k) {
+ /*
+ * Distance information can be unreliable for
+ * offline nodes, defer building the node
+ * masks to its bringup.
+ * This relies on all unique distance values
+ * still being visible at init time.
+ */
+ if (!node_online(j))
+ continue;
+
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
@@ -1886,6 +1914,53 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type();
+
+ sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
+ if (!sched_numa_onlined_nodes)
+ return;
+
+ bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
+ for_each_online_node(i)
+ bitmap_set(sched_numa_onlined_nodes, i, 1);
+}
+
+static void __sched_domains_numa_masks_set(unsigned int node)
+{
+ int i, j;
+
+ /*
+ * NUMA masks are not built for offline nodes in sched_init_numa().
+ * Thus, when a CPU of a never-onlined-before node gets plugged in,
+ * adding that new CPU to the right NUMA masks is not sufficient: the
+ * masks of that CPU's node must also be updated.
+ */
+ if (test_bit(node, sched_numa_onlined_nodes))
+ return;
+
+ bitmap_set(sched_numa_onlined_nodes, node, 1);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_online(j) || node == j)
+ continue;
+
+ if (node_distance(j, node) > sched_domains_numa_distance[i])
+ continue;
+
+ /* Add remote nodes in our masks */
+ cpumask_or(sched_domains_numa_masks[i][node],
+ sched_domains_numa_masks[i][node],
+ sched_domains_numa_masks[0][j]);
+ }
+ }
+
+ /*
+ * A new node has been brought up, potentially changing the topology
+ * classification.
+ *
+ * Note that this is racy vs any use of sched_numa_topology_type :/
+ */
+ init_numa_topology_type();
}
void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1893,8 +1968,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)
int node = cpu_to_node(cpu);
int i, j;
+ __sched_domains_numa_masks_set(node);
+
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
+ if (!node_online(j))
+ continue;
+
+ /* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}