IB/hfi1: Add global structure for affinity assignments

When HFI units get initialized, they each use their own mask copy for affinity assignments. On a multi-HFI system, affinity assignments overbook CPU cores because each HFI has no knowledge of the affinity assignments made for the other HFI units. As a result, some CPU cores are never used for interrupt handlers on systems with a high number of CPU cores per NUMA node. For multi-HFI systems, SDMA engine interrupt assignments start over from the first CPU in the local NUMA node after the first HFI initialization. This change allows assignments to continue where the last HFI unit left off. Add a global structure for affinity assignments so that multiple HFIs share one affinity mask. Reviewed-by: Jianxin Xiong <jianxin.xiong@intel.com> Reviewed-by: Jubin John <jubin.john@intel.com> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com> Signed-off-by: Sebastian Sanchez <sebastian.sanchez@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>
author: Dennis Dalessandro <dennis.dalessandro@intel.com> 2016-07-25 16:52:36 +0200
committer: Doug Ledford <dledford@redhat.com> 2016-08-02 21:45:14 +0200
commit: 4197344ba5c2aab24b96f141cb00af9d0471f60b (patch)
tree: aa02fbe9ebd405cdeaa802180a867e16d8daf963 /drivers/infiniband/hw/hfi1/affinity.c
parent: IB/hfi1: Add counter to track unsupported packets drop (diff)
download: linux-4197344ba5c2aab24b96f141cb00af9d0471f60b.tar.xz
linux-4197344ba5c2aab24b96f141cb00af9d0471f60b.zip
1 files changed, 168 insertions, 77 deletions
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 14d7eeb09be6..164769952ff7 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -53,6 +53,11 @@
 #include "sdma.h"
 #include "trace.h"
 
+struct hfi1_affinity_node_list node_affinity = {
+	.list = LIST_HEAD_INIT(node_affinity.list),
+	.lock = __SPIN_LOCK_UNLOCKED(&node_affinity.lock),
+};
+
 /* Name of IRQ types, indexed by enum irq_type */
 static const char * const irq_type_names[] = {
 	"SDMA",
@@ -69,45 +74,100 @@ static inline void init_cpu_mask_set(struct cpu_mask_set *set)
 }
 
 /* Initialize non-HT cpu cores mask */
-int init_real_cpu_mask(struct hfi1_devdata *dd)
+void init_real_cpu_mask(void)
 {
-	struct hfi1_affinity *info;
 	int possible, curr_cpu, i, ht;
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	cpumask_clear(&info->real_cpu_mask);
+	cpumask_clear(&node_affinity.real_cpu_mask);
 
 	/* Start with cpu online mask as the real cpu mask */
-	cpumask_copy(&info->real_cpu_mask, cpu_online_mask);
+	cpumask_copy(&node_affinity.real_cpu_mask, cpu_online_mask);
 
 	/*
 	 * Remove HT cores from the real cpu mask.  Do this in two steps below.
 	 */
-	possible = cpumask_weight(&info->real_cpu_mask);
+	possible = cpumask_weight(&node_affinity.real_cpu_mask);
 	ht = cpumask_weight(topology_sibling_cpumask(
-					cpumask_first(&info->real_cpu_mask)));
+				cpumask_first(&node_affinity.real_cpu_mask)));
 	/*
 	 * Step 1.  Skip over the first N HT siblings and use them as the
 	 * "real" cores.  Assumes that HT cores are not enumerated in
 	 * succession (except in the single core case).
 	 */
-	curr_cpu = cpumask_first(&info->real_cpu_mask);
+	curr_cpu = cpumask_first(&node_affinity.real_cpu_mask);
 	for (i = 0; i < possible / ht; i++)
-		curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
 	/*
 	 * Step 2.  Remove the remaining HT siblings.  Use cpumask_next() to
 	 * skip any gaps.
 	 */
 	for (; i < possible; i++) {
-		cpumask_clear_cpu(curr_cpu, &info->real_cpu_mask);
-		curr_cpu = cpumask_next(curr_cpu, &info->real_cpu_mask);
+		cpumask_clear_cpu(curr_cpu, &node_affinity.real_cpu_mask);
+		curr_cpu = cpumask_next(curr_cpu, &node_affinity.real_cpu_mask);
 	}
+}
 
-	dd->affinity = info;
-	return 0;
+void node_affinity_init(void)
+{
+	cpumask_copy(&node_affinity.proc.mask, cpu_online_mask);
+	/*
+	 * The real cpu mask is part of the affinity struct but it has to be
+	 * initialized early. It is needed to calculate the number of user
+	 * contexts in set_up_context_variables().
+	 */
+	init_real_cpu_mask();
+}
+
+void node_affinity_destroy(void)
+{
+	struct list_head *pos, *q;
+	struct hfi1_affinity_node *entry;
+
+	spin_lock(&node_affinity.lock);
+	list_for_each_safe(pos, q, &node_affinity.list) {
+		entry = list_entry(pos, struct hfi1_affinity_node,
+				   list);
+		list_del(pos);
+		kfree(entry);
+	}
+	spin_unlock(&node_affinity.lock);
+}
+
+static struct hfi1_affinity_node *node_affinity_allocate(int node)
+{
+	struct hfi1_affinity_node *entry;
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return NULL;
+	entry->node = node;
+	INIT_LIST_HEAD(&entry->list);
+
+	return entry;
+}
+
+/*
+ * It appends an entry to the list.
+ * It *must* be called with node_affinity.lock held.
+ */
+static void node_affinity_add_tail(struct hfi1_affinity_node *entry)
+{
+	list_add_tail(&entry->list, &node_affinity.list);
+}
+
+/* It must be called with node_affinity.lock held */
+static struct hfi1_affinity_node *node_affinity_lookup(int node)
+{
+	struct list_head *pos;
+	struct hfi1_affinity_node *entry;
+
+	list_for_each(pos, &node_affinity.list) {
+		entry = list_entry(pos, struct hfi1_affinity_node, list);
+		if (entry->node == node)
+			return entry;
+	}
+
+	return NULL;
 }
 
 /*
@@ -121,10 +181,10 @@ int init_real_cpu_mask(struct hfi1_devdata *dd)
  * to the node relative 1 as necessary.
  *
  */
-void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
+int hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 {
 	int node = pcibus_to_node(dd->pcidev->bus);
-	struct hfi1_affinity *info = dd->affinity;
+	struct hfi1_affinity_node *entry;
 	const struct cpumask *local_mask;
 	int curr_cpu, possible, i;
 
@@ -132,55 +192,75 @@ void hfi1_dev_affinity_init(struct hfi1_devdata *dd)
 		node = numa_node_id();
 	dd->node = node;
 
-	spin_lock_init(&info->lock);
-
-	init_cpu_mask_set(&info->def_intr);
-	init_cpu_mask_set(&info->rcv_intr);
-	init_cpu_mask_set(&info->proc);
-
 	local_mask = cpumask_of_node(dd->node);
 	if (cpumask_first(local_mask) >= nr_cpu_ids)
 		local_mask = topology_core_cpumask(0);
-	/* Use the "real" cpu mask of this node as the default */
-	cpumask_and(&info->def_intr.mask, &info->real_cpu_mask, local_mask);
-
-	/*  fill in the receive list */
-	possible = cpumask_weight(&info->def_intr.mask);
-	curr_cpu = cpumask_first(&info->def_intr.mask);
-	if (possible == 1) {
-		/*  only one CPU, everyone will use it */
-		cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-	} else {
-		/*
-		 * Retain the first CPU in the default list for the control
-		 * context.
-		 */
-		curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-		/*
-		 * Remove the remaining kernel receive queues from
-		 * the default list and add them to the receive list.
-		 */
-		for (i = 0; i < dd->n_krcv_queues - 1; i++) {
-			cpumask_clear_cpu(curr_cpu, &info->def_intr.mask);
-			cpumask_set_cpu(curr_cpu, &info->rcv_intr.mask);
-			curr_cpu = cpumask_next(curr_cpu, &info->def_intr.mask);
-			if (curr_cpu >= nr_cpu_ids)
-				break;
+
+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);
+
+	/*
+	 * If this is the first time this NUMA node's affinity is used,
+	 * create an entry in the global affinity structure and initialize it.
+	 */
+	if (!entry) {
+		entry = node_affinity_allocate(node);
+		if (!entry) {
+			dd_dev_err(dd,
+				   "Unable to allocate global affinity node\n");
+			return -ENOMEM;
 		}
-	}
+		init_cpu_mask_set(&entry->def_intr);
+		init_cpu_mask_set(&entry->rcv_intr);
+		/* Use the "real" cpu mask of this node as the default */
+		cpumask_and(&entry->def_intr.mask, &node_affinity.real_cpu_mask,
+			    local_mask);
+
+		/* fill in the receive list */
+		possible = cpumask_weight(&entry->def_intr.mask);
+		curr_cpu = cpumask_first(&entry->def_intr.mask);
+
+		if (possible == 1) {
+			/* only one CPU, everyone will use it */
+			cpumask_set_cpu(curr_cpu, &entry->rcv_intr.mask);
+		} else {
+			/*
+			 * Retain the first CPU in the default list for the
+			 * control context.
+			 */
+			curr_cpu = cpumask_next(curr_cpu,
+						&entry->def_intr.mask);
 
-	cpumask_copy(&info->proc.mask, cpu_online_mask);
-}
+			/*
+			 * Remove the remaining kernel receive queues from
+			 * the default list and add them to the receive list.
+			 */
+			for (i = 0; i < dd->n_krcv_queues - 1; i++) {
+				cpumask_clear_cpu(curr_cpu,
+						  &entry->def_intr.mask);
+				cpumask_set_cpu(curr_cpu,
+						&entry->rcv_intr.mask);
+				curr_cpu = cpumask_next(curr_cpu,
+							&entry->def_intr.mask);
+				if (curr_cpu >= nr_cpu_ids)
+					break;
+			}
+		}
 
-void hfi1_dev_affinity_free(struct hfi1_devdata *dd)
-{
-	kfree(dd->affinity);
+		spin_lock(&node_affinity.lock);
+		node_affinity_add_tail(entry);
+		spin_unlock(&node_affinity.lock);
+	}
+
+	return 0;
 }
 
 int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 {
 	int ret;
 	cpumask_var_t diff;
+	struct hfi1_affinity_node *entry;
 	struct cpu_mask_set *set;
 	struct sdma_engine *sde = NULL;
 	struct hfi1_ctxtdata *rcd = NULL;
@@ -194,21 +274,25 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 	if (!ret)
 		return -ENOMEM;
 
+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);
+
 	switch (msix->type) {
 	case IRQ_SDMA:
 		sde = (struct sdma_engine *)msix->arg;
 		scnprintf(extra, 64, "engine %u", sde->this_idx);
 		/* fall through */
 	case IRQ_GENERAL:
-		set = &dd->affinity->def_intr;
+		set = &entry->def_intr;
 		break;
 	case IRQ_RCVCTXT:
 		rcd = (struct hfi1_ctxtdata *)msix->arg;
 		if (rcd->ctxt == HFI1_CTRL_CTXT) {
-			set = &dd->affinity->def_intr;
+			set = &entry->def_intr;
 			cpu = cpumask_first(&set->mask);
 		} else {
-			set = &dd->affinity->rcv_intr;
+			set = &entry->rcv_intr;
 		}
 		scnprintf(extra, 64, "ctxt %u", rcd->ctxt);
 		break;
@@ -222,8 +306,8 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 	 * is set above.  Skip accounting for it.  Everything else finds its
 	 * CPU here.
 	 */
-	if (cpu == -1) {
-		spin_lock(&dd->affinity->lock);
+	if (cpu == -1 && set) {
+		spin_lock(&node_affinity.lock);
 		if (cpumask_equal(&set->mask, &set->used)) {
 			/*
 			 * We've used up all the CPUs, bump up the generation
@@ -235,7 +319,7 @@ int hfi1_get_irq_affinity(struct hfi1_devdata *dd, struct hfi1_msix_entry *msix)
 		cpumask_andnot(diff, &set->mask, &set->used);
 		cpu = cpumask_first(diff);
 		cpumask_set_cpu(cpu, &set->used);
-		spin_unlock(&dd->affinity->lock);
+		spin_unlock(&node_affinity.lock);
 	}
 
 	switch (msix->type) {
@@ -263,30 +347,35 @@ void hfi1_put_irq_affinity(struct hfi1_devdata *dd,
 {
 	struct cpu_mask_set *set = NULL;
 	struct hfi1_ctxtdata *rcd;
+	struct hfi1_affinity_node *entry;
+
+	spin_lock(&node_affinity.lock);
+	entry = node_affinity_lookup(dd->node);
+	spin_unlock(&node_affinity.lock);
 
 	switch (msix->type) {
 	case IRQ_SDMA:
 	case IRQ_GENERAL:
-		set = &dd->affinity->def_intr;
+		set = &entry->def_intr;
 		break;
 	case IRQ_RCVCTXT:
 		rcd = (struct hfi1_ctxtdata *)msix->arg;
 		/* only do accounting for non control contexts */
 		if (rcd->ctxt != HFI1_CTRL_CTXT)
-			set = &dd->affinity->rcv_intr;
+			set = &entry->rcv_intr;
 		break;
 	default:
 		return;
 	}
 
 	if (set) {
-		spin_lock(&dd->affinity->lock);
+		spin_lock(&node_affinity.lock);
 		cpumask_andnot(&set->used, &set->used, &msix->mask);
 		if (cpumask_empty(&set->used) && set->gen) {
 			set->gen--;
 			cpumask_copy(&set->used, &set->mask);
 		}
-		spin_unlock(&dd->affinity->lock);
+		spin_unlock(&node_affinity.lock);
 	}
 
 	irq_set_affinity_hint(msix->msix.vector, NULL);
@@ -297,9 +386,10 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 {
 	int cpu = -1, ret;
 	cpumask_var_t diff, mask, intrs;
+	struct hfi1_affinity_node *entry;
 	const struct cpumask *node_mask,
 		*proc_mask = tsk_cpus_allowed(current);
-	struct cpu_mask_set *set = &dd->affinity->proc;
+	struct cpu_mask_set *set = &node_affinity.proc;
 
 	/*
 	 * check whether process/context affinity has already
@@ -338,7 +428,7 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 	if (!ret)
 		goto free_mask;
 
-	spin_lock(&dd->affinity->lock);
+	spin_lock(&node_affinity.lock);
 	/*
 	 * If we've used all available CPUs, clear the mask and start
 	 * overloading.
@@ -348,13 +438,14 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 		cpumask_clear(&set->used);
 	}
 
+	entry = node_affinity_lookup(dd->node);
 	/* CPUs used by interrupt handlers */
-	cpumask_copy(intrs, (dd->affinity->def_intr.gen ?
-			     &dd->affinity->def_intr.mask :
-			     &dd->affinity->def_intr.used));
-	cpumask_or(intrs, intrs, (dd->affinity->rcv_intr.gen ?
-				  &dd->affinity->rcv_intr.mask :
-				  &dd->affinity->rcv_intr.used));
+	cpumask_copy(intrs, (entry->def_intr.gen ?
+			     &entry->def_intr.mask :
+			     &entry->def_intr.used));
+	cpumask_or(intrs, intrs, (entry->rcv_intr.gen ?
+				  &entry->rcv_intr.mask :
+				  &entry->rcv_intr.used));
 	hfi1_cdbg(PROC, "CPUs used by interrupts: %*pbl",
 		  cpumask_pr_args(intrs));
 
@@ -400,7 +491,7 @@ int hfi1_get_proc_affinity(struct hfi1_devdata *dd, int node)
 		cpu = -1;
 	else
 		cpumask_set_cpu(cpu, &set->used);
-	spin_unlock(&dd->affinity->lock);
+	spin_unlock(&node_affinity.lock);
 
 	free_cpumask_var(intrs);
 free_mask:
@@ -413,16 +504,16 @@ done:
 
 void hfi1_put_proc_affinity(struct hfi1_devdata *dd, int cpu)
 {
-	struct cpu_mask_set *set = &dd->affinity->proc;
+	struct cpu_mask_set *set = &node_affinity.proc;
 
 	if (cpu < 0)
 		return;
-	spin_lock(&dd->affinity->lock);
+	spin_lock(&node_affinity.lock);
 	cpumask_clear_cpu(cpu, &set->used);
 	if (cpumask_empty(&set->used) && set->gen) {
 		set->gen--;
 		cpumask_copy(&set->used, &set->mask);
 	}
-	spin_unlock(&dd->affinity->lock);
+	spin_unlock(&node_affinity.lock);
 }
author	Dennis Dalessandro <dennis.dalessandro@intel.com>	2016-07-25 16:52:36 +0200
committer	Doug Ledford <dledford@redhat.com>	2016-08-02 21:45:14 +0200
commit	4197344ba5c2aab24b96f141cb00af9d0471f60b (patch)
tree	aa02fbe9ebd405cdeaa802180a867e16d8daf963 /drivers/infiniband/hw/hfi1/affinity.c
parent	IB/hfi1: Add counter to track unsupported packets drop (diff)
download	linux-4197344ba5c2aab24b96f141cb00af9d0471f60b.tar.xz linux-4197344ba5c2aab24b96f141cb00af9d0471f60b.zip