summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/mm/numa.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2018-02-02 19:01:04 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2018-02-02 19:01:04 +0100
commit03f51d4efa2287cc628bb20b0c032036d2a9e66a (patch)
treeec7fb3b6624d53092e2768578f3ef887c8d77f22 /arch/powerpc/mm/numa.c
parentMerge branch 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm (diff)
parentpowerpc/mm/radix: Fix build error when RADIX_MMU=n (diff)
downloadlinux-03f51d4efa2287cc628bb20b0c032036d2a9e66a.tar.xz
linux-03f51d4efa2287cc628bb20b0c032036d2a9e66a.zip
Merge tag 'powerpc-4.16-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux
Pull powerpc updates from Michael Ellerman: "Highlights: - Enable support for memory protection keys aka "pkeys" on Power7/8/9 when using the hash table MMU. - Extend our interrupt soft masking to support masking PMU interrupts as well as "normal" interrupts, and then use that to implement local_t for a ~4x speedup vs the current atomics-based implementation. - A new driver "ocxl" for "Open Coherent Accelerator Processor Interface (OpenCAPI)" devices. - Support for new device tree properties on PowerVM to describe hotpluggable memory and devices. - Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE to the 64-bit VDSO. - Freescale updates from Scott: fixes for CPM GPIO and an FSL PCI erratum workaround, plus a minor cleanup patch. As well as quite a lot of other changes all over the place, and small fixes and cleanups as always. Thanks to: Alan Modra, Alastair D'Silva, Alexey Kardashevskiy, Alistair Popple, Andreas Schwab, Andrew Donnellan, Aneesh Kumar K.V, Anju T Sudhakar, Anshuman Khandual, Anton Blanchard, Arnd Bergmann, Balbir Singh, Benjamin Herrenschmidt, Bhaktipriya Shridhar, Bryant G. Ly, Cédric Le Goater, Christophe Leroy, Christophe Lombard, Cyril Bur, David Gibson, Desnes A. Nunes do Rosario, Dmitry Torokhov, Frederic Barrat, Geert Uytterhoeven, Guilherme G. Piccoli, Gustavo A. R. Silva, Gustavo Romero, Ivan Mikhaylov, Joakim Tjernlund, Joe Perches, Josh Poimboeuf, Juan J. Alvarez, Julia Cartwright, Kamalesh Babulal, Madhavan Srinivasan, Mahesh Salgaonkar, Mathieu Malaterre, Michael Bringmann, Michael Hanselmann, Michael Neuling, Nathan Fontenot, Naveen N. Rao, Nicholas Piggin, Paul Mackerras, Philippe Bergheaud, Ram Pai, Russell Currey, Santosh Sivaraj, Scott Wood, Seth Forshee, Simon Guo, Stewart Smith, Sukadev Bhattiprolu, Thiago Jung Bauermann, Vaibhav Jain, Vasyl Gomonovych" * tag 'powerpc-4.16-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (199 commits) powerpc/mm/radix: Fix build error when RADIX_MMU=n macintosh/ams-input: Use true and false for boolean values macintosh: change some data types from int to bool powerpc/watchdog: Print the NIP in soft_nmi_interrupt() powerpc/watchdog: regs can't be null in soft_nmi_interrupt() powerpc/watchdog: Tweak watchdog printks powerpc/cell: Remove axonram driver rtc-opal: Fix handling of firmware error codes, prevent busy loops powerpc/mpc52xx_gpt: make use of raw_spinlock variants macintosh/adb: Properly mark continued kernel messages powerpc/pseries: Fix cpu hotplug crash with memoryless nodes powerpc/numa: Ensure nodes initialized for hotplug powerpc/numa: Use ibm,max-associativity-domains to discover possible nodes powerpc/kernel: Block interrupts when updating TIDR powerpc/powernv/idoa: Remove unnecessary pcidev from pci_dn powerpc/mm/nohash: do not flush the entire mm when range is a single page powerpc/pseries: Add Initialization of VF Bars powerpc/pseries/pci: Associate PEs to VFs in configure SR-IOV powerpc/eeh: Add EEH notify resume sysfs powerpc/eeh: Add EEH operations to notify resume ...
Diffstat (limited to 'arch/powerpc/mm/numa.c')
-rw-r--r--arch/powerpc/mm/numa.c333
1 files changed, 142 insertions, 191 deletions
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index adb6364f4091..314d19ab9385 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -40,6 +40,7 @@
#include <asm/hvcall.h>
#include <asm/setup.h>
#include <asm/vdso.h>
+#include <asm/drmem.h>
static int numa_enabled = 1;
@@ -179,21 +180,6 @@ static const __be32 *of_get_associativity(struct device_node *dev)
return of_get_property(dev, "ibm,associativity", NULL);
}
-/*
- * Returns the property linux,drconf-usable-memory if
- * it exists (the property exists only in kexec/kdump kernels,
- * added by kexec-tools)
- */
-static const __be32 *of_get_usable_memory(struct device_node *memory)
-{
- const __be32 *prop;
- u32 len;
- prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
- if (!prop || len < sizeof(unsigned int))
- return NULL;
- return prop;
-}
-
int __node_distance(int a, int b)
{
int i;
@@ -387,69 +373,6 @@ static unsigned long read_n_cells(int n, const __be32 **buf)
return result;
}
-/*
- * Read the next memblock list entry from the ibm,dynamic-memory property
- * and return the information in the provided of_drconf_cell structure.
- */
-static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp)
-{
- const __be32 *cp;
-
- drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);
-
- cp = *cellp;
- drmem->drc_index = of_read_number(cp, 1);
- drmem->reserved = of_read_number(&cp[1], 1);
- drmem->aa_index = of_read_number(&cp[2], 1);
- drmem->flags = of_read_number(&cp[3], 1);
-
- *cellp = cp + 4;
-}
-
-/*
- * Retrieve and validate the ibm,dynamic-memory property of the device tree.
- *
- * The layout of the ibm,dynamic-memory property is a number N of memblock
- * list entries followed by N memblock list entries. Each memblock list entry
- * contains information as laid out in the of_drconf_cell struct above.
- */
-static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
-{
- const __be32 *prop;
- u32 len, entries;
-
- prop = of_get_property(memory, "ibm,dynamic-memory", &len);
- if (!prop || len < sizeof(unsigned int))
- return 0;
-
- entries = of_read_number(prop++, 1);
-
- /* Now that we know the number of entries, revalidate the size
- * of the property read in to ensure we have everything
- */
- if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
- return 0;
-
- *dm = prop;
- return entries;
-}
-
-/*
- * Retrieve and validate the ibm,lmb-size property for drconf memory
- * from the device tree.
- */
-static u64 of_get_lmb_size(struct device_node *memory)
-{
- const __be32 *prop;
- u32 len;
-
- prop = of_get_property(memory, "ibm,lmb-size", &len);
- if (!prop || len < sizeof(unsigned int))
- return 0;
-
- return read_n_cells(n_mem_size_cells, &prop);
-}
-
struct assoc_arrays {
u32 n_arrays;
u32 array_sz;
@@ -466,19 +389,27 @@ struct assoc_arrays {
* indicating the size of each associativity array, followed by a list
* of N associativity arrays.
*/
-static int of_get_assoc_arrays(struct device_node *memory,
- struct assoc_arrays *aa)
+static int of_get_assoc_arrays(struct assoc_arrays *aa)
{
+ struct device_node *memory;
const __be32 *prop;
u32 len;
+ memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+ if (!memory)
+ return -1;
+
prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
- if (!prop || len < 2 * sizeof(unsigned int))
+ if (!prop || len < 2 * sizeof(unsigned int)) {
+ of_node_put(memory);
return -1;
+ }
aa->n_arrays = of_read_number(prop++, 1);
aa->array_sz = of_read_number(prop++, 1);
+ of_node_put(memory);
+
/* Now that we know the number of arrays and size of each array,
* revalidate the size of the property read in.
*/
@@ -493,26 +424,30 @@ static int of_get_assoc_arrays(struct device_node *memory,
* This is like of_node_to_nid_single() for memory represented in the
* ibm,dynamic-reconfiguration-memory node.
*/
-static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
- struct assoc_arrays *aa)
+static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
{
+ struct assoc_arrays aa = { .arrays = NULL };
int default_nid = 0;
int nid = default_nid;
- int index;
+ int rc, index;
+
+ rc = of_get_assoc_arrays(&aa);
+ if (rc)
+ return default_nid;
- if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
- !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
- drmem->aa_index < aa->n_arrays) {
- index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
- nid = of_read_number(&aa->arrays[index], 1);
+ if (min_common_depth > 0 && min_common_depth <= aa.array_sz &&
+ !(lmb->flags & DRCONF_MEM_AI_INVALID) &&
+ lmb->aa_index < aa.n_arrays) {
+ index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
+ nid = of_read_number(&aa.arrays[index], 1);
if (nid == 0xffff || nid >= MAX_NUMNODES)
nid = default_nid;
if (nid > 0) {
- index = drmem->aa_index * aa->array_sz;
+ index = lmb->aa_index * aa.array_sz;
initialize_distance_lookup_table(nid,
- &aa->arrays[index]);
+ &aa.arrays[index]);
}
}
@@ -551,7 +486,7 @@ static int numa_setup_cpu(unsigned long lcpu)
nid = of_node_to_nid_single(cpu);
out_present:
- if (nid < 0 || !node_online(nid))
+ if (nid < 0 || !node_possible(nid))
nid = first_online_node;
map_cpu_to_node(lcpu, nid);
@@ -645,67 +580,48 @@ static inline int __init read_usm_ranges(const __be32 **usm)
* Extract NUMA information from the ibm,dynamic-reconfiguration-memory
* node. This assumes n_mem_{addr,size}_cells have been set.
*/
-static void __init parse_drconf_memory(struct device_node *memory)
+static void __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
+ const __be32 **usm)
{
- const __be32 *uninitialized_var(dm), *usm;
- unsigned int n, rc, ranges, is_kexec_kdump = 0;
- unsigned long lmb_size, base, size, sz;
+ unsigned int ranges, is_kexec_kdump = 0;
+ unsigned long base, size, sz;
int nid;
- struct assoc_arrays aa = { .arrays = NULL };
-
- n = of_get_drconf_memory(memory, &dm);
- if (!n)
- return;
-
- lmb_size = of_get_lmb_size(memory);
- if (!lmb_size)
- return;
- rc = of_get_assoc_arrays(memory, &aa);
- if (rc)
+ /*
+ * Skip this block if the reserved bit is set in flags (0x80)
+ * or if the block is not assigned to this partition (0x8)
+ */
+ if ((lmb->flags & DRCONF_MEM_RESERVED)
+ || !(lmb->flags & DRCONF_MEM_ASSIGNED))
return;
- /* check if this is a kexec/kdump kernel */
- usm = of_get_usable_memory(memory);
- if (usm != NULL)
+ if (*usm)
is_kexec_kdump = 1;
- for (; n != 0; --n) {
- struct of_drconf_cell drmem;
-
- read_drconf_cell(&drmem, &dm);
+ base = lmb->base_addr;
+ size = drmem_lmb_size();
+ ranges = 1;
- /* skip this block if the reserved bit is set in flags (0x80)
- or if the block is not assigned to this partition (0x8) */
- if ((drmem.flags & DRCONF_MEM_RESERVED)
- || !(drmem.flags & DRCONF_MEM_ASSIGNED))
- continue;
-
- base = drmem.base_addr;
- size = lmb_size;
- ranges = 1;
+ if (is_kexec_kdump) {
+ ranges = read_usm_ranges(usm);
+ if (!ranges) /* there are no (base, size) duple */
+ return;
+ }
+ do {
if (is_kexec_kdump) {
- ranges = read_usm_ranges(&usm);
- if (!ranges) /* there are no (base, size) duple */
- continue;
+ base = read_n_cells(n_mem_addr_cells, usm);
+ size = read_n_cells(n_mem_size_cells, usm);
}
- do {
- if (is_kexec_kdump) {
- base = read_n_cells(n_mem_addr_cells, &usm);
- size = read_n_cells(n_mem_size_cells, &usm);
- }
- nid = of_drconf_to_nid_single(&drmem, &aa);
- fake_numa_create_new_node(
- ((base + size) >> PAGE_SHIFT),
- &nid);
- node_set_online(nid);
- sz = numa_enforce_memory_limit(base, size);
- if (sz)
- memblock_set_node(base, sz,
- &memblock.memory, nid);
- } while (--ranges);
- }
+
+ nid = of_drconf_to_nid_single(lmb);
+ fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
+ &nid);
+ node_set_online(nid);
+ sz = numa_enforce_memory_limit(base, size);
+ if (sz)
+ memblock_set_node(base, sz, &memblock.memory, nid);
+ } while (--ranges);
}
static int __init parse_numa_properties(void)
@@ -800,8 +716,10 @@ new_range:
* ibm,dynamic-reconfiguration-memory node.
*/
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
- if (memory)
- parse_drconf_memory(memory);
+ if (memory) {
+ walk_drmem_lmbs(memory, numa_setup_drmem_lmb);
+ of_node_put(memory);
+ }
return 0;
}
@@ -892,6 +810,32 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
NODE_DATA(nid)->node_spanned_pages = spanned_pages;
}
+static void __init find_possible_nodes(void)
+{
+ struct device_node *rtas;
+ u32 numnodes, i;
+
+ if (min_common_depth <= 0)
+ return;
+
+ rtas = of_find_node_by_path("/rtas");
+ if (!rtas)
+ return;
+
+ if (of_property_read_u32_index(rtas,
+ "ibm,max-associativity-domains",
+ min_common_depth, &numnodes))
+ goto out;
+
+ for (i = 0; i < numnodes; i++) {
+ if (!node_possible(i))
+ node_set(i, node_possible_map);
+ }
+
+out:
+ of_node_put(rtas);
+}
+
void __init initmem_init(void)
{
int nid, cpu;
@@ -905,12 +849,15 @@ void __init initmem_init(void)
memblock_dump_all();
/*
- * Reduce the possible NUMA nodes to the online NUMA nodes,
- * since we do not support node hotplug. This ensures that we
- * lower the maximum NUMA node ID to what is actually present.
+ * Modify the set of possible NUMA nodes to reflect information
+ * available about the set of online nodes, and the set of nodes
+ * that we expect to make use of for this platform's affinity
+ * calculations.
*/
nodes_and(node_possible_map, node_possible_map, node_online_map);
+ find_possible_nodes();
+
for_each_online_node(nid) {
unsigned long start_pfn, end_pfn;
@@ -979,43 +926,26 @@ early_param("topology_updates", early_topology_updates);
* memory represented in the device tree by the property
* ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
*/
-static int hot_add_drconf_scn_to_nid(struct device_node *memory,
- unsigned long scn_addr)
+static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
{
- const __be32 *dm;
- unsigned int drconf_cell_cnt, rc;
+ struct drmem_lmb *lmb;
unsigned long lmb_size;
- struct assoc_arrays aa;
int nid = -1;
- drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
- if (!drconf_cell_cnt)
- return -1;
-
- lmb_size = of_get_lmb_size(memory);
- if (!lmb_size)
- return -1;
-
- rc = of_get_assoc_arrays(memory, &aa);
- if (rc)
- return -1;
-
- for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
- struct of_drconf_cell drmem;
-
- read_drconf_cell(&drmem, &dm);
+ lmb_size = drmem_lmb_size();
+ for_each_drmem_lmb(lmb) {
/* skip this block if it is reserved or not assigned to
* this partition */
- if ((drmem.flags & DRCONF_MEM_RESERVED)
- || !(drmem.flags & DRCONF_MEM_ASSIGNED))
+ if ((lmb->flags & DRCONF_MEM_RESERVED)
+ || !(lmb->flags & DRCONF_MEM_ASSIGNED))
continue;
- if ((scn_addr < drmem.base_addr)
- || (scn_addr >= (drmem.base_addr + lmb_size)))
+ if ((scn_addr < lmb->base_addr)
+ || (scn_addr >= (lmb->base_addr + lmb_size)))
continue;
- nid = of_drconf_to_nid_single(&drmem, &aa);
+ nid = of_drconf_to_nid_single(lmb);
break;
}
@@ -1080,7 +1010,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
- nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
+ nid = hot_add_drconf_scn_to_nid(scn_addr);
of_node_put(memory);
} else {
nid = hot_add_node_scn_to_nid(scn_addr);
@@ -1096,11 +1026,7 @@ static u64 hot_add_drconf_memory_max(void)
{
struct device_node *memory = NULL;
struct device_node *dn = NULL;
- unsigned int drconf_cell_cnt = 0;
- u64 lmb_size = 0;
- const __be32 *dm = NULL;
const __be64 *lrdr = NULL;
- struct of_drconf_cell drmem;
dn = of_find_node_by_path("/rtas");
if (dn) {
@@ -1112,14 +1038,8 @@ static u64 hot_add_drconf_memory_max(void)
memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
if (memory) {
- drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
- lmb_size = of_get_lmb_size(memory);
-
- /* Advance to the last cell, each cell has 6 32 bit integers */
- dm += (drconf_cell_cnt - 1) * 6;
- read_drconf_cell(&drmem, &dm);
of_node_put(memory);
- return drmem.base_addr + lmb_size;
+ return drmem_lmb_memory_max();
}
return 0;
}
@@ -1278,6 +1198,42 @@ static long vphn_get_associativity(unsigned long cpu,
return rc;
}
+int find_and_online_cpu_nid(int cpu)
+{
+ __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+ int new_nid;
+
+ /* Use associativity from first thread for all siblings */
+ vphn_get_associativity(cpu, associativity);
+ new_nid = associativity_to_nid(associativity);
+ if (new_nid < 0 || !node_possible(new_nid))
+ new_nid = first_online_node;
+
+ if (NODE_DATA(new_nid) == NULL) {
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /*
+ * Need to ensure that NODE_DATA is initialized for a node from
+ * available memory (see memblock_alloc_try_nid). If unable to
+ * init the node, then default to nearest node that has memory
+ * installed.
+ */
+ if (try_online_node(new_nid))
+ new_nid = first_online_node;
+#else
+ /*
+ * Default to using the nearest node that has memory installed.
+ * Otherwise, it would be necessary to patch the kernel MM code
+ * to deal with more memoryless-node error conditions.
+ */
+ new_nid = first_online_node;
+#endif
+ }
+
+ pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
+ cpu, new_nid);
+ return new_nid;
+}
+
/*
* Update the CPU maps and sysfs entries for a single CPU when its NUMA
* characteristics change. This function doesn't perform any locking and is
@@ -1345,7 +1301,6 @@ int numa_update_cpu_topology(bool cpus_locked)
{
unsigned int cpu, sibling, changed = 0;
struct topology_update_data *updates, *ud;
- __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
cpumask_t updated_cpus;
struct device *dev;
int weight, new_nid, i = 0;
@@ -1383,11 +1338,7 @@ int numa_update_cpu_topology(bool cpus_locked)
continue;
}
- /* Use associativity from first thread for all siblings */
- vphn_get_associativity(cpu, associativity);
- new_nid = associativity_to_nid(associativity);
- if (new_nid < 0 || !node_online(new_nid))
- new_nid = first_online_node;
+ new_nid = find_and_online_cpu_nid(cpu);
if (new_nid == numa_cpu_lookup_table[cpu]) {
cpumask_andnot(&cpu_associativity_changes_mask,