author		Mike Rapoport (Microsoft) <rppt@kernel.org>	2024-08-07 08:41:01 +0200
committer	Andrew Morton <akpm@linux-foundation.org>	2024-09-04 06:15:30 +0200
commit		87482708210ff3333adab4dc5f1a491793159d43 (patch)
tree		2ab136ae642bcdd3d021f987065687caed3cdf9d /mm/numa_memblks.c
parent		x86/numa: numa_{add,remove}_cpu: make cpu parameter unsigned (diff)
download	linux-87482708210ff3333adab4dc5f1a491793159d43.tar.xz
		linux-87482708210ff3333adab4dc5f1a491793159d43.zip
mm: introduce numa_memblks
Move code dealing with numa_memblks from arch/x86 to mm/ and add Kconfig
options so that x86 can select it in its Kconfig.
This code will later be reused by arch_numa.
No functional changes.
Link: https://lkml.kernel.org/r/20240807064110.1003856-18-rppt@kernel.org
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Tested-by: Zi Yan <ziy@nvidia.com> # for x86_64 and arm64
Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Tested-by: Jonathan Cameron <Jonathan.Cameron@huawei.com> [arm64 + CXL via QEMU]
Acked-by: Dan Williams <dan.j.williams@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Andreas Larsson <andreas@gaisler.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Cc: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Rafael J. Wysocki <rafael@kernel.org>
Cc: Rob Herring (Arm) <robh@kernel.org>
Cc: Samuel Holland <samuel.holland@sifive.com>
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/numa_memblks.c')
-rw-r--r--	mm/numa_memblks.c	385
1 files changed, 385 insertions, 0 deletions
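The commit message above says the new Kconfig options let x86 select the shared
numa_memblks implementation. A rough sketch of that kind of wiring is shown
below; the option name NUMA_MEMBLKS and the exact Kconfig locations are
assumptions for illustration only, since the hunks on this page are limited to
mm/numa_memblks.c and do not show the Kconfig changes themselves.

	# mm/Kconfig (sketch): generic, non-user-visible option that an
	# architecture selects to pull in mm/numa_memblks.c
	config NUMA_MEMBLKS
		bool

	# arch/x86/Kconfig (sketch): x86 opts in to the shared NUMA memblk code
	config NUMA
		bool "NUMA Memory Allocation and Scheduler Support"
		select NUMA_MEMBLKS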
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
new file mode 100644
index 000000000000..72f191a94c66
--- /dev/null
+++ b/mm/numa_memblks.c
@@ -0,0 +1,385 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/array_size.h>
+#include <linux/sort.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <linux/numa.h>
+#include <linux/numa_memblks.h>
+
+nodemask_t numa_nodes_parsed __initdata;
+
+struct numa_meminfo numa_meminfo __initdata_or_meminfo;
+struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
+
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+				     struct numa_meminfo *mi)
+{
+	/* ignore zero length blks */
+	if (start == end)
+		return 0;
+
+	/* whine about and ignore invalid blks */
+	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+		pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+			nid, start, end - 1);
+		return 0;
+	}
+
+	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+		pr_err("too many memblk ranges\n");
+		return -EINVAL;
+	}
+
+	mi->blk[mi->nr_blks].start = start;
+	mi->blk[mi->nr_blks].end = end;
+	mi->blk[mi->nr_blks].nid = nid;
+	mi->nr_blks++;
+	return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+	mi->nr_blks--;
+	memmove(&mi->blk[idx], &mi->blk[idx + 1],
+		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
+ * @dst: numa_meminfo to append block to
+ * @idx: Index of memblk to remove
+ * @src: numa_meminfo to remove memblk from
+ */
+static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
+					 struct numa_meminfo *src)
+{
+	dst->blk[dst->nr_blks++] = src->blk[idx];
+	numa_remove_memblk_from(idx, src);
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unnecessary memblks. Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+	const u64 low = 0;
+	const u64 high = PFN_PHYS(max_pfn);
+	int i, j, k;
+
+	/* first, trim all entries */
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		/* move / save reserved memory ranges */
+		if (!memblock_overlaps_region(&memblock.memory,
+					bi->start, bi->end - bi->start)) {
+			numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
+			continue;
+		}
+
+		/* make sure all non-reserved blocks are inside the limits */
+		bi->start = max(bi->start, low);
+
+		/* preserve info for non-RAM areas above 'max_pfn': */
+		if (bi->end > high) {
+			numa_add_memblk_to(bi->nid, high, bi->end,
+					   &numa_reserved_meminfo);
+			bi->end = high;
+		}
+
+		/* and there's no empty block */
+		if (bi->start >= bi->end)
+			numa_remove_memblk_from(i--, mi);
+	}
+
+	/* merge neighboring / overlapping entries */
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		for (j = i + 1; j < mi->nr_blks; j++) {
+			struct numa_memblk *bj = &mi->blk[j];
+			u64 start, end;
+
+			/*
+			 * See whether there are overlapping blocks. Whine
+			 * about but allow overlaps of the same nid. They
+			 * will be merged below.
+			 */
+			if (bi->end > bj->start && bi->start < bj->end) {
+				if (bi->nid != bj->nid) {
+					pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+					       bi->nid, bi->start, bi->end - 1,
+					       bj->nid, bj->start, bj->end - 1);
+					return -EINVAL;
+				}
+				pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+					bi->nid, bi->start, bi->end - 1,
+					bj->start, bj->end - 1);
+			}
+
+			/*
+			 * Join together blocks on the same node, holes
+			 * between which don't overlap with memory on other
+			 * nodes.
+			 */
+			if (bi->nid != bj->nid)
+				continue;
+			start = min(bi->start, bj->start);
+			end = max(bi->end, bj->end);
+			for (k = 0; k < mi->nr_blks; k++) {
+				struct numa_memblk *bk = &mi->blk[k];
+
+				if (bi->nid == bk->nid)
+					continue;
+				if (start < bk->end && end > bk->start)
+					break;
+			}
+			if (k < mi->nr_blks)
+				continue;
+			pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+				bi->nid, bi->start, bi->end - 1, bj->start,
+				bj->end - 1, start, end - 1);
+			bi->start = start;
+			bi->end = end;
+			numa_remove_memblk_from(j--, mi);
+		}
+	}
+
+	/* clear unused ones */
+	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+		mi->blk[i].start = mi->blk[i].end = 0;
+		mi->blk[i].nid = NUMA_NO_NODE;
+	}
+
+	return 0;
+}
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+				       const struct numa_meminfo *mi)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+		if (mi->blk[i].start != mi->blk[i].end &&
+		    mi->blk[i].nid != NUMA_NO_NODE)
+			node_set(mi->blk[i].nid, *nodemask);
+}
+
+/*
+ * Mark all currently memblock-reserved physical memory (which covers the
+ * kernel's own memory ranges) as hot-unswappable.
+ */
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+	nodemask_t reserved_nodemask = NODE_MASK_NONE;
+	struct memblock_region *mb_region;
+	int i;
+
+	/*
+	 * We have to do some preprocessing of memblock regions, to
+	 * make them suitable for reservation.
+	 *
+	 * At this time, all memory regions reserved by memblock are
+	 * used by the kernel, but those regions are not split up
+	 * along node boundaries yet, and don't necessarily have their
+	 * node ID set yet either.
+	 *
+	 * So iterate over all parsed memory blocks and use those ranges to
+	 * set the nid in memblock.reserved. This will split up the
+	 * memblock regions along node boundaries and will set the node IDs
+	 * as well.
+	 */
+	for (i = 0; i < numa_meminfo.nr_blks; i++) {
+		struct numa_memblk *mb = numa_meminfo.blk + i;
+		int ret;
+
+		ret = memblock_set_node(mb->start, mb->end - mb->start,
+					&memblock.reserved, mb->nid);
+		WARN_ON_ONCE(ret);
+	}
+
+	/*
+	 * Now go over all reserved memblock regions, to construct a
+	 * node mask of all kernel reserved memory areas.
+	 *
+	 * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
+	 *   numa_meminfo might not include all memblock.reserved
+	 *   memory ranges, because quirks such as trim_snb_memory()
+	 *   reserve specific pages for Sandy Bridge graphics. ]
+	 */
+	for_each_reserved_mem_region(mb_region) {
+		int nid = memblock_get_region_node(mb_region);
+
+		if (nid != MAX_NUMNODES)
+			node_set(nid, reserved_nodemask);
+	}
+
+	/*
+	 * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
+	 * belonging to the reserved node mask.
+	 *
+	 * Note that this will include memory regions that reside
+	 * on nodes that contain kernel memory - entire nodes
+	 * become hot-unpluggable:
+	 */
+	for (i = 0; i < numa_meminfo.nr_blks; i++) {
+		struct numa_memblk *mb = numa_meminfo.blk + i;
+
+		if (!node_isset(mb->nid, reserved_nodemask))
+			continue;
+
+		memblock_clear_hotplug(mb->start, mb->end - mb->start);
+	}
+}
+
+int __init numa_register_meminfo(struct numa_meminfo *mi)
+{
+	int i;
+
+	/* Account for nodes with cpus and no memory */
+	node_possible_map = numa_nodes_parsed;
+	numa_nodemask_from_meminfo(&node_possible_map, mi);
+	if (WARN_ON(nodes_empty(node_possible_map)))
+		return -EINVAL;
+
+	for (i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *mb = &mi->blk[i];
+
+		memblock_set_node(mb->start, mb->end - mb->start,
+				  &memblock.memory, mb->nid);
+	}
+
+	/*
+	 * At very early time, the kernel have to use some memory such as
+	 * loading the kernel image. We cannot prevent this anyway. So any
+	 * node the kernel resides in should be un-hotpluggable.
+	 *
+	 * And when we come here, alloc node data won't fail.
+	 */
+	numa_clear_kernel_node_hotplug();
+
+	/*
+	 * If sections array is gonna be used for pfn -> nid mapping, check
+	 * whether its granularity is fine enough.
+	 */
+	if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
+		unsigned long pfn_align = node_map_pfn_alignment();
+
+		if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+			pr_warn("Node alignment %LuMB < min %LuMB, rejecting NUMA config\n",
+				PFN_PHYS(pfn_align) >> 20,
+				PFN_PHYS(PAGES_PER_SECTION) >> 20);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int __init cmp_memblk(const void *a, const void *b)
+{
+	const struct numa_memblk *ma = *(const struct numa_memblk **)a;
+	const struct numa_memblk *mb = *(const struct numa_memblk **)b;
+
+	return (ma->start > mb->start) - (ma->start < mb->start);
+}
+
+static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
+
+/**
+ * numa_fill_memblks - Fill gaps in numa_meminfo memblks
+ * @start: address to begin fill
+ * @end: address to end fill
+ *
+ * Find and extend numa_meminfo memblks to cover the physical
+ * address range @start-@end
+ *
+ * RETURNS:
+ * 0		  : Success
+ * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
+ */
+
+int __init numa_fill_memblks(u64 start, u64 end)
+{
+	struct numa_memblk **blk = &numa_memblk_list[0];
+	struct numa_meminfo *mi = &numa_meminfo;
+	int count = 0;
+	u64 prev_end;
+
+	/*
+	 * Create a list of pointers to numa_meminfo memblks that
+	 * overlap start, end. The list is used to make in-place
+	 * changes that fill out the numa_meminfo memblks.
+	 */
+	for (int i = 0; i < mi->nr_blks; i++) {
+		struct numa_memblk *bi = &mi->blk[i];
+
+		if (memblock_addrs_overlap(start, end - start, bi->start,
+					   bi->end - bi->start)) {
+			blk[count] = &mi->blk[i];
+			count++;
+		}
+	}
+	if (!count)
+		return NUMA_NO_MEMBLK;
+
+	/* Sort the list of pointers in memblk->start order */
+	sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
+
+	/* Make sure the first/last memblks include start/end */
+	blk[0]->start = min(blk[0]->start, start);
+	blk[count - 1]->end = max(blk[count - 1]->end, end);
+
+	/*
+	 * Fill any gaps by tracking the previous memblks
+	 * end address and backfilling to it if needed.
+	 */
+	prev_end = blk[0]->end;
+	for (int i = 1; i < count; i++) {
+		struct numa_memblk *curr = blk[i];
+
+		if (prev_end >= curr->start) {
+			if (prev_end < curr->end)
+				prev_end = curr->end;
+		} else {
+			curr->start = prev_end;
+			prev_end = curr->end;
+		}
+	}
+	return 0;
+}
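The moved file exposes a small early-boot API (numa_add_memblk(),
numa_cleanup_meminfo(), numa_register_meminfo(), plus the numa_nodes_parsed
nodemask and the numa_meminfo instance) that an architecture's NUMA init path
is expected to drive once it selects the option. A minimal, illustrative
caller is sketched below; the function name, node IDs and address ranges are
hypothetical and not part of this patch.

	/* Hypothetical early-boot caller; node IDs and ranges are made up. */
	#include <linux/nodemask.h>
	#include <linux/numa_memblks.h>

	static int __init example_numa_init(void)
	{
		int ret;

		/* Record the nodes the firmware/devicetree described. */
		node_set(0, numa_nodes_parsed);
		node_set(1, numa_nodes_parsed);

		/* Register each node's physical memory range. */
		ret = numa_add_memblk(0, 0x0ULL, 0x80000000ULL);
		if (ret)
			return ret;
		ret = numa_add_memblk(1, 0x80000000ULL, 0x100000000ULL);
		if (ret)
			return ret;

		/* Merge/trim the raw ranges, then hand them to memblock. */
		ret = numa_cleanup_meminfo(&numa_meminfo);
		if (ret)
			return ret;

		return numa_register_meminfo(&numa_meminfo);
	}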