Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 294 |
1 files changed, 112 insertions, 182 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1a88006ec634..bc48ee783dd9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -20,9 +20,9 @@
 #include <linux/slab.h>
 #include <linux/sched/signal.h>
 #include <linux/rmap.h>
+#include <linux/string_helpers.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
-#include <linux/page-isolation.h>
 #include <linux/jhash.h>
 
 #include <asm/page.h>
@@ -872,7 +872,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 	struct page *page;
 
 	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
-		if (!is_migrate_isolate_page(page))
+		if (!PageHWPoison(page))
 			break;
 	/*
 	 * if 'non-isolated free hugepage' not found on the list,
@@ -887,19 +887,39 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 	return page;
 }
 
-static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
+		nodemask_t *nmask)
 {
-	struct page *page;
-	int node;
+	unsigned int cpuset_mems_cookie;
+	struct zonelist *zonelist;
+	struct zone *zone;
+	struct zoneref *z;
+	int node = -1;
 
-	if (nid != NUMA_NO_NODE)
-		return dequeue_huge_page_node_exact(h, nid);
+	zonelist = node_zonelist(nid, gfp_mask);
+
+retry_cpuset:
+	cpuset_mems_cookie = read_mems_allowed_begin();
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
+		struct page *page;
+
+		if (!cpuset_zone_allowed(zone, gfp_mask))
+			continue;
+		/*
+		 * no need to ask again on the same node. Pool is node rather than
+		 * zone aware
+		 */
+		if (zone_to_nid(zone) == node)
+			continue;
+		node = zone_to_nid(zone);
 
-	for_each_online_node(node) {
 		page = dequeue_huge_page_node_exact(h, node);
 		if (page)
 			return page;
 	}
+	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
+		goto retry_cpuset;
+
 	return NULL;
 }
 
@@ -917,15 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 				unsigned long address, int avoid_reserve,
 				long chg)
 {
-	struct page *page = NULL;
+	struct page *page;
 	struct mempolicy *mpol;
-	nodemask_t *nodemask;
 	gfp_t gfp_mask;
+	nodemask_t *nodemask;
 	int nid;
-	struct zonelist *zonelist;
-	struct zone *zone;
-	struct zoneref *z;
-	unsigned int cpuset_mems_cookie;
 
 	/*
 	 * A child process with MAP_PRIVATE mappings created by their parent
@@ -940,32 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
 		goto err;
 
-retry_cpuset:
-	cpuset_mems_cookie = read_mems_allowed_begin();
 	gfp_mask = htlb_alloc_mask(h);
 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-	zonelist = node_zonelist(nid, gfp_mask);
-
-	for_each_zone_zonelist_nodemask(zone, z, zonelist,
-						MAX_NR_ZONES - 1, nodemask) {
-		if (cpuset_zone_allowed(zone, gfp_mask)) {
-			page = dequeue_huge_page_node(h, zone_to_nid(zone));
-			if (page) {
-				if (avoid_reserve)
-					break;
-				if (!vma_has_reserves(vma, chg))
-					break;
-
-				SetPagePrivate(page);
-				h->resv_huge_pages--;
-				break;
-			}
-		}
+	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
+		SetPagePrivate(page);
+		h->resv_huge_pages--;
 	}
 
 	mpol_cond_put(mpol);
-	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
-		goto retry_cpuset;
 	return page;
 
 err:
@@ -1385,7 +1384,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 
 	page = __alloc_pages_node(nid,
 		htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-						__GFP_REPEAT|__GFP_NOWARN,
+						__GFP_RETRY_MAYFAIL|__GFP_NOWARN,
 		huge_page_order(h));
 	if (page) {
 		prep_new_huge_page(h, page, nid);
@@ -1460,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
  * number of free hugepages would be reduced below the number of reserved
  * hugepages.
  */
-static int dissolve_free_huge_page(struct page *page)
+int dissolve_free_huge_page(struct page *page)
 {
 	int rc = 0;
 
@@ -1473,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page)
 			rc = -EBUSY;
 			goto out;
 		}
+		/*
+		 * Move PageHWPoison flag from head page to the raw error page,
+		 * which makes any subpages rather than the error page reusable.
+		 */
+		if (PageHWPoison(head) && page != head) {
+			SetPageHWPoison(page);
+			ClearPageHWPoison(head);
+		}
 		list_del(&head->lru);
 		h->free_huge_pages--;
 		h->free_huge_pages_node[nid]--;
@@ -1513,82 +1520,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 	return rc;
 }
 
-/*
- * There are 3 ways this can get called:
- * 1. With vma+addr: we use the VMA's memory policy
- * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
- *    page from any node, and let the buddy allocator itself figure
- *    it out.
- * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
- *    strictly from 'nid'
- */
 static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
-		struct vm_area_struct *vma, unsigned long addr, int nid)
+		gfp_t gfp_mask, int nid, nodemask_t *nmask)
 {
 	int order = huge_page_order(h);
-	gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
-	unsigned int cpuset_mems_cookie;
-
-	/*
-	 * We need a VMA to get a memory policy. If we do not
-	 * have one, we use the 'nid' argument.
-	 *
-	 * The mempolicy stuff below has some non-inlined bits
-	 * and calls ->vm_ops. That makes it hard to optimize at
-	 * compile-time, even when NUMA is off and it does
-	 * nothing. This helps the compiler optimize it out.
-	 */
-	if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
-		/*
-		 * If a specific node is requested, make sure to
-		 * get memory from there, but only when a node
-		 * is explicitly specified.
-		 */
-		if (nid != NUMA_NO_NODE)
-			gfp |= __GFP_THISNODE;
-		/*
-		 * Make sure to call something that can handle
-		 * nid=NUMA_NO_NODE
-		 */
-		return alloc_pages_node(nid, gfp, order);
-	}
-
-	/*
-	 * OK, so we have a VMA. Fetch the mempolicy and try to
-	 * allocate a huge page with it. We will only reach this
-	 * when CONFIG_NUMA=y.
-	 */
-	do {
-		struct page *page;
-		struct mempolicy *mpol;
-		int nid;
-		nodemask_t *nodemask;
-
-		cpuset_mems_cookie = read_mems_allowed_begin();
-		nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
-		mpol_cond_put(mpol);
-		page = __alloc_pages_nodemask(gfp, order, nid, nodemask);
-		if (page)
-			return page;
-	} while (read_mems_allowed_retry(cpuset_mems_cookie));
 
-	return NULL;
+	gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+	if (nid == NUMA_NO_NODE)
+		nid = numa_mem_id();
+	return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
 }
 
-/*
- * There are two ways to allocate a huge page:
- * 1. When you have a VMA and an address (like a fault)
- * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
- *
- * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
- * this case which signifies that the allocation should be done with
- * respect for the VMA's memory policy.
- *
- * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
- * implies that memory policies will not be taken in to account.
- */
-static struct page *__alloc_buddy_huge_page(struct hstate *h,
-		struct vm_area_struct *vma, unsigned long addr, int nid)
+static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+		int nid, nodemask_t *nmask)
 {
 	struct page *page;
 	unsigned int r_nid;
@@ -1597,15 +1541,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
 		return NULL;
 
 	/*
-	 * Make sure that anyone specifying 'nid' is not also specifying a VMA.
-	 * This makes sure the caller is picking _one_ of the modes with which
-	 * we can call this function, not both.
-	 */
-	if (vma || (addr != -1)) {
-		VM_WARN_ON_ONCE(addr == -1);
-		VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
-	}
-	/*
 	 * Assume we will successfully allocate the surplus page to
 	 * prevent racing processes from causing the surplus to exceed
 	 * overcommit
@@ -1638,7 +1573,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
 	}
 	spin_unlock(&hugetlb_lock);
 
-	page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
+	page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
 
 	spin_lock(&hugetlb_lock);
 	if (page) {
@@ -1663,26 +1598,23 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
 }
 
 /*
- * Allocate a huge page from 'nid'. Note, 'nid' may be
- * NUMA_NO_NODE, which means that it may be allocated
- * anywhere.
- */
-static
-struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
-{
-	unsigned long addr = -1;
-
-	return __alloc_buddy_huge_page(h, NULL, addr, nid);
-}
-
-/*
  * Use the VMA's mpolicy to allocate a huge page from the buddy.
  */
 static
 struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
 		struct vm_area_struct *vma, unsigned long addr)
 {
-	return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+	struct page *page;
+	struct mempolicy *mpol;
+	gfp_t gfp_mask = htlb_alloc_mask(h);
+	int nid;
+	nodemask_t *nodemask;
+
+	nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
+	page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+	mpol_cond_put(mpol);
+
+	return page;
 }
 
 /*
@@ -1692,19 +1624,46 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
  */
 struct page *alloc_huge_page_node(struct hstate *h, int nid)
 {
+	gfp_t gfp_mask = htlb_alloc_mask(h);
 	struct page *page = NULL;
 
+	if (nid != NUMA_NO_NODE)
+		gfp_mask |= __GFP_THISNODE;
+
 	spin_lock(&hugetlb_lock);
 	if (h->free_huge_pages - h->resv_huge_pages > 0)
-		page = dequeue_huge_page_node(h, nid);
+		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
 	spin_unlock(&hugetlb_lock);
 
 	if (!page)
-		page = __alloc_buddy_huge_page_no_mpol(h, nid);
+		page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
 
 	return page;
 }
 
+struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+		nodemask_t *nmask)
+{
+	gfp_t gfp_mask = htlb_alloc_mask(h);
+
+	spin_lock(&hugetlb_lock);
+	if (h->free_huge_pages - h->resv_huge_pages > 0) {
+		struct page *page;
+
+		page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
+		if (page) {
+			spin_unlock(&hugetlb_lock);
+			return page;
+		}
+	}
+	spin_unlock(&hugetlb_lock);
+
+	/* No reservations, try to overcommit */
+
+	return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
@@ -1730,12 +1689,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
 retry:
 	spin_unlock(&hugetlb_lock);
 	for (i = 0; i < needed; i++) {
-		page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
+		page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+				NUMA_NO_NODE, NULL);
 		if (!page) {
 			alloc_ok = false;
 			break;
 		}
 		list_add(&page->lru, &surplus_list);
+		cond_resched();
 	}
 	allocated += i;
 
@@ -2204,8 +2165,16 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 		} else if (!alloc_fresh_huge_page(h,
 					 &node_states[N_MEMORY]))
 			break;
+		cond_resched();
+	}
+	if (i < h->max_huge_pages) {
+		char buf[32];
+
+		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+		pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
+			h->max_huge_pages, buf, i);
+		h->max_huge_pages = i;
 	}
-	h->max_huge_pages = i;
 }
 
 static void __init hugetlb_init_hstates(void)
@@ -2223,26 +2192,16 @@ static void __init hugetlb_init_hstates(void)
 	VM_BUG_ON(minimum_order == UINT_MAX);
 }
 
-static char * __init memfmt(char *buf, unsigned long n)
-{
-	if (n >= (1UL << 30))
-		sprintf(buf, "%lu GB", n >> 30);
-	else if (n >= (1UL << 20))
-		sprintf(buf, "%lu MB", n >> 20);
-	else
-		sprintf(buf, "%lu KB", n >> 10);
-	return buf;
-}
-
 static void __init report_hugepages(void)
 {
 	struct hstate *h;
 
 	for_each_hstate(h) {
 		char buf[32];
+
+		string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
 		pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
-			memfmt(buf, huge_page_size(h)),
-			h->free_huge_pages);
+			buf, h->free_huge_pages);
 	}
 }
 
@@ -2801,6 +2760,11 @@ static int __init hugetlb_init(void)
 		return 0;
 
 	if (!size_to_hstate(default_hstate_size)) {
+		if (default_hstate_size != 0) {
+			pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
+				default_hstate_size, HPAGE_SIZE);
+		}
+
 		default_hstate_size = HPAGE_SIZE;
 		if (!size_to_hstate(default_hstate_size))
 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
@@ -4739,40 +4703,6 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
 	return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
 }
 
-#ifdef CONFIG_MEMORY_FAILURE
-
-/*
- * This function is called from memory failure code.
- */
-int dequeue_hwpoisoned_huge_page(struct page *hpage)
-{
-	struct hstate *h = page_hstate(hpage);
-	int nid = page_to_nid(hpage);
-	int ret = -EBUSY;
-
-	spin_lock(&hugetlb_lock);
-	/*
-	 * Just checking !page_huge_active is not enough, because that could be
-	 * an isolated/hwpoisoned hugepage (which have >0 refcount).
-	 */
-	if (!page_huge_active(hpage) && !page_count(hpage)) {
-		/*
-		 * Hwpoisoned hugepage isn't linked to activelist or freelist,
-		 * but dangling hpage->lru can trigger list-debug warnings
-		 * (this happens when we call unpoison_memory() on it),
-		 * so let it point to itself with list_del_init().
-		 */
-		list_del_init(&hpage->lru);
-		set_page_refcounted(hpage);
-		h->free_huge_pages--;
-		h->free_huge_pages_node[nid]--;
-		ret = 0;
-	}
-	spin_unlock(&hugetlb_lock);
-	return ret;
-}
-#endif
-
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
 	bool ret = true;
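For illustration only (not part of the commit): a minimal sketch of how a kernel-internal caller could use the alloc_huge_page_nodemask() interface added above, under the signatures shown in this diff. The helper name alloc_replacement_hugepage() is hypothetical; the hugetlb and nodemask symbols are the real kernel ones.

/*
 * Hypothetical caller (illustrative sketch, not in this diff): allocate a
 * replacement huge page of the same size class as 'old', preferring the
 * source node but allowing any node that has memory.
 */
#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/nodemask.h>

static struct page *alloc_replacement_hugepage(struct page *old)
{
	struct hstate *h = page_hstate(old);	/* huge page size class of 'old' */
	int nid = page_to_nid(old);		/* preferred node */

	/*
	 * alloc_huge_page_nodemask() first tries the pre-allocated pool via
	 * dequeue_huge_page_nodemask() across the nodemask, then falls back
	 * to the overcommit path (__alloc_buddy_huge_page).
	 */
	return alloc_huge_page_nodemask(h, nid, &node_states[N_MEMORY]);
}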