Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  294
1 file changed, 112 insertions, 182 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1a88006ec634..bc48ee783dd9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -20,9 +20,9 @@
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
+#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
-#include <linux/page-isolation.h>
#include <linux/jhash.h>
#include <asm/page.h>
@@ -872,7 +872,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
struct page *page;
list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
- if (!is_migrate_isolate_page(page))
+ if (!PageHWPoison(page))
break;
/*
* if 'non-isolated free hugepage' not found on the list,
@@ -887,19 +887,39 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
return page;
}
-static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
+ nodemask_t *nmask)
{
- struct page *page;
- int node;
+ unsigned int cpuset_mems_cookie;
+ struct zonelist *zonelist;
+ struct zone *zone;
+ struct zoneref *z;
+ int node = -1;
- if (nid != NUMA_NO_NODE)
- return dequeue_huge_page_node_exact(h, nid);
+ zonelist = node_zonelist(nid, gfp_mask);
+
+retry_cpuset:
+ cpuset_mems_cookie = read_mems_allowed_begin();
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
+ struct page *page;
+
+ if (!cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+ /*
+ * no need to ask again on the same node. Pool is node rather than
+ * zone aware
+ */
+ if (zone_to_nid(zone) == node)
+ continue;
+ node = zone_to_nid(zone);
- for_each_online_node(node) {
page = dequeue_huge_page_node_exact(h, node);
if (page)
return page;
}
+ if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
+ goto retry_cpuset;
+
return NULL;
}
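
For reference, the node skip in the new zonelist walk above exists because the huge page pool is maintained per node, while a zonelist can list several zones of the same node back to back. A minimal userspace sketch of that skip pattern, using made-up zones and a toy per-node pool rather than real kernel structures:

#include <stdio.h>

/* Toy stand-ins for zones and a per-node free huge page pool. */
struct toy_zone { const char *name; int nid; };

static int toy_pool_has_page(int nid)
{
	/* Pretend only node 1 has a free huge page. */
	return nid == 1;
}

int main(void)
{
	/* Several zones may belong to the same node, as in a real zonelist. */
	struct toy_zone zonelist[] = {
		{ "Node0-DMA32", 0 }, { "Node0-Normal", 0 },
		{ "Node1-Normal", 1 }, { "Node2-Normal", 2 },
	};
	int last_nid = -1;

	for (unsigned int i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++) {
		/* Pool is node rather than zone aware: skip repeat nodes. */
		if (zonelist[i].nid == last_nid)
			continue;
		last_nid = zonelist[i].nid;
		if (toy_pool_has_page(last_nid)) {
			printf("dequeue from node %d (via %s)\n",
			       last_nid, zonelist[i].name);
			return 0;
		}
	}
	printf("no free huge page on any allowed node\n");
	return 1;
}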
@@ -917,15 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
unsigned long address, int avoid_reserve,
long chg)
{
- struct page *page = NULL;
+ struct page *page;
struct mempolicy *mpol;
- nodemask_t *nodemask;
gfp_t gfp_mask;
+ nodemask_t *nodemask;
int nid;
- struct zonelist *zonelist;
- struct zone *zone;
- struct zoneref *z;
- unsigned int cpuset_mems_cookie;
/*
* A child process with MAP_PRIVATE mappings created by their parent
@@ -940,32 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
goto err;
-retry_cpuset:
- cpuset_mems_cookie = read_mems_allowed_begin();
gfp_mask = htlb_alloc_mask(h);
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- zonelist = node_zonelist(nid, gfp_mask);
-
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- MAX_NR_ZONES - 1, nodemask) {
- if (cpuset_zone_allowed(zone, gfp_mask)) {
- page = dequeue_huge_page_node(h, zone_to_nid(zone));
- if (page) {
- if (avoid_reserve)
- break;
- if (!vma_has_reserves(vma, chg))
- break;
-
- SetPagePrivate(page);
- h->resv_huge_pages--;
- break;
- }
- }
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+ if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
+ SetPagePrivate(page);
+ h->resv_huge_pages--;
}
mpol_cond_put(mpol);
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
return page;
err:
@@ -1385,7 +1384,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
page = __alloc_pages_node(nid,
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
- __GFP_REPEAT|__GFP_NOWARN,
+ __GFP_RETRY_MAYFAIL|__GFP_NOWARN,
huge_page_order(h));
if (page) {
prep_new_huge_page(h, page, nid);
@@ -1460,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
* number of free hugepages would be reduced below the number of reserved
* hugepages.
*/
-static int dissolve_free_huge_page(struct page *page)
+int dissolve_free_huge_page(struct page *page)
{
int rc = 0;
@@ -1473,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page)
rc = -EBUSY;
goto out;
}
+ /*
+ * Move PageHWPoison flag from head page to the raw error page,
+ * which makes any subpages rather than the error page reusable.
+ */
+ if (PageHWPoison(head) && page != head) {
+ SetPageHWPoison(page);
+ ClearPageHWPoison(head);
+ }
list_del(&head->lru);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
@@ -1513,82 +1520,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
return rc;
}
-/*
- * There are 3 ways this can get called:
- * 1. With vma+addr: we use the VMA's memory policy
- * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge
- * page from any node, and let the buddy allocator itself figure
- * it out.
- * 3. With !vma, but nid!=NUMA_NO_NODE. We allocate a huge page
- * strictly from 'nid'
- */
static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr, int nid)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
int order = huge_page_order(h);
- gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
- unsigned int cpuset_mems_cookie;
-
- /*
- * We need a VMA to get a memory policy. If we do not
- * have one, we use the 'nid' argument.
- *
- * The mempolicy stuff below has some non-inlined bits
- * and calls ->vm_ops. That makes it hard to optimize at
- * compile-time, even when NUMA is off and it does
- * nothing. This helps the compiler optimize it out.
- */
- if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
- /*
- * If a specific node is requested, make sure to
- * get memory from there, but only when a node
- * is explicitly specified.
- */
- if (nid != NUMA_NO_NODE)
- gfp |= __GFP_THISNODE;
- /*
- * Make sure to call something that can handle
- * nid=NUMA_NO_NODE
- */
- return alloc_pages_node(nid, gfp, order);
- }
-
- /*
- * OK, so we have a VMA. Fetch the mempolicy and try to
- * allocate a huge page with it. We will only reach this
- * when CONFIG_NUMA=y.
- */
- do {
- struct page *page;
- struct mempolicy *mpol;
- int nid;
- nodemask_t *nodemask;
-
- cpuset_mems_cookie = read_mems_allowed_begin();
- nid = huge_node(vma, addr, gfp, &mpol, &nodemask);
- mpol_cond_put(mpol);
- page = __alloc_pages_nodemask(gfp, order, nid, nodemask);
- if (page)
- return page;
- } while (read_mems_allowed_retry(cpuset_mems_cookie));
- return NULL;
+ gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
+ return __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
}
-/*
- * There are two ways to allocate a huge page:
- * 1. When you have a VMA and an address (like a fault)
- * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
- *
- * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in
- * this case which signifies that the allocation should be done with
- * respect for the VMA's memory policy.
- *
- * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
- * implies that memory policies will not be taken in to account.
- */
-static struct page *__alloc_buddy_huge_page(struct hstate *h,
- struct vm_area_struct *vma, unsigned long addr, int nid)
+static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
struct page *page;
unsigned int r_nid;
@@ -1597,15 +1541,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
return NULL;
/*
- * Make sure that anyone specifying 'nid' is not also specifying a VMA.
- * This makes sure the caller is picking _one_ of the modes with which
- * we can call this function, not both.
- */
- if (vma || (addr != -1)) {
- VM_WARN_ON_ONCE(addr == -1);
- VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
- }
- /*
* Assume we will successfully allocate the surplus page to
* prevent racing processes from causing the surplus to exceed
* overcommit
@@ -1638,7 +1573,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
}
spin_unlock(&hugetlb_lock);
- page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
+ page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
spin_lock(&hugetlb_lock);
if (page) {
@@ -1663,26 +1598,23 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h,
}
/*
- * Allocate a huge page from 'nid'. Note, 'nid' may be
- * NUMA_NO_NODE, which means that it may be allocated
- * anywhere.
- */
-static
-struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
-{
- unsigned long addr = -1;
-
- return __alloc_buddy_huge_page(h, NULL, addr, nid);
-}
-
-/*
* Use the VMA's mpolicy to allocate a huge page from the buddy.
*/
static
struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
struct vm_area_struct *vma, unsigned long addr)
{
- return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+ struct page *page;
+ struct mempolicy *mpol;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+ int nid;
+ nodemask_t *nodemask;
+
+ nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
+ page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask);
+ mpol_cond_put(mpol);
+
+ return page;
}
/*
@@ -1692,19 +1624,46 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
*/
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
+ gfp_t gfp_mask = htlb_alloc_mask(h);
struct page *page = NULL;
+ if (nid != NUMA_NO_NODE)
+ gfp_mask |= __GFP_THISNODE;
+
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_node(h, nid);
+ page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
spin_unlock(&hugetlb_lock);
if (!page)
- page = __alloc_buddy_huge_page_no_mpol(h, nid);
+ page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL);
return page;
}
+
+struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+ nodemask_t *nmask)
+{
+ gfp_t gfp_mask = htlb_alloc_mask(h);
+
+ spin_lock(&hugetlb_lock);
+ if (h->free_huge_pages - h->resv_huge_pages > 0) {
+ struct page *page;
+
+ page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
+ if (page) {
+ spin_unlock(&hugetlb_lock);
+ return page;
+ }
+ }
+ spin_unlock(&hugetlb_lock);
+
+ /* No reservations, try to overcommit */
+
+ return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
+}
+
/*
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
@@ -1730,12 +1689,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry:
spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) {
- page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
+ page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h),
+ NUMA_NO_NODE, NULL);
if (!page) {
alloc_ok = false;
break;
}
list_add(&page->lru, &surplus_list);
+ cond_resched();
}
allocated += i;
@@ -2204,8 +2165,16 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
} else if (!alloc_fresh_huge_page(h,
&node_states[N_MEMORY]))
break;
+ cond_resched();
+ }
+ if (i < h->max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n",
+ h->max_huge_pages, buf, i);
+ h->max_huge_pages = i;
}
- h->max_huge_pages = i;
}
static void __init hugetlb_init_hstates(void)
@@ -2223,26 +2192,16 @@ static void __init hugetlb_init_hstates(void)
VM_BUG_ON(minimum_order == UINT_MAX);
}
-static char * __init memfmt(char *buf, unsigned long n)
-{
- if (n >= (1UL << 30))
- sprintf(buf, "%lu GB", n >> 30);
- else if (n >= (1UL << 20))
- sprintf(buf, "%lu MB", n >> 20);
- else
- sprintf(buf, "%lu KB", n >> 10);
- return buf;
-}
-
static void __init report_hugepages(void)
{
struct hstate *h;
for_each_hstate(h) {
char buf[32];
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n",
- memfmt(buf, huge_page_size(h)),
- h->free_huge_pages);
+ buf, h->free_huge_pages);
}
}
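
Both string_get_size() call sites above replace the removed memfmt() helper; with STRING_UNITS_2 the size is reported in power-of-two units (KiB/MiB/GiB) instead of memfmt()'s KB/MB/GB. A rough userspace approximation of that formatting, ignoring the rounding and significant-digit handling of the real helper in lib/string_helpers.c:

#include <stdio.h>
#include <stdint.h>

/*
 * Very rough stand-in for string_get_size(size, 1, STRING_UNITS_2, buf, len):
 * pick the largest power-of-two unit that fits. The real helper may also
 * print a fractional part for up to three significant digits.
 */
static void toy_size_to_string(uint64_t size, char *buf, size_t len)
{
	static const char *const units[] = { "B", "KiB", "MiB", "GiB", "TiB" };
	unsigned int i = 0;

	while (size >= 1024 && i < 4) {
		size >>= 10;
		i++;
	}
	snprintf(buf, len, "%llu %s", (unsigned long long)size, units[i]);
}

int main(void)
{
	char buf[32];

	toy_size_to_string(2UL << 20, buf, sizeof(buf));  /* 2 MiB huge page */
	printf("%s\n", buf);
	toy_size_to_string(1UL << 30, buf, sizeof(buf));  /* 1 GiB huge page */
	printf("%s\n", buf);
	return 0;
}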
@@ -2801,6 +2760,11 @@ static int __init hugetlb_init(void)
return 0;
if (!size_to_hstate(default_hstate_size)) {
+ if (default_hstate_size != 0) {
+ pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
+ default_hstate_size, HPAGE_SIZE);
+ }
+
default_hstate_size = HPAGE_SIZE;
if (!size_to_hstate(default_hstate_size))
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
@@ -4739,40 +4703,6 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
}
-#ifdef CONFIG_MEMORY_FAILURE
-
-/*
- * This function is called from memory failure code.
- */
-int dequeue_hwpoisoned_huge_page(struct page *hpage)
-{
- struct hstate *h = page_hstate(hpage);
- int nid = page_to_nid(hpage);
- int ret = -EBUSY;
-
- spin_lock(&hugetlb_lock);
- /*
- * Just checking !page_huge_active is not enough, because that could be
- * an isolated/hwpoisoned hugepage (which have >0 refcount).
- */
- if (!page_huge_active(hpage) && !page_count(hpage)) {
- /*
- * Hwpoisoned hugepage isn't linked to activelist or freelist,
- * but dangling hpage->lru can trigger list-debug warnings
- * (this happens when we call unpoison_memory() on it),
- * so let it point to itself with list_del_init().
- */
- list_del_init(&hpage->lru);
- set_page_refcounted(hpage);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- ret = 0;
- }
- spin_unlock(&hugetlb_lock);
- return ret;
-}
-#endif
-
bool isolate_huge_page(struct page *page, struct list_head *list)
{
bool ret = true;