diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 526 |
1 files changed, 234 insertions, 292 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7633c503a116..a47f0b229a1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -25,6 +25,7 @@ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kmemcheck.h> +#include <linux/kasan.h> #include <linux/module.h> #include <linux/suspend.h> #include <linux/pagevec.h> @@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order); * 1G machine -> (16M dma, 784M normal, 224M high) * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL - * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA + * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA * * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation @@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } -bool oom_killer_disabled __read_mostly; - #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -/* update __split_huge_page_refcount if you change this function */ -static int destroy_compound_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - int bad = 0; - - if (unlikely(compound_order(page) != order)) { - bad_page(page, "wrong compound order", 0); - bad++; - } - - __ClearPageHead(page); - - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - - if (unlikely(!PageTail(p))) { - bad_page(page, "PageTail not set", 0); - bad++; - } else if (unlikely(p->first_page != page)) { - bad_page(page, "first_page not consistent", 0); - bad++; - } - __ClearPageTail(p); - } - - return bad; -} - static inline void prep_zero_page(struct page *page, unsigned int order, gfp_t gfp_flags) { @@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, return 0; if (page_is_guard(buddy) && page_order(buddy) == order) { - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - if (page_zone_id(page) != page_zone_id(buddy)) return 0; + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { - VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); - /* * zone check is done late to avoid uselessly * calculating zone/node ids for pages that could @@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (page_zone_id(page) != page_zone_id(buddy)) return 0; + VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + return 1; } return 0; @@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page, int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); - - if (unlikely(PageCompound(page))) - if (unlikely(destroy_compound_page(page, order))) - return; + VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); if (is_migrate_isolate(migratetype)) { @@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone, spin_unlock(&zone->lock); } +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return 0; + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set", 0); + return 1; + } + if (unlikely(page->first_page != head_page)) { + bad_page(page, "first_page not consistent", 0); + return 1; + } + return 0; +} + static bool free_pages_prepare(struct page *page, unsigned int order) { - int i; - int bad = 0; + bool compound = PageCompound(page); + int i, bad = 0; VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page); + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); + kasan_free_pages(page, order); if (PageAnon(page)) page->mapping = NULL; - for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page); + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); bad += free_pages_check(page + i); + } if (bad) return false; @@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + int alloc_flags) { int i; @@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); + kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) prep_zero_page(page, order, gfp_flags); @@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) set_page_owner(page, order, gfp_flags); + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * allocate the page. The expectation is that the caller is taking + * steps that will free more memory. The caller should avoid the page + * being used for !PFMEMALLOC purposes. + */ + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return 0; } @@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page, } /* - * If breaking a large block of pages, move all free pages to the preferred - * allocation list. If falling back for a reclaimable kernel allocation, be - * more aggressive about taking ownership of free pages. + * When we are falling back to another migratetype during allocation, try to + * steal extra free pages from the same pageblocks to satisfy further + * allocations, instead of polluting multiple pageblocks. * - * On the other hand, never change migration type of MIGRATE_CMA pageblocks - * nor move CMA pages to different free lists. We don't want unmovable pages - * to be allocated from MIGRATE_CMA areas. + * If we are stealing a relatively large buddy page, it is likely there will + * be more free pages in the pageblock, so try to steal them all. For + * reclaimable and unmovable allocations, we steal regardless of page size, + * as fragmentation caused by those allocations polluting movable pageblocks + * is worse than movable allocations stealing from unmovable and reclaimable + * pageblocks. * - * Returns the new migratetype of the pageblock (or the same old migratetype - * if it was unchanged). + * If we claim more than half of the pageblock, change pageblock's migratetype + * as well. */ -static int try_to_steal_freepages(struct zone *zone, struct page *page, +static void try_to_steal_freepages(struct zone *zone, struct page *page, int start_type, int fallback_type) { int current_order = page_order(page); - /* - * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. We also ensure the freepage_migratetype - * is set to CMA so it is returned to the correct freelist in case - * the page ends up being not actually allocated from the pcp lists. - */ - if (is_migrate_cma(fallback_type)) - return fallback_type; - /* Take ownership for orders >= pageblock_order */ if (current_order >= pageblock_order) { change_pageblock_range(page, current_order, start_type); - return start_type; + return; } if (current_order >= pageblock_order / 2 || start_type == MIGRATE_RECLAIMABLE || + start_type == MIGRATE_UNMOVABLE || page_group_by_mobility_disabled) { int pages; @@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Claim the whole block if over half of it is free */ if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) { - + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_type); - return start_type; - } - } - - return fallback_type; } /* Remove an element from the buddy allocator from the fallback list */ @@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct free_area *area; unsigned int current_order; struct page *page; - int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order && current_order <= MAX_ORDER-1; --current_order) { + int i; for (i = 0;; i++) { - migratetype = fallbacks[start_migratetype][i]; + int migratetype = fallbacks[start_migratetype][i]; + int buddy_type = start_migratetype; /* MIGRATE_RESERVE handled later if necessary */ if (migratetype == MIGRATE_RESERVE) @@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) struct page, lru); area->nr_free--; - new_type = try_to_steal_freepages(zone, page, - start_migratetype, - migratetype); + if (!is_migrate_cma(migratetype)) { + try_to_steal_freepages(zone, page, + start_migratetype, + migratetype); + } else { + /* + * When borrowing from MIGRATE_CMA, we need to + * release the excess buddy pages to CMA + * itself, and we do not try to steal extra + * free pages. + */ + buddy_type = migratetype; + } /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); expand(zone, page, order, current_order, area, - new_type); - /* The freepage_migratetype may differ from pageblock's + buddy_type); + + /* + * The freepage_migratetype may differ from pageblock's * migratetype depending on the decisions in - * try_to_steal_freepages. This is OK as long as it does - * not differ for MIGRATE_CMA type. + * try_to_steal_freepages(). This is OK as long as it + * does not differ for MIGRATE_CMA pageblocks. For CMA + * we need to make sure unallocated pages flushed from + * pcp lists are returned to the correct freelist. */ - set_freepage_migratetype(page, new_type); + set_freepage_migratetype(page, buddy_type); trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype, new_type); + start_migratetype, migratetype); return page; } @@ -1642,9 +1642,7 @@ int split_free_page(struct page *page) } /* - * Really, prep_compound_page() should be called from __rmqueue_bulk(). But - * we cheat by calling it from here, in the order > 0 path. Saves a branch - * or two. + * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, @@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, struct page *page; bool cold = ((gfp_flags & __GFP_COLD) != 0); -again: if (likely(order == 0)) { struct per_cpu_pages *pcp; struct list_head *list; @@ -1711,8 +1708,6 @@ again: local_irq_restore(flags); VM_BUG_ON_PAGE(bad_range(zone, page), page); - if (prep_new_page(page, order, gfp_flags)) - goto again; return page; failed: @@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone) * a page. */ static struct page * -get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, - struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int classzone_idx, int migratetype) +get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + const struct alloc_context *ac) { + struct zonelist *zonelist = ac->zonelist; struct zoneref *z; struct page *page = NULL; struct zone *zone; @@ -2055,8 +2050,8 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, - high_zoneidx, nodemask) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + ac->nodemask) { unsigned long mark; if (IS_ENABLED(CONFIG_NUMA) && zlc_active && @@ -2073,7 +2068,7 @@ zonelist_scan: * time the page has in memory before being reclaimed. */ if (alloc_flags & ALLOC_FAIR) { - if (!zone_local(preferred_zone, zone)) + if (!zone_local(ac->preferred_zone, zone)) break; if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { nr_fair_skipped++; @@ -2111,7 +2106,7 @@ zonelist_scan: mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { + ac->classzone_idx, alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ @@ -2132,7 +2127,7 @@ zonelist_scan: } if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zone, zone)) goto this_zone_full; /* @@ -2154,7 +2149,7 @@ zonelist_scan: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) + ac->classzone_idx, alloc_flags)) goto try_this_zone; /* @@ -2175,27 +2170,18 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(preferred_zone, zone, order, - gfp_mask, migratetype); - if (page) - break; + page = buffered_rmqueue(ac->preferred_zone, zone, order, + gfp_mask, ac->migratetype); + if (page) { + if (prep_new_page(page, order, gfp_mask, alloc_flags)) + goto try_this_zone; + return page; + } this_zone_full: if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } - if (page) { - /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was - * necessary to allocate the page. The expectation is - * that the caller is taking steps that will free more - * memory. The caller should avoid the page being used - * for !PFMEMALLOC purposes. - */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); - return page; - } - /* * The first pass makes sure allocations are spread fairly within the * local node. However, the local node might have free pages left @@ -2208,7 +2194,7 @@ this_zone_full: alloc_flags &= ~ALLOC_FAIR; if (nr_fair_skipped) { zonelist_rescan = true; - reset_alloc_batches(preferred_zone); + reset_alloc_batches(ac->preferred_zone); } if (nr_online_nodes > 1) zonelist_rescan = true; @@ -2330,44 +2316,44 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page; - /* Acquire the per-zone oom lock for each zone */ - if (!oom_zonelist_trylock(zonelist, gfp_mask)) { - schedule_timeout_uninterruptible(1); - return NULL; - } + *did_some_progress = 0; /* - * PM-freezer should be notified that there might be an OOM killer on - * its way to kill and wake somebody up. This is too early and we might - * end up not killing anything but false positives are acceptable. - * See freeze_processes. + * Acquire the per-zone oom lock for each zone. If that + * fails, somebody else is making progress for us. */ - note_oom_kill(); + if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { + *did_some_progress = 1; + schedule_timeout_uninterruptible(1); + return NULL; + } /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if * we're still under heavy pressure. */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, - order, zonelist, high_zoneidx, - ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order, + ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac); if (page) goto out; if (!(gfp_mask & __GFP_NOFAIL)) { + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ - if (high_zoneidx < ZONE_NORMAL) + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + /* The OOM killer does not compensate for light reclaim */ + if (!(gfp_mask & __GFP_FS)) goto out; /* * GFP_THISNODE contains __GFP_NORETRY and we never hit this. @@ -2380,10 +2366,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order, nodemask, false); - + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + *did_some_progress = 1; out: - oom_zonelist_unlock(zonelist, gfp_mask); + oom_zonelist_unlock(ac->zonelist, gfp_mask); return page; } @@ -2391,10 +2377,9 @@ out: /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, enum migrate_mode mode, - int *contended_compaction, bool *deferred_compaction) + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended_compaction, + bool *deferred_compaction) { unsigned long compact_result; struct page *page; @@ -2403,10 +2388,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, mode, - contended_compaction, - alloc_flags, classzone_idx); + compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, contended_compaction); current->flags &= ~PF_MEMALLOC; switch (compact_result) { @@ -2425,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, nodemask, - order, zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); if (page) { struct zone *zone = page_zone(page); @@ -2452,10 +2433,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, enum migrate_mode mode, - int *contended_compaction, bool *deferred_compaction) + int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended_compaction, + bool *deferred_compaction) { return NULL; } @@ -2463,8 +2443,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, /* Perform direct synchronous page reclaim */ static int -__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, - nodemask_t *nodemask) +__perform_reclaim(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac) { struct reclaim_state reclaim_state; int progress; @@ -2478,7 +2458,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; - progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); + progress = try_to_free_pages(ac->zonelist, order, gfp_mask, + ac->nodemask); current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -2492,28 +2473,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, unsigned long *did_some_progress) + int alloc_flags, const struct alloc_context *ac, + unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; - *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, - nodemask); + *did_some_progress = __perform_reclaim(gfp_mask, order, ac); if (unlikely(!(*did_some_progress))) return NULL; /* After successful reclaim, reconsider all zones for allocation */ if (IS_ENABLED(CONFIG_NUMA)) - zlc_clear_zones_full(zonelist); + zlc_clear_zones_full(ac->zonelist); retry: - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, - migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); /* * If an allocation failed after direct reclaim, it could be because @@ -2534,36 +2510,30 @@ retry: */ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + const struct alloc_context *ac) { struct page *page; do { - page = get_page_from_freelist(gfp_mask, nodemask, order, - zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + ALLOC_NO_WATERMARKS, ac); if (!page && gfp_mask & __GFP_NOFAIL) - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, + HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; } -static void wake_all_kswapds(unsigned int order, - struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone, - nodemask_t *nodemask) +static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist_nodemask(zone, z, zonelist, - high_zoneidx, nodemask) - wakeup_kswapd(zone, order, zone_idx(preferred_zone)); + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, + ac->high_zoneidx, ac->nodemask) + wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); } static inline int @@ -2622,9 +2592,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, enum zone_type high_zoneidx, - nodemask_t *nodemask, struct zone *preferred_zone, - int classzone_idx, int migratetype) + struct alloc_context *ac) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -2658,10 +2626,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; -restart: +retry: if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapds(order, zonelist, high_zoneidx, - preferred_zone, nodemask); + wake_all_kswapds(order, ac); /* * OK, we're below the kswapd watermark and have kicked background @@ -2674,18 +2641,16 @@ restart: * Find the true preferred zone if the allocation is unconstrained by * cpusets. */ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, - NULL, &preferred_zone); - classzone_idx = zonelist_zone_idx(preferred_zoneref); + preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, NULL, &ac->preferred_zone); + ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); } -rebalance: /* This is the last chance, in general, before the goto nopage. */ - page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, - high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); + page = get_page_from_freelist(gfp_mask, order, + alloc_flags & ~ALLOC_NO_WATERMARKS, ac); if (page) goto got_pg; @@ -2696,11 +2661,10 @@ rebalance: * the allocation is high priority and these type of * allocations are system rather than user orientated */ - zonelist = node_zonelist(numa_node_id(), gfp_mask); + ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); + + page = __alloc_pages_high_priority(gfp_mask, order, ac); - page = __alloc_pages_high_priority(gfp_mask, order, - zonelist, high_zoneidx, nodemask, - preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2729,11 +2693,9 @@ rebalance: * Try direct compaction. The first pass is asynchronous. Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, - high_zoneidx, nodemask, alloc_flags, - preferred_zone, - classzone_idx, migratetype, - migration_mode, &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, + migration_mode, + &contended_compaction, &deferred_compaction); if (page) goto got_pg; @@ -2779,74 +2741,40 @@ rebalance: migration_mode = MIGRATE_SYNC_LIGHT; /* Try direct reclaim and then allocating */ - page = __alloc_pages_direct_reclaim(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - classzone_idx, migratetype, - &did_some_progress); + page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, + &did_some_progress); if (page) goto got_pg; - /* - * If we failed to make any progress reclaiming, then we are - * running out of options and have to consider going OOM - */ - if (!did_some_progress) { - if (oom_gfp_allowed(gfp_mask)) { - if (oom_killer_disabled) - goto nopage; - /* Coredumps can quickly deplete all memory reserves */ - if ((current->flags & PF_DUMPCORE) && - !(gfp_mask & __GFP_NOFAIL)) - goto nopage; - page = __alloc_pages_may_oom(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, preferred_zone, - classzone_idx, migratetype); - if (page) - goto got_pg; - - if (!(gfp_mask & __GFP_NOFAIL)) { - /* - * The oom killer is not called for high-order - * allocations that may fail, so if no progress - * is being made, there are no other options and - * retrying is unlikely to help. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto nopage; - /* - * The oom killer is not called for lowmem - * allocations to prevent needlessly killing - * innocent tasks. - */ - if (high_zoneidx < ZONE_NORMAL) - goto nopage; - } - - goto restart; - } - } - /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, did_some_progress, pages_reclaimed)) { + /* + * If we fail to make progress by freeing individual + * pages, but the allocation wants us to keep going, + * start OOM killing tasks. + */ + if (!did_some_progress) { + page = __alloc_pages_may_oom(gfp_mask, order, ac, + &did_some_progress); + if (page) + goto got_pg; + if (!did_some_progress) + goto nopage; + } /* Wait for some write requests to complete then retry */ - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); - goto rebalance; + wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + goto retry; } else { /* * High-order allocations do not necessarily loop after * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, - high_zoneidx, nodemask, alloc_flags, - preferred_zone, - classzone_idx, migratetype, - migration_mode, &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, + alloc_flags, ac, migration_mode, + &contended_compaction, &deferred_compaction); if (page) goto got_pg; @@ -2854,11 +2782,7 @@ rebalance: nopage: warn_alloc_failed(gfp_mask, order, NULL); - return page; got_pg: - if (kmemcheck_enabled) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); - return page; } @@ -2869,14 +2793,16 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { - enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zone *preferred_zone; struct zoneref *preferred_zoneref; struct page *page = NULL; - int migratetype = gfpflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - int classzone_idx; + gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + struct alloc_context ac = { + .high_zoneidx = gfp_zone(gfp_mask), + .nodemask = nodemask, + .migratetype = gfpflags_to_migratetype(gfp_mask), + }; gfp_mask &= gfp_allowed_mask; @@ -2895,37 +2821,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) + if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); + /* We set it here, as __alloc_pages_slowpath might have changed it */ + ac.zonelist = zonelist; /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, - nodemask ? : &cpuset_current_mems_allowed, - &preferred_zone); - if (!preferred_zone) + preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, + ac.nodemask ? : &cpuset_current_mems_allowed, + &ac.preferred_zone); + if (!ac.preferred_zone) goto out; - classzone_idx = zonelist_zone_idx(preferred_zoneref); + ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, - zonelist, high_zoneidx, alloc_flags, - preferred_zone, classzone_idx, migratetype); + alloc_mask = gfp_mask|__GFP_HARDWALL; + page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (unlikely(!page)) { /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. */ - gfp_mask = memalloc_noio_flags(gfp_mask); - page = __alloc_pages_slowpath(gfp_mask, order, - zonelist, high_zoneidx, nodemask, - preferred_zone, classzone_idx, migratetype); + alloc_mask = memalloc_noio_flags(gfp_mask); + + page = __alloc_pages_slowpath(alloc_mask, order, &ac); } - trace_mm_page_alloc(page, order, gfp_mask, migratetype); + if (kmemcheck_enabled && page) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); + + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); out: /* @@ -3945,18 +3874,29 @@ static int __build_all_zonelists(void *data) return 0; } +static noinline void __init +build_all_zonelists_init(void) +{ + __build_all_zonelists(NULL); + mminit_verify_zonelist(); + cpuset_init_current_mems_allowed(); +} + /* * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. + * + * __ref due to (1) call of __meminit annotated setup_zone_pageset + * [we're only called with non-NULL zone through __meminit paths] and + * (2) call of __init annotated helper build_all_zonelists_init + * [protected by SYSTEM_BOOTING]. */ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { set_zonelist_order(); if (system_state == SYSTEM_BOOTING) { - __build_all_zonelists(NULL); - mminit_verify_zonelist(); - cpuset_init_current_mems_allowed(); + build_all_zonelists_init(); } else { #ifdef CONFIG_MEMORY_HOTPLUG if (zone) @@ -5059,8 +4999,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); - printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, - (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); + pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5432,9 +5372,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) arch_zone_highest_possible_pfn[i]) pr_cont("empty\n"); else - pr_cont("[mem %0#10lx-%0#10lx]\n", - arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT, - (arch_zone_highest_possible_pfn[i] + pr_cont("[mem %#018Lx-%#018Lx]\n", + (u64)arch_zone_lowest_possible_pfn[i] + << PAGE_SHIFT, + ((u64)arch_zone_highest_possible_pfn[i] << PAGE_SHIFT) - 1); } @@ -5442,15 +5383,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) pr_info("Movable zone start for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i]) - pr_info(" Node %d: %#010lx\n", i, - zone_movable_pfn[i] << PAGE_SHIFT); + pr_info(" Node %d: %#018Lx\n", i, + (u64)zone_movable_pfn[i] << PAGE_SHIFT); } /* Print out the early node map */ pr_info("Early memory node ranges\n"); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid, - start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1); + pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, + (u64)start_pfn << PAGE_SHIFT, + ((u64)end_pfn << PAGE_SHIFT) - 1); /* Initialise every node */ mminit_verify_pageflags_layout(); |