Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 670
1 file changed, 410 insertions, 260 deletions
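Note: the first group of hunks below rewrites the buddy-merging arithmetic in __free_one_page() in terms of pfns (__find_buddy_pfn(), combined_pfn = buddy_pfn & pfn) and hoists the pfn_valid_within() check out of page_is_buddy() into its callers. The standalone sketch below illustrates the arithmetic those hunks rely on; the XOR definition of find_buddy_pfn() is an assumption here, since the body of __find_buddy_pfn() is not part of this diff.

/* Sketch of the buddy-pfn arithmetic used by the __free_one_page() hunks.
 * find_buddy_pfn() is a stand-in for the kernel's __find_buddy_pfn(); the
 * XOR definition is an assumption, not taken from this diff. */
#include <stdio.h>

static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
{
	return pfn ^ (1UL << order);	/* flip the bit that selects the buddy */
}

int main(void)
{
	unsigned long pfn = 1028;	/* arbitrary example pfn */
	unsigned int order;

	for (order = 0; order < 4; order++) {
		unsigned long buddy_pfn = find_buddy_pfn(pfn, order);
		/* the merged (combined) block starts at the lower of the two */
		unsigned long combined_pfn = buddy_pfn & pfn;

		printf("order %u: pfn %lu buddy %lu combined %lu\n",
		       order, pfn, buddy_pfn, combined_pfn);
		pfn = combined_pfn;	/* keep merging upwards */
	}
	return 0;
}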
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f3e0c69a97b7..6cbde310abed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,12 +55,13 @@ #include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> +#include <trace/events/oom.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> -#include <linux/page_ext.h> #include <linux/hugetlb.h> #include <linux/sched/rt.h> +#include <linux/sched/mm.h> #include <linux/page_owner.h> #include <linux/kthread.h> #include <linux/memcontrol.h> @@ -91,6 +92,10 @@ EXPORT_PER_CPU_SYMBOL(_numa_mem_); int _node_numa_mem_[MAX_NUMNODES]; #endif +/* work_structs for global per-cpu drains */ +DEFINE_MUTEX(pcpu_drain_mutex); +DEFINE_PER_CPU(struct work_struct, pcpu_drain); + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; EXPORT_SYMBOL(latent_entropy); @@ -714,7 +719,7 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is not in a hole && + * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. @@ -729,9 +734,6 @@ static inline void rmv_page_order(struct page *page) static inline int page_is_buddy(struct page *page, struct page *buddy, unsigned int order) { - if (!pfn_valid_within(page_to_pfn(buddy))) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { if (page_zone_id(page) != page_zone_id(buddy)) return 0; @@ -787,9 +789,8 @@ static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order, int migratetype) { - unsigned long page_idx; - unsigned long combined_idx; - unsigned long uninitialized_var(buddy_idx); + unsigned long combined_pfn; + unsigned long uninitialized_var(buddy_pfn); struct page *buddy; unsigned int max_order; @@ -802,15 +803,16 @@ static inline void __free_one_page(struct page *page, if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - page_idx = pfn & ((1 << MAX_ORDER) - 1); - - VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); + VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: while (order < max_order - 1) { - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); + + if (!pfn_valid_within(buddy_pfn)) + goto done_merging; if (!page_is_buddy(page, buddy, order)) goto done_merging; /* @@ -824,9 +826,9 @@ continue_merging: zone->free_area[order].nr_free--; rmv_page_order(buddy); } - combined_idx = buddy_idx & page_idx; - page = page + (combined_idx - page_idx); - page_idx = combined_idx; + combined_pfn = buddy_pfn & pfn; + page = page + (combined_pfn - pfn); + pfn = combined_pfn; order++; } if (max_order < MAX_ORDER) { @@ -841,8 +843,8 @@ continue_merging: if (unlikely(has_isolate_pageblock(zone))) { int buddy_mt; - buddy_idx = __find_buddy_index(page_idx, order); - buddy = page + (buddy_idx - page_idx); + buddy_pfn = __find_buddy_pfn(pfn, order); + buddy = page + (buddy_pfn - pfn); buddy_mt = get_pageblock_migratetype(buddy); if (migratetype != buddy_mt @@ -865,13 +867,14 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page 
*/ - if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { struct page *higher_page, *higher_buddy; - combined_idx = buddy_idx & page_idx; - higher_page = page + (combined_idx - page_idx); - buddy_idx = __find_buddy_index(combined_idx, order + 1); - higher_buddy = higher_page + (buddy_idx - combined_idx); - if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); + if (pfn_valid_within(buddy_pfn) && + page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); goto out; @@ -1087,10 +1090,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - unsigned long nr_scanned; + unsigned long nr_scanned, flags; bool isolated_pageblocks; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) @@ -1139,7 +1142,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, trace_mm_page_pcpu_drain(page, 0, mt); } while (--count && --batch_free && !list_empty(list)); } - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void free_one_page(struct zone *zone, @@ -1147,8 +1150,9 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { - unsigned long nr_scanned; - spin_lock(&zone->lock); + unsigned long nr_scanned, flags; + spin_lock_irqsave(&zone->lock, flags); + __count_vm_events(PGFREE, 1 << order); nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED); if (nr_scanned) __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned); @@ -1158,7 +1162,7 @@ static void free_one_page(struct zone *zone, migratetype = get_pfnblock_migratetype(page, pfn); } __free_one_page(page, pfn, zone, order, migratetype); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void __meminit __init_single_page(struct page *page, unsigned long pfn, @@ -1236,7 +1240,6 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) static void __free_pages_ok(struct page *page, unsigned int order) { - unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); @@ -1244,10 +1247,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) return; migratetype = get_pfnblock_migratetype(page, pfn); - local_irq_save(flags); - __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype); - local_irq_restore(flags); } static void __init __free_pages_boot_core(struct page *page, unsigned int order) @@ -2219,8 +2219,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, int migratetype, bool cold) { int i, alloced = 0; + unsigned long flags; - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) @@ -2256,7 +2257,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. 
*/ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); return alloced; } @@ -2341,16 +2342,26 @@ void drain_local_pages(struct zone *zone) drain_pages(cpu); } +static void drain_local_pages_wq(struct work_struct *work) +{ + /* + * drain_all_pages doesn't use proper cpu hotplug protection so + * we can race with cpu offline when the WQ can move this from + * a cpu pinned worker to an unbound one. We can operate on a different + * cpu which is allright but we also have to make sure to not move to + * a different one. + */ + preempt_disable(); + drain_local_pages(NULL); + preempt_enable(); +} + /* * Spill all the per-cpu pages from all CPUs back into the buddy allocator. * * When zone parameter is non-NULL, spill just the single zone's pages. * - * Note that this code is protected against sending an IPI to an offline - * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: - * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but - * nothing keeps CPUs from showing up after we populated the cpumask and - * before the call to on_each_cpu_mask(). + * Note that this can be extremely slow as the draining happens in a workqueue. */ void drain_all_pages(struct zone *zone) { @@ -2362,6 +2373,21 @@ void drain_all_pages(struct zone *zone) */ static cpumask_t cpus_with_pcps; + /* Workqueues cannot recurse */ + if (current->flags & PF_WQ_WORKER) + return; + + /* + * Do not drain if one is already in progress unless it's specific to + * a zone. Such callers are primarily CMA and memory hotplug and need + * the drain to be complete when the call returns. + */ + if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { + if (!zone) + return; + mutex_lock(&pcpu_drain_mutex); + } + /* * We don't care about racing with CPU hotplug event * as offline notification will cause the notified @@ -2392,8 +2418,16 @@ void drain_all_pages(struct zone *zone) else cpumask_clear_cpu(cpu, &cpus_with_pcps); } - on_each_cpu_mask(&cpus_with_pcps, (smp_call_func_t) drain_local_pages, - zone, 1); + + for_each_cpu(cpu, &cpus_with_pcps) { + struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu); + INIT_WORK(work, drain_local_pages_wq); + schedule_work_on(cpu, work); + } + for_each_cpu(cpu, &cpus_with_pcps) + flush_work(per_cpu_ptr(&pcpu_drain, cpu)); + + mutex_unlock(&pcpu_drain_mutex); } #ifdef CONFIG_HIBERNATION @@ -2444,17 +2478,20 @@ void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; - unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; + if (in_interrupt()) { + __free_pages_ok(page, 0); + return; + } + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - local_irq_save(flags); - __count_vm_event(PGFREE); + preempt_disable(); /* * We only track unmovable, reclaimable and movable on pcp lists. 
@@ -2471,6 +2508,7 @@ void free_hot_cold_page(struct page *page, bool cold) migratetype = MIGRATE_MOVABLE; } + __count_vm_event(PGFREE); pcp = &this_cpu_ptr(zone->pageset)->pcp; if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); @@ -2484,7 +2522,7 @@ void free_hot_cold_page(struct page *page, bool cold) } out: - local_irq_restore(flags); + preempt_enable(); } /* @@ -2602,74 +2640,105 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif } +/* Remove page from the per-cpu list, caller must protect the list */ +static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, + bool cold, struct per_cpu_pages *pcp, + struct list_head *list) +{ + struct page *page; + + VM_BUG_ON(in_interrupt()); + + do { + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + return NULL; + } + + if (cold) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); + + list_del(&page->lru); + pcp->count--; + } while (check_new_pcp(page)); + + return page; +} + +/* Lock and remove page from the per-cpu list */ +static struct page *rmqueue_pcplist(struct zone *preferred_zone, + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) +{ + struct per_cpu_pages *pcp; + struct list_head *list; + bool cold = ((gfp_flags & __GFP_COLD) != 0); + struct page *page; + + preempt_disable(); + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + if (page) { + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + zone_statistics(preferred_zone, zone); + } + preempt_enable(); + return page; +} + /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ static inline -struct page *buffered_rmqueue(struct zone *preferred_zone, +struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags, int migratetype) { unsigned long flags; struct page *page; - bool cold = ((gfp_flags & __GFP_COLD) != 0); - if (likely(order == 0)) { - struct per_cpu_pages *pcp; - struct list_head *list; - - local_irq_save(flags); - do { - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } - - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - - list_del(&page->lru); - pcp->count--; + if (likely(order == 0) && !in_interrupt()) { + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype); + goto out; + } - } while (check_new_pcp(page)); - } else { - /* - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with __GFP_NOFAIL. - */ - WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. 
+ */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); + spin_lock_irqsave(&zone->lock, flags); - do { - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); - } while (page && check_new_pages(page, order)); - spin_unlock(&zone->lock); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } if (!page) - goto failed; - __mod_zone_freepage_state(zone, -(1 << order), - get_pcppage_migratetype(page)); - } + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); + spin_unlock(&zone->lock); + if (!page) + goto failed; + __mod_zone_freepage_state(zone, -(1 << order), + get_pcppage_migratetype(page)); __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - VM_BUG_ON_PAGE(bad_range(zone, page), page); +out: + VM_BUG_ON_PAGE(page && bad_range(zone, page), page); return page; failed: @@ -2877,7 +2946,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, #ifdef CONFIG_NUMA static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -2974,7 +3043,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } try_this_zone: - page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, + page = rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { prep_new_page(page, order, gfp_mask, alloc_flags); @@ -3007,18 +3076,12 @@ static inline bool should_suppress_show_mem(void) return ret; } -static DEFINE_RATELIMIT_STATE(nopage_rs, - DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - -void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) +static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) { unsigned int filter = SHOW_MEM_FILTER_NODES; - struct va_format vaf; - va_list args; + static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs)) return; /* @@ -3033,6 +3096,20 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; + show_mem(filter, nodemask); +} + +void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) + return; + pr_warn("%s: ", current->comm); va_start(args, fmt); @@ -3041,11 +3118,36 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...) 
pr_cont("%pV", &vaf); va_end(args); - pr_cont(", mode:%#x(%pGg)\n", gfp_mask, &gfp_mask); + pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); + if (nodemask) + pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); + else + pr_cont("(null)\n"); + + cpuset_print_current_mems_allowed(); dump_stack(); - if (!should_suppress_show_mem()) - show_mem(filter); + warn_alloc_show_mem(gfp_mask, nodemask); +} + +static inline struct page * +__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order, + unsigned int alloc_flags, + const struct alloc_context *ac) +{ + struct page *page; + + page = get_page_from_freelist(gfp_mask, order, + alloc_flags|ALLOC_CPUSET, ac); + /* + * fallback to ignore cpuset restriction if our nodes + * are depleted + */ + if (!page) + page = get_page_from_freelist(gfp_mask, order, + alloc_flags, ac); + + return page; } static inline struct page * @@ -3083,47 +3185,42 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - if (!(gfp_mask & __GFP_NOFAIL)) { - /* Coredumps can quickly deplete all memory reserves */ - if (current->flags & PF_DUMPCORE) - goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) - goto out; - /* The OOM killer does not needlessly kill tasks for lowmem */ - if (ac->high_zoneidx < ZONE_NORMAL) - goto out; - if (pm_suspended_storage()) - goto out; - /* - * XXX: GFP_NOFS allocations should rather fail than rely on - * other request to make a forward progress. - * We are in an unfortunate situation where out_of_memory cannot - * do much for this context but let's try it to at least get - * access to memory reserved if the current task is killed (see - * out_of_memory). Once filesystems are ready to handle allocation - * failures more gracefully we should just bail out here. - */ + /* Coredumps can quickly deplete all memory reserves */ + if (current->flags & PF_DUMPCORE) + goto out; + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (ac->high_zoneidx < ZONE_NORMAL) + goto out; + if (pm_suspended_storage()) + goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. 
+ */ + + /* The OOM killer may not free memory on a specific node */ + if (gfp_mask & __GFP_THISNODE) + goto out; - /* The OOM killer may not free memory on a specific node */ - if (gfp_mask & __GFP_THISNODE) - goto out; - } /* Exhausted what can be done so it's blamo time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; - if (gfp_mask & __GFP_NOFAIL) { - page = get_page_from_freelist(gfp_mask, order, - ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac); - /* - * fallback to ignore cpuset restriction if our nodes - * are depleted - */ - if (!page) - page = get_page_from_freelist(gfp_mask, order, + /* + * Help non-failing allocations by giving them access to memory + * reserves + */ + if (gfp_mask & __GFP_NOFAIL) + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); - } } out: mutex_unlock(&oom_lock); @@ -3192,6 +3289,9 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, { int max_retries = MAX_COMPACT_RETRIES; int min_priority; + bool ret = false; + int retries = *compaction_retries; + enum compact_priority priority = *compact_priority; if (!order) return false; @@ -3213,8 +3313,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, * But do not retry if the given zonelist is not suitable for * compaction. */ - if (compaction_withdrawn(compact_result)) - return compaction_zonelist_suitable(ac, order, alloc_flags); + if (compaction_withdrawn(compact_result)) { + ret = compaction_zonelist_suitable(ac, order, alloc_flags); + goto out; + } /* * !costly requests are much more important than __GFP_REPEAT @@ -3226,8 +3328,10 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, */ if (order > PAGE_ALLOC_COSTLY_ORDER) max_retries /= 4; - if (*compaction_retries <= max_retries) - return true; + if (*compaction_retries <= max_retries) { + ret = true; + goto out; + } /* * Make sure there are attempts at the highest priority if we exhausted @@ -3236,12 +3340,15 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, check_priority: min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ? MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY; + if (*compact_priority > min_priority) { (*compact_priority)--; *compaction_retries = 0; - return true; + ret = true; } - return false; +out: + trace_compact_retry(order, priority, compact_result, retries, max_retries, ret); + return ret; } #else static inline struct page * @@ -3464,6 +3571,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, ac->nodemask) { unsigned long available; unsigned long reclaimable; + unsigned long min_wmark = min_wmark_pages(zone); + bool wmark; available = reclaimable = zone_reclaimable_pages(zone); available -= DIV_ROUND_UP((*no_progress_loops) * available, @@ -3474,8 +3583,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Would the allocation succeed if we reclaimed the whole * available? 
*/ - if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac_classzone_idx(ac), alloc_flags, available)) { + wmark = __zone_watermark_ok(zone, order, min_wmark, + ac_classzone_idx(ac), alloc_flags, available); + trace_reclaim_retry_zone(z, order, reclaimable, + available, min_wmark, *no_progress_loops, wmark); + if (wmark) { /* * If we didn't make any progress and have a lot of * dirty + writeback pages then we should wait for @@ -3555,6 +3667,14 @@ retry_cpuset: no_progress_loops = 0; compact_priority = DEF_COMPACT_PRIORITY; cpuset_mems_cookie = read_mems_allowed_begin(); + + /* + * The fast path uses conservative alloc_flags to succeed only until + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. + */ + alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* * We need to recalculate the starting point for the zonelist iterator * because we might have used different nodemask in the fast path, or @@ -3566,14 +3686,6 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; - - /* - * The fast path uses conservative alloc_flags to succeed only until - * kswapd needs to be woken up, and to avoid the cost of setting up - * alloc_flags precisely. So we do that now. - */ - alloc_flags = gfp_to_alloc_flags(gfp_mask); - if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3650,35 +3762,21 @@ retry: goto got_pg; /* Caller is not willing to reclaim, we can't balance anything */ - if (!can_direct_reclaim) { - /* - * All existing users of the __GFP_NOFAIL are blockable, so warn - * of any new users that actually allow this type of allocation - * to fail. - */ - WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL); + if (!can_direct_reclaim) goto nopage; - } - /* Avoid recursion of direct reclaim */ - if (current->flags & PF_MEMALLOC) { - /* - * __GFP_NOFAIL request from this context is rather bizarre - * because we cannot reclaim anything and only can loop waiting - * for somebody to do a work for us. 
- */ - if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { - cond_resched(); - goto retry; - } - goto nopage; + /* Make sure we know about allocations which stall for too long */ + if (time_after(jiffies, alloc_start + stall_timeout)) { + warn_alloc(gfp_mask, ac->nodemask, + "page allocation stalls for %ums, order:%u", + jiffies_to_msecs(jiffies-alloc_start), order); + stall_timeout += 10 * HZ; } - /* Avoid allocations with no watermarks from looping endlessly */ - if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + /* Avoid recursion of direct reclaim */ + if (current->flags & PF_MEMALLOC) goto nopage; - /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, &did_some_progress); @@ -3702,14 +3800,6 @@ retry: if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; @@ -3738,6 +3828,10 @@ retry: if (page) goto got_pg; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE)) + goto nopage; + /* Retry as long as the OOM killer is making progress */ if (did_some_progress) { no_progress_loops = 0; @@ -3755,82 +3849,123 @@ nopage: if (read_mems_allowed_retry(cpuset_mems_cookie)) goto retry_cpuset; - warn_alloc(gfp_mask, + /* + * Make sure that __GFP_NOFAIL request doesn't leak out and make sure + * we always retry + */ + if (gfp_mask & __GFP_NOFAIL) { + /* + * All existing users of the __GFP_NOFAIL are blockable, so warn + * of any new users that actually require GFP_NOWAIT + */ + if (WARN_ON_ONCE(!can_direct_reclaim)) + goto fail; + + /* + * PF_MEMALLOC request from this context is rather bizarre + * because we cannot reclaim anything and only can loop waiting + * for somebody to do a work for us + */ + WARN_ON_ONCE(current->flags & PF_MEMALLOC); + + /* + * non failing costly orders are a hard requirement which we + * are not prepared for much so let's warn about these users + * so that we can identify them and convert them to something + * else. + */ + WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER); + + /* + * Help non-failing allocations by giving them access to memory + * reserves but do not use ALLOC_NO_WATERMARKS because this + * could deplete whole memory reserves which would just make + * the situation worse + */ + page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); + if (page) + goto got_pg; + + cond_resched(); + goto retry; + } +fail: + warn_alloc(gfp_mask, ac->nodemask, "page allocation failure: order:%u", order); got_pg: return page; } -/* - * This is the 'heart' of the zoned buddy allocator. 
- */ -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) +static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask, + struct alloc_context *ac, gfp_t *alloc_mask, + unsigned int *alloc_flags) { - struct page *page; - unsigned int alloc_flags = ALLOC_WMARK_LOW; - gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ - struct alloc_context ac = { - .high_zoneidx = gfp_zone(gfp_mask), - .zonelist = zonelist, - .nodemask = nodemask, - .migratetype = gfpflags_to_migratetype(gfp_mask), - }; + ac->high_zoneidx = gfp_zone(gfp_mask); + ac->zonelist = zonelist; + ac->nodemask = nodemask; + ac->migratetype = gfpflags_to_migratetype(gfp_mask); if (cpusets_enabled()) { - alloc_mask |= __GFP_HARDWALL; - alloc_flags |= ALLOC_CPUSET; - if (!ac.nodemask) - ac.nodemask = &cpuset_current_mems_allowed; + *alloc_mask |= __GFP_HARDWALL; + if (!ac->nodemask) + ac->nodemask = &cpuset_current_mems_allowed; + else + *alloc_flags |= ALLOC_CPUSET; } - gfp_mask &= gfp_allowed_mask; - lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); if (should_fail_alloc_page(gfp_mask, order)) - return NULL; + return false; - /* - * Check the zones suitable for the gfp_mask contain at least one - * valid zone. It's possible to have an empty zonelist as a result - * of __GFP_THISNODE and a memoryless node - */ - if (unlikely(!zonelist->_zonerefs->zone)) - return NULL; + if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) + *alloc_flags |= ALLOC_CMA; - if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; + return true; +} +/* Determine whether to spread dirty pages and what the first usable zone */ +static inline void finalise_ac(gfp_t gfp_mask, + unsigned int order, struct alloc_context *ac) +{ /* Dirty zone balancing only done in the fast path */ - ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); + ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* * The preferred zone is used for statistics but crucially it is * also used as the starting point for the zonelist iterator. It * may get reset for allocations that ignore memory policies. */ - ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, - ac.high_zoneidx, ac.nodemask); - if (!ac.preferred_zoneref->zone) { - page = NULL; - /* - * This might be due to race with cpuset_current_mems_allowed - * update, so make sure we retry with original nodemask in the - * slow path. - */ - goto no_zone; - } + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); +} + +/* + * This is the 'heart' of the zoned buddy allocator. + */ +struct page * +__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, nodemask_t *nodemask) +{ + struct page *page; + unsigned int alloc_flags = ALLOC_WMARK_LOW; + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ + struct alloc_context ac = { }; + + gfp_mask &= gfp_allowed_mask; + if (!prepare_alloc_pages(gfp_mask, order, zonelist, nodemask, &ac, &alloc_mask, &alloc_flags)) + return NULL; + + finalise_ac(gfp_mask, order, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) goto out; -no_zone: /* * Runtime PM, block IO and its error handling path can deadlock * because I/O on the device might not complete. 
@@ -4252,20 +4387,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) * Determine whether the node should be displayed or not, depending on whether * SHOW_MEM_FILTER_NODES was passed to show_free_areas(). */ -bool skip_free_areas_node(unsigned int flags, int nid) +static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask) { - bool ret = false; - unsigned int cpuset_mems_cookie; - if (!(flags & SHOW_MEM_FILTER_NODES)) - goto out; + return false; - do { - cpuset_mems_cookie = read_mems_allowed_begin(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (read_mems_allowed_retry(cpuset_mems_cookie)); -out: - return ret; + /* + * no node mask - aka implicit memory numa policy. Do not bother with + * the synchronization - read_mems_allowed_begin - because we do not + * have to be precise here. + */ + if (!nodemask) + nodemask = &cpuset_current_mems_allowed; + + return !node_isset(nid, *nodemask); } #define K(x) ((x) << (PAGE_SHIFT-10)) @@ -4306,7 +4441,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter) +void show_free_areas(unsigned int filter, nodemask_t *nodemask) { unsigned long free_pcp = 0; int cpu; @@ -4314,7 +4449,7 @@ void show_free_areas(unsigned int filter) pg_data_t *pgdat; for_each_populated_zone(zone) { - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; for_each_online_cpu(cpu) @@ -4348,6 +4483,9 @@ void show_free_areas(unsigned int filter) global_page_state(NR_FREE_CMA_PAGES)); for_each_online_pgdat(pgdat) { + if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) + continue; + printk("Node %d" " active_anon:%lukB" " inactive_anon:%lukB" @@ -4397,7 +4535,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { int i; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; free_pcp = 0; @@ -4462,7 +4600,7 @@ void show_free_areas(unsigned int filter) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; - if (skip_free_areas_node(filter, zone_to_nid(zone))) + if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); printk(KERN_CONT "%s: ", zone->name); @@ -5083,8 +5221,17 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) + if (!early_pfn_valid(pfn)) { +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * Skip to the pfn preceding the next valid one (or + * end_pfn), such that we hit a valid pfn (or end_pfn) + * on our next iteration of the loop. + */ + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; +#endif continue; + } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) @@ -5780,7 +5927,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * the zone and SPARSEMEM is in use. If there are holes within the * zone, each populated memory region may cost us one or two extra * memmap pages due to alignment because memmap pages for each - * populated regions may not naturally algined on page boundary. + * populated regions may not be naturally aligned on page boundary. * So the (present_pages >> 4) heuristic is a tradeoff for that. 
*/ if (spanned_pages > present_pages + (present_pages >> 4) && @@ -6344,8 +6491,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) start_pfn = end_pfn; } - arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; - arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); @@ -7081,8 +7226,9 @@ void *__init alloc_large_system_hash(const char *tablename, * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that - * MIGRATE_MOVABLE block might include unmovable pages. It means you can't - * expect this function should be exact. + * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable + * check without lock_page also may miss some movable non-lru pages at + * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages) @@ -7138,6 +7284,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (skip_hwpoisoned_pages && PageHWPoison(page)) continue; + if (__PageMovable(page)) + continue; + if (!PageLRU(page)) found++; /* @@ -7249,6 +7398,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks * in range must have the same migratetype and it must * be either of the two. + * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES * aligned, however it's the caller's responsibility to guarantee that @@ -7262,7 +7412,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * need to be freed with free_contig_range(). */ int alloc_contig_range(unsigned long start, unsigned long end, - unsigned migratetype) + unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; unsigned int order; @@ -7274,7 +7424,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = GFP_KERNEL, + .gfp_mask = memalloc_noio_flags(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); |
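A note on the memmap_init_zone() hunk above: it skips holes by setting pfn to memblock_next_valid_pfn(pfn, end_pfn) - 1, so that the loop's own pfn++ lands exactly on the next valid pfn (or on end_pfn). The minimal userspace sketch below shows that "jump to the predecessor of the next valid index" pattern; next_valid() and is_valid() are hypothetical stand-ins for the memblock helpers, not taken from this diff.

/* Sketch of the "skip to the index preceding the next valid one" pattern
 * from the memmap_init_zone() hunk; next_valid()/is_valid() are hypothetical
 * helpers over a small sorted array standing in for memblock data. */
#include <stdio.h>

static const unsigned long valid[] = { 3, 4, 5, 42, 43, 44, 90 };
#define NVALID (sizeof(valid) / sizeof(valid[0]))

/* Return the smallest valid index greater than idx, or end if none. */
static unsigned long next_valid(unsigned long idx, unsigned long end)
{
	for (unsigned long i = 0; i < NVALID; i++)
		if (valid[i] > idx && valid[i] < end)
			return valid[i];
	return end;
}

static int is_valid(unsigned long idx)
{
	for (unsigned long i = 0; i < NVALID; i++)
		if (valid[i] == idx)
			return 1;
	return 0;
}

int main(void)
{
	unsigned long end = 100;

	for (unsigned long idx = 0; idx < end; idx++) {
		if (!is_valid(idx)) {
			/* Jump to the index preceding the next valid one (or
			 * end), so the loop's idx++ lands exactly on it. */
			idx = next_valid(idx, end) - 1;
			continue;
		}
		printf("initialising %lu\n", idx);
	}
	return 0;
}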