diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 325 |
1 files changed, 177 insertions, 148 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44030096da63..009ac285fea7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -51,7 +51,6 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> -#include <linux/memory.h> #include <linux/compaction.h> #include <trace/events/kmem.h> #include <linux/ftrace_event.h> @@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -static void set_pageblock_migratetype(struct page *page, int migratetype) +/* + * NOTE: + * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. + * Instead, use {un}set_pageblock_isolate. + */ +void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled)) @@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone, return pages_moved; } -static int move_freepages_block(struct zone *zone, struct page *page, +int move_freepages_block(struct zone *zone, struct page *page, int migratetype) { unsigned long start_pfn, end_pfn; @@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->batch; else to_drain = pcp->count; - free_pcppages_bulk(zone, to_drain, pcp); - pcp->count -= to_drain; + if (to_drain > 0) { + free_pcppages_bulk(zone, to_drain, pcp); + pcp->count -= to_drain; + } local_irq_restore(flags); } #endif @@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str) } __setup("fail_page_alloc=", setup_fail_page_alloc); -static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { if (order < fail_page_alloc.min_order) - return 0; + return false; if (gfp_mask & __GFP_NOFAIL) - return 0; + return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) - return 0; + return false; if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) - return 0; + return false; return should_fail(&fail_page_alloc.attr, 1 << order); } @@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs); #else /* CONFIG_FAIL_PAGE_ALLOC */ -static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { - return 0; + return false; } #endif /* CONFIG_FAIL_PAGE_ALLOC */ @@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, { /* free_pages my go negative - that's OK */ long min = mark; + long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; free_pages -= (1 << order) - 1; @@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, if (alloc_flags & ALLOC_HARDER) min -= min / 4; - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } +#ifdef CONFIG_MEMORY_ISOLATION +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) +{ + if (unlikely(zone->nr_pageblock_isolate)) + return zone->nr_pageblock_isolate * pageblock_nr_pages; + return 0; +} +#else +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) +{ + return 0; +} +#endif + bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { @@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + /* + * If the zone has MIGRATE_ISOLATE type free pages, we should consider + * it. nr_zone_isolate_freepages is never accurate so kswapd might not + * sleep although it could do so. But this is more desirable for memory + * hotplug than sleeping which can cause a livelock in the direct + * reclaim path. + */ + free_pages -= nr_zone_isolate_freepages(z); return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, free_pages); } @@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, - alloc_flags, preferred_zone, - migratetype); + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); if (page) { preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; @@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, - alloc_flags, preferred_zone, - migratetype); + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { - if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) + if (gfp_mask & __GFP_MEMALLOC) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } return alloc_flags; } +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2340,11 +2378,27 @@ rebalance: /* Allocate without watermarks if the context allows */ if (alloc_flags & ALLOC_NO_WATERMARKS) { + /* + * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds + * the allocation is high priority and these type of + * allocations are system rather than user orientated + */ + zonelist = node_zonelist(numa_node_id(), gfp_mask); + page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - if (page) + if (page) { + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was + * necessary to allocate the page. The expectation is + * that the caller is taking steps that will free more + * memory. The caller should avoid the page being used + * for !PFMEMALLOC purposes. + */ + page->pfmemalloc = true; goto got_pg; + } } /* Atomic allocations - we can't balance anything */ @@ -2463,8 +2517,8 @@ nopage: got_pg: if (kmemcheck_enabled) kmemcheck_pagealloc_alloc(page, order, gfp_mask); - return page; + return page; } /* @@ -2515,6 +2569,8 @@ retry_cpuset: page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + else + page->pfmemalloc = false; trace_mm_page_alloc(page, order, gfp_mask, migratetype); @@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } } @@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone); DEFINE_MUTEX(zonelists_mutex); /* return values int ....just for stop_machine() */ -static __init_refok int __build_all_zonelists(void *data) +static int __build_all_zonelists(void *data) { int nid; int cpu; + pg_data_t *self = data; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif + + if (self && !node_online(self->node_id)) { + build_zonelists(self); + build_zonelist_cache(self); + } + for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. */ -void __ref build_all_zonelists(void *data) +void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { set_zonelist_order(); @@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data) /* we have to stop all cpus to guarantee there is no user of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG - if (data) - setup_zone_pageset((struct zone *)data); + if (zone) + setup_zone_pageset(zone); #endif - stop_machine(__build_all_zonelists, NULL, NULL); + stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int zone_batchsize(struct zone *zone) +static int __meminit zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } -static void setup_zone_pageset(struct zone *zone) +static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; @@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) return 0; } -static int __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - free_pcppages_bulk(zone, pcp->count, pcp); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - -void zone_pcp_update(struct zone *zone) -{ - stop_machine(__zone_pcp_update, zone, NULL); -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone) zone_batchsize(zone)); } -__meminit int init_currently_empty_zone(struct zone *zone, +int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size, enum memmap_context context) @@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(void) +void __init set_pageblock_order(void) { unsigned int order; @@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -static inline void set_pageblock_order(void) +void __init set_pageblock_order(void) { } @@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void) * - mark all pages reserved * - mark all memory queues empty * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) @@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, int ret; pgdat_resize_init(pgdat); - pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); - pgdat->kswapd_max_order = 0; + init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_cgroup_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { @@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + zone->compact_cached_free_pfn = zone->zone_start_pfn + + zone->spanned_pages; + zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); +#endif #ifdef CONFIG_NUMA zone->node = nid; zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) @@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); lruvec_init(&zone->lruvec, zone); - zap_zone_vm_stats(zone); - zone->flags = 0; if (!size) continue; @@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, { pg_data_t *pgdat = NODE_DATA(nid); + /* pg_data_t should be reset to zero when it's allocated */ + WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); + pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; calculate_node_totalpages(pgdat, zones_size, zholes_size); @@ -4750,7 +4794,7 @@ out: } /* Any regular memory on that node ? */ -static void check_for_regular_memory(pg_data_t *pgdat) +static void __init check_for_regular_memory(pg_data_t *pgdat) { #ifdef CONFIG_HIGHMEM enum zone_type zone_type; @@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, } /* - * This is designed as sub function...plz see page_isolation.c also. - * set/clear page block's type to be ISOLATE. - * page allocater never alloc memory from ISOLATE block. + * This function checks whether pageblock includes unmovable pages or not. + * If @count is not zero, it is okay to include less @count unmovable pages + * + * PageLRU check wihtout isolation or lru_lock could race so that + * MIGRATE_MOVABLE block might include unmovable pages. It means you can't + * expect this function should be exact. */ - -static int -__count_immobile_pages(struct zone *zone, struct page *page, int count) +bool has_unmovable_pages(struct zone *zone, struct page *page, int count) { unsigned long pfn, iter, found; int mt; /* * For avoiding noise data, lru_add_drain_all() should be called - * If ZONE_MOVABLE, the zone never contains immobile pages + * If ZONE_MOVABLE, the zone never contains unmovable pages */ if (zone_idx(zone) == ZONE_MOVABLE) - return true; + return false; mt = get_pageblock_migratetype(page); if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) - return true; + return false; pfn = page_to_pfn(page); for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { @@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) continue; page = pfn_to_page(check); - if (!page_count(page)) { + /* + * We can't use page_count without pin a page + * because another CPU can free compound page. + * This check already skips compound tails of THP + * because their page->_count is zero at all time. + */ + if (!atomic_read(&page->_count)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; } + if (!PageLRU(page)) found++; /* @@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) * page at boot. */ if (found > count) - return false; + return true; } - return true; + return false; } bool is_pageblock_removable_nolock(struct page *page) @@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page) zone->zone_start_pfn + zone->spanned_pages <= pfn) return false; - return __count_immobile_pages(zone, page, 0); -} - -int set_migratetype_isolate(struct page *page) -{ - struct zone *zone; - unsigned long flags, pfn; - struct memory_isolate_notify arg; - int notifier_ret; - int ret = -EBUSY; - - zone = page_zone(page); - - spin_lock_irqsave(&zone->lock, flags); - - pfn = page_to_pfn(page); - arg.start_pfn = pfn; - arg.nr_pages = pageblock_nr_pages; - arg.pages_found = 0; - - /* - * It may be possible to isolate a pageblock even if the - * migratetype is not MIGRATE_MOVABLE. The memory isolation - * notifier chain is used by balloon drivers to return the - * number of pages in a range that are held by the balloon - * driver to shrink memory. If all the pages are accounted for - * by balloons, are free, or on the LRU, isolation can continue. - * Later, for example, when memory hotplug notifier runs, these - * pages reported as "can be isolated" should be isolated(freed) - * by the balloon driver through the memory notifier chain. - */ - notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); - notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret) - goto out; - /* - * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. - * We just check MOVABLE pages. - */ - if (__count_immobile_pages(zone, page, arg.pages_found)) - ret = 0; - - /* - * immobile means "not-on-lru" paes. If immobile is larger than - * removable-by-driver pages reported by notifier, we'll fail. - */ - -out: - if (!ret) { - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - move_freepages_block(zone, page, MIGRATE_ISOLATE); - } - - spin_unlock_irqrestore(&zone->lock, flags); - if (!ret) - drain_all_pages(); - return ret; -} - -void unset_migratetype_isolate(struct page *page, unsigned migratetype) -{ - struct zone *zone; - unsigned long flags; - zone = page_zone(page); - spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) - goto out; - set_pageblock_migratetype(page, migratetype); - move_freepages_block(zone, page, migratetype); -out: - spin_unlock_irqrestore(&zone->lock, flags); + return !has_unmovable_pages(zone, page, 0); } #ifdef CONFIG_CMA @@ -5635,7 +5617,12 @@ static struct page * __alloc_contig_migrate_alloc(struct page *page, unsigned long private, int **resultp) { - return alloc_page(GFP_HIGHUSER_MOVABLE); + gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + + if (PageHighMem(page)) + gfp_mask |= __GFP_HIGHMEM; + + return alloc_page(gfp_mask); } /* [start, end) must belong to a single zone. */ @@ -5864,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif +#ifdef CONFIG_MEMORY_HOTPLUG +static int __meminit __zone_pcp_update(void *data) +{ + struct zone *zone = data; + int cpu; + unsigned long batch = zone_batchsize(zone), flags; + + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + pset = per_cpu_ptr(zone->pageset, cpu); + pcp = &pset->pcp; + + local_irq_save(flags); + if (pcp->count > 0) + free_pcppages_bulk(zone, pcp->count, pcp); + setup_pageset(pset, batch); + local_irq_restore(flags); + } + return 0; +} + +void __meminit zone_pcp_update(struct zone *zone) +{ + stop_machine(__zone_pcp_update, zone, NULL); +} +#endif + #ifdef CONFIG_MEMORY_HOTREMOVE +void zone_pcp_reset(struct zone *zone) +{ + unsigned long flags; + + /* avoid races with drain_pages() */ + local_irq_save(flags); + if (zone->pageset != &boot_pageset) { + free_percpu(zone->pageset); + zone->pageset = &boot_pageset; + } + local_irq_restore(flags); +} + /* * All pages in the range must be isolated before calling this. */ |