diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 190 |
1 files changed, 122 insertions, 68 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef19f22b2b7d..446bb36ee59d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -18,7 +18,6 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/interrupt.h> -#include <linux/rwsem.h> #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> @@ -126,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +/* + * A cached value of the page's pageblock's migratetype, used when the page is + * put on a pcplist. Used to avoid the pageblock migratetype lookup when + * freeing from pcplists in most cases, at the cost of possibly becoming stale. + * Also the migratetype set in the page does not necessarily match the pcplist + * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any + * other index - this ensures that it will be put on the correct CMA freelist. + */ +static inline int get_pcppage_migratetype(struct page *page) +{ + return page->index; +} + +static inline void set_pcppage_migratetype(struct page *page, int migratetype) +{ + page->index = migratetype; +} + #ifdef CONFIG_PM_SLEEP /* * The following functions are used by the suspend/hibernate code to temporarily @@ -207,6 +224,9 @@ static char * const zone_names[MAX_NR_ZONES] = { "HighMem", #endif "Movable", +#ifdef CONFIG_ZONE_DEVICE + "Device", +#endif }; int min_free_kbytes = 1024; @@ -789,7 +809,11 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - mt = get_freepage_migratetype(page); + + mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ if (unlikely(has_isolate_pageblock(zone))) mt = get_pageblock_migratetype(page); @@ -953,7 +977,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -981,21 +1004,21 @@ static void __init __free_pages_boot_core(struct page *page, #if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \ defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) -/* Only safe to use early in boot when initialisation is single-threaded */ + static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; int __meminit early_pfn_to_nid(unsigned long pfn) { + static DEFINE_SPINLOCK(early_pfn_lock); int nid; - /* The system will behave unpredictably otherwise */ - BUG_ON(system_state != SYSTEM_BOOTING); - + spin_lock(&early_pfn_lock); nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); - if (nid >= 0) - return nid; - /* just returns 0 */ - return 0; + if (nid < 0) + nid = 0; + spin_unlock(&early_pfn_lock); + + return nid; } #endif @@ -1060,7 +1083,15 @@ static void __init deferred_free_range(struct page *page, __free_pages_boot_core(page, pfn, 0); } -static __initdata DECLARE_RWSEM(pgdat_init_rwsem); +/* Completion tracking for deferred_init_memmap() threads */ +static atomic_t pgdat_init_n_undone __initdata; +static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); + +static inline void __init pgdat_init_report_one_done(void) +{ + if (atomic_dec_and_test(&pgdat_init_n_undone)) + complete(&pgdat_init_all_done_comp); +} /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) @@ -1077,7 +1108,7 @@ static int __init deferred_init_memmap(void *data) const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (first_init_pfn == ULONG_MAX) { - up_read(&pgdat_init_rwsem); + pgdat_init_report_one_done(); return 0; } @@ -1177,7 +1208,8 @@ free_range: pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, jiffies_to_msecs(jiffies - start)); - up_read(&pgdat_init_rwsem); + + pgdat_init_report_one_done(); return 0; } @@ -1185,14 +1217,17 @@ void __init page_alloc_init_late(void) { int nid; + /* There will be num_node_state(N_MEMORY) threads */ + atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); for_each_node_state(nid, N_MEMORY) { - down_read(&pgdat_init_rwsem); kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); } /* Block until all are initialised */ - down_write(&pgdat_init_rwsem); - up_write(&pgdat_init_rwsem); + wait_for_completion(&pgdat_init_all_done_comp); + + /* Reinit limits that are based on free pages after the kernel is up */ + files_maxfiles_init(); } #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ @@ -1285,6 +1320,10 @@ static inline int check_new_page(struct page *page) bad_reason = "non-NULL mapping"; if (unlikely(atomic_read(&page->_count) != 0)) bad_reason = "nonzero _count"; + if (unlikely(page->flags & __PG_HWPOISON)) { + bad_reason = "HWPoisoned (hardware-corrupted)"; + bad_flags = __PG_HWPOISON; + } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; bad_flags = PAGE_FLAGS_CHECK_AT_PREP; @@ -1328,12 +1367,15 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_owner(page, order, gfp_flags); /* - * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to + * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to * allocate the page. The expectation is that the caller is taking * steps that will free more memory. The caller should avoid the page * being used for !PFMEMALLOC purposes. */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + if (alloc_flags & ALLOC_NO_WATERMARKS) + set_page_pfmemalloc(page); + else + clear_page_pfmemalloc(page); return 0; } @@ -1362,7 +1404,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); return page; } @@ -1439,7 +1481,6 @@ int move_freepages(struct zone *zone, order = page_order(page); list_move(&page->lru, &zone->free_area[order].free_list[migratetype]); - set_freepage_migratetype(page, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -1609,14 +1650,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) expand(zone, page, order, current_order, area, start_migratetype); /* - * The freepage_migratetype may differ from pageblock's + * The pcppage_migratetype may differ from pageblock's * migratetype depending on the decisions in - * try_to_steal_freepages(). This is OK as long as it - * does not differ for MIGRATE_CMA pageblocks. For CMA - * we need to make sure unallocated pages flushed from - * pcp lists are returned to the correct freelist. + * find_suitable_fallback(). This is OK as long as it does not + * differ for MIGRATE_CMA pageblocks. Those can be used as + * fallback only via special __rmqueue_cma_fallback() function */ - set_freepage_migratetype(page, start_migratetype); + set_pcppage_migratetype(page, start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -1692,7 +1732,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; - if (is_migrate_cma(get_freepage_migratetype(page))) + if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1889,7 +1929,7 @@ void free_hot_cold_page(struct page *page, bool cold) return; migratetype = get_pfnblock_migratetype(page, pfn); - set_freepage_migratetype(page, migratetype); + set_pcppage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -2094,7 +2134,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_freepage_migratetype(page)); + get_pcppage_migratetype(page)); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); @@ -2119,13 +2159,13 @@ failed: static struct { struct fault_attr attr; - u32 ignore_gfp_highmem; - u32 ignore_gfp_wait; + bool ignore_gfp_highmem; + bool ignore_gfp_wait; u32 min_order; } fail_page_alloc = { .attr = FAULT_ATTR_INITIALIZER, - .ignore_gfp_wait = 1, - .ignore_gfp_highmem = 1, + .ignore_gfp_wait = true, + .ignore_gfp_highmem = true, .min_order = 1, }; @@ -2675,6 +2715,12 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) { + struct oom_control oc = { + .zonelist = ac->zonelist, + .nodemask = ac->nodemask, + .gfp_mask = gfp_mask, + .order = order, + }; struct page *page; *did_some_progress = 0; @@ -2726,8 +2772,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) - || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) + if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: mutex_unlock(&oom_lock); @@ -3330,7 +3375,7 @@ refill: atomic_add(size - 1, &page->_count); /* reset page count bias and offset to start of new frag */ - nc->pfmemalloc = page->pfmemalloc; + nc->pfmemalloc = page_is_pfmemalloc(page); nc->pagecnt_bias = size; nc->offset = size; } @@ -3383,24 +3428,24 @@ EXPORT_SYMBOL(__free_page_frag); struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages(gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { struct page *page; - struct mem_cgroup *memcg = NULL; - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; page = alloc_pages_node(nid, gfp_mask, order); - memcg_kmem_commit_charge(page, memcg, order); + if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) { + __free_pages(page, order); + page = NULL; + } return page; } @@ -3410,7 +3455,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) */ void __free_kmem_pages(struct page *page, unsigned int order) { - memcg_kmem_uncharge_pages(page, order); + memcg_kmem_uncharge(page, order); __free_pages(page, order); } @@ -3469,8 +3514,6 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. - * Note this is not alloc_pages_exact_node() which allocates on a specific node, - * but is not exact. */ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { @@ -4857,8 +4900,7 @@ static __meminit void zone_pcp_init(struct zone *zone) int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, - unsigned long size, - enum memmap_context context) + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; int ret; @@ -5045,6 +5087,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, { unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node from cpu_up(), the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; @@ -5108,6 +5154,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + /* When hotadd a new node from cpu_up(), the node should be empty */ + if (!node_start_pfn && !node_end_pfn) + return 0; + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -5277,8 +5327,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * * NOTE: pgdat should get zeroed by caller. */ -static void __paginginit free_area_init_core(struct pglist_data *pgdat, - unsigned long node_start_pfn, unsigned long node_end_pfn) +static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; @@ -5363,8 +5412,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, set_pageblock_order(); setup_usemap(pgdat, zone, zone_start_pfn, size); - ret = init_currently_empty_zone(zone, zone_start_pfn, - size, MEMMAP_EARLY); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); zone_start_pfn += size; @@ -5373,6 +5421,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) { + unsigned long __maybe_unused offset = 0; + /* Skip empty nodes */ if (!pgdat->node_spanned_pages) return; @@ -5389,6 +5439,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) * for the buddy allocator to function correctly. */ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + offset = pgdat->node_start_pfn - start; end = pgdat_end_pfn(pgdat); end = ALIGN(end, MAX_ORDER_NR_PAGES); size = (end - start) * sizeof(struct page); @@ -5396,7 +5447,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) if (!map) map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id); - pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); + pgdat->node_mem_map = map + offset; } #ifndef CONFIG_NEED_MULTIPLE_NODES /* @@ -5404,9 +5455,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM) if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); + mem_map -= offset; #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif @@ -5429,7 +5480,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, - (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); + (u64)start_pfn << PAGE_SHIFT, + end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5441,7 +5493,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, start_pfn, end_pfn); + free_area_init_core(pgdat); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5452,11 +5504,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, */ void __init setup_nr_node_ids(void) { - unsigned int node; - unsigned int highest = 0; + unsigned int highest; - for_each_node_mask(node, node_possible_map) - highest = node; + highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES); nr_node_ids = highest + 1; } #endif @@ -5619,13 +5669,17 @@ static void __init find_zone_movable_pfns_for_nodes(void) */ required_movablecore = roundup(required_movablecore, MAX_ORDER_NR_PAGES); + required_movablecore = min(totalpages, required_movablecore); corepages = totalpages - required_movablecore; required_kernelcore = max(required_kernelcore, corepages); } - /* If kernelcore was not specified, there is no ZONE_MOVABLE */ - if (!required_kernelcore) + /* + * If kernelcore was not specified or kernelcore size is larger + * than totalpages, there is no ZONE_MOVABLE. + */ + if (!required_kernelcore || required_kernelcore >= totalpages) goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ @@ -5977,7 +6031,7 @@ void __init mem_init_print_info(const char *str) * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved * - * The per-cpu batchsize and zone watermarks are determined by present_pages. + * The per-cpu batchsize and zone watermarks are determined by managed_pages. * In the DMA zone, a significant percentage may be consumed by kernel image * and other unfreeable allocations which can skew the watermarks badly. This * function may optionally be used to account for unfreeable pages in the @@ -6030,7 +6084,7 @@ void __init page_alloc_init(void) } /* - * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio * or min_free_kbytes changes. */ static void calculate_totalreserve_pages(void) @@ -6074,7 +6128,7 @@ static void calculate_totalreserve_pages(void) /* * setup_per_zone_lowmem_reserve - called whenever - * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone + * sysctl_lowmem_reserve_ratio changes. Ensures that each zone * has a correct pages reserved value, so an adequate number of * pages are left in the zone after a successful __alloc_pages(). */ |