diff options
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 275 |
1 files changed, 159 insertions, 116 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ebffa0e4a9c0..5e6fa06f2784 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -380,20 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -static inline void prep_zero_page(struct page *page, unsigned int order, - gfp_t gfp_flags) -{ - int i; - - /* - * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO - * and __GFP_HIGHMEM from hard or soft interrupt context. - */ - VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); - for (i = 0; i < (1 << order); i++) - clear_highpage(page + i); -} - #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; bool _debug_pagealloc_enabled __read_mostly; @@ -975,7 +961,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, kasan_alloc_pages(page, order); if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -2322,48 +2309,6 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) show_mem(filter); } -static inline int -should_alloc_retry(gfp_t gfp_mask, unsigned int order, - unsigned long did_some_progress, - unsigned long pages_reclaimed) -{ - /* Do not loop if specifically requested */ - if (gfp_mask & __GFP_NORETRY) - return 0; - - /* Always retry if specifically requested */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - - /* - * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim - * making forward progress without invoking OOM. Suspend also disables - * storage devices so kswapd will not help. Bail if we are suspending. - */ - if (!did_some_progress && pm_suspended_storage()) - return 0; - - /* - * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER - * means __GFP_NOFAIL, but that may not be true in other - * implementations. - */ - if (order <= PAGE_ALLOC_COSTLY_ORDER) - return 1; - - /* - * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is - * specified, then we retry until we no longer reclaim any pages - * (above), or we've reclaimed an order of pages at least as - * large as the allocation's order. In both cases, if the - * allocation still fails, we stop retrying. - */ - if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) - return 1; - - return 0; -} - static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac, unsigned long *did_some_progress) @@ -2373,10 +2318,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, *did_some_progress = 0; /* - * Acquire the per-zone oom lock for each zone. If that - * fails, somebody else is making progress for us. + * Acquire the oom lock. If that fails, somebody else is + * making progress for us. */ - if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) { + if (!mutex_trylock(&oom_lock)) { *did_some_progress = 1; schedule_timeout_uninterruptible(1); return NULL; @@ -2402,16 +2347,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for light reclaim */ + /* The OOM killer does not compensate for IO-less reclaim */ if (!(gfp_mask & __GFP_FS)) { /* * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but - * keep looping as per should_alloc_retry(). + * keep looping as per tradition. */ *did_some_progress = 1; goto out; } + if (pm_suspended_storage()) + goto out; /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; @@ -2421,7 +2368,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: - oom_zonelist_unlock(ac->zonelist, gfp_mask); + mutex_unlock(&oom_lock); return page; } @@ -2794,40 +2741,40 @@ retry: if (page) goto got_pg; - /* Check if we should retry the allocation */ + /* Do not loop if specifically requested */ + if (gfp_mask & __GFP_NORETRY) + goto noretry; + + /* Keep reclaiming pages as long as there is reasonable progress */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, did_some_progress, - pages_reclaimed)) { - /* - * If we fail to make progress by freeing individual - * pages, but the allocation wants us to keep going, - * start OOM killing tasks. - */ - if (!did_some_progress) { - page = __alloc_pages_may_oom(gfp_mask, order, ac, - &did_some_progress); - if (page) - goto got_pg; - if (!did_some_progress) - goto nopage; - } + if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || + ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); goto retry; - } else { - /* - * High-order allocations do not necessarily loop after - * direct reclaim and reclaim/compaction depends on compaction - * being called after reclaim so call directly if necessary - */ - page = __alloc_pages_direct_compact(gfp_mask, order, - alloc_flags, ac, migration_mode, - &contended_compaction, - &deferred_compaction); - if (page) - goto got_pg; } + /* Reclaim has failed us, start killing things */ + page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); + if (page) + goto got_pg; + + /* Retry as long as the OOM killer is making progress */ + if (did_some_progress) + goto retry; + +noretry: + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, + ac, migration_mode, + &contended_compaction, + &deferred_compaction); + if (page) + goto got_pg; nopage: warn_alloc_failed(gfp_mask, order, NULL); got_pg: @@ -2967,6 +2914,104 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* + * Page Fragment: + * An arbitrary-length arbitrary-offset area of memory which resides + * within a 0 or higher order page. Multiple fragments within that page + * are individually refcounted, in the page's reference counter. + * + * The page_frag functions below provide a simple allocation framework for + * page fragments. This is used by the network stack and network device + * drivers to provide a backing region of memory for use as either an + * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. + */ +static struct page *__page_frag_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) +{ + struct page *page = NULL; + gfp_t gfp = gfp_mask; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY | + __GFP_NOMEMALLOC; + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, + PAGE_FRAG_CACHE_MAX_ORDER); + nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE; +#endif + if (unlikely(!page)) + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + + nc->va = page ? page_address(page) : NULL; + + return page; +} + +void *__alloc_page_frag(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) +{ + unsigned int size = PAGE_SIZE; + struct page *page; + int offset; + + if (unlikely(!nc->va)) { +refill: + page = __page_frag_refill(nc, gfp_mask); + if (!page) + return NULL; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* Even if we own the page, we do not use atomic_set(). + * This would break get_page_unless_zero() users. + */ + atomic_add(size - 1, &page->_count); + + /* reset page count bias and offset to start of new frag */ + nc->pfmemalloc = page->pfmemalloc; + nc->pagecnt_bias = size; + nc->offset = size; + } + + offset = nc->offset - fragsz; + if (unlikely(offset < 0)) { + page = virt_to_page(nc->va); + + if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + goto refill; + +#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) + /* if size can vary use size else just use PAGE_SIZE */ + size = nc->size; +#endif + /* OK, page count is 0, we can safely set it */ + atomic_set(&page->_count, size); + + /* reset page count bias and offset to start of new frag */ + nc->pagecnt_bias = size; + offset = size - fragsz; + } + + nc->pagecnt_bias--; + nc->offset = offset; + + return nc->va + offset; +} +EXPORT_SYMBOL(__alloc_page_frag); + +/* + * Frees a page fragment allocated out of either a compound or order 0 page. + */ +void __free_page_frag(void *addr) +{ + struct page *page = virt_to_head_page(addr); + + if (unlikely(put_page_testzero(page))) + __free_pages_ok(page, compound_order(page)); +} +EXPORT_SYMBOL(__free_page_frag); + +/* * alloc_kmem_pages charges newly allocated pages to the kmem resource counter * of the current memory cgroup. * @@ -4769,22 +4814,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { - unsigned long realtotalpages, totalpages = 0; + unsigned long realtotalpages = 0, totalpages = 0; enum zone_type i; - for (i = 0; i < MAX_NR_ZONES; i++) - totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - node_start_pfn, - node_end_pfn, - zones_size); - pgdat->node_spanned_pages = totalpages; - - realtotalpages = totalpages; - for (i = 0; i < MAX_NR_ZONES; i++) - realtotalpages -= - zone_absent_pages_in_node(pgdat->node_id, i, + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long size, real_size; + + size = zone_spanned_pages_in_node(pgdat->node_id, i, + node_start_pfn, + node_end_pfn, + zones_size); + real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + zone->spanned_pages = size; + zone->present_pages = real_size; + + totalpages += size; + realtotalpages += real_size; + } + + pgdat->node_spanned_pages = totalpages; pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4894,8 +4945,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, - unsigned long node_start_pfn, unsigned long node_end_pfn, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, unsigned long node_end_pfn) { enum zone_type j; int nid = pgdat->node_id; @@ -4916,12 +4966,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, node_start_pfn, - node_end_pfn, zones_size); - realsize = freesize = size - zone_absent_pages_in_node(nid, j, - node_start_pfn, - node_end_pfn, - zholes_size); + size = zone->spanned_pages; + realsize = freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory @@ -4956,8 +5002,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, nr_kernel_pages -= memmap_pages; nr_all_pages += freesize; - zone->spanned_pages = size; - zone->present_pages = realsize; /* * Set an approximate value for lowmem here, it will be adjusted * when the bootmem allocator frees pages into the buddy system. @@ -5063,8 +5107,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, start_pfn, end_pfn, - zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -6013,9 +6056,9 @@ out: return ret; } +#ifdef CONFIG_NUMA int hashdist = HASHDIST_DEFAULT; -#ifdef CONFIG_NUMA static int __init set_hashdist(char *str) { if (!str) |