Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 205
1 file changed, 119 insertions, 86 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d666df5ef95..63358d9f9aa9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vmstat.h>
 #include <linux/mempolicy.h>
+#include <linux/memremap.h>
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
@@ -114,13 +115,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
-/*
- * When calculating the number of globally allowed dirty pages, there
- * is a certain number of per-zone reserves that should not be
- * considered dirtyable memory. This is the sum of those reserves
- * over all existing zones that contribute dirtyable memory.
- */
-unsigned long dirty_balance_reserve __read_mostly;
 
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -229,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
-static void free_compound_page(struct page *page);
 compound_page_dtor * const compound_page_dtors[] = {
 	NULL,
 	free_compound_page,
 #ifdef CONFIG_HUGETLB_PAGE
 	free_huge_page,
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	free_transhuge_page,
+#endif
 };
 
 int min_free_kbytes = 1024;
@@ -457,7 +453,7 @@ out:
  * This usage means that zero-order pages may not be compound.
  */
 
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
 {
 	__free_pages_ok(page, compound_order(page));
 }
@@ -473,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order)
 	for (i = 1; i < nr_pages; i++) {
 		struct page *p = page + i;
 		set_page_count(p, 0);
+		p->mapping = TAIL_MAPPING;
 		set_compound_head(p, page);
 	}
+	atomic_set(compound_mapcount_ptr(page), -1);
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -739,7 +737,7 @@ static inline int free_pages_check(struct page *page)
 	const char *bad_reason = NULL;
 	unsigned long bad_flags = 0;
 
-	if (unlikely(page_mapcount(page)))
+	if (unlikely(atomic_read(&page->_mapcount) != -1))
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
@@ -812,7 +810,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 		do {
 			int mt;	/* migratetype of the to-be-freed page */
 
-			page = list_entry(list->prev, struct page, lru);
+			page = list_last_entry(list, struct page, lru);
 			/* must delete as __free_one_page list manipulates */
 			list_del(&page->lru);
 
@@ -863,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
 		ret = 0;
 		goto out;
 	}
+	switch (page - head_page) {
+	case 1:
+		/* the first tail page: ->mapping is compound_mapcount() */
+		if (unlikely(compound_mapcount(page))) {
+			bad_page(page, "nonzero compound_mapcount", 0);
+			goto out;
+		}
+		break;
+	case 2:
+		/*
+		 * the second tail page: ->mapping is
+		 * page_deferred_list().next -- ignore value.
+		 */
+		break;
+	default:
+		if (page->mapping != TAIL_MAPPING) {
+			bad_page(page, "corrupted mapping in tail page", 0);
+			goto out;
+		}
+		break;
+	}
 	if (unlikely(!PageTail(page))) {
 		bad_page(page, "PageTail not set", 0);
 		goto out;
@@ -873,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
 	}
 	ret = 0;
 out:
+	page->mapping = NULL;
 	clear_compound_head(page);
 	return ret;
 }
@@ -1336,7 +1356,7 @@ static inline int check_new_page(struct page *page)
 	const char *bad_reason = NULL;
 	unsigned long bad_flags = 0;
 
-	if (unlikely(page_mapcount(page)))
+	if (unlikely(atomic_read(&page->_mapcount) != -1))
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
@@ -1417,11 +1437,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 	/* Find a page of the appropriate size in the preferred list */
 	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
 		area = &(zone->free_area[current_order]);
-		if (list_empty(&area->free_list[migratetype]))
-			continue;
-
-		page = list_entry(area->free_list[migratetype].next,
+		page = list_first_entry_or_null(&area->free_list[migratetype],
 							struct page, lru);
+		if (!page)
+			continue;
 		list_del(&page->lru);
 		rmv_page_order(page);
 		area->nr_free--;
@@ -1700,12 +1719,12 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
 		for (order = 0; order < MAX_ORDER; order++) {
 			struct free_area *area = &(zone->free_area[order]);
 
-			if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+			page = list_first_entry_or_null(
+					&area->free_list[MIGRATE_HIGHATOMIC],
+					struct page, lru);
+			if (!page)
 				continue;
 
-			page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
-						struct page, lru);
-
 			/*
 			 * It should never happen but changes to locking could
 			 * inadvertently allow a per-cpu drain to add pages
@@ -1753,7 +1772,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
 		if (fallback_mt == -1)
 			continue;
 
-		page = list_entry(area->free_list[fallback_mt].next,
+		page = list_first_entry(&area->free_list[fallback_mt],
 						struct page, lru);
 		if (can_steal)
 			steal_suitable_fallback(zone, page, start_migratetype);
@@ -1788,7 +1807,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
  * Call me with the zone->lock already held.
 */
 static struct page *__rmqueue(struct zone *zone, unsigned int order,
-				int migratetype, gfp_t gfp_flags)
+				int migratetype)
 {
 	struct page *page;
 
@@ -1818,7 +1837,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		struct page *page = __rmqueue(zone, order, migratetype, 0);
+		struct page *page = __rmqueue(zone, order, migratetype);
 		if (unlikely(page == NULL))
 			break;
 
@@ -1988,7 +2007,7 @@ void mark_free_pages(struct zone *zone)
 	unsigned long pfn, max_zone_pfn;
 	unsigned long flags;
 	unsigned int order, t;
-	struct list_head *curr;
+	struct page *page;
 
 	if (zone_is_empty(zone))
 		return;
@@ -1998,17 +2017,17 @@ void mark_free_pages(struct zone *zone)
 	max_zone_pfn = zone_end_pfn(zone);
 	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-
+			page = pfn_to_page(pfn);
 			if (!swsusp_page_is_forbidden(page))
 				swsusp_unset_page_free(page);
 		}
 
 	for_each_migratetype_order(order, t) {
-		list_for_each(curr, &zone->free_area[order].free_list[t]) {
+		list_for_each_entry(page,
+				&zone->free_area[order].free_list[t], lru) {
 			unsigned long i;
 
-			pfn = page_to_pfn(list_entry(curr, struct page, lru));
+			pfn = page_to_pfn(page);
 			for (i = 0; i < (1UL << order); i++)
 				swsusp_set_page_free(pfn_to_page(pfn + i));
 		}
@@ -2212,9 +2231,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		}
 
 		if (cold)
-			page = list_entry(list->prev, struct page, lru);
+			page = list_last_entry(list, struct page, lru);
 		else
-			page = list_entry(list->next, struct page, lru);
+			page = list_first_entry(list, struct page, lru);
 
 		list_del(&page->lru);
 		pcp->count--;
@@ -2241,7 +2260,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 			trace_mm_page_alloc_zone_locked(page, order, migratetype);
 		}
 		if (!page)
-			page = __rmqueue(zone, order, migratetype, gfp_flags);
+			page = __rmqueue(zone, order, migratetype);
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
@@ -2740,8 +2759,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 		goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
 		*did_some_progress = 1;
+
+		if (gfp_mask & __GFP_NOFAIL) {
+			page = get_page_from_freelist(gfp_mask, order,
+					ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
+			/*
+			 * fallback to ignore cpuset restriction if our nodes
+			 * are depleted
+			 */
+			if (!page)
+				page = get_page_from_freelist(gfp_mask, order,
+					ALLOC_NO_WATERMARKS, ac);
+		}
+	}
 out:
 	mutex_unlock(&oom_lock);
 	return page;
@@ -2876,28 +2908,6 @@ retry:
 	return page;
 }
 
-/*
- * This is called in the allocator slow-path if the allocation request is of
- * sufficient urgency to ignore watermarks and take other desperate measures
- */
-static inline struct page *
-__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
-				const struct alloc_context *ac)
-{
-	struct page *page;
-
-	do {
-		page = get_page_from_freelist(gfp_mask, order,
-						ALLOC_NO_WATERMARKS, ac);
-
-		if (!page && gfp_mask & __GFP_NOFAIL)
-			wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
-									HZ/50);
-	} while (!page && (gfp_mask & __GFP_NOFAIL));
-
-	return page;
-}
-
 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 {
 	struct zoneref *z;
@@ -3042,28 +3052,36 @@ retry:
 		 * allocations are system rather than user orientated
 		 */
 		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
-
-		page = __alloc_pages_high_priority(gfp_mask, order, ac);
-
-		if (page) {
+		page = get_page_from_freelist(gfp_mask, order,
+						ALLOC_NO_WATERMARKS, ac);
+		if (page)
 			goto got_pg;
-		}
 	}
 
 	/* Caller is not willing to reclaim, we can't balance anything */
 	if (!can_direct_reclaim) {
 		/*
-		 * All existing users of the deprecated __GFP_NOFAIL are
-		 * blockable, so warn of any new users that actually allow this
-		 * type of allocation to fail.
+		 * All existing users of the __GFP_NOFAIL are blockable, so warn
+		 * of any new users that actually allow this type of allocation
+		 * to fail.
 		 */
 		WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
 		goto nopage;
 	}
 
 	/* Avoid recursion of direct reclaim */
-	if (current->flags & PF_MEMALLOC)
+	if (current->flags & PF_MEMALLOC) {
+		/*
+		 * __GFP_NOFAIL request from this context is rather bizarre
+		 * because we cannot reclaim anything and only can loop waiting
+		 * for somebody to do a work for us.
+		 */
+		if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+			cond_resched();
+			goto retry;
+		}
 		goto nopage;
+	}
 
 	/* Avoid allocations with no watermarks from looping endlessly */
 	if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
@@ -3402,7 +3420,8 @@ EXPORT_SYMBOL(__free_page_frag);
 
 /*
  * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup.
+ * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
+ * equivalent to alloc_pages.
  *
  * It should be used when the caller would like to use kmalloc, but since the
  * allocation is large, it has to fall back to the page allocator.
@@ -4147,8 +4166,7 @@ static void set_zonelist_order(void)
 
 static void build_zonelists(pg_data_t *pgdat)
 {
-	int j, node, load;
-	enum zone_type i;
+	int i, node, load;
 	nodemask_t used_mask;
 	int local_node, prev_node;
 	struct zonelist *zonelist;
@@ -4168,7 +4186,7 @@ static void build_zonelists(pg_data_t *pgdat)
 	nodes_clear(used_mask);
 
 	memset(node_order, 0, sizeof(node_order));
-	j = 0;
+	i = 0;
 
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		/*
@@ -4185,12 +4203,12 @@ static void build_zonelists(pg_data_t *pgdat)
 		if (order == ZONELIST_ORDER_NODE)
 			build_zonelists_in_node_order(pgdat, node);
 		else
-			node_order[j++] = node;	/* remember order */
+			node_order[i++] = node;	/* remember order */
 	}
 
 	if (order == ZONELIST_ORDER_ZONE) {
 		/* calculate node order -- i.e., DMA last! */
-		build_zonelists_in_zone_order(pgdat, j);
+		build_zonelists_in_zone_order(pgdat, i);
 	}
 
 	build_thisnode_zonelists(pgdat);
@@ -4468,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size)
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		unsigned long start_pfn, enum memmap_context context)
 {
-	pg_data_t *pgdat = NODE_DATA(nid);
+	struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
 	unsigned long end_pfn = start_pfn + size;
+	pg_data_t *pgdat = NODE_DATA(nid);
 	unsigned long pfn;
-	struct zone *z;
 	unsigned long nr_initialised = 0;
 
 	if (highest_memmap_pfn < end_pfn - 1)
 		highest_memmap_pfn = end_pfn - 1;
 
-	z = &pgdat->node_zones[zone];
+	/*
+	 * Honor reservation requested by the driver for this ZONE_DEVICE
+	 * memory
+	 */
+	if (altmap && start_pfn == altmap->base_pfn)
+		start_pfn += altmap->reserve;
+
 	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		/*
 		 * There can be holes in boot-time mem_map[]s
@@ -5956,20 +5980,12 @@ static void calculate_totalreserve_pages(void)
 
 			if (max > zone->managed_pages)
 				max = zone->managed_pages;
+
+			zone->totalreserve_pages = max;
+
 			reserve_pages += max;
-			/*
-			 * Lowmem reserves are not available to
-			 * GFP_HIGHUSER page cache allocations and
-			 * kswapd tries to balance zones to their high
-			 * watermark. As a result, neither should be
-			 * regarded as dirtyable memory, to prevent a
-			 * situation where reclaim has to clean pages
-			 * in order to balance the zones.
-			 */
-			zone->dirty_balance_reserve = max;
 		}
 	}
-	dirty_balance_reserve = reserve_pages;
 	totalreserve_pages = reserve_pages;
 }
@@ -6724,8 +6740,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	if (ret)
 		return ret;
 
+	/*
+	 * In case of -EBUSY, we'd like to know which page causes problem.
+	 * So, just fall through. We will check it in test_pages_isolated().
+	 */
 	ret = __alloc_contig_migrate_range(&cc, start, end);
-	if (ret)
+	if (ret && ret != -EBUSY)
 		goto done;
 
 	/*
@@ -6752,12 +6772,25 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	outer_start = start;
 	while (!PageBuddy(pfn_to_page(outer_start))) {
 		if (++order >= MAX_ORDER) {
-			ret = -EBUSY;
-			goto done;
+			outer_start = start;
+			break;
 		}
 		outer_start &= ~0UL << order;
 	}
 
+	if (outer_start != start) {
+		order = page_order(pfn_to_page(outer_start));
+
+		/*
+		 * outer_start page could be small order buddy page and
+		 * it doesn't include start page. Adjust outer_start
+		 * in this case to report failed page properly
+		 * on tracepoint in test_pages_isolated()
+		 */
+		if (outer_start + (1UL << order) <= start)
+			outer_start = start;
+	}
+
 	/* Make sure the range is really isolated. */
 	if (test_pages_isolated(outer_start, end, false)) {
 		pr_info("%s: [%lx, %lx) PFNs busy\n",