Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 570
1 file changed, 238 insertions, 332 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef44ad736ca1..616a2c956b4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -53,8 +53,6 @@ #include <linux/kmemleak.h> #include <linux/compaction.h> #include <trace/events/kmem.h> -#include <linux/ftrace_event.h> -#include <linux/memcontrol.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> #include <linux/migrate.h> @@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node); */ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ EXPORT_PER_CPU_SYMBOL(_numa_mem_); +int _node_numa_mem_[MAX_NUMNODES]; #endif /* @@ -468,29 +467,6 @@ static inline void rmv_page_order(struct page *page) } /* - * Locate the struct page for both the matching buddy in our - * pair (buddy1) and the combined O(n+1) page they form (page). - * - * 1) Any buddy B1 will have an order O twin B2 which satisfies - * the following equation: - * B2 = B1 ^ (1 << O) - * For example, if the starting buddy (buddy2) is #8 its order - * 1 buddy is #10: - * B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10 - * - * 2) Any buddy B will have an order O+1 parent P which - * satisfies the following equation: - * P = B & ~(1 << O) - * - * Assumption: *_mem_map is contiguous at least up to MAX_ORDER - */ -static inline unsigned long -__find_buddy_index(unsigned long page_idx, unsigned int order) -{ - return page_idx ^ (1 << order); -} - -/* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && @@ -570,6 +546,7 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; + int max_order = MAX_ORDER; VM_BUG_ON(!zone_is_initialized(zone)); @@ -578,13 +555,24 @@ static inline void __free_one_page(struct page *page, return; VM_BUG_ON(migratetype == -1); + if (is_migrate_isolate(migratetype)) { + /* + * We restrict max order of merging to prevent merge + * between freepages on isolate pageblock and normal + * pageblock. Without this, pageblock isolation + * could cause incorrect freepage accounting. 
+ */ + max_order = min(MAX_ORDER, pageblock_order + 1); + } else { + __mod_zone_freepage_state(zone, 1 << order, migratetype); + } - page_idx = pfn & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << max_order) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); - while (order < MAX_ORDER-1) { + while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) @@ -595,9 +583,11 @@ static inline void __free_one_page(struct page *page, */ if (page_is_guard(buddy)) { clear_page_guard_flag(buddy); - set_page_private(page, 0); - __mod_zone_freepage_state(zone, 1 << order, - migratetype); + set_page_private(buddy, 0); + if (!is_migrate_isolate(migratetype)) { + __mod_zone_freepage_state(zone, 1 << order, + migratetype); + } } else { list_del(&buddy->lru); zone->free_area[order].nr_free--; @@ -680,9 +670,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -713,14 +706,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* must delete as __free_one_page list manipulates */ list_del(&page->lru); mt = get_freepage_migratetype(page); + if (unlikely(has_isolate_pageblock(zone))) + mt = get_pageblock_migratetype(page); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (likely(!is_migrate_isolate_page(page))) { - __mod_zone_page_state(zone, NR_FREE_PAGES, 1); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); - } } while (--to_free && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); @@ -731,12 +722,17 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); + if (unlikely(has_isolate_pageblock(zone) || + is_migrate_isolate(migratetype))) { + migratetype = get_pfnblock_migratetype(page, pfn); + } __free_one_page(page, pfn, zone, order, migratetype); - if (unlikely(!is_migrate_isolate(migratetype))) - __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); } @@ -1008,7 +1004,7 @@ int move_freepages(struct zone *zone, * Remove at a later date when no bug reports exist related to * grouping pages by mobility */ - BUG_ON(page_zone(start_page) != page_zone(end_page)); + VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); #endif for (page = start_page; page <= end_page;) { @@ -1257,15 +1253,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; @@ -1483,7 +1475,7 @@ void split_page(struct 
page *page, unsigned int order) } EXPORT_SYMBOL_GPL(split_page); -static int __isolate_free_page(struct page *page, unsigned int order) +int __isolate_free_page(struct page *page, unsigned int order) { unsigned long watermark; struct zone *zone; @@ -1610,6 +1602,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && + !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) + set_bit(ZONE_FAIR_DEPLETED, &zone->flags); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1712,7 +1707,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, { /* free_pages my go negative - that's OK */ long min = mark; - long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1727,7 +1721,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1922,6 +1916,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1939,8 +1945,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1964,9 +1974,11 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) + break; + if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { + nr_fair_skipped++; continue; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + } } /* * When allocating a page cache page for writing, we @@ -2072,13 +2084,7 @@ this_zone_full: zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2087,8 +2093,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. 
+ */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2201,13 +2236,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, { struct page *page; - /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zonelist_oom(zonelist, gfp_mask)) { + /* Acquire the per-zone oom lock for each zone */ + if (!oom_zonelist_trylock(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if * we're still under heavy pressure. @@ -2240,7 +2283,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, out_of_memory(zonelist, gfp_mask, order, nodemask, false); out: - clear_zonelist_oom(zonelist, gfp_mask); + oom_zonelist_unlock(zonelist, gfp_mask); return page; } @@ -2251,58 +2294,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, int classzone_idx, int migratetype, enum migrate_mode mode, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int *contended_compaction, bool *deferred_compaction) { - if (!order) - return NULL; + struct zone *last_compact_zone = NULL; + unsigned long compact_result; + struct page *page; - if (compaction_deferred(preferred_zone, order)) { - *deferred_compaction = true; + if (!order) return NULL; - } current->flags |= PF_MEMALLOC; - *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, mode, - contended_compaction); + contended_compaction, + &last_compact_zone); current->flags &= ~PF_MEMALLOC; - if (*did_some_progress != COMPACT_SKIPPED) { - struct page *page; + switch (compact_result) { + case COMPACT_DEFERRED: + *deferred_compaction = true; + /* fall-through */ + case COMPACT_SKIPPED: + return NULL; + default: + break; + } - /* Page migration frees to the PCP lists but we want merging */ - drain_pages(get_cpu()); - put_cpu(); + /* + * At least in one zone compaction wasn't deferred or skipped, so let's + * count a compaction stall + */ + count_vm_event(COMPACTSTALL); - page = get_page_from_freelist(gfp_mask, nodemask, - order, zonelist, high_zoneidx, - alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, classzone_idx, migratetype); - if (page) { - preferred_zone->compact_blockskip_flush = false; - compaction_defer_reset(preferred_zone, order, true); - count_vm_event(COMPACTSUCCESS); - return page; - } + /* Page migration frees to the PCP lists but we want merging */ + drain_pages(get_cpu()); + put_cpu(); - /* - * It's bad if compaction run occurs and fails. - * The most likely reason is that pages exist, - * but not enough to satisfy watermarks. 
- */ - count_vm_event(COMPACTFAIL); + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, classzone_idx, migratetype); - /* - * As async compaction considers a subset of pageblocks, only - * defer if the failure was a sync compaction failure. - */ - if (mode != MIGRATE_ASYNC) - defer_compaction(preferred_zone, order); + if (page) { + struct zone *zone = page_zone(page); - cond_resched(); + zone->compact_blockskip_flush = false; + compaction_defer_reset(zone, order, true); + count_vm_event(COMPACTSUCCESS); + return page; } + /* + * last_compact_zone is where try_to_compact_pages thought allocation + * should succeed, so it did not defer compaction. But here we know + * that it didn't succeed, so we do the defer. + */ + if (last_compact_zone && mode != MIGRATE_ASYNC) + defer_compaction(last_compact_zone, order); + + /* + * It's bad if compaction run occurs and fails. The most likely reason + * is that pages exist, but not enough to satisfy watermarks. + */ + count_vm_event(COMPACTFAIL); + + cond_resched(); + return NULL; } #else @@ -2310,9 +2367,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int classzone_idx, int migratetype, - enum migrate_mode mode, bool *contended_compaction, - bool *deferred_compaction, unsigned long *did_some_progress) + int classzone_idx, int migratetype, enum migrate_mode mode, + int *contended_compaction, bool *deferred_compaction) { return NULL; } @@ -2409,37 +2465,17 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. 
- */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, - struct zone *preferred_zone) + struct zone *preferred_zone, + nodemask_t *nodemask) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) wakeup_kswapd(zone, order, zone_idx(preferred_zone)); } @@ -2486,7 +2522,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_NO_WATERMARKS; } #ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) + if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif return alloc_flags; @@ -2510,7 +2546,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; - bool contended_compaction = false; + int contended_compaction = COMPACT_CONTENDED_NONE; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2537,7 +2573,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, restart: if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); + wake_all_kswapds(order, zonelist, high_zoneidx, + preferred_zone, nodemask); /* * OK, we're below the kswapd watermark and have kicked background @@ -2610,29 +2647,50 @@ rebalance: preferred_zone, classzone_idx, migratetype, migration_mode, &contended_compaction, - &deferred_compaction, - &did_some_progress); + &deferred_compaction); if (page) goto got_pg; + /* Checks for THP-specific high-order allocations */ + if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { + /* + * If compaction is deferred for high-order allocations, it is + * because sync compaction recently failed. If this is the case + * and the caller requested a THP allocation, we do not want + * to heavily disrupt the system, so we fail the allocation + * instead of entering direct reclaim. + */ + if (deferred_compaction) + goto nopage; + + /* + * In all zones where compaction was attempted (and not + * deferred or skipped), lock contention has been detected. + * For THP allocation we do not want to disrupt the others + * so we fallback to base pages instead. + */ + if (contended_compaction == COMPACT_CONTENDED_LOCK) + goto nopage; + + /* + * If compaction was aborted due to need_resched(), we do not + * want to further increase allocation latency, unless it is + * khugepaged trying to collapse. + */ + if (contended_compaction == COMPACT_CONTENDED_SCHED + && !(current->flags & PF_KTHREAD)) + goto nopage; + } + /* * It can become very expensive to allocate transparent hugepages at * fault, so use asynchronous memory compaction for THP unless it is * khugepaged trying to collapse. */ - if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) + if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE || + (current->flags & PF_KTHREAD)) migration_mode = MIGRATE_SYNC_LIGHT; - /* - * If compaction is deferred for high-order allocations, it is because - * sync compaction recently failed. In this is the case and the caller - * requested a movable allocation that does not heavily disrupt the - * system then fail the allocation instead of entering direct reclaim. 
- */ - if ((deferred_compaction || contended_compaction) && - (gfp_mask & __GFP_NO_KSWAPD)) - goto nopage; - /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -2702,8 +2760,7 @@ rebalance: preferred_zone, classzone_idx, migratetype, migration_mode, &contended_compaction, - &deferred_compaction, - &did_some_progress); + &deferred_compaction); if (page) goto got_pg; } @@ -2729,7 +2786,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zone *preferred_zone; struct zoneref *preferred_zoneref; struct page *page = NULL; - int migratetype = allocflags_to_migratetype(gfp_mask); + int migratetype = gfpflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; int classzone_idx; @@ -2751,6 +2808,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) + alloc_flags |= ALLOC_CMA; + retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2762,33 +2822,12 @@ retry_cpuset: goto out; classzone_idx = zonelist_zone_idx(preferred_zoneref); -#ifdef CONFIG_CMA - if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } - /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. @@ -2962,7 +3001,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. 
*/ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); @@ -2970,7 +3009,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() @@ -3052,7 +3090,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3072,6 +3110,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; @@ -3253,12 +3292,12 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -3572,68 +3611,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) zonelist->_zonerefs[pos].zone_idx = 0; } +#if defined(CONFIG_64BIT) +/* + * Devices that require DMA32/DMA are relatively rare and do not justify a + * penalty to every machine in case the specialised case applies. Default + * to Node-ordering on 64-bit NUMA machines + */ +static int default_zonelist_order(void) +{ + return ZONELIST_ORDER_NODE; +} +#else +/* + * On 32-bit, the Normal zone needs to be preserved for allocations accessible + * by the kernel. If processes running on node 0 deplete the low memory zone + * then reclaim will occur more frequency increasing stalls and potentially + * be easier to OOM if a large percentage of the zone is under writeback or + * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. + * Hence, default to zone ordering on 32-bit. + */ static int default_zonelist_order(void) { - int nid, zone_type; - unsigned long low_kmem_size, total_size; - struct zone *z; - int average_size; - /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the system. - * If they are really small and used heavily, the system can fall - * into OOM very easily. - * This function detect ZONE_DMA/DMA32 size and configures zone order. - */ - /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) 
*/ - low_kmem_size = 0; - total_size = 0; - for_each_online_node(nid) { - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - z = &NODE_DATA(nid)->node_zones[zone_type]; - if (populated_zone(z)) { - if (zone_type < ZONE_NORMAL) - low_kmem_size += z->managed_pages; - total_size += z->managed_pages; - } else if (zone_type == ZONE_NORMAL) { - /* - * If any node has only lowmem, then node order - * is preferred to allow kernel allocations - * locally; otherwise, they can easily infringe - * on other nodes when there is an abundance of - * lowmem available to allocate from. - */ - return ZONELIST_ORDER_NODE; - } - } - } - if (!low_kmem_size || /* there are no DMA area. */ - low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ - return ZONELIST_ORDER_NODE; - /* - * look into each node's config. - * If there is a node whose DMA/DMA32 memory is very big area on - * local memory, NODE_ORDER may be suitable. - */ - average_size = total_size / - (nodes_weight(node_states[N_MEMORY]) + 1); - for_each_online_node(nid) { - low_kmem_size = 0; - total_size = 0; - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { - z = &NODE_DATA(nid)->node_zones[zone_type]; - if (populated_zone(z)) { - if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; - } - } - if (low_kmem_size && - total_size > average_size && /* ignore small node */ - low_kmem_size > total_size * 70/100) - return ZONELIST_ORDER_NODE; - } return ZONELIST_ORDER_ZONE; } +#endif /* CONFIG_64BIT */ static void set_zonelist_order(void) { @@ -4969,6 +4970,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, + (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5579,7 +5582,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -5694,9 +5697,8 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); __mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - - low_wmark_pages(zone) - - zone_page_state(zone, NR_ALLOC_BATCH)); + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); @@ -6271,8 +6273,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, if (list_empty(&cc->migratepages)) { cc->nr_migratepages = 0; - pfn = isolate_migratepages_range(cc->zone, cc, - pfn, end, true); + pfn = isolate_migratepages_range(cc, pfn, end); if (!pfn) { ret = -EINTR; break; @@ -6398,13 +6399,12 @@ int alloc_contig_range(unsigned long start, unsigned long end, /* Make sure the range is really isolated. */ if (test_pages_isolated(outer_start, end, false)) { - pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", - outer_start, end); + pr_info("%s: [%lx, %lx) PFNs busy\n", + __func__, outer_start, end); ret = -EBUSY; goto done; } - /* Grab isolated pages from freelists. 
*/ outer_end = isolate_freepages_range(&cc, outer_start, end); if (!outer_end) { @@ -6548,97 +6548,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } #endif - -static const struct trace_print_flags pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << PG_writeback, "writeback" }, -#ifdef CONFIG_PAGEFLAGS_EXTENDED - {1UL << PG_head, "head" }, - {1UL << PG_tail, "tail" }, -#else - {1UL << PG_compound, "compound" }, -#endif - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - {1UL << PG_compound_lock, "compound_lock" }, -#endif -}; - -static void dump_page_flags(unsigned long flags) -{ - const char *delim = ""; - unsigned long mask; - int i; - - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - - printk(KERN_ALERT "page flags: %#lx(", flags); - - /* remove zone id */ - flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { - - mask = pageflag_names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - printk("%s%s", delim, pageflag_names[i].name); - delim = "|"; - } - - /* check for left over flags */ - if (flags) - printk("%s%#lx", delim, flags); - - printk(")\n"); -} - -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) -{ - printk(KERN_ALERT - "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, atomic_read(&page->_count), page_mapcount(page), - page->mapping, page->index); - dump_page_flags(page->flags); - if (reason) - pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_page_flags(page->flags & badflags); - } - mem_cgroup_print_bad_page(page); -} - -void dump_page(struct page *page, const char *reason) -{ - dump_page_badflags(page, reason, 0); -} -EXPORT_SYMBOL(dump_page); |
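
Note on the hunk above that removes the __find_buddy_index() comment block: the buddy-pairing arithmetic it documented (B2 = B1 ^ (1 << O) for the order-O buddy, P = B & ~(1 << O) for the combined order-(O+1) page) is easy to check in isolation. The following is a minimal user-space sketch of that relation only; the demo_* names are made up for illustration and this is not the kernel implementation.

/* Sketch of the buddy index math from the removed comment:
 * for a page at index B with order O, its buddy is B ^ (1 << O)
 * and the merged order-(O+1) page starts at B & ~(1 << O). */
#include <stdio.h>

static unsigned long demo_find_buddy_index(unsigned long page_idx,
					    unsigned int order)
{
	return page_idx ^ (1UL << order);
}

static unsigned long demo_parent_index(unsigned long page_idx,
				       unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	/* Example from the removed comment: page #8 at order 1 buddies
	 * with #10, and the merged order-2 page starts at index 8. */
	printf("buddy of 8 at order 1:  %lu\n", demo_find_buddy_index(8, 1));
	printf("parent of 8 at order 1: %lu\n", demo_parent_index(8, 1));
	return 0;
}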
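
The get_page_from_freelist() changes above fold the fairness retry into the scanner itself: a first pass skips local zones whose NR_ALLOC_BATCH budget is depleted (ZONE_FAIR_DEPLETED), and if any zone was skipped the batches are reset and the zonelist is rescanned without ALLOC_FAIR before entering the slowpath. The stand-alone sketch below illustrates only that two-pass idea with simplified, hypothetical types (demo_zone, demo_get_page); it is not the kernel code or API.

/* Two-pass fair-zone scan, simplified: pass 1 honours the fairness
 * budget, pass 2 drops it, refills the budgets and rescans. */
#include <stdbool.h>
#include <stdio.h>

struct demo_zone {
	const char *name;
	long alloc_batch;	/* fairness budget, cf. NR_ALLOC_BATCH */
	long free_pages;
};

static bool demo_try_alloc(struct demo_zone *z, bool fair)
{
	if (fair && z->alloc_batch <= 0)
		return false;	/* cf. ZONE_FAIR_DEPLETED: skipped in the fair pass */
	if (z->free_pages <= 0)
		return false;
	z->free_pages--;
	z->alloc_batch--;
	return true;
}

static struct demo_zone *demo_get_page(struct demo_zone *zones, int nr)
{
	bool fair = true;
rescan:
	for (int i = 0; i < nr; i++)
		if (demo_try_alloc(&zones[i], fair))
			return &zones[i];
	if (fair) {
		fair = false;	/* second pass: no fairness constraint */
		for (int i = 0; i < nr; i++)
			zones[i].alloc_batch = 4;	/* arbitrary refill for the demo */
		goto rescan;
	}
	return NULL;	/* the kernel would fall into the slowpath here */
}

int main(void)
{
	struct demo_zone zones[] = {
		{ "Normal", 0, 8 },	/* budget depleted but pages still free */
		{ "DMA32",  0, 8 },
	};
	struct demo_zone *z = demo_get_page(zones, 2);

	printf("allocated from %s\n", z ? z->name : "nowhere");
	return 0;
}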