diff options
Diffstat (limited to 'mm')
71 files changed, 3811 insertions, 2545 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 03cbfa072f42..989f8f3d77e0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,7 +187,6 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE def_bool y @@ -652,10 +651,9 @@ config IDLE_PAGE_TRACKING config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" if EXPERT - default !ZONE_DMA - depends on !ZONE_DMA depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP depends on X86_64 #arch_add_memory() comprehends device memory help @@ -669,3 +667,8 @@ config ZONE_DEVICE config FRAME_VECTOR bool + +config ARCH_USES_HIGH_VMA_FLAGS + bool +config ARCH_HAS_PKEYS + bool diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 957d3da53ddd..22f4cd96acb0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -16,8 +16,8 @@ config DEBUG_PAGEALLOC select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). - This results in a large slowdown, but helps to find certain types - of memory corruption. + Depending on runtime enablement, this results in a small or large + slowdown, but helps to find certain types of memory corruption. For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC, fill the pages with poison patterns after free_pages() and verify @@ -26,5 +26,69 @@ config DEBUG_PAGEALLOC that would result in incorrect warnings of memory corruption after a resume because free pages are not saved to the suspend image. + By default this option will have a small overhead, e.g. by not + allowing the kernel mapping to be backed by large pages on some + architectures. Even bigger overhead comes when the debugging is + enabled by DEBUG_PAGEALLOC_ENABLE_DEFAULT or the debug_pagealloc + command line parameter. + +config DEBUG_PAGEALLOC_ENABLE_DEFAULT + bool "Enable debug page memory allocations by default?" + default n + depends on DEBUG_PAGEALLOC + ---help--- + Enable debug page memory allocations by default? This value + can be overridden by debug_pagealloc=off|on. + config PAGE_POISONING + bool "Poison pages after freeing" + select PAGE_EXTENSION + select PAGE_POISONING_NO_SANITY if HIBERNATION + ---help--- + Fill the pages with poison patterns after free_pages() and verify + the patterns before alloc_pages. The filling of the memory helps + reduce the risk of information leaks from freed data. This does + have a potential performance impact. + + Note that "poison" here is not the same thing as the "HWPoison" + for CONFIG_MEMORY_FAILURE. This is software poisoning only. + + If unsure, say N + +config PAGE_POISONING_NO_SANITY + depends on PAGE_POISONING + bool "Only poison, don't sanity check" + ---help--- + Skip the sanity checking on alloc, only fill the pages with + poison on free. This reduces some of the overhead of the + poisoning feature. + + If you are only interested in sanitization, say Y. Otherwise + say N. + +config PAGE_POISONING_ZERO + bool "Use zero for poisoning instead of random data" + depends on PAGE_POISONING + ---help--- + Instead of using the existing poison value, fill the pages with + zeros. This makes it harder to detect when errors are occurring + due to sanitization but the zeroing at free means that it is + no longer necessary to write zeros when GFP_ZERO is used on + allocation. + + Enabling page poisoning with this option will disable hibernation + + If unsure, say N bool + +config DEBUG_PAGE_REF + bool "Enable tracepoint to track down page reference manipulation" + depends on DEBUG_KERNEL + depends on TRACEPOINTS + ---help--- + This is a feature to add tracepoint for tracking down page reference + manipulation. This tracking is useful to diagnose functional failure + due to migration failures caused by page reference mismatches. Be + careful when enabling this feature because it adds about 30 KB to the + kernel code. However the runtime performance overhead is virtually + nil until the tracepoints are actually enabled. diff --git a/mm/Makefile b/mm/Makefile index 2ed43191fc3b..deb467edca2d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,8 +3,24 @@ # KASAN_SANITIZE_slab_common.o := n +KASAN_SANITIZE_slab.o := n KASAN_SANITIZE_slub.o := n +# These files are disabled because they produce non-interesting and/or +# flaky coverage that is not a function of syscall inputs. E.g. slab is out of +# free pages, or a task is migrated between nodes. +KCOV_INSTRUMENT_slab_common.o := n +KCOV_INSTRUMENT_slob.o := n +KCOV_INSTRUMENT_slab.o := n +KCOV_INSTRUMENT_slub.o := n +KCOV_INSTRUMENT_page_alloc.o := n +KCOV_INSTRUMENT_debug-pagealloc.o := n +KCOV_INSTRUMENT_kmemleak.o := n +KCOV_INSTRUMENT_kmemcheck.o := n +KCOV_INSTRUMENT_memcontrol.o := n +KCOV_INSTRUMENT_mmzone.o := n +KCOV_INSTRUMENT_vmstat.o := n + mmu-y := nommu.o mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ @@ -48,7 +64,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_KSM) += ksm.o -obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o +obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KMEMCHECK) += kmemcheck.o @@ -81,3 +97,4 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o +obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c554d173a65f..bfbd7096b6ed 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1026,8 +1026,8 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write, if (copy_to_user(buffer, kbuf, sizeof(kbuf))) return -EFAULT; - printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", - table->procname); + pr_warn_once("%s exported in /proc is scheduled for removal\n", + table->procname); *lenp = 2; *ppos += *lenp; diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 300117f1a08f..57b3e9bd6bc5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -13,10 +13,10 @@ /* * balloon_page_enqueue - allocates a new page and inserts it into the balloon * page list. - * @b_dev_info: balloon device decriptor where we will insert a new page to + * @b_dev_info: balloon device descriptor where we will insert a new page to * * Driver must call it to properly allocate a new enlisted balloon page - * before definetively removing it from the guest system. + * before definitively removing it from the guest system. * This function returns the page address for the recently enqueued page or * NULL in the case we fail to allocate a new page this turn. */ diff --git a/mm/bootmem.c b/mm/bootmem.c index 91e32bc8517f..0aa7dda52402 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -50,8 +50,7 @@ early_param("bootmem_debug", bootmem_debug_setup); #define bdebug(fmt, args...) ({ \ if (unlikely(bootmem_debug)) \ - printk(KERN_INFO \ - "bootmem::%s " fmt, \ + pr_info("bootmem::%s " fmt, \ __func__, ## args); \ }) @@ -680,7 +679,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, /* * Whoops, we cannot satisfy the allocation request. */ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } @@ -755,7 +754,7 @@ void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } diff --git a/mm/compaction.c b/mm/compaction.c index 585de54dbe8c..ccf97b02b85f 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -7,6 +7,7 @@ * * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> */ +#include <linux/cpu.h> #include <linux/swap.h> #include <linux/migrate.h> #include <linux/compaction.h> @@ -17,6 +18,8 @@ #include <linux/balloon_compaction.h> #include <linux/page-isolation.h> #include <linux/kasan.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include "internal.h" #ifdef CONFIG_COMPACTION @@ -71,49 +74,6 @@ static inline bool migrate_async_suitable(int migratetype) return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; } -/* - * Check that the whole (or subset of) a pageblock given by the interval of - * [start_pfn, end_pfn) is valid and within the same zone, before scanning it - * with the migration of free compaction scanner. The scanners then need to - * use only pfn_valid_within() check for arches that allow holes within - * pageblocks. - * - * Return struct page pointer of start_pfn, or NULL if checks were not passed. - * - * It's possible on some configurations to have a setup like node0 node1 node0 - * i.e. it's possible that all pages within a zones range of pages do not - * belong to a single zone. We assume that a border between node0 and node1 - * can occur within a single pageblock, but not a node0 node1 node0 - * interleaving within a single pageblock. It is therefore sufficient to check - * the first and last page of a pageblock and avoid checking each individual - * page in a pageblock. - */ -static struct page *pageblock_pfn_to_page(unsigned long start_pfn, - unsigned long end_pfn, struct zone *zone) -{ - struct page *start_page; - struct page *end_page; - - /* end_pfn is one past the range we are checking */ - end_pfn--; - - if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) - return NULL; - - start_page = pfn_to_page(start_pfn); - - if (page_zone(start_page) != zone) - return NULL; - - end_page = pfn_to_page(end_pfn); - - /* This gives a shorter code than deriving page_zone(end_page) */ - if (page_zone_id(start_page) != page_zone_id(end_page)) - return NULL; - - return start_page; -} - #ifdef CONFIG_COMPACTION /* Do not skip compaction more than 64 times */ @@ -200,7 +160,8 @@ static void reset_cached_positions(struct zone *zone) { zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; - zone->compact_cached_free_pfn = zone_end_pfn(zone); + zone->compact_cached_free_pfn = + round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); } /* @@ -554,13 +515,17 @@ unsigned long isolate_freepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long isolated, pfn, block_end_pfn; + unsigned long isolated, pfn, block_start_pfn, block_end_pfn; LIST_HEAD(freelist); pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn += isolated, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; @@ -573,11 +538,13 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); } - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) break; isolated = isolate_freepages_block(cc, &isolate_start_pfn, @@ -863,18 +830,23 @@ unsigned long isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn, block_end_pfn; + unsigned long pfn, block_start_pfn, block_end_pfn; /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; + block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < cc->zone->zone_start_pfn) + block_start_pfn = cc->zone->zone_start_pfn; block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); for (; pfn < end_pfn; pfn = block_end_pfn, + block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { block_end_pfn = min(block_end_pfn, end_pfn); - if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) + if (!pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, cc->zone)) continue; pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, @@ -1103,7 +1075,9 @@ int sysctl_compact_unevictable_allowed __read_mostly = 1; static isolate_migrate_t isolate_migratepages(struct zone *zone, struct compact_control *cc) { - unsigned long low_pfn, end_pfn; + unsigned long block_start_pfn; + unsigned long block_end_pfn; + unsigned long low_pfn; unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = @@ -1115,16 +1089,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; + block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + if (block_start_pfn < zone->zone_start_pfn) + block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); /* * Iterate over whole pageblocks until we find the first suitable. * Do not cross the free scanner. */ - for (; end_pfn <= cc->free_pfn; - low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { + for (; block_end_pfn <= cc->free_pfn; + low_pfn = block_end_pfn, + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { /* * This can potentially iterate a massively long zone with @@ -1135,7 +1114,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, && compact_should_abort(cc)) break; - page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); + page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, + zone); if (!page) continue; @@ -1154,8 +1134,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, /* Perform the isolation */ isolate_start_pfn = low_pfn; - low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, - isolate_mode); + low_pfn = isolate_migratepages_block(cc, low_pfn, + block_end_pfn, isolate_mode); if (!low_pfn || cc->contended) { acct_isolated(zone, cc); @@ -1211,11 +1191,11 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, /* * Mark that the PG_migrate_skip information should be cleared - * by kswapd when it goes to sleep. kswapd does not set the + * by kswapd when it goes to sleep. kcompactd does not set the * flag itself as the decision to be clear should be directly * based on an allocation request. */ - if (!current_is_kswapd()) + if (cc->direct_compaction) zone->compact_blockskip_flush = true; return COMPACT_COMPLETE; @@ -1358,10 +1338,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* * Clear pageblock skip if there were failures recently and compaction - * is about to be retried after being deferred. kswapd does not do - * this reset as it'll reset the cached information when going to sleep. + * is about to be retried after being deferred. */ - if (compaction_restarting(zone, cc->order) && !current_is_kswapd()) + if (compaction_restarting(zone, cc->order)) __reset_isolation_suitable(zone); /* @@ -1371,11 +1350,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) */ cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; - if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { - cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); + if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { + cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); zone->compact_cached_free_pfn = cc->free_pfn; } - if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { + if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { cc->migrate_pfn = start_pfn; zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; @@ -1497,6 +1476,7 @@ static unsigned long compact_zone_order(struct zone *zone, int order, .mode = mode, .alloc_flags = alloc_flags, .classzone_idx = classzone_idx, + .direct_compaction = true, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1759,4 +1739,223 @@ void compaction_unregister_node(struct node *node) } #endif /* CONFIG_SYSFS && CONFIG_NUMA */ +static inline bool kcompactd_work_requested(pg_data_t *pgdat) +{ + return pgdat->kcompactd_max_order > 0; +} + +static bool kcompactd_node_suitable(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; + + for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + + if (!populated_zone(zone)) + continue; + + if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0, + classzone_idx) == COMPACT_CONTINUE) + return true; + } + + return false; +} + +static void kcompactd_do_work(pg_data_t *pgdat) +{ + /* + * With no special task, compact all zones so that a page of requested + * order is allocatable. + */ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + + }; + bool success = false; + + trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order, + cc.classzone_idx); + count_vm_event(KCOMPACTD_WAKE); + + for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + int status; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + if (compaction_deferred(zone, cc.order)) + continue; + + if (compaction_suitable(zone, cc.order, 0, zoneid) != + COMPACT_CONTINUE) + continue; + + cc.nr_freepages = 0; + cc.nr_migratepages = 0; + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + status = compact_zone(zone, &cc); + + if (zone_watermark_ok(zone, cc.order, low_wmark_pages(zone), + cc.classzone_idx, 0)) { + success = true; + compaction_defer_reset(zone, cc.order, false); + } else if (status == COMPACT_COMPLETE) { + /* + * We use sync migration mode here, so we defer like + * sync direct compaction does. + */ + defer_compaction(zone, cc.order); + } + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + /* + * Regardless of success, we are done until woken up next. But remember + * the requested order/classzone_idx in case it was higher/tighter than + * our current ones + */ + if (pgdat->kcompactd_max_order <= cc.order) + pgdat->kcompactd_max_order = 0; + if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx) + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; +} + +void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx) +{ + if (!order) + return; + + if (pgdat->kcompactd_max_order < order) + pgdat->kcompactd_max_order = order; + + if (pgdat->kcompactd_classzone_idx > classzone_idx) + pgdat->kcompactd_classzone_idx = classzone_idx; + + if (!waitqueue_active(&pgdat->kcompactd_wait)) + return; + + if (!kcompactd_node_suitable(pgdat)) + return; + + trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order, + classzone_idx); + wake_up_interruptible(&pgdat->kcompactd_wait); +} + +/* + * The background compaction daemon, started as a kernel thread + * from the init process. + */ +static int kcompactd(void *p) +{ + pg_data_t *pgdat = (pg_data_t*)p; + struct task_struct *tsk = current; + + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + pgdat->kcompactd_max_order = 0; + pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1; + + while (!kthread_should_stop()) { + trace_mm_compaction_kcompactd_sleep(pgdat->node_id); + wait_event_freezable(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat)); + + kcompactd_do_work(pgdat); + } + + return 0; +} + +/* + * This kcompactd start function will be called by init and node-hot-add. + * On node-hot-add, kcompactd will moved to proper cpus if cpus are hot-added. + */ +int kcompactd_run(int nid) +{ + pg_data_t *pgdat = NODE_DATA(nid); + int ret = 0; + + if (pgdat->kcompactd) + return 0; + + pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid); + if (IS_ERR(pgdat->kcompactd)) { + pr_err("Failed to start kcompactd on node %d\n", nid); + ret = PTR_ERR(pgdat->kcompactd); + pgdat->kcompactd = NULL; + } + return ret; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void kcompactd_stop(int nid) +{ + struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd; + + if (kcompactd) { + kthread_stop(kcompactd); + NODE_DATA(nid)->kcompactd = NULL; + } +} + +/* + * It's optimal to keep kcompactd on the same CPUs as their memory, but + * not required for correctness. So if the last cpu in a node goes + * away, we get changed to run anywhere: as the first one comes back, + * restore their cpu bindings. + */ +static int cpu_callback(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + int nid; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->kcompactd, mask); + } + } + return NOTIFY_OK; +} + +static int __init kcompactd_init(void) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) + kcompactd_run(nid); + hotcpu_notifier(cpu_callback, 0); + return 0; +} +subsys_initcall(kcompactd_init) + #endif /* CONFIG_COMPACTION */ diff --git a/mm/debug.c b/mm/debug.c index f05b2d5d6481..8865bfb41b0b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -9,91 +9,52 @@ #include <linux/mm.h> #include <linux/trace_events.h> #include <linux/memcontrol.h> - -static const struct trace_print_flags pageflag_names[] = { - {1UL << PG_locked, "locked" }, - {1UL << PG_error, "error" }, - {1UL << PG_referenced, "referenced" }, - {1UL << PG_uptodate, "uptodate" }, - {1UL << PG_dirty, "dirty" }, - {1UL << PG_lru, "lru" }, - {1UL << PG_active, "active" }, - {1UL << PG_slab, "slab" }, - {1UL << PG_owner_priv_1, "owner_priv_1" }, - {1UL << PG_arch_1, "arch_1" }, - {1UL << PG_reserved, "reserved" }, - {1UL << PG_private, "private" }, - {1UL << PG_private_2, "private_2" }, - {1UL << PG_writeback, "writeback" }, - {1UL << PG_head, "head" }, - {1UL << PG_swapcache, "swapcache" }, - {1UL << PG_mappedtodisk, "mappedtodisk" }, - {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_swapbacked, "swapbacked" }, - {1UL << PG_unevictable, "unevictable" }, -#ifdef CONFIG_MMU - {1UL << PG_mlocked, "mlocked" }, -#endif -#ifdef CONFIG_ARCH_USES_PG_UNCACHED - {1UL << PG_uncached, "uncached" }, -#endif -#ifdef CONFIG_MEMORY_FAILURE - {1UL << PG_hwpoison, "hwpoison" }, -#endif -#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) - {1UL << PG_young, "young" }, - {1UL << PG_idle, "idle" }, -#endif +#include <trace/events/mmflags.h> +#include <linux/migrate.h> +#include <linux/page_owner.h> + +#include "internal.h" + +char *migrate_reason_names[MR_TYPES] = { + "compaction", + "memory_failure", + "memory_hotplug", + "syscall_or_cpuset", + "mempolicy_mbind", + "numa_misplaced", + "cma", }; -static void dump_flags(unsigned long flags, - const struct trace_print_flags *names, int count) -{ - const char *delim = ""; - unsigned long mask; - int i; - - pr_emerg("flags: %#lx(", flags); - - /* remove zone id */ - flags &= (1UL << NR_PAGEFLAGS) - 1; - - for (i = 0; i < count && flags; i++) { - - mask = names[i].mask; - if ((flags & mask) != mask) - continue; - - flags &= ~mask; - pr_cont("%s%s", delim, names[i].name); - delim = "|"; - } +const struct trace_print_flags pageflag_names[] = { + __def_pageflag_names, + {0, NULL} +}; - /* check for left over flags */ - if (flags) - pr_cont("%s%#lx", delim, flags); +const struct trace_print_flags gfpflag_names[] = { + __def_gfpflag_names, + {0, NULL} +}; - pr_cont(")\n"); -} +const struct trace_print_flags vmaflag_names[] = { + __def_vmaflag_names, + {0, NULL} +}; -void dump_page_badflags(struct page *page, const char *reason, - unsigned long badflags) +void __dump_page(struct page *page, const char *reason) { pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", - page, atomic_read(&page->_count), page_mapcount(page), + page, page_ref_count(page), page_mapcount(page), page->mapping, page->index); if (PageCompound(page)) pr_cont(" compound_mapcount: %d", compound_mapcount(page)); pr_cont("\n"); - BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); - dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); + BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); + + pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + if (reason) pr_alert("page dumped because: %s\n", reason); - if (page->flags & badflags) { - pr_alert("bad because of flags:\n"); - dump_flags(page->flags & badflags, - pageflag_names, ARRAY_SIZE(pageflag_names)); - } + #ifdef CONFIG_MEMCG if (page->mem_cgroup) pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup); @@ -102,67 +63,26 @@ void dump_page_badflags(struct page *page, const char *reason, void dump_page(struct page *page, const char *reason) { - dump_page_badflags(page, reason, 0); + __dump_page(page, reason); + dump_page_owner(page); } EXPORT_SYMBOL(dump_page); #ifdef CONFIG_DEBUG_VM -static const struct trace_print_flags vmaflags_names[] = { - {VM_READ, "read" }, - {VM_WRITE, "write" }, - {VM_EXEC, "exec" }, - {VM_SHARED, "shared" }, - {VM_MAYREAD, "mayread" }, - {VM_MAYWRITE, "maywrite" }, - {VM_MAYEXEC, "mayexec" }, - {VM_MAYSHARE, "mayshare" }, - {VM_GROWSDOWN, "growsdown" }, - {VM_PFNMAP, "pfnmap" }, - {VM_DENYWRITE, "denywrite" }, - {VM_LOCKONFAULT, "lockonfault" }, - {VM_LOCKED, "locked" }, - {VM_IO, "io" }, - {VM_SEQ_READ, "seqread" }, - {VM_RAND_READ, "randread" }, - {VM_DONTCOPY, "dontcopy" }, - {VM_DONTEXPAND, "dontexpand" }, - {VM_ACCOUNT, "account" }, - {VM_NORESERVE, "noreserve" }, - {VM_HUGETLB, "hugetlb" }, -#if defined(CONFIG_X86) - {VM_PAT, "pat" }, -#elif defined(CONFIG_PPC) - {VM_SAO, "sao" }, -#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) - {VM_GROWSUP, "growsup" }, -#elif !defined(CONFIG_MMU) - {VM_MAPPED_COPY, "mappedcopy" }, -#else - {VM_ARCH_1, "arch_1" }, -#endif - {VM_DONTDUMP, "dontdump" }, -#ifdef CONFIG_MEM_SOFT_DIRTY - {VM_SOFTDIRTY, "softdirty" }, -#endif - {VM_MIXEDMAP, "mixedmap" }, - {VM_HUGEPAGE, "hugepage" }, - {VM_NOHUGEPAGE, "nohugepage" }, - {VM_MERGEABLE, "mergeable" }, -}; - void dump_vma(const struct vm_area_struct *vma) { pr_emerg("vma %p start %p end %p\n" "next %p prev %p mm %p\n" "prot %lx anon_vma %p vm_ops %p\n" - "pgoff %lx file %p private_data %p\n", + "pgoff %lx file %p private_data %p\n" + "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, vma->vm_prev, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, - vma->vm_file, vma->vm_private_data); - dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); + vma->vm_file, vma->vm_private_data, + vma->vm_flags, &vma->vm_flags); } EXPORT_SYMBOL(dump_vma); @@ -196,7 +116,7 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) "tlb_flush_pending %d\n" #endif - "%s", /* This is here to hold the comma */ + "def_flags: %#lx(%pGv)\n", mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, #ifdef CONFIG_MMU @@ -230,11 +150,8 @@ void dump_mm(const struct mm_struct *mm) #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) mm->tlb_flush_pending, #endif - "" /* This is here to not have a comma! */ - ); - - dump_flags(mm->def_flags, vmaflags_names, - ARRAY_SIZE(vmaflags_names)); + mm->def_flags, &mm->def_flags + ); } #endif /* CONFIG_DEBUG_VM */ diff --git a/mm/debug_page_ref.c b/mm/debug_page_ref.c new file mode 100644 index 000000000000..1aef3d562e52 --- /dev/null +++ b/mm/debug_page_ref.c @@ -0,0 +1,54 @@ +#include <linux/mm_types.h> +#include <linux/tracepoint.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/page_ref.h> + +void __page_ref_set(struct page *page, int v) +{ + trace_page_ref_set(page, v); +} +EXPORT_SYMBOL(__page_ref_set); +EXPORT_TRACEPOINT_SYMBOL(page_ref_set); + +void __page_ref_mod(struct page *page, int v) +{ + trace_page_ref_mod(page, v); +} +EXPORT_SYMBOL(__page_ref_mod); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod); + +void __page_ref_mod_and_test(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_test(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_test); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_test); + +void __page_ref_mod_and_return(struct page *page, int v, int ret) +{ + trace_page_ref_mod_and_return(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_mod_and_return); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_and_return); + +void __page_ref_mod_unless(struct page *page, int v, int u) +{ + trace_page_ref_mod_unless(page, v, u); +} +EXPORT_SYMBOL(__page_ref_mod_unless); +EXPORT_TRACEPOINT_SYMBOL(page_ref_mod_unless); + +void __page_ref_freeze(struct page *page, int v, int ret) +{ + trace_page_ref_freeze(page, v, ret); +} +EXPORT_SYMBOL(__page_ref_freeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_freeze); + +void __page_ref_unfreeze(struct page *page, int v) +{ + trace_page_ref_unfreeze(page, v); +} +EXPORT_SYMBOL(__page_ref_unfreeze); +EXPORT_TRACEPOINT_SYMBOL(page_ref_unfreeze); diff --git a/mm/dmapool.c b/mm/dmapool.c index 57312b5d6e12..abcbfe86c25a 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -294,8 +294,7 @@ void dma_pool_destroy(struct dma_pool *pool) "dma_pool_destroy %s, %p busy\n", pool->name, page->vaddr); else - printk(KERN_ERR - "dma_pool_destroy %s, %p busy\n", + pr_err("dma_pool_destroy %s, %p busy\n", pool->name, page->vaddr); /* leak the still-in-use consistent memory */ list_del(&page->page_list); @@ -424,7 +423,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) "dma_pool_free %s, %p/%lx (bad dma)\n", pool->name, vaddr, (unsigned long)dma); else - printk(KERN_ERR "dma_pool_free %s, %p/%lx (bad dma)\n", + pr_err("dma_pool_free %s, %p/%lx (bad dma)\n", pool->name, vaddr, (unsigned long)dma); return; } @@ -438,8 +437,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) "dma_pool_free %s, %p (bad vaddr)/%Lx\n", pool->name, vaddr, (unsigned long long)dma); else - printk(KERN_ERR - "dma_pool_free %s, %p (bad vaddr)/%Lx\n", + pr_err("dma_pool_free %s, %p (bad vaddr)/%Lx\n", pool->name, vaddr, (unsigned long long)dma); return; } @@ -452,13 +450,11 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) } spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) - dev_err(pool->dev, "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + dev_err(pool->dev, "dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); else - printk(KERN_ERR "dma_pool_free %s, dma %Lx " - "already free\n", pool->name, - (unsigned long long)dma); + pr_err("dma_pool_free %s, dma %Lx already free\n", + pool->name, (unsigned long long)dma); return; } } diff --git a/mm/failslab.c b/mm/failslab.c index 79171b4a5826..b0fac98cd938 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,5 +1,7 @@ #include <linux/fault-inject.h> #include <linux/slab.h> +#include <linux/mm.h> +#include "slab.h" static struct { struct fault_attr attr; @@ -11,18 +13,22 @@ static struct { .cache_filter = false, }; -bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) +bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) { + /* No fault-injection for bootstrap cache */ + if (unlikely(s == kmem_cache)) + return false; + if (gfpflags & __GFP_NOFAIL) return false; if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM)) return false; - if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) return false; - return should_fail(&failslab.attr, size); + return should_fail(&failslab.attr, s->object_size); } static int __init setup_failslab(char *str) diff --git a/mm/filemap.c b/mm/filemap.c index da7a35d83de7..a8c69c8c0a90 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -101,7 +101,7 @@ * ->tree_lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) - * ->memcg->move_lock (page_remove_rmap->mem_cgroup_begin_page_stat) + * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) @@ -176,11 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock and - * mem_cgroup_begin_page_stat(). + * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page, void *shadow, - struct mem_cgroup *memcg) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -239,8 +237,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, * anyway will be cleared before returning page into buddy allocator. */ if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, memcg, - inode_to_wb(mapping->host)); + account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } /** @@ -254,7 +251,6 @@ void __delete_from_page_cache(struct page *page, void *shadow, void delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - struct mem_cgroup *memcg; unsigned long flags; void (*freepage)(struct page *); @@ -263,11 +259,9 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (freepage) freepage(page); @@ -551,7 +545,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (!error) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *); - struct mem_cgroup *memcg; unsigned long flags; pgoff_t offset = old->index; @@ -561,9 +554,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - memcg = mem_cgroup_begin_page_stat(old); spin_lock_irqsave(&mapping->tree_lock, flags); - __delete_from_page_cache(old, NULL, memcg); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -576,8 +568,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); - mem_cgroup_replace_page(old, new); + mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) freepage(old); @@ -595,7 +586,7 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, + error = __radix_tree_create(&mapping->page_tree, page->index, 0, &node, &slot); if (error) return error; @@ -1264,7 +1255,6 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1272,8 +1262,10 @@ repeat: if (unlikely(!page)) continue; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } /* * A shadow entry of a recently evicted page, a swap * entry from shmem/tmpfs or a DAX entry. Return it @@ -1326,7 +1318,6 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: @@ -1336,13 +1327,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - WARN_ON(iter.index); - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1393,7 +1379,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: @@ -1404,12 +1389,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page, @@ -1469,7 +1450,6 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *page; @@ -1480,12 +1460,8 @@ repeat: if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* * A shadow entry of a recently evicted page. @@ -1548,7 +1524,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, tag) { struct page *page; @@ -1558,12 +1533,8 @@ repeat: continue; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { - /* - * Transient condition which can only trigger - * when entry at index 0 moves out of or back - * to root: none yet gotten, safe to restart. - */ - goto restart; + slot = radix_tree_iter_retry(&iter); + continue; } /* @@ -1668,6 +1639,15 @@ find_page: index, last_index - index); } if (!PageUptodate(page)) { + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. + */ + wait_on_page_locked_killable(page); + if (PageUptodate(page)) + goto page_ok; + if (inode->i_blkbits == PAGE_CACHE_SHIFT || !mapping->a_ops->is_partially_uptodate) goto page_not_up_to_date; @@ -1860,15 +1840,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t retval = 0; loff_t *ppos = &iocb->ki_pos; loff_t pos = *ppos; + size_t count = iov_iter_count(iter); + + if (!count) + goto out; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); loff_t size; - if (!count) - goto out; /* skip atime */ size = i_size_read(inode); retval = filemap_write_and_wait_range(mapping, pos, pos + count - 1); @@ -2171,10 +2152,11 @@ repeat: if (unlikely(!page)) goto next; if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - break; - else - goto next; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + goto next; } if (!page_cache_get_speculative(page)) @@ -2303,7 +2285,7 @@ static struct page *wait_on_page_read(struct page *page) return page; } -static struct page *__read_cache_page(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data, @@ -2325,53 +2307,74 @@ repeat: /* Presumably ENOMEM for radix tree node */ return ERR_PTR(err); } + +filler: err = filler(data, page); if (err < 0) { page_cache_release(page); - page = ERR_PTR(err); - } else { - page = wait_on_page_read(page); + return ERR_PTR(err); } - } - return page; -} - -static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data, - gfp_t gfp) -{ - struct page *page; - int err; + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; + goto out; + } + if (PageUptodate(page)) + goto out; -retry: - page = __read_cache_page(mapping, index, filler, data, gfp); - if (IS_ERR(page)) - return page; + /* + * Page is not up to date and may be locked due one of the following + * case a: Page is being filled and the page lock is held + * case b: Read/write error clearing the page uptodate status + * case c: Truncation in progress (page locked) + * case d: Reclaim in progress + * + * Case a, the page will be up to date when the page is unlocked. + * There is no need to serialise on the page lock here as the page + * is pinned so the lock gives no additional protection. Even if the + * the page is truncated, the data is still valid if PageUptodate as + * it's a race vs truncate race. + * Case b, the page will not be up to date + * Case c, the page may be truncated but in itself, the data may still + * be valid after IO completes as it's a read vs truncate race. The + * operation must restart if the page is not uptodate on unlock but + * otherwise serialising on page lock to stabilise the mapping gives + * no additional guarantees to the caller as the page lock is + * released before return. + * Case d, similar to truncation. If reclaim holds the page lock, it + * will be a race with remove_mapping that determines if the mapping + * is valid on unlock but otherwise the data is valid and there is + * no need to serialise with page lock. + * + * As the page lock gives no additional guarantee, we optimistically + * wait on the page to be unlocked and check if it's up to date and + * use the page if it is. Otherwise, the page lock is required to + * distinguish between the different cases. The motivation is that we + * avoid spurious serialisations and wakeups when multiple processes + * wait on the same page for IO to complete. + */ + wait_on_page_locked(page); if (PageUptodate(page)) goto out; + /* Distinguish between all the cases under the safety of the lock */ lock_page(page); + + /* Case c or d, restart the operation */ if (!page->mapping) { unlock_page(page); page_cache_release(page); - goto retry; + goto repeat; } + + /* Someone else locked and filled the page in a very small window */ if (PageUptodate(page)) { unlock_page(page); goto out; } - err = filler(data, page); - if (err < 0) { - page_cache_release(page); - return ERR_PTR(err); - } else { - page = wait_on_page_read(page); - if (IS_ERR(page)) - return page; - } + goto filler; + out: mark_page_accessed(page); return page; diff --git a/mm/frame_vector.c b/mm/frame_vector.c index 7cf2b7163222..381bb07ed14f 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -58,7 +58,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(current, mm, start, nr_frames, + ret = get_user_pages_locked(start, nr_frames, write, force, (struct page **)(vec->ptrs), &locked); goto out; } @@ -1,3 +1,4 @@ +#define __DISABLE_GUP_DEPRECATED 1 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/err.h> @@ -14,6 +15,7 @@ #include <linux/rwsem.h> #include <linux/hugetlb.h> +#include <asm/mmu_context.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -363,6 +365,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, return -ENOENT; if (*flags & FOLL_WRITE) fault_flags |= FAULT_FLAG_WRITE; + if (*flags & FOLL_REMOTE) + fault_flags |= FAULT_FLAG_REMOTE; if (nonblocking) fault_flags |= FAULT_FLAG_ALLOW_RETRY; if (*flags & FOLL_NOWAIT) @@ -413,11 +417,13 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) { vm_flags_t vm_flags = vma->vm_flags; + int write = (gup_flags & FOLL_WRITE); + int foreign = (gup_flags & FOLL_REMOTE); if (vm_flags & (VM_IO | VM_PFNMAP)) return -EFAULT; - if (gup_flags & FOLL_WRITE) { + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; @@ -443,6 +449,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_MAYREAD)) return -EFAULT; } + /* + * gups are always data accesses, not instruction + * fetches, so execute=false here + */ + if (!arch_vma_access_permitted(vma, write, false, foreign)) + return -EFAULT; return 0; } @@ -609,6 +621,28 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); +bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +{ + bool write = !!(fault_flags & FAULT_FLAG_WRITE); + bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); + vm_flags_t vm_flags = write ? VM_WRITE : VM_READ; + + if (!(vm_flags & vma->vm_flags)) + return false; + + /* + * The architecture might have a hardware protection + * mechanism other than read/write that can deny access. + * + * gup always represents data access, not instruction + * fetches, so execute=false here: + */ + if (!arch_vma_access_permitted(vma, write, false, foreign)) + return false; + + return true; +} + /* * fixup_user_fault() - manually resolve a user page fault * @tsk: the task_struct to use for page fault accounting, or @@ -644,7 +678,6 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, bool *unlocked) { struct vm_area_struct *vma; - vm_flags_t vm_flags; int ret, major = 0; if (unlocked) @@ -655,8 +688,7 @@ retry: if (!vma || address < vma->vm_start) return -EFAULT; - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; - if (!(vm_flags & vma->vm_flags)) + if (!vma_permits_fault(vma, fault_flags)) return -EFAULT; ret = handle_mm_fault(mm, vma, address, fault_flags); @@ -807,15 +839,15 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, * if (locked) * up_read(&mm->mmap_sem); */ -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_locked6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, int *locked) { - return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, NULL, locked, true, FOLL_TOUCH); + return __get_user_pages_locked(current, current->mm, start, nr_pages, + write, force, pages, NULL, locked, true, + FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages_locked); +EXPORT_SYMBOL(get_user_pages_locked6); /* * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to @@ -860,17 +892,16 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * or if "force" shall be set to 1 (get_user_pages_fast misses the * "force" parameter). */ -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, - force, pages, FOLL_TOUCH); + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, + write, force, pages, FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages_unlocked); +EXPORT_SYMBOL(get_user_pages_unlocked5); /* - * get_user_pages() - pin user pages in memory + * get_user_pages_remote() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. * @mm: mm_struct of target mm @@ -924,14 +955,32 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * should use get_user_pages because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) { return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force, - pages, vmas, NULL, false, FOLL_TOUCH); + pages, vmas, NULL, false, + FOLL_TOUCH | FOLL_REMOTE); +} +EXPORT_SYMBOL(get_user_pages_remote); + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's. We also + * obviously don't pass FOLL_REMOTE in here. + */ +long get_user_pages6(unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + return __get_user_pages_locked(current, current->mm, start, nr_pages, + write, force, pages, vmas, NULL, false, + FOLL_TOUCH); } -EXPORT_SYMBOL(get_user_pages); +EXPORT_SYMBOL(get_user_pages6); /** * populate_vma_page_range() - populate a range of pages in the vma. @@ -1144,6 +1193,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; + if (!arch_pte_access_permitted(pte, write)) + goto pte_unmap; + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); head = compound_head(page); @@ -1467,3 +1519,38 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, } #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ + +long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + WARN_ONCE(tsk != current, "get_user_pages() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages() called on remote mm"); + + return get_user_pages6(start, nr_pages, write, force, pages, vmas); +} +EXPORT_SYMBOL(get_user_pages8); + +long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, int *locked) +{ + WARN_ONCE(tsk != current, "get_user_pages_locked() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages_locked() called on remote mm"); + + return get_user_pages_locked6(start, nr_pages, write, force, pages, locked); +} +EXPORT_SYMBOL(get_user_pages_locked8); + +long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + WARN_ONCE(tsk != current, "get_user_pages_unlocked() called on remote task"); + WARN_ONCE(mm != current->mm, "get_user_pages_unlocked() called on remote mm"); + + return get_user_pages_unlocked5(start, nr_pages, write, force, pages); +} +EXPORT_SYMBOL(get_user_pages_unlocked7); + diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e10a4fee88d2..86f9f8b82f8e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -78,12 +78,12 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| #endif - (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| + (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)| (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); /* default scan 8*512 pte (or vmas) every 30 second */ -static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; +static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; @@ -98,7 +98,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * it would have happened if the vma was large enough during page * fault. */ -static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; +static unsigned int khugepaged_max_ptes_none __read_mostly; static int khugepaged(void *none); static int khugepaged_slab_init(void); @@ -168,8 +168,7 @@ static void set_recommended_min_free_kbytes(void) if (recommended_min > min_free_kbytes) { if (user_min_free_kbytes >= 0) - pr_info("raising min_free_kbytes from %d to %lu " - "to help transparent hugepage allocations\n", + pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n", min_free_kbytes, recommended_min); min_free_kbytes = recommended_min; @@ -270,37 +269,35 @@ static struct shrinker huge_zero_page_shrinker = { #ifdef CONFIG_SYSFS -static ssize_t double_flag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf, - enum transparent_hugepage_flag enabled, - enum transparent_hugepage_flag req_madv) -{ - if (test_bit(enabled, &transparent_hugepage_flags)) { - VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags)); - return sprintf(buf, "[always] madvise never\n"); - } else if (test_bit(req_madv, &transparent_hugepage_flags)) - return sprintf(buf, "always [madvise] never\n"); - else - return sprintf(buf, "always madvise [never]\n"); -} -static ssize_t double_flag_store(struct kobject *kobj, +static ssize_t triple_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag enabled, + enum transparent_hugepage_flag deferred, enum transparent_hugepage_flag req_madv) { - if (!memcmp("always", buf, + if (!memcmp("defer", buf, + min(sizeof("defer")-1, count))) { + if (enabled == deferred) + return -EINVAL; + clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(req_madv, &transparent_hugepage_flags); + set_bit(deferred, &transparent_hugepage_flags); + } else if (!memcmp("always", buf, min(sizeof("always")-1, count))) { - set_bit(enabled, &transparent_hugepage_flags); + clear_bit(deferred, &transparent_hugepage_flags); clear_bit(req_madv, &transparent_hugepage_flags); + set_bit(enabled, &transparent_hugepage_flags); } else if (!memcmp("madvise", buf, min(sizeof("madvise")-1, count))) { clear_bit(enabled, &transparent_hugepage_flags); + clear_bit(deferred, &transparent_hugepage_flags); set_bit(req_madv, &transparent_hugepage_flags); } else if (!memcmp("never", buf, min(sizeof("never")-1, count))) { clear_bit(enabled, &transparent_hugepage_flags); clear_bit(req_madv, &transparent_hugepage_flags); + clear_bit(deferred, &transparent_hugepage_flags); } else return -EINVAL; @@ -310,17 +307,22 @@ static ssize_t double_flag_store(struct kobject *kobj, static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return double_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_FLAG, - TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); + if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "[always] madvise never\n"); + else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always [madvise] never\n"); + else + return sprintf(buf, "always madvise [never]\n"); } + static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { ssize_t ret; - ret = double_flag_store(kobj, attr, buf, count, + ret = triple_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); @@ -378,16 +380,23 @@ static ssize_t single_flag_store(struct kobject *kobj, static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return double_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, - TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "[always] defer madvise never\n"); + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always [defer] madvise never\n"); + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) + return sprintf(buf, "always defer [madvise] never\n"); + else + return sprintf(buf, "always defer madvise [never]\n"); + } static ssize_t defrag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - return double_flag_store(kobj, attr, buf, count, - TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + return triple_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG); } static struct kobj_attribute defrag_attr = @@ -660,6 +669,18 @@ static int __init hugepage_init(void) return -EINVAL; } + khugepaged_pages_to_scan = HPAGE_PMD_NR * 8; + khugepaged_max_ptes_none = HPAGE_PMD_NR - 1; + /* + * hugepages can't be allocated by the buddy allocator + */ + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER); + /* + * we use page->mapping and page->index in second tail page + * as list_head: assuming THP order >= 2 + */ + MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); + err = hugepage_init_sysfs(&hugepage_kobj); if (err) goto err_sysfs; @@ -764,7 +785,6 @@ void prep_transhuge_page(struct page *page) * we use page->mapping and page->indexlru in second tail page * as list_head: assuming THP order >= 2 */ - BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); INIT_LIST_HEAD(page_deferred_list(page)); set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); @@ -843,9 +863,30 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return 0; } -static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) +/* + * If THP is set to always then directly reclaim/compact as necessary + * If set to defer then do no reclaim and defer to khugepaged + * If set to madvise and the VMA is flagged then directly reclaim/compact + */ +static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { - return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_RECLAIM)) | extra_gfp; + gfp_t reclaim_flags = 0; + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags) && + (vma->vm_flags & VM_HUGEPAGE)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_KSWAPD_RECLAIM; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) + reclaim_flags = __GFP_DIRECT_RECLAIM; + + return GFP_TRANSHUGE | reclaim_flags; +} + +/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */ +static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) +{ + return GFP_TRANSHUGE | (khugepaged_defrag() ? __GFP_DIRECT_RECLAIM : 0); } /* Caller must hold page table lock. */ @@ -919,7 +960,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } return ret; } - gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + gfp = alloc_hugepage_direct_gfpmask(vma); page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); if (unlikely(!page)) { count_vm_event(THP_FAULT_FALLBACK); @@ -1279,7 +1320,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { - huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); + huge_gfp = alloc_hugepage_direct_gfpmask(vma); new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); } else new_page = NULL; @@ -2249,11 +2290,12 @@ static int khugepaged_find_target_node(void) return 0; } -static inline struct page *alloc_hugepage(int defrag) +static inline struct page *alloc_khugepaged_hugepage(void) { struct page *page; - page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); + page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), + HPAGE_PMD_ORDER); if (page) prep_transhuge_page(page); return page; @@ -2264,7 +2306,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait) struct page *hpage; do { - hpage = alloc_hugepage(khugepaged_defrag()); + hpage = alloc_khugepaged_hugepage(); if (!hpage) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); if (!*wait) @@ -2335,8 +2377,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | - __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; /* release the mmap_sem read lock. */ new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node); @@ -2537,7 +2578,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } khugepaged_node_load[node]++; if (!PageLRU(page)) { - result = SCAN_SCAN_ABORT; + result = SCAN_PAGE_LRU; goto out_unmap; } if (PageLocked(page)) { @@ -2857,7 +2898,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!page_count(page), page); - atomic_add(HPAGE_PMD_NR - 1, &page->_count); + page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); @@ -2947,44 +2988,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long address) + unsigned long address, bool freeze) { spinlock_t *ptl; struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; unsigned long haddr = address & HPAGE_PMD_MASK; mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); ptl = pmd_lock(mm, pmd); if (pmd_trans_huge(*pmd)) { - page = pmd_page(*pmd); + struct page *page = pmd_page(*pmd); if (PageMlocked(page)) - get_page(page); - else - page = NULL; + clear_page_mlock(page); } else if (!pmd_devmap(*pmd)) goto out; - __split_huge_pmd_locked(vma, pmd, haddr, false); + __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); - if (page) { - lock_page(page); - munlock_vma_page(page); - unlock_page(page); - put_page(page); - } } -static void split_huge_pmd_address(struct vm_area_struct *vma, - unsigned long address) +void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, + bool freeze, struct page *page) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; - VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(vma->vm_mm, address); if (!pgd_present(*pgd)) return; @@ -2996,11 +3026,20 @@ static void split_huge_pmd_address(struct vm_area_struct *vma, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) return; + + /* + * If caller asks to setup a migration entries, we need a page to check + * pmd against. Otherwise we can end up replacing wrong page. + */ + VM_BUG_ON(freeze && !page); + if (page && page != pmd_page(*pmd)) + return; + /* * Caller holds the mmap_sem write mode, so a huge pmd cannot * materialize from under us. */ - split_huge_pmd(vma, pmd, address); + __split_huge_pmd(vma, pmd, address, freeze); } void vma_adjust_trans_huge(struct vm_area_struct *vma, @@ -3016,7 +3055,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (start & ~HPAGE_PMD_MASK && (start & HPAGE_PMD_MASK) >= vma->vm_start && (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, start); + split_huge_pmd_address(vma, start, false, NULL); /* * If the new end address isn't hpage aligned and it could @@ -3026,7 +3065,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (end & ~HPAGE_PMD_MASK && (end & HPAGE_PMD_MASK) >= vma->vm_start && (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) - split_huge_pmd_address(vma, end); + split_huge_pmd_address(vma, end, false, NULL); /* * If we're also updating the vma->vm_next->vm_start, if the new @@ -3040,208 +3079,58 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, if (nstart & ~HPAGE_PMD_MASK && (nstart & HPAGE_PMD_MASK) >= next->vm_start && (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) - split_huge_pmd_address(next, nstart); + split_huge_pmd_address(next, nstart, false, NULL); } } -static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) +static void freeze_page(struct page *page) { - unsigned long haddr = address & HPAGE_PMD_MASK; - spinlock_t *ptl; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) - return; - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return; - pmd = pmd_offset(pud, address); - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_present(*pmd)) { - spin_unlock(ptl); - return; - } - if (pmd_trans_huge(*pmd)) { - if (page == pmd_page(*pmd)) - __split_huge_pmd_locked(vma, pmd, haddr, true); - spin_unlock(ptl); - return; - } - spin_unlock(ptl); - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - pte_t entry, swp_pte; - swp_entry_t swp_entry; - - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non PMD-aligned address. - */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!pte_present(*pte)) - continue; - if (page_to_pfn(page) != pte_pfn(*pte)) - continue; - flush_cache_page(vma, address, page_to_pfn(page)); - entry = ptep_clear_flush(vma, address, pte); - if (pte_dirty(entry)) - SetPageDirty(page); - swp_entry = make_migration_entry(page, pte_write(entry)); - swp_pte = swp_entry_to_pte(swp_entry); - if (pte_soft_dirty(entry)) - swp_pte = pte_swp_mksoft_dirty(swp_pte); - set_pte_at(vma->vm_mm, address, pte, swp_pte); - page_remove_rmap(page, false); - put_page(page); - } - pte_unmap_unlock(pte - 1, ptl); -} - -static void freeze_page(struct anon_vma *anon_vma, struct page *page) -{ - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); + enum ttu_flags ttu_flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | + TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED; + int i, ret; VM_BUG_ON_PAGE(!PageHead(page), page); - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, - pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); - - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - freeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } -} - -static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, - unsigned long address) -{ - spinlock_t *ptl; - pmd_t *pmd; - pte_t *pte, entry; - swp_entry_t swp_entry; - unsigned long haddr = address & HPAGE_PMD_MASK; - int i, nr = HPAGE_PMD_NR; - - /* Skip pages which doesn't belong to the VMA */ - if (address < vma->vm_start) { - int off = (vma->vm_start - address) >> PAGE_SHIFT; - page += off; - nr -= off; - address = vma->vm_start; - } - - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); - for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { - /* - * We've just crossed page table boundary: need to map next one. - * It can happen if THP was mremaped to non-PMD aligned address. - */ - if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { - pte_unmap_unlock(pte - 1, ptl); - pmd = mm_find_pmd(vma->vm_mm, address); - if (!pmd) - return; - pte = pte_offset_map_lock(vma->vm_mm, pmd, - address, &ptl); - } - - if (!is_swap_pte(*pte)) - continue; - - swp_entry = pte_to_swp_entry(*pte); - if (!is_migration_entry(swp_entry)) - continue; - if (migration_entry_to_page(swp_entry) != page) - continue; - - get_page(page); - page_add_anon_rmap(page, vma, address, false); - - entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); - if (PageDirty(page)) - entry = pte_mkdirty(entry); - if (is_write_migration_entry(swp_entry)) - entry = maybe_mkwrite(entry, vma); - - flush_dcache_page(page); - set_pte_at(vma->vm_mm, address, pte, entry); + /* We only need TTU_SPLIT_HUGE_PMD once */ + ret = try_to_unmap(page, ttu_flags | TTU_SPLIT_HUGE_PMD); + for (i = 1; !ret && i < HPAGE_PMD_NR; i++) { + /* Cut short if the page is unmapped */ + if (page_count(page) == 1) + return; - /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + ret = try_to_unmap(page + i, ttu_flags); } - pte_unmap_unlock(pte - 1, ptl); + VM_BUG_ON(ret); } -static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) +static void unfreeze_page(struct page *page) { - struct anon_vma_chain *avc; - pgoff_t pgoff = page_to_pgoff(page); - - anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, - pgoff, pgoff + HPAGE_PMD_NR - 1) { - unsigned long address = __vma_address(page, avc->vma); + int i; - mmu_notifier_invalidate_range_start(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - unfreeze_page_vma(avc->vma, page, address); - mmu_notifier_invalidate_range_end(avc->vma->vm_mm, - address, address + HPAGE_PMD_SIZE); - } + for (i = 0; i < HPAGE_PMD_NR; i++) + remove_migration_ptes(page + i, page + i, true); } -static int __split_huge_page_tail(struct page *head, int tail, +static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { - int mapcount; struct page *page_tail = head + tail; - mapcount = atomic_read(&page_tail->_mapcount) + 1; - VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); + VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); + VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* * tail_page->_count is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_add(), we + * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with * get_page_unless_zero(), and atomic_set() is implemented in C not * using locked ops. spin_unlock on x86 sometime uses locked ops * because of PPro errata 66, 92, so unless somebody can guarantee * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_add(). + * it's safer to use atomic_inc(). */ - atomic_add(mapcount + 1, &page_tail->_count); - + page_ref_inc(page_tail); page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -3275,8 +3164,6 @@ static int __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); lru_add_page_tail(head, page_tail, lruvec, list); - - return mapcount; } static void __split_huge_page(struct page *page, struct list_head *list) @@ -3284,7 +3171,7 @@ static void __split_huge_page(struct page *page, struct list_head *list) struct page *head = compound_head(page); struct zone *zone = page_zone(head); struct lruvec *lruvec; - int i, tail_mapcount; + int i; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -3293,15 +3180,13 @@ static void __split_huge_page(struct page *page, struct list_head *list) /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); - tail_mapcount = 0; for (i = HPAGE_PMD_NR - 1; i >= 1; i--) - tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); - atomic_sub(tail_mapcount, &head->_count); + __split_huge_page_tail(head, i, lruvec, list); ClearPageCompound(head); spin_unlock_irq(&zone->lru_lock); - unfreeze_page(page_anon_vma(head), head); + unfreeze_page(head); for (i = 0; i < HPAGE_PMD_NR; i++) { struct page *subpage = head + i; @@ -3397,7 +3282,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } mlocked = PageMlocked(page); - freeze_page(anon_vma, head); + freeze_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); /* Make sure the page is not on per-CPU pagevec as it takes pin */ @@ -3426,7 +3311,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) BUG(); } else { spin_unlock_irqrestore(&pgdata->split_queue_lock, flags); - unfreeze_page(anon_vma, head); + unfreeze_page(head); ret = -EBUSY; } @@ -3461,6 +3346,7 @@ void deferred_split_huge_page(struct page *page) spin_lock_irqsave(&pgdata->split_queue_lock, flags); if (list_empty(page_deferred_list(page))) { + count_vm_event(THP_DEFERRED_SPLIT_PAGE); list_add_tail(page_deferred_list(page), &pgdata->split_queue); pgdata->split_queue_len++; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aefba5a9cc47..06058eaa173b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2665,7 +2665,7 @@ void __init hugetlb_add_hstate(unsigned int order) unsigned long i; if (size_to_hstate(PAGE_SIZE << order)) { - pr_warning("hugepagesz= specified twice, ignoring\n"); + pr_warn("hugepagesz= specified twice, ignoring\n"); return; } BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); @@ -2701,8 +2701,7 @@ static int __init hugetlb_nrpages_setup(char *s) mhp = &parsed_hstate->max_huge_pages; if (mhp == last_mhp) { - pr_warning("hugepages= specified twice without " - "interleaving hugepagesz=, ignoring\n"); + pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n"); return 1; } diff --git a/mm/internal.h b/mm/internal.h index a38a21ebddb4..b79abb6721cf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -14,6 +14,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> +#include <linux/tracepoint-defs.h> /* * The set of flags that only affect watermark checking and reclaim @@ -37,10 +38,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -static inline void set_page_count(struct page *page, int v) -{ - atomic_set(&page->_count, v); -} +void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details); extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, @@ -63,7 +64,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, static inline void set_page_refcounted(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(atomic_read(&page->_count), page); + VM_BUG_ON_PAGE(page_ref_count(page), page); set_page_count(page, 1); } @@ -131,13 +132,22 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) return page_idx ^ (1 << order); } +extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone); + +static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + if (zone->contiguous) + return pfn_to_page(start_pfn); + + return __pageblock_pfn_to_page(start_pfn, end_pfn, zone); +} + extern int __isolate_free_page(struct page *page, unsigned int order); extern void __free_pages_bootmem(struct page *page, unsigned long pfn, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); -#ifdef CONFIG_MEMORY_FAILURE -extern bool is_free_buddy_page(struct page *page); -#endif extern int user_min_free_kbytes; #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -162,6 +172,7 @@ struct compact_control { unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ + bool direct_compaction; /* False from kcompactd or /proc/... */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const int alloc_flags; /* alloc flags of a direct compactor */ @@ -380,7 +391,7 @@ extern int mminit_loglevel; do { \ if (level < mminit_loglevel) { \ if (level <= MMINIT_WARNING) \ - printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ + pr_warn("mminit::" prefix " " fmt, ##arg); \ else \ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ } \ @@ -466,4 +477,9 @@ static inline void try_to_unmap_flush_dirty(void) } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + +extern const struct trace_print_flags pageflag_names[]; +extern const struct trace_print_flags vmaflag_names[]; +extern const struct trace_print_flags gfpflag_names[]; + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index a61460d9f5b0..131daadf40e4 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -1,5 +1,6 @@ KASAN_SANITIZE := n UBSAN_SANITIZE_kasan.o := n +KCOV_INSTRUMENT := n CFLAGS_REMOVE_kasan.o = -pg # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 1ad20ade8c91..acb3b6c4dd89 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -17,7 +17,9 @@ #define DISABLE_BRANCH_PROFILING #include <linux/export.h> +#include <linux/interrupt.h> #include <linux/init.h> +#include <linux/kasan.h> #include <linux/kernel.h> #include <linux/kmemleak.h> #include <linux/linkage.h> @@ -32,7 +34,6 @@ #include <linux/string.h> #include <linux/types.h> #include <linux/vmalloc.h> -#include <linux/kasan.h> #include "kasan.h" #include "../slab.h" @@ -334,6 +335,59 @@ void kasan_free_pages(struct page *page, unsigned int order) KASAN_FREE_PAGE); } +#ifdef CONFIG_SLAB +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static size_t optimal_redzone(size_t object_size) +{ + int rz = + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; + return rz; +} + +void kasan_cache_create(struct kmem_cache *cache, size_t *size, + unsigned long *flags) +{ + int redzone_adjust; + /* Make sure the adjusted size is still less than + * KMALLOC_MAX_CACHE_SIZE. + * TODO: this check is only useful for SLAB, but not SLUB. We'll need + * to skip it for SLUB when it starts using kasan_cache_create(). + */ + if (*size > KMALLOC_MAX_CACHE_SIZE - + sizeof(struct kasan_alloc_meta) - + sizeof(struct kasan_free_meta)) + return; + *flags |= SLAB_KASAN; + /* Add alloc meta. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* Add free meta. */ + if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + } + redzone_adjust = optimal_redzone(cache->object_size) - + (*size - cache->object_size); + if (redzone_adjust > 0) + *size += redzone_adjust; + *size = min(KMALLOC_MAX_CACHE_SIZE, + max(*size, + cache->object_size + + optimal_redzone(cache->object_size))); +} +#endif + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -351,11 +405,81 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) kasan_poison_shadow(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + alloc_info->state = KASAN_STATE_INIT; + } +#endif } -void kasan_slab_alloc(struct kmem_cache *cache, void *object) +#ifdef CONFIG_SLAB +static inline int in_irqentry_text(unsigned long ptr) { - kasan_kmalloc(cache, object, cache->object_size); + return (ptr >= (unsigned long)&__irqentry_text_start && + ptr < (unsigned long)&__irqentry_text_end) || + (ptr >= (unsigned long)&__softirqentry_text_start && + ptr < (unsigned long)&__softirqentry_text_end); +} + +static inline void filter_irq_stacks(struct stack_trace *trace) +{ + int i; + + if (!trace->nr_entries) + return; + for (i = 0; i < trace->nr_entries; i++) + if (in_irqentry_text(trace->entries[i])) { + /* Include the irqentry function into the stack. */ + trace->nr_entries = i + 1; + break; + } +} + +static inline depot_stack_handle_t save_stack(gfp_t flags) +{ + unsigned long entries[KASAN_STACK_DEPTH]; + struct stack_trace trace = { + .nr_entries = 0, + .entries = entries, + .max_entries = KASAN_STACK_DEPTH, + .skip = 0 + }; + + save_stack_trace(&trace); + filter_irq_stacks(&trace); + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries-1] == ULONG_MAX) + trace.nr_entries--; + + return depot_save_stack(&trace, flags); +} + +static inline void set_track(struct kasan_track *track, gfp_t flags) +{ + track->pid = current->pid; + track->stack = save_stack(flags); +} + +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32); + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + return (void *)object + cache->kasan_info.free_meta_offset; +} +#endif + +void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) +{ + kasan_kmalloc(cache, object, cache->object_size, flags); } void kasan_slab_free(struct kmem_cache *cache, void *object) @@ -367,10 +491,22 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_free_meta *free_info = + get_free_info(cache, object); + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + alloc_info->state = KASAN_STATE_FREE; + set_track(&free_info->track); + } +#endif + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } -void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) +void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, + gfp_t flags) { unsigned long redzone_start; unsigned long redzone_end; @@ -386,10 +522,20 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size) kasan_unpoison_shadow(object, size); kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, KASAN_KMALLOC_REDZONE); +#ifdef CONFIG_SLAB + if (cache->flags & SLAB_KASAN) { + struct kasan_alloc_meta *alloc_info = + get_alloc_info(cache, object); + + alloc_info->state = KASAN_STATE_ALLOC; + alloc_info->alloc_size = size; + set_track(&alloc_info->track, flags); + } +#endif } EXPORT_SYMBOL(kasan_kmalloc); -void kasan_kmalloc_large(const void *ptr, size_t size) +void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { struct page *page; unsigned long redzone_start; @@ -408,7 +554,7 @@ void kasan_kmalloc_large(const void *ptr, size_t size) KASAN_PAGE_REDZONE); } -void kasan_krealloc(const void *object, size_t size) +void kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; @@ -418,9 +564,9 @@ void kasan_krealloc(const void *object, size_t size) page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - kasan_kmalloc_large(object, size); + kasan_kmalloc_large(object, size, flags); else - kasan_kmalloc(page->slab_cache, object, size); + kasan_kmalloc(page->slab_cache, object, size, flags); } void kasan_kfree(void *ptr) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4f6c62e5c21e..30a2f0ba0e09 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,6 +2,7 @@ #define __MM_KASAN_KASAN_H #include <linux/kasan.h> +#include <linux/stackdepot.h> #define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) #define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) @@ -54,6 +55,42 @@ struct kasan_global { #endif }; +/** + * Structures to keep alloc and free tracks * + */ + +enum kasan_state { + KASAN_STATE_INIT, + KASAN_STATE_ALLOC, + KASAN_STATE_FREE +}; + +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + +struct kasan_alloc_meta { + struct kasan_track track; + u32 state : 2; /* enum kasan_state */ + u32 alloc_size : 30; + u32 reserved; +}; + +struct kasan_free_meta { + /* Allocator freelist pointer, unused by KASAN. */ + void **freelist; + struct kasan_track track; +}; + +struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *get_free_info(struct kmem_cache *cache, + const void *object); + + static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 12f222d0224b..60869a5a0124 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -18,6 +18,7 @@ #include <linux/printk.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/stackdepot.h> #include <linux/stacktrace.h> #include <linux/string.h> #include <linux/types.h> @@ -115,6 +116,53 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } +#ifdef CONFIG_SLAB +static void print_track(struct kasan_track *track) +{ + pr_err("PID = %u\n", track->pid); + if (track->stack) { + struct stack_trace trace; + + depot_fetch_stack(track->stack, &trace); + print_stack_trace(&trace, 0); + } else { + pr_err("(stack is not available)\n"); + } +} + +static void object_err(struct kmem_cache *cache, struct page *page, + void *object, char *unused_reason) +{ + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + struct kasan_free_meta *free_info; + + dump_stack(); + pr_err("Object at %p, in cache %s\n", object, cache->name); + if (!(cache->flags & SLAB_KASAN)) + return; + switch (alloc_info->state) { + case KASAN_STATE_INIT: + pr_err("Object not allocated yet\n"); + break; + case KASAN_STATE_ALLOC: + pr_err("Object allocated with size %u bytes.\n", + alloc_info->alloc_size); + pr_err("Allocation:\n"); + print_track(&alloc_info->track); + break; + case KASAN_STATE_FREE: + pr_err("Object freed, allocated with size %u bytes\n", + alloc_info->alloc_size); + free_info = get_free_info(cache, object); + pr_err("Allocation:\n"); + print_track(&alloc_info->track); + pr_err("Deallocation:\n"); + print_track(&free_info->track); + break; + } +} +#endif + static void print_address_description(struct kasan_access_info *info) { const void *addr = info->access_addr; @@ -126,17 +174,10 @@ static void print_address_description(struct kasan_access_info *info) if (PageSlab(page)) { void *object; struct kmem_cache *cache = page->slab_cache; - void *last_object; - - object = virt_to_obj(cache, page_address(page), addr); - last_object = page_address(page) + - page->objects * cache->size; - - if (unlikely(object > last_object)) - object = last_object; /* we hit into padding */ - + object = nearest_obj(cache, page, + (void *)info->access_addr); object_err(cache, page, object, - "kasan: bad access detected"); + "kasan: bad access detected"); return; } dump_page(page, "kasan: bad access detected"); @@ -146,7 +187,6 @@ static void print_address_description(struct kasan_access_info *info) if (!init_task_stack_addr(addr)) pr_err("Address belongs to variable %pS\n", addr); } - dump_stack(); } @@ -214,8 +254,7 @@ static void kasan_report_error(struct kasan_access_info *info) */ kasan_disable_current(); spin_lock_irqsave(&report_lock, flags); - pr_err("=================================" - "=================================\n"); + pr_err("==================================================================\n"); if (info->access_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) { if ((unsigned long)info->access_addr < PAGE_SIZE) @@ -236,8 +275,7 @@ static void kasan_report_error(struct kasan_access_info *info) print_address_description(info); print_shadow_for_address(info->first_bad_addr); } - pr_err("=================================" - "=================================\n"); + pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, flags); kasan_enable_current(); diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index cab58bb592d8..5bf191756a4a 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -20,8 +20,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); if (!shadow) { if (printk_ratelimit()) - printk(KERN_ERR "kmemcheck: failed to allocate " - "shadow bitmap\n"); + pr_err("kmemcheck: failed to allocate shadow bitmap\n"); return; } @@ -60,6 +59,9 @@ void kmemcheck_free_shadow(struct page *page, int order) void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, size_t size) { + if (unlikely(!object)) /* Skip object if allocation failed */ + return; + /* * Has already been memset(), which initializes the shadow for us * as well. diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index dcdcadb69533..dd3c23a801b1 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -49,7 +49,7 @@ static int __init kmemleak_test_init(void) struct test_node *elem; int i; - printk(KERN_INFO "Kmemleak testing\n"); + pr_info("Kmemleak testing\n"); /* make some orphan objects */ pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 25c0ad36fe38..e6429926e957 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -276,7 +276,7 @@ static void kmemleak_disable(void); * Print a warning and dump the stack trace. */ #define kmemleak_warn(x...) do { \ - pr_warning(x); \ + pr_warn(x); \ dump_stack(); \ kmemleak_warning = 1; \ } while (0) @@ -543,7 +543,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - pr_warning("Cannot allocate a kmemleak_object structure\n"); + pr_warn("Cannot allocate a kmemleak_object structure\n"); kmemleak_disable(); return NULL; } @@ -596,8 +596,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, else if (parent->pointer + parent->size <= ptr) link = &parent->rb_node.rb_right; else { - kmemleak_stop("Cannot insert 0x%lx into the object " - "search tree (overlaps existing)\n", + kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n", ptr); /* * No need for parent->lock here since "parent" cannot @@ -670,8 +669,8 @@ static void delete_object_part(unsigned long ptr, size_t size) object = find_and_remove_object(ptr, 1); if (!object) { #ifdef DEBUG - kmemleak_warn("Partially freeing unknown object at 0x%08lx " - "(size %zu)\n", ptr, size); + kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n", + ptr, size); #endif return; } @@ -717,8 +716,8 @@ static void paint_ptr(unsigned long ptr, int color) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("Trying to color unknown object " - "at 0x%08lx as %s\n", ptr, + kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n", + ptr, (color == KMEMLEAK_GREY) ? "Grey" : (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); return; @@ -764,7 +763,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - pr_warning("Cannot allocate a scan area\n"); + pr_warn("Cannot allocate a scan area\n"); goto out; } @@ -1463,8 +1462,8 @@ static void kmemleak_scan(void) if (new_leaks) { kmemleak_found_leaks = true; - pr_info("%d new suspected memory leaks (see " - "/sys/kernel/debug/kmemleak)\n", new_leaks); + pr_info("%d new suspected memory leaks (see /sys/kernel/debug/kmemleak)\n", + new_leaks); } } @@ -1515,7 +1514,7 @@ static void start_scan_thread(void) return; scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); if (IS_ERR(scan_thread)) { - pr_warning("Failed to create the scan thread\n"); + pr_warn("Failed to create the scan thread\n"); scan_thread = NULL; } } @@ -1795,8 +1794,7 @@ static void kmemleak_do_cleanup(struct work_struct *work) if (!kmemleak_found_leaks) __kmemleak_do_cleanup(); else - pr_info("Kmemleak disabled without freeing internal data. " - "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); + pr_info("Kmemleak disabled without freeing internal data. Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\".\n"); } static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); @@ -1874,8 +1872,8 @@ void __init kmemleak_init(void) scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); if (crt_early_log > ARRAY_SIZE(early_log)) - pr_warning("Early log buffer exceeded (%d), please increase " - "DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", crt_early_log); + pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n", + crt_early_log); /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); @@ -1960,7 +1958,7 @@ static int __init kmemleak_late_init(void) dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, &kmemleak_fops); if (!dentry) - pr_warning("Failed to create the debugfs kmemleak file\n"); + pr_warn("Failed to create the debugfs kmemleak file\n"); mutex_lock(&scan_mutex); start_scan_thread(); mutex_unlock(&scan_mutex); @@ -352,13 +352,17 @@ static inline bool ksm_test_exit(struct mm_struct *mm) /* * We use break_ksm to break COW on a ksm page: it's a stripped down * - * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) * put_page(page); * * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + * + * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * of the process that owns 'vma'. We also do not want to enforce + * protection keys here anyway. */ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { @@ -367,12 +371,14 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) do { cond_resched(); - page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION); + page = follow_page(vma, addr, + FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma->vm_mm, vma, addr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE | + FAULT_FLAG_REMOTE); else ret = VM_FAULT_WRITE; put_page(page); diff --git a/mm/madvise.c b/mm/madvise.c index f56825b6d2e1..a01147359f3b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -555,8 +555,9 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) } pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); - /* Ignore return value for now */ - memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + if (ret) + return ret; } return 0; } @@ -638,14 +639,28 @@ madvise_behavior_valid(int behavior) * some pages ahead. * MADV_DONTNEED - the application is finished with the given range, * so the kernel can free resources associated with it. + * MADV_FREE - the application marks pages in the given range as lazy free, + * where actual purges are postponed until memory pressure happens. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. * MADV_DONTFORK - omit this area from child's address space when forking: * typically, to avoid COWing pages pinned by get_user_pages(). * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_HWPOISON - trigger memory error handler as if the given memory range + * were corrupted by unrecoverable hardware memory failure. + * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. * MADV_MERGEABLE - the application recommends that KSM try to merge pages in * this area with pages of identical content from other such areas. * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. + * MADV_HUGEPAGE - the application wants to back the given range by transparent + * huge pages in the future. Existing pages might be coalesced and + * new pages might be allocated as THP. + * MADV_NOHUGEPAGE - mark the given range as not worth being backed by + * transparent huge pages so the existing pages will not be + * coalesced into THP and new pages will not be allocated as THP. + * MADV_DONTDUMP - the application wants to prevent pages in the given range + * from being included in its core dump. + * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. * * return values: * zero - success diff --git a/mm/memblock.c b/mm/memblock.c index dd7989929f13..b570dddb4cb9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -238,8 +238,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, * so we use WARN_ONCE() here to see the stack trace if * fail happens. */ - WARN_ONCE(1, "memblock: bottom-up allocation failed, " - "memory hotunplug may be affected\n"); + WARN_ONCE(1, "memblock: bottom-up allocation failed, memory hotunplug may be affected\n"); } return __memblock_find_range_top_down(start, end, size, align, nid, @@ -612,14 +611,12 @@ static int __init_memblock memblock_add_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.memory; - memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.memory, base, size, nid, flags); } int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) @@ -740,14 +737,12 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, int nid, unsigned long flags) { - struct memblock_type *type = &memblock.reserved; - memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, flags, (void *)_RET_IP_); - return memblock_add_range(type, base, size, nid, flags); + return memblock_add_range(&memblock.reserved, base, size, nid, flags); } int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d06cae2de783..36db05fa8acb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -268,31 +268,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } -/* - * We restrict the id in the range of [1, 65535], so it can fit into - * an unsigned short. - */ -#define MEM_CGROUP_ID_MAX USHRT_MAX - -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - return memcg->css.id; -} - -/* - * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock(). The caller is responsible for calling - * css_tryget_online() if the mem_cgroup is used for charging. (dropping - * refcnt from swap can be called against removed memcg.) - */ -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id, &memory_cgrp_subsys); - return mem_cgroup_from_css(css); -} - #ifndef CONFIG_SLOB /* * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. @@ -663,9 +638,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat->nr_page_events, nr_pages); } -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, - unsigned int lru_mask) +unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) { unsigned long nr = 0; int zid; @@ -1176,12 +1150,9 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* oom_info_lock ensures that parallel ooms do not interleave */ - static DEFINE_MUTEX(oom_info_lock); struct mem_cgroup *iter; unsigned int i; - mutex_lock(&oom_info_lock); rcu_read_lock(); if (p) { @@ -1225,7 +1196,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont("\n"); } - mutex_unlock(&oom_info_lock); } /* @@ -1262,7 +1232,7 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, +static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { struct oom_control oc = { @@ -1340,6 +1310,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } unlock: mutex_unlock(&oom_lock); + return chosen; } #if MAX_NUMNODES > 1 @@ -1709,19 +1680,13 @@ cleanup: } /** - * mem_cgroup_begin_page_stat - begin a page state statistics transaction - * @page: page that is going to change accounted state - * - * This function must mark the beginning of an accounted page state - * change to prevent double accounting when the page is concurrently - * being moved to another memcg: + * lock_page_memcg - lock a page->mem_cgroup binding + * @page: the page * - * memcg = mem_cgroup_begin_page_stat(page); - * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(memcg, state, -1); - * mem_cgroup_end_page_stat(memcg); + * This function protects unlocked LRU pages from being moved to + * another cgroup and stabilizes their page->mem_cgroup binding. */ -struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) +void lock_page_memcg(struct page *page) { struct mem_cgroup *memcg; unsigned long flags; @@ -1730,25 +1695,18 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - * - * The RCU lock also protects the memcg from being freed when - * the page state that is going to change is the only thing - * preventing the page from being uncharged. - * E.g. end-writeback clearing PageWriteback(), which allows - * migration to go ahead and uncharge the page before the - * account transaction might be complete. */ rcu_read_lock(); if (mem_cgroup_disabled()) - return NULL; + return; again: memcg = page->mem_cgroup; if (unlikely(!memcg)) - return NULL; + return; if (atomic_read(&memcg->moving_account) <= 0) - return memcg; + return; spin_lock_irqsave(&memcg->move_lock, flags); if (memcg != page->mem_cgroup) { @@ -1759,21 +1717,23 @@ again: /* * When charge migration first begins, we can have locked and * unlocked page stat updates happening concurrently. Track - * the task who has the lock for mem_cgroup_end_page_stat(). + * the task who has the lock for unlock_page_memcg(). */ memcg->move_lock_task = current; memcg->move_lock_flags = flags; - return memcg; + return; } -EXPORT_SYMBOL(mem_cgroup_begin_page_stat); +EXPORT_SYMBOL(lock_page_memcg); /** - * mem_cgroup_end_page_stat - finish a page state statistics transaction - * @memcg: the memcg that was accounted against + * unlock_page_memcg - unlock a page->mem_cgroup binding + * @page: the page */ -void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) +void unlock_page_memcg(struct page *page) { + struct mem_cgroup *memcg = page->mem_cgroup; + if (memcg && memcg->move_lock_task == current) { unsigned long flags = memcg->move_lock_flags; @@ -1785,7 +1745,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg) rcu_read_unlock(); } -EXPORT_SYMBOL(mem_cgroup_end_page_stat); +EXPORT_SYMBOL(unlock_page_memcg); /* * size of first charge trial. "32" comes from vmscan.c's magic value. @@ -2361,9 +2321,6 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, struct page_counter *counter; int ret; - if (!memcg_kmem_online(memcg)) - return 0; - ret = try_charge(memcg, gfp, nr_pages); if (ret) return ret; @@ -2382,10 +2339,11 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order, int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order) { struct mem_cgroup *memcg; - int ret; + int ret = 0; memcg = get_mem_cgroup_from_mm(current->mm); - ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); + if (!mem_cgroup_is_root(memcg)) + ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg); css_put(&memcg->css); return ret; } @@ -2755,39 +2713,48 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static unsigned long tree_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) +static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); + memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_STAT; i++) + stat[i] += mem_cgroup_read_stat(iter, i); + } } -static unsigned long tree_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx) +static void tree_events(struct mem_cgroup *memcg, unsigned long *events) { struct mem_cgroup *iter; - unsigned long val = 0; + int i; - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_events(iter, idx); + memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); - return val; + for_each_mem_cgroup_tree(iter, memcg) { + for (i = 0; i < MEMCG_NR_EVENTS; i++) + events[i] += mem_cgroup_read_events(iter, i); + } } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val; + unsigned long val = 0; if (mem_cgroup_is_root(memcg)) { - val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += tree_stat(memcg, MEM_CGROUP_STAT_RSS); - if (swap) - val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) { + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_RSS); + if (swap) + val += mem_cgroup_read_stat(iter, + MEM_CGROUP_STAT_SWAP); + } } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -2853,6 +2820,9 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) { int memcg_id; + if (cgroup_memory_nokmem) + return 0; + BUG_ON(memcg->kmemcg_id >= 0); BUG_ON(memcg->kmem_state); @@ -2873,24 +2843,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) return 0; } -static int memcg_propagate_kmem(struct mem_cgroup *parent, - struct mem_cgroup *memcg) -{ - int ret = 0; - - mutex_lock(&memcg_limit_mutex); - /* - * If the parent cgroup is not kmem-online now, it cannot be - * onlined after this point, because it has at least one child - * already. - */ - if (memcg_kmem_online(parent) || - (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nokmem)) - ret = memcg_online_kmem(memcg); - mutex_unlock(&memcg_limit_mutex); - return ret; -} - static void memcg_offline_kmem(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; @@ -2949,10 +2901,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } } #else -static int memcg_propagate_kmem(struct mem_cgroup *parent, struct mem_cgroup *memcg) -{ - return 0; -} static int memcg_online_kmem(struct mem_cgroup *memcg) { return 0; @@ -2968,22 +2916,10 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) static int memcg_update_kmem_limit(struct mem_cgroup *memcg, unsigned long limit) { - int ret = 0; + int ret; mutex_lock(&memcg_limit_mutex); - /* Top-level cgroup doesn't propagate from root */ - if (!memcg_kmem_online(memcg)) { - if (cgroup_is_populated(memcg->css.cgroup) || - (memcg->use_hierarchy && memcg_has_children(memcg))) - ret = -EBUSY; - if (ret) - goto out; - ret = memcg_online_kmem(memcg); - if (ret) - goto out; - } ret = page_counter_limit(&memcg->kmem, limit); -out: mutex_unlock(&memcg_limit_mutex); return ret; } @@ -4234,7 +4170,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; } - error = memcg_propagate_kmem(parent, memcg); + error = memcg_online_kmem(memcg); if (error) goto fail; @@ -4318,9 +4254,11 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); - mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); - memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->low = 0; memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -4488,7 +4426,7 @@ static int mem_cgroup_move_account(struct page *page, VM_BUG_ON(compound && !PageTransHuge(page)); /* - * Prevent mem_cgroup_replace_page() from looking at + * Prevent mem_cgroup_migrate() from looking at * page->mem_cgroup of its source page while we change it. */ ret = -EBUSY; @@ -4923,9 +4861,9 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) lru_add_drain_all(); /* - * Signal mem_cgroup_begin_page_stat() to take the memcg's - * move_lock while we're moving its pages to another memcg. - * Then wait for already started RCU-only updates to finish. + * Signal lock_page_memcg() to take the memcg's move_lock + * while we're moving its pages to another memcg. Then wait + * for already started RCU-only updates to finish. */ atomic_inc(&mc.from->moving_account); synchronize_rcu(); @@ -5051,6 +4989,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nr_pages; unsigned long high; int err; @@ -5061,6 +5000,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, memcg->high = high; + nr_pages = page_counter_read(&memcg->memory); + if (nr_pages > high) + try_to_free_mem_cgroup_pages(memcg, nr_pages - high, + GFP_KERNEL, true); + memcg_wb_domain_size_changed(memcg); return nbytes; } @@ -5082,6 +5026,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES; + bool drained = false; unsigned long max; int err; @@ -5090,9 +5036,36 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - err = mem_cgroup_resize_limit(memcg, max); - if (err) - return err; + xchg(&memcg->memory.limit, max); + + for (;;) { + unsigned long nr_pages = page_counter_read(&memcg->memory); + + if (nr_pages <= max) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + + if (nr_reclaims) { + if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, + GFP_KERNEL, true)) + nr_reclaims--; + continue; + } + + mem_cgroup_events(memcg, MEMCG_OOM, 1); + if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) + break; + } memcg_wb_domain_size_changed(memcg); return nbytes; @@ -5113,6 +5086,8 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[MEMCG_NR_EVENTS]; int i; /* @@ -5126,22 +5101,27 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ + tree_stat(memcg, stat); + tree_events(memcg, events); + seq_printf(m, "anon %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_RSS) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_CACHE) * PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE); + seq_printf(m, "kernel_stack %llu\n", + (u64)stat[MEMCG_KERNEL_STACK] * PAGE_SIZE); + seq_printf(m, "slab %llu\n", + (u64)(stat[MEMCG_SLAB_RECLAIMABLE] + + stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)tree_stat(memcg, MEMCG_SOCK) * PAGE_SIZE); + (u64)stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_DIRTY) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)tree_stat(memcg, MEM_CGROUP_STAT_WRITEBACK) * - PAGE_SIZE); + (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) { struct mem_cgroup *mi; @@ -5153,12 +5133,17 @@ static int memory_stat_show(struct seq_file *m, void *v) mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); } + seq_printf(m, "slab_reclaimable %llu\n", + (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE); + seq_printf(m, "slab_unreclaimable %llu\n", + (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + /* Accumulated memory events */ seq_printf(m, "pgfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGFAULT)); + events[MEM_CGROUP_EVENTS_PGFAULT]); seq_printf(m, "pgmajfault %lu\n", - tree_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT)); + events[MEM_CGROUP_EVENTS_PGMAJFAULT]); return 0; } @@ -5431,6 +5416,10 @@ static void uncharge_list(struct list_head *page_list) struct list_head *next; struct page *page; + /* + * Note that the list can be a single page->lru; hence the + * do-while loop instead of a simple list_for_each_entry(). + */ next = page_list->next; do { unsigned int nr_pages = 1; @@ -5517,16 +5506,16 @@ void mem_cgroup_uncharge_list(struct list_head *page_list) } /** - * mem_cgroup_replace_page - migrate a charge to another page - * @oldpage: currently charged page - * @newpage: page to transfer the charge to + * mem_cgroup_migrate - charge a page's replacement + * @oldpage: currently circulating page + * @newpage: replacement page * - * Migrate the charge from @oldpage to @newpage. + * Charge @newpage as a replacement page for @oldpage. @oldpage will + * be uncharged upon free. * * Both pages must be locked, @newpage->mapping must be set up. - * Either or both pages might be on the LRU already. */ -void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) { struct mem_cgroup *memcg; unsigned int nr_pages; @@ -5559,7 +5548,7 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage) page_counter_charge(&memcg->memsw, nr_pages); css_get_many(&memcg->css, nr_pages); - commit_charge(newpage, memcg, true); + commit_charge(newpage, memcg, false); local_irq_disable(); mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ac595e7a3a95..5a544c6c0717 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -184,9 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, struct siginfo si; int ret; - printk(KERN_ERR - "MCE %#lx: Killing %s:%d due to hardware memory corruption\n", - pfn, t->comm, t->pid); + pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); si.si_signo = SIGBUS; si.si_errno = 0; si.si_addr = (void *)addr; @@ -209,8 +208,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ } if (ret < 0) - printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", - t->comm, t->pid, ret); + pr_info("MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); return ret; } @@ -290,8 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } else { tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); if (!tk) { - printk(KERN_ERR - "MCE: Out of memory while machine check handling\n"); + pr_err("MCE: Out of memory while machine check handling\n"); return; } } @@ -336,9 +334,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, * signal and then access the memory. Just kill it. */ if (fail || tk->addr_valid == 0) { - printk(KERN_ERR - "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); force_sig(SIGKILL, tk->tsk); } @@ -350,9 +347,8 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, */ else if (kill_proc(tk->tsk, tk->addr, trapno, pfn, page, flags) < 0) - printk(KERN_ERR - "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", - pfn, tk->tsk->comm, tk->tsk->pid); + pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); } put_task_struct(tk->tsk); kfree(tk); @@ -563,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn) */ static int me_unknown(struct page *p, unsigned long pfn) { - printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + pr_err("MCE %#lx: Unknown page state\n", pfn); return MF_FAILED; } @@ -608,8 +604,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (mapping->a_ops->error_remove_page) { err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { - printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", - pfn, err); + pr_info("MCE %#lx: Failed to punch page: %d\n", + pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { pr_info("MCE %#lx: failed to release buffers\n", pfn); @@ -624,8 +620,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (invalidate_inode_page(p)) ret = MF_RECOVERED; else - printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", - pfn); + pr_info("MCE %#lx: Failed to invalidate\n", pfn); } return ret; } @@ -826,8 +821,6 @@ static struct page_state { #undef lru #undef swapbacked #undef head -#undef tail -#undef compound #undef slab #undef reserved @@ -856,8 +849,7 @@ static int page_action(struct page_state *ps, struct page *p, if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { - printk(KERN_ERR - "MCE %#lx: %s still referenced by %d users\n", + pr_err("MCE %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; } @@ -936,8 +928,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } if (PageSwapCache(p)) { - printk(KERN_ERR - "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn); ttu |= TTU_IGNORE_HWPOISON; } @@ -955,8 +946,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; - printk(KERN_INFO - "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n", pfn); } } @@ -974,8 +964,8 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) - printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(hpage)); /* * Now that the dirty bit has been propagated to the @@ -1042,16 +1032,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - printk(KERN_ERR - "MCE %#lx: memory outside kernel control\n", - pfn); + pr_err("MCE %#lx: memory outside kernel control\n", pfn); return -ENXIO; } p = pfn_to_page(pfn); orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { - printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); + pr_err("MCE %#lx: already hardware poisoned\n", pfn); return 0; } @@ -1182,7 +1170,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { - printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + pr_err("MCE %#lx: just unpoisoned\n", pfn); num_poisoned_pages_sub(nr_pages); unlock_page(hpage); put_hwpoison_page(hpage); diff --git a/mm/memory.c b/mm/memory.c index 8132787ae4d5..098f00d05461 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -65,6 +65,7 @@ #include <linux/userfaultfd_k.h> #include <asm/io.h> +#include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -562,8 +563,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, - pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); @@ -661,9 +661,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, return; } if (nr_unshown) { - printk(KERN_ALERT - "BUG: Bad page map: %lu messages suppressed\n", - nr_unshown); + pr_alert("BUG: Bad page map: %lu messages suppressed\n", + nr_unshown); nr_unshown = 0; } nr_shown = 0; @@ -674,15 +673,13 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); - printk(KERN_ALERT - "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", - current->comm, - (long long)pte_val(pte), (long long)pmd_val(*pmd)); + pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", + current->comm, + (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); - printk(KERN_ALERT - "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", - (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", + (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ @@ -1105,6 +1102,12 @@ again: if (!PageAnon(page)) { if (pte_dirty(ptent)) { + /* + * oom_reaper cannot tear down dirty + * pages + */ + if (unlikely(details && details->ignore_dirty)) + continue; force_flush = 1; set_page_dirty(page); } @@ -1123,8 +1126,8 @@ again: } continue; } - /* If details->check_mapping, we leave swap entries. */ - if (unlikely(details)) + /* only check swap_entries if explicitly asked for in details */ + if (unlikely(details && !details->check_swap_entries)) continue; entry = pte_to_swp_entry(ptent); @@ -1229,7 +1232,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, return addr; } -static void unmap_page_range(struct mmu_gather *tlb, +void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) @@ -1237,9 +1240,6 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping) - details = NULL; - BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); @@ -1551,8 +1551,29 @@ out: int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { + return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); +} +EXPORT_SYMBOL(vm_insert_pfn); + +/** + * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot + * @vma: user vma to map to + * @addr: target user address of this page + * @pfn: source kernel pfn + * @pgprot: pgprot flags for the inserted page + * + * This is exactly like vm_insert_pfn, except that it allows drivers to + * to override pgprot on a per-page basis. + * + * This only makes sense for IO mappings, and it makes no sense for + * cow mappings. In general, using multiple vmas is preferable; + * vm_insert_pfn_prot should only be used if using multiple VMAs is + * impractical. + */ +int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, pgprot_t pgprot) +{ int ret; - pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1574,7 +1595,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, return ret; } -EXPORT_SYMBOL(vm_insert_pfn); +EXPORT_SYMBOL(vm_insert_pfn_prot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) @@ -1876,7 +1897,9 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long end = addr + size; int err; - BUG_ON(addr >= end); + if (WARN_ON(addr >= end)) + return -EINVAL; + pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -2412,7 +2435,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details; + struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -3122,8 +3145,7 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) { - pgoff_t pgoff = (((address & PAGE_MASK) - - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + pgoff_t pgoff = linear_page_index(vma, address); pte_unmap(page_table); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ @@ -3357,6 +3379,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd; pte_t *pte; + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) + return VM_FAULT_SIGSEGV; + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); @@ -3397,12 +3424,11 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Use __pte_alloc instead of pte_alloc_map, because we can't + * Use pte_alloc() instead of pte_alloc_map, because we can't * run pte_offset_map on the pmd, if an huge pmd could * materialize from under us from a different thread. */ - if (unlikely(pmd_none(*pmd)) && - unlikely(__pte_alloc(mm, vma, pmd, address))) + if (unlikely(pte_alloc(mm, pmd, address))) return VM_FAULT_OOM; /* * If a huge pmd materialized under us just retry later. Use @@ -3674,7 +3700,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, void *maddr; struct page *page = NULL; - ret = get_user_pages(tsk, mm, addr, 1, + ret = get_user_pages_remote(tsk, mm, addr, 1, write, 1, &page, &vma); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4af58a3a8ffa..aa34431c3f31 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -33,6 +33,7 @@ #include <linux/hugetlb.h> #include <linux/memblock.h> #include <linux/bootmem.h> +#include <linux/compaction.h> #include <asm/tlbflush.h> @@ -77,6 +78,9 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +bool memhp_auto_online; +EXPORT_SYMBOL_GPL(memhp_auto_online); + void get_online_mems(void) { might_sleep(); @@ -138,7 +142,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->name = "System RAM"; res->start = start; res->end = start + size - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); @@ -163,7 +167,7 @@ void get_page_bootmem(unsigned long info, struct page *page, page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); - atomic_inc(&page->_count); + page_ref_inc(page); } void put_page_bootmem(struct page *page) @@ -174,7 +178,7 @@ void put_page_bootmem(struct page *page) BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); - if (atomic_dec_return(&page->_count) == 1) { + if (page_ref_dec_return(page) == 1) { ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); @@ -509,6 +513,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, int start_sec, end_sec; struct vmem_altmap *altmap; + clear_zone_contiguous(zone); + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); @@ -521,7 +527,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, if (altmap->base_pfn != phys_start_pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); - return -EINVAL; + err = -EINVAL; + goto out; } altmap->alloc = 0; } @@ -539,7 +546,8 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, err = 0; } vmemmap_populate_print_last(); - +out: + set_zone_contiguous(zone); return err; } EXPORT_SYMBOL_GPL(__add_pages); @@ -811,6 +819,8 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } } + clear_zone_contiguous(zone); + /* * We can only remove entire sections */ @@ -826,6 +836,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, if (ret) break; } + + set_zone_contiguous(zone); + return ret; } EXPORT_SYMBOL_GPL(__remove_pages); @@ -1042,14 +1055,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = pfn_to_nid(pfn); + nid = zone_to_nid(zone); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); - if (ret) { - memory_notify(MEM_CANCEL_ONLINE, &arg); - return ret; - } + if (ret) + goto failed_addition; + /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. @@ -1067,12 +1079,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (need_zonelists_rebuild) zone_pcp_reset(zone); mutex_unlock(&zonelists_mutex); - printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", - (unsigned long long) pfn << PAGE_SHIFT, - (((unsigned long long) pfn + nr_pages) - << PAGE_SHIFT) - 1); - memory_notify(MEM_CANCEL_ONLINE, &arg); - return ret; + goto failed_addition; } zone->present_pages += onlined_pages; @@ -1082,7 +1089,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ pgdat_resize_unlock(zone->zone_pgdat, &flags); if (onlined_pages) { - node_states_set_node(zone_to_nid(zone), &arg); + node_states_set_node(nid, &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL, NULL); else @@ -1093,8 +1100,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ init_per_zone_wmark_min(); - if (onlined_pages) - kswapd_run(zone_to_nid(zone)); + if (onlined_pages) { + kswapd_run(nid); + kcompactd_run(nid); + } vm_total_pages = nr_free_pagecache_pages(); @@ -1103,6 +1112,13 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); return 0; + +failed_addition: + pr_debug("online_pages [mem %#010llx-%#010llx] failed\n", + (unsigned long long) pfn << PAGE_SHIFT, + (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); + memory_notify(MEM_CANCEL_ONLINE, &arg); + return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1261,8 +1277,13 @@ int zone_for_memory(int nid, u64 start, u64 size, int zone_default, return zone_default; } +static int online_memory_block(struct memory_block *mem, void *arg) +{ + return memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -int __ref add_memory_resource(int nid, struct resource *res) +int __ref add_memory_resource(int nid, struct resource *res, bool online) { u64 start, size; pg_data_t *pgdat = NULL; @@ -1322,6 +1343,11 @@ int __ref add_memory_resource(int nid, struct resource *res) /* create new memmap entry */ firmware_map_add_hotplug(start, start + size, "System RAM"); + /* online pages if requested */ + if (online) + walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), + NULL, online_memory_block); + goto out; error: @@ -1345,7 +1371,7 @@ int __ref add_memory(int nid, u64 start, u64 size) if (IS_ERR(res)) return PTR_ERR(res); - ret = add_memory_resource(nid, res); + ret = add_memory_resource(nid, res, memhp_auto_online); if (ret < 0) release_memory_resource(res); return ret; @@ -1504,8 +1530,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } else { #ifdef CONFIG_DEBUG_VM - printk(KERN_ALERT "removing pfn %lx from LRU failed\n", - pfn); + pr_alert("removing pfn %lx from LRU failed\n", pfn); dump_page(page, "failed to remove from LRU"); #endif put_page(page); @@ -1833,7 +1858,7 @@ repeat: ret = -EBUSY; goto failed_removal; } - printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); + pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ offline_isolated_pages(start_pfn, end_pfn); @@ -1858,8 +1883,10 @@ repeat: zone_pcp_update(zone); node_states_clear_node(node, &arg); - if (arg.status_change_nid >= 0) + if (arg.status_change_nid >= 0) { kswapd_stop(node); + kcompactd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); @@ -1868,9 +1895,9 @@ repeat: return 0; failed_removal: - printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n", - (unsigned long long) start_pfn << PAGE_SHIFT, - ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); + pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n", + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); @@ -1943,8 +1970,7 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr)); endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; - pr_warn("removing memory fails, because memory " - "[%pa-%pa] is onlined\n", + pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9a3f6b90e628..36cc01bc950a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -643,7 +643,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) + if (!is_vm_hugetlb_page(vma) && + (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && + !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; } @@ -844,12 +846,12 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) } } -static int lookup_node(struct mm_struct *mm, unsigned long addr) +static int lookup_node(unsigned long addr) { struct page *p; int err; - err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL); + err = get_user_pages(addr & PAGE_MASK, 1, 0, 0, &p, NULL); if (err >= 0) { err = page_to_nid(p); put_page(p); @@ -904,7 +906,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { - err = lookup_node(mm, addr); + err = lookup_node(addr); if (err < 0) goto out; *policy = err; @@ -2557,9 +2559,7 @@ static void __init check_numabalancing_enable(void) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { - pr_info("%s automatic NUMA balancing. " - "Configure with numa_balancing= or the " - "kernel.numa_balancing sysctl", + pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } diff --git a/mm/mempool.c b/mm/mempool.c index 7924f4f58a6d..9b7a14a791cc 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -112,12 +112,12 @@ static void kasan_poison_element(mempool_t *pool, void *element) kasan_free_pages(element, (unsigned long)pool->pool_data); } -static void kasan_unpoison_element(mempool_t *pool, void *element) +static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_alloc(pool->pool_data, element); + kasan_slab_alloc(pool->pool_data, element, flags); if (pool->alloc == mempool_kmalloc) - kasan_krealloc(element, (size_t)pool->pool_data); + kasan_krealloc(element, (size_t)pool->pool_data, flags); if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } @@ -130,12 +130,12 @@ static void add_element(mempool_t *pool, void *element) pool->elements[pool->curr_nr++] = element; } -static void *remove_element(mempool_t *pool) +static void *remove_element(mempool_t *pool, gfp_t flags) { void *element = pool->elements[--pool->curr_nr]; BUG_ON(pool->curr_nr < 0); - kasan_unpoison_element(pool, element); + kasan_unpoison_element(pool, element, flags); check_element(pool, element); return element; } @@ -154,7 +154,7 @@ void mempool_destroy(mempool_t *pool) return; while (pool->curr_nr) { - void *element = remove_element(pool); + void *element = remove_element(pool, GFP_KERNEL); pool->free(element, pool->pool_data); } kfree(pool->elements); @@ -250,7 +250,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr) spin_lock_irqsave(&pool->lock, flags); if (new_min_nr <= pool->min_nr) { while (new_min_nr < pool->curr_nr) { - element = remove_element(pool); + element = remove_element(pool, GFP_KERNEL); spin_unlock_irqrestore(&pool->lock, flags); pool->free(element, pool->pool_data); spin_lock_irqsave(&pool->lock, flags); @@ -310,25 +310,36 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) - * Note: using __GFP_ZERO is not supported. + * Note: neither __GFP_NOMEMALLOC nor __GFP_ZERO are supported. */ -void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; wait_queue_t wait; gfp_t gfp_temp; + /* If oom killed, memory reserves are essential to prevent livelock */ + VM_WARN_ON_ONCE(gfp_mask & __GFP_NOMEMALLOC); + /* No element size to zero on allocation */ VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); + might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM); - gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */ gfp_mask |= __GFP_NOWARN; /* failures are OK */ gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO); repeat_alloc: + if (likely(pool->curr_nr)) { + /* + * Don't allocate from emergency reserves if there are + * elements available. This check is racy, but it will + * be rechecked each loop. + */ + gfp_temp |= __GFP_NOMEMALLOC; + } element = pool->alloc(gfp_temp, pool->pool_data); if (likely(element != NULL)) @@ -336,7 +347,7 @@ repeat_alloc: spin_lock_irqsave(&pool->lock, flags); if (likely(pool->curr_nr)) { - element = remove_element(pool); + element = remove_element(pool, gfp_temp); spin_unlock_irqrestore(&pool->lock, flags); /* paired with rmb in mempool_free(), read comment there */ smp_wmb(); @@ -352,11 +363,12 @@ repeat_alloc: * We use gfp mask w/o direct reclaim or IO for the first round. If * alloc failed with that and @pool was empty, retry immediately. */ - if (gfp_temp != gfp_mask) { + if ((gfp_temp & ~__GFP_NOMEMALLOC) != gfp_mask) { spin_unlock_irqrestore(&pool->lock, flags); gfp_temp = gfp_mask; goto repeat_alloc; } + gfp_temp = gfp_mask; /* We must not sleep if !__GFP_DIRECT_RECLAIM */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { diff --git a/mm/migrate.c b/mm/migrate.c index 3ad0fea5c438..6c822a7b27e0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> +#include <linux/page_owner.h> #include <asm/tlbflush.h> @@ -171,7 +172,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, else page_add_file_rmap(new); - if (vma->vm_flags & VM_LOCKED) + if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); /* No need to invalidate - it was non-present before */ @@ -186,14 +187,17 @@ out: * Get rid of all migration entries and replace them by * references to the indicated page. */ -static void remove_migration_ptes(struct page *old, struct page *new) +void remove_migration_ptes(struct page *old, struct page *new, bool locked) { struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, }; - rmap_walk(new, &rwc); + if (locked) + rmap_walk_locked(new, &rwc); + else + rmap_walk(new, &rwc); } /* @@ -325,7 +329,6 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; /* No turning back from here */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) @@ -349,7 +352,7 @@ int migrate_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -363,7 +366,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { - page_unfreeze_refs(page, expected_count); + page_ref_unfreeze(page, expected_count); spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -372,7 +375,6 @@ int migrate_page_move_mapping(struct address_space *mapping, * Now we know that no one else is looking at the page: * no turning back from here. */ - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) @@ -398,7 +400,7 @@ int migrate_page_move_mapping(struct address_space *mapping, * to one less reference. * We know this isn't the last reference. */ - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock(&mapping->tree_lock); /* Leave irq disabled to prevent preemption while updating stats */ @@ -452,21 +454,22 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, return -EAGAIN; } - if (!page_freeze_refs(page, expected_count)) { + if (!page_ref_freeze(page, expected_count)) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } - set_page_memcg(newpage, page_memcg(page)); newpage->index = page->index; newpage->mapping = page->mapping; + get_page(newpage); radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count - 1); + page_ref_unfreeze(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); + return MIGRATEPAGE_SUCCESS; } @@ -578,6 +581,10 @@ void migrate_page_copy(struct page *newpage, struct page *page) */ if (PageWriteback(newpage)) end_page_writeback(newpage); + + copy_page_owner(page, newpage); + + mem_cgroup_migrate(page, newpage); } /************************************************************ @@ -698,7 +705,7 @@ static int writeout(struct address_space *mapping, struct page *page) * At this point we know that the migration attempt cannot * be successful. */ - remove_migration_ptes(page, page); + remove_migration_ptes(page, page, false); rc = mapping->a_ops->writepage(page, &wbc); @@ -772,7 +779,6 @@ static int move_to_new_page(struct page *newpage, struct page *page, * page is freed; but stats require that PageAnon be left as PageAnon. */ if (rc == MIGRATEPAGE_SUCCESS) { - set_page_memcg(page, NULL); if (!PageAnon(page)) page->mapping = NULL; } @@ -897,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (page_was_mapped) remove_migration_ptes(page, - rc == MIGRATEPAGE_SUCCESS ? newpage : page); + rc == MIGRATEPAGE_SUCCESS ? newpage : page, false); out_unlock_both: unlock_page(newpage); @@ -952,8 +958,10 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, } rc = __unmap_and_move(page, newpage, force, mode); - if (rc == MIGRATEPAGE_SUCCESS) + if (rc == MIGRATEPAGE_SUCCESS) { put_new_page = NULL; + set_page_owner_migrate_reason(newpage, reason); + } out: if (rc != -EAGAIN) { @@ -1018,7 +1026,7 @@ out: static int unmap_and_move_huge_page(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *hpage, int force, - enum migrate_mode mode) + enum migrate_mode mode, int reason) { int rc = -EAGAIN; int *result = NULL; @@ -1065,7 +1073,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_was_mapped) remove_migration_ptes(hpage, - rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage); + rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false); unlock_page(new_hpage); @@ -1076,6 +1084,7 @@ put_anon: if (rc == MIGRATEPAGE_SUCCESS) { hugetlb_cgroup_migrate(hpage, new_hpage); put_new_page = NULL; + set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); @@ -1148,7 +1157,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, - pass > 2, mode); + pass > 2, mode, reason); else rc = unmap_and_move(get_new_page, put_new_page, private, page, pass > 2, mode, @@ -1767,7 +1776,10 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, put_page(new_page); goto out_fail; } - + /* + * We are not sure a pending tlb flush here is for a huge page + * mapping or not. Hence use the tlb range variant + */ if (mm_tlb_flush_pending(mm)) flush_tlb_range(vma, mmun_start, mmun_end); @@ -1823,12 +1835,11 @@ fail_putback: page_add_anon_rmap(new_page, vma, mmun_start, true); pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); - flush_tlb_range(vma, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); if (page_count(page) != 2) { set_pmd_at(mm, mmun_start, pmd, orig_entry); - flush_tlb_range(vma, mmun_start, mmun_end); + flush_pmd_tlb_range(vma, mmun_start, mmun_end); mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); update_mmu_cache_pmd(vma, address, &entry); page_remove_rmap(new_page, true); @@ -1836,9 +1847,8 @@ fail_putback: } mlock_migrate_page(new_page, page); - set_page_memcg(new_page, page_memcg(page)); - set_page_memcg(page, NULL); page_remove_rmap(page, true); + set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/mm_init.c b/mm/mm_init.c index fdadf918de76..5b72266b4b03 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -55,13 +55,12 @@ void __init mminit_verify_zonelist(void) /* Iterate the zonelist */ for_each_zone_zonelist(zone, z, zonelist, zoneid) { #ifdef CONFIG_NUMA - printk(KERN_CONT "%d:%s ", - zone->node, zone->name); + pr_cont("%d:%s ", zone->node, zone->name); #else - printk(KERN_CONT "0:%s ", zone->name); + pr_cont("0:%s ", zone->name); #endif /* CONFIG_NUMA */ } - printk(KERN_CONT "\n"); + pr_cont("\n"); } } } diff --git a/mm/mmap.c b/mm/mmap.c index 76d1ec29149b..bd2e1a533bc1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -37,12 +37,12 @@ #include <linux/khugepaged.h> #include <linux/uprobes.h> #include <linux/rbtree_augmented.h> -#include <linux/sched/sysctl.h> #include <linux/notifier.h> #include <linux/memory.h> #include <linux/printk.h> #include <linux/userfaultfd_k.h> #include <linux/moduleparam.h> +#include <linux/pkeys.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -123,130 +123,6 @@ void vma_set_page_prot(struct vm_area_struct *vma) } } - -int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ -unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; -unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ -unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -/* - * Make sure vm_committed_as in one cacheline and not cacheline shared with - * other variables. It can be updated by several CPUs frequently. - */ -struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; - -/* - * The global memory commitment made in the system can be a metric - * that can be used to drive ballooning decisions when Linux is hosted - * as a guest. On Hyper-V, the host implements a policy engine for dynamically - * balancing memory across competing virtual machines that are hosted. - * Several metrics drive this policy engine including the guest reported - * memory commitment. - */ -unsigned long vm_memory_committed(void) -{ - return percpu_counter_read_positive(&vm_committed_as); -} -EXPORT_SYMBOL_GPL(vm_memory_committed); - -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. - * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. - */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) -{ - long free, allowed, reserve; - - VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < - -(s64)vm_committed_as_batch * num_online_cpus(), - "memory commitment underflow"); - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) - goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; - } - - allowed = vm_commit_limit(); - /* - * Reserve some for root - */ - if (!cap_sys_admin) - allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - /* - * Don't let a single process grow so big a user can't recover - */ - if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min_t(long, mm->total_vm / 32, reserve); - } - - if (percpu_counter_read_positive(&vm_committed_as) < allowed) - return 0; -error: - vm_unacct_memory(pages); - - return -ENOMEM; -} - /* * Requires inode->i_mapping->i_mmap_rwsem */ @@ -1270,6 +1146,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long pgoff, unsigned long *populate) { struct mm_struct *mm = current->mm; + int pkey = 0; *populate = 0; @@ -1309,11 +1186,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (prot == PROT_EXEC) { + pkey = execute_only_pkey(mm); + if (pkey < 0) + pkey = 0; + } + /* Do simple checking here so the lower-level routines won't have * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) @@ -2642,9 +2525,8 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, unsigned long ret = -EINVAL; struct file *file; - pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. " - "See Documentation/vm/remap_file_pages.txt.\n", - current->comm, current->pid); + pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n", + current->comm, current->pid); if (prot) return ret; @@ -3010,8 +2892,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { if (ignore_rlimit_data) - pr_warn_once("%s (%d): VmData %lu exceed data ulimit " - "%lu. Will be forbidden soon.\n", + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA)); @@ -3066,11 +2947,16 @@ static int special_mapping_fault(struct vm_area_struct *vma, pgoff_t pgoff; struct page **pages; - if (vma->vm_ops == &legacy_special_mapping_vmops) + if (vma->vm_ops == &legacy_special_mapping_vmops) { pages = vma->vm_private_data; - else - pages = ((struct vm_special_mapping *)vma->vm_private_data)-> - pages; + } else { + struct vm_special_mapping *sm = vma->vm_private_data; + + if (sm->fault) + return sm->fault(sm, vma, vmf); + + pages = sm->pages; + } for (pgoff = vmf->pgoff; pgoff && *pages; ++pages) pgoff--; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5fbdd367bbed..f4259e496f83 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Qumranet, Inc. * Copyright (C) 2008 SGI - * Christoph Lameter <clameter@sgi.com> + * Christoph Lameter <cl@linux.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. diff --git a/mm/mprotect.c b/mm/mprotect.c index f7cb3d4d9c2e..b650c5412f58 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -24,6 +24,7 @@ #include <linux/migrate.h> #include <linux/perf_event.h> #include <linux/ksm.h> +#include <linux/pkeys.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -354,10 +355,13 @@ fail: SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, unsigned long, prot) { - unsigned long vm_flags, nstart, end, tmp, reqprot; + unsigned long nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; int error = -EINVAL; const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); + const bool rier = (current->personality & READ_IMPLIES_EXEC) && + (prot & PROT_READ); + prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP); if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */ return -EINVAL; @@ -374,13 +378,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return -EINVAL; reqprot = prot; - /* - * Does the application expect PROT_READ to imply PROT_EXEC: - */ - if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) - prot |= PROT_EXEC; - - vm_flags = calc_vm_prot_bits(prot); down_write(¤t->mm->mmap_sem); @@ -411,10 +408,15 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, for (nstart = start ; ; ) { unsigned long newflags; + int pkey = arch_override_mprotect_pkey(vma, prot, -1); /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ - newflags = vm_flags; + /* Does the application expect PROT_READ to imply PROT_EXEC */ + if (rier && (vma->vm_flags & VM_MAYEXEC)) + prot |= PROT_EXEC; + + newflags = calc_vm_prot_bits(prot, pkey); newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); /* newflags >> 4 shift VM_MAY% in place of VM_% */ @@ -445,6 +447,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = -ENOMEM; goto out; } + prot = reqprot; } out: up_write(¤t->mm->mmap_sem); diff --git a/mm/mremap.c b/mm/mremap.c index 8eeba02fc991..3fa0a467df66 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -20,7 +20,6 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mmu_notifier.h> -#include <linux/sched/sysctl.h> #include <linux/uaccess.h> #include <linux/mm-arch-hooks.h> @@ -214,8 +213,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, continue; VM_BUG_ON(pmd_trans_huge(*old_pmd)); } - if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, - new_pmd, new_addr)) + if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr)) break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 99feb2b07fc5..bd05a70f44b9 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -288,7 +288,7 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, /* * Whoops, we cannot satisfy the allocation request. */ - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } @@ -360,7 +360,7 @@ static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (ptr) return ptr; - printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size); + pr_alert("bootmem alloc of %lu bytes failed!\n", size); panic("Out of memory"); return NULL; } diff --git a/mm/nommu.c b/mm/nommu.c index fbf6f0f1d6c9..de8b6b6580c1 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -15,6 +15,8 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define __DISABLE_GUP_DEPRECATED + #include <linux/export.h> #include <linux/mm.h> #include <linux/vmacache.h> @@ -33,7 +35,6 @@ #include <linux/security.h> #include <linux/syscalls.h> #include <linux/audit.h> -#include <linux/sched/sysctl.h> #include <linux/printk.h> #include <asm/uaccess.h> @@ -48,33 +49,11 @@ struct page *mem_map; unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; -struct percpu_counter vm_committed_as; -int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ -int sysctl_overcommit_ratio = 50; /* default is 50% */ -unsigned long sysctl_overcommit_kbytes __read_mostly; -int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; -unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ -unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -/* - * The global memory commitment made in the system can be a metric - * that can be used to drive ballooning decisions when Linux is hosted - * as a guest. On Hyper-V, the host implements a policy engine for dynamically - * balancing memory across competing virtual machines that are hosted. - * Several metrics drive this policy engine including the guest reported - * memory commitment. - */ -unsigned long vm_memory_committed(void) -{ - return percpu_counter_read_positive(&vm_committed_as); -} - -EXPORT_SYMBOL_GPL(vm_memory_committed); - EXPORT_SYMBOL(mem_map); /* list of mapped, potentially shareable regions */ @@ -182,8 +161,7 @@ finish_or_fault: * slab page or a secondary page from a compound page * - don't permit access to VMAs that don't support it, such as I/O mappings */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages6(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { @@ -194,20 +172,18 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); + return __get_user_pages(current, current->mm, start, nr_pages, flags, + pages, vmas, NULL); } -EXPORT_SYMBOL(get_user_pages); +EXPORT_SYMBOL(get_user_pages6); -long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - int write, int force, struct page **pages, - int *locked) +long get_user_pages_locked6(unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) { - return get_user_pages(tsk, mm, start, nr_pages, write, force, - pages, NULL); + return get_user_pages6(start, nr_pages, write, force, pages, NULL); } -EXPORT_SYMBOL(get_user_pages_locked); +EXPORT_SYMBOL(get_user_pages_locked6); long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -216,21 +192,20 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, { long ret; down_read(&mm->mmap_sem); - ret = get_user_pages(tsk, mm, start, nr_pages, write, force, - pages, NULL); + ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages, + NULL, NULL); up_read(&mm->mmap_sem); return ret; } EXPORT_SYMBOL(__get_user_pages_unlocked); -long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, +long get_user_pages_unlocked5(unsigned long start, unsigned long nr_pages, int write, int force, struct page **pages) { - return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write, - force, pages, 0); + return __get_user_pages_unlocked(current, current->mm, start, nr_pages, + write, force, pages, 0); } -EXPORT_SYMBOL(get_user_pages_unlocked); +EXPORT_SYMBOL(get_user_pages_unlocked5); /** * follow_pfn - look up PFN at a user virtual address @@ -1084,7 +1059,7 @@ static unsigned long determine_vm_flags(struct file *file, { unsigned long vm_flags; - vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); + vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); /* vm_flags |= mm->def_flags; */ if (!(capabilities & NOMMU_MAP_DIRECT)) { @@ -1829,100 +1804,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/* - * Check that a process has enough memory to allocate a new virtual - * mapping. 0 means there is enough memory for the allocation to - * succeed and -ENOMEM implies there is not. - * - * We currently support three overcommit policies, which are set via the - * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting - * - * Strict overcommit modes added 2002 Feb 26 by Alan Cox. - * Additional code 2002 Jul 20 by Robert Love. - * - * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. - * - * Note this is a helper function intended to be used by LSMs which - * wish to use this logic. - */ -int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) -{ - long free, allowed, reserve; - - vm_acct_memory(pages); - - /* - * Sometimes we want to use more memory than we have - */ - if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) - return 0; - - if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_page_state(NR_FREE_PAGES); - free += global_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) - goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; - } - - allowed = vm_commit_limit(); - /* - * Reserve some 3% for root - */ - if (!cap_sys_admin) - allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - /* - * Don't let a single process grow so big a user can't recover - */ - if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed -= min_t(long, mm->total_vm / 32, reserve); - } - - if (percpu_counter_read_positive(&vm_committed_as) < allowed) - return 0; - -error: - vm_unacct_memory(pages); - - return -ENOMEM; -} - int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); @@ -2108,3 +1989,31 @@ static int __meminit init_admin_reserve(void) return 0; } subsys_initcall(init_admin_reserve); + +long get_user_pages8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + struct vm_area_struct **vmas) +{ + return get_user_pages6(start, nr_pages, write, force, pages, vmas); +} +EXPORT_SYMBOL(get_user_pages8); + +long get_user_pages_locked8(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages, + int *locked) +{ + return get_user_pages_locked6(start, nr_pages, write, + force, pages, locked); +} +EXPORT_SYMBOL(get_user_pages_locked8); + +long get_user_pages_unlocked7(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + int write, int force, struct page **pages) +{ + return get_user_pages_unlocked5(start, nr_pages, write, force, pages); +} +EXPORT_SYMBOL(get_user_pages_unlocked7); + diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dc490c06941b..b34d279a7ee6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -35,6 +35,11 @@ #include <linux/freezer.h> #include <linux/ftrace.h> #include <linux/ratelimit.h> +#include <linux/kthread.h> +#include <linux/init.h> + +#include <asm/tlb.h> +#include "internal.h" #define CREATE_TRACE_POINTS #include <trace/events/oom.h> @@ -287,9 +292,6 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, if (oom_task_origin(task)) return OOM_SCAN_SELECT; - if (task_will_free_mem(task) && !is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - return OOM_SCAN_OK; } @@ -386,10 +388,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p, struct mem_cgroup *memcg) { - pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%hd\n", - current->comm, oc->gfp_mask, oc->order, + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order, current->signal->oom_score_adj); + cpuset_print_current_mems_allowed(); dump_stack(); if (memcg) @@ -408,6 +410,172 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; +#define K(x) ((x) << (PAGE_SHIFT-10)) + +#ifdef CONFIG_MMU +/* + * OOM Reaper kernel thread which tries to reap the memory used by the OOM + * victim (if that is possible) to help the OOM killer to move on. + */ +static struct task_struct *oom_reaper_th; +static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +static struct task_struct *oom_reaper_list; +static DEFINE_SPINLOCK(oom_reaper_lock); + + +static bool __oom_reap_task(struct task_struct *tsk) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma; + struct mm_struct *mm; + struct task_struct *p; + struct zap_details details = {.check_swap_entries = true, + .ignore_dirty = true}; + bool ret = true; + + /* + * Make sure we find the associated mm_struct even when the particular + * thread has already terminated and cleared its mm. + * We might have race with exit path so consider our work done if there + * is no mm. + */ + p = find_lock_task_mm(tsk); + if (!p) + return true; + + mm = p->mm; + if (!atomic_inc_not_zero(&mm->mm_users)) { + task_unlock(p); + return true; + } + + task_unlock(p); + + if (!down_read_trylock(&mm->mmap_sem)) { + ret = false; + goto out; + } + + tlb_gather_mmu(&tlb, mm, 0, -1); + for (vma = mm->mmap ; vma; vma = vma->vm_next) { + if (is_vm_hugetlb_page(vma)) + continue; + + /* + * mlocked VMAs require explicit munlocking before unmap. + * Let's keep it simple here and skip such VMAs. + */ + if (vma->vm_flags & VM_LOCKED) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. + */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) + unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, + &details); + } + tlb_finish_mmu(&tlb, 0, -1); + pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", + task_pid_nr(tsk), tsk->comm, + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES)), + K(get_mm_counter(mm, MM_SHMEMPAGES))); + up_read(&mm->mmap_sem); + + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore. OOM killer can continue + * by selecting other victim if unmapping hasn't led to any + * improvements. This also means that selecting this task doesn't + * make any sense. + */ + tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + exit_oom_victim(tsk); +out: + mmput(mm); + return ret; +} + +#define MAX_OOM_REAP_RETRIES 10 +static void oom_reap_task(struct task_struct *tsk) +{ + int attempts = 0; + + /* Retry the down_read_trylock(mmap_sem) a few times */ + while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) + schedule_timeout_idle(HZ/10); + + if (attempts > MAX_OOM_REAP_RETRIES) { + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + debug_show_all_locks(); + } + + /* Drop a reference taken by wake_oom_reaper */ + put_task_struct(tsk); +} + +static int oom_reaper(void *unused) +{ + set_freezable(); + + while (true) { + struct task_struct *tsk = NULL; + + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); + spin_lock(&oom_reaper_lock); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; + } + spin_unlock(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); + } + + return 0; +} + +static void wake_oom_reaper(struct task_struct *tsk) +{ + if (!oom_reaper_th || tsk->oom_reaper_list) + return; + + get_task_struct(tsk); + + spin_lock(&oom_reaper_lock); + tsk->oom_reaper_list = oom_reaper_list; + oom_reaper_list = tsk; + spin_unlock(&oom_reaper_lock); + wake_up(&oom_reaper_wait); +} + +static int __init oom_init(void) +{ + oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); + if (IS_ERR(oom_reaper_th)) { + pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", + PTR_ERR(oom_reaper_th)); + oom_reaper_th = NULL; + } + return 0; +} +subsys_initcall(oom_init) +#else +static void wake_oom_reaper(struct task_struct *tsk) +{ +} +#endif + /** * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark @@ -434,9 +602,10 @@ void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(void) +void exit_oom_victim(struct task_struct *tsk) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -458,15 +627,11 @@ void exit_oom_victim(void) bool oom_killer_disable(void) { /* - * Make sure to not race with an ongoing OOM killer - * and that the current is not the victim. + * Make sure to not race with an ongoing OOM killer. Check that the + * current is not killed (possibly due to sharing the victim's memory). */ - mutex_lock(&oom_lock); - if (test_thread_flag(TIF_MEMDIE)) { - mutex_unlock(&oom_lock); + if (mutex_lock_killable(&oom_lock)) return false; - } - oom_killer_disabled = true; mutex_unlock(&oom_lock); @@ -501,7 +666,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) return false; } -#define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon * returning. @@ -517,6 +681,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + bool can_oom_reap = true; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -607,17 +772,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD)) - continue; - if (is_global_init(p)) - continue; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || + p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + /* + * We cannot use oom_reaper for the mm shared by this + * process because it wouldn't get killed and so the + * memory might be still used. + */ + can_oom_reap = false; continue; - + } do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); + if (can_oom_reap) + wake_oom_reaper(victim); + mmdrop(mm); put_task_struct(victim); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 6fe7d15bd1f7..11ff8f758631 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1169,6 +1169,7 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, unsigned long balanced_dirty_ratelimit; unsigned long step; unsigned long x; + unsigned long shift; /* * The dirty rate will match the writeout rate in long term, except @@ -1293,11 +1294,11 @@ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc, * rate itself is constantly fluctuating. So decrease the track speed * when it gets close to the target. Helps eliminate pointless tremors. */ - step >>= dirty_ratelimit / (2 * step + 1); - /* - * Limit the tracking speed to avoid overshooting. - */ - step = (step + 7) / 8; + shift = dirty_ratelimit / (2 * step + 1); + if (shift < BITS_PER_LONG) + step = DIV_ROUND_UP(step >> shift, 8); + else + step = 0; if (dirty_ratelimit < balanced_dirty_ratelimit) dirty_ratelimit += step; @@ -2409,12 +2410,11 @@ int __set_page_dirty_no_writeback(struct page *page) /* * Helper function for set_page_dirty family. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). * * NOTE: This relies on being atomic wrt interrupts. */ -void account_page_dirtied(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg) +void account_page_dirtied(struct page *page, struct address_space *mapping) { struct inode *inode = mapping->host; @@ -2426,7 +2426,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping, inode_attach_wb(inode, page); wb = inode_to_wb(inode); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); __inc_wb_stat(wb, WB_RECLAIMABLE); @@ -2441,13 +2441,13 @@ EXPORT_SYMBOL(account_page_dirtied); /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold mem_cgroup_begin_page_stat(). + * Caller must hold lock_page_memcg(). */ void account_page_cleaned(struct page *page, struct address_space *mapping, - struct mem_cgroup *memcg, struct bdi_writeback *wb) + struct bdi_writeback *wb) { if (mapping_cap_account_dirty(mapping)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); @@ -2468,26 +2468,24 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, */ int __set_page_dirty_nobuffers(struct page *page) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (!TestSetPageDirty(page)) { struct address_space *mapping = page_mapping(page); unsigned long flags; if (!mapping) { - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return 1; } spin_lock_irqsave(&mapping->tree_lock, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping, memcg); + account_page_dirtied(page, mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2495,7 +2493,7 @@ int __set_page_dirty_nobuffers(struct page *page) } return 1; } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return 0; } EXPORT_SYMBOL(__set_page_dirty_nobuffers); @@ -2625,17 +2623,16 @@ void cancel_dirty_page(struct page *page) if (mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) - account_page_cleaned(page, mapping, memcg, wb); + account_page_cleaned(page, mapping, wb); unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); } else { ClearPageDirty(page); } @@ -2666,7 +2663,6 @@ int clear_page_dirty_for_io(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { struct inode *inode = mapping->host; struct bdi_writeback *wb; - struct mem_cgroup *memcg; bool locked; /* @@ -2704,16 +2700,14 @@ int clear_page_dirty_for_io(struct page *page) * always locked coming in here, so we get the desired * exclusion. */ - memcg = mem_cgroup_begin_page_stat(page); wb = unlocked_inode_to_wb_begin(inode, &locked); if (TestClearPageDirty(page)) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); dec_wb_stat(wb, WB_RECLAIMABLE); ret = 1; } unlocked_inode_to_wb_end(inode, locked); - mem_cgroup_end_page_stat(memcg); return ret; } return TestClearPageDirty(page); @@ -2723,10 +2717,9 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2750,21 +2743,20 @@ int test_clear_page_writeback(struct page *page) ret = TestClearPageWriteback(page); } if (ret) { - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK); inc_zone_page_state(page, NR_WRITTEN); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return ret; } int __test_set_page_writeback(struct page *page, bool keep_write) { struct address_space *mapping = page_mapping(page); - struct mem_cgroup *memcg; int ret; - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (mapping) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -2792,10 +2784,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write) ret = TestSetPageWriteback(page); } if (!ret) { - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK); inc_zone_page_state(page, NR_WRITEBACK); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 838ca8bb64f7..59de90d5d3a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -223,6 +223,19 @@ static char * const zone_names[MAX_NR_ZONES] = { #endif }; +char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Movable", + "Reclaimable", + "HighAtomic", +#ifdef CONFIG_CMA + "CMA", +#endif +#ifdef CONFIG_MEMORY_ISOLATION + "Isolate", +#endif +}; + compound_page_dtor * const compound_page_dtors[] = { NULL, free_compound_page, @@ -236,6 +249,7 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_scale_factor = 10; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -247,6 +261,7 @@ static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; +static bool mirrored_kernelcore; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -293,13 +308,20 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { + unsigned long max_initialise; + /* Always populate low zones for address-contrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; + /* + * Initialise at least 2G of a node but also take into account that + * two large system hashes that can take up 1GB for 0.25TB/node. + */ + max_initialise = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); - /* Initialise at least 2G of the highest zone */ (*nr_initialised)++; - if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) && + if ((*nr_initialised > max_initialise) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; @@ -416,7 +438,7 @@ static void bad_page(struct page *page, const char *reason, goto out; } if (nr_unshown) { - printk(KERN_ALERT + pr_alert( "BUG: Bad page state: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; @@ -426,9 +448,14 @@ static void bad_page(struct page *page, const char *reason, if (nr_shown++ == 0) resume = jiffies + 60 * HZ; - printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - dump_page_badflags(page, reason, bad_flags); + __dump_page(page, reason); + bad_flags &= page->flags; + if (bad_flags) + pr_alert("bad because of flags: %#lx(%pGp)\n", + bad_flags, &bad_flags); + dump_page_owner(page); print_modules(); dump_stack(); @@ -477,7 +504,9 @@ void prep_compound_page(struct page *page, unsigned int order) #ifdef CONFIG_DEBUG_PAGEALLOC unsigned int _debug_guardpage_minorder; -bool _debug_pagealloc_enabled __read_mostly; +bool _debug_pagealloc_enabled __read_mostly + = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); +EXPORT_SYMBOL(_debug_pagealloc_enabled); bool _debug_guardpage_enabled __read_mostly; static int __init early_debug_pagealloc(char *buf) @@ -488,6 +517,9 @@ static int __init early_debug_pagealloc(char *buf) if (strcmp(buf, "on") == 0) _debug_pagealloc_enabled = true; + if (strcmp(buf, "off") == 0) + _debug_pagealloc_enabled = false; + return 0; } early_param("debug_pagealloc", early_debug_pagealloc); @@ -519,11 +551,11 @@ static int __init debug_guardpage_minorder_setup(char *buf) unsigned long res; if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { - printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + pr_err("Bad debug_guardpage_minorder value\n"); return 0; } _debug_guardpage_minorder = res; - printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + pr_info("Setting debug_guardpage_minorder to %lu\n", res); return 0; } __setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); @@ -660,34 +692,28 @@ static inline void __free_one_page(struct page *page, unsigned long combined_idx; unsigned long uninitialized_var(buddy_idx); struct page *buddy; - unsigned int max_order = MAX_ORDER; + unsigned int max_order; + + max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); VM_BUG_ON(migratetype == -1); - if (is_migrate_isolate(migratetype)) { - /* - * We restrict max order of merging to prevent merge - * between freepages on isolate pageblock and normal - * pageblock. Without this, pageblock isolation - * could cause incorrect freepage accounting. - */ - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); - } else { + if (likely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); - } - page_idx = pfn & ((1 << max_order) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); +continue_merging: while (order < max_order - 1) { buddy_idx = __find_buddy_index(page_idx, order); buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) - break; + goto done_merging; /* * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. @@ -704,6 +730,32 @@ static inline void __free_one_page(struct page *page, page_idx = combined_idx; order++; } + if (max_order < MAX_ORDER) { + /* If we are here, it means order is >= pageblock_order. + * We want to prevent merge between freepages on isolate + * pageblock and normal pageblock. Without this, pageblock + * isolation could cause incorrect freepage or CMA accounting. + * + * We don't want to hit this code for the more frequent + * low-order merging. + */ + if (unlikely(has_isolate_pageblock(zone))) { + int buddy_mt; + + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); + buddy_mt = get_pageblock_migratetype(buddy); + + if (migratetype != buddy_mt + && (is_migrate_isolate(migratetype) || + is_migrate_isolate(buddy_mt))) + goto done_merging; + } + max_order++; + goto continue_merging; + } + +done_merging: set_page_order(page, order); /* @@ -741,7 +793,7 @@ static inline int free_pages_check(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; @@ -1002,6 +1054,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) PAGE_SIZE << order); } arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); return true; @@ -1104,6 +1157,75 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, return __free_pages_boot_core(page, pfn, order); } +/* + * Check that the whole (or subset of) a pageblock given by the interval of + * [start_pfn, end_pfn) is valid and within the same zone, before scanning it + * with the migration of free compaction scanner. The scanners then need to + * use only pfn_valid_within() check for arches that allow holes within + * pageblocks. + * + * Return struct page pointer of start_pfn, or NULL if checks were not passed. + * + * It's possible on some configurations to have a setup like node0 node1 node0 + * i.e. it's possible that all pages within a zones range of pages do not + * belong to a single zone. We assume that a border between node0 and node1 + * can occur within a single pageblock, but not a node0 node1 node0 + * interleaving within a single pageblock. It is therefore sufficient to check + * the first and last page of a pageblock and avoid checking each individual + * page in a pageblock. + */ +struct page *__pageblock_pfn_to_page(unsigned long start_pfn, + unsigned long end_pfn, struct zone *zone) +{ + struct page *start_page; + struct page *end_page; + + /* end_pfn is one past the range we are checking */ + end_pfn--; + + if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) + return NULL; + + start_page = pfn_to_page(start_pfn); + + if (page_zone(start_page) != zone) + return NULL; + + end_page = pfn_to_page(end_pfn); + + /* This gives a shorter code than deriving page_zone(end_page) */ + if (page_zone_id(start_page) != page_zone_id(end_page)) + return NULL; + + return start_page; +} + +void set_zone_contiguous(struct zone *zone) +{ + unsigned long block_start_pfn = zone->zone_start_pfn; + unsigned long block_end_pfn; + + block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + for (; block_start_pfn < zone_end_pfn(zone); + block_start_pfn = block_end_pfn, + block_end_pfn += pageblock_nr_pages) { + + block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); + + if (!__pageblock_pfn_to_page(block_start_pfn, + block_end_pfn, zone)) + return; + } + + /* We confirm that there is no hole */ + zone->contiguous = true; +} + +void clear_zone_contiguous(struct zone *zone) +{ + zone->contiguous = false; +} + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __init deferred_free_range(struct page *page, unsigned long pfn, int nr_pages) @@ -1254,9 +1376,13 @@ free_range: pgdat_init_report_one_done(); return 0; } +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) { + struct zone *zone; + +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT int nid; /* There will be num_node_state(N_MEMORY) threads */ @@ -1270,8 +1396,11 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); +#endif + + for_each_populated_zone(zone) + set_zone_contiguous(zone); } -#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_CMA /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ @@ -1360,7 +1489,7 @@ static inline int check_new_page(struct page *page) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; - if (unlikely(atomic_read(&page->_count) != 0)) + if (unlikely(page_ref_count(page) != 0)) bad_reason = "nonzero _count"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; @@ -1381,15 +1510,24 @@ static inline int check_new_page(struct page *page) return 0; } +static inline bool free_pages_prezeroed(bool poisoned) +{ + return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && + page_poisoning_enabled() && poisoned; +} + static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, int alloc_flags) { int i; + bool poisoned = true; for (i = 0; i < (1 << order); i++) { struct page *p = page + i; if (unlikely(check_new_page(p))) return 1; + if (poisoned) + poisoned &= page_is_poisoned(p); } set_page_private(page, 0); @@ -1397,9 +1535,10 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, arch_alloc_page(page, order); kernel_map_pages(page, 1 << order, 1); + kernel_poison_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); - if (gfp_flags & __GFP_ZERO) + if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO)) for (i = 0; i < (1 << order); i++) clear_highpage(page + i); @@ -2238,19 +2377,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, list_del(&page->lru); pcp->count--; } else { - if (unlikely(gfp_flags & __GFP_NOFAIL)) { - /* - * __GFP_NOFAIL is not to be used in new code. - * - * All __GFP_NOFAIL callers should be fixed so that they - * properly detect and handle allocation failures. - * - * We most definitely don't want callers attempting to - * allocate greater than order-1 page units with - * __GFP_NOFAIL. - */ - WARN_ON_ONCE(order > 1); - } + /* + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with __GFP_NOFAIL. + */ + WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); page = NULL; @@ -2690,9 +2821,8 @@ void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...) va_end(args); } - pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n", - current->comm, order, gfp_mask); - + pr_warn("%s: page allocation failure: order:%u, mode:%#x(%pGg)\n", + current->comm, order, gfp_mask, &gfp_mask); dump_stack(); if (!should_suppress_show_mem()) show_mem(filter); @@ -2748,8 +2878,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, * XXX: Page reclaim didn't yield anything, * and the OOM killer can't be invoked, but * keep looping as per tradition. + * + * But do not keep looping if oom_killer_disable() + * was already called, for the system is trying to + * enter a quiescent state during suspend. */ - *did_some_progress = 1; + *did_some_progress = !oom_killer_disabled; goto out; } if (pm_suspended_storage()) @@ -3008,14 +3142,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM))) gfp_mask &= ~__GFP_ATOMIC; - /* - * If this allocation cannot block and it is for a specific node, then - * fail early. There's no need to wakeup kswapd or retry for a - * speculative node-specific allocation. - */ - if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim) - goto nopage; - retry: if (gfp_mask & __GFP_KSWAPD_RECLAIM) wake_all_kswapds(order, ac); @@ -3372,7 +3498,7 @@ refill: /* Even if we own the page, we do not use atomic_set(). * This would break get_page_unless_zero() users. */ - atomic_add(size - 1, &page->_count); + page_ref_add(page, size - 1); /* reset page count bias and offset to start of new frag */ nc->pfmemalloc = page_is_pfmemalloc(page); @@ -3384,7 +3510,7 @@ refill: if (unlikely(offset < 0)) { page = virt_to_page(nc->va); - if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count)) + if (!page_ref_sub_and_test(page, nc->pagecnt_bias)) goto refill; #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE) @@ -3392,7 +3518,7 @@ refill: size = nc->size; #endif /* OK, page count is 0, we can safely set it */ - atomic_set(&page->_count, size); + set_page_count(page, size); /* reset page count bias and offset to start of new frag */ nc->pagecnt_bias = size; @@ -3603,6 +3729,49 @@ static inline void show_node(struct zone *zone) printk("Node %d ", zone_to_nid(zone)); } +long si_mem_available(void) +{ + long available; + unsigned long pagecache; + unsigned long wmark_low = 0; + unsigned long pages[NR_LRU_LISTS]; + struct zone *zone; + int lru; + + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + + for_each_zone(zone) + wmark_low += zone->watermark[WMARK_LOW]; + + /* + * Estimate the amount of memory available for userspace allocations, + * without causing swapping. + */ + available = global_page_state(NR_FREE_PAGES) - totalreserve_pages; + + /* + * Not all the page cache can be freed, otherwise the system will + * start swapping. Assume at least half of the page cache, or the + * low watermark worth of cache, needs to stay. + */ + pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE]; + pagecache -= min(pagecache / 2, wmark_low); + available += pagecache; + + /* + * Part of the reclaimable slab consists of items that are in use, + * and cannot be freed. Cap this estimate at the low watermark. + */ + available += global_page_state(NR_SLAB_RECLAIMABLE) - + min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + + if (available < 0) + available = 0; + return available; +} +EXPORT_SYMBOL_GPL(si_mem_available); + void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; @@ -3935,9 +4104,7 @@ static int __parse_numa_zonelist_order(char *s) } else if (*s == 'z' || *s == 'Z') { user_zonelist_order = ZONELIST_ORDER_ZONE; } else { - printk(KERN_WARNING - "Ignoring invalid numa_zonelist_order value: " - "%s\n", s); + pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s); return -EINVAL; } return 0; @@ -4401,12 +4568,11 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) else page_group_by_mobility_disabled = 0; - pr_info("Built %i zonelists in %s order, mobility grouping %s. " - "Total pages: %ld\n", - nr_online_nodes, - zonelist_order_name[current_zonelist_order], - page_group_by_mobility_disabled ? "off" : "on", - vm_total_pages); + pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n", + nr_online_nodes, + zonelist_order_name[current_zonelist_order], + page_group_by_mobility_disabled ? "off" : "on", + vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); #endif @@ -4491,6 +4657,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + struct memblock_region *r = NULL, *tmp; +#endif if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; @@ -4504,20 +4673,51 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* - * There can be holes in boot-time mem_map[]s - * handed to this function. They do not - * exist on hotplugged memory. + * There can be holes in boot-time mem_map[]s handed to this + * function. They do not exist on hotplugged memory. + */ + if (context != MEMMAP_EARLY) + goto not_early; + + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) + break; + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + /* + * If not mirrored_kernelcore and ZONE_MOVABLE exists, range + * from zone_movable_pfn[nid] to end of each node should be + * ZONE_MOVABLE not ZONE_NORMAL. skip it. */ - if (context == MEMMAP_EARLY) { - if (!early_pfn_valid(pfn)) + if (!mirrored_kernelcore && zone_movable_pfn[nid]) + if (zone == ZONE_NORMAL && pfn >= zone_movable_pfn[nid]) continue; - if (!early_pfn_in_nid(pfn, nid)) + + /* + * Check given memblock attribute by firmware which can affect + * kernel memory layout. If zone==ZONE_MOVABLE but memory is + * mirrored, it's an overlapped memmap init. skip it. + */ + if (mirrored_kernelcore && zone == ZONE_MOVABLE) { + if (!r || pfn >= memblock_region_memory_end_pfn(r)) { + for_each_memblock(memory, tmp) + if (pfn < memblock_region_memory_end_pfn(tmp)) + break; + r = tmp; + } + if (pfn >= memblock_region_memory_base_pfn(r) && + memblock_is_mirror(r)) { + /* already initialized as NORMAL */ + pfn = memblock_region_memory_end_pfn(r); continue; - if (!update_defer_init(pgdat, pfn, end_pfn, - &nr_initialised)) - break; + } } +#endif +not_early: /* * Mark the block movable so that blocks are reserved for * movable at startup. This will force kernel allocations @@ -4934,11 +5134,6 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, *zone_end_pfn = min(node_end_pfn, arch_zone_highest_possible_pfn[movable_zone]); - /* Adjust for ZONE_MOVABLE starting within this range */ - } else if (*zone_start_pfn < zone_movable_pfn[nid] && - *zone_end_pfn > zone_movable_pfn[nid]) { - *zone_end_pfn = zone_movable_pfn[nid]; - /* Check if this whole range is within ZONE_MOVABLE */ } else if (*zone_start_pfn >= zone_movable_pfn[nid]) *zone_start_pfn = *zone_end_pfn; @@ -4953,31 +5148,31 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *ignored) { - unsigned long zone_start_pfn, zone_end_pfn; - /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; + *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, - &zone_start_pfn, &zone_end_pfn); + zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ - if (zone_end_pfn < node_start_pfn || zone_start_pfn > node_end_pfn) + if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) return 0; /* Move the zone boundaries inside the node if necessary */ - zone_end_pfn = min(zone_end_pfn, node_end_pfn); - zone_start_pfn = max(zone_start_pfn, node_start_pfn); + *zone_end_pfn = min(*zone_end_pfn, node_end_pfn); + *zone_start_pfn = max(*zone_start_pfn, node_start_pfn); /* Return the spanned pages */ - return zone_end_pfn - zone_start_pfn; + return *zone_end_pfn - *zone_start_pfn; } /* @@ -5023,6 +5218,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long zone_start_pfn, zone_end_pfn; + unsigned long nr_absent; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) @@ -5034,7 +5230,39 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, &zone_start_pfn, &zone_end_pfn); - return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); + + /* + * ZONE_MOVABLE handling. + * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages + * and vice versa. + */ + if (zone_movable_pfn[nid]) { + if (mirrored_kernelcore) { + unsigned long start_pfn, end_pfn; + struct memblock_region *r; + + for_each_memblock(memory, r) { + start_pfn = clamp(memblock_region_memory_base_pfn(r), + zone_start_pfn, zone_end_pfn); + end_pfn = clamp(memblock_region_memory_end_pfn(r), + zone_start_pfn, zone_end_pfn); + + if (zone_type == ZONE_MOVABLE && + memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + + if (zone_type == ZONE_NORMAL && + !memblock_is_mirror(r)) + nr_absent += end_pfn - start_pfn; + } + } else { + if (zone_type == ZONE_NORMAL) + nr_absent += node_end_pfn - zone_movable_pfn[nid]; + } + } + + return nr_absent; } #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ @@ -5042,8 +5270,18 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, + unsigned long *zone_start_pfn, + unsigned long *zone_end_pfn, unsigned long *zones_size) { + unsigned int zone; + + *zone_start_pfn = node_start_pfn; + for (zone = 0; zone < zone_type; zone++) + *zone_start_pfn += zones_size[zone]; + + *zone_end_pfn = *zone_start_pfn + zones_size[zone_type]; + return zones_size[zone_type]; } @@ -5072,15 +5310,22 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; + unsigned long zone_start_pfn, zone_end_pfn; unsigned long size, real_size; size = zone_spanned_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, + &zone_start_pfn, + &zone_end_pfn, zones_size); real_size = size - zone_absent_pages_in_node(pgdat->node_id, i, node_start_pfn, node_end_pfn, zholes_size); + if (size) + zone->zone_start_pfn = zone_start_pfn; + else + zone->zone_start_pfn = 0; zone->spanned_pages = size; zone->present_pages = real_size; @@ -5201,7 +5446,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; - unsigned long zone_start_pfn = pgdat->node_start_pfn; int ret; pgdat_resize_init(pgdat); @@ -5217,11 +5461,15 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) #endif init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pgdat->kcompactd_wait); +#endif pgdat_page_ext_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; realsize = freesize = zone->present_pages; @@ -5240,8 +5488,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else - printk(KERN_WARNING - " %s zone: %lu pages exceeds freesize %lu\n", + pr_warn(" %s zone: %lu pages exceeds freesize %lu\n", zone_names[j], memmap_pages, freesize); } @@ -5290,7 +5537,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) ret = init_currently_empty_zone(zone, zone_start_pfn, size); BUG_ON(ret); memmap_init(size, nid, j, zone_start_pfn); - zone_start_pfn += size; } } @@ -5358,6 +5604,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0); +#else + start_pfn = node_start_pfn; #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5448,8 +5696,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { - printk(KERN_WARNING - "Could not find start_pfn for node %d\n", nid); + pr_warn("Could not find start_pfn for node %d\n", nid); return 0; } @@ -5529,6 +5776,36 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* + * If kernelcore=mirror is specified, ignore movablecore option + */ + if (mirrored_kernelcore) { + bool mem_below_4gb_not_mirrored = false; + + for_each_memblock(memory, r) { + if (memblock_is_mirror(r)) + continue; + + nid = r->nid; + + usable_startpfn = memblock_region_memory_base_pfn(r); + + if (usable_startpfn < 0x100000) { + mem_below_4gb_not_mirrored = true; + continue; + } + + zone_movable_pfn[nid] = zone_movable_pfn[nid] ? + min(usable_startpfn, zone_movable_pfn[nid]) : + usable_startpfn; + } + + if (mem_below_4gb_not_mirrored) + pr_warn("This configuration results in unmirrored kernel memory."); + + goto out2; + } + + /* * If movablecore=nn[KMG] was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. If both kernelcore @@ -5788,6 +6065,12 @@ static int __init cmdline_parse_core(char *p, unsigned long *core) */ static int __init cmdline_parse_kernelcore(char *p) { + /* parse kernelcore=mirror */ + if (parse_option_str(p, "mirror")) { + mirrored_kernelcore = true; + return 0; + } + return cmdline_parse_core(p, &required_kernelcore); } @@ -5885,22 +6168,21 @@ void __init mem_init_print_info(const char *str) #undef adj_init_size - pr_info("Memory: %luK/%luK available " - "(%luK kernel code, %luK rwdata, %luK rodata, " - "%luK init, %luK bss, %luK reserved, %luK cma-reserved" + pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved" #ifdef CONFIG_HIGHMEM - ", %luK highmem" + ", %luK highmem" #endif - "%s%s)\n", - nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, - (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT-10), - totalcma_pages << (PAGE_SHIFT-10), + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT - 10), + physpages << (PAGE_SHIFT - 10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10), + totalcma_pages << (PAGE_SHIFT - 10), #ifdef CONFIG_HIGHMEM - totalhigh_pages << (PAGE_SHIFT-10), + totalhigh_pages << (PAGE_SHIFT - 10), #endif - str ? ", " : "", str ? str : ""); + str ? ", " : "", str ? str : ""); } /** @@ -6075,8 +6357,17 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_MIN] = tmp; } - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + /* + * Set the kswapd watermarks distance according to the + * scale factor in proportion to available memory, but + * ensure a minimum size on small systems. + */ + tmp = max_t(u64, tmp >> 2, + mult_frac(zone->managed_pages, + watermark_scale_factor, 10000)); + + zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; + zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; __mod_zone_page_state(zone, NR_ALLOC_BATCH, high_wmark_pages(zone) - low_wmark_pages(zone) - @@ -6217,6 +6508,21 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + if (write) + setup_per_zone_wmarks(); + + return 0; +} + #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -6408,11 +6714,8 @@ void *__init alloc_large_system_hash(const char *tablename, if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", - tablename, - (1UL << log2qty), - ilog2(size) - PAGE_SHIFT, - size); + pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n", + tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size); if (_hash_shift) *_hash_shift = log2qty; @@ -6563,7 +6866,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * This check already skips compound tails of THP * because their page->_count is zero at all time. */ - if (!atomic_read(&page->_count)) { + if (!page_ref_count(page)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; @@ -6913,8 +7216,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); #ifdef CONFIG_DEBUG_VM - printk(KERN_INFO "remove from free list %lx %d %lx\n", - pfn, 1 << order, end_pfn); + pr_info("remove from free list %lx %d %lx\n", + pfn, 1 << order, end_pfn); #endif list_del(&page->lru); rmv_page_order(page); @@ -6927,7 +7230,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) } #endif -#ifdef CONFIG_MEMORY_FAILURE bool is_free_buddy_page(struct page *page) { struct zone *zone = page_zone(page); @@ -6946,4 +7248,3 @@ bool is_free_buddy_page(struct page *page) return order < MAX_ORDER; } -#endif diff --git a/mm/page_ext.c b/mm/page_ext.c index 292ca7b8debd..2d864e64f7fe 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -106,12 +106,15 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -180,12 +183,15 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#ifdef CONFIG_DEBUG_VM +#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. + * + * This check is also necessary for ensuring page poisoning + * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_io.c b/mm/page_io.c index b995a5ba5e8f..18aac7819cc9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -56,31 +56,20 @@ void end_swap_bio_write(struct bio *bio) * Also clear PG_reclaim to avoid rotate_reclaimable_page() */ set_page_dirty(page); - printk(KERN_ALERT "Write-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert("Write-error on swap-device (%u:%u:%llu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } end_page_writeback(page); bio_put(bio); } -static void end_swap_bio_read(struct bio *bio) +static void swap_slot_free_notify(struct page *page) { - struct page *page = bio->bi_io_vec[0].bv_page; - - if (bio->bi_error) { - SetPageError(page); - ClearPageUptodate(page); - printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", - imajor(bio->bi_bdev->bd_inode), - iminor(bio->bi_bdev->bd_inode), - (unsigned long long)bio->bi_iter.bi_sector); - goto out; - } - - SetPageUptodate(page); + struct swap_info_struct *sis; + struct gendisk *disk; /* * There is no guarantee that the page is in swap cache - the software @@ -88,42 +77,59 @@ static void end_swap_bio_read(struct bio *bio) * swapcache page. So we must check PG_swapcache before proceeding with * this optimization. */ - if (likely(PageSwapCache(page))) { - struct swap_info_struct *sis; + if (unlikely(!PageSwapCache(page))) + return; - sis = page_swap_info(page); - if (sis->flags & SWP_BLKDEV) { - /* - * The swap subsystem performs lazy swap slot freeing, - * expecting that the page will be swapped out again. - * So we can avoid an unnecessary write if the page - * isn't redirtied. - * This is good for real swap storage because we can - * reduce unnecessary I/O and enhance wear-leveling - * if an SSD is used as the as swap device. - * But if in-memory swap device (eg zram) is used, - * this causes a duplicated copy between uncompressed - * data in VM-owned memory and compressed data in - * zram-owned memory. So let's free zram-owned memory - * and make the VM-owned decompressed page *dirty*, - * so the page should be swapped out somewhere again if - * we again wish to reclaim it. - */ - struct gendisk *disk = sis->bdev->bd_disk; - if (disk->fops->swap_slot_free_notify) { - swp_entry_t entry; - unsigned long offset; + sis = page_swap_info(page); + if (!(sis->flags & SWP_BLKDEV)) + return; - entry.val = page_private(page); - offset = swp_offset(entry); + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the as swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. + */ + disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; - SetPageDirty(page); - disk->fops->swap_slot_free_notify(sis->bdev, - offset); - } - } + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } +} + +static void end_swap_bio_read(struct bio *bio) +{ + struct page *page = bio->bi_io_vec[0].bv_page; + + if (bio->bi_error) { + SetPageError(page); + ClearPageUptodate(page); + pr_alert("Read-error on swap-device (%u:%u:%llu)\n", + imajor(bio->bi_bdev->bd_inode), + iminor(bio->bi_bdev->bd_inode), + (unsigned long long)bio->bi_iter.bi_sector); + goto out; } + SetPageUptodate(page); + swap_slot_free_notify(page); out: unlock_page(page); bio_put(bio); @@ -216,7 +222,7 @@ reprobe: out: return ret; bad_bmap: - printk(KERN_ERR "swapon: swapfile has holes\n"); + pr_err("swapon: swapfile has holes\n"); ret = -EINVAL; goto out; } @@ -290,8 +296,8 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, */ set_page_dirty(page); ClearPageReclaim(page); - pr_err_ratelimited("Write error on dio swapfile (%Lu)\n", - page_file_offset(page)); + pr_err_ratelimited("Write error on dio swapfile (%llu)\n", + page_file_offset(page)); } end_page_writeback(page); return ret; @@ -347,6 +353,7 @@ int swap_readpage(struct page *page) ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); if (!ret) { + swap_slot_free_notify(page); count_vm_event(PSWPIN); return 0; } diff --git a/mm/page_owner.c b/mm/page_owner.c index 983c3a10fa07..ac3d8d129974 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -5,10 +5,12 @@ #include <linux/bootmem.h> #include <linux/stacktrace.h> #include <linux/page_owner.h> +#include <linux/jump_label.h> +#include <linux/migrate.h> #include "internal.h" static bool page_owner_disabled = true; -bool page_owner_inited __read_mostly; +DEFINE_STATIC_KEY_FALSE(page_owner_inited); static void init_early_allocated_pages(void); @@ -37,7 +39,7 @@ static void init_page_owner(void) if (page_owner_disabled) return; - page_owner_inited = true; + static_branch_enable(&page_owner_inited); init_early_allocated_pages(); } @@ -72,10 +74,18 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) page_ext->order = order; page_ext->gfp_mask = gfp_mask; page_ext->nr_entries = trace.nr_entries; + page_ext->last_migrate_reason = -1; __set_bit(PAGE_EXT_OWNER, &page_ext->flags); } +void __set_page_owner_migrate_reason(struct page *page, int reason) +{ + struct page_ext *page_ext = lookup_page_ext(page); + + page_ext->last_migrate_reason = reason; +} + gfp_t __get_page_owner_gfp(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); @@ -83,6 +93,31 @@ gfp_t __get_page_owner_gfp(struct page *page) return page_ext->gfp_mask; } +void __copy_page_owner(struct page *oldpage, struct page *newpage) +{ + struct page_ext *old_ext = lookup_page_ext(oldpage); + struct page_ext *new_ext = lookup_page_ext(newpage); + int i; + + new_ext->order = old_ext->order; + new_ext->gfp_mask = old_ext->gfp_mask; + new_ext->nr_entries = old_ext->nr_entries; + + for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++) + new_ext->trace_entries[i] = old_ext->trace_entries[i]; + + /* + * We don't clear the bit on the oldpage as it's going to be freed + * after migration. Until then, the info can be useful in case of + * a bug, and the overal stats will be off a bit only temporarily. + * Also, migrate_misplaced_transhuge_page() can still fail the + * migration and then we want the oldpage to retain the info. But + * in that case we also don't need to explicitly clear the info from + * the new page, which will be freed. + */ + __set_bit(PAGE_EXT_OWNER, &new_ext->flags); +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_ext *page_ext) @@ -100,8 +135,9 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask 0x%x\n", - page_ext->order, page_ext->gfp_mask); + "Page allocated via order %u, mask %#x(%pGg)\n", + page_ext->order, page_ext->gfp_mask, + &page_ext->gfp_mask); if (ret >= count) goto err; @@ -110,23 +146,12 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, pageblock_mt = get_pfnblock_migratetype(page, pfn); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, - "PFN %lu Block %lu type %d %s Flags %s%s%s%s%s%s%s%s%s%s%s%s\n", + "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, + migratetype_names[page_mt], pfn >> pageblock_order, - pageblock_mt, - pageblock_mt != page_mt ? "Fallback" : " ", - PageLocked(page) ? "K" : " ", - PageError(page) ? "E" : " ", - PageReferenced(page) ? "R" : " ", - PageUptodate(page) ? "U" : " ", - PageDirty(page) ? "D" : " ", - PageLRU(page) ? "L" : " ", - PageActive(page) ? "A" : " ", - PageSlab(page) ? "S" : " ", - PageWriteback(page) ? "W" : " ", - PageCompound(page) ? "C" : " ", - PageSwapCache(page) ? "B" : " ", - PageMappedToDisk(page) ? "M" : " "); + migratetype_names[pageblock_mt], + page->flags, &page->flags); if (ret >= count) goto err; @@ -135,6 +160,14 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; + if (page_ext->last_migrate_reason != -1) { + ret += snprintf(kbuf + ret, count - ret, + "Page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); + if (ret >= count) + goto err; + } + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; @@ -150,6 +183,30 @@ err: return -ENOMEM; } +void __dump_page_owner(struct page *page) +{ + struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { + .nr_entries = page_ext->nr_entries, + .entries = &page_ext->trace_entries[0], + }; + gfp_t gfp_mask = page_ext->gfp_mask; + int mt = gfpflags_to_migratetype(gfp_mask); + + if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { + pr_alert("page_owner info is not active (free page?)\n"); + return; + } + + pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", + page_ext->order, migratetype_names[mt], gfp_mask, &gfp_mask); + print_stack_trace(&trace, 0); + + if (page_ext->last_migrate_reason != -1) + pr_alert("page has been migrated, last migrate reason: %s\n", + migrate_reason_names[page_ext->last_migrate_reason]); +} + static ssize_t read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -157,7 +214,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) struct page *page; struct page_ext *page_ext; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return -EINVAL; page = NULL; @@ -305,7 +362,7 @@ static int __init pageowner_init(void) { struct dentry *dentry; - if (!page_owner_inited) { + if (!static_branch_unlikely(&page_owner_inited)) { pr_info("page_owner is disabled\n"); return 0; } diff --git a/mm/debug-pagealloc.c b/mm/page_poison.c index 5bf5906ce13b..479e7ea2bea6 100644 --- a/mm/debug-pagealloc.c +++ b/mm/page_poison.c @@ -6,22 +6,48 @@ #include <linux/poison.h> #include <linux/ratelimit.h> -static bool page_poisoning_enabled __read_mostly; +static bool __page_poisoning_enabled __read_mostly; +static bool want_page_poisoning __read_mostly; -static bool need_page_poisoning(void) +static int early_page_poison_param(char *buf) { - if (!debug_pagealloc_enabled()) - return false; + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + want_page_poisoning = true; + else if (strcmp(buf, "off") == 0) + want_page_poisoning = false; - return true; + return 0; +} +early_param("page_poison", early_page_poison_param); + +bool page_poisoning_enabled(void) +{ + return __page_poisoning_enabled; +} + +static bool need_page_poisoning(void) +{ + return want_page_poisoning; } static void init_page_poisoning(void) { - if (!debug_pagealloc_enabled()) - return; + /* + * page poisoning is debug page alloc for some arches. If either + * of those options are enabled, enable poisoning + */ + if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { + if (!want_page_poisoning && !debug_pagealloc_enabled()) + return; + } else { + if (!want_page_poisoning) + return; + } - page_poisoning_enabled = true; + __page_poisoning_enabled = true; } struct page_ext_operations page_poisoning_ops = { @@ -45,11 +71,14 @@ static inline void clear_page_poison(struct page *page) __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } -static inline bool page_poison(struct page *page) +bool page_is_poisoned(struct page *page) { struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (!page_ext) + return false; + return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -83,6 +112,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) unsigned char *start; unsigned char *end; + if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) + return; + start = memchr_inv(mem, PAGE_POISON, bytes); if (!start) return; @@ -95,9 +127,9 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) if (!__ratelimit(&ratelimit)) return; else if (start == end && single_bit_flip(*start, PAGE_POISON)) - printk(KERN_ERR "pagealloc: single bit error\n"); + pr_err("pagealloc: single bit error\n"); else - printk(KERN_ERR "pagealloc: memory corruption\n"); + pr_err("pagealloc: memory corruption\n"); print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start, end - start + 1, 1); @@ -108,7 +140,7 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_poison(page)) + if (!page_is_poisoned(page)) return; addr = kmap_atomic(page); @@ -125,9 +157,9 @@ static void unpoison_pages(struct page *page, int n) unpoison_page(page + i); } -void __kernel_map_pages(struct page *page, int numpages, int enable) +void kernel_poison_pages(struct page *page, int numpages, int enable) { - if (!page_poisoning_enabled) + if (!page_poisoning_enabled()) return; if (enable) @@ -135,3 +167,10 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) else poison_pages(page, numpages); } + +#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + /* This function does nothing, all work is done via poison pages */ +} +#endif diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 10e3d0b8a86d..d66911ff42d9 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -95,7 +95,7 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) /* all units must be in a single group */ if (ai->nr_groups != 1) { - printk(KERN_CRIT "percpu: can't handle more than one groups\n"); + pr_crit("can't handle more than one group\n"); return -EINVAL; } @@ -103,8 +103,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) alloc_pages = roundup_pow_of_two(nr_pages); if (alloc_pages > nr_pages) - printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n", - alloc_pages - nr_pages); + pr_warn("wasting %zu pages per chunk\n", + alloc_pages - nr_pages); return 0; } diff --git a/mm/percpu.c b/mm/percpu.c index 998607adf6eb..0c59684f1ff2 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -53,6 +53,8 @@ * setup the first chunk containing the kernel static percpu area */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/bitmap.h> #include <linux/bootmem.h> #include <linux/err.h> @@ -888,8 +890,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, size = ALIGN(size, 2); if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { - WARN(true, "illegal size (%zu) or align (%zu) for " - "percpu allocation\n", size, align); + WARN(true, "illegal size (%zu) or align (%zu) for percpu allocation\n", + size, align); return NULL; } @@ -1033,11 +1035,11 @@ fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: if (!is_atomic && warn_limit) { - pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", - size, align, is_atomic, err); + pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", + size, align, is_atomic, err); dump_stack(); if (!--warn_limit) - pr_info("PERCPU: limit reached, disable warning\n"); + pr_info("limit reached, disable warning\n"); } if (is_atomic) { /* see the flag handling in pcpu_blance_workfn() */ @@ -1449,20 +1451,20 @@ static void pcpu_dump_alloc_info(const char *lvl, for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { - printk(KERN_CONT "\n"); + pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } - printk(KERN_CONT "[%0*d] ", group_width, group); + pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) - printk(KERN_CONT "%0*d ", cpu_width, - gi->cpu_map[unit]); + pr_cont("%0*d ", + cpu_width, gi->cpu_map[unit]); else - printk(KERN_CONT "%s ", empty_str); + pr_cont("%s ", empty_str); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } /** @@ -1538,8 +1540,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ - pr_emerg("PERCPU: failed to initialize, %s", #cond); \ - pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \ + pr_emerg("failed to initialize, %s\n", #cond); \ + pr_emerg("cpu_possible_mask=%*pb\n", \ cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ @@ -1723,7 +1725,7 @@ static int __init percpu_alloc_setup(char *str) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else - pr_warning("PERCPU: unknown allocator %s specified\n", str); + pr_warn("unknown allocator %s specified\n", str); return 0; } @@ -2016,9 +2018,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { - pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " - "space 0x%lx\n", max_distance, - VMALLOC_TOTAL); + pr_warn("max_distance=0x%zx too large for vmalloc space 0x%lx\n", + max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; @@ -2026,7 +2027,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, #endif } - pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); @@ -2100,8 +2101,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { - pr_warning("PERCPU: failed to allocate %s page " - "for cpu%u\n", psize_str, cpu); + pr_warn("failed to allocate %s page for cpu%u\n", + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ @@ -2140,7 +2141,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", unit_pages, psize_str, vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 06a005b979a7..71c5f9109f2a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -84,20 +84,6 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE - -/* - * ARCHes with special requirements for evicting THP backing TLB entries can - * implement this. Otherwise also, it can help optimize normal TLB flush in - * THP regime. stock flush_tlb_range() typically has optimization to nuke the - * entire TLB if flush span is greater than a threshold, which will - * likely be true for a single huge page. Thus a single thp flush will - * invalidate the entire TLB which is not desirable. - * e.g. see arch/arc: flush_pmd_tlb_range - */ -#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) -#endif - #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 5d453e58ddbf..07514d41ebcc 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -98,9 +98,14 @@ static int process_vm_rw_single_vec(unsigned long addr, int pages = min(nr_pages, max_pages_per_loop); size_t bytes; - /* Get the pages we're interested in */ - pages = get_user_pages_unlocked(task, mm, pa, pages, - vm_write, 0, process_pages); + /* + * Get the pages we're interested in. We must + * add FOLL_REMOTE because task/mm might not + * current/current->mm + */ + pages = __get_user_pages_unlocked(task, mm, pa, pages, + vm_write, 0, process_pages, + FOLL_REMOTE); if (pages <= 0) return -EFAULT; diff --git a/mm/quicklist.c b/mm/quicklist.c index 942212970529..daf6ff6e199a 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -8,7 +8,7 @@ * improved on it. * * Copyright (C) 2007 SGI, - * Christoph Lameter <clameter@sgi.com> + * Christoph Lameter <cl@linux.com> * Generalized, added support for multiple lists and * constructors / destructors. */ diff --git a/mm/rmap.c b/mm/rmap.c index 79f3bf047f38..c399a0d41b31 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1287,21 +1287,17 @@ void page_add_new_anon_rmap(struct page *page, */ void page_add_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); } - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); } static void page_remove_file_rmap(struct page *page) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_begin_page_stat(page); + lock_page_memcg(page); /* Hugepages are not counted in NR_FILE_MAPPED for now. */ if (unlikely(PageHuge(page))) { @@ -1320,12 +1316,12 @@ static void page_remove_file_rmap(struct page *page) * pte lock(a spinlock) is held, which implies preemption disabled. */ __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); + mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED); if (unlikely(PageMlocked(page))) clear_page_mlock(page); out: - mem_cgroup_end_page_stat(memcg); + unlock_page_memcg(page); } static void page_remove_anon_compound_rmap(struct page *page) @@ -1435,6 +1431,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) goto out; + if (flags & TTU_SPLIT_HUGE_PMD) { + split_huge_pmd_address(vma, address, + flags & TTU_MIGRATION, page); + /* check if we have anything to do after split */ + if (page_mapcount(page) == 0) + goto out; + } + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -1580,10 +1584,10 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return is_vma_temporary_stack(vma); } -static int page_not_mapped(struct page *page) +static int page_mapcount_is_zero(struct page *page) { - return !page_mapped(page); -}; + return !page_mapcount(page); +} /** * try_to_unmap - try to remove all page table mappings to a page @@ -1610,12 +1614,10 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = &rp, - .done = page_not_mapped, + .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; - VM_BUG_ON_PAGE(!PageHuge(page) && PageTransHuge(page), page); - /* * During exec, a temporary VMA is setup and later moved. * The VMA is moved under the anon_vma lock but not the @@ -1627,9 +1629,12 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; - ret = rmap_walk(page, &rwc); + if (flags & TTU_RMAP_LOCKED) + ret = rmap_walk_locked(page, &rwc); + else + ret = rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapped(page)) { + if (ret != SWAP_MLOCK && !page_mapcount(page)) { ret = SWAP_SUCCESS; if (rp.lazyfreed && !PageDirty(page)) ret = SWAP_LZFREE; @@ -1637,6 +1642,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) return ret; } +static int page_not_mapped(struct page *page) +{ + return !page_mapped(page); +}; + /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1719,14 +1729,21 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, + bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff; struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - anon_vma = rmap_walk_anon_lock(page, rwc); + if (locked) { + anon_vma = page_anon_vma(page); + /* anon_vma disappear under us? */ + VM_BUG_ON_PAGE(!anon_vma, page); + } else { + anon_vma = rmap_walk_anon_lock(page, rwc); + } if (!anon_vma) return ret; @@ -1746,7 +1763,9 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) if (rwc->done && rwc->done(page)) break; } - anon_vma_unlock_read(anon_vma); + + if (!locked) + anon_vma_unlock_read(anon_vma); return ret; } @@ -1763,9 +1782,10 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) +static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, + bool locked) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); pgoff_t pgoff; struct vm_area_struct *vma; int ret = SWAP_AGAIN; @@ -1782,7 +1802,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) return ret; pgoff = page_to_pgoff(page); - i_mmap_lock_read(mapping); + if (!locked) + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { unsigned long address = vma_address(page, vma); @@ -1799,7 +1820,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) } done: - i_mmap_unlock_read(mapping); + if (!locked) + i_mmap_unlock_read(mapping); return ret; } @@ -1808,9 +1830,20 @@ int rmap_walk(struct page *page, struct rmap_walk_control *rwc) if (unlikely(PageKsm(page))) return rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc); + return rmap_walk_anon(page, rwc, false); + else + return rmap_walk_file(page, rwc, false); +} + +/* Like rmap_walk, but caller holds relevant rmap lock */ +int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +{ + /* no ksm support for now */ + VM_BUG_ON_PAGE(PageKsm(page), page); + if (PageAnon(page)) + return rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc); + return rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/shmem.c b/mm/shmem.c index 440e2a7e6c1c..9428c51ab2d6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -376,28 +376,23 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { if (iter.index >= end) break; page = radix_tree_deref_slot(slot); - /* - * This should only be possible to happen at index 0, so we - * don't need to reset the counter, nor do we risk infinite - * restarts. - */ - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } if (radix_tree_exceptional_entry(page)) swapped++; if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } @@ -1116,7 +1111,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_replace_page(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage); lru_cache_add_anon(newpage); *pagep = newpage; } @@ -1947,12 +1942,13 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } } else if (page_count(page) - page_mapcount(page) > 1) { spin_lock_irq(&mapping->tree_lock); radix_tree_tag_set(&mapping->page_tree, iter.index, @@ -1962,8 +1958,7 @@ restart: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2000,14 +1995,15 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); -restart: radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, SHMEM_TAG_PINNED) { page = radix_tree_deref_slot(slot); if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } page = NULL; } @@ -2032,8 +2028,7 @@ restart: continue_resched: if (need_resched()) { cond_resched_rcu(); - start = iter.index + 1; - goto restart; + slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2823,9 +2818,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if ((value = strchr(this_char,'=')) != NULL) { *value++ = 0; } else { - printk(KERN_ERR - "tmpfs: No value for mount option '%s'\n", - this_char); + pr_err("tmpfs: No value for mount option '%s'\n", + this_char); goto error; } @@ -2880,8 +2874,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, if (mpol_parse_str(value, &mpol)) goto bad_val; } else { - printk(KERN_ERR "tmpfs: Bad mount option %s\n", - this_char); + pr_err("tmpfs: Bad mount option %s\n", this_char); goto error; } } @@ -2889,7 +2882,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, return 0; bad_val: - printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n", + pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", value, this_char); error: mpol_put(mpol); @@ -3286,14 +3279,14 @@ int __init shmem_init(void) error = register_filesystem(&shmem_fs_type); if (error) { - printk(KERN_ERR "Could not register tmpfs\n"); + pr_err("Could not register tmpfs\n"); goto out2; } shm_mnt = kern_mount(&shmem_fs_type); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); - printk(KERN_ERR "Could not kern_mount tmpfs\n"); + pr_err("Could not kern_mount tmpfs\n"); goto out1; } return 0; diff --git a/mm/slab.c b/mm/slab.c index 621fbcb35a36..17e2848979c5 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -169,12 +169,6 @@ typedef unsigned short freelist_idx_t; #define SLAB_OBJ_MAX_NUM ((1 << sizeof(freelist_idx_t) * BITS_PER_BYTE) - 1) /* - * true if a page was allocated from pfmemalloc reserves for network-based - * swap - */ -static bool pfmemalloc_active __read_mostly; - -/* * struct array_cache * * Purpose: @@ -195,10 +189,6 @@ struct array_cache { * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. - * - * Entries should not be directly dereferenced as - * entries belonging to slabs marked pfmemalloc will - * have the lower bits set SLAB_OBJ_PFMEMALLOC */ }; @@ -207,33 +197,6 @@ struct alien_cache { struct array_cache ac; }; -#define SLAB_OBJ_PFMEMALLOC 1 -static inline bool is_obj_pfmemalloc(void *objp) -{ - return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; -} - -static inline void set_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); - return; -} - -static inline void clear_obj_pfmemalloc(void **objp) -{ - *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); -} - -/* - * bootstrap: The caches do not work without cpuarrays anymore, but the - * cpuarrays are allocated from the generic caches... - */ -#define BOOT_CPUCACHE_ENTRIES 1 -struct arraycache_init { - struct array_cache cache; - void *entries[BOOT_CPUCACHE_ENTRIES]; -}; - /* * Need this for bootstrapping a per node allocator. */ @@ -280,9 +243,10 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) +#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) #define CFLGS_OFF_SLAB (0x80000000UL) +#define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) -#define OFF_SLAB_MIN_SIZE (max_t(size_t, PAGE_SIZE >> 5, KMALLOC_MIN_SIZE + 1)) #define BATCHREFILL_LIMIT 16 /* @@ -390,36 +354,26 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#define OBJECT_FREE (0) -#define OBJECT_ACTIVE (1) - #ifdef CONFIG_DEBUG_SLAB_LEAK -static void set_obj_status(struct page *page, int idx, int val) +static inline bool is_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; - status[idx] = val; + return atomic_read(&cachep->store_user_clean) == 1; } -static inline unsigned int get_obj_status(struct page *page, int idx) +static inline void set_store_user_clean(struct kmem_cache *cachep) { - int freelist_size; - char *status; - struct kmem_cache *cachep = page->slab_cache; - - freelist_size = cachep->num * sizeof(freelist_idx_t); - status = (char *)page->freelist + freelist_size; + atomic_set(&cachep->store_user_clean, 1); +} - return status[idx]; +static inline void set_store_user_dirty(struct kmem_cache *cachep) +{ + if (is_store_user_clean(cachep)) + atomic_set(&cachep->store_user_clean, 0); } #else -static inline void set_obj_status(struct page *page, int idx, int val) {} +static inline void set_store_user_dirty(struct kmem_cache *cachep) {} #endif @@ -457,6 +411,7 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } +#define BOOT_CPUCACHE_ENTRIES 1 /* internal cache of cache description objs */ static struct kmem_cache kmem_cache_boot = { .batchcount = 1, @@ -475,61 +430,13 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return this_cpu_ptr(cachep->cpu_cache); } -static size_t calculate_freelist_size(int nr_objs, size_t align) -{ - size_t freelist_size; - - freelist_size = nr_objs * sizeof(freelist_idx_t); - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size += nr_objs * sizeof(char); - - if (align) - freelist_size = ALIGN(freelist_size, align); - - return freelist_size; -} - -static int calculate_nr_objs(size_t slab_size, size_t buffer_size, - size_t idx_size, size_t align) -{ - int nr_objs; - size_t remained_size; - size_t freelist_size; - int extra_space = 0; - - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - extra_space = sizeof(char); - /* - * Ignore padding for the initial guess. The padding - * is at most @align-1 bytes, and @buffer_size is at - * least @align. In the worst case, this result will - * be one greater than the number of objects that fit - * into the memory allocation when taking the padding - * into account. - */ - nr_objs = slab_size / (buffer_size + idx_size + extra_space); - - /* - * This calculated number will be either the right - * amount, or one greater than what we want. - */ - remained_size = slab_size - nr_objs * buffer_size; - freelist_size = calculate_freelist_size(nr_objs, align); - if (remained_size < freelist_size) - nr_objs--; - - return nr_objs; -} - /* * Calculate the number of objects and left-over bytes for a given buffer size. */ -static void cache_estimate(unsigned long gfporder, size_t buffer_size, - size_t align, int flags, size_t *left_over, - unsigned int *num) +static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, + unsigned long flags, size_t *left_over) { - int nr_objs; - size_t mgmt_size; + unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; /* @@ -537,26 +444,28 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One unsigned int for each object - * - Padding to respect alignment of @align * - @buffer_size bytes for each object + * - One freelist_idx_t for each object + * + * We don't need to consider alignment of freelist because + * freelist will be at the end of slab page. The objects will be + * at the correct alignment. * * If the slab management structure is off the slab, then the * alignment will already be calculated into the size. Because * the slabs are all pages aligned, the objects will be at the * correct alignment when allocated. */ - if (flags & CFLGS_OFF_SLAB) { - mgmt_size = 0; - nr_objs = slab_size / buffer_size; - + if (flags & (CFLGS_OBJFREELIST_SLAB | CFLGS_OFF_SLAB)) { + num = slab_size / buffer_size; + *left_over = slab_size % buffer_size; } else { - nr_objs = calculate_nr_objs(slab_size, buffer_size, - sizeof(freelist_idx_t), align); - mgmt_size = calculate_freelist_size(nr_objs, align); + num = slab_size / (buffer_size + sizeof(freelist_idx_t)); + *left_over = slab_size % + (buffer_size + sizeof(freelist_idx_t)); } - *num = nr_objs; - *left_over = slab_size - nr_objs*buffer_size - mgmt_size; + + return num; } #if DEBUG @@ -565,7 +474,7 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) { - printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", + pr_err("slab error in %s(): cache `%s': %s\n", function, cachep->name, msg); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); @@ -687,120 +596,21 @@ static struct array_cache *alloc_arraycache(int node, int entries, return ac; } -static inline bool is_slab_pfmemalloc(struct page *page) +static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, + struct page *page, void *objp) { - return PageSlabPfmemalloc(page); -} - -/* Clears pfmemalloc_active if no slabs have pfmalloc set */ -static void recheck_pfmemalloc_active(struct kmem_cache *cachep, - struct array_cache *ac) -{ - struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); - struct page *page; - unsigned long flags; - - if (!pfmemalloc_active) - return; - - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_partial, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - list_for_each_entry(page, &n->slabs_free, lru) - if (is_slab_pfmemalloc(page)) - goto out; - - pfmemalloc_active = false; -out: - spin_unlock_irqrestore(&n->list_lock, flags); -} - -static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, - gfp_t flags, bool force_refill) -{ - int i; - void *objp = ac->entry[--ac->avail]; - - /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ - if (unlikely(is_obj_pfmemalloc(objp))) { - struct kmem_cache_node *n; - - if (gfp_pfmemalloc_allowed(flags)) { - clear_obj_pfmemalloc(&objp); - return objp; - } - - /* The caller cannot use PFMEMALLOC objects, find another one */ - for (i = 0; i < ac->avail; i++) { - /* If a !PFMEMALLOC object is found, swap them */ - if (!is_obj_pfmemalloc(ac->entry[i])) { - objp = ac->entry[i]; - ac->entry[i] = ac->entry[ac->avail]; - ac->entry[ac->avail] = objp; - return objp; - } - } - - /* - * If there are empty slabs on the slabs_free list and we are - * being forced to refill the cache, mark this one !pfmemalloc. - */ - n = get_node(cachep, numa_mem_id()); - if (!list_empty(&n->slabs_free) && force_refill) { - struct page *page = virt_to_head_page(objp); - ClearPageSlabPfmemalloc(page); - clear_obj_pfmemalloc(&objp); - recheck_pfmemalloc_active(cachep, ac); - return objp; - } - - /* No !PFMEMALLOC objects available */ - ac->avail++; - objp = NULL; - } - - return objp; -} - -static inline void *ac_get_obj(struct kmem_cache *cachep, - struct array_cache *ac, gfp_t flags, bool force_refill) -{ - void *objp; - - if (unlikely(sk_memalloc_socks())) - objp = __ac_get_obj(cachep, ac, flags, force_refill); - else - objp = ac->entry[--ac->avail]; - - return objp; -} - -static noinline void *__ac_put_obj(struct kmem_cache *cachep, - struct array_cache *ac, void *objp) -{ - if (unlikely(pfmemalloc_active)) { - /* Some pfmemalloc slabs exist, check if this is one */ - struct page *page = virt_to_head_page(objp); - if (PageSlabPfmemalloc(page)) - set_obj_pfmemalloc(&objp); - } + struct kmem_cache_node *n; + int page_node; + LIST_HEAD(list); - return objp; -} + page_node = page_to_nid(page); + n = get_node(cachep, page_node); -static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, - void *objp) -{ - if (unlikely(sk_memalloc_socks())) - objp = __ac_put_obj(cachep, ac, objp); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, page_node, &list); + spin_unlock(&n->list_lock); - ac->entry[ac->avail++] = objp; + slabs_destroy(cachep, &list); } /* @@ -860,7 +670,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static inline gfp_t gfp_exact_node(gfp_t flags) { - return flags; + return flags & ~__GFP_NOFAIL; } #else /* CONFIG_NUMA */ @@ -1003,7 +813,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, ac, page_node, &list); } - ac_put_obj(cachep, ac, objp); + ac->entry[ac->avail++] = objp; spin_unlock(&alien->lock); slabs_destroy(cachep, &list); } else { @@ -1031,12 +841,12 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } /* - * Construct gfp mask to allocate from a specific node but do not direct reclaim - * or warn about failures. kswapd may still wake to reclaim in the background. + * Construct gfp mask to allocate from a specific node but do not reclaim or + * warn about failures. */ static inline gfp_t gfp_exact_node(gfp_t flags) { - return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_DIRECT_RECLAIM; + return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~(__GFP_RECLAIM|__GFP_NOFAIL); } #endif @@ -1540,10 +1350,9 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) return; - printk(KERN_WARNING - "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nodeid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", + pr_warn("SLAB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nodeid, gfpflags, &gfpflags); + pr_warn(" cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { @@ -1567,8 +1376,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) num_slabs += active_slabs; num_objs = num_slabs * cachep->num; - printk(KERN_WARNING - " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } @@ -1604,10 +1412,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ - if (page_is_pfmemalloc(page)) - pfmemalloc_active = true; - nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) add_zone_page_state(page_zone(page), @@ -1615,8 +1419,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); + __SetPageSlab(page); - if (page_is_pfmemalloc(page)) + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ + if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -1636,9 +1442,10 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, */ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) { - const unsigned long nr_freed = (1 << cachep->gfporder); + int order = cachep->gfporder; + unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, cachep->gfporder); + kmemcheck_free_shadow(page, order); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) sub_zone_page_state(page_zone(page), @@ -1655,7 +1462,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_kmem_pages(page, cachep->gfporder); + memcg_uncharge_slab(page, order, cachep); + __free_pages(page, order); } static void kmem_rcu_free(struct rcu_head *head) @@ -1670,6 +1478,14 @@ static void kmem_rcu_free(struct rcu_head *head) } #if DEBUG +static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) +{ + if (debug_pagealloc_enabled() && OFF_SLAB(cachep) && + (cachep->size % PAGE_SIZE) == 0) + return true; + + return false; +} #ifdef CONFIG_DEBUG_PAGEALLOC static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, @@ -1703,6 +1519,23 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, } *addr++ = 0x87654321; } + +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) +{ + if (!is_debug_pagealloc_cache(cachep)) + return; + + if (caller) + store_stackinfo(cachep, objp, caller); + + kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); +} + +#else +static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, + int map, unsigned long caller) {} + #endif static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) @@ -1720,7 +1553,7 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x: ", offset); + pr_err("%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; @@ -1733,13 +1566,11 @@ static void dump_line(char *data, int offset, int limit) if (bad_count == 1) { error ^= POISON_FREE; if (!(error & (error - 1))) { - printk(KERN_ERR "Single bit error detected. Probably " - "bad RAM.\n"); + pr_err("Single bit error detected. Probably bad RAM.\n"); #ifdef CONFIG_X86 - printk(KERN_ERR "Run memtest86+ or a similar memory " - "test tool.\n"); + pr_err("Run memtest86+ or a similar memory test tool.\n"); #else - printk(KERN_ERR "Run a memory test tool.\n"); + pr_err("Run a memory test tool.\n"); #endif } } @@ -1754,13 +1585,13 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) char *realobj; if (cachep->flags & SLAB_RED_ZONE) { - printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", - *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + pr_err("Redzone: 0x%llx/0x%llx\n", + *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } if (cachep->flags & SLAB_STORE_USER) { - printk(KERN_ERR "Last user: [<%p>](%pSR)\n", + pr_err("Last user: [<%p>](%pSR)\n", *dbg_userword(cachep, objp), *dbg_userword(cachep, objp)); } @@ -1781,6 +1612,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) int size, i; int lines = 0; + if (is_debug_pagealloc_cache(cachep)) + return; + realobj = (char *)objp + obj_offset(cachep); size = cachep->object_size; @@ -1793,9 +1627,9 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Mismatch ! */ /* Print header */ if (lines == 0) { - printk(KERN_ERR - "Slab corruption (%s): %s start=%p, len=%d\n", - print_tainted(), cachep->name, realobj, size); + pr_err("Slab corruption (%s): %s start=%p, len=%d\n", + print_tainted(), cachep->name, + realobj, size); print_objinfo(cachep, objp, 0); } /* Hexdump the affected line */ @@ -1822,15 +1656,13 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) if (objnr) { objp = index_to_obj(cachep, page, objnr - 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Prev obj: start=%p, len=%d\n", - realobj, size); + pr_err("Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { objp = index_to_obj(cachep, page, objnr + 1); realobj = (char *)objp + obj_offset(cachep); - printk(KERN_ERR "Next obj: start=%p, len=%d\n", - realobj, size); + pr_err("Next obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } } @@ -1842,28 +1674,24 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct page *page) { int i; + + if (OBJFREELIST_SLAB(cachep) && cachep->flags & SLAB_POISON) { + poison_obj(cachep, page->freelist - obj_offset(cachep), + POISON_FREE); + } + for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if (cachep->size % PAGE_SIZE == 0 && - OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "start of a freed object " - "was overwritten"); + slab_error(cachep, "start of a freed object was overwritten"); if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "end of a freed object " - "was overwritten"); + slab_error(cachep, "end of a freed object was overwritten"); } } } @@ -1916,7 +1744,6 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created * @size: size of objects to be created in this cache. - * @align: required alignment for the objects. * @flags: slab allocation flags * * Also calculates the number of objects per slab. @@ -1926,9 +1753,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, size_t align, unsigned long flags) + size_t size, unsigned long flags) { - unsigned long offslab_limit; size_t left_over = 0; int gfporder; @@ -1936,7 +1762,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, unsigned int num; size_t remainder; - cache_estimate(gfporder, size, align, flags, &remainder, &num); + num = cache_estimate(gfporder, size, flags, &remainder); if (!num) continue; @@ -1945,19 +1771,24 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, break; if (flags & CFLGS_OFF_SLAB) { - size_t freelist_size_per_obj = sizeof(freelist_idx_t); + struct kmem_cache *freelist_cache; + size_t freelist_size; + + freelist_size = num * sizeof(freelist_idx_t); + freelist_cache = kmalloc_slab(freelist_size, 0u); + if (!freelist_cache) + continue; + /* - * Max number of objs-per-slab for caches which - * use off-slab slabs. Needed to avoid a possible - * looping condition in cache_grow(). + * Needed to avoid possible looping condition + * in cache_grow() */ - if (IS_ENABLED(CONFIG_DEBUG_SLAB_LEAK)) - freelist_size_per_obj += sizeof(char); - offslab_limit = size; - offslab_limit /= freelist_size_per_obj; + if (OFF_SLAB(freelist_cache)) + continue; - if (num > offslab_limit) - break; + /* check if off slab has enough benefit */ + if (freelist_cache->size > cachep->size / 2) + continue; } /* Found something acceptable - save it away */ @@ -2075,6 +1906,79 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return cachep; } +static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + return false; + + left = calculate_slab_order(cachep, size, + flags | CFLGS_OBJFREELIST_SLAB); + if (!cachep->num) + return false; + + if (cachep->num * sizeof(freelist_idx_t) > cachep->object_size) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_off_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + /* + * Always use on-slab management when SLAB_NOLEAKTRACE + * to avoid recursive calls into kmemleak. + */ + if (flags & SLAB_NOLEAKTRACE) + return false; + + /* + * Size is large, assume best to place the slab management obj + * off-slab (should allow better packing of objs). + */ + left = calculate_slab_order(cachep, size, flags | CFLGS_OFF_SLAB); + if (!cachep->num) + return false; + + /* + * If the slab has been placed off-slab, and we have enough space then + * move it on-slab. This is at the expense of any extra colouring. + */ + if (left >= cachep->num * sizeof(freelist_idx_t)) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + +static bool set_on_slab_cache(struct kmem_cache *cachep, + size_t size, unsigned long flags) +{ + size_t left; + + cachep->num = 0; + + left = calculate_slab_order(cachep, size, flags); + if (!cachep->num) + return false; + + cachep->colour = left / cachep->colour_off; + + return true; +} + /** * __kmem_cache_create - Create a cache. * @cachep: cache management descriptor @@ -2099,7 +2003,6 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size; size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; @@ -2119,8 +2022,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif - if (flags & SLAB_DESTROY_BY_RCU) - BUG_ON(flags & SLAB_POISON); #endif /* @@ -2152,6 +2053,10 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * 4) Store it. */ cachep->align = ralign; + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < cachep->align) + cachep->colour_off = cachep->align; if (slab_is_available()) gfp = GFP_KERNEL; @@ -2179,36 +2084,9 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else size += BYTES_PER_WORD; } -#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - /* - * To activate debug pagealloc, off-slab management is necessary - * requirement. In early phase of initialization, small sized slab - * doesn't get initialized so it would not be possible. So, we need - * to check size >= 256. It guarantees that all necessary small - * sized slab is initialized in current slab initialization sequence. - */ - if (!slab_early_init && size >= kmalloc_size(INDEX_NODE) && - size >= 256 && cachep->object_size > cache_line_size() && - ALIGN(size, cachep->align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); - size = PAGE_SIZE; - } -#endif #endif - /* - * Determine if the slab management is 'on' or 'off' slab. - * (bootstrapping cannot cope with offslab caches so don't do - * it too early on. Always use on-slab management when - * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) - */ - if (size >= OFF_SLAB_MIN_SIZE && !slab_early_init && - !(flags & SLAB_NOLEAKTRACE)) - /* - * Size is large, assume best to place the slab management obj - * off-slab (should allow better packing of objs). - */ - flags |= CFLGS_OFF_SLAB; + kasan_cache_create(cachep, &size, &flags); size = ALIGN(size, cachep->align); /* @@ -2218,42 +2096,46 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (FREELIST_BYTE_INDEX && size < SLAB_OBJ_MIN_SIZE) size = ALIGN(SLAB_OBJ_MIN_SIZE, cachep->align); - left_over = calculate_slab_order(cachep, size, cachep->align, flags); - - if (!cachep->num) - return -E2BIG; - - freelist_size = calculate_freelist_size(cachep->num, cachep->align); - +#if DEBUG /* - * If the slab has been placed off-slab, and we have enough space then - * move it on-slab. This is at the expense of any extra colouring. + * To activate debug pagealloc, off-slab management is necessary + * requirement. In early phase of initialization, small sized slab + * doesn't get initialized so it would not be possible. So, we need + * to check size >= 256. It guarantees that all necessary small + * sized slab is initialized in current slab initialization sequence. */ - if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { - flags &= ~CFLGS_OFF_SLAB; - left_over -= freelist_size; + if (debug_pagealloc_enabled() && (flags & SLAB_POISON) && + size >= 256 && cachep->object_size > cache_line_size()) { + if (size < PAGE_SIZE || size % PAGE_SIZE == 0) { + size_t tmp_size = ALIGN(size, PAGE_SIZE); + + if (set_off_slab_cache(cachep, tmp_size, flags)) { + flags |= CFLGS_OFF_SLAB; + cachep->obj_offset += tmp_size - size; + size = tmp_size; + goto done; + } + } } +#endif - if (flags & CFLGS_OFF_SLAB) { - /* really off slab. No need for manual alignment */ - freelist_size = calculate_freelist_size(cachep->num, 0); + if (set_objfreelist_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OBJFREELIST_SLAB; + goto done; + } -#ifdef CONFIG_PAGE_POISONING - /* If we're going to use the generic kernel_map_pages() - * poisoning, then it's going to smash the contents of - * the redzone and userword anyhow, so switch them off. - */ - if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) - flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); -#endif + if (set_off_slab_cache(cachep, size, flags)) { + flags |= CFLGS_OFF_SLAB; + goto done; } - cachep->colour_off = cache_line_size(); - /* Offset must be a multiple of the alignment. */ - if (cachep->colour_off < cachep->align) - cachep->colour_off = cachep->align; - cachep->colour = left_over / cachep->colour_off; - cachep->freelist_size = freelist_size; + if (set_on_slab_cache(cachep, size, flags)) + goto done; + + return -E2BIG; + +done: + cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) @@ -2261,16 +2143,21 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); - if (flags & CFLGS_OFF_SLAB) { - cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); - /* - * This is a possibility for one of the kmalloc_{dma,}_caches. - * But since we go off slab only for object size greater than - * OFF_SLAB_MIN_SIZE, and kmalloc_{dma,}_caches get created - * in ascending order,this should not happen at all. - * But leave a BUG_ON for some lucky dude. - */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); +#if DEBUG + /* + * If we're going to use the generic kernel_map_pages() + * poisoning, then it's going to smash the contents of + * the redzone and userword anyhow, so switch them off. + */ + if (IS_ENABLED(CONFIG_PAGE_POISONING) && + (cachep->flags & SLAB_POISON) && + is_debug_pagealloc_cache(cachep)) + cachep->flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); +#endif + + if (OFF_SLAB(cachep)) { + cachep->freelist_cache = + kmalloc_slab(cachep->freelist_size, 0u); } err = setup_cpu_cache(cachep, gfp); @@ -2377,9 +2264,6 @@ static int drain_freelist(struct kmem_cache *cache, } page = list_entry(p, struct page, lru); -#if DEBUG - BUG_ON(page->active); -#endif list_del(&page->lru); /* * Safe to drop the lock. The slab is no longer linked @@ -2454,18 +2338,23 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, void *freelist; void *addr = page_address(page); - if (OFF_SLAB(cachep)) { + page->s_mem = addr + colour_off; + page->active = 0; + + if (OBJFREELIST_SLAB(cachep)) + freelist = NULL; + else if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); if (!freelist) return NULL; } else { - freelist = addr + colour_off; - colour_off += cachep->freelist_size; + /* We will use last bytes at the slab for freelist */ + freelist = addr + (PAGE_SIZE << cachep->gfporder) - + cachep->freelist_size; } - page->active = 0; - page->s_mem = addr + colour_off; + return freelist; } @@ -2480,17 +2369,14 @@ static inline void set_free_obj(struct page *page, ((freelist_idx_t *)(page->freelist))[idx] = val; } -static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) +static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) { +#if DEBUG int i; for (i = 0; i < cachep->num; i++) { void *objp = index_to_obj(cachep, page, i); -#if DEBUG - /* need to poison the objs? */ - if (cachep->flags & SLAB_POISON) - poison_obj(cachep, objp, POISON_FREE); + if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = NULL; @@ -2503,26 +2389,51 @@ static void cache_init_objs(struct kmem_cache *cachep, * cache which they are a constructor for. Otherwise, deadlock. * They must also be threaded. */ - if (cachep->ctor && !(cachep->flags & SLAB_POISON)) + if (cachep->ctor && !(cachep->flags & SLAB_POISON)) { + kasan_unpoison_object_data(cachep, + objp + obj_offset(cachep)); cachep->ctor(objp + obj_offset(cachep)); + kasan_poison_object_data( + cachep, objp + obj_offset(cachep)); + } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " end of an object"); + slab_error(cachep, "constructor overwrote the end of an object"); if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) - slab_error(cachep, "constructor overwrote the" - " start of an object"); + slab_error(cachep, "constructor overwrote the start of an object"); } - if ((cachep->size % PAGE_SIZE) == 0 && - OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); -#else - if (cachep->ctor) - cachep->ctor(objp); + /* need to poison the objs? */ + if (cachep->flags & SLAB_POISON) { + poison_obj(cachep, objp, POISON_FREE); + slab_kernel_map(cachep, objp, 0, 0); + } + } #endif - set_obj_status(page, i, OBJECT_FREE); +} + +static void cache_init_objs(struct kmem_cache *cachep, + struct page *page) +{ + int i; + void *objp; + + cache_init_objs_debug(cachep, page); + + if (OBJFREELIST_SLAB(cachep)) { + page->freelist = index_to_obj(cachep, page, cachep->num - 1) + + obj_offset(cachep); + } + + for (i = 0; i < cachep->num; i++) { + /* constructor could break poison info */ + if (DEBUG == 0 && cachep->ctor) { + objp = index_to_obj(cachep, page, i); + kasan_unpoison_object_data(cachep, objp); + cachep->ctor(objp); + kasan_poison_object_data(cachep, objp); + } + set_free_obj(page, i, i); } } @@ -2537,40 +2448,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, - int nodeid) +static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) { void *objp; objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; + #if DEBUG - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); + if (cachep->flags & SLAB_STORE_USER) + set_store_user_dirty(cachep); #endif return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct page *page, - void *objp, int nodeid) +static void slab_put_obj(struct kmem_cache *cachep, + struct page *page, void *objp) { unsigned int objnr = obj_to_index(cachep, page, objp); #if DEBUG unsigned int i; - /* Verify that the slab belongs to the intended node */ - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); - /* Verify double free bug */ for (i = page->active; i < cachep->num; i++) { if (get_free_obj(page, i) == objnr) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); + pr_err("slab: double free detected in cache '%s', objp %p\n", + cachep->name, objp); BUG(); } } #endif page->active--; + if (!page->freelist) + page->freelist = objp + obj_offset(cachep); + set_free_obj(page, page->active, objnr); } @@ -2645,11 +2557,12 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!freelist) + if (OFF_SLAB(cachep) && !freelist) goto opps1; slab_map_pages(cachep, page, freelist); + kasan_poison_slab(page); cache_init_objs(cachep, page); if (gfpflags_allow_blocking(local_flags)) @@ -2681,7 +2594,7 @@ failed: static void kfree_debugcheck(const void *objp) { if (!virt_addr_valid(objp)) { - printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", + pr_err("kfree_debugcheck: out of range ptr %lxh\n", (unsigned long)objp); BUG(); } @@ -2705,8 +2618,8 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) else slab_error(cache, "memory outside object was overwritten"); - printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", - obj, redzone1, redzone2); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + obj, redzone1, redzone2); } static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, @@ -2726,27 +2639,19 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } - if (cachep->flags & SLAB_STORE_USER) + if (cachep->flags & SLAB_STORE_USER) { + set_store_user_dirty(cachep); *dbg_userword(cachep, objp) = (void *)caller; + } objnr = obj_to_index(cachep, page, objp); BUG_ON(objnr >= cachep->num); BUG_ON(objp != index_to_obj(cachep, page, objnr)); - set_obj_status(page, objnr, OBJECT_FREE); if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 0); - } else { - poison_obj(cachep, objp, POISON_FREE); - } -#else poison_obj(cachep, objp, POISON_FREE); -#endif + slab_kernel_map(cachep, objp, 0, caller); } return objp; } @@ -2756,7 +2661,85 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #define cache_free_debugcheck(x,objp,z) (objp) #endif -static struct page *get_first_slab(struct kmem_cache_node *n) +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list) +{ +#if DEBUG + void *next = *list; + void *objp; + + while (next) { + objp = next - obj_offset(cachep); + next = *(void **)next; + poison_obj(cachep, objp, POISON_FREE); + } +#endif +} + +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page, + void **list) +{ + /* move slabp to correct slabp list: */ + list_del(&page->lru); + if (page->active == cachep->num) { + list_add(&page->lru, &n->slabs_full); + if (OBJFREELIST_SLAB(cachep)) { +#if DEBUG + /* Poisoning will be done without holding the lock */ + if (cachep->flags & SLAB_POISON) { + void **objp = page->freelist; + + *objp = *list; + *list = objp; + } +#endif + page->freelist = NULL; + } + } else + list_add(&page->lru, &n->slabs_partial); +} + +/* Try to find non-pfmemalloc slab if needed */ +static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, + struct page *page, bool pfmemalloc) +{ + if (!page) + return NULL; + + if (pfmemalloc) + return page; + + if (!PageSlabPfmemalloc(page)) + return page; + + /* No need to keep pfmemalloc slab if we have enough free objects */ + if (n->free_objects > n->free_limit) { + ClearPageSlabPfmemalloc(page); + return page; + } + + /* Move pfmemalloc slab to the end of list to speed up next search */ + list_del(&page->lru); + if (!page->active) + list_add_tail(&page->lru, &n->slabs_free); + else + list_add_tail(&page->lru, &n->slabs_partial); + + list_for_each_entry(page, &n->slabs_partial, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + list_for_each_entry(page, &n->slabs_free, lru) { + if (!PageSlabPfmemalloc(page)) + return page; + } + + return NULL; +} + +static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; @@ -2768,21 +2751,51 @@ static struct page *get_first_slab(struct kmem_cache_node *n) struct page, lru); } + if (sk_memalloc_socks()) + return get_valid_first_slab(n, page, pfmemalloc); + return page; } -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, - bool force_refill) +static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, + struct kmem_cache_node *n, gfp_t flags) +{ + struct page *page; + void *obj; + void *list = NULL; + + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + + spin_lock(&n->list_lock); + page = get_first_slab(n, true); + if (!page) { + spin_unlock(&n->list_lock); + return NULL; + } + + obj = slab_get_obj(cachep, page); + n->free_objects--; + + fixup_slab_list(cachep, n, page, &list); + + spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); + + return obj; +} + +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; struct array_cache *ac; int node; + void *list = NULL; check_irq_off(); node = numa_mem_id(); - if (unlikely(force_refill)) - goto force_grow; + retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; @@ -2808,7 +2821,7 @@ retry: while (batchcount > 0) { struct page *page; /* Get slab alloc is to come from. */ - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -2826,26 +2839,29 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page, - node)); + ac->entry[ac->avail++] = slab_get_obj(cachep, page); } - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page, &list); } must_grow: n->free_objects -= ac->avail; alloc_done: spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); if (unlikely(!ac->avail)) { int x; -force_grow: + + /* Check if we can use obj in pfmemalloc slab */ + if (sk_memalloc_socks()) { + void *obj = cache_alloc_pfmemalloc(cachep, n, flags); + + if (obj) + return obj; + } + x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ @@ -2853,7 +2869,7 @@ force_grow: node = numa_mem_id(); /* no objects in sight? abort */ - if (!x && (ac->avail == 0 || force_refill)) + if (!x && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -2861,7 +2877,7 @@ force_grow: } ac->touched = 1; - return ac_get_obj(cachep, ac, flags, force_refill); + return ac->entry[--ac->avail]; } static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, @@ -2877,20 +2893,11 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { - struct page *page; - if (!objp) return objp; if (cachep->flags & SLAB_POISON) { -#ifdef CONFIG_DEBUG_PAGEALLOC - if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) - kernel_map_pages(virt_to_page(objp), - cachep->size / PAGE_SIZE, 1); - else - check_poison_obj(cachep, objp); -#else check_poison_obj(cachep, objp); -#endif + slab_kernel_map(cachep, objp, 1, 0); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) @@ -2899,25 +2906,21 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { - slab_error(cachep, "double free, or memory outside" - " object was overwritten"); - printk(KERN_ERR - "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", - objp, *dbg_redzone1(cachep, objp), - *dbg_redzone2(cachep, objp)); + slab_error(cachep, "double free, or memory outside object was overwritten"); + pr_err("%p: redzone 1:0x%llx, redzone 2:0x%llx\n", + objp, *dbg_redzone1(cachep, objp), + *dbg_redzone2(cachep, objp)); } *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } - page = virt_to_head_page(objp); - set_obj_status(page, obj_to_index(cachep, page, objp), OBJECT_ACTIVE); objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); if (ARCH_SLAB_MINALIGN && ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { - printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", + pr_err("0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", objp, (int)ARCH_SLAB_MINALIGN); } return objp; @@ -2926,40 +2929,24 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #define cache_alloc_debugcheck_after(a,b,objp,d) (objp) #endif -static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) -{ - if (unlikely(cachep == kmem_cache)) - return false; - - return should_failslab(cachep->object_size, flags, cachep->flags); -} - static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; - bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { ac->touched = 1; - objp = ac_get_obj(cachep, ac, flags, false); + objp = ac->entry[--ac->avail]; - /* - * Allow for the possibility all avail objects are not allowed - * by the current flags - */ - if (objp) { - STATS_INC_ALLOCHIT(cachep); - goto out; - } - force_refill = true; + STATS_INC_ALLOCHIT(cachep); + goto out; } STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags, force_refill); + objp = cache_alloc_refill(cachep, flags); /* * the 'ac' may be updated by cache_alloc_refill(), * and kmemleak_erase() requires its correct value. @@ -3097,6 +3084,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, struct page *page; struct kmem_cache_node *n; void *obj; + void *list = NULL; int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); @@ -3106,7 +3094,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, retry: check_irq_off(); spin_lock(&n->list_lock); - page = get_first_slab(n); + page = get_first_slab(n, false); if (!page) goto must_grow; @@ -3118,17 +3106,13 @@ retry: BUG_ON(page->active == cachep->num); - obj = slab_get_obj(cachep, page, nodeid); + obj = slab_get_obj(cachep, page); n->free_objects--; - /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); - else - list_add(&page->lru, &n->slabs_partial); + fixup_slab_list(cachep, n, page, &list); spin_unlock(&n->list_lock); + fixup_objfreelist_debug(cachep, &list); goto done; must_grow: @@ -3152,14 +3136,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (slab_should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); @@ -3188,16 +3168,11 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, out: local_irq_restore(save_flags); ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); - kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, - flags); - if (likely(ptr)) { - kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(ptr, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && ptr) + memset(ptr, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &ptr); return ptr; } @@ -3240,30 +3215,21 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) void *objp; flags &= gfp_allowed_mask; - - lockdep_trace_alloc(flags); - - if (slab_should_failslab(cachep, flags)) + cachep = slab_pre_alloc_hook(cachep, flags); + if (unlikely(!cachep)) return NULL; - cachep = memcg_kmem_get_cache(cachep, flags); - cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); objp = __do_cache_alloc(cachep, flags); local_irq_restore(save_flags); objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); - kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, - flags); prefetchw(objp); - if (likely(objp)) { - kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); - if (unlikely(flags & __GFP_ZERO)) - memset(objp, 0, cachep->object_size); - } + if (unlikely(flags & __GFP_ZERO) && objp) + memset(objp, 0, cachep->object_size); - memcg_kmem_put_cache(cachep); + slab_post_alloc_hook(cachep, flags, 1, &objp); return objp; } @@ -3281,13 +3247,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, void *objp; struct page *page; - clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; page = virt_to_head_page(objp); list_del(&page->lru); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, page, objp, node); + slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); n->free_objects++; @@ -3317,9 +3282,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) LIST_HEAD(list); batchcount = ac->batchcount; -#if DEBUG - BUG_ON(!batchcount || batchcount > ac->avail); -#endif + check_irq_off(); n = get_node(cachep, node); spin_lock(&n->list_lock); @@ -3366,6 +3329,8 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, { struct array_cache *ac = cpu_cache_get(cachep); + kasan_slab_free(cachep, objp); + check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); @@ -3389,7 +3354,16 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, cache_flusharray(cachep, ac); } - ac_put_obj(cachep, ac, objp); + if (sk_memalloc_socks()) { + struct page *page = virt_to_head_page(objp); + + if (unlikely(PageSlabPfmemalloc(page))) { + cache_free_pfmemalloc(cachep, page, objp); + return; + } + } + + ac->entry[ac->avail++] = objp; } /** @@ -3404,6 +3378,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc(_RET_IP_, ret, cachep->object_size, cachep->size, flags); @@ -3411,16 +3386,53 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +static __always_inline void +cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, unsigned long caller) { - __kmem_cache_free_bulk(s, size, p); + size_t i; + + for (i = 0; i < size; i++) + p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); } -EXPORT_SYMBOL(kmem_cache_free_bulk); int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) + void **p) { - return __kmem_cache_alloc_bulk(s, flags, size, p); + size_t i; + + s = slab_pre_alloc_hook(s, flags); + if (!s) + return 0; + + cache_alloc_debugcheck_before(s, flags); + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = __do_cache_alloc(s, flags); + + if (unlikely(!objp)) + goto error; + p[i] = objp; + } + local_irq_enable(); + + cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_); + + /* Clear memory outside IRQ disabled section */ + if (unlikely(flags & __GFP_ZERO)) + for (i = 0; i < size; i++) + memset(p[i], 0, s->object_size); + + slab_post_alloc_hook(s, flags, size, p); + /* FIXME: Trace call missing. Christoph would like a bulk variant */ + return size; +error: + local_irq_enable(); + cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -3432,6 +3444,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size) ret = slab_alloc(cachep, flags, _RET_IP_); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(_RET_IP_, ret, size, cachep->size, flags); return ret; @@ -3455,6 +3468,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_slab_alloc(cachep, ret, flags); trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep->object_size, cachep->size, flags, nodeid); @@ -3473,6 +3487,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep, ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc_node(_RET_IP_, ret, size, cachep->size, flags, nodeid); @@ -3485,11 +3500,15 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; + void *ret; cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - return kmem_cache_alloc_node_trace(cachep, flags, node, size); + ret = kmem_cache_alloc_node_trace(cachep, flags, node, size); + kasan_kmalloc(cachep, ret, size, flags); + + return ret; } void *__kmalloc_node(size_t size, gfp_t flags, int node) @@ -3523,6 +3542,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, return cachep; ret = slab_alloc(cachep, flags, caller); + kasan_kmalloc(cachep, ret, size, flags); trace_kmalloc(caller, ret, size, cachep->size, flags); @@ -3567,6 +3587,32 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) } EXPORT_SYMBOL(kmem_cache_free); +void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +{ + struct kmem_cache *s; + size_t i; + + local_irq_disable(); + for (i = 0; i < size; i++) { + void *objp = p[i]; + + if (!orig_s) /* called via kfree_bulk */ + s = virt_to_cache(objp); + else + s = cache_from_obj(orig_s, objp); + + debug_check_no_locks_freed(objp, s->object_size); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(objp, s->object_size); + + __cache_free(s, objp, _RET_IP_); + } + local_irq_enable(); + + /* FIXME: add tracing */ +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + /** * kfree - free previously allocated memory * @objp: pointer returned by kmalloc. @@ -3812,7 +3858,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); if (err) - printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", + pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); return err; } @@ -3968,7 +4014,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) name = cachep->name; if (error) - printk(KERN_ERR "slab: cache %s error: %s\n", name, error); + pr_err("slab: cache %s error: %s\n", name, error); sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; @@ -3996,8 +4042,7 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *cachep) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " - "%4lu %4lu %4lu %4lu %4lu", + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees, overflows); @@ -4102,15 +4147,34 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct page *page) { void *p; - int i; + int i, j; + unsigned long v; if (n[0] == n[1]) return; for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - if (get_obj_status(page, i) != OBJECT_ACTIVE) + bool active = true; + + for (j = page->active; j < c->num; j++) { + if (get_free_obj(page, j) == i) { + active = false; + break; + } + } + + if (!active) continue; - if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) + /* + * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table + * mapping is established when actual object allocation and + * we could mistakenly access the unmapped object in the cpu + * cache. + */ + if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) + continue; + + if (!add_caller(n, v)) return; } } @@ -4146,21 +4210,31 @@ static int leaks_show(struct seq_file *m, void *p) if (!(cachep->flags & SLAB_RED_ZONE)) return 0; - /* OK, we can do it */ + /* + * Set store_user_clean and start to grab stored user information + * for all objects on this cache. If some alloc/free requests comes + * during the processing, information would be wrong so restart + * whole processing. + */ + do { + set_store_user_clean(cachep); + drain_cpu_caches(cachep); - x[1] = 0; + x[1] = 0; - for_each_kmem_cache_node(cachep, node, n) { + for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); - spin_lock_irq(&n->list_lock); + check_irq_on(); + spin_lock_irq(&n->list_lock); + + list_for_each_entry(page, &n->slabs_full, lru) + handle_slab(x, cachep, page); + list_for_each_entry(page, &n->slabs_partial, lru) + handle_slab(x, cachep, page); + spin_unlock_irq(&n->list_lock); + } + } while (!is_store_user_clean(cachep)); - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) - handle_slab(x, cachep, page); - spin_unlock_irq(&n->list_lock); - } name = cachep->name; if (x[0] == x[1]) { /* Increase the buffer size */ @@ -4240,10 +4314,18 @@ module_init(slab_proc_init); */ size_t ksize(const void *objp) { + size_t size; + BUG_ON(!objp); if (unlikely(objp == ZERO_SIZE_PTR)) return 0; - return virt_to_cache(objp)->object_size; + size = virt_to_cache(objp)->object_size; + /* We assume that ksize callers could use the whole allocated area, + * so we need to unpoison this area. + */ + kasan_krealloc(objp, size, GFP_NOWAIT); + + return size; } EXPORT_SYMBOL(ksize); diff --git a/mm/slab.h b/mm/slab.h index 2eedacea439d..5969769fbee6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -38,6 +38,10 @@ struct kmem_cache { #endif #include <linux/memcontrol.h> +#include <linux/fault-inject.h> +#include <linux/kmemcheck.h> +#include <linux/kasan.h> +#include <linux/kmemleak.h> /* * State of the slab allocator. @@ -121,7 +125,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) #elif defined(CONFIG_SLUB_DEBUG) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DEBUG_FREE) + SLAB_TRACE | SLAB_CONSISTENCY_CHECKS) #else #define SLAB_DEBUG_FLAGS (0) #endif @@ -168,7 +172,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, /* * Generic implementation of bulk operations * These are useful for situations in which the allocator cannot - * perform optimizations. In that case segments of the objecct listed + * perform optimizations. In that case segments of the object listed * may be allocated or freed using these operations. */ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); @@ -242,12 +246,33 @@ static __always_inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, struct kmem_cache *s) { + int ret; + if (!memcg_kmem_enabled()) return 0; if (is_root_cache(s)) return 0; - return __memcg_kmem_charge_memcg(page, gfp, order, - s->memcg_params.memcg); + + ret = __memcg_kmem_charge_memcg(page, gfp, order, + s->memcg_params.memcg); + if (ret) + return ret; + + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + 1 << order); + return 0; +} + +static __always_inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ + memcg_kmem_update_page_stat(page, + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + MEMCG_SLAB_RECLAIMABLE : MEMCG_SLAB_UNRECLAIMABLE, + -(1 << order)); + memcg_kmem_uncharge(page, order); } extern void slab_init_memcg_params(struct kmem_cache *); @@ -290,6 +315,11 @@ static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order, return 0; } +static inline void memcg_uncharge_slab(struct page *page, int order, + struct kmem_cache *s) +{ +} + static inline void slab_init_memcg_params(struct kmem_cache *s) { } @@ -307,7 +337,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) * to not do even the assignment. In that case, slab_equal_or_root * will also be a constant. */ - if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) + if (!memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) return s; page = virt_to_head_page(x); @@ -321,6 +352,64 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; } +static inline size_t slab_ksize(const struct kmem_cache *s) +{ +#ifndef CONFIG_SLUB + return s->object_size; + +#else /* CONFIG_SLUB */ +# ifdef CONFIG_SLUB_DEBUG + /* + * Debugging requires use of the padding between object + * and whatever may come after it. + */ + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->object_size; +# endif + /* + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. + */ + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation + */ + return s->size; +#endif +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s, flags)) + return NULL; + + return memcg_kmem_get_cache(s, flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + size_t size, void **p) +{ + size_t i; + + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + void *object = p[i]; + + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object, flags); + } + memcg_kmem_put_cache(s); +} + #ifndef CONFIG_SLOB /* * The slab lists for all objects. diff --git a/mm/slab_common.c b/mm/slab_common.c index 065b7bdabdc3..3239bfd758e6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -35,7 +35,7 @@ struct kmem_cache *kmem_cache; */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) + SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_NOTRACK | SLAB_ACCOUNT) @@ -109,8 +109,12 @@ void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p) { size_t i; - for (i = 0; i < nr; i++) - kmem_cache_free(s, p[i]); + for (i = 0; i < nr; i++) { + if (s) + kmem_cache_free(s, p[i]); + else + kfree(p[i]); + } } int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr, @@ -438,7 +442,7 @@ out_unlock: panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n", name, err); else { - printk(KERN_WARNING "kmem_cache_create(%s) failed with error %d", + pr_warn("kmem_cache_create(%s) failed with error %d\n", name, err); dump_stack(); } @@ -506,7 +510,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, * The memory cgroup could have been offlined while the cache * creation work was pending. */ - if (!memcg_kmem_online(memcg)) + if (memcg->kmem_state != KMEM_ONLINE) goto out_unlock; idx = memcg_cache_id(memcg); @@ -722,8 +726,8 @@ void kmem_cache_destroy(struct kmem_cache *s) err = shutdown_cache(s, &release, &need_rcu_barrier); if (err) { - pr_err("kmem_cache_destroy %s: " - "Slab cache still has objects\n", s->name); + pr_err("kmem_cache_destroy %s: Slab cache still has objects\n", + s->name); dump_stack(); } out_unlock: @@ -1009,7 +1013,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_kmem_pages(flags, order); ret = page ? page_address(page) : NULL; kmemleak_alloc(ret, size, 1, flags); - kasan_kmalloc_large(ret, size); + kasan_kmalloc_large(ret, size, flags); return ret; } EXPORT_SYMBOL(kmalloc_order); @@ -1043,13 +1047,11 @@ static void print_slabinfo_header(struct seq_file *m) #else seq_puts(m, "slabinfo - version: 2.1\n"); #endif - seq_puts(m, "# name <active_objs> <num_objs> <objsize> " - "<objperslab> <pagesperslab>"); + seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); #ifdef CONFIG_DEBUG_SLAB - seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " - "<error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); + seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> <error> <maxfreeable> <nodeallocs> <remotefrees> <alienoverflow>"); seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); #endif seq_putc(m, '\n'); @@ -1190,7 +1192,7 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, ks = ksize(p); if (ks >= new_size) { - kasan_krealloc((void *)p, new_size); + kasan_krealloc((void *)p, new_size, flags); return (void *)p; } diff --git a/mm/slub.c b/mm/slub.c index d8fbd4a6ed59..4dbb109eb8cd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -124,6 +124,14 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline void *fixup_red_left(struct kmem_cache *s, void *p) +{ + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) + p += s->red_left_pad; + + return p; +} + static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) { #ifdef CONFIG_SLUB_CPU_PARTIAL @@ -160,10 +168,18 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) */ #define MAX_PARTIAL 10 -#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ +#define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* + * These debug flags cannot use CMPXCHG because there might be consistency + * issues when checking or reading debug information + */ +#define SLAB_NO_CMPXCHG (SLAB_CONSISTENCY_CHECKS | SLAB_STORE_USER | \ + SLAB_TRACE) + + +/* * Debugging flags that require metadata to be stored in the slab. These get * disabled when slub_debug=O is used and a cache's min order increases with * metadata. @@ -224,24 +240,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -/* Verify that a pointer has an address that is valid within a slab page */ -static inline int check_valid_pointer(struct kmem_cache *s, - struct page *page, const void *object) -{ - void *base; - - if (!object) - return 1; - - base = page_address(page); - if (object < base || object >= base + page->objects * s->size || - (object - base) % s->size) { - return 0; - } - - return 1; -} - static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -256,11 +254,10 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { void *p; -#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return get_freepointer(s, object); + probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p)); -#else - p = get_freepointer(s, object); -#endif return p; } @@ -271,12 +268,14 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) /* Loop over all objects in a slab */ #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ - __p += (__s)->size) + for (__p = fixup_red_left(__s, __addr); \ + __p < (__addr) + (__objects) * (__s)->size; \ + __p += (__s)->size) #define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ - for (__p = (__addr), __idx = 1; __idx <= __objects;\ - __p += (__s)->size, __idx++) + for (__p = fixup_red_left(__s, __addr), __idx = 1; \ + __idx <= __objects; \ + __p += (__s)->size, __idx++) /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) @@ -284,30 +283,6 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline size_t slab_ksize(const struct kmem_cache *s) -{ -#ifdef CONFIG_SLUB_DEBUG - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->object_size; - -#endif - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; -} - static inline int order_objects(int order, unsigned long size, int reserved) { return ((PAGE_SIZE << order) - reserved) / size; @@ -458,6 +433,22 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } +static inline int size_from_object(struct kmem_cache *s) +{ + if (s->flags & SLAB_RED_ZONE) + return s->size - s->red_left_pad; + + return s->size; +} + +static inline void *restore_red_left(struct kmem_cache *s, void *p) +{ + if (s->flags & SLAB_RED_ZONE) + p -= s->red_left_pad; + + return p; +} + /* * Debug settings: */ @@ -491,6 +482,26 @@ static inline void metadata_access_disable(void) /* * Object debugging */ + +/* Verify that a pointer has an address that is valid within a slab page */ +static inline int check_valid_pointer(struct kmem_cache *s, + struct page *page, void *object) +{ + void *base; + + if (!object) + return 1; + + base = page_address(page); + object = restore_red_left(s, object); + if (object < base || object >= base + page->objects * s->size || + (object - base) % s->size) { + return 0; + } + + return 1; +} + static void print_section(char *text, u8 *addr, unsigned int length) { metadata_access_enable(); @@ -630,7 +641,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", p, p - addr, get_freepointer(s, p)); - if (p > addr + 16) + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone ", p - s->red_left_pad, s->red_left_pad); + else if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); print_section("Object ", p, min_t(unsigned long, s->object_size, @@ -647,9 +660,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (s->flags & SLAB_STORE_USER) off += 2 * sizeof(struct track); - if (off != s->size) + if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section("Padding ", p + off, s->size - off); + print_section("Padding ", p + off, size_from_object(s) - off); dump_stack(); } @@ -679,6 +692,9 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; + if (s->flags & SLAB_RED_ZONE) + memset(p - s->red_left_pad, val, s->red_left_pad); + if (s->flags & __OBJECT_POISON) { memset(p, POISON_FREE, s->object_size - 1); p[s->object_size - 1] = POISON_END; @@ -771,11 +787,11 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) /* We also have user information there */ off += 2 * sizeof(struct track); - if (s->size == off) + if (size_from_object(s) == off) return 1; return check_bytes_and_report(s, page, p, "Object padding", - p + off, POISON_INUSE, s->size - off); + p + off, POISON_INUSE, size_from_object(s) - off); } /* Check the pad bytes at the end of a slab page */ @@ -820,6 +836,10 @@ static int check_object(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, page, object, "Redzone", + object - s->red_left_pad, val, s->red_left_pad)) + return 0; + + if (!check_bytes_and_report(s, page, object, "Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { @@ -930,14 +950,14 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) max_objects = MAX_OBJS_PER_PAGE; if (page->objects != max_objects) { - slab_err(s, page, "Wrong number of objects. Found %d but " - "should be %d", page->objects, max_objects); + slab_err(s, page, "Wrong number of objects. Found %d but should be %d", + page->objects, max_objects); page->objects = max_objects; slab_fix(s, "Number of objects adjusted."); } if (page->inuse != page->objects - nr) { - slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", page->inuse, page->objects - nr); + slab_err(s, page, "Wrong object count. Counter is %d but counted were %d", + page->inuse, page->objects - nr); page->inuse = page->objects - nr; slab_fix(s, "Object count adjusted."); } @@ -1031,20 +1051,32 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, +static inline int alloc_consistency_checks(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) - goto bad; + return 0; if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); - goto bad; + return 0; } if (!check_object(s, page, object, SLUB_RED_INACTIVE)) - goto bad; + return 0; + + return 1; +} + +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, + void *object, unsigned long addr) +{ + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!alloc_consistency_checks(s, page, object, addr)) + goto bad; + } /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) @@ -1067,42 +1099,26 @@ bad: return 0; } -/* Supports checking bulk free of a constructed freelist */ -static noinline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, - void *head, void *tail, int bulk_cnt, - unsigned long addr, unsigned long *flags) +static inline int free_consistency_checks(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - void *object = head; - int cnt = 0; - - spin_lock_irqsave(&n->list_lock, *flags); - slab_lock(page); - - if (!check_slab(s, page)) - goto fail; - -next_object: - cnt++; - if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); - goto fail; + return 0; } if (on_freelist(s, page, object)) { object_err(s, page, object, "Object already free"); - goto fail; + return 0; } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - goto out; + return 0; if (unlikely(s != page->slab_cache)) { if (!PageSlab(page)) { - slab_err(s, page, "Attempt to free object(0x%p) " - "outside of slab", object); + slab_err(s, page, "Attempt to free object(0x%p) outside of slab", + object); } else if (!page->slab_cache) { pr_err("SLUB <none>: no slab for object 0x%p.\n", object); @@ -1110,7 +1126,37 @@ next_object: } else object_err(s, page, object, "page slab pointer corrupt."); - goto fail; + return 0; + } + return 1; +} + +/* Supports checking bulk free of a constructed freelist */ +static noinline int free_debug_processing( + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; + unsigned long uninitialized_var(flags); + int ret = 0; + + spin_lock_irqsave(&n->list_lock, flags); + slab_lock(page); + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!check_slab(s, page)) + goto out; + } + +next_object: + cnt++; + + if (s->flags & SLAB_CONSISTENCY_CHECKS) { + if (!free_consistency_checks(s, page, object, addr)) + goto out; } if (s->flags & SLAB_STORE_USER) @@ -1124,23 +1170,18 @@ next_object: object = get_freepointer(s, object); goto next_object; } + ret = 1; + out: if (cnt != bulk_cnt) slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", bulk_cnt, cnt); slab_unlock(page); - /* - * Keep node_lock to preserve integrity - * until the object is actually freed - */ - return n; - -fail: - slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, *flags); - slab_fix(s, "Object at 0x%p not freed", object); - return NULL; + spin_unlock_irqrestore(&n->list_lock, flags); + if (!ret) + slab_fix(s, "Object at 0x%p not freed", object); + return ret; } static int __init setup_slub_debug(char *str) @@ -1172,7 +1213,7 @@ static int __init setup_slub_debug(char *str) for (; *str && *str != ','; str++) { switch (tolower(*str)) { case 'f': - slub_debug |= SLAB_DEBUG_FREE; + slub_debug |= SLAB_CONSISTENCY_CHECKS; break; case 'z': slub_debug |= SLAB_RED_ZONE; @@ -1231,10 +1272,10 @@ static inline void setup_object_debug(struct kmem_cache *s, static inline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { return 0; } -static inline struct kmem_cache_node *free_debug_processing( +static inline int free_debug_processing( struct kmem_cache *s, struct page *page, void *head, void *tail, int bulk_cnt, - unsigned long addr, unsigned long *flags) { return NULL; } + unsigned long addr) { return 0; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } @@ -1272,7 +1313,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); - kasan_kmalloc_large(ptr, size); + kasan_kmalloc_large(ptr, size, flags); } static inline void kfree_hook(const void *x) @@ -1281,36 +1322,6 @@ static inline void kfree_hook(const void *x) kasan_kfree_large(x); } -static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, - gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(gfpflags_allow_blocking(flags)); - - if (should_failslab(s->object_size, flags, s->flags)) - return NULL; - - return memcg_kmem_get_cache(s, flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - size_t size, void **p) -{ - size_t i; - - flags &= gfp_allowed_mask; - for (i = 0; i < size; i++) { - void *object = p[i]; - - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, - s->flags, flags); - kasan_slab_alloc(s, object); - } - memcg_kmem_put_cache(s); -} - static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); @@ -1415,7 +1426,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { @@ -1470,7 +1481,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, p, NULL); } - page->freelist = start; + page->freelist = fixup_red_left(s, start); page->inuse = page->objects; page->frozen = 1; @@ -1506,7 +1517,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (kmem_cache_debug(s)) { + if (s->flags & SLAB_CONSISTENCY_CHECKS) { void *p; slab_pad_check(s, page); @@ -1528,7 +1539,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_kmem_pages(page, order); + memcg_uncharge_slab(page, order, s); + __free_pages(page, order); } #define need_reserve_slab_rcu \ @@ -2224,8 +2236,8 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; - pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", - nid, gfpflags); + pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", + nid, gfpflags, &gfpflags); pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -2584,7 +2596,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); @@ -2612,7 +2624,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, gfpflags); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -2642,8 +2654,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, head, tail, cnt, - addr, &flags))) + !free_debug_processing(s, page, head, tail, cnt, addr)) return; do { @@ -2815,6 +2826,7 @@ struct detached_freelist { void *tail; void *freelist; int cnt; + struct kmem_cache *s; }; /* @@ -2829,26 +2841,45 @@ struct detached_freelist { * synchronization primitive. Look ahead in the array is limited due * to performance reasons. */ -static int build_detached_freelist(struct kmem_cache *s, size_t size, - void **p, struct detached_freelist *df) +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) { size_t first_skipped_index = 0; int lookahead = 3; void *object; + struct page *page; /* Always re-init detached_freelist */ df->page = NULL; do { object = p[--size]; + /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */ } while (!object && size); if (!object) return 0; + page = virt_to_head_page(object); + if (!s) { + /* Handle kalloc'ed objects */ + if (unlikely(!PageSlab(page))) { + BUG_ON(!PageCompound(page)); + kfree_hook(object); + __free_kmem_pages(page, compound_order(page)); + p[size] = NULL; /* mark object processed */ + return size; + } + /* Derive kmem_cache from object */ + df->s = page->slab_cache; + } else { + df->s = cache_from_obj(s, object); /* Support for memcg */ + } + /* Start new detached freelist */ - set_freepointer(s, object, NULL); - df->page = virt_to_head_page(object); + df->page = page; + set_freepointer(df->s, object, NULL); df->tail = object; df->freelist = object; p[size] = NULL; /* mark object processed */ @@ -2862,7 +2893,7 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, /* df->page is always set at this point */ if (df->page == virt_to_head_page(object)) { /* Opportunity build freelist */ - set_freepointer(s, object, df->freelist); + set_freepointer(df->s, object, df->freelist); df->freelist = object; df->cnt++; p[size] = NULL; /* mark object processed */ @@ -2881,25 +2912,20 @@ static int build_detached_freelist(struct kmem_cache *s, size_t size, return first_skipped_index; } - /* Note that interrupts must be enabled when calling this function. */ -void kmem_cache_free_bulk(struct kmem_cache *orig_s, size_t size, void **p) +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) { if (WARN_ON(!size)) return; do { struct detached_freelist df; - struct kmem_cache *s; - - /* Support for memcg */ - s = cache_from_obj(orig_s, p[size - 1]); size = build_detached_freelist(s, size, p, &df); if (unlikely(!df.page)) continue; - slab_free(s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); @@ -3156,7 +3182,8 @@ static void early_kmem_cache_node_alloc(int node) init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif - kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); @@ -3285,7 +3312,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ size += 2 * sizeof(struct track); - if (flags & SLAB_RED_ZONE) + if (flags & SLAB_RED_ZONE) { /* * Add some empty padding so that we can catch * overwrites from earlier objects rather than let @@ -3294,6 +3321,11 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * of the object. */ size += sizeof(void *); + + s->red_left_pad = sizeof(void *); + s->red_left_pad = ALIGN(s->red_left_pad, s->align); + size += s->red_left_pad; + } #endif /* @@ -3357,7 +3389,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) #if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + if (system_has_cmpxchg_double() && (s->flags & SLAB_NO_CMPXCHG) == 0) /* Enable fast mode */ s->flags |= __CMPXCHG_DOUBLE; #endif @@ -3408,10 +3440,9 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u " - "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); return -EINVAL; } @@ -3531,7 +3562,7 @@ void *__kmalloc(size_t size, gfp_t flags) trace_kmalloc(_RET_IP_, ret, size, s->size, flags); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3576,7 +3607,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); - kasan_kmalloc(s, ret, size); + kasan_kmalloc(s, ret, size, flags); return ret; } @@ -3605,7 +3636,7 @@ size_t ksize(const void *object) size_t size = __ksize(object); /* We assume that ksize callers could use whole allocated area, so we need unpoison this area. */ - kasan_krealloc(object, size); + kasan_krealloc(object, size, GFP_NOWAIT); return size; } EXPORT_SYMBOL(ksize); @@ -4812,16 +4843,16 @@ SLAB_ATTR_RO(total_objects); static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { - s->flags &= ~SLAB_DEBUG_FREE; + s->flags &= ~SLAB_CONSISTENCY_CHECKS; if (buf[0] == '1') { s->flags &= ~__CMPXCHG_DOUBLE; - s->flags |= SLAB_DEBUG_FREE; + s->flags |= SLAB_CONSISTENCY_CHECKS; } return length; } @@ -4865,7 +4896,6 @@ static ssize_t red_zone_store(struct kmem_cache *s, s->flags &= ~SLAB_RED_ZONE; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; } calculate_sizes(s, -1); @@ -4886,7 +4916,6 @@ static ssize_t poison_store(struct kmem_cache *s, s->flags &= ~SLAB_POISON; if (buf[0] == '1') { - s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; } calculate_sizes(s, -1); @@ -5356,7 +5385,7 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'd'; if (s->flags & SLAB_RECLAIM_ACCOUNT) *p++ = 'a'; - if (s->flags & SLAB_DEBUG_FREE) + if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; if (!(s->flags & SLAB_NOTRACK)) *p++ = 't'; diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index b60802b3e5ea..68885dcbaf40 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -166,8 +166,8 @@ void __meminit vmemmap_verify(pte_t *pte, int node, int actual_node = early_pfn_to_nid(pfn); if (node_distance(actual_node, node) > LOCAL_DISTANCE) - printk(KERN_WARNING "[%lx-%lx] potential offnode " - "page_structs\n", start, end - 1); + pr_warn("[%lx-%lx] potential offnode page_structs\n", + start, end - 1); } pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) @@ -292,8 +292,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", + __func__); ms->section_mem_map = 0; } diff --git a/mm/sparse.c b/mm/sparse.c index 3717ceed4177..5d0cf4540364 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -313,9 +313,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); if (usemap_nid != nid) { - printk(KERN_INFO - "node %d must be removed before remove section %ld\n", - nid, usemap_snr); + pr_info("node %d must be removed before remove section %ld\n", + nid, usemap_snr); return; } /* @@ -324,10 +323,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) * gather other removable sections for dynamic partitioning. * Just notify un-removable section's number here. */ - printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, - pgdat_snr, nid); - printk(KERN_CONT - " have a circular dependency on usemap and pgdat allocations\n"); + pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n", + usemap_snr, pgdat_snr, nid); } #else static unsigned long * __init @@ -355,7 +352,7 @@ static void __init sparse_early_usemaps_alloc_node(void *data, usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), size * usemap_count); if (!usemap) { - printk(KERN_WARNING "%s: allocation failed\n", __func__); + pr_warn("%s: allocation failed\n", __func__); return; } @@ -428,8 +425,8 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, if (map_map[pnum]) continue; ms = __nr_to_section(pnum); - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", + __func__); ms->section_mem_map = 0; } } @@ -456,8 +453,8 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) if (map) return map; - printk(KERN_ERR "%s: sparsemem memory map backing failed " - "some memory will not be available.\n", __func__); + pr_err("%s: sparsemem memory map backing failed some memory will not be available\n", + __func__); ms->section_mem_map = 0; return NULL; } diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index b5f7f24b8dd1..310ac0b8f974 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -174,9 +174,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) return 0; nomem: - printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); - printk(KERN_INFO - "swap_cgroup can be disabled by swapaccount=0 boot option\n"); + pr_info("couldn't allocate enough memory for swap_cgroup\n"); + pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } diff --git a/mm/swapfile.c b/mm/swapfile.c index d2c37365e2d6..560ad380634c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -48,6 +48,12 @@ static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; atomic_long_t nr_swap_pages; +/* + * Some modules use swappable objects and may try to swap them out under + * memory pressure (via the shrinker). Before doing so, they may wish to + * check to see if any swap space is available. + */ +EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; @@ -2526,8 +2532,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); - pr_info("Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", diff --git a/mm/truncate.c b/mm/truncate.c index e3ee0e27cd17..7598b552ae03 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -519,7 +519,6 @@ EXPORT_SYMBOL(invalidate_mapping_pages); static int invalidate_complete_page2(struct address_space *mapping, struct page *page) { - struct mem_cgroup *memcg; unsigned long flags; if (page->mapping != mapping) @@ -528,15 +527,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page, NULL, memcg); + __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -545,7 +542,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) return 1; failed: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); return 0; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 806b0c758c5b..9f3a0290b273 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -230,8 +230,7 @@ retry: break; } if (unlikely(pmd_none(dst_pmdval)) && - unlikely(__pte_alloc(dst_mm, dst_vma, dst_pmd, - dst_addr))) { + unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) { err = -ENOMEM; break; } diff --git a/mm/util.c b/mm/util.c index 4fb14ca5a419..6cc81e7b8705 100644 --- a/mm/util.c +++ b/mm/util.c @@ -283,9 +283,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); int __weak get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - struct mm_struct *mm = current->mm; - return get_user_pages_unlocked(current, mm, start, nr_pages, - write, 0, pages); + return get_user_pages_unlocked(start, nr_pages, write, 0, pages); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -396,6 +394,13 @@ int __page_mapcount(struct page *page) } EXPORT_SYMBOL_GPL(__page_mapcount); +int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; +int sysctl_overcommit_ratio __read_mostly = 50; +unsigned long sysctl_overcommit_kbytes __read_mostly; +int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ + int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -437,6 +442,123 @@ unsigned long vm_commit_limit(void) return allowed; } +/* + * Make sure vm_committed_as in one cacheline and not cacheline shared with + * other variables. It can be updated by several CPUs frequently. + */ +struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp; + +/* + * The global memory commitment made in the system can be a metric + * that can be used to drive ballooning decisions when Linux is hosted + * as a guest. On Hyper-V, the host implements a policy engine for dynamically + * balancing memory across competing virtual machines that are hosted. + * Several metrics drive this policy engine including the guest reported + * memory commitment. + */ +unsigned long vm_memory_committed(void) +{ + return percpu_counter_read_positive(&vm_committed_as); +} +EXPORT_SYMBOL_GPL(vm_memory_committed); + +/* + * Check that a process has enough memory to allocate a new virtual + * mapping. 0 means there is enough memory for the allocation to + * succeed and -ENOMEM implies there is not. + * + * We currently support three overcommit policies, which are set via the + * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting + * + * Strict overcommit modes added 2002 Feb 26 by Alan Cox. + * Additional code 2002 Jul 20 by Robert Love. + * + * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise. + * + * Note this is a helper function intended to be used by LSMs which + * wish to use this logic. + */ +int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) +{ + long free, allowed, reserve; + + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + + vm_acct_memory(pages); + + /* + * Sometimes we want to use more memory than we have + */ + if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS) + return 0; + + if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); + + free += get_nr_swap_pages(); + + /* + * Any slabs which are created with the + * SLAB_RECLAIM_ACCOUNT flag claim to have contents + * which are reclaimable, under pressure. The dentry + * cache and most inode caches should fall into this + */ + free += global_page_state(NR_SLAB_RECLAIMABLE); + + /* + * Leave reserved pages. The pages are not for anonymous pages. + */ + if (free <= totalreserve_pages) + goto error; + else + free -= totalreserve_pages; + + /* + * Reserve some for root + */ + if (!cap_sys_admin) + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + if (free > pages) + return 0; + + goto error; + } + + allowed = vm_commit_limit(); + /* + * Reserve some for root + */ + if (!cap_sys_admin) + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); + } + + if (percpu_counter_read_positive(&vm_committed_as) < allowed) + return 0; +error: + vm_unacct_memory(pages); + + return -ENOMEM; +} + /** * get_cmdline() - copy the cmdline value to a buffer. * @task: the task whose cmdline value to copy. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fb42a5bffe47..ae7d20b447ff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -469,8 +469,8 @@ overflow: goto retry; } if (printk_ratelimit()) - pr_warn("vmap allocation for size %lu failed: " - "use vmalloc=<size> to increase size.\n", size); + pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", + size); kfree(va); return ERR_PTR(-EBUSY); } @@ -531,22 +531,21 @@ static void unmap_vmap_area(struct vmap_area *va) static void vmap_debug_free_range(unsigned long start, unsigned long end) { /* - * Unmap page tables and force a TLB flush immediately if - * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free - * bugs similarly to those in linear kernel virtual address - * space after a page has been freed. + * Unmap page tables and force a TLB flush immediately if pagealloc + * debugging is enabled. This catches use after free bugs similarly to + * those in linear kernel virtual address space after a page has been + * freed. * - * All the lazy freeing logic is still retained, in order to - * minimise intrusiveness of this debugging feature. + * All the lazy freeing logic is still retained, in order to minimise + * intrusiveness of this debugging feature. * - * This is going to be *slow* (linear kernel virtual address - * debugging doesn't do a broadcast TLB flush so it is a lot - * faster). + * This is going to be *slow* (linear kernel virtual address debugging + * doesn't do a broadcast TLB flush so it is a lot faster). */ -#ifdef CONFIG_DEBUG_PAGEALLOC - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); -#endif + if (debug_pagealloc_enabled()) { + vunmap_page_range(start, end); + flush_tlb_kernel_range(start, end); + } } /* @@ -1086,7 +1085,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); - BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE)); + BUG_ON(!PAGE_ALIGNED(addr)); debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); diff --git a/mm/vmscan.c b/mm/vmscan.c index 71b1c29948db..b934223eaa45 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -195,25 +195,25 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE) + - zone_page_state(zone, NR_ISOLATED_FILE); + nr = zone_page_state_snapshot(zone, NR_ACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_INACTIVE_FILE) + + zone_page_state_snapshot(zone, NR_ISOLATED_FILE); if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON) + - zone_page_state(zone, NR_ISOLATED_ANON); + nr += zone_page_state_snapshot(zone, NR_ACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_INACTIVE_ANON) + + zone_page_state_snapshot(zone, NR_ISOLATED_ANON); return nr; } bool zone_reclaimable(struct zone *zone) { - return zone_page_state(zone, NR_PAGES_SCANNED) < + return zone_page_state_snapshot(zone, NR_PAGES_SCANNED) < zone_reclaimable_pages(zone) * 6; } -static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) return mem_cgroup_get_lru_size(lruvec, lru); @@ -228,14 +228,6 @@ int register_shrinker(struct shrinker *shrinker) { size_t size = sizeof(*shrinker->nr_deferred); - /* - * If we only have one possible node in the system anyway, save - * ourselves the trouble and disable NUMA aware behavior. This way we - * will save memory and some small loop time later. - */ - if (nr_node_ids == 1) - shrinker->flags &= ~SHRINKER_NUMA_AWARE; - if (shrinker->flags & SHRINKER_NUMA_AWARE) size *= nr_node_ids; @@ -390,9 +382,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * * @memcg specifies the memory cgroup to target. If it is not NULL, * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan - * objects from the memory cgroup specified. Otherwise all shrinkers - * are called, and memcg aware shrinkers are supposed to scan the - * global list then. + * objects from the memory cgroup specified. Otherwise, only unaware + * shrinkers are called. * * @nr_scanned and @nr_eligible form a ratio that indicate how much of * the available objects should be scanned. Page reclaim for example @@ -412,7 +403,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct shrinker *shrinker; unsigned long freed = 0; - if (memcg && !memcg_kmem_online(memcg)) + if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; if (nr_scanned == 0) @@ -436,7 +427,13 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, .memcg = memcg, }; - if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) + /* + * If kernel memory accounting is disabled, we ignore + * SHRINKER_MEMCG_AWARE flag and call all shrinkers + * passing NULL for memcg. + */ + if (memcg_kmem_enabled() && + !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) continue; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) @@ -611,12 +608,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; - struct mem_cgroup *memcg; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - memcg = mem_cgroup_begin_page_stat(page); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. @@ -643,11 +638,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (!page_freeze_refs(page, 2)) + if (!page_ref_freeze(page, 2)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { - page_unfreeze_refs(page, 2); + page_ref_unfreeze(page, 2); goto cannot_free; } @@ -656,7 +651,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -682,9 +676,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow, memcg); + __delete_from_page_cache(page, shadow); spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); if (freepage != NULL) freepage(page); @@ -694,7 +687,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); - mem_cgroup_end_page_stat(memcg); return 0; } @@ -712,7 +704,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) * drops the pagecache ref for us without requiring another * atomic operation. */ - page_unfreeze_refs(page, 1); + page_ref_unfreeze(page, 1); return 1; } return 0; @@ -1931,8 +1923,8 @@ static bool inactive_file_is_low(struct lruvec *lruvec) unsigned long inactive; unsigned long active; - inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); - active = get_lru_size(lruvec, LRU_ACTIVE_FILE); + inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); + active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); return active > inactive; } @@ -2071,7 +2063,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * system is under heavy pressure. */ if (!inactive_file_is_low(lruvec) && - get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; } @@ -2097,10 +2089,10 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anon in [0], file in [1] */ - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { @@ -2138,7 +2130,7 @@ out: unsigned long size; unsigned long scan; - size = get_lru_size(lruvec, lru); + size = lruvec_lru_size(lruvec, lru); scan = size >> sc->priority; if (!scan && pass && force_scan) @@ -2981,18 +2973,23 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, - unsigned long balance_gap, int classzone_idx) +static bool zone_balanced(struct zone *zone, int order, bool highorder, + unsigned long balance_gap, int classzone_idx) { - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx)) - return false; + unsigned long mark = high_wmark_pages(zone) + balance_gap; - if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, - order, 0, classzone_idx) == COMPACT_SKIPPED) - return false; + /* + * When checking from pgdat_balanced(), kswapd should stop and sleep + * when it reaches the high order-0 watermark and let kcompactd take + * over. Other callers such as wakeup_kswapd() want to determine the + * true high-order watermark. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && !highorder) { + mark += (1UL << order); + order = 0; + } - return true; + return zone_watermark_ok_safe(zone, order, mark, classzone_idx); } /* @@ -3042,7 +3039,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) continue; } - if (zone_balanced(zone, order, 0, i)) + if (zone_balanced(zone, order, false, 0, i)) balanced_pages += zone->managed_pages; else if (!order) return false; @@ -3096,10 +3093,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, */ static bool kswapd_shrink_zone(struct zone *zone, int classzone_idx, - struct scan_control *sc, - unsigned long *nr_attempted) + struct scan_control *sc) { - int testorder = sc->order; unsigned long balance_gap; bool lowmem_pressure; @@ -3107,17 +3102,6 @@ static bool kswapd_shrink_zone(struct zone *zone, sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); /* - * Kswapd reclaims only single pages with compaction enabled. Trying - * too hard to reclaim until contiguous free pages have become - * available can hurt performance by evicting too much useful data - * from memory. Do not reclaim more than needed for compaction. - */ - if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && - compaction_suitable(zone, sc->order, 0, classzone_idx) - != COMPACT_SKIPPED) - testorder = 0; - - /* * We put equal pressure on every zone, unless one zone has way too * many pages free already. The "too many pages" is defined as the * high wmark plus a "gap" where the gap is either the low @@ -3131,15 +3115,12 @@ static bool kswapd_shrink_zone(struct zone *zone, * reclaim is necessary */ lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); - if (!lowmem_pressure && zone_balanced(zone, testorder, + if (!lowmem_pressure && zone_balanced(zone, sc->order, false, balance_gap, classzone_idx)) return true; shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); - /* Account for the number of pages attempted to reclaim */ - *nr_attempted += sc->nr_to_reclaim; - clear_bit(ZONE_WRITEBACK, &zone->flags); /* @@ -3149,7 +3130,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * waits. */ if (zone_reclaimable(zone) && - zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_balanced(zone, sc->order, false, 0, classzone_idx)) { clear_bit(ZONE_CONGESTED, &zone->flags); clear_bit(ZONE_DIRTY, &zone->flags); } @@ -3161,7 +3142,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the final order kswapd was reclaiming at + * Returns the highest zone idx kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -3178,8 +3159,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int *classzone_idx) +static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -3196,9 +3176,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, count_vm_event(PAGEOUTRUN); do { - unsigned long nr_attempted = 0; bool raise_priority = true; - bool pgdat_needs_compaction = (order > 0); sc.nr_reclaimed = 0; @@ -3233,7 +3211,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, break; } - if (!zone_balanced(zone, order, 0, 0)) { + if (!zone_balanced(zone, order, false, 0, 0)) { end_zone = i; break; } else { @@ -3249,24 +3227,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (i < 0) goto out; - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* - * If any zone is currently balanced then kswapd will - * not call compaction as it is expected that the - * necessary pages are already available. - */ - if (pgdat_needs_compaction && - zone_watermark_ok(zone, order, - low_wmark_pages(zone), - *classzone_idx, 0)) - pgdat_needs_compaction = false; - } - /* * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. @@ -3310,8 +3270,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * that that high watermark would be met at 100% * efficiency. */ - if (kswapd_shrink_zone(zone, end_zone, - &sc, &nr_attempted)) + if (kswapd_shrink_zone(zone, end_zone, &sc)) raise_priority = false; } @@ -3324,49 +3283,29 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pfmemalloc_watermark_ok(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); - /* - * Fragmentation may mean that the system cannot be rebalanced - * for high-order allocations in all zones. If twice the - * allocation size has been reclaimed and the zones are still - * not balanced then recheck the watermarks at order-0 to - * prevent kswapd reclaiming excessively. Assume that a - * process requested a high-order can direct reclaim/compact. - */ - if (order && sc.nr_reclaimed >= 2UL << order) - order = sc.order = 0; - /* Check if kswapd should be suspending */ if (try_to_freeze() || kthread_should_stop()) break; /* - * Compact if necessary and kswapd is reclaiming at least the - * high watermark number of pages as requsted - */ - if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) - compact_pgdat(pgdat, order); - - /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; } while (sc.priority >= 1 && - !pgdat_balanced(pgdat, order, *classzone_idx)); + !pgdat_balanced(pgdat, order, classzone_idx)); out: /* - * Return the order we were reclaiming at so prepare_kswapd_sleep() - * makes a decision on the order we were last reclaiming at. However, - * if another caller entered the allocator slow path while kswapd - * was awake, order will remain at the higher level + * Return the highest zone idx we were reclaiming at so + * prepare_kswapd_sleep() makes the same decisions as here. */ - *classzone_idx = end_zone; - return order; + return end_zone; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, + int classzone_idx, int balanced_classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -3377,7 +3316,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -3387,7 +3327,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, + balanced_classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -3408,6 +3349,12 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ reset_isolation_suitable(pgdat); + /* + * We have freed the memory, now we should compact it to make + * allocation of the requested order possible. + */ + wakeup_kcompactd(pgdat, order, classzone_idx); + if (!kthread_should_stop()) schedule(); @@ -3437,7 +3384,6 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; - unsigned balanced_order; int classzone_idx, new_classzone_idx; int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; @@ -3470,24 +3416,19 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; - balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; balanced_classzone_idx = classzone_idx; for ( ; ; ) { bool ret; /* - * If the last balance_pgdat was unsuccessful it's unlikely a - * new request of a similar or harder type will succeed soon - * so consider going to sleep on the basis we reclaimed at + * While we were reclaiming, there might have been another + * wakeup, so check the values. */ - if (balanced_classzone_idx >= new_classzone_idx && - balanced_order == new_order) { - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = pgdat->nr_zones - 1; - } + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; if (order < new_order || classzone_idx > new_classzone_idx) { /* @@ -3497,7 +3438,7 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, balanced_order, + kswapd_try_to_sleep(pgdat, order, classzone_idx, balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; @@ -3517,9 +3458,8 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balanced_classzone_idx = classzone_idx; - balanced_order = balance_pgdat(pgdat, order, - &balanced_classzone_idx); + balanced_classzone_idx = balance_pgdat(pgdat, order, + classzone_idx); } } @@ -3549,7 +3489,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_balanced(zone, order, 0, 0)) + if (zone_balanced(zone, order, true, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); diff --git a/mm/vmstat.c b/mm/vmstat.c index 084c6725b373..5e4300482897 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -826,6 +826,7 @@ const char * const vmstat_text[] = { "compact_stall", "compact_fail", "compact_success", + "compact_daemon_wake", #endif #ifdef CONFIG_HUGETLB_PAGE @@ -847,6 +848,7 @@ const char * const vmstat_text[] = { "thp_collapse_alloc_failed", "thp_split_page", "thp_split_page_failed", + "thp_deferred_split_page", "thp_split_pmd", "thp_zero_page_alloc", "thp_zero_page_alloc_failed", @@ -924,19 +926,6 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, #endif #ifdef CONFIG_PROC_FS -static char * const migratetype_names[MIGRATE_TYPES] = { - "Unmovable", - "Movable", - "Reclaimable", - "HighAtomic", -#ifdef CONFIG_CMA - "CMA", -#endif -#ifdef CONFIG_MEMORY_ISOLATION - "Isolate", -#endif -}; - static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -1133,7 +1122,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) #ifdef CONFIG_PAGE_OWNER int mtype; - if (!page_owner_inited) + if (!static_branch_unlikely(&page_owner_inited)) return; drain_all_pages(NULL); diff --git a/mm/workingset.c b/mm/workingset.c index 61ead9e5549d..8a75f8d2916a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -152,8 +152,25 @@ * refault distance will immediately activate the refaulting page. */ -static void *pack_shadow(unsigned long eviction, struct zone *zone) +#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \ + ZONES_SHIFT + NODES_SHIFT + \ + MEM_CGROUP_ID_SHIFT) +#define EVICTION_MASK (~0UL >> EVICTION_SHIFT) + +/* + * Eviction timestamps need to be able to cover the full range of + * actionable refaults. However, bits are tight in the radix tree + * entry, and after storing the identifier for the lruvec there might + * not be enough left to represent every single actionable refault. In + * that case, we have to sacrifice granularity for distance, and group + * evictions into coarser buckets by shaving off lower timestamp bits. + */ +static unsigned int bucket_order __read_mostly; + +static void *pack_shadow(int memcgid, struct zone *zone, unsigned long eviction) { + eviction >>= bucket_order; + eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); @@ -161,45 +178,23 @@ static void *pack_shadow(unsigned long eviction, struct zone *zone) return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); } -static void unpack_shadow(void *shadow, - struct zone **zone, - unsigned long *distance) +static void unpack_shadow(void *shadow, int *memcgidp, struct zone **zonep, + unsigned long *evictionp) { unsigned long entry = (unsigned long)shadow; - unsigned long eviction; - unsigned long refault; - unsigned long mask; - int zid, nid; + int memcgid, nid, zid; entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; zid = entry & ((1UL << ZONES_SHIFT) - 1); entry >>= ZONES_SHIFT; nid = entry & ((1UL << NODES_SHIFT) - 1); entry >>= NODES_SHIFT; - eviction = entry; + memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1); + entry >>= MEM_CGROUP_ID_SHIFT; - *zone = NODE_DATA(nid)->node_zones + zid; - - refault = atomic_long_read(&(*zone)->inactive_age); - mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + - RADIX_TREE_EXCEPTIONAL_SHIFT); - /* - * The unsigned subtraction here gives an accurate distance - * across inactive_age overflows in most cases. - * - * There is a special case: usually, shadow entries have a - * short lifetime and are either refaulted or reclaimed along - * with the inode before they get too old. But it is not - * impossible for the inactive_age to lap a shadow entry in - * the field, which can then can result in a false small - * refault distance, leading to a false activation should this - * old entry actually refault again. However, earlier kernels - * used to deactivate unconditionally with *every* reclaim - * invocation for the longest time, so the occasional - * inappropriate activation leading to pressure on the active - * list is not a problem. - */ - *distance = (refault - eviction) & mask; + *memcgidp = memcgid; + *zonep = NODE_DATA(nid)->node_zones + zid; + *evictionp = entry << bucket_order; } /** @@ -212,11 +207,20 @@ static void unpack_shadow(void *shadow, */ void *workingset_eviction(struct address_space *mapping, struct page *page) { + struct mem_cgroup *memcg = page_memcg(page); struct zone *zone = page_zone(page); + int memcgid = mem_cgroup_id(memcg); unsigned long eviction; + struct lruvec *lruvec; - eviction = atomic_long_inc_return(&zone->inactive_age); - return pack_shadow(eviction, zone); + /* Page is fully exclusive and pins page->mem_cgroup */ + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!PageLocked(page), page); + + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + eviction = atomic_long_inc_return(&lruvec->inactive_age); + return pack_shadow(memcgid, zone, eviction); } /** @@ -231,12 +235,64 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) bool workingset_refault(void *shadow) { unsigned long refault_distance; + unsigned long active_file; + struct mem_cgroup *memcg; + unsigned long eviction; + struct lruvec *lruvec; + unsigned long refault; struct zone *zone; + int memcgid; + + unpack_shadow(shadow, &memcgid, &zone, &eviction); + + rcu_read_lock(); + /* + * Look up the memcg associated with the stored ID. It might + * have been deleted since the page's eviction. + * + * Note that in rare events the ID could have been recycled + * for a new cgroup that refaults a shared page. This is + * impossible to tell from the available data. However, this + * should be a rare and limited disturbance, and activations + * are always speculative anyway. Ultimately, it's the aging + * algorithm's job to shake out the minimum access frequency + * for the active cache. + * + * XXX: On !CONFIG_MEMCG, this will always return NULL; it + * would be better if the root_mem_cgroup existed in all + * configurations instead. + */ + memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !memcg) { + rcu_read_unlock(); + return false; + } + lruvec = mem_cgroup_zone_lruvec(zone, memcg); + refault = atomic_long_read(&lruvec->inactive_age); + active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + rcu_read_unlock(); + + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + refault_distance = (refault - eviction) & EVICTION_MASK; - unpack_shadow(shadow, &zone, &refault_distance); inc_zone_state(zone, WORKINGSET_REFAULT); - if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + if (refault_distance <= active_file) { inc_zone_state(zone, WORKINGSET_ACTIVATE); return true; } @@ -249,7 +305,22 @@ bool workingset_refault(void *shadow) */ void workingset_activation(struct page *page) { - atomic_long_inc(&page_zone(page)->inactive_age); + struct lruvec *lruvec; + + lock_page_memcg(page); + /* + * Filter non-memcg pages here, e.g. unmap can call + * mark_page_accessed() on VDSO pages. + * + * XXX: See workingset_refault() - this should return + * root_mem_cgroup even for !CONFIG_MEMCG. + */ + if (!mem_cgroup_disabled() && !page_memcg(page)) + goto out; + lruvec = mem_cgroup_zone_lruvec(page_zone(page), page_memcg(page)); + atomic_long_inc(&lruvec->inactive_age); +out: + unlock_page_memcg(page); } /* @@ -278,7 +349,13 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); local_irq_enable(); - pages = node_present_pages(sc->nid); + if (memcg_kmem_enabled()) + pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + else + pages = node_page_state(sc->nid, NR_ACTIVE_FILE) + + node_page_state(sc->nid, NR_INACTIVE_FILE); + /* * Active cache pages are limited to 50% of memory, and shadow * entries that represent a refault distance bigger than that @@ -387,7 +464,7 @@ static struct shrinker workingset_shadow_shrinker = { .count_objects = count_shadow_nodes, .scan_objects = scan_shadow_nodes, .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, + .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, }; /* @@ -398,8 +475,25 @@ static struct lock_class_key shadow_nodes_key; static int __init workingset_init(void) { + unsigned int timestamp_bits; + unsigned int max_order; int ret; + BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT); + /* + * Calculate the eviction bucket size to cover the longest + * actionable refault distance, which is currently half of + * memory (totalram_pages/2). However, memory hotplug may add + * some more pages at runtime, so keep working with up to + * double the initial memory by using totalram_pages as-is. + */ + timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT; + max_order = fls_long(totalram_pages - 1); + if (max_order > timestamp_bits) + bucket_order = max_order - timestamp_bits; + printk("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", + timestamp_bits, max_order, bucket_order); + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); if (ret) goto err; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 2d7c4c11fc63..e72efb109fde 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -281,7 +281,6 @@ struct mapping_area { #endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ - bool huge; }; static int create_handle_cache(struct zs_pool *pool) @@ -495,6 +494,8 @@ static void __exit zs_stat_exit(void) debugfs_remove_recursive(zs_stat_root); } +static unsigned long zs_can_compact(struct size_class *class); + static int zs_stats_size_show(struct seq_file *s, void *v) { int i; @@ -502,14 +503,15 @@ static int zs_stats_size_show(struct seq_file *s, void *v) struct size_class *class; int objs_per_zspage; unsigned long class_almost_full, class_almost_empty; - unsigned long obj_allocated, obj_used, pages_used; + unsigned long obj_allocated, obj_used, pages_used, freeable; unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; + unsigned long total_freeable = 0; - seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", + seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n", "class", "size", "almost_full", "almost_empty", "obj_allocated", "obj_used", "pages_used", - "pages_per_zspage"); + "pages_per_zspage", "freeable"); for (i = 0; i < zs_size_classes; i++) { class = pool->size_class[i]; @@ -522,6 +524,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); obj_used = zs_stat_get(class, OBJ_USED); + freeable = zs_can_compact(class); spin_unlock(&class->lock); objs_per_zspage = get_maxobj_per_zspage(class->size, @@ -529,23 +532,25 @@ static int zs_stats_size_show(struct seq_file *s, void *v) pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; - seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", + seq_printf(s, " %5u %5u %11lu %12lu %13lu" + " %10lu %10lu %16d %8lu\n", i, class->size, class_almost_full, class_almost_empty, obj_allocated, obj_used, pages_used, - class->pages_per_zspage); + class->pages_per_zspage, freeable); total_class_almost_full += class_almost_full; total_class_almost_empty += class_almost_empty; total_objs += obj_allocated; total_used_objs += obj_used; total_pages += pages_used; + total_freeable += freeable; } seq_puts(s, "\n"); - seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", + seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n", "Total", "", total_class_almost_full, total_class_almost_empty, total_objs, - total_used_objs, total_pages); + total_used_objs, total_pages, "", total_freeable); return 0; } @@ -1127,11 +1132,9 @@ static void __zs_unmap_object(struct mapping_area *area, goto out; buf = area->vm_buf; - if (!area->huge) { - buf = buf + ZS_HANDLE_SIZE; - size -= ZS_HANDLE_SIZE; - off += ZS_HANDLE_SIZE; - } + buf = buf + ZS_HANDLE_SIZE; + size -= ZS_HANDLE_SIZE; + off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; |