diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-10-28 16:26:12 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-28 16:26:12 +0100 |
commit | 7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch) | |
tree | e730a4565e0318140d2fbd2f0415d18a339d7336 /mm | |
parent | x86, pci: introduce config option for pci reroute quirks (was: [PATCH 0/3] Bo... (diff) | |
parent | Linux 2.6.28-rc2 (diff) | |
download | linux-7a9787e1eba95a166265e6a260cf30af04ef0a99.tar.xz linux-7a9787e1eba95a166265e6a260cf30af04ef0a99.zip |
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 23 | ||||
-rw-r--r-- | mm/Makefile | 6 | ||||
-rw-r--r-- | mm/allocpercpu.c | 24 | ||||
-rw-r--r-- | mm/bootmem.c | 948 | ||||
-rw-r--r-- | mm/bounce.c | 2 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 474 | ||||
-rw-r--r-- | mm/filemap_xip.c | 70 | ||||
-rw-r--r-- | mm/fremap.c | 30 | ||||
-rw-r--r-- | mm/highmem.c | 6 | ||||
-rw-r--r-- | mm/hugetlb.c | 1733 | ||||
-rw-r--r-- | mm/internal.h | 192 | ||||
-rw-r--r-- | mm/madvise.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 772 | ||||
-rw-r--r-- | mm/memory.c | 421 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 99 | ||||
-rw-r--r-- | mm/mempolicy.c | 21 | ||||
-rw-r--r-- | mm/migrate.c | 323 | ||||
-rw-r--r-- | mm/mlock.c | 445 | ||||
-rw-r--r-- | mm/mm_init.c | 152 | ||||
-rw-r--r-- | mm/mmap.c | 267 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 277 | ||||
-rw-r--r-- | mm/mmzone.c | 2 | ||||
-rw-r--r-- | mm/mprotect.c | 9 | ||||
-rw-r--r-- | mm/mremap.c | 14 | ||||
-rw-r--r-- | mm/nommu.c | 69 | ||||
-rw-r--r-- | mm/oom_kill.c | 6 | ||||
-rw-r--r-- | mm/page-writeback.c | 34 | ||||
-rw-r--r-- | mm/page_alloc.c | 304 | ||||
-rw-r--r-- | mm/page_cgroup.c | 256 | ||||
-rw-r--r-- | mm/page_isolation.c | 13 | ||||
-rw-r--r-- | mm/pdflush.c | 6 | ||||
-rw-r--r-- | mm/quicklist.c | 9 | ||||
-rw-r--r-- | mm/readahead.c | 10 | ||||
-rw-r--r-- | mm/rmap.c | 380 | ||||
-rw-r--r-- | mm/shmem.c | 118 | ||||
-rw-r--r-- | mm/shmem_acl.c | 2 | ||||
-rw-r--r-- | mm/slab.c | 64 | ||||
-rw-r--r-- | mm/slob.c | 28 | ||||
-rw-r--r-- | mm/slub.c | 149 | ||||
-rw-r--r-- | mm/sparse.c | 116 | ||||
-rw-r--r-- | mm/swap.c | 183 | ||||
-rw-r--r-- | mm/swap_state.c | 47 | ||||
-rw-r--r-- | mm/swapfile.c | 90 | ||||
-rw-r--r-- | mm/tiny-shmem.c | 27 | ||||
-rw-r--r-- | mm/truncate.c | 22 | ||||
-rw-r--r-- | mm/util.c | 70 | ||||
-rw-r--r-- | mm/vmalloc.c | 1056 | ||||
-rw-r--r-- | mm/vmscan.c | 1117 | ||||
-rw-r--r-- | mm/vmstat.c | 124 |
50 files changed, 7869 insertions, 2747 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c4de85285bb4..5b5790f8a816 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT # with gcc 3.4 and later. # config SPARSEMEM_STATIC - def_bool n + bool # # Architecture platforms which require a two level mem_section in SPARSEMEM @@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME depends on SPARSEMEM && !SPARSEMEM_STATIC config SPARSEMEM_VMEMMAP_ENABLE - def_bool n + bool config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" @@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS config MIGRATION bool "Page migration" def_bool y - depends on NUMA + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE help Allows the migration of the physical location of pages of processes while the virtual addresses are not changed. This is useful for @@ -187,6 +187,9 @@ config RESOURCES_64BIT help This option allows memory and IO resources to be 64 bit. +config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + config ZONE_DMA_FLAG int default "0" if !ZONE_DMA @@ -205,3 +208,17 @@ config NR_QUICK config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS + +config UNEVICTABLE_LRU + bool "Add LRU list to track non-evictable pages" + default y + depends on MMU + help + Keeps unevictable pages off of the active and inactive pageout + lists, so kswapd will not waste CPU time or have its balancing + algorithms thrown off by scanning these pages. Selecting this + will use one page flag and increase the code size a little, + say Y unless you know what you are doing. + +config MMU_NOTIFIER + bool diff --git a/mm/Makefile b/mm/Makefile index 18c143b3c46c..c06b45a1ff5f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o $(mmu-y) + page_isolation.o mm_init.o $(mmu-y) obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o @@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o @@ -32,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o - +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index 05f2b4009ccc..4297bc41bfd2 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -18,27 +18,28 @@ * Depopulating per-cpu data for a cpu going offline would be a typical * use case. You need to register a cpu hotplug handler for that purpose. */ -void percpu_depopulate(void *__pdata, int cpu) +static void percpu_depopulate(void *__pdata, int cpu) { struct percpu_data *pdata = __percpu_disguise(__pdata); kfree(pdata->ptrs[cpu]); pdata->ptrs[cpu] = NULL; } -EXPORT_SYMBOL_GPL(percpu_depopulate); /** * percpu_depopulate_mask - depopulate per-cpu data for some cpu's * @__pdata: per-cpu data to depopulate * @mask: depopulate per-cpu data for cpu's selected through mask bits */ -void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) +static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) { int cpu; - for_each_cpu_mask(cpu, *mask) + for_each_cpu_mask_nr(cpu, *mask) percpu_depopulate(__pdata, cpu); } -EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); + +#define percpu_depopulate_mask(__pdata, mask) \ + __percpu_depopulate_mask((__pdata), &(mask)) /** * percpu_populate - populate per-cpu data for given cpu @@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); * use case. You need to register a cpu hotplug handler for that purpose. * Per-cpu object is populated with zeroed buffer. */ -void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) +static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) { struct percpu_data *pdata = __percpu_disguise(__pdata); int node = cpu_to_node(cpu); @@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) pdata->ptrs[cpu] = kzalloc(size, gfp); return pdata->ptrs[cpu]; } -EXPORT_SYMBOL_GPL(percpu_populate); /** * percpu_populate_mask - populate per-cpu data for more cpu's @@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate); * * Per-cpu objects are populated with zeroed buffers. */ -int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, - cpumask_t *mask) +static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, + cpumask_t *mask) { cpumask_t populated; int cpu; cpus_clear(populated); - for_each_cpu_mask(cpu, *mask) + for_each_cpu_mask_nr(cpu, *mask) if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { __percpu_depopulate_mask(__pdata, &populated); return -ENOMEM; @@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, cpu_set(cpu, populated); return 0; } -EXPORT_SYMBOL_GPL(__percpu_populate_mask); + +#define percpu_populate_mask(__pdata, size, gfp, mask) \ + __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) /** * percpu_alloc_mask - initial setup of per-cpu data diff --git a/mm/bootmem.c b/mm/bootmem.c index 8d9f60e06f62..ac5a891f142a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -1,12 +1,12 @@ /* - * linux/mm/bootmem.c + * bootmem - A boot-time physical memory allocator and configurator * * Copyright (C) 1999 Ingo Molnar - * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 + * 1999 Kanoj Sarcar, SGI + * 2008 Johannes Weiner * - * simple boot-time physical memory area allocator and - * free memory collector. It's used to deal with reserved - * system memory and memory holes as well. + * Access to this subsystem has to be serialized externally (which is true + * for the boot process anyway). */ #include <linux/init.h> #include <linux/pfn.h> @@ -19,15 +19,10 @@ #include "internal.h" -/* - * Access to this subsystem has to be serialized externally. (this is - * true for the boot process anyway) - */ unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; -static LIST_HEAD(bdata_list); #ifdef CONFIG_CRASH_DUMP /* * If we have booted due to a crash, max_pfn will be a very low value. We need @@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list); unsigned long saved_max_pfn; #endif -/* return the number of _pages_ that will be allocated for the boot bitmap */ -unsigned long __init bootmem_bootmap_pages(unsigned long pages) +bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; + +static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); + +static int bootmem_debug; + +static int __init bootmem_debug_setup(char *buf) { - unsigned long mapsize; + bootmem_debug = 1; + return 0; +} +early_param("bootmem_debug", bootmem_debug_setup); - mapsize = (pages+7)/8; - mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; - mapsize >>= PAGE_SHIFT; +#define bdebug(fmt, args...) ({ \ + if (unlikely(bootmem_debug)) \ + printk(KERN_INFO \ + "bootmem::%s " fmt, \ + __func__, ## args); \ +}) - return mapsize; +static unsigned long __init bootmap_bytes(unsigned long pages) +{ + unsigned long bytes = (pages + 7) / 8; + + return ALIGN(bytes, sizeof(long)); } -/* - * link bdata in order +/** + * bootmem_bootmap_pages - calculate bitmap size in pages + * @pages: number of pages the bitmap has to represent */ -static void __init link_bootmem(bootmem_data_t *bdata) +unsigned long __init bootmem_bootmap_pages(unsigned long pages) { - bootmem_data_t *ent; + unsigned long bytes = bootmap_bytes(pages); - if (list_empty(&bdata_list)) { - list_add(&bdata->list, &bdata_list); - return; - } - /* insert in order */ - list_for_each_entry(ent, &bdata_list, list) { - if (bdata->node_boot_start < ent->node_boot_start) { - list_add_tail(&bdata->list, &ent->list); - return; - } - } - list_add_tail(&bdata->list, &bdata_list); + return PAGE_ALIGN(bytes) >> PAGE_SHIFT; } /* - * Given an initialised bdata, it returns the size of the boot bitmap + * link bdata in order */ -static unsigned long __init get_mapsize(bootmem_data_t *bdata) +static void __init link_bootmem(bootmem_data_t *bdata) { - unsigned long mapsize; - unsigned long start = PFN_DOWN(bdata->node_boot_start); - unsigned long end = bdata->node_low_pfn; + struct list_head *iter; - mapsize = ((end - start) + 7) / 8; - return ALIGN(mapsize, sizeof(long)); + list_for_each(iter, &bdata_list) { + bootmem_data_t *ent; + + ent = list_entry(iter, bootmem_data_t, list); + if (bdata->node_min_pfn < ent->node_min_pfn) + break; + } + list_add_tail(&bdata->list, iter); } /* * Called once to set up the allocator itself. */ -static unsigned long __init init_bootmem_core(pg_data_t *pgdat, +static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, unsigned long mapstart, unsigned long start, unsigned long end) { - bootmem_data_t *bdata = pgdat->bdata; unsigned long mapsize; + mminit_validate_memmodel_limits(&start, &end); bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); - bdata->node_boot_start = PFN_PHYS(start); + bdata->node_min_pfn = start; bdata->node_low_pfn = end; link_bootmem(bdata); @@ -100,429 +104,484 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat, * Initially all pages are reserved - setup_arch() has to * register free RAM areas explicitly. */ - mapsize = get_mapsize(bdata); + mapsize = bootmap_bytes(end - start); memset(bdata->node_bootmem_map, 0xff, mapsize); + bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", + bdata - bootmem_node_data, start, mapstart, end, mapsize); + return mapsize; } -/* - * Marks a particular physical memory range as unallocatable. Usable RAM - * might be used for boot-time allocations - or it might get added - * to the free page pool later on. +/** + * init_bootmem_node - register a node as boot memory + * @pgdat: node to register + * @freepfn: pfn where the bitmap for this node is to be placed + * @startpfn: first pfn on the node + * @endpfn: first pfn after the node + * + * Returns the number of bytes needed to hold the bitmap for this node. */ -static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, - unsigned long addr, unsigned long size, int flags) +unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, + unsigned long startpfn, unsigned long endpfn) { - unsigned long sidx, eidx; - unsigned long i; + return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); +} - BUG_ON(!size); +/** + * init_bootmem - register boot memory + * @start: pfn where the bitmap is to be placed + * @pages: number of available physical pages + * + * Returns the number of bytes needed to hold the bitmap. + */ +unsigned long __init init_bootmem(unsigned long start, unsigned long pages) +{ + max_low_pfn = pages; + min_low_pfn = start; + return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); +} + +static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) +{ + int aligned; + struct page *page; + unsigned long start, end, pages, count = 0; - /* out of range, don't hold other */ - if (addr + size < bdata->node_boot_start || - PFN_DOWN(addr) > bdata->node_low_pfn) + if (!bdata->node_bootmem_map) return 0; + start = bdata->node_min_pfn; + end = bdata->node_low_pfn; + /* - * Round up to index to the range. + * If the start is aligned to the machines wordsize, we might + * be able to free pages in bulks of that order. */ - if (addr > bdata->node_boot_start) - sidx= PFN_DOWN(addr - bdata->node_boot_start); - else - sidx = 0; + aligned = !(start & (BITS_PER_LONG - 1)); - eidx = PFN_UP(addr + size - bdata->node_boot_start); - if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) - eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + bdebug("nid=%td start=%lx end=%lx aligned=%d\n", + bdata - bootmem_node_data, start, end, aligned); - for (i = sidx; i < eidx; i++) { - if (test_bit(i, bdata->node_bootmem_map)) { - if (flags & BOOTMEM_EXCLUSIVE) - return -EBUSY; + while (start < end) { + unsigned long *map, idx, vec; + + map = bdata->node_bootmem_map; + idx = start - bdata->node_min_pfn; + vec = ~map[idx / BITS_PER_LONG]; + + if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { + int order = ilog2(BITS_PER_LONG); + + __free_pages_bootmem(pfn_to_page(start), order); + count += BITS_PER_LONG; + } else { + unsigned long off = 0; + + while (vec && off < BITS_PER_LONG) { + if (vec & 1) { + page = pfn_to_page(start + off); + __free_pages_bootmem(page, 0); + count++; + } + vec >>= 1; + off++; + } } + start += BITS_PER_LONG; } - return 0; + page = virt_to_page(bdata->node_bootmem_map); + pages = bdata->node_low_pfn - bdata->node_min_pfn; + pages = bootmem_bootmap_pages(pages); + count += pages; + while (pages--) + __free_pages_bootmem(page++, 0); + bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); + + return count; } -static void __init reserve_bootmem_core(bootmem_data_t *bdata, - unsigned long addr, unsigned long size, int flags) +/** + * free_all_bootmem_node - release a node's free pages to the buddy allocator + * @pgdat: node to be released + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { - unsigned long sidx, eidx; - unsigned long i; + register_page_bootmem_info_node(pgdat); + return free_all_bootmem_core(pgdat->bdata); +} - BUG_ON(!size); +/** + * free_all_bootmem - release free pages to the buddy allocator + * + * Returns the number of pages actually released. + */ +unsigned long __init free_all_bootmem(void) +{ + return free_all_bootmem_core(NODE_DATA(0)->bdata); +} - /* out of range */ - if (addr + size < bdata->node_boot_start || - PFN_DOWN(addr) > bdata->node_low_pfn) - return; +static void __init __free(bootmem_data_t *bdata, + unsigned long sidx, unsigned long eidx) +{ + unsigned long idx; - /* - * Round up to index to the range. - */ - if (addr > bdata->node_boot_start) - sidx= PFN_DOWN(addr - bdata->node_boot_start); - else - sidx = 0; + bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, + sidx + bdata->node_min_pfn, + eidx + bdata->node_min_pfn); - eidx = PFN_UP(addr + size - bdata->node_boot_start); - if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) - eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); + if (bdata->hint_idx > sidx) + bdata->hint_idx = sidx; - for (i = sidx; i < eidx; i++) { - if (test_and_set_bit(i, bdata->node_bootmem_map)) { -#ifdef CONFIG_DEBUG_BOOTMEM - printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); -#endif + for (idx = sidx; idx < eidx; idx++) + if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) + BUG(); +} + +static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, + unsigned long eidx, int flags) +{ + unsigned long idx; + int exclusive = flags & BOOTMEM_EXCLUSIVE; + + bdebug("nid=%td start=%lx end=%lx flags=%x\n", + bdata - bootmem_node_data, + sidx + bdata->node_min_pfn, + eidx + bdata->node_min_pfn, + flags); + + for (idx = sidx; idx < eidx; idx++) + if (test_and_set_bit(idx, bdata->node_bootmem_map)) { + if (exclusive) { + __free(bdata, sidx, idx); + return -EBUSY; + } + bdebug("silent double reserve of PFN %lx\n", + idx + bdata->node_min_pfn); } - } + return 0; } -static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, - unsigned long size) +static int __init mark_bootmem_node(bootmem_data_t *bdata, + unsigned long start, unsigned long end, + int reserve, int flags) { unsigned long sidx, eidx; - unsigned long i; - BUG_ON(!size); + bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", + bdata - bootmem_node_data, start, end, reserve, flags); - /* out range */ - if (addr + size < bdata->node_boot_start || - PFN_DOWN(addr) > bdata->node_low_pfn) - return; - /* - * round down end of usable mem, partially free pages are - * considered reserved. - */ + BUG_ON(start < bdata->node_min_pfn); + BUG_ON(end > bdata->node_low_pfn); - if (addr >= bdata->node_boot_start && addr < bdata->last_success) - bdata->last_success = addr; + sidx = start - bdata->node_min_pfn; + eidx = end - bdata->node_min_pfn; - /* - * Round up to index to the range. - */ - if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start)) - sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); + if (reserve) + return __reserve(bdata, sidx, eidx, flags); else - sidx = 0; + __free(bdata, sidx, eidx); + return 0; +} - eidx = PFN_DOWN(addr + size - bdata->node_boot_start); - if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) - eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); +static int __init mark_bootmem(unsigned long start, unsigned long end, + int reserve, int flags) +{ + unsigned long pos; + bootmem_data_t *bdata; - for (i = sidx; i < eidx; i++) { - if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) - BUG(); + pos = start; + list_for_each_entry(bdata, &bdata_list, list) { + int err; + unsigned long max; + + if (pos < bdata->node_min_pfn || + pos >= bdata->node_low_pfn) { + BUG_ON(pos != start); + continue; + } + + max = min(bdata->node_low_pfn, end); + + err = mark_bootmem_node(bdata, pos, max, reserve, flags); + if (reserve && err) { + mark_bootmem(start, pos, 0, 0); + return err; + } + + if (max == end) + return 0; + pos = bdata->node_low_pfn; } + BUG(); } -/* - * We 'merge' subsequent allocations to save space. We might 'lose' - * some fraction of a page if allocations cannot be satisfied due to - * size constraints on boxes where there is physical RAM space - * fragmentation - in these cases (mostly large memory boxes) this - * is not a problem. - * - * On low memory boxes we get it right in 100% of the cases. +/** + * free_bootmem_node - mark a page range as usable + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes * - * alignment has to be a power of 2 value. + * Partial pages will be considered reserved and left as they are. * - * NOTE: This function is _not_ reentrant. + * The range must reside completely on the specified node. */ -void * __init -__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, - unsigned long align, unsigned long goal, unsigned long limit) +void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size) { - unsigned long areasize, preferred; - unsigned long i, start = 0, incr, eidx, end_pfn; - void *ret; - unsigned long node_boot_start; - void *node_bootmem_map; - - if (!size) { - printk("__alloc_bootmem_core(): zero-sized request\n"); - BUG(); - } - BUG_ON(align & (align-1)); + unsigned long start, end; - /* on nodes without memory - bootmem_map is NULL */ - if (!bdata->node_bootmem_map) - return NULL; - - /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ - node_boot_start = bdata->node_boot_start; - node_bootmem_map = bdata->node_bootmem_map; - if (align) { - node_boot_start = ALIGN(bdata->node_boot_start, align); - if (node_boot_start > bdata->node_boot_start) - node_bootmem_map = (unsigned long *)bdata->node_bootmem_map + - PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG; - } - - if (limit && node_boot_start >= limit) - return NULL; + start = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); - end_pfn = bdata->node_low_pfn; - limit = PFN_DOWN(limit); - if (limit && end_pfn > limit) - end_pfn = limit; + mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +} - eidx = end_pfn - PFN_DOWN(node_boot_start); +/** + * free_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * + * Partial pages will be considered reserved and left as they are. + * + * The range must be contiguous but may span node boundaries. + */ +void __init free_bootmem(unsigned long addr, unsigned long size) +{ + unsigned long start, end; - /* - * We try to allocate bootmem pages above 'goal' - * first, then we try to allocate lower pages. - */ - preferred = 0; - if (goal && PFN_DOWN(goal) < end_pfn) { - if (goal > node_boot_start) - preferred = goal - node_boot_start; - - if (bdata->last_success > node_boot_start && - bdata->last_success - node_boot_start >= preferred) - if (!limit || (limit && limit > bdata->last_success)) - preferred = bdata->last_success - node_boot_start; - } + start = PFN_UP(addr); + end = PFN_DOWN(addr + size); - preferred = PFN_DOWN(ALIGN(preferred, align)); - areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; - incr = align >> PAGE_SHIFT ? : 1; + mark_bootmem(start, end, 0, 0); +} -restart_scan: - for (i = preferred; i < eidx;) { - unsigned long j; +/** + * reserve_bootmem_node - mark a page range as reserved + * @pgdat: node the range resides on + * @physaddr: starting address of the range + * @size: size of the range in bytes + * @flags: reservation flags (see linux/bootmem.h) + * + * Partial pages will be reserved. + * + * The range must reside completely on the specified node. + */ +int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, + unsigned long size, int flags) +{ + unsigned long start, end; - i = find_next_zero_bit(node_bootmem_map, eidx, i); - i = ALIGN(i, incr); - if (i >= eidx) - break; - if (test_bit(i, node_bootmem_map)) { - i += incr; - continue; - } - for (j = i + 1; j < i + areasize; ++j) { - if (j >= eidx) - goto fail_block; - if (test_bit(j, node_bootmem_map)) - goto fail_block; - } - start = i; - goto found; - fail_block: - i = ALIGN(j, incr); - if (i == j) - i += incr; - } + start = PFN_DOWN(physaddr); + end = PFN_UP(physaddr + size); - if (preferred > 0) { - preferred = 0; - goto restart_scan; - } - return NULL; + return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +} -found: - bdata->last_success = PFN_PHYS(start) + node_boot_start; - BUG_ON(start >= eidx); +#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE +/** + * reserve_bootmem - mark a page range as usable + * @addr: starting address of the range + * @size: size of the range in bytes + * @flags: reservation flags (see linux/bootmem.h) + * + * Partial pages will be reserved. + * + * The range must be contiguous but may span node boundaries. + */ +int __init reserve_bootmem(unsigned long addr, unsigned long size, + int flags) +{ + unsigned long start, end; - /* - * Is the next page of the previous allocation-end the start - * of this allocation's buffer? If yes then we can 'merge' - * the previous partial page with this allocation. - */ - if (align < PAGE_SIZE && - bdata->last_offset && bdata->last_pos+1 == start) { - unsigned long offset, remaining_size; - offset = ALIGN(bdata->last_offset, align); - BUG_ON(offset > PAGE_SIZE); - remaining_size = PAGE_SIZE - offset; - if (size < remaining_size) { - areasize = 0; - /* last_pos unchanged */ - bdata->last_offset = offset + size; - ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + - offset + node_boot_start); - } else { - remaining_size = size - remaining_size; - areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; - ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + - offset + node_boot_start); - bdata->last_pos = start + areasize - 1; - bdata->last_offset = remaining_size; - } - bdata->last_offset &= ~PAGE_MASK; - } else { - bdata->last_pos = start + areasize - 1; - bdata->last_offset = size & ~PAGE_MASK; - ret = phys_to_virt(start * PAGE_SIZE + node_boot_start); - } + start = PFN_DOWN(addr); + end = PFN_UP(addr + size); - /* - * Reserve the area now: - */ - for (i = start; i < start + areasize; i++) - if (unlikely(test_and_set_bit(i, node_bootmem_map))) - BUG(); - memset(ret, 0, size); - return ret; + return mark_bootmem(start, end, 1, flags); } +#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ -static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) +static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, + unsigned long step) { - struct page *page; - unsigned long pfn; - bootmem_data_t *bdata = pgdat->bdata; - unsigned long i, count, total = 0; - unsigned long idx; - unsigned long *map; - int gofast = 0; - - BUG_ON(!bdata->node_bootmem_map); - - count = 0; - /* first extant page of the node */ - pfn = PFN_DOWN(bdata->node_boot_start); - idx = bdata->node_low_pfn - pfn; - map = bdata->node_bootmem_map; - /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ - if (bdata->node_boot_start == 0 || - ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG)) - gofast = 1; - for (i = 0; i < idx; ) { - unsigned long v = ~map[i / BITS_PER_LONG]; - - if (gofast && v == ~0UL) { - int order; - - page = pfn_to_page(pfn); - count += BITS_PER_LONG; - order = ffs(BITS_PER_LONG) - 1; - __free_pages_bootmem(page, order); - i += BITS_PER_LONG; - page += BITS_PER_LONG; - } else if (v) { - unsigned long m; - - page = pfn_to_page(pfn); - for (m = 1; m && i < idx; m<<=1, page++, i++) { - if (v & m) { - count++; - __free_pages_bootmem(page, 0); - } - } - } else { - i += BITS_PER_LONG; - } - pfn += BITS_PER_LONG; - } - total += count; + unsigned long base = bdata->node_min_pfn; /* - * Now free the allocator bitmap itself, it's not - * needed anymore: + * Align the index with respect to the node start so that the + * combination of both satisfies the requested alignment. */ - page = virt_to_page(bdata->node_bootmem_map); - count = 0; - idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; - for (i = 0; i < idx; i++, page++) { - __free_pages_bootmem(page, 0); - count++; - } - total += count; - bdata->node_bootmem_map = NULL; - return total; + return ALIGN(base + idx, step) - base; } -unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, - unsigned long startpfn, unsigned long endpfn) +static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, + unsigned long align) { - return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); + unsigned long base = PFN_PHYS(bdata->node_min_pfn); + + /* Same as align_idx for byte offsets */ + + return ALIGN(base + off, align) - base; } -int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size, int flags) +static void * __init alloc_bootmem_core(struct bootmem_data *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { - int ret; + unsigned long fallback = 0; + unsigned long min, max, start, sidx, midx, step; - ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); - if (ret < 0) - return -ENOMEM; - reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); + BUG_ON(!size); + BUG_ON(align & (align - 1)); + BUG_ON(limit && goal + size > limit); - return 0; -} + if (!bdata->node_bootmem_map) + return NULL; -void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, - unsigned long size) -{ - free_bootmem_core(pgdat->bdata, physaddr, size); -} + bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", + bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, + align, goal, limit); -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) -{ - register_page_bootmem_info_node(pgdat); - return free_all_bootmem_core(pgdat); -} + min = bdata->node_min_pfn; + max = bdata->node_low_pfn; -unsigned long __init init_bootmem(unsigned long start, unsigned long pages) -{ - max_low_pfn = pages; - min_low_pfn = start; - return init_bootmem_core(NODE_DATA(0), start, 0, pages); -} + goal >>= PAGE_SHIFT; + limit >>= PAGE_SHIFT; -#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE -int __init reserve_bootmem(unsigned long addr, unsigned long size, - int flags) -{ - bootmem_data_t *bdata; - int ret; + if (limit && max > limit) + max = limit; + if (max <= min) + return NULL; - list_for_each_entry(bdata, &bdata_list, list) { - ret = can_reserve_bootmem_core(bdata, addr, size, flags); - if (ret < 0) - return ret; + step = max(align >> PAGE_SHIFT, 1UL); + + if (goal && min < goal && goal < max) + start = ALIGN(goal, step); + else + start = ALIGN(min, step); + + sidx = start - bdata->node_min_pfn; + midx = max - bdata->node_min_pfn; + + if (bdata->hint_idx > sidx) { + /* + * Handle the valid case of sidx being zero and still + * catch the fallback below. + */ + fallback = sidx + 1; + sidx = align_idx(bdata, bdata->hint_idx, step); } - list_for_each_entry(bdata, &bdata_list, list) - reserve_bootmem_core(bdata, addr, size, flags); - return 0; -} -#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ + while (1) { + int merge; + void *region; + unsigned long eidx, i, start_off, end_off; +find_block: + sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); + sidx = align_idx(bdata, sidx, step); + eidx = sidx + PFN_UP(size); -void __init free_bootmem(unsigned long addr, unsigned long size) -{ - bootmem_data_t *bdata; - list_for_each_entry(bdata, &bdata_list, list) - free_bootmem_core(bdata, addr, size); -} + if (sidx >= midx || eidx > midx) + break; -unsigned long __init free_all_bootmem(void) -{ - return free_all_bootmem_core(NODE_DATA(0)); + for (i = sidx; i < eidx; i++) + if (test_bit(i, bdata->node_bootmem_map)) { + sidx = align_idx(bdata, i, step); + if (sidx == i) + sidx += step; + goto find_block; + } + + if (bdata->last_end_off & (PAGE_SIZE - 1) && + PFN_DOWN(bdata->last_end_off) + 1 == sidx) + start_off = align_off(bdata, bdata->last_end_off, align); + else + start_off = PFN_PHYS(sidx); + + merge = PFN_DOWN(start_off) < sidx; + end_off = start_off + size; + + bdata->last_end_off = end_off; + bdata->hint_idx = PFN_UP(end_off); + + /* + * Reserve the area now: + */ + if (__reserve(bdata, PFN_DOWN(start_off) + merge, + PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) + BUG(); + + region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + + start_off); + memset(region, 0, size); + return region; + } + + if (fallback) { + sidx = align_idx(bdata, fallback - 1, step); + fallback = 0; + goto find_block; + } + + return NULL; } -void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, - unsigned long goal) +static void * __init ___alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit) { bootmem_data_t *bdata; - void *ptr; +restart: list_for_each_entry(bdata, &bdata_list, list) { - ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); - if (ptr) - return ptr; + void *region; + + if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) + continue; + if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) + break; + + region = alloc_bootmem_core(bdata, size, align, goal, limit); + if (region) + return region; + } + + if (goal) { + goal = 0; + goto restart; } + return NULL; } -void * __init __alloc_bootmem(unsigned long size, unsigned long align, - unsigned long goal) +/** + * __alloc_bootmem_nopanic - allocate boot memory without panicking + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * Returns NULL on failure. + */ +void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, + unsigned long goal) { - void *mem = __alloc_bootmem_nopanic(size,align,goal); + return ___alloc_bootmem_nopanic(size, align, goal, 0); +} + +static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) +{ + void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); if (mem) return mem; @@ -534,78 +593,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return NULL; } +/** + * __alloc_bootmem - allocate boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem(unsigned long size, unsigned long align, + unsigned long goal) +{ + return ___alloc_bootmem(size, align, goal, 0); +} -void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, - unsigned long align, unsigned long goal) +static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, + unsigned long size, unsigned long align, + unsigned long goal, unsigned long limit) { void *ptr; - ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + ptr = alloc_bootmem_core(bdata, size, align, goal, limit); if (ptr) return ptr; - return __alloc_bootmem(size, align, goal); + return ___alloc_bootmem(size, align, goal, limit); +} + +/** + * __alloc_bootmem_node - allocate boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ +void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); } #ifdef CONFIG_SPARSEMEM +/** + * alloc_bootmem_section - allocate boot memory from a specific section + * @size: size of the request in bytes + * @section_nr: sparse map section to allocate from + * + * Return NULL on failure. + */ void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { - void *ptr; - unsigned long limit, goal, start_nr, end_nr, pfn; - struct pglist_data *pgdat; + bootmem_data_t *bdata; + unsigned long pfn, goal, limit; pfn = section_nr_to_pfn(section_nr); - goal = PFN_PHYS(pfn); - limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; - pgdat = NODE_DATA(early_pfn_to_nid(pfn)); - ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, - limit); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; - if (!ptr) - return NULL; + return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); +} +#endif - start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); - end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); - if (start_nr != section_nr || end_nr != section_nr) { - printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", - section_nr); - free_bootmem_core(pgdat->bdata, __pa(ptr), size); - ptr = NULL; - } +void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ + void *ptr; - return ptr; + ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); + if (ptr) + return ptr; + + return __alloc_bootmem_nopanic(size, align, goal); } -#endif #ifndef ARCH_LOW_ADDRESS_LIMIT #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL #endif +/** + * __alloc_bootmem_low - allocate low boot memory + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may happen on any node in the system. + * + * The function panics if the request can not be satisfied. + */ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal) { - bootmem_data_t *bdata; - void *ptr; - - list_for_each_entry(bdata, &bdata_list, list) { - ptr = __alloc_bootmem_core(bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); - if (ptr) - return ptr; - } - - /* - * Whoops, we cannot satisfy the allocation request. - */ - printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); - panic("Out of low memory"); - return NULL; + return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); } +/** + * __alloc_bootmem_low_node - allocate low boot memory from a specific node + * @pgdat: node to allocate from + * @size: size of the request in bytes + * @align: alignment of the region + * @goal: preferred starting address of the region + * + * The goal is dropped if it can not be satisfied and the allocation will + * fall back to memory below @goal. + * + * Allocation may fall back to any node in the system if the specified node + * can not hold the requested memory. + * + * The function panics if the request can not be satisfied. + */ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { - return __alloc_bootmem_core(pgdat->bdata, size, align, goal, - ARCH_LOW_ADDRESS_LIMIT); + return ___alloc_bootmem_node(pgdat->bdata, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); } diff --git a/mm/bounce.c b/mm/bounce.c index b6d2d0f1019b..06722c403058 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * Data-less bio, nothing to bounce */ - if (bio_empty_barrier(*bio_orig)) + if (!bio_has_data(*bio_orig)) return; /* diff --git a/mm/fadvise.c b/mm/fadvise.c index 343cfdfebd9e..a1da969bd980 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -3,7 +3,7 @@ * * Copyright (C) 2002, Linus Torvalds * - * 11Jan2003 akpm@digeo.com + * 11Jan2003 Andrew Morton * Initial version. */ diff --git a/mm/filemap.c b/mm/filemap.c index 65d9d9e2b755..ab8553658af3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> +#include <linux/mm_inline.h> /* for page_is_file_cache() */ #include "internal.h" /* @@ -42,9 +43,6 @@ #include <asm/mman.h> -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs); /* * Shared mappings implemented 30.11.1994. It's not fully working yet, @@ -112,18 +110,18 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - mem_cgroup_uncharge_page(page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); + mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -144,9 +142,9 @@ void remove_from_page_cache(struct page *page) BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } static int sync_page(void *word) @@ -445,55 +443,74 @@ int filemap_write_and_wait_range(struct address_space *mapping, } /** - * add_to_page_cache - add newly allocated pagecache pages + * add_to_page_cache_locked - add a locked page to the pagecache * @page: page to add * @mapping: the page's address_space * @offset: page index * @gfp_mask: page allocation mode * - * This function is used to add newly allocated pagecache pages; - * the page is new, so we can just run SetPageLocked() against it. - * The other page state flags were set by rmqueue(). - * + * This function is used to add a page to the pagecache. It must be locked. * This function does not add the page to the LRU. The caller must do that. */ -int add_to_page_cache(struct page *page, struct address_space *mapping, +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int error = mem_cgroup_cache_charge(page, current->mm, + int error; + + VM_BUG_ON(!PageLocked(page)); + + error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & ~__GFP_HIGHMEM); if (error) goto out; error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { - write_lock_irq(&mapping->tree_lock); + page_cache_get(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); - if (!error) { - page_cache_get(page); - SetPageLocked(page); - page->mapping = mapping; - page->index = offset; + if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - } else - mem_cgroup_uncharge_page(page); + } else { + page->mapping = NULL; + mem_cgroup_uncharge_cache_page(page); + page_cache_release(page); + } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); radix_tree_preload_end(); } else - mem_cgroup_uncharge_page(page); + mem_cgroup_uncharge_cache_page(page); out: return error; } -EXPORT_SYMBOL(add_to_page_cache); +EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add(page); + int ret; + + /* + * Splice_read and readahead add shmem/tmpfs pages into the page cache + * before shmem_readpage has a chance to mark them as SwapBacked: they + * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * (called in add_to_page_cache) needs to know where they're going too. + */ + if (mapping_cap_swap_backed(mapping)) + SetPageSwapBacked(page); + + ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) { + if (page_is_file_cache(page)) + lru_cache_add_file(page); + else + lru_cache_add_active_anon(page); + } return ret; } @@ -556,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit); * mechananism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The first mb is necessary to safely close the critical section opened by the - * TestSetPageLocked(), the second mb is necessary to enforce ordering between - * the clear_bit and the read of the waitqueue (to avoid SMP races with a - * parallel wait_on_page_locked()). + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). */ void unlock_page(struct page *page) { - smp_mb__before_clear_bit(); - if (!TestClearPageLocked(page)) - BUG(); - smp_mb__after_clear_bit(); + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -636,15 +650,35 @@ void __lock_page_nosync(struct page *page) * Is there a pagecache struct page at the given (mapping, offset) tuple? * If yes, increment its refcount and return it; if no, return NULL. */ -struct page * find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { + void **pagep; struct page *page; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); +repeat: + page = NULL; + pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + if (pagep) { + page = radix_tree_deref_slot(pagep); + if (unlikely(!page || page == RADIX_TREE_RETRY)) + goto repeat; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* + * Has the page moved? + * This is part of the lockless pagecache protocol. See + * include/linux/pagemap.h for details. + */ + if (unlikely(page != *pagep)) { + page_cache_release(page); + goto repeat; + } + } + rcu_read_unlock(); + return page; } EXPORT_SYMBOL(find_get_page); @@ -659,32 +693,22 @@ EXPORT_SYMBOL(find_get_page); * * Returns zero if the page was not present. find_lock_page() may sleep. */ -struct page *find_lock_page(struct address_space *mapping, - pgoff_t offset) +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); + page = find_get_page(mapping, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } - VM_BUG_ON(page->index != offset); - goto out; + lock_page(page); + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } + VM_BUG_ON(page->index != offset); } - read_unlock_irq(&mapping->tree_lock); -out: return page; } EXPORT_SYMBOL(find_lock_page); @@ -750,13 +774,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); return ret; } @@ -777,19 +827,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + if (page->mapping == NULL || page->index != index) break; - page_cache_get(pages[i]); + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; index++; } - read_unlock_irq(&mapping->tree_lock); - return i; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(find_get_pages_contig); @@ -809,15 +884,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, { unsigned int i; unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, + (void ***)pages, *index, nr_pages, tag); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + /* + * this can only trigger if nr_found == 1, making livelock + * a non issue. + */ + if (unlikely(page == RADIX_TREE_RETRY)) + goto restart; + + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; + } + + pages[ret] = page; + ret++; + } + rcu_read_unlock(); - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup_tag(&mapping->page_tree, - (void **)pages, *index, nr_pages, tag); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); + return ret; } EXPORT_SYMBOL(find_get_pages_tag); @@ -841,7 +944,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) struct page *page = find_get_page(mapping, index); if (page) { - if (!TestSetPageLocked(page)) + if (trylock_page(page)) return page; page_cache_release(page); return NULL; @@ -933,8 +1036,17 @@ find_page: ra, filp, page, index, last_index - index); } - if (!PageUptodate(page)) - goto page_not_up_to_date; + if (!PageUptodate(page)) { + if (inode->i_blkbits == PAGE_CACHE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + if (!mapping->a_ops->is_partially_uptodate(page, + desc, offset)) + goto page_not_up_to_date_locked; + unlock_page(page); + } page_ok: /* * i_size must be checked after we know the page is Uptodate. @@ -1001,9 +1113,11 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; +page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ if (!page->mapping) { unlock_page(page); @@ -1030,8 +1144,9 @@ readpage: } if (!PageUptodate(page)) { - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -1043,15 +1158,14 @@ readpage: } unlock_page(page); shrink_readahead_size_eio(filp, ra); - goto readpage_eio; + error = -EIO; + goto readpage_error; } unlock_page(page); } goto page_ok; -readpage_eio: - error = -EIO; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; @@ -1086,8 +1200,7 @@ out: ra->prev_pos |= prev_offset; *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; - if (filp) - file_accessed(filp); + file_accessed(filp); } int file_read_actor(read_descriptor_t *desc, struct page *page, @@ -1200,42 +1313,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, mapping = filp->f_mapping; inode = mapping->host; - retval = 0; if (!count) goto out; /* skip atime */ size = i_size_read(inode); if (pos < size) { - retval = generic_file_direct_IO(READ, iocb, - iov, pos, nr_segs); + retval = filemap_write_and_wait(mapping); + if (!retval) { + retval = mapping->a_ops->direct_IO(READ, iocb, + iov, pos, nr_segs); + } if (retval > 0) *ppos = pos + retval; - } - if (likely(retval != 0)) { - file_accessed(filp); - goto out; + if (retval) { + file_accessed(filp); + goto out; + } } } - retval = 0; - if (count) { - for (seg = 0; seg < nr_segs; seg++) { - read_descriptor_t desc; + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; - desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; - if (desc.count == 0) - continue; - desc.error = 0; - do_generic_file_read(filp,ppos,&desc,file_read_actor); - retval += desc.written; - if (desc.error) { - retval = retval ?: desc.error; - break; - } - if (desc.count > 0) - break; + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_generic_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; } + if (desc.count > 0) + break; } out: return retval; @@ -1669,8 +1781,9 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs); } -int remove_suid(struct dentry *dentry) +int file_remove_suid(struct file *file) { + struct dentry *dentry = file->f_path.dentry; int killsuid = should_remove_suid(dentry); int killpriv = security_inode_need_killpriv(dentry); int error = 0; @@ -1684,7 +1797,7 @@ int remove_suid(struct dentry *dentry) return error; } -EXPORT_SYMBOL(remove_suid); +EXPORT_SYMBOL(file_remove_suid); static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) @@ -1779,7 +1892,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) * The !iov->iov_len check ensures we skip over unlikely * zero-length segments (without overruning the iovec). */ - while (bytes || unlikely(!iov->iov_len && i->count)) { + while (bytes || unlikely(i->count && !iov->iov_len)) { int copy; copy = min(bytes, iov->iov_len - base); @@ -2004,11 +2117,62 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; ssize_t written; + size_t write_len; + pgoff_t end; if (count != ocount) *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); - written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); + /* + * Unmap all mmappings of the file up-front. + * + * This will cause any pte dirty bits to be propagated into the + * pageframes for the subsequent filemap_write_and_wait(). + */ + write_len = iov_length(iov, *nr_segs); + end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; + if (mapping_mapped(mapping)) + unmap_mapping_range(mapping, pos, write_len, 0); + + written = filemap_write_and_wait(mapping); + if (written) + goto out; + + /* + * After a write we want buffered reads to be sure to go to disk to get + * the new data. We invalidate clean cached page from the region we're + * about to write. We do this *before* the write so that we can return + * without clobbering -EIOCBQUEUED from ->direct_IO(). + */ + if (mapping->nrpages) { + written = invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. + */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; + } + } + + written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); + + /* + * Finally, try again to invalidate clean pages which might have been + * cached by non-direct readahead, or faulted in by get_user_pages() + * if the source of the write was an mmap'ed region of the file + * we're writing. Either one is a pretty crazy thing to do, + * so we don't support it 100%. If this invalidation + * fails, tough, the write still worked... + */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, end); + } + if (written > 0) { loff_t end = pos + written; if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { @@ -2024,6 +2188,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, * i_mutex is held, which protects generic_osync_inode() from * livelocking. AIO O_DIRECT ops attempt to sync metadata here. */ +out: if ((written >= 0 || written == -EIOCBQUEUED) && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); @@ -2395,7 +2560,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, if (count == 0) goto out; - err = remove_suid(file->f_path.dentry); + err = file_remove_suid(file); if (err) goto out; @@ -2511,66 +2676,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_aio_write); -/* - * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something - * went wrong during pagecache shootdown. - */ -static ssize_t -generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - ssize_t retval; - size_t write_len; - pgoff_t end = 0; /* silence gcc */ - - /* - * If it's a write, unmap all mmappings of the file up-front. This - * will cause any pte dirty bits to be propagated into the pageframes - * for the subsequent filemap_write_and_wait(). - */ - if (rw == WRITE) { - write_len = iov_length(iov, nr_segs); - end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; - if (mapping_mapped(mapping)) - unmap_mapping_range(mapping, offset, write_len, 0); - } - - retval = filemap_write_and_wait(mapping); - if (retval) - goto out; - - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. We do this *before* the write so that we can return - * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). - */ - if (rw == WRITE && mapping->nrpages) { - retval = invalidate_inode_pages2_range(mapping, - offset >> PAGE_CACHE_SHIFT, end); - if (retval) - goto out; - } - - retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); - - /* - * Finally, try again to invalidate clean pages which might have been - * cached by non-direct readahead, or faulted in by get_user_pages() - * if the source of the write was an mmap'ed region of the file - * we're writing. Either one is a pretty crazy thing to do, - * so we don't support it 100%. If this invalidation - * fails, tough, the write still worked... - */ - if (rw == WRITE && mapping->nrpages) { - invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); - } -out: - return retval; -} - /** * try_to_release_page() - release old fs-specific metadata on a page * @@ -2582,9 +2687,8 @@ out: * Otherwise return zero. * * The @gfp_mask argument specifies whether I/O may be performed to release - * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). + * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). * - * NOTE: @gfp_mask may go away, and this function may become non-blocking. */ int try_to_release_page(struct page *page, gfp_t gfp_mask) { diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 3e744abcce9d..b5167dfb2f2d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -13,7 +13,10 @@ #include <linux/module.h> #include <linux/uio.h> #include <linux/rmap.h> +#include <linux/mmu_notifier.h> #include <linux/sched.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -21,22 +24,18 @@ * We do use our own empty page to avoid interference with other users * of ZERO_PAGE(), such as /dev/zero */ +static DEFINE_MUTEX(xip_sparse_mutex); +static seqcount_t xip_sparse_seq = SEQCNT_ZERO; static struct page *__xip_sparse_page; +/* called under xip_sparse_mutex */ static struct page *xip_sparse_page(void) { if (!__xip_sparse_page) { struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); - if (page) { - static DEFINE_SPINLOCK(xip_alloc_lock); - spin_lock(&xip_alloc_lock); - if (!__xip_sparse_page) - __xip_sparse_page = page; - else - __free_page(page); - spin_unlock(&xip_alloc_lock); - } + if (page) + __xip_sparse_page = page; } return __xip_sparse_page; } @@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping, pte_t pteval; spinlock_t *ptl; struct page *page; + unsigned count; + int locked = 0; + + count = read_seqcount_begin(&xip_sparse_seq); page = __xip_sparse_page; if (!page) return; +retry: spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 1); if (pte) { /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page, vma); dec_mm_counter(mm, file_rss); BUG_ON(pte_dirty(pteval)); @@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping, } } spin_unlock(&mapping->i_mmap_lock); + + if (locked) { + mutex_unlock(&xip_sparse_mutex); + } else if (read_seqcount_retry(&xip_sparse_seq, count)) { + mutex_lock(&xip_sparse_mutex); + locked = 1; + goto retry; + } } /* @@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; /* XXX: are VM_FAULT_ codes OK? */ - +again: size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int err; /* maybe shared writable, allocate new block */ + mutex_lock(&xip_sparse_mutex); error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); if (error) return VM_FAULT_SIGBUS; /* unmap sparse mappings at pgoff from all other vmas */ @@ -251,14 +265,34 @@ found: BUG_ON(err); return VM_FAULT_NOPAGE; } else { + int err, ret = VM_FAULT_OOM; + + mutex_lock(&xip_sparse_mutex); + write_seqcount_begin(&xip_sparse_seq); + error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0, + &xip_mem, &xip_pfn); + if (unlikely(!error)) { + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + goto again; + } + if (error != -ENODATA) + goto out; /* not shared and writable, use xip_sparse_page() */ page = xip_sparse_page(); if (!page) - return VM_FAULT_OOM; + goto out; + err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, + page); + if (err == -ENOMEM) + goto out; - page_cache_get(page); - vmf->page = page; - return 0; + ret = VM_FAULT_NOPAGE; +out: + write_seqcount_end(&xip_sparse_seq); + mutex_unlock(&xip_sparse_mutex); + + return ret; } } @@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf, &xip_mem, &xip_pfn); if (status == -ENODATA) { /* we allocate a new page unmap it */ + mutex_lock(&xip_sparse_mutex); status = a_ops->get_xip_mem(mapping, index, 1, &xip_mem, &xip_pfn); + mutex_unlock(&xip_sparse_mutex); if (!status) /* unmap page at pgoff from all other vmas */ __xip_unmap(mapping, index); @@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, if (count == 0) goto out_backing; - ret = remove_suid(filp->f_path.dentry); + ret = file_remove_suid(filp); if (ret) goto out_backing; diff --git a/mm/fremap.c b/mm/fremap.c index 07a9c82ce1a3..7d12ca70ef7b 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -15,11 +15,14 @@ #include <linux/rmap.h> #include <linux/module.h> #include <linux/syscalls.h> +#include <linux/mmu_notifier.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -214,13 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, spin_unlock(&mapping->i_mmap_lock); } + if (vma->vm_flags & VM_LOCKED) { + /* + * drop PG_Mlocked flag for over-mapped range + */ + unsigned int saved_flags = vma->vm_flags; + munlock_vma_pages_range(vma, start, start + size); + vma->vm_flags = saved_flags; + } + + mmu_notifier_invalidate_range_start(mm, start, start + size); err = populate_range(mm, vma, start, size, pgoff); + mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { - if (unlikely(has_write_lock)) { - downgrade_write(&mm->mmap_sem); - has_write_lock = 0; + if (vma->vm_flags & VM_LOCKED) { + /* + * might be mapping previously unmapped range of file + */ + mlock_vma_pages_range(vma, start, start + size); + } else { + if (unlikely(has_write_lock)) { + downgrade_write(&mm->mmap_sem); + has_write_lock = 0; + } + make_pages_present(start, start+size); } - make_pages_present(start, start+size); } /* @@ -237,4 +258,3 @@ out: return err; } - diff --git a/mm/highmem.c b/mm/highmem.c index 7da4a7b6af11..b36b83b920ff 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -40,6 +40,7 @@ #ifdef CONFIG_HIGHMEM unsigned long totalhigh_pages __read_mostly; +EXPORT_SYMBOL(totalhigh_pages); unsigned int nr_free_highpages (void) { @@ -69,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); static void flush_all_zero_pkmaps(void) { int i; + int need_flush = 0; flush_cache_kmaps(); @@ -100,8 +102,10 @@ static void flush_all_zero_pkmaps(void) &pkmap_page_table[i]); set_page_address(page, NULL); + need_flush = 1; } - flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); + if (need_flush) + flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } /** diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab171274ef21..421aee99b84a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7,45 +7,360 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> +#include <linux/seq_file.h> #include <linux/sysctl.h> #include <linux/highmem.h> +#include <linux/mmu_notifier.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/cpuset.h> #include <linux/mutex.h> +#include <linux/bootmem.h> +#include <linux/sysfs.h> #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/io.h> #include <linux/hugetlb.h> #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; -static unsigned long surplus_huge_pages; -static unsigned long nr_overcommit_huge_pages; -unsigned long max_huge_pages; -unsigned long sysctl_overcommit_huge_pages; -static struct list_head hugepage_freelists[MAX_NUMNODES]; -static unsigned int nr_huge_pages_node[MAX_NUMNODES]; -static unsigned int free_huge_pages_node[MAX_NUMNODES]; -static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int hugetlb_next_nid; + +static int max_hstate; +unsigned int default_hstate_idx; +struct hstate hstates[HUGE_MAX_HSTATE]; + +__initdata LIST_HEAD(huge_boot_pages); + +/* for command line parsing */ +static struct hstate * __initdata parsed_hstate; +static unsigned long __initdata default_hstate_max_huge_pages; +static unsigned long __initdata default_hstate_size; + +#define for_each_hstate(h) \ + for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ static DEFINE_SPINLOCK(hugetlb_lock); -static void clear_huge_page(struct page *page, unsigned long addr) +/* + * Region tracking -- allows tracking of reservations and instantiated pages + * across the pages in a mapping. + * + * The region data structures are protected by a combination of the mmap_sem + * and the hugetlb_instantion_mutex. To access or modify a region the caller + * must either hold the mmap_sem for write, or the mmap_sem for read and + * the hugetlb_instantiation mutex: + * + * down_write(&mm->mmap_sem); + * or + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); + */ +struct file_region { + struct list_head link; + long from; + long to; +}; + +static long region_add(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg, *trg; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + + /* Check for and consume any regions we now overlap with. */ + nrg = rg; + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + break; + + /* If this area reaches higher then extend our area to + * include it completely. If this is not the first area + * which we intend to reuse, free it. */ + if (rg->to > t) + t = rg->to; + if (rg != nrg) { + list_del(&rg->link); + kfree(rg); + } + } + nrg->from = f; + nrg->to = t; + return 0; +} + +static long region_chg(struct list_head *head, long f, long t) +{ + struct file_region *rg, *nrg; + long chg = 0; + + /* Locate the region we are before or in. */ + list_for_each_entry(rg, head, link) + if (f <= rg->to) + break; + + /* If we are below the current region then a new region is required. + * Subtle, allocate a new region at the position but make it zero + * size such that we can guarantee to record the reservation. */ + if (&rg->link == head || t < rg->from) { + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + list_add(&nrg->link, rg->link.prev); + + return t - f; + } + + /* Round our left edge to the current segment if it encloses us. */ + if (f > rg->from) + f = rg->from; + chg = t - f; + + /* Check for and consume any regions we now overlap with. */ + list_for_each_entry(rg, rg->link.prev, link) { + if (&rg->link == head) + break; + if (rg->from > t) + return chg; + + /* We overlap with this area, if it extends futher than + * us then we must extend ourselves. Account for its + * existing reservation. */ + if (rg->to > t) { + chg += rg->to - t; + t = rg->to; + } + chg -= rg->to - rg->from; + } + return chg; +} + +static long region_truncate(struct list_head *head, long end) +{ + struct file_region *rg, *trg; + long chg = 0; + + /* Locate the region we are either in or before. */ + list_for_each_entry(rg, head, link) + if (end <= rg->to) + break; + if (&rg->link == head) + return 0; + + /* If we are in the middle of a region then adjust it. */ + if (end > rg->from) { + chg = rg->to - end; + rg->to = end; + rg = list_entry(rg->link.next, typeof(*rg), link); + } + + /* Drop any remaining regions. */ + list_for_each_entry_safe(rg, trg, rg->link.prev, link) { + if (&rg->link == head) + break; + chg += rg->to - rg->from; + list_del(&rg->link); + kfree(rg); + } + return chg; +} + +static long region_count(struct list_head *head, long f, long t) +{ + struct file_region *rg; + long chg = 0; + + /* Locate each segment we overlap with, and count that overlap. */ + list_for_each_entry(rg, head, link) { + int seg_from; + int seg_to; + + if (rg->to <= f) + continue; + if (rg->from >= t) + break; + + seg_from = max(rg->from, f); + seg_to = min(rg->to, t); + + chg += seg_to - seg_from; + } + + return chg; +} + +/* + * Convert the address within this vma to the page offset within + * the mapping, in pagecache page units; huge pages here. + */ +static pgoff_t vma_hugecache_offset(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + return ((address - vma->vm_start) >> huge_page_shift(h)) + + (vma->vm_pgoff >> huge_page_order(h)); +} + +/* + * Flags for MAP_PRIVATE reservations. These are stored in the bottom + * bits of the reservation map pointer, which are always clear due to + * alignment. + */ +#define HPAGE_RESV_OWNER (1UL << 0) +#define HPAGE_RESV_UNMAPPED (1UL << 1) +#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) + +/* + * These helpers are used to track how many pages are reserved for + * faults in a MAP_PRIVATE mapping. Only the process that called mmap() + * is guaranteed to have their future faults succeed. + * + * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * the reserve counters are updated with the hugetlb_lock held. It is safe + * to reset the VMA at fork() time as it is not in use yet and there is no + * chance of the global counters getting corrupted as a result of the values. + * + * The private mapping reservation is represented in a subtly different + * manner to a shared mapping. A shared mapping has a region map associated + * with the underlying file, this region map represents the backing file + * pages which have ever had a reservation assigned which this persists even + * after the page is instantiated. A private mapping has a region map + * associated with the original mmap which is attached to all VMAs which + * reference it, this region map represents those offsets which have consumed + * reservation ie. where pages have been instantiated. + */ +static unsigned long get_vma_private_data(struct vm_area_struct *vma) +{ + return (unsigned long)vma->vm_private_data; +} + +static void set_vma_private_data(struct vm_area_struct *vma, + unsigned long value) +{ + vma->vm_private_data = (void *)value; +} + +struct resv_map { + struct kref refs; + struct list_head regions; +}; + +static struct resv_map *resv_map_alloc(void) +{ + struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); + if (!resv_map) + return NULL; + + kref_init(&resv_map->refs); + INIT_LIST_HEAD(&resv_map->regions); + + return resv_map; +} + +static void resv_map_release(struct kref *ref) +{ + struct resv_map *resv_map = container_of(ref, struct resv_map, refs); + + /* Clear out any active regions before we release the map. */ + region_truncate(&resv_map->regions, 0); + kfree(resv_map); +} + +static struct resv_map *vma_resv_map(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + return (struct resv_map *)(get_vma_private_data(vma) & + ~HPAGE_RESV_MASK); + return NULL; +} + +static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + set_vma_private_data(vma, (get_vma_private_data(vma) & + HPAGE_RESV_MASK) | (unsigned long)map); +} + +static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + VM_BUG_ON(vma->vm_flags & VM_SHARED); + + set_vma_private_data(vma, get_vma_private_data(vma) | flags); +} + +static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + + return (get_vma_private_data(vma) & flag) != 0; +} + +/* Decrement the reserved pages in the hugepage pool by one */ +static void decrement_hugepage_resv_vma(struct hstate *h, + struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_NORESERVE) + return; + + if (vma->vm_flags & VM_SHARED) { + /* Shared mappings always use reserves */ + h->resv_huge_pages--; + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + /* + * Only the process that called mmap() has reserves for + * private mappings. + */ + h->resv_huge_pages--; + } +} + +/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ +void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +{ + VM_BUG_ON(!is_vm_hugetlb_page(vma)); + if (!(vma->vm_flags & VM_SHARED)) + vma->vm_private_data = (void *)0; +} + +/* Returns true if the VMA has associated reserve pages */ +static int vma_has_reserves(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_SHARED) + return 1; + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + return 0; +} + +static void clear_huge_page(struct page *page, + unsigned long addr, unsigned long sz) { int i; might_sleep(); - for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { + for (i = 0; i < sz/PAGE_SIZE; i++) { cond_resched(); clear_user_highpage(page + i, addr + i * PAGE_SIZE); } @@ -55,42 +370,44 @@ static void copy_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; + struct hstate *h = hstate_vma(vma); might_sleep(); - for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { + for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); } } -static void enqueue_huge_page(struct page *page) +static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &hugepage_freelists[nid]); - free_huge_pages++; - free_huge_pages_node[nid]++; + list_add(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(void) +static struct page *dequeue_huge_page(struct hstate *h) { int nid; struct page *page = NULL; for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; break; } } return page; } -static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, - unsigned long address) +static struct page *dequeue_huge_page_vma(struct hstate *h, + struct vm_area_struct *vma, + unsigned long address, int avoid_reserve) { int nid; struct page *page = NULL; @@ -101,18 +418,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, struct zone *zone; struct zoneref *z; + /* + * A child process with MAP_PRIVATE mappings created by their parent + * have no page reserves. This check ensures that reservations are + * not "stolen". The child may still get SIGKILLed + */ + if (!vma_has_reserves(vma) && + h->free_huge_pages - h->resv_huge_pages == 0) + return NULL; + + /* If reserves cannot be used, ensure enough pages are in the pool */ + if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + return NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + !list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; - if (vma && vma->vm_flags & VM_MAYSHARE) - resv_huge_pages--; + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + break; } } @@ -120,12 +452,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, return page; } -static void update_and_free_page(struct page *page) +static void update_and_free_page(struct hstate *h, struct page *page) { int i; - nr_huge_pages--; - nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { + + h->nr_huge_pages--; + h->nr_huge_pages_node[page_to_nid(page)]--; + for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); @@ -133,11 +466,27 @@ static void update_and_free_page(struct page *page) set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); +} + +struct hstate *size_to_hstate(unsigned long size) +{ + struct hstate *h; + + for_each_hstate(h) { + if (huge_page_size(h) == size) + return h; + } + return NULL; } static void free_huge_page(struct page *page) { + /* + * Can't pass hstate in here because it is called from the + * compound page destructor. + */ + struct hstate *h = page_hstate(page); int nid = page_to_nid(page); struct address_space *mapping; @@ -147,12 +496,12 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (surplus_huge_pages_node[nid]) { - update_and_free_page(page); - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + update_and_free_page(h, page); + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; } else { - enqueue_huge_page(page); + enqueue_huge_page(h, page); } spin_unlock(&hugetlb_lock); if (mapping) @@ -164,7 +513,7 @@ static void free_huge_page(struct page *page) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ -static int adjust_pool_surplus(int delta) +static int adjust_pool_surplus(struct hstate *h, int delta) { static int prev_nid; int nid = prev_nid; @@ -177,15 +526,15 @@ static int adjust_pool_surplus(int delta) nid = first_node(node_online_map); /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !surplus_huge_pages_node[nid]) + if (delta < 0 && !h->surplus_huge_pages_node[nid]) continue; /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && surplus_huge_pages_node[nid] >= - nr_huge_pages_node[nid]) + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) continue; - surplus_huge_pages += delta; - surplus_huge_pages_node[nid] += delta; + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; ret = 1; break; } while (nid != prev_nid); @@ -194,59 +543,74 @@ static int adjust_pool_surplus(int delta) return ret; } -static struct page *alloc_fresh_huge_page_node(int nid) +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +{ + set_compound_page_dtor(page, free_huge_page); + spin_lock(&hugetlb_lock); + h->nr_huge_pages++; + h->nr_huge_pages_node[nid]++; + spin_unlock(&hugetlb_lock); + put_page(page); /* free it into the hugepage allocator */ +} + +static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; + if (h->order >= MAX_ORDER) + return NULL; + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); return NULL; } - set_compound_page_dtor(page, free_huge_page); - spin_lock(&hugetlb_lock); - nr_huge_pages++; - nr_huge_pages_node[nid]++; - spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ + prep_new_huge_page(h, page, nid); } return page; } -static int alloc_fresh_huge_page(void) +/* + * Use a helper variable to find the next node and then + * copy it back to hugetlb_next_nid afterwards: + * otherwise there's a window in which a racer might + * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really + * doesn't matter if occasionally a racer chooses the + * same nid as we do. Move nid forward in the mask even + * if we just successfully allocated a hugepage so that + * the next caller gets hugepages on the next node. + */ +static int hstate_next_node(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->hugetlb_next_nid, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->hugetlb_next_nid = next_nid; + return next_nid; +} + +static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = hugetlb_next_nid; + start_nid = h->hugetlb_next_nid; do { - page = alloc_fresh_huge_page_node(hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; - /* - * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. - */ - next_nid = next_node(hugetlb_next_nid, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - hugetlb_next_nid = next_nid; - } while (!page && hugetlb_next_nid != start_nid); + next_nid = hstate_next_node(h); + } while (!page && h->hugetlb_next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -256,12 +620,15 @@ static int alloc_fresh_huge_page(void) return ret; } -static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, - unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) { struct page *page; unsigned int nid; + if (h->order >= MAX_ORDER) + return NULL; + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed @@ -286,18 +653,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * per-node value is checked there. */ spin_lock(&hugetlb_lock); - if (surplus_huge_pages >= nr_overcommit_huge_pages) { + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); return NULL; } else { - nr_huge_pages++; - surplus_huge_pages++; + h->nr_huge_pages++; + h->surplus_huge_pages++; } spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); + huge_page_order(h)); + + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } spin_lock(&hugetlb_lock); if (page) { @@ -312,12 +684,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, /* * We incremented the global counters already */ - nr_huge_pages_node[nid]++; - surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[nid]++; + h->surplus_huge_pages_node[nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { - nr_huge_pages--; - surplus_huge_pages--; + h->nr_huge_pages--; + h->surplus_huge_pages--; __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -329,16 +701,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ -static int gather_surplus_pages(int delta) +static int gather_surplus_pages(struct hstate *h, int delta) { struct list_head surplus_list; struct page *page, *tmp; int ret, i; int needed, allocated; - needed = (resv_huge_pages + delta) - free_huge_pages; + needed = (h->resv_huge_pages + delta) - h->free_huge_pages; if (needed <= 0) { - resv_huge_pages += delta; + h->resv_huge_pages += delta; return 0; } @@ -349,7 +721,7 @@ static int gather_surplus_pages(int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(NULL, 0); + page = alloc_buddy_huge_page(h, NULL, 0); if (!page) { /* * We were not able to allocate enough pages to @@ -370,7 +742,8 @@ retry: * because either resv_huge_pages or free_huge_pages may have changed. */ spin_lock(&hugetlb_lock); - needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); + needed = (h->resv_huge_pages + delta) - + (h->free_huge_pages + allocated); if (needed > 0) goto retry; @@ -383,7 +756,7 @@ retry: * before they are reserved. */ needed += allocated; - resv_huge_pages += delta; + h->resv_huge_pages += delta; ret = 0; free: /* Free the needed pages to the hugetlb pool */ @@ -391,7 +764,7 @@ free: if ((--needed) < 0) break; list_del(&page->lru); - enqueue_huge_page(page); + enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ @@ -419,7 +792,8 @@ free: * allocated to satisfy the reservation must be explicitly freed if they were * never used. */ -static void return_unused_surplus_pages(unsigned long unused_resv_pages) +static void return_unused_surplus_pages(struct hstate *h, + unsigned long unused_resv_pages) { static int nid = -1; struct page *page; @@ -434,157 +808,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) unsigned long remaining_iterations = num_online_nodes(); /* Uncommit the reservation */ - resv_huge_pages -= unused_resv_pages; + h->resv_huge_pages -= unused_resv_pages; - nr_pages = min(unused_resv_pages, surplus_huge_pages); + /* Cannot return gigantic pages currently */ + if (h->order >= MAX_ORDER) + return; + + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { nid = next_node(nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); - if (!surplus_huge_pages_node[nid]) + if (!h->surplus_huge_pages_node[nid]) continue; - if (!list_empty(&hugepage_freelists[nid])) { - page = list_entry(hugepage_freelists[nid].next, + if (!list_empty(&h->hugepage_freelists[nid])) { + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[nid]--; - surplus_huge_pages--; - surplus_huge_pages_node[nid]--; + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + h->surplus_huge_pages--; + h->surplus_huge_pages_node[nid]--; nr_pages--; remaining_iterations = num_online_nodes(); } } } +/* + * Determine if the huge page at addr within the vma has an associated + * reservation. Where it does not we will need to logically increase + * reservation and actually increase quota before an allocation can occur. + * Where any new reservation would be required the reservation change is + * prepared, but not committed. Once the page has been quota'd allocated + * an instantiated the change should be committed via vma_commit_reservation. + * No action is required on failure. + */ +static int vma_needs_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + return region_chg(&inode->i_mapping->private_list, + idx, idx + 1); -static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, - unsigned long addr) + } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + return 1; + + } else { + int err; + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + err = region_chg(&reservations->regions, idx, idx + 1); + if (err < 0) + return err; + return 0; + } +} +static void vma_commit_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; - spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(vma, addr); - spin_unlock(&hugetlb_lock); - return page ? page : ERR_PTR(-VM_FAULT_OOM); + if (vma->vm_flags & VM_SHARED) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + region_add(&inode->i_mapping->private_list, idx, idx + 1); + + } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + pgoff_t idx = vma_hugecache_offset(h, vma, addr); + struct resv_map *reservations = vma_resv_map(vma); + + /* Mark this page used in the map. */ + region_add(&reservations->regions, idx, idx + 1); + } } -static struct page *alloc_huge_page_private(struct vm_area_struct *vma, - unsigned long addr) +static struct page *alloc_huge_page(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) { - struct page *page = NULL; + struct hstate *h = hstate_vma(vma); + struct page *page; + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + unsigned int chg; - if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) - return ERR_PTR(-VM_FAULT_SIGBUS); + /* + * Processes that did not create the mapping will have no reserves and + * will not have accounted against quota. Check that the quota can be + * made before satisfying the allocation + * MAP_NORESERVE mappings may also need pages and quota allocated + * if no reserve mapping overlaps. + */ + chg = vma_needs_reservation(h, vma, addr); + if (chg < 0) + return ERR_PTR(chg); + if (chg) + if (hugetlb_get_quota(inode->i_mapping, chg)) + return ERR_PTR(-ENOSPC); spin_lock(&hugetlb_lock); - if (free_huge_pages > resv_huge_pages) - page = dequeue_huge_page_vma(vma, addr); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); + if (!page) { - page = alloc_buddy_huge_page(vma, addr); + page = alloc_buddy_huge_page(h, vma, addr); if (!page) { - hugetlb_put_quota(vma->vm_file->f_mapping, 1); + hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_OOM); } } + + set_page_refcounted(page); + set_page_private(page, (unsigned long) mapping); + + vma_commit_reservation(h, vma, addr); + return page; } -static struct page *alloc_huge_page(struct vm_area_struct *vma, - unsigned long addr) +__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) { - struct page *page; - struct address_space *mapping = vma->vm_file->f_mapping; + struct huge_bootmem_page *m; + int nr_nodes = nodes_weight(node_online_map); - if (vma->vm_flags & VM_MAYSHARE) - page = alloc_huge_page_shared(vma, addr); - else - page = alloc_huge_page_private(vma, addr); + while (nr_nodes) { + void *addr; + + addr = __alloc_bootmem_node_nopanic( + NODE_DATA(h->hugetlb_next_nid), + huge_page_size(h), huge_page_size(h), 0); - if (!IS_ERR(page)) { - set_page_refcounted(page); - set_page_private(page, (unsigned long) mapping); + if (addr) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + m = addr; + if (m) + goto found; + } + hstate_next_node(h); + nr_nodes--; } - return page; + return 0; + +found: + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + /* Put them into a private list first because mem_map is not up yet */ + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; } -static int __init hugetlb_init(void) +/* Put bootmem huge pages into the standard lists after mem_map is up */ +static void __init gather_bootmem_prealloc(void) { - unsigned long i; - - if (HPAGE_SHIFT == 0) - return 0; - - for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&hugepage_freelists[i]); + struct huge_bootmem_page *m; + + list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); + struct hstate *h = m->hstate; + __ClearPageReserved(page); + WARN_ON(page_count(page) != 1); + prep_compound_page(page, h->order); + prep_new_huge_page(h, page, page_to_nid(page)); + } +} - hugetlb_next_nid = first_node(node_online_map); +static void __init hugetlb_hstate_alloc_pages(struct hstate *h) +{ + unsigned long i; - for (i = 0; i < max_huge_pages; ++i) { - if (!alloc_fresh_huge_page()) + for (i = 0; i < h->max_huge_pages; ++i) { + if (h->order >= MAX_ORDER) { + if (!alloc_bootmem_huge_page(h)) + break; + } else if (!alloc_fresh_huge_page(h)) break; } - max_huge_pages = free_huge_pages = nr_huge_pages = i; - printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); - return 0; + h->max_huge_pages = i; } -module_init(hugetlb_init); -static int __init hugetlb_setup(char *s) +static void __init hugetlb_init_hstates(void) { - if (sscanf(s, "%lu", &max_huge_pages) <= 0) - max_huge_pages = 0; - return 1; + struct hstate *h; + + for_each_hstate(h) { + /* oversize hugepages were init'ed in early boot */ + if (h->order < MAX_ORDER) + hugetlb_hstate_alloc_pages(h); + } } -__setup("hugepages=", hugetlb_setup); -static unsigned int cpuset_mems_nr(unsigned int *array) +static char * __init memfmt(char *buf, unsigned long n) { - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; + if (n >= (1UL << 30)) + sprintf(buf, "%lu GB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%lu MB", n >> 20); + else + sprintf(buf, "%lu KB", n >> 10); + return buf; +} - return nr; +static void __init report_hugepages(void) +{ + struct hstate *h; + + for_each_hstate(h) { + char buf[32]; + printk(KERN_INFO "HugeTLB registered %s page size, " + "pre-allocated %ld pages\n", + memfmt(buf, huge_page_size(h)), + h->free_huge_pages); + } } -#ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM -static void try_to_free_low(unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count) { int i; + if (h->order >= MAX_ORDER) + return; + for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; - list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { - if (count >= nr_huge_pages) + struct list_head *freel = &h->hugepage_freelists[i]; + list_for_each_entry_safe(page, next, freel, lru) { + if (count >= h->nr_huge_pages) return; if (PageHighMem(page)) continue; list_del(&page->lru); - update_and_free_page(page); - free_huge_pages--; - free_huge_pages_node[page_to_nid(page)]--; + update_and_free_page(h, page); + h->free_huge_pages--; + h->free_huge_pages_node[page_to_nid(page)]--; } } } #else -static inline void try_to_free_low(unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count) { } #endif -#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) -static unsigned long set_max_huge_pages(unsigned long count) +#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { unsigned long min_count, ret; + if (h->order >= MAX_ORDER) + return h->max_huge_pages; + /* * Increase the pool size * First take pages out of surplus state. Then make up the @@ -597,20 +1083,19 @@ static unsigned long set_max_huge_pages(unsigned long count) * within all the constraints specified by the sysctls. */ spin_lock(&hugetlb_lock); - while (surplus_huge_pages && count > persistent_huge_pages) { - if (!adjust_pool_surplus(-1)) + while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, -1)) break; } - while (count > persistent_huge_pages) { - int ret; + while (count > persistent_huge_pages(h)) { /* * If this allocation races such that we no longer need the * page, free_huge_page will handle it by freeing the page * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(); + ret = alloc_fresh_huge_page(h); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -632,31 +1117,305 @@ static unsigned long set_max_huge_pages(unsigned long count) * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. */ - min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; + min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(min_count); - while (min_count < persistent_huge_pages) { - struct page *page = dequeue_huge_page(); + try_to_free_low(h, min_count); + while (min_count < persistent_huge_pages(h)) { + struct page *page = dequeue_huge_page(h); if (!page) break; - update_and_free_page(page); + update_and_free_page(h, page); } - while (count < persistent_huge_pages) { - if (!adjust_pool_surplus(1)) + while (count < persistent_huge_pages(h)) { + if (!adjust_pool_surplus(h, 1)) break; } out: - ret = persistent_huge_pages; + ret = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); return ret; } +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_hstate(struct kobject *kobj) +{ + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) + return &hstates[i]; + BUG(); + return NULL; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input); + + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int retval; + + hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, + hugepages_kobj); + if (!hstate_kobjs[h - hstates]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[h - hstates], + &hstate_attr_group); + if (retval) + kobject_put(hstate_kobjs[h - hstates]); + + return retval; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} + +static void __exit hugetlb_exit(void) +{ + struct hstate *h; + + for_each_hstate(h) { + kobject_put(hstate_kobjs[h - hstates]); + } + + kobject_put(hugepages_kobj); +} +module_exit(hugetlb_exit); + +static int __init hugetlb_init(void) +{ + /* Some platform decide whether they support huge pages at boot + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when + * there is no such support + */ + if (HPAGE_SHIFT == 0) + return 0; + + if (!size_to_hstate(default_hstate_size)) { + default_hstate_size = HPAGE_SIZE; + if (!size_to_hstate(default_hstate_size)) + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + } + default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; + if (default_hstate_max_huge_pages) + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + + hugetlb_init_hstates(); + + gather_bootmem_prealloc(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + unsigned long i; + + if (size_to_hstate(PAGE_SIZE << order)) { + printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + h->nr_huge_pages = 0; + h->free_huge_pages = 0; + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&h->hugepage_freelists[i]); + h->hugetlb_next_nid = first_node(node_online_map); + snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", + huge_page_size(h)/1024); + + parsed_hstate = h; +} + +static int __init hugetlb_nrpages_setup(char *s) +{ + unsigned long *mhp; + static unsigned long *last_mhp; + + /* + * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". + */ + if (!max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (mhp == last_mhp) { + printk(KERN_WARNING "hugepages= specified twice without " + "interleaving hugepagesz=, ignoring\n"); + return 1; + } + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. + */ + if (max_hstate && parsed_hstate->order >= MAX_ORDER) + hugetlb_hstate_alloc_pages(parsed_hstate); + + last_mhp = mhp; + + return 1; +} +__setup("hugepages=", hugetlb_nrpages_setup); + +static int __init hugetlb_default_setup(char *s) +{ + default_hstate_size = memparse(s, &s); + return 1; +} +__setup("default_hugepagesz=", hugetlb_default_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + +#ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + struct hstate *h = &default_hstate; + unsigned long tmp; + + if (!write) + tmp = h->max_huge_pages; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, file, buffer, length, ppos); - max_huge_pages = set_max_huge_pages(max_huge_pages); + + if (write) + h->max_huge_pages = set_max_huge_pages(h, tmp); + return 0; } @@ -676,45 +1435,141 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) { + struct hstate *h = &default_hstate; + unsigned long tmp; + + if (!write) + tmp = h->nr_overcommit_huge_pages; + + table->data = &tmp; + table->maxlen = sizeof(unsigned long); proc_doulongvec_minmax(table, write, file, buffer, length, ppos); - spin_lock(&hugetlb_lock); - nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; - spin_unlock(&hugetlb_lock); + + if (write) { + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = tmp; + spin_unlock(&hugetlb_lock); + } + return 0; } #endif /* CONFIG_SYSCTL */ -int hugetlb_report_meminfo(char *buf) +void hugetlb_report_meminfo(struct seq_file *m) { - return sprintf(buf, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %5lu kB\n", - nr_huge_pages, - free_huge_pages, - resv_huge_pages, - surplus_huge_pages, - HPAGE_SIZE/1024); + struct hstate *h = &default_hstate; + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); } int hugetlb_report_node_meminfo(int nid, char *buf) { + struct hstate *h = &default_hstate; return sprintf(buf, "Node %d HugePages_Total: %5u\n" "Node %d HugePages_Free: %5u\n" "Node %d HugePages_Surp: %5u\n", - nid, nr_huge_pages_node[nid], - nid, free_huge_pages_node[nid], - nid, surplus_huge_pages_node[nid]); + nid, h->nr_huge_pages_node[nid], + nid, h->free_huge_pages_node[nid], + nid, h->surplus_huge_pages_node[nid]); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ unsigned long hugetlb_total_pages(void) { - return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); + struct hstate *h = &default_hstate; + return h->nr_huge_pages * pages_per_huge_page(h); +} + +static int hugetlb_acct_memory(struct hstate *h, long delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + if (delta > 0) { + if (gather_surplus_pages(h, delta) < 0) + goto out; + + if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { + return_unused_surplus_pages(h, delta); + goto out; + } + } + + ret = 0; + if (delta < 0) + return_unused_surplus_pages(h, (unsigned long) -delta); + +out: + spin_unlock(&hugetlb_lock); + return ret; +} + +static void hugetlb_vm_op_open(struct vm_area_struct *vma) +{ + struct resv_map *reservations = vma_resv_map(vma); + + /* + * This new VMA should share its siblings reservation map if present. + * The VMA will only ever have a valid reservation map pointer where + * it is being copied for another still existing VMA. As that VMA + * has a reference to the reservation map it cannot dissappear until + * after this open call completes. It is therefore safe to take a + * new reference here without additional locking. + */ + if (reservations) + kref_get(&reservations->refs); +} + +static void hugetlb_vm_op_close(struct vm_area_struct *vma) +{ + struct hstate *h = hstate_vma(vma); + struct resv_map *reservations = vma_resv_map(vma); + unsigned long reserve; + unsigned long start; + unsigned long end; + + if (reservations) { + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); + + reserve = (end - start) - + region_count(&reservations->regions, start, end); + + kref_put(&reservations->refs, resv_map_release); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); + hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + } + } } /* @@ -731,6 +1586,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, + .open = hugetlb_vm_op_open, + .close = hugetlb_vm_op_close, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -769,14 +1626,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct page *ptepage; unsigned long addr; int cow; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; - for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { + for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr); + dst_pte = huge_pte_alloc(dst, addr, sz); if (!dst_pte) goto nomem; @@ -804,7 +1663,7 @@ nomem: } void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, struct page *ref_page) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -812,6 +1671,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, pte_t pte; struct page *page; struct page *tmp; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + /* * A page gathering list, protected by per file i_mmap_lock. The * lock is used to avoid list corruption from multiple unmapping @@ -820,11 +1682,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, LIST_HEAD(page_list); WARN_ON(!is_vm_hugetlb_page(vma)); - BUG_ON(start & ~HPAGE_MASK); - BUG_ON(end & ~HPAGE_MASK); + BUG_ON(start & ~huge_page_mask(h)); + BUG_ON(end & ~huge_page_mask(h)); + mmu_notifier_invalidate_range_start(mm, start, end); spin_lock(&mm->page_table_lock); - for (address = start; address < end; address += HPAGE_SIZE) { + for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -832,6 +1695,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pmd_unshare(mm, &address, ptep)) continue; + /* + * If a reference page is supplied, it is because a specific + * page is being unmapped, not a range. Ensure the page we + * are about to unmap is the actual page of interest. + */ + if (ref_page) { + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte)) + continue; + page = pte_page(pte); + if (page != ref_page) + continue; + + /* + * Mark the VMA as having unmapped its page so that + * future faults in this VMA will fail rather than + * looking like data was lost + */ + set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); + } + pte = huge_ptep_get_and_clear(mm, address, ptep); if (huge_pte_none(pte)) continue; @@ -843,6 +1727,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { list_del(&page->lru); put_page(page); @@ -850,31 +1735,69 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) + unsigned long end, struct page *ref_page) { + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + __unmap_hugepage_range(vma, start, end, ref_page); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); +} + +/* + * This is called when the original mapper is failing to COW a MAP_PRIVATE + * mappping it owns the reserve page for. The intention is to unmap the page + * from other VMAs and let the children be SIGKILLed if they are faulting the + * same region. + */ +static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) +{ + struct vm_area_struct *iter_vma; + struct address_space *mapping; + struct prio_tree_iter iter; + pgoff_t pgoff; + /* - * It is undesirable to test vma->vm_file as it should be non-null - * for valid hugetlb area. However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file before calling this function - * to clean up. Since no pte has actually been setup, it is safe to - * do nothing in this case. + * vm_pgoff is in PAGE_SIZE units, hence the different calculation + * from page cache lookup which is in HPAGE_SIZE units. */ - if (vma->vm_file) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); - __unmap_hugepage_range(vma, start, end); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); + address = address & huge_page_mask(hstate_vma(vma)); + pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + + (vma->vm_pgoff >> PAGE_SHIFT); + mapping = (struct address_space *)page_private(page); + + vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + /* Do not unmap the current VMA */ + if (iter_vma == vma) + continue; + + /* + * Unmap the page from other VMAs without their own reserves. + * They get marked to be SIGKILLed if they fault in these + * areas. This is because a future no-page fault on this VMA + * could insert a zeroed page instead of the data existing + * from the time of fork. This would look like data corruption + */ + if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) + unmap_hugepage_range(iter_vma, + address, address + HPAGE_SIZE, + page); } + + return 1; } static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte) + unsigned long address, pte_t *ptep, pte_t pte, + struct page *pagecache_page) { + struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int avoidcopy; + int outside_reserve = 0; old_page = pte_page(pte); +retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ avoidcopy = (page_count(old_page) == 1); @@ -883,11 +1806,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } + /* + * If the process that created a MAP_PRIVATE mapping is about to + * perform a COW due to a shared page count, attempt to satisfy + * the allocation without using the existing reserves. The pagecache + * page is used to determine if the reserve at this address was + * consumed or not. If reserves were used, a partial faulted mapping + * at the time of fork() could consume its reserves on COW instead + * of the full address range. + */ + if (!(vma->vm_flags & VM_SHARED) && + is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + old_page != pagecache_page) + outside_reserve = 1; + page_cache_get(old_page); - new_page = alloc_huge_page(vma, address); + new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { page_cache_release(old_page); + + /* + * If a process owning a MAP_PRIVATE mapping fails to COW, + * it is due to references held by a child and an insufficient + * huge page pool. To guarantee the original mappers + * reliability, unmap the page from child processes. The child + * may get SIGKILLed if it later faults. + */ + if (outside_reserve) { + BUG_ON(huge_pte_none(pte)); + if (unmap_ref_private(mm, vma, old_page, address)) { + BUG_ON(page_count(old_page) != 1); + BUG_ON(huge_pte_none(pte)); + goto retry_avoidcopy; + } + WARN_ON_ONCE(1); + } + return -PTR_ERR(new_page); } @@ -896,7 +1851,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, __SetPageUptodate(new_page); spin_lock(&mm->page_table_lock); - ptep = huge_pte_offset(mm, address & HPAGE_MASK); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); @@ -910,19 +1865,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } +/* Return the pagecache page at a given address within a VMA */ +static struct page *hugetlbfs_pagecache_page(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + return find_lock_page(mapping, idx); +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { + struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; - unsigned long idx; + pgoff_t idx; unsigned long size; struct page *page; struct address_space *mapping; pte_t new_pte; + /* + * Currently, we are forced to kill the process in the event the + * original mapper has unmapped pages from the child due to a failed + * COW. Warn that such a situation has occured as it may not be obvious + */ + if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { + printk(KERN_WARNING + "PID %d killed due to inadequate hugepage pool\n", + current->pid); + return ret; + } + mapping = vma->vm_file->f_mapping; - idx = ((address - vma->vm_start) >> HPAGE_SHIFT) - + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); + idx = vma_hugecache_offset(h, vma, address); /* * Use page lock to guard against racing truncation @@ -931,15 +1911,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, retry: page = find_lock_page(mapping, idx); if (!page) { - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto out; - page = alloc_huge_page(vma, address); + page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address); + clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_SHARED) { @@ -955,14 +1935,26 @@ retry: } spin_lock(&inode->i_lock); - inode->i_blocks += BLOCKS_PER_HUGEPAGE; + inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); } else lock_page(page); } + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + spin_lock(&mm->page_table_lock); - size = i_size_read(mapping->host) >> HPAGE_SHIFT; + size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -976,7 +1968,7 @@ retry: if (write_access && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } spin_unlock(&mm->page_table_lock); @@ -986,6 +1978,7 @@ out: backout: spin_unlock(&mm->page_table_lock); +backout_unlocked: unlock_page(page); put_page(page); goto out; @@ -997,9 +1990,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); + struct hstate *h = hstate_vma(vma); - ptep = huge_pte_alloc(mm, address); + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; @@ -1012,23 +2007,79 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); - return ret; + goto out_mutex; } ret = 0; + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. + */ + if (write_access && !pte_write(entry)) { + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_mutex; + } + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ - if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) - ret = hugetlb_cow(mm, vma, address, ptep, entry); + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_page_table_lock; + + + if (write_access) { + if (!pte_write(entry)) { + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); + goto out_page_table_lock; + } + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + update_mmu_cache(vma, address, entry); + +out_page_table_lock: spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + +out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); return ret; } +/* Can be overriden by architectures */ +__attribute__((weak)) struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + +static int huge_zeropage_ok(pte_t *ptep, int write, int shared) +{ + if (!ptep || write || shared) + return 0; + else + return huge_pte_none(huge_ptep_get(ptep)); +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, @@ -1037,6 +2088,9 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; + struct hstate *h = hstate_vma(vma); + int zeropage_ok = 0; + int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { @@ -1048,9 +2102,12 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * each hugepage. We have to make * sure we get the * first, for the page indexing below to work. */ - pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); + pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (huge_zeropage_ok(pte, write, shared)) + zeropage_ok = 1; - if (!pte || huge_pte_none(huge_ptep_get(pte)) || + if (!pte || + (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || (write && !pte_write(huge_ptep_get(pte)))) { int ret; @@ -1066,12 +2123,15 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, break; } - pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; + pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - get_page(page); - pages[i] = page + pfn_offset; + if (zeropage_ok) + pages[i] = ZERO_PAGE(0); + else + pages[i] = page + pfn_offset; + get_page(pages[i]); } if (vmas) @@ -1082,7 +2142,7 @@ same_page: --remainder; ++i; if (vaddr < vma->vm_end && remainder && - pfn_offset < HPAGE_SIZE/PAGE_SIZE) { + pfn_offset < pages_per_huge_page(h)) { /* * We use pfn_offset to avoid touching the pageframes * of this compound page. @@ -1104,13 +2164,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, unsigned long start = address; pte_t *ptep; pte_t pte; + struct hstate *h = hstate_vma(vma); BUG_ON(address >= end); flush_cache_range(vma, address, end); spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); spin_lock(&mm->page_table_lock); - for (; address < end; address += HPAGE_SIZE) { + for (; address < end; address += huge_page_size(h)) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; @@ -1128,195 +2189,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma, flush_tlb_range(vma, start, end); } -struct file_region { - struct list_head link; - long from; - long to; -}; - -static long region_add(struct list_head *head, long f, long t) -{ - struct file_region *rg, *nrg, *trg; - - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - - /* Check for and consume any regions we now overlap with. */ - nrg = rg; - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - break; - - /* If this area reaches higher then extend our area to - * include it completely. If this is not the first area - * which we intend to reuse, free it. */ - if (rg->to > t) - t = rg->to; - if (rg != nrg) { - list_del(&rg->link); - kfree(rg); - } - } - nrg->from = f; - nrg->to = t; - return 0; -} - -static long region_chg(struct list_head *head, long f, long t) -{ - struct file_region *rg, *nrg; - long chg = 0; - - /* Locate the region we are before or in. */ - list_for_each_entry(rg, head, link) - if (f <= rg->to) - break; - - /* If we are below the current region then a new region is required. - * Subtle, allocate a new region at the position but make it zero - * size such that we can guarantee to record the reservation. */ - if (&rg->link == head || t < rg->from) { - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - list_add(&nrg->link, rg->link.prev); - - return t - f; - } - - /* Round our left edge to the current segment if it encloses us. */ - if (f > rg->from) - f = rg->from; - chg = t - f; - - /* Check for and consume any regions we now overlap with. */ - list_for_each_entry(rg, rg->link.prev, link) { - if (&rg->link == head) - break; - if (rg->from > t) - return chg; - - /* We overlap with this area, if it extends futher than - * us then we must extend ourselves. Account for its - * existing reservation. */ - if (rg->to > t) { - chg += rg->to - t; - t = rg->to; - } - chg -= rg->to - rg->from; - } - return chg; -} - -static long region_truncate(struct list_head *head, long end) +int hugetlb_reserve_pages(struct inode *inode, + long from, long to, + struct vm_area_struct *vma) { - struct file_region *rg, *trg; - long chg = 0; + long ret, chg; + struct hstate *h = hstate_inode(inode); - /* Locate the region we are either in or before. */ - list_for_each_entry(rg, head, link) - if (end <= rg->to) - break; - if (&rg->link == head) + if (vma && vma->vm_flags & VM_NORESERVE) return 0; - /* If we are in the middle of a region then adjust it. */ - if (end > rg->from) { - chg = rg->to - end; - rg->to = end; - rg = list_entry(rg->link.next, typeof(*rg), link); - } - - /* Drop any remaining regions. */ - list_for_each_entry_safe(rg, trg, rg->link.prev, link) { - if (&rg->link == head) - break; - chg += rg->to - rg->from; - list_del(&rg->link); - kfree(rg); - } - return chg; -} - -static int hugetlb_acct_memory(long delta) -{ - int ret = -ENOMEM; - - spin_lock(&hugetlb_lock); /* - * When cpuset is configured, it breaks the strict hugetlb page - * reservation as the accounting is done on a global variable. Such - * reservation is completely rubbish in the presence of cpuset because - * the reservation is not checked against page availability for the - * current cpuset. Application can still potentially OOM'ed by kernel - * with lack of free htlb page in cpuset that the task is in. - * Attempt to enforce strict accounting with cpuset is almost - * impossible (or too ugly) because cpuset is too fluid that - * task or memory node can be dynamically moved between cpusets. - * - * The change of semantics for shared hugetlb mapping with cpuset is - * undesirable. However, in order to preserve some of the semantics, - * we fall back to check against current free page availability as - * a best attempt and hopefully to minimize the impact of changing - * semantics that cpuset has. + * Shared mappings base their reservation on the number of pages that + * are already allocated on behalf of the file. Private mappings need + * to reserve the full area even if read-only as mprotect() may be + * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (delta > 0) { - if (gather_surplus_pages(delta) < 0) - goto out; - - if (delta > cpuset_mems_nr(free_huge_pages_node)) { - return_unused_surplus_pages(delta); - goto out; - } - } + if (!vma || vma->vm_flags & VM_SHARED) + chg = region_chg(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + if (!resv_map) + return -ENOMEM; - ret = 0; - if (delta < 0) - return_unused_surplus_pages((unsigned long) -delta); + chg = to - from; -out: - spin_unlock(&hugetlb_lock); - return ret; -} - -int hugetlb_reserve_pages(struct inode *inode, long from, long to) -{ - long ret, chg; + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } - chg = region_chg(&inode->i_mapping->private_list, from, to); if (chg < 0) return chg; if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; - ret = hugetlb_acct_memory(chg); + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; } - region_add(&inode->i_mapping->private_list, from, to); + if (!vma || vma->vm_flags & VM_SHARED) + region_add(&inode->i_mapping->private_list, from, to); return 0; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { + struct hstate *h = hstate_inode(inode); long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; + inode->i_blocks -= blocks_per_huge_page(h); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); - hugetlb_acct_memory(-(chg - freed)); + hugetlb_acct_memory(h, -(chg - freed)); } diff --git a/mm/internal.h b/mm/internal.h index 0034e947e4bc..e4e728bdf324 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -13,6 +13,11 @@ #include <linux/mm.h> +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, + unsigned long floor, unsigned long ceiling); + +extern void prep_compound_page(struct page *page, unsigned long order); + static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); @@ -34,6 +39,15 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +/* + * in mm/vmscan.c: + */ +extern int isolate_lru_page(struct page *page); +extern void putback_lru_page(struct page *page); + +/* + * in mm/page_alloc.c + */ extern void __free_pages_bootmem(struct page *page, unsigned int order); /* @@ -47,6 +61,120 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +extern long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +extern void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +static inline void munlock_vma_pages_all(struct vm_area_struct *vma) +{ + munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); +} + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * unevictable_migrate_page() called only from migrate_page_copy() to + * migrate unevictable flag to new page. + * Note that the old page has been isolated from the LRU lists at this + * point so we don't need to worry about LRU statistics. + */ +static inline void unevictable_migrate_page(struct page *new, struct page *old) +{ + if (TestClearPageUnevictable(old)) + SetPageUnevictable(new); +} +#else +static inline void unevictable_migrate_page(struct page *new, struct page *old) +{ +} +#endif + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Called only in fault path via page_evictable() for a new page + * to determine if it's being mapped into a LOCKED vma. + * If so, mark page as mlocked. + */ +static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) +{ + VM_BUG_ON(PageLRU(page)); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) + return 0; + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + return 1; +} + +/* + * must be called with vma's mmap_sem held for read, and page locked. + */ +extern void mlock_vma_page(struct page *page); + +/* + * Clear the page's PageMlocked(). This can be useful in a situation where + * we want to unconditionally remove a page from the pagecache -- e.g., + * on truncation or freeing. + * + * It is legal to call this function for any page, mlocked or not. + * If called for a page that is still mapped by mlocked vmas, all we do + * is revert to lazy LRU behaviour -- semantics are not broken. + */ +extern void __clear_page_mlock(struct page *page); +static inline void clear_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) + __clear_page_mlock(page); +} + +/* + * mlock_migrate_page - called only from migrate_page_copy() to + * migrate the Mlocked page flag; update statistics. + */ +static inline void mlock_migrate_page(struct page *newpage, struct page *page) +{ + if (TestClearPageMlocked(page)) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); + SetPageMlocked(newpage); + __inc_zone_page_state(newpage, NR_MLOCK); + local_irq_restore(flags); + } +} + +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... + */ +static inline void free_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); + local_irq_restore(flags); + } +} + +#else /* CONFIG_UNEVICTABLE_LRU */ +static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) +{ + return 0; +} +static inline void clear_page_mlock(struct page *page) { } +static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_migrate_page(struct page *new, struct page *old) { } +static inline void free_page_mlock(struct page *page) { } + +#endif /* CONFIG_UNEVICTABLE_LRU */ + /* * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, * so all functions starting at paging_init should be marked __init @@ -59,4 +187,68 @@ static inline unsigned long page_order(struct page *page) #define __paginginit __init #endif +/* Memory initialisation debug and verification */ +enum mminit_level { + MMINIT_WARNING, + MMINIT_VERIFY, + MMINIT_TRACE +}; + +#ifdef CONFIG_DEBUG_MEMORY_INIT + +extern int mminit_loglevel; + +#define mminit_dprintk(level, prefix, fmt, arg...) \ +do { \ + if (level < mminit_loglevel) { \ + printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ + printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ + } \ +} while (0) + +extern void mminit_verify_pageflags_layout(void); +extern void mminit_verify_page_links(struct page *page, + enum zone_type zone, unsigned long nid, unsigned long pfn); +extern void mminit_verify_zonelist(void); + +#else + +static inline void mminit_dprintk(enum mminit_level level, + const char *prefix, const char *fmt, ...) +{ +} + +static inline void mminit_verify_pageflags_layout(void) +{ +} + +static inline void mminit_verify_page_links(struct page *page, + enum zone_type zone, unsigned long nid, unsigned long pfn) +{ +} + +static inline void mminit_verify_zonelist(void) +{ +} +#endif /* CONFIG_DEBUG_MEMORY_INIT */ + +/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ +#if defined(CONFIG_SPARSEMEM) +extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn); +#else +static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) +{ +} +#endif /* CONFIG_SPARSEMEM */ + +#define GUP_FLAGS_WRITE 0x1 +#define GUP_FLAGS_FORCE 0x2 +#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas); + #endif diff --git a/mm/madvise.c b/mm/madvise.c index 23a0ec3e0ea0..f9349c18a1b5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. The - * zap_page_range call sets things up for refill_inactive to actually free + * zap_page_range call sets things up for shrink_active_list to actually free * these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for - * refill_inactive to pick up before reclaiming other pages. + * shrink_active_list to pick up before reclaiming other pages. * * NB: This interface discards data rather than pushes it out to swap, * as some implementations do. This has performance implications for diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e46451e1d9b7..866dcc7eeb0c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -32,12 +32,13 @@ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> +#include <linux/mm_inline.h> +#include <linux/page_cgroup.h> #include <asm/uaccess.h> -struct cgroup_subsys mem_cgroup_subsys; -static const int MEM_CGROUP_RECLAIM_RETRIES = 5; -static struct kmem_cache *page_cgroup_cache; +struct cgroup_subsys mem_cgroup_subsys __read_mostly; +#define MEM_CGROUP_RECLAIM_RETRIES 5 /* * Statistics for memory cgroup. @@ -65,11 +66,10 @@ struct mem_cgroup_stat { /* * For accounting under irq disable, no need for increment preempt count. */ -static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, +static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, enum mem_cgroup_stat_index idx, int val) { - int cpu = smp_processor_id(); - stat->cpustat[cpu].count[idx] += val; + stat->count[idx] += val; } static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, @@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, /* * per-zone information in memory controller. */ - -enum mem_cgroup_zstat_index { - MEM_CGROUP_ZSTAT_ACTIVE, - MEM_CGROUP_ZSTAT_INACTIVE, - - NR_MEM_CGROUP_ZSTAT, -}; - struct mem_cgroup_per_zone { /* * spin_lock to protect the per cgroup LRU */ spinlock_t lru_lock; - struct list_head active_list; - struct list_head inactive_list; - unsigned long count[NR_MEM_CGROUP_ZSTAT]; + struct list_head lists[NR_LRU_LISTS]; + unsigned long count[NR_LRU_LISTS]; }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -144,69 +135,52 @@ struct mem_cgroup { }; static struct mem_cgroup init_mem_cgroup; -/* - * We use the lower bit of the page->page_cgroup pointer as a bit spin - * lock. We need to ensure that page->page_cgroup is at least two - * byte aligned (based on comments from Nick Piggin). But since - * bit_spin_lock doesn't actually set that lock bit in a non-debug - * uniprocessor kernel, we should avoid setting it here too. - */ -#define PAGE_CGROUP_LOCK_BIT 0x0 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) -#else -#define PAGE_CGROUP_LOCK 0x0 -#endif - -/* - * A page_cgroup page is associated with every page descriptor. The - * page_cgroup helps us identify information about the cgroup - */ -struct page_cgroup { - struct list_head lru; /* per cgroup LRU list */ - struct page *page; - struct mem_cgroup *mem_cgroup; - int ref_cnt; /* cached, mapped, migrating */ - int flags; -}; -#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ -#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ - -static int page_cgroup_nid(struct page_cgroup *pc) -{ - return page_to_nid(pc->page); -} - -static enum zone_type page_cgroup_zid(struct page_cgroup *pc) -{ - return page_zonenum(pc->page); -} - enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, + MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ + MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ + NR_CHARGE_TYPE, +}; + +/* only for here (for easy reading.) */ +#define PCGF_CACHE (1UL << PCG_CACHE) +#define PCGF_USED (1UL << PCG_USED) +#define PCGF_ACTIVE (1UL << PCG_ACTIVE) +#define PCGF_LOCK (1UL << PCG_LOCK) +#define PCGF_FILE (1UL << PCG_FILE) +static const unsigned long +pcg_default_flags[NR_CHARGE_TYPE] = { + PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ + PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ + PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ + 0, /* FORCE */ }; /* * Always modified under lru lock. Then, not necessary to preempt_disable() */ -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, - bool charge) +static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, + struct page_cgroup *pc, + bool charge) { int val = (charge)? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; + struct mem_cgroup_stat_cpu *cpustat; VM_BUG_ON(!irqs_disabled()); - if (flags & PAGE_CGROUP_FLAG_CACHE) - __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); + + cpustat = &stat->cpustat[smp_processor_id()]; + if (PageCgroupCache(pc)) + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); else - __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); if (charge) - __mem_cgroup_stat_add_safe(stat, + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGIN_COUNT, 1); else - __mem_cgroup_stat_add_safe(stat, + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); } @@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) } static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, - enum mem_cgroup_zstat_index idx) + enum lru_list idx) { int nid, zid; struct mem_cgroup_per_zone *mz; @@ -250,89 +224,89 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { + /* + * mm_update_next_owner() may clear mm->owner to NULL + * if it races with swapoff, page migration, etc. + * So this can be called with p == NULL. + */ + if (unlikely(!p)) + return NULL; + return container_of(task_subsys_state(p, mem_cgroup_subsys_id), struct mem_cgroup, css); } -static inline int page_cgroup_locked(struct page *page) -{ - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) -{ - VM_BUG_ON(!page_cgroup_locked(page)); - page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); -} - -struct page_cgroup *page_get_page_cgroup(struct page *page) -{ - return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); -} - -static void lock_page_cgroup(struct page *page) -{ - bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static int try_lock_page_cgroup(struct page *page) -{ - return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static void unlock_page_cgroup(struct page *page) -{ - bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, struct page_cgroup *pc) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; + int lru = LRU_BASE; + + if (PageCgroupUnevictable(pc)) + lru = LRU_UNEVICTABLE; + else { + if (PageCgroupActive(pc)) + lru += LRU_ACTIVE; + if (PageCgroupFile(pc)) + lru += LRU_FILE; + } - if (from) - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; - else - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; + MEM_CGROUP_ZSTAT(mz, lru) -= 1; - mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); - list_del_init(&pc->lru); + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); + list_del(&pc->lru); } static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, struct page_cgroup *pc) { - int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; - - if (!to) { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; - list_add(&pc->lru, &mz->inactive_list); - } else { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; - list_add(&pc->lru, &mz->active_list); + int lru = LRU_BASE; + + if (PageCgroupUnevictable(pc)) + lru = LRU_UNEVICTABLE; + else { + if (PageCgroupActive(pc)) + lru += LRU_ACTIVE; + if (PageCgroupFile(pc)) + lru += LRU_FILE; } - mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_add(&pc->lru, &mz->lists[lru]); + + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); } -static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) +static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); + int active = PageCgroupActive(pc); + int file = PageCgroupFile(pc); + int unevictable = PageCgroupUnevictable(pc); + enum lru_list from = unevictable ? LRU_UNEVICTABLE : + (LRU_FILE * !!file + !!active); - if (from) - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; - else - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; + if (lru == from) + return; - if (active) { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; - pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; - list_move(&pc->lru, &mz->active_list); + MEM_CGROUP_ZSTAT(mz, from) -= 1; + /* + * However this is done under mz->lru_lock, another flags, which + * are not related to LRU, will be modified from out-of-lock. + * We have to use atomic set/clear flags. + */ + if (is_unevictable_lru(lru)) { + ClearPageCgroupActive(pc); + SetPageCgroupUnevictable(pc); } else { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; - pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; - list_move(&pc->lru, &mz->inactive_list); + if (is_active_lru(lru)) + SetPageCgroupActive(pc); + else + ClearPageCgroupActive(pc); + ClearPageCgroupUnevictable(pc); } + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_move(&pc->lru, &mz->lists[lru]); } int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) @@ -348,12 +322,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) /* * This routine assumes that the appropriate zone's lru lock is already held */ -void mem_cgroup_move_lists(struct page *page, bool active) +void mem_cgroup_move_lists(struct page *page, enum lru_list lru) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; unsigned long flags; + if (mem_cgroup_subsys.disabled) + return; + /* * We cannot lock_page_cgroup while holding zone's lru_lock, * because other holders of lock_page_cgroup can be interrupted @@ -361,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active) * safely get to page_cgroup without it, so just try_lock it: * mem_cgroup_isolate_pages allows for page left on wrong list. */ - if (!try_lock_page_cgroup(page)) + pc = lookup_page_cgroup(page); + if (!trylock_page_cgroup(pc)) return; - - pc = page_get_page_cgroup(page); - if (pc) { + if (pc && PageCgroupUsed(pc)) { mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_move_lists(pc, active); + __mem_cgroup_move_lists(pc, lru); spin_unlock_irqrestore(&mz->lru_lock, flags); } - unlock_page_cgroup(page); + unlock_page_cgroup(pc); } /* @@ -392,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) } /* - * This function is called from vmscan.c. In page reclaiming loop. balance - * between active and inactive list is calculated. For memory controller - * page reclaiming, we should use using mem_cgroup's imbalance rather than - * zone's global lru imbalance. - */ -long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) -{ - unsigned long active, inactive; - /* active and inactive are the number of pages. 'long' is ok.*/ - active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); - inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); - return (long) (active / (inactive + 1)); -} - -/* * prev_priority control...this will be used in memory reclaim path. */ int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) @@ -433,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) * (see include/linux/mmzone.h) */ -long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, - struct zone *zone, int priority) +long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, + int priority, enum lru_list lru) { - long nr_active; + long nr_pages; int nid = zone->zone_pgdat->node_id; int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); - nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); - return (nr_active >> priority); -} + nr_pages = MEM_CGROUP_ZSTAT(mz, lru); -long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, - struct zone *zone, int priority) -{ - long nr_inactive; - int nid = zone->zone_pgdat->node_id; - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); - - nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); - return (nr_inactive >> priority); + return (nr_pages >> priority); } unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, @@ -462,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { unsigned long nr_taken = 0; struct page *page; @@ -473,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int nid = z->zone_pgdat->node_id; int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; + int lru = LRU_FILE * !!file + !!active; BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - if (active) - src = &mz->active_list; - else - src = &mz->inactive_list; - + src = &mz->lists[lru]; spin_lock(&mz->lru_lock); scan = 0; list_for_each_entry_safe_reverse(pc, tmp, src, lru) { if (scan >= nr_to_scan) break; + if (unlikely(!PageCgroupUsed(pc))) + continue; page = pc->page; if (unlikely(!PageLRU(page))) continue; - if (PageActive(page) && !active) { - __mem_cgroup_move_lists(pc, true); - continue; - } - if (!PageActive(page) && active) { - __mem_cgroup_move_lists(pc, false); + /* + * TODO: play better with lumpy reclaim, grabbing anything. + */ + if (PageUnevictable(page) || + (PageActive(page) && !active) || + (!PageActive(page) && active)) { + __mem_cgroup_move_lists(pc, page_lru(page)); continue; } scan++; list_move(&pc->lru, &pc_list); - if (__isolate_lru_page(page, mode) == 0) { + if (__isolate_lru_page(page, mode, file) == 0) { list_move(&page->lru, dst); nr_taken++; } @@ -524,63 +474,45 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * < 0 if the cgroup is over its limit */ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) + gfp_t gfp_mask, enum charge_type ctype, + struct mem_cgroup *memcg) { struct mem_cgroup *mem; struct page_cgroup *pc; - unsigned long flags; unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup_per_zone *mz; + unsigned long flags; - if (mem_cgroup_subsys.disabled) + pc = lookup_page_cgroup(page); + /* can happen at boot */ + if (unlikely(!pc)) return 0; - - /* - * Should page_cgroup's go to their own slab? - * One could optimize the performance of the charging routine - * by saving a bit in the page_flags and using it as a lock - * to see if the cgroup page already has a page_cgroup associated - * with it - */ -retry: - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - /* - * The page_cgroup exists and - * the page has already been accounted. - */ - if (pc) { - VM_BUG_ON(pc->page != page); - VM_BUG_ON(pc->ref_cnt <= 0); - - pc->ref_cnt++; - unlock_page_cgroup(page); - goto done; - } - unlock_page_cgroup(page); - - pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask); - if (pc == NULL) - goto err; - + prefetchw(pc); /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - if (!mm) - mm = &init_mm; - rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - /* - * For every charge from the cgroup, increment reference count - */ - css_get(&mem->css); - rcu_read_unlock(); + if (likely(!memcg)) { + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + return 0; + } + /* + * For every charge from the cgroup, increment reference count + */ + css_get(&mem->css); + rcu_read_unlock(); + } else { + mem = memcg; + css_get(&memcg->css); + } - while (res_counter_charge(&mem->res, PAGE_SIZE)) { + while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { if (!(gfp_mask & __GFP_WAIT)) goto out; @@ -603,63 +535,104 @@ retry: } } - pc->ref_cnt = 1; - pc->mem_cgroup = mem; - pc->page = page; - pc->flags = PAGE_CGROUP_FLAG_ACTIVE; - if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) - pc->flags = PAGE_CGROUP_FLAG_CACHE; - - lock_page_cgroup(page); - if (page_get_page_cgroup(page)) { - unlock_page_cgroup(page); - /* - * Another charge has been added to this page already. - * We take lock_page_cgroup(page) again and read - * page->cgroup, increment refcnt.... just retry is OK. - */ + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - kmem_cache_free(page_cgroup_cache, pc); - goto retry; + + goto done; } - page_assign_page_cgroup(page, pc); + pc->mem_cgroup = mem; + /* + * If a page is accounted as a page cache, insert to inactive list. + * If anon, insert to active list. + */ + pc->flags = pcg_default_flags[ctype]; mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_add_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(pc); - unlock_page_cgroup(page); done: return 0; out: css_put(&mem->css); - kmem_cache_free(page_cgroup_cache, pc); -err: return -ENOMEM; } int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { + if (mem_cgroup_subsys.disabled) + return 0; + if (PageCompound(page)) + return 0; + /* + * If already mapped, we don't have to account. + * If page cache, page->mapping has address_space. + * But page->mapping may have out-of-use anon_vma pointer, + * detecit it by PageAnon() check. newly-mapped-anon's page->mapping + * is NULL. + */ + if (page_mapped(page) || (page->mapping && !PageAnon(page))) + return 0; + if (unlikely(!mm)) + mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); } int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - if (!mm) + if (mem_cgroup_subsys.disabled) + return 0; + if (PageCompound(page)) + return 0; + /* + * Corner case handling. This is called from add_to_page_cache() + * in usual. But some FS (shmem) precharges this page before calling it + * and call add_to_page_cache() with GFP_NOWAIT. + * + * For GFP_NOWAIT case, the page may be pre-charged before calling + * add_to_page_cache(). (See shmem.c) check it here and avoid to call + * charge twice. (It works but has to pay a bit larger cost.) + */ + if (!(gfp_mask & __GFP_WAIT)) { + struct page_cgroup *pc; + + + pc = lookup_page_cgroup(page); + if (!pc) + return 0; + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + unlock_page_cgroup(pc); + return 0; + } + unlock_page_cgroup(pc); + } + + if (unlikely(!mm)) mm = &init_mm; - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE); + + if (page_is_file_cache(page)) + return mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); + else + return mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); } /* - * Uncharging is always a welcome operation, we never complain, simply - * uncharge. + * uncharge if !page_mapped(page) */ -void mem_cgroup_uncharge_page(struct page *page) +static void +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem; @@ -672,106 +645,172 @@ void mem_cgroup_uncharge_page(struct page *page) /* * Check if our page_cgroup is valid */ - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (!pc) - goto unlock; + pc = lookup_page_cgroup(page); + if (unlikely(!pc || !PageCgroupUsed(pc))) + return; - VM_BUG_ON(pc->page != page); - VM_BUG_ON(pc->ref_cnt <= 0); + lock_page_cgroup(pc); + if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) + || !PageCgroupUsed(pc)) { + /* This happens at race in zap_pte_range() and do_swap_page()*/ + unlock_page_cgroup(pc); + return; + } + ClearPageCgroupUsed(pc); + mem = pc->mem_cgroup; - if (--(pc->ref_cnt) == 0) { - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); + mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); + __mem_cgroup_remove_list(mz, pc); + spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(pc); - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); + res_counter_uncharge(&mem->res, PAGE_SIZE); + css_put(&mem->css); - mem = pc->mem_cgroup; - res_counter_uncharge(&mem->res, PAGE_SIZE); - css_put(&mem->css); + return; +} - kmem_cache_free(page_cgroup_cache, pc); +void mem_cgroup_uncharge_page(struct page *page) +{ + /* early check. */ + if (page_mapped(page)) return; - } + if (page->mapping && !PageAnon(page)) + return; + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); +} -unlock: - unlock_page_cgroup(page); +void mem_cgroup_uncharge_cache_page(struct page *page) +{ + VM_BUG_ON(page_mapped(page)); + VM_BUG_ON(page->mapping); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } /* - * Returns non-zero if a page (under migration) has valid page_cgroup member. - * Refcnt of page_cgroup is incremented. + * Before starting migration, account against new page. */ -int mem_cgroup_prepare_migration(struct page *page) +int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) { struct page_cgroup *pc; + struct mem_cgroup *mem = NULL; + enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + int ret = 0; if (mem_cgroup_subsys.disabled) return 0; - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (pc) - pc->ref_cnt++; - unlock_page_cgroup(page); - return pc != NULL; + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + mem = pc->mem_cgroup; + css_get(&mem->css); + if (PageCgroupCache(pc)) { + if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + } + } + unlock_page_cgroup(pc); + if (mem) { + ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, + ctype, mem); + css_put(&mem->css); + } + return ret; } -void mem_cgroup_end_migration(struct page *page) +/* remove redundant charge if migration failed*/ +void mem_cgroup_end_migration(struct page *newpage) { - mem_cgroup_uncharge_page(page); + /* + * At success, page->mapping is not NULL. + * special rollback care is necessary when + * 1. at migration failure. (newpage->mapping is cleared in this case) + * 2. the newpage was moved but not remapped again because the task + * exits and the newpage is obsolete. In this case, the new page + * may be a swapcache. So, we just call mem_cgroup_uncharge_page() + * always for avoiding mess. The page_cgroup will be removed if + * unnecessary. File cache pages is still on radix-tree. Don't + * care it. + */ + if (!newpage->mapping) + __mem_cgroup_uncharge_common(newpage, + MEM_CGROUP_CHARGE_TYPE_FORCE); + else if (PageAnon(newpage)) + mem_cgroup_uncharge_page(newpage); } /* - * We know both *page* and *newpage* are now not-on-LRU and PG_locked. - * And no race with uncharge() routines because page_cgroup for *page* - * has extra one reference by mem_cgroup_prepare_migration. + * A call to try to shrink memory usage under specified resource controller. + * This is typically used for page reclaiming for shmem for reducing side + * effect of page allocation from shmem, which is used by some mem_cgroup. */ -void mem_cgroup_page_migration(struct page *page, struct page *newpage) +int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) { - struct page_cgroup *pc; - struct mem_cgroup_per_zone *mz; - unsigned long flags; + struct mem_cgroup *mem; + int progress = 0; + int retry = MEM_CGROUP_RECLAIM_RETRIES; - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (!pc) { - unlock_page_cgroup(page); - return; + if (mem_cgroup_subsys.disabled) + return 0; + if (!mm) + return 0; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) { + rcu_read_unlock(); + return 0; } + css_get(&mem->css); + rcu_read_unlock(); - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_remove_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); + do { + progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); + progress += res_counter_check_under_limit(&mem->res); + } while (!progress && --retry); - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); + css_put(&mem->css); + if (!retry) + return -ENOMEM; + return 0; +} - pc->page = newpage; - lock_page_cgroup(newpage); - page_assign_page_cgroup(newpage, pc); +int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) +{ - mz = page_cgroup_zoneinfo(pc); - spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_add_list(mz, pc); - spin_unlock_irqrestore(&mz->lru_lock, flags); + int retry_count = MEM_CGROUP_RECLAIM_RETRIES; + int progress; + int ret = 0; - unlock_page_cgroup(newpage); + while (res_counter_set_limit(&memcg->res, val)) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (!retry_count) { + ret = -EBUSY; + break; + } + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); + if (!progress) + retry_count--; + } + return ret; } + /* * This routine traverse page_cgroup in given list and drop them all. - * This routine ignores page_cgroup->ref_cnt. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. */ #define FORCE_UNCHARGE_BATCH (128) static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz, - int active) + enum lru_list lru) { struct page_cgroup *pc; struct page *page; @@ -779,22 +818,31 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, unsigned long flags; struct list_head *list; - if (active) - list = &mz->active_list; - else - list = &mz->inactive_list; + list = &mz->lists[lru]; spin_lock_irqsave(&mz->lru_lock, flags); while (!list_empty(list)) { pc = list_entry(list->prev, struct page_cgroup, lru); page = pc->page; + if (!PageCgroupUsed(pc)) + break; get_page(page); spin_unlock_irqrestore(&mz->lru_lock, flags); - mem_cgroup_uncharge_page(page); - put_page(page); - if (--count <= 0) { - count = FORCE_UNCHARGE_BATCH; - cond_resched(); + /* + * Check if this page is on LRU. !LRU page can be found + * if it's under page migration. + */ + if (PageLRU(page)) { + __mem_cgroup_uncharge_common(page, + MEM_CGROUP_CHARGE_TYPE_FORCE); + put_page(page); + if (--count <= 0) { + count = FORCE_UNCHARGE_BATCH; + cond_resched(); + } + } else { + spin_lock_irqsave(&mz->lru_lock, flags); + break; } spin_lock_irqsave(&mz->lru_lock, flags); } @@ -810,9 +858,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) int ret = -EBUSY; int node, zid; - if (mem_cgroup_subsys.disabled) - return 0; - css_get(&mem->css); /* * page reclaim code (kswapd etc..) will move pages between @@ -822,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) while (mem->res.usage > 0) { if (atomic_read(&mem->css.cgroup->count) > 0) goto out; + /* This is for making all *used* pages to be on LRU. */ + lru_add_drain_all(); for_each_node_state(node, N_POSSIBLE) for (zid = 0; zid < MAX_NR_ZONES; zid++) { struct mem_cgroup_per_zone *mz; + enum lru_list l; mz = mem_cgroup_zoneinfo(mem, node, zid); - /* drop all page_cgroup in active_list */ - mem_cgroup_force_empty_list(mem, mz, 1); - /* drop all page_cgroup in inactive_list */ - mem_cgroup_force_empty_list(mem, mz, 0); + for_each_lru(l) + mem_cgroup_force_empty_list(mem, mz, l); } + cond_resched(); } ret = 0; out: @@ -838,32 +885,34 @@ out: return ret; } -static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) -{ - *tmp = memparse(buf, &buf); - if (*buf != '\0') - return -EINVAL; - - /* - * Round up the value to the closest page size - */ - *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; - return 0; -} - static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, cft->private); } - -static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, - struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +/* + * The user of this function is... + * RES_LIMIT. + */ +static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, + const char *buffer) { - return res_counter_write(&mem_cgroup_from_cont(cont)->res, - cft->private, userbuf, nbytes, ppos, - mem_cgroup_write_strategy); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + unsigned long long val; + int ret; + + switch (cft->private) { + case RES_LIMIT: + /* This function does all necessary parse...reuse it */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (!ret) + ret = mem_cgroup_resize_limit(memcg, val); + break; + default: + ret = -EINVAL; /* should be BUG() ? */ + break; + } + return ret; } static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) @@ -913,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, } /* showing # of active pages */ { - unsigned long active, inactive; - - inactive = mem_cgroup_get_all_zonestat(mem_cont, - MEM_CGROUP_ZSTAT_INACTIVE); - active = mem_cgroup_get_all_zonestat(mem_cont, - MEM_CGROUP_ZSTAT_ACTIVE); - cb->fill(cb, "active", (active) * PAGE_SIZE); - cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); + unsigned long active_anon, inactive_anon; + unsigned long active_file, inactive_file; + unsigned long unevictable; + + inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_ANON); + active_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_ANON); + inactive_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_FILE); + active_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_FILE); + unevictable = mem_cgroup_get_all_zonestat(mem_cont, + LRU_UNEVICTABLE); + + cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); + cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); + cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); + cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); + cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); + } return 0; } @@ -940,7 +1002,7 @@ static struct cftype mem_cgroup_files[] = { { .name = "limit_in_bytes", .private = RES_LIMIT, - .write = mem_cgroup_write, + .write_string = mem_cgroup_write, .read_u64 = mem_cgroup_read, }, { @@ -963,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) { struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; + enum lru_list l; int zone, tmp = node; /* * This routine is called against possible nodes. @@ -983,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - INIT_LIST_HEAD(&mz->active_list); - INIT_LIST_HEAD(&mz->inactive_list); spin_lock_init(&mz->lru_lock); + for_each_lru(l) + INIT_LIST_HEAD(&mz->lists[l]); } return 0; } @@ -1026,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) if (unlikely((cont->parent) == NULL)) { mem = &init_mem_cgroup; - page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); } else { mem = mem_cgroup_alloc(); if (!mem) @@ -1070,8 +1132,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, static int mem_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) { - if (mem_cgroup_subsys.disabled) - return 0; return cgroup_add_files(cont, ss, mem_cgroup_files, ARRAY_SIZE(mem_cgroup_files)); } @@ -1084,9 +1144,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct mm_struct *mm; struct mem_cgroup *mem, *old_mem; - if (mem_cgroup_subsys.disabled) - return; - mm = get_task_mm(p); if (mm == NULL) return; @@ -1094,9 +1151,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, mem = mem_cgroup_from_cont(cont); old_mem = mem_cgroup_from_cont(old_cont); - if (mem == old_mem) - goto out; - /* * Only thread group leaders are allowed to migrate, the mm_struct is * in effect owned by the leader diff --git a/mm/memory.c b/mm/memory.c index 2302d228fe04..164951c47305 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -51,6 +51,7 @@ #include <linux/init.h> #include <linux/writeback.h> #include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> @@ -61,6 +62,8 @@ #include <linux/swapops.h> #include <linux/elf.h> +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, * * Must be called with pagetable lock held. */ -void free_pgd_range(struct mmu_gather **tlb, +void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { @@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb, return; start = addr; - pgd = pgd_offset((*tlb)->mm, addr); + pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - free_pud_range(*tlb, pgd, addr, next, floor, ceiling); + free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, +void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { @@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) * * The calling function must still handle the error. */ -void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) +static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, + unsigned long vaddr) { printk(KERN_ERR "Bad pte = %08llx, process = %s, " "vm_flags = %lx, vaddr = %lx\n", @@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long next; unsigned long addr = vma->vm_start; unsigned long end = vma->vm_end; + int ret; /* * Don't copy ptes where a page fault will fill them correctly. @@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); + /* + * We need to invalidate the secondary MMU mappings only when + * there could be a permission downgrade on the ptes of the + * parent mm. And a permission downgrade will only happen if + * is_cow_mapping() returns true. + */ + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_start(src_mm, addr, end); + + ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next)) - return -ENOMEM; + if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next))) { + ret = -ENOMEM; + break; + } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - return 0; + + if (is_cow_mapping(vma->vm_flags)) + mmu_notifier_invalidate_range_end(src_mm, + vma->vm_start, end); + return ret; } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; int fullmm = (*tlbp)->fullmm; + struct mm_struct *mm = vma->vm_mm; + mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; @@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } if (unlikely(is_vm_hugetlb_page(vma))) { - unmap_hugepage_range(vma, start, end); - zap_work -= (end - start) / - (HPAGE_SIZE / PAGE_SIZE); + /* + * It is undesirable to test vma->vm_file as it + * should be non-null for valid hugetlb area. + * However, vm_file will be NULL in the error + * cleanup path of do_mmap_pgoff. When + * hugetlbfs ->mmap method fails, + * do_mmap_pgoff() nullifies vma->vm_file + * before calling this function to clean up. + * Since no pte has actually been setup, it is + * safe to do nothing in this case. + */ + if (vma->vm_file) { + unmap_hugepage_range(vma, start, end, NULL); + zap_work -= (end - start) / + pages_per_huge_page(hstate_vma(vma)); + } + start = end; } else start = unmap_page_range(*tlbp, vma, @@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, } } out: + mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ } @@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, return end; } +/** + * zap_vma_ptes - remove ptes mapping the vma + * @vma: vm_area_struct holding ptes to be zapped + * @address: starting address of pages to zap + * @size: number of bytes to zap + * + * This function only unmaps ptes assigned to VM_PFNMAP vmas. + * + * The entire address range must be fully contained within the vma. + * + * Returns 0 if successful. + */ +int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, + unsigned long size) +{ + if (address < vma->vm_start || address + size > vma->vm_end || + !(vma->vm_flags & VM_PFNMAP)) + return -1; + zap_page_range(vma, address, size, NULL); + return 0; +} +EXPORT_SYMBOL_GPL(zap_vma_ptes); + /* * Do a quick page-table lookup for a single page. */ @@ -982,19 +1043,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page_table; pud = pud_offset(pgd, address); - if (pud_none(*pud) || unlikely(pud_bad(*pud))) + if (pud_none(*pud)) goto no_page_table; - + if (pud_huge(*pud)) { + BUG_ON(flags & FOLL_GET); + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) + goto no_page_table; + pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } - if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -1058,19 +1124,22 @@ static inline int use_zero_page(struct vm_area_struct *vma) if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) return 0; /* - * And if we have a fault or a nopfn routine, it's not an - * anonymous region. + * And if we have a fault routine, it's not an anonymous region. */ - return !vma->vm_ops || - (!vma->vm_ops->fault && !vma->vm_ops->nopfn); + return !vma->vm_ops || !vma->vm_ops->fault; } -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags; + unsigned int vm_flags = 0; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); if (len <= 0) return 0; @@ -1094,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pud_t *pud; pmd_t *pmd; pte_t *pte; - if (write) /* user gate pages are read-only */ + + /* user gate pages are read-only */ + if (!ignore && write) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1126,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) - || !(vm_flags & vma->vm_flags)) + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + (!ignore && !(vm_flags & vma->vm_flags))) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -1202,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (len); return i; } + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} + EXPORT_SYMBOL(get_user_pages); pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, @@ -1232,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, pte_t *pte; spinlock_t *ptl; - retval = mem_cgroup_charge(page, mm, GFP_KERNEL); - if (retval) - goto out; - retval = -EINVAL; if (PageAnon(page)) - goto out_uncharge; + goto out; retval = -ENOMEM; flush_dcache_page(page); pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out_uncharge; + goto out; retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; @@ -1259,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, return retval; out_unlock: pte_unmap_unlock(pte, ptl); -out_uncharge: - mem_cgroup_uncharge_page(page); out: return retval; } @@ -1338,6 +1421,11 @@ out: * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return NULL. + * + * vma cannot be a COW mapping. + * + * As this is called only for pages that do not currently exist, we + * do not need to flush old virtual caches or the TLB. */ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) @@ -1548,6 +1636,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long next; int err; + BUG_ON(pud_huge(*pud)); + pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; @@ -1589,10 +1679,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, { pgd_t *pgd; unsigned long next; - unsigned long end = addr + size; + unsigned long start = addr, end = addr + size; int err; BUG_ON(addr >= end); + mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1600,6 +1691,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); + mmu_notifier_invalidate_range_end(mm, start, end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -1716,7 +1808,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page)) { - if (!TestSetPageLocked(old_page)) { + if (trylock_page(old_page)) { reuse = can_share_swap_page(old_page); unlock_page(old_page); } @@ -1785,6 +1877,15 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) { + lock_page(old_page); /* for LRU manipulation */ + clear_page_mlock(old_page); + unlock_page(old_page); + } cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -1812,12 +1913,14 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush(vma, address, page_table); - set_pte_at(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); - lru_cache_add_active(new_page); + ptep_clear_flush_notify(vma, address, page_table); + SetPageSwapBacked(new_page); + lru_cache_add_active_or_unevictable(new_page, vma); page_add_new_anon_rmap(new_page, vma, address); +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); + update_mmu_cache(vma, address, entry); if (old_page) { /* * Only after switching the pte to the new page may @@ -2215,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGMAJFAULT); } + mark_page_accessed(page); + + lock_page(page); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); ret = VM_FAULT_OOM; + unlock_page(page); goto out; } - mark_page_accessed(page); - lock_page(page); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - /* * Back out if somebody else already faulted in this pte. */ @@ -2251,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_anon_rmap(page, vma, address); swap_free(entry); - if (vm_swap_full()) + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) remove_exclusive_swap_page(page); unlock_page(page); @@ -2309,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2350,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t entry; int anon = 0; + int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; @@ -2390,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + ret = VM_FAULT_OOM; + page_cache_release(page); + goto out; + } + charged = 1; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) + clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -2424,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, } - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - goto out; - } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); /* @@ -2447,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - page_add_new_anon_rmap(page, vma, address); + inc_mm_counter(mm, anon_rss); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); + page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(page); @@ -2460,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); } } +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, entry); } else { - mem_cgroup_uncharge_page(page); + if (charged) + mem_cgroup_uncharge_page(page); if (anon) page_cache_release(page); else @@ -2501,59 +2617,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } - -/* - * do_no_pfn() tries to create a new page mapping for a page without - * a struct_page backing it - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - * - * It is expected that the ->nopfn handler always returns the same pfn - * for a given virtual mapping. - * - * Mark this `noinline' to prevent it from bloating the main pagefault code. - */ -static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *page_table, pmd_t *pmd, - int write_access) -{ - spinlock_t *ptl; - pte_t entry; - unsigned long pfn; - - pte_unmap(page_table); - BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); - BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - - pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); - - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); - - if (unlikely(pfn == NOPFN_OOM)) - return VM_FAULT_OOM; - else if (unlikely(pfn == NOPFN_SIGBUS)) - return VM_FAULT_SIGBUS; - else if (unlikely(pfn == NOPFN_REFAULT)) - return 0; - - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); - - /* Only go through if we didn't race with anybody else... */ - if (pte_none(*page_table)) { - entry = pfn_pte(pfn, vma->vm_page_prot); - if (write_access) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); - } - pte_unmap_unlock(page_table, ptl); - return 0; -} - /* * Fault of a previously existing named mapping. Repopulate the pte * from the encoded file_pte if possible. This enables swappable @@ -2614,9 +2677,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (likely(vma->vm_ops->fault)) return do_linear_fault(mm, vma, address, pte, pmd, write_access, entry); - if (unlikely(vma->vm_ops->nopfn)) - return do_no_pfn(mm, vma, address, pte, - pmd, write_access); } return do_anonymous_page(mm, vma, address, pte, pmd, write_access); @@ -2748,7 +2808,7 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) - return -1; + return -ENOMEM; write = (vma->vm_flags & VM_WRITE) != 0; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); @@ -2757,7 +2817,7 @@ int make_pages_present(unsigned long addr, unsigned long end) len, write, 0, NULL, NULL); if (ret < 0) return ret; - return ret == len ? 0 : -1; + return ret == len ? 0 : -EFAULT; } #if !defined(__HAVE_ARCH_GATE_AREA) @@ -2804,6 +2864,86 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_HAVE_IOREMAP_PROT +static resource_size_t follow_phys(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned long *prot) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + resource_size_t phys_addr = 0; + struct mm_struct *mm = vma->vm_mm; + + VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP))); + + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto no_page_table; + + /* We cannot handle huge page PFN maps. Luckily they don't exist. */ + if (pmd_huge(*pmd)) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!ptep) + goto out; + + pte = *ptep; + if (!pte_present(pte)) + goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + phys_addr = pte_pfn(pte); + phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ + + *prot = pgprot_val(pte_pgprot(pte)); + +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return phys_addr; +no_page_table: + return 0; +} + +int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + resource_size_t phys_addr; + unsigned long prot = 0; + void *maddr; + int offset = addr & (PAGE_SIZE-1); + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + phys_addr = follow_phys(vma, addr, write, &prot); + + if (!phys_addr) + return -EINVAL; + + maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); + if (write) + memcpy_toio(maddr + offset, buf, len); + else + memcpy_fromio(buf, maddr + offset, len); + iounmap(maddr); + + return len; +} +#endif + /* * Access another process' address space. * Source/target buffer must be kernel space, @@ -2813,7 +2953,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in { struct mm_struct *mm; struct vm_area_struct *vma; - struct page *page; void *old_buf = buf; mm = get_task_mm(tsk); @@ -2825,28 +2964,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in while (len) { int bytes, ret, offset; void *maddr; + struct page *page = NULL; ret = get_user_pages(tsk, mm, addr, 1, write, 1, &page, &vma); - if (ret <= 0) - break; - - bytes = len; - offset = addr & (PAGE_SIZE-1); - if (bytes > PAGE_SIZE-offset) - bytes = PAGE_SIZE-offset; - - maddr = kmap(page); - if (write) { - copy_to_user_page(vma, page, addr, - maddr + offset, buf, bytes); - set_page_dirty_lock(page); + if (ret <= 0) { + /* + * Check if this is a VM_IO | VM_PFNMAP VMA, which + * we can access using slightly different code. + */ +#ifdef CONFIG_HAVE_IOREMAP_PROT + vma = find_vma(mm, addr); + if (!vma) + break; + if (vma->vm_ops && vma->vm_ops->access) + ret = vma->vm_ops->access(vma, addr, buf, + len, write); + if (ret <= 0) +#endif + break; + bytes = ret; } else { - copy_from_user_page(vma, page, addr, - buf, maddr + offset, bytes); + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap(page); + if (write) { + copy_to_user_page(vma, page, addr, + maddr + offset, buf, bytes); + set_page_dirty_lock(page); + } else { + copy_from_user_page(vma, page, addr, + buf, maddr + offset, bytes); + } + kunmap(page); + page_cache_release(page); } - kunmap(page); - page_cache_release(page); len -= bytes; buf += bytes; addr += bytes; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 833f854eabe5..6837a1014372 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,6 +26,7 @@ #include <linux/delay.h> #include <linux/migrate.h> #include <linux/page-isolation.h> +#include <linux/pfn.h> #include <asm/tlbflush.h> @@ -62,9 +63,9 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int magic) +static void get_page_bootmem(unsigned long info, struct page *page, int type) { - atomic_set(&page->_mapcount, magic); + atomic_set(&page->_mapcount, type); SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -72,10 +73,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic) void put_page_bootmem(struct page *page) { - int magic; + int type; - magic = atomic_read(&page->_mapcount); - BUG_ON(magic >= -1); + type = atomic_read(&page->_mapcount); + BUG_ON(type >= -1); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); @@ -86,7 +87,7 @@ void put_page_bootmem(struct page *page) } -void register_page_bootmem_info_section(unsigned long start_pfn) +static void register_page_bootmem_info_section(unsigned long start_pfn) { unsigned long *usemap, mapsize, section_nr, i; struct mem_section *ms; @@ -119,7 +120,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn) mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) - get_page_bootmem(section_nr, page, MIX_INFO); + get_page_bootmem(section_nr, page, MIX_SECTION_INFO); } @@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); - sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + release_mem_region(pfn << PAGE_SHIFT, + PAGES_PER_SECTION << PAGE_SHIFT); ret = __remove_section(zone, __pfn_to_section(pfn)); if (ret) break; @@ -429,7 +430,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (need_zonelists_rebuild) build_all_zonelists(); - vm_total_pages = nr_free_pagecache_pages(); + else + vm_total_pages = nr_free_pagecache_pages(); + writeback_set_ratelimit(); if (onlined_pages) @@ -455,7 +458,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start) /* we can use NODE_DATA(nid) from here */ /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); + free_area_init_node(nid, zones_size, start_pfn, zholes_size); return pgdat; } @@ -521,6 +524,66 @@ EXPORT_SYMBOL_GPL(add_memory); #ifdef CONFIG_MEMORY_HOTREMOVE /* + * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy + * set and the size of the free page is given by page_order(). Using this, + * the function determines if the pageblock contains only free pages. + * Due to buddy contraints, a free page at least the size of a pageblock will + * be located at the start of the pageblock + */ +static inline int pageblock_free(struct page *page) +{ + return PageBuddy(page) && page_order(page) >= pageblock_order; +} + +/* Return the start of the next active pageblock after a given page */ +static struct page *next_active_pageblock(struct page *page) +{ + int pageblocks_stride; + + /* Ensure the starting page is pageblock-aligned */ + BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); + + /* Move forward by at least 1 * pageblock_nr_pages */ + pageblocks_stride = 1; + + /* If the entire pageblock is free, move to the end of free page */ + if (pageblock_free(page)) + pageblocks_stride += page_order(page) - pageblock_order; + + return page + (pageblocks_stride * pageblock_nr_pages); +} + +/* Checks if this range of memory is likely to be hot-removable. */ +int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +{ + int type; + struct page *page = pfn_to_page(start_pfn); + struct page *end_page = page + nr_pages; + + /* Check the starting page of each pageblock within the range */ + for (; page < end_page; page = next_active_pageblock(page)) { + type = get_pageblock_migratetype(page); + + /* + * A pageblock containing MOVABLE or free pages is considered + * removable + */ + if (type != MIGRATE_MOVABLE && !pageblock_free(page)) + return 0; + + /* + * A pageblock starting with a PageReserved page is not + * considered removable. + */ + if (PageReserved(page)) + return 0; + } + + /* All pageblocks in the memory block are likely to be hot-removable */ + return 1; +} + +/* * Confirm all pages in a range [start, end) is belongs to the same zone. */ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) @@ -595,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We can skip free pages. And we can only deal with pages on * LRU. */ - ret = isolate_lru_page(page, &source); + ret = isolate_lru_page(page); if (!ret) { /* Success */ + list_add_tail(&page->lru, &source); move_pages--; } else { /* Becasue we don't have big zone->lock. we should @@ -787,10 +851,19 @@ failed_removal: return ret; } + +int remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn, end_pfn; + + start_pfn = PFN_DOWN(start); + end_pfn = start_pfn + PFN_DOWN(size); + return offline_pages(start_pfn, end_pfn, 120 * HZ); +} #else int remove_memory(u64 start, u64 size) { return -EINVAL; } -EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ +EXPORT_SYMBOL_GPL(remove_memory); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c94e58b192c3..36f42573a335 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,6 +93,8 @@ #include <asm/tlbflush.h> #include <asm/uaccess.h> +#include "internal.h" + /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ @@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) - isolate_lru_page(page, pagelist); + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { + if (!isolate_lru_page(page)) { + list_add_tail(&page->lru, pagelist); + } + } } static struct page *new_node_page(struct page *page, unsigned long node, int **x) @@ -803,7 +808,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { - LIST_HEAD(pagelist); int busy = 0; int err = 0; nodemask_t tmp; @@ -1481,7 +1485,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { zl = node_zonelist(interleave_nid(*mpol, vma, addr, - HPAGE_SHIFT), gfp_flags); + huge_page_shift(hstate_vma(vma))), gfp_flags); } else { zl = policy_zonelist(gfp_flags, *mpol); if ((*mpol)->mode == MPOL_BIND) @@ -2198,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) if (PageSwapCache(page)) md->swapcache++; - if (PageActive(page)) + if (PageActive(page) || PageUnevictable(page)) md->active++; if (PageWriteback(page)) @@ -2220,9 +2224,12 @@ static void check_huge_range(struct vm_area_struct *vma, { unsigned long addr; struct page *page; + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); - for (addr = start; addr < end; addr += HPAGE_SIZE) { - pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + for (addr = start; addr < end; addr += sz) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, + addr & huge_page_mask(h)); pte_t pte; if (!ptep) diff --git a/mm/migrate.c b/mm/migrate.c index 55bd355d170d..6602941bfab0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -30,42 +30,13 @@ #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/memcontrol.h> +#include <linux/syscalls.h> #include "internal.h" #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* - * Isolate one page from the LRU lists. If successful put it onto - * the indicated list with elevated page count. - * - * Result: - * -EBUSY: page not on LRU list - * 0: page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page, struct list_head *pagelist) -{ - int ret = -EBUSY; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && get_page_unless_zero(page)) { - ret = 0; - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - list_add_tail(&page->lru, pagelist); - } - spin_unlock_irq(&zone->lru_lock); - } - return ret; -} - -/* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). */ @@ -82,23 +53,9 @@ int migrate_prep(void) return 0; } -static inline void move_to_lru(struct page *page) -{ - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - /* - * Add isolated pages on the list back to the LRU. + * Add isolated pages on the list back to the LRU under page lock + * to avoid leaking evictable pages back onto unevictable list. * * returns the number of pages put back. */ @@ -110,7 +67,7 @@ int putback_lru_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); - move_to_lru(page); + putback_lru_page(page); count++; } return count; @@ -284,7 +241,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, page = migration_entry_to_page(entry); - get_page(page); + /* + * Once radix-tree replacement of page migration started, page_count + * *must* be zero. And, we don't want to call wait_on_page_locked() + * against a page without get_page(). + * So, we use get_page_unless_zero(), here. Even failed, page fault + * will occur again. + */ + if (!get_page_unless_zero(page)) + goto out; pte_unmap_unlock(ptep, ptl); wait_on_page_locked(page); put_page(page); @@ -304,6 +269,7 @@ out: static int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { + int expected_count; void **pslot; if (!mapping) { @@ -313,14 +279,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, return 0; } - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - if (page_count(page) != 2 + !!PagePrivate(page) || + expected_count = 2 + !!PagePrivate(page); + if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -337,6 +309,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); + page_unfreeze_refs(page, expected_count); /* * Drop cache reference from old page. * We know this isn't the last reference. @@ -356,7 +329,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } @@ -366,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ static void migrate_page_copy(struct page *newpage, struct page *page) { + int anon; + copy_highpage(newpage, page); if (PageError(page)) @@ -374,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) SetPageReferenced(newpage); if (PageUptodate(page)) SetPageUptodate(newpage); - if (PageActive(page)) + if (TestClearPageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); SetPageActive(newpage); + } else + unevictable_migrate_page(newpage, page); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -393,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + mlock_migrate_page(newpage, page); + #ifdef CONFIG_SWAP ClearPageSwapCache(page); #endif - ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); + /* page->mapping contains a flag for PageAnon() */ + anon = PageAnon(page); page->mapping = NULL; + if (!anon) /* This page was removed from radix-tree. */ + mem_cgroup_uncharge_cache_page(page); + /* * If any waiters have accumulated on the new page then * wake them up. @@ -575,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping, * * The new page will have replaced the old page if this function * is successful. + * + * Return value: + * < 0 - error code + * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page) { @@ -586,12 +574,14 @@ static int move_to_new_page(struct page *newpage, struct page *page) * establishing additional references. We are the only one * holding a reference to the new page at this point. */ - if (TestSetPageLocked(newpage)) + if (!trylock_page(newpage)) BUG(); /* Prepare mapping for the new page.*/ newpage->index = page->index; newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); mapping = page_mapping(page); if (!mapping) @@ -610,7 +600,6 @@ static int move_to_new_page(struct page *newpage, struct page *page) rc = fallback_migrate_page(mapping, newpage, page); if (!rc) { - mem_cgroup_page_migration(page, newpage); remove_migration_ptes(page, newpage); } else newpage->mapping = NULL; @@ -636,12 +625,21 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; - if (page_count(page) == 1) + if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto move_newpage; + } + + charge = mem_cgroup_prepare_migration(page, newpage); + if (charge == -ENOMEM) { + rc = -ENOMEM; + goto move_newpage; + } + /* prepare cgroup just returns 0 or -ENOMEM */ + BUG_ON(charge); rc = -EAGAIN; - if (TestSetPageLocked(page)) { + if (!trylock_page(page)) { if (!force) goto move_newpage; lock_page(page); @@ -691,25 +689,19 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, goto rcu_unlock; } - charge = mem_cgroup_prepare_migration(page); /* Establish migration ptes or remove ptes */ try_to_unmap(page, 1); if (!page_mapped(page)) rc = move_to_new_page(newpage, page); - if (rc) { + if (rc) remove_migration_ptes(page, page); - if (charge) - mem_cgroup_end_migration(page); - } else if (charge) - mem_cgroup_end_migration(newpage); rcu_unlock: if (rcu_locked) rcu_read_unlock(); unlock: - unlock_page(page); if (rc != -EAGAIN) { @@ -720,15 +712,19 @@ unlock: * restored. */ list_del(&page->lru); - move_to_lru(page); + putback_lru_page(page); } move_newpage: + if (!charge) + mem_cgroup_end_migration(newpage); + /* * Move the new page to the LRU. If migration was not successful * then this will free the page. */ - move_to_lru(newpage); + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -835,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, * Move a set of pages as indicated in the pm array. The addr * field must be set to the virtual address of the page to be moved * and the node number must contain a valid target node. + * The pm array ends with node = MAX_NUMNODES. */ -static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, - int migrate_all) +static int do_move_page_to_node_array(struct mm_struct *mm, + struct page_to_node *pm, + int migrate_all) { int err; struct page_to_node *pp; @@ -891,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, !migrate_all) goto put_and_set; - err = isolate_lru_page(page, &pagelist); + err = isolate_lru_page(page); + if (!err) + list_add_tail(&page->lru, &pagelist); put_and_set: /* * Either remove the duplicate refcount from @@ -903,36 +903,118 @@ set_status: pp->status = err; } + err = 0; if (!list_empty(&pagelist)) err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm); - else - err = -ENOENT; up_read(&mm->mmap_sem); return err; } /* - * Determine the nodes of a list of pages. The addr in the pm array - * must have been set to the virtual address of which we want to determine - * the node number. + * Migrate an array of page address onto an array of nodes and fill + * the corresponding array of status. */ -static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +static int do_pages_move(struct mm_struct *mm, struct task_struct *task, + unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) { + struct page_to_node *pm = NULL; + nodemask_t task_nodes; + int err = 0; + int i; + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void __user *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_pm; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out_pm; + + err = -ENODEV; + if (!node_state(node, N_HIGH_MEMORY)) + goto out_pm; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_pm; + + pm[i].node = node; + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out_pm: + vfree(pm); +out: + return err; +} + +/* + * Determine the nodes of an array of pages and store it in an array of status. + */ +static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, + const void __user * __user *pages, + int __user *status) +{ + unsigned long i; + int err; + down_read(&mm->mmap_sem); - for ( ; pm->node != MAX_NUMNODES; pm++) { + for (i = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; struct vm_area_struct *vma; struct page *page; - int err; err = -EFAULT; - vma = find_vma(mm, pm->addr); + if (get_user(p, pages+i)) + goto out; + addr = (unsigned long) p; + + vma = find_vma(mm, addr); if (!vma) goto set_status; - page = follow_page(vma, pm->addr, 0); + page = follow_page(vma, addr, 0); err = PTR_ERR(page); if (IS_ERR(page)) @@ -945,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) err = page_to_nid(page); set_status: - pm->status = err; + put_user(err, status+i); } + err = 0; +out: up_read(&mm->mmap_sem); - return 0; + return err; } /* @@ -961,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { - int err = 0; - int i; struct task_struct *task; - nodemask_t task_nodes; struct mm_struct *mm; - struct page_to_node *pm = NULL; + int err; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -998,79 +1079,24 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, (current->uid != task->suid) && (current->uid != task->uid) && !capable(CAP_SYS_NICE)) { err = -EPERM; - goto out2; + goto out; } err = security_task_movememory(task); if (err) - goto out2; - - - task_nodes = cpuset_mems_allowed(task); - - /* Limit nr_pages so that the multiplication may not overflow */ - if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { - err = -E2BIG; - goto out2; - } - - pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); - if (!pm) { - err = -ENOMEM; - goto out2; - } - - /* - * Get parameters from user space and initialize the pm - * array. Return various errors if the user did something wrong. - */ - for (i = 0; i < nr_pages; i++) { - const void __user *p; - - err = -EFAULT; - if (get_user(p, pages + i)) - goto out; - - pm[i].addr = (unsigned long)p; - if (nodes) { - int node; - - if (get_user(node, nodes + i)) - goto out; - - err = -ENODEV; - if (!node_state(node, N_HIGH_MEMORY)) - goto out; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out; + goto out; - pm[i].node = node; - } else - pm[i].node = 0; /* anything to not match MAX_NUMNODES */ + if (nodes) { + err = do_pages_move(mm, task, nr_pages, pages, nodes, status, + flags); + } else { + err = do_pages_stat(mm, nr_pages, pages, status); } - /* End marker */ - pm[nr_pages].node = MAX_NUMNODES; - - if (nodes) - err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); - else - err = do_pages_stat(mm, pm); - - if (err >= 0) - /* Return status information */ - for (i = 0; i < nr_pages; i++) - if (put_user(pm[i].status, status + i)) - err = -EFAULT; out: - vfree(pm); -out2: mmput(mm); return err; } -#endif /* * Call migration functions in the vma_ops that may prepare @@ -1092,3 +1118,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, } return err; } +#endif diff --git a/mm/mlock.c b/mm/mlock.c index 7b2656055d6a..008ea70b7afa 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -8,10 +8,18 @@ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/rmap.h> +#include <linux/mmzone.h> +#include <linux/hugetlb.h> + +#include "internal.h" int can_do_mlock(void) { @@ -23,17 +31,381 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Mlocked pages are marked with PageMlocked() flag for efficient testing + * in vmscan and, possibly, the fault path; and to support semi-accurate + * statistics. + * + * An mlocked page [PageMlocked(page)] is unevictable. As such, it will + * be placed on the LRU "unevictable" list, rather than the [in]active lists. + * The unevictable list is an LRU sibling list to the [in]active lists. + * PageUnevictable is set to indicate the unevictable state. + * + * When lazy mlocking via vmscan, it is important to ensure that the + * vma's VM_LOCKED status is not concurrently being modified, otherwise we + * may have mlocked a page that is being munlocked. So lazy mlock must take + * the mmap_sem for read, and verify that the vma really is locked + * (see mm/rmap.c). + */ + +/* + * LRU accounting for clear_page_mlock() + */ +void __clear_page_mlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + + if (!page->mapping) { /* truncated ? */ + return; + } + + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + if (!isolate_lru_page(page)) { + putback_lru_page(page); + } else { + /* + * Page not on the LRU yet. Flush all pagevecs and retry. + */ + lru_add_drain_all(); + if (!isolate_lru_page(page)) + putback_lru_page(page); + else if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + + } +} + +/* + * Mark page as mlocked if not already. + * If page on LRU, isolate and putback to move to unevictable list. + */ +void mlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + if (!isolate_lru_page(page)) + putback_lru_page(page); + } +} + +/* + * called from munlock()/munmap() path with page supposedly on the LRU. + * + * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked + * [in try_to_munlock()] and then attempt to isolate the page. We must + * isolate the page to keep others from messing with its unevictable + * and mlocked state while trying to munlock. However, we pre-clear the + * mlocked state anyway as we might lose the isolation race and we might + * not get another chance to clear PageMlocked. If we successfully + * isolate the page and try_to_munlock() detects other VM_LOCKED vmas + * mapping the page, it will restore the PageMlocked state, unless the page + * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), + * perhaps redundantly. + * If we lose the isolation race, and the page is mapped by other VM_LOCKED + * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() + * either of which will restore the PageMlocked state by calling + * mlock_vma_page() above, if it can grab the vma's mmap sem. + */ +static void munlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (TestClearPageMlocked(page)) { + dec_zone_page_state(page, NR_MLOCK); + if (!isolate_lru_page(page)) { + int ret = try_to_munlock(page); + /* + * did try_to_unlock() succeed or punt? + */ + if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); + } else { + /* + * We lost the race. let try_to_unmap() deal + * with it. At least we get the page state and + * mlock stats right. However, page is still on + * the noreclaim list. We'll fix that up when + * the page is eventually freed or we scan the + * noreclaim list. + */ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + } + } +} + +/** + * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @mlock: 0 indicate munlock, otherwise mlock. + * + * If @mlock == 0, unlock an mlocked range; + * else mlock the range of pages. This takes care of making the pages present , + * too. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held for at least read. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int mlock) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start; + struct page *pages[16]; /* 16 gives a reasonable batch */ + int nr_pages = (end - start) / PAGE_SIZE; + int ret; + int gup_flags = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(start < vma->vm_start); + VM_BUG_ON(end > vma->vm_end); + VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && + (atomic_read(&mm->mm_users) != 0)); + + /* + * mlock: don't page populate if page has PROT_NONE permission. + * munlock: the pages always do munlock althrough + * its has PROT_NONE permission. + */ + if (!mlock) + gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; + + if (vma->vm_flags & VM_WRITE) + gup_flags |= GUP_FLAGS_WRITE; + + lru_add_drain_all(); /* push cached pages to LRU */ + + while (nr_pages > 0) { + int i; + + cond_resched(); + + /* + * get_user_pages makes pages present if we are + * setting mlock. and this extra reference count will + * disable migration of this page. However, page may + * still be truncated out from under us. + */ + ret = __get_user_pages(current, mm, addr, + min_t(int, nr_pages, ARRAY_SIZE(pages)), + gup_flags, pages, NULL); + /* + * This can happen for, e.g., VM_NONLINEAR regions before + * a page has been allocated and mapped at a given offset, + * or for addresses that map beyond end of a file. + * We'll mlock the the pages if/when they get faulted in. + */ + if (ret < 0) + break; + if (ret == 0) { + /* + * We know the vma is there, so the only time + * we cannot get a single page should be an + * error (ret < 0) case. + */ + WARN_ON(1); + break; + } + + lru_add_drain(); /* push cached pages to LRU */ + + for (i = 0; i < ret; i++) { + struct page *page = pages[i]; + + lock_page(page); + /* + * Because we lock page here and migration is blocked + * by the elevated reference, we need only check for + * page truncation (file-cache only). + */ + if (page->mapping) { + if (mlock) + mlock_vma_page(page); + else + munlock_vma_page(page); + } + unlock_page(page); + put_page(page); /* ref from get_user_pages() */ + + /* + * here we assume that get_user_pages() has given us + * a list of virtually contiguous pages. + */ + addr += PAGE_SIZE; /* for next get_user_pages() */ + nr_pages--; + } + ret = 0; + } + + lru_add_drain_all(); /* to update stats */ + + return ret; /* count entire vma as locked_vm */ +} + +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +/* + * Just make pages present if VM_LOCKED. No-op if unlocking. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int mlock) +{ + if (mlock && (vma->vm_flags & VM_LOCKED)) + return make_pages_present(start, end); + return 0; +} + +static inline int __mlock_posix_error_return(long retval) +{ + return 0; +} + +#endif /* CONFIG_UNEVICTABLE_LRU */ + +/** + * mlock_vma_pages_range() - mlock pages in specified vma range. + * @vma - the vma containing the specfied address range + * @start - starting address in @vma to mlock + * @end - end address [+1] in @vma to mlock + * + * For mmap()/mremap()/expansion of mlocked vma. + * + * return 0 on success for "normal" vmas. + * + * return number of pages [> 0] to be removed from locked_vm on success + * of "special" vmas. + * + * return negative error if vma spanning @start-@range disappears while + * mmap semaphore is dropped. Unlikely? + */ +long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + int nr_pages = (end - start) / PAGE_SIZE; + BUG_ON(!(vma->vm_flags & VM_LOCKED)); + + /* + * filter unlockable vmas + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + goto no_mlock; + + if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current))) { + long error; + downgrade_write(&mm->mmap_sem); + + error = __mlock_vma_pages_range(vma, start, end, 1); + + up_read(&mm->mmap_sem); + /* vma can change or disappear */ + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + /* non-NULL vma must contain @start, but need to check @end */ + if (!vma || end > vma->vm_end) + return -ENOMEM; + + return 0; /* hide other errors from mmap(), et al */ + } + + /* + * User mapped kernel pages or huge pages: + * make these pages present to populate the ptes, but + * fall thru' to reset VM_LOCKED--no need to unlock, and + * return nr_pages so these don't get counted against task's + * locked limit. huge pages are already counted against + * locked vm limit. + */ + make_pages_present(start, end); + +no_mlock: + vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ + return nr_pages; /* error or pages NOT mlocked */ +} + + +/* + * munlock_vma_pages_range() - munlock all pages in the vma range.' + * @vma - vma containing range to be munlock()ed. + * @start - start address in @vma of the range + * @end - end of range in @vma. + * + * For mremap(), munmap() and exit(). + * + * Called with @vma VM_LOCKED. + * + * Returns with VM_LOCKED cleared. Callers must be prepared to + * deal with this. + * + * We don't save and restore VM_LOCKED here because pages are + * still on lru. In unmap path, pages might be scanned by reclaim + * and re-mlocked by try_to_{munlock|unmap} before we unmap and + * free them. This will result in freeing mlocked pages. + */ +void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + vma->vm_flags &= ~VM_LOCKED; + __mlock_vma_pages_range(vma, start, end, 0); +} + +/* + * mlock_fixup - handle mlock[all]/munlock[all] requests. + * + * Filters out "special" vmas -- VM_LOCKED never gets set for these, and + * munlock is a no-op. However, for some special vmas, we go ahead and + * populate the ptes via make_pages_present(). + * + * For vmas that pass the filters, merge/split as appropriate. + */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned int newflags) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; - int pages; + int nr_pages; int ret = 0; - - if (newflags == vma->vm_flags) { - *prev = vma; - goto out; + int lock = newflags & VM_LOCKED; + + if (newflags == vma->vm_flags || + (vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; /* don't set VM_LOCKED, don't count */ + + if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current)) { + if (lock) + make_pages_present(start, end); + goto out; /* don't set VM_LOCKED, don't count */ } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto success; } - *prev = vma; - if (start != vma->vm_start) { ret = split_vma(mm, vma, start, 1); if (ret) @@ -60,26 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, success: /* + * Keep track of amount of locked VM. + */ + nr_pages = (end - start) >> PAGE_SHIFT; + if (!lock) + nr_pages = -nr_pages; + mm->locked_vm += nr_pages; + + /* * vm_flags is protected by the mmap_sem held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we - * set VM_LOCKED, make_pages_present below will bring it back. + * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ vma->vm_flags = newflags; - /* - * Keep track of amount of locked VM. - */ - pages = (end - start) >> PAGE_SHIFT; - if (newflags & VM_LOCKED) { - pages = -pages; - if (!(newflags & VM_IO)) - ret = make_pages_present(start, end); + if (lock) { + /* + * mmap_sem is currently held for write. Downgrade the write + * lock to a read lock so that other faults, mmap scans, ... + * while we fault in all pages. + */ + downgrade_write(&mm->mmap_sem); + + ret = __mlock_vma_pages_range(vma, start, end, 1); + + /* + * Need to reacquire mmap sem in write mode, as our callers + * expect this. We have no support for atomically upgrading + * a sem to write, so we need to check for ranges while sem + * is unlocked. + */ + up_read(&mm->mmap_sem); + /* vma can change or disappear */ + down_write(&mm->mmap_sem); + *prev = find_vma(mm, start); + /* non-NULL *prev must contain @start, but need to check @end */ + if (!(*prev) || end > (*prev)->vm_end) + ret = -ENOMEM; + else if (ret > 0) { + mm->locked_vm -= ret; + ret = 0; + } else + ret = __mlock_posix_error_return(ret); /* translate if needed */ + } else { + /* + * TODO: for unlocking, pages will already be resident, so + * we don't need to wait for allocations/reclaim/pagein, ... + * However, unlocking a very large region can still take a + * while. Should we downgrade the semaphore for both lock + * AND unlock ? + */ + __mlock_vma_pages_range(vma, start, end, 0); } - mm->locked_vm -= pages; out: - if (ret == -ENOMEM) - ret = -EAGAIN; + *prev = vma; return ret; } diff --git a/mm/mm_init.c b/mm/mm_init.c new file mode 100644 index 000000000000..4e0e26591dfa --- /dev/null +++ b/mm/mm_init.c @@ -0,0 +1,152 @@ +/* + * mm_init.c - Memory initialisation verification and debugging + * + * Copyright 2008 IBM Corporation, 2008 + * Author Mel Gorman <mel@csn.ul.ie> + * + */ +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/kobject.h> +#include <linux/module.h> +#include "internal.h" + +#ifdef CONFIG_DEBUG_MEMORY_INIT +int mminit_loglevel; + +#ifndef SECTIONS_SHIFT +#define SECTIONS_SHIFT 0 +#endif + +/* The zonelists are simply reported, validation is manual. */ +void mminit_verify_zonelist(void) +{ + int nid; + + if (mminit_loglevel < MMINIT_VERIFY) + return; + + for_each_online_node(nid) { + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + struct zoneref *z; + struct zonelist *zonelist; + int i, listid, zoneid; + + BUG_ON(MAX_ZONELISTS > 2); + for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { + + /* Identify the zone and nodelist */ + zoneid = i % MAX_NR_ZONES; + listid = i / MAX_NR_ZONES; + zonelist = &pgdat->node_zonelists[listid]; + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + /* Print information about the zonelist */ + printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", + listid > 0 ? "thisnode" : "general", nid, + zone->name); + + /* Iterate the zonelist */ + for_each_zone_zonelist(zone, z, zonelist, zoneid) { +#ifdef CONFIG_NUMA + printk(KERN_CONT "%d:%s ", + zone->node, zone->name); +#else + printk(KERN_CONT "0:%s ", zone->name); +#endif /* CONFIG_NUMA */ + } + printk(KERN_CONT "\n"); + } + } +} + +void __init mminit_verify_pageflags_layout(void) +{ + int shift, width; + unsigned long or_mask, add_mask; + + shift = 8 * sizeof(unsigned long); + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", + "Section %d Node %d Zone %d Flags %d\n", + SECTIONS_WIDTH, + NODES_WIDTH, + ZONES_WIDTH, + NR_PAGEFLAGS); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", + "Section %d Node %d Zone %d\n", + SECTIONS_SHIFT, + NODES_SHIFT, + ZONES_SHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", + "Section %lu Node %lu Zone %lu\n", + (unsigned long)SECTIONS_PGSHIFT, + (unsigned long)NODES_PGSHIFT, + (unsigned long)ZONES_PGSHIFT); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", + "Zone ID: %lu -> %lu\n", + (unsigned long)ZONEID_PGOFF, + (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", + "location: %d -> %d unused %d -> %d flags %d -> %d\n", + shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); +#ifdef NODE_NOT_IN_PAGE_FLAGS + mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", + "Node not in page flags"); +#endif + + if (SECTIONS_WIDTH) { + shift -= SECTIONS_WIDTH; + BUG_ON(shift != SECTIONS_PGSHIFT); + } + if (NODES_WIDTH) { + shift -= NODES_WIDTH; + BUG_ON(shift != NODES_PGSHIFT); + } + if (ZONES_WIDTH) { + shift -= ZONES_WIDTH; + BUG_ON(shift != ZONES_PGSHIFT); + } + + /* Check for bitmask overlaps */ + or_mask = (ZONES_MASK << ZONES_PGSHIFT) | + (NODES_MASK << NODES_PGSHIFT) | + (SECTIONS_MASK << SECTIONS_PGSHIFT); + add_mask = (ZONES_MASK << ZONES_PGSHIFT) + + (NODES_MASK << NODES_PGSHIFT) + + (SECTIONS_MASK << SECTIONS_PGSHIFT); + BUG_ON(or_mask != add_mask); +} + +void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, + unsigned long nid, unsigned long pfn) +{ + BUG_ON(page_to_nid(page) != nid); + BUG_ON(page_zonenum(page) != zone); + BUG_ON(page_to_pfn(page) != pfn); +} + +static __init int set_mminit_loglevel(char *str) +{ + get_option(&str, &mminit_loglevel); + return 0; +} +early_param("mminit_loglevel", set_mminit_loglevel); +#endif /* CONFIG_DEBUG_MEMORY_INIT */ + +struct kobject *mm_kobj; +EXPORT_SYMBOL_GPL(mm_kobj); + +static int __init mm_sysfs_init(void) +{ + mm_kobj = kobject_create_and_add("mm", kernel_kobj); + if (!mm_kobj) + return -ENOMEM; + + return 0; +} + +__initcall(mm_sysfs_init); diff --git a/mm/mmap.c b/mm/mmap.c index 1d102b956fd8..74f4d158022e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -26,12 +26,15 @@ #include <linux/mount.h> #include <linux/mempolicy.h> #include <linux/rmap.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/mmu_context.h> +#include "internal.h" + #ifndef arch_mmap_check #define arch_mmap_check(addr, len, flags) (0) #endif @@ -367,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, if (vma_tmp->vm_end > addr) { vma = vma_tmp; if (vma_tmp->vm_start <= addr) - return vma; + break; __rb_link = &__rb_parent->rb_left; } else { rb_prev = __rb_parent; @@ -407,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, rb_insert_color(&vma->vm_rb, &mm->mm_rb); } -static inline void __vma_link_file(struct vm_area_struct *vma) +static void __vma_link_file(struct vm_area_struct *vma) { struct file * file; @@ -659,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) - static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { @@ -969,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, return -EPERM; vm_flags |= VM_LOCKED; } + /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -1027,6 +1029,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + /* + * Ignore pgoff. + */ + pgoff = 0; vm_flags |= VM_SHARED | VM_MAYSHARE; break; case MAP_PRIVATE: @@ -1108,6 +1114,9 @@ munmap_back: if (!may_expand_vm(mm, len >> PAGE_SHIFT)) return -ENOMEM; + if (flags & MAP_NORESERVE) + vm_flags |= VM_NORESERVE; + if (accountable && (!(flags & MAP_NORESERVE) || sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { if (vm_flags & VM_SHARED) { @@ -1129,10 +1138,12 @@ munmap_back: * The VM_SHARED test is necessary because shmem_zero_setup * will create the file object for a shared anonymous map below. */ - if (!file && !(vm_flags & VM_SHARED) && - vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL)) - goto out; + if (!file && !(vm_flags & VM_SHARED)) { + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, NULL, pgoff, NULL); + if (vma) + goto out; + } /* * Determine the object being mapped and call the appropriate @@ -1214,10 +1225,14 @@ out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) + /* + * makes pages present; downgrades, drops, reacquires mmap_sem + */ + long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); + if (nr_pages < 0) + return nr_pages; /* vma gone! */ + mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); return addr; @@ -1576,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un * vma is the last one with address > vma->vm_end. Have to extend vma. */ #ifndef CONFIG_IA64 -static inline +static #endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) { @@ -1626,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -static inline int expand_downwards(struct vm_area_struct *vma, +static int expand_downwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1688,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + if (expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) - make_pages_present(addr, prev->vm_end); + if (prev->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) + return NULL; /* vma gone! */ + } return prev; } #else @@ -1717,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) - make_pages_present(addr, start); + if (vma->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(vma, addr, start) < 0) + return NULL; /* vma gone! */ + } return vma; } #endif @@ -1737,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) long nrpages = vma_pages(vma); mm->total_vm -= nrpages; - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); @@ -1763,7 +1780,7 @@ static void unmap_region(struct mm_struct *mm, update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, + free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); tlb_finish_mmu(tlb, start, end); } @@ -1807,7 +1824,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, struct mempolicy *pol; struct vm_area_struct *new; - if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) + if (is_vm_hugetlb_page(vma) && (addr & + ~(huge_page_mask(hstate_vma(vma))))) return -EINVAL; if (mm->map_count >= sysctl_max_map_count) @@ -1903,6 +1921,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) vma = prev? prev->vm_next: mm->mmap; /* + * unlock any mlock()ed ranges before detaching vmas + */ + if (mm->locked_vm) { + struct vm_area_struct *tmp = vma; + while (tmp && tmp->vm_start < end) { + if (tmp->vm_flags & VM_LOCKED) { + mm->locked_vm -= vma_pages(tmp); + munlock_vma_pages_all(tmp); + } + tmp = tmp->vm_next; + } + } + + /* * Remove the vma's, and unmap the actual pages */ detach_vmas_to_be_unmapped(mm, vma, prev, end); @@ -2014,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len) return -ENOMEM; /* Can we just expand an old private anonymous mapping? */ - if (vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL)) + vma = vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL); + if (vma) goto out; /* @@ -2037,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) out: mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); + if (!mlock_vma_pages_range(vma, addr, addr + len)) + mm->locked_vm += (len >> PAGE_SHIFT); } return addr; } @@ -2049,13 +2082,23 @@ EXPORT_SYMBOL(do_brk); void exit_mmap(struct mm_struct *mm) { struct mmu_gather *tlb; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; unsigned long nr_accounted = 0; unsigned long end; /* mm's last user has gone, and its about to be pulled down */ arch_exit_mmap(mm); - + mmu_notifier_release(mm); + + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { + if (vma->vm_flags & VM_LOCKED) + munlock_vma_pages_all(vma); + vma = vma->vm_next; + } + } + vma = mm->mmap; lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); @@ -2063,7 +2106,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); /* @@ -2262,3 +2305,167 @@ int install_special_mapping(struct mm_struct *mm, return 0; } + +static DEFINE_MUTEX(mm_all_locks_mutex); + +static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) +{ + if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change from under us + * because we hold the mm_all_locks_mutex. + */ + spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); + /* + * We can safely modify head.next after taking the + * anon_vma->lock. If some other vma in this mm shares + * the same anon_vma we won't take it again. + * + * No need of atomic instructions here, head.next + * can't change from under us thanks to the + * anon_vma->lock. + */ + if (__test_and_set_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + } +} + +static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) +{ + if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change from under us because + * we hold the mm_all_locks_mutex. + * + * Operations on ->flags have to be atomic because + * even if AS_MM_ALL_LOCKS is stable thanks to the + * mm_all_locks_mutex, there may be other cpus + * changing other bitflags in parallel to us. + */ + if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) + BUG(); + spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); + } +} + +/* + * This operation locks against the VM for all pte/vma/mm related + * operations that could ever happen on a certain mm. This includes + * vmtruncate, try_to_unmap, and all page faults. + * + * The caller must take the mmap_sem in write mode before calling + * mm_take_all_locks(). The caller isn't allowed to release the + * mmap_sem until mm_drop_all_locks() returns. + * + * mmap_sem in write mode is required in order to block all operations + * that could modify pagetables and free pages without need of + * altering the vma layout (for example populate_range() with + * nonlinear vmas). It's also needed in write mode to avoid new + * anon_vmas to be associated with existing vmas. + * + * A single task can't take more than one mm_take_all_locks() in a row + * or it would deadlock. + * + * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in + * mapping->flags avoid to take the same lock twice, if more than one + * vma in this mm is backed by the same anon_vma or address_space. + * + * We can take all the locks in random order because the VM code + * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never + * takes more than one of them in a row. Secondly we're protected + * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. + * + * mm_take_all_locks() and mm_drop_all_locks are expensive operations + * that may have to take thousand of locks. + * + * mm_take_all_locks() can fail if it's interrupted by signals. + */ +int mm_take_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int ret = -EINTR; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + + mutex_lock(&mm_all_locks_mutex); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->vm_file && vma->vm_file->f_mapping) + vm_lock_mapping(mm, vma->vm_file->f_mapping); + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (signal_pending(current)) + goto out_unlock; + if (vma->anon_vma) + vm_lock_anon_vma(mm, vma->anon_vma); + } + + ret = 0; + +out_unlock: + if (ret) + mm_drop_all_locks(mm); + + return ret; +} + +static void vm_unlock_anon_vma(struct anon_vma *anon_vma) +{ + if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + /* + * The LSB of head.next can't change to 0 from under + * us because we hold the mm_all_locks_mutex. + * + * We must however clear the bitflag before unlocking + * the vma so the users using the anon_vma->head will + * never see our bitflag. + * + * No need of atomic instructions here, head.next + * can't change from under us until we release the + * anon_vma->lock. + */ + if (!__test_and_clear_bit(0, (unsigned long *) + &anon_vma->head.next)) + BUG(); + spin_unlock(&anon_vma->lock); + } +} + +static void vm_unlock_mapping(struct address_space *mapping) +{ + if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { + /* + * AS_MM_ALL_LOCKS can't change to 0 from under us + * because we hold the mm_all_locks_mutex. + */ + spin_unlock(&mapping->i_mmap_lock); + if (!test_and_clear_bit(AS_MM_ALL_LOCKS, + &mapping->flags)) + BUG(); + } +} + +/* + * The mmap_sem cannot be released by the caller until + * mm_drop_all_locks() returns. + */ +void mm_drop_all_locks(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + BUG_ON(down_read_trylock(&mm->mmap_sem)); + BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->anon_vma) + vm_unlock_anon_vma(vma->anon_vma); + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } + + mutex_unlock(&mm_all_locks_mutex); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 index 000000000000..5f4ef0250bee --- /dev/null +++ b/mm/mmu_notifier.c @@ -0,0 +1,277 @@ +/* + * linux/mm/mmu_notifier.c + * + * Copyright (C) 2008 Qumranet, Inc. + * Copyright (C) 2008 SGI + * Christoph Lameter <clameter@sgi.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include <linux/rculist.h> +#include <linux/mmu_notifier.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/err.h> +#include <linux/rcupdate.h> +#include <linux/sched.h> + +/* + * This function can't run concurrently against mmu_notifier_register + * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap + * runs with mm_users == 0. Other tasks may still invoke mmu notifiers + * in parallel despite there being no task using this mm any more, + * through the vmas outside of the exit_mmap context, such as with + * vmtruncate. This serializes against mmu_notifier_unregister with + * the mmu_notifier_mm->lock in addition to RCU and it serializes + * against the other mmu notifiers with RCU. struct mmu_notifier_mm + * can't go away from under us as exit_mmap holds an mm_count pin + * itself. + */ +void __mmu_notifier_release(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + + spin_lock(&mm->mmu_notifier_mm->lock); + while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { + mn = hlist_entry(mm->mmu_notifier_mm->list.first, + struct mmu_notifier, + hlist); + /* + * We arrived before mmu_notifier_unregister so + * mmu_notifier_unregister will do nothing other than + * to wait ->release to finish and + * mmu_notifier_unregister to return. + */ + hlist_del_init_rcu(&mn->hlist); + /* + * RCU here will block mmu_notifier_unregister until + * ->release returns. + */ + rcu_read_lock(); + spin_unlock(&mm->mmu_notifier_mm->lock); + /* + * if ->release runs before mmu_notifier_unregister it + * must be handled as it's the only way for the driver + * to flush all existing sptes and stop the driver + * from establishing any more sptes before all the + * pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); + spin_lock(&mm->mmu_notifier_mm->lock); + } + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * synchronize_rcu here prevents mmu_notifier_release to + * return to exit_mmap (which would proceed freeing all pages + * in the mm) until the ->release method returns, if it was + * invoked by mmu_notifier_unregister. + * + * The mmu_notifier_mm can't go away from under us because one + * mm_count is hold by exit_mmap. + */ + synchronize_rcu(); +} + +/* + * If no young bitflag is supported by the hardware, ->clear_flush_young can + * unmap the address and return 1 or 0 depending if the mapping previously + * existed or not. + */ +int __mmu_notifier_clear_flush_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->clear_flush_young) + young |= mn->ops->clear_flush_young(mn, mm, address); + } + rcu_read_unlock(); + + return young; +} + +void __mmu_notifier_invalidate_page(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } + rcu_read_unlock(); +} + +void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range_start) + mn->ops->invalidate_range_start(mn, mm, start, end); + } + rcu_read_unlock(); +} + +void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->invalidate_range_end) + mn->ops->invalidate_range_end(mn, mm, start, end); + } + rcu_read_unlock(); +} + +static int do_mmu_notifier_register(struct mmu_notifier *mn, + struct mm_struct *mm, + int take_mmap_sem) +{ + struct mmu_notifier_mm *mmu_notifier_mm; + int ret; + + BUG_ON(atomic_read(&mm->mm_users) <= 0); + + ret = -ENOMEM; + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) + goto out; + + if (take_mmap_sem) + down_write(&mm->mmap_sem); + ret = mm_take_all_locks(mm); + if (unlikely(ret)) + goto out_cleanup; + + if (!mm_has_notifiers(mm)) { + INIT_HLIST_HEAD(&mmu_notifier_mm->list); + spin_lock_init(&mmu_notifier_mm->lock); + mm->mmu_notifier_mm = mmu_notifier_mm; + mmu_notifier_mm = NULL; + } + atomic_inc(&mm->mm_count); + + /* + * Serialize the update against mmu_notifier_unregister. A + * side note: mmu_notifier_release can't run concurrently with + * us because we hold the mm_users pin (either implicitly as + * current->mm or explicitly with get_task_mm() or similar). + * We can't race against any other mmu notifier method either + * thanks to mm_take_all_locks(). + */ + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); + spin_unlock(&mm->mmu_notifier_mm->lock); + + mm_drop_all_locks(mm); +out_cleanup: + if (take_mmap_sem) + up_write(&mm->mmap_sem); + /* kfree() does nothing if mmu_notifier_mm is NULL */ + kfree(mmu_notifier_mm); +out: + BUG_ON(atomic_read(&mm->mm_users) <= 0); + return ret; +} + +/* + * Must not hold mmap_sem nor any other VM related lock when calling + * this registration function. Must also ensure mm_users can't go down + * to zero while this runs to avoid races with mmu_notifier_release, + * so mm has to be current->mm or the mm should be pinned safely such + * as with get_task_mm(). If the mm is not current->mm, the mm_users + * pin should be released by calling mmput after mmu_notifier_register + * returns. mmu_notifier_unregister must be always called to + * unregister the notifier. mm_count is automatically pinned to allow + * mmu_notifier_unregister to safely run at any time later, before or + * after exit_mmap. ->release will always be called before exit_mmap + * frees the pages. + */ +int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 1); +} +EXPORT_SYMBOL_GPL(mmu_notifier_register); + +/* + * Same as mmu_notifier_register but here the caller must hold the + * mmap_sem in write mode. + */ +int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) +{ + return do_mmu_notifier_register(mn, mm, 0); +} +EXPORT_SYMBOL_GPL(__mmu_notifier_register); + +/* this is called after the last mmu_notifier_unregister() returned */ +void __mmu_notifier_mm_destroy(struct mm_struct *mm) +{ + BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); + kfree(mm->mmu_notifier_mm); + mm->mmu_notifier_mm = LIST_POISON1; /* debug */ +} + +/* + * This releases the mm_count pin automatically and frees the mm + * structure if it was the last user of it. It serializes against + * running mmu notifiers with RCU and against mmu_notifier_unregister + * with the unregister lock + RCU. All sptes must be dropped before + * calling mmu_notifier_unregister. ->release or any other notifier + * method may be invoked concurrently with mmu_notifier_unregister, + * and only after mmu_notifier_unregister returned we're guaranteed + * that ->release or any other method can't run anymore. + */ +void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) +{ + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + spin_lock(&mm->mmu_notifier_mm->lock); + if (!hlist_unhashed(&mn->hlist)) { + hlist_del_rcu(&mn->hlist); + + /* + * RCU here will force exit_mmap to wait ->release to finish + * before freeing the pages. + */ + rcu_read_lock(); + spin_unlock(&mm->mmu_notifier_mm->lock); + /* + * exit_mmap will block in mmu_notifier_release to + * guarantee ->release is called before freeing the + * pages. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); + } else + spin_unlock(&mm->mmu_notifier_mm->lock); + + /* + * Wait any running method to finish, of course including + * ->release if it was run by mmu_notifier_relase instead of us. + */ + synchronize_rcu(); + + BUG_ON(atomic_read(&mm->mm_count) <= 0); + + mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmu_notifier_unregister); diff --git a/mm/mmzone.c b/mm/mmzone.c index 486ed595ee6f..16ce8b955dcf 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, (z->zone && !zref_in_nodemask(z, nodes))) z++; - *zone = zonelist_zone(z++); + *zone = zonelist_zone(z); return z; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 360d9cc8b38c..fded06f923f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -21,6 +21,7 @@ #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -153,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we * make it unwritable again. - * - * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting - * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { + if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) return -ENOMEM; @@ -205,10 +204,12 @@ success: dirty_accountable = 1; } + mmu_notifier_invalidate_range_start(mm, start, end); if (is_vm_hugetlb_page(vma)) hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; diff --git a/mm/mremap.c b/mm/mremap.c index 08e3c7f2bd15..58a2908f42f5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -18,11 +18,14 @@ #include <linux/highmem.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/mmu_notifier.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -74,7 +77,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + unsigned long old_start; + old_start = old_addr; + mmu_notifier_invalidate_range_start(vma->vm_mm, + old_start, old_end); if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before @@ -116,6 +123,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); + mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } #define LATENCY_LIMIT (64 * PAGE_SIZE) @@ -232,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (vm_flags & VM_LOCKED) { mm->locked_vm += new_len >> PAGE_SHIFT; if (new_len > old_len) - make_pages_present(new_addr + old_len, - new_addr + new_len); + mlock_vma_pages_range(new_vma, new_addr + old_len, + new_addr + new_len); } return new_addr; @@ -373,7 +381,7 @@ unsigned long do_mremap(unsigned long addr, vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; - make_pages_present(addr + old_len, + mlock_vma_pages_range(vma, addr + old_len, addr + new_len); } ret = addr; diff --git a/mm/nommu.c b/mm/nommu.c index 4462b6a3fcb9..2696b24f2bb3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -22,7 +22,7 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/ptrace.h> +#include <linux/tracehook.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/mount.h> @@ -34,6 +34,8 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> +#include "internal.h" + void *high_memory; struct page *mem_map; unsigned long max_mapnr; @@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -/* - * get a list of pages in an address range belonging to the specified process - * and indicate the VMA that covers each page - * - this is potentially dodgy as we may end incrementing the page count of a - * slab page or a secondary page from a compound page - * - don't permit access to VMAs that don't support it, such as I/O mappings - */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); /* calculate required read or write permissions. * - if 'force' is set, we only require the "MAY" flags. @@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, /* protect what we can, including chardevs */ if (vma->vm_flags & (VM_IO | VM_PFNMAP) || - !(vm_flags & vma->vm_flags)) + (!ignore && !(vm_flags & vma->vm_flags))) goto finish_or_fault; if (pages) { @@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, finish_or_fault: return i ? : -EFAULT; } + + +/* + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings + */ +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} EXPORT_SYMBOL(get_user_pages); DEFINE_RWLOCK(vmlist_lock); @@ -266,6 +288,27 @@ void *vmalloc_node(unsigned long size, int node) } EXPORT_SYMBOL(vmalloc_node); +#ifndef PAGE_KERNEL_EXEC +# define PAGE_KERNEL_EXEC PAGE_KERNEL +#endif + +/** + * vmalloc_exec - allocate virtually contiguous, executable memory + * @size: allocation size + * + * Kernel-internal function to allocate enough pages to cover @size + * the page level allocator and map them into contiguous and + * executable kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ + +void *vmalloc_exec(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); +} + /** * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) * @size: allocation size @@ -745,7 +788,7 @@ static unsigned long determine_vm_flags(struct file *file, * it's being traced - otherwise breakpoints set in it may interfere * with another untraced process */ - if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) + if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) vm_flags &= ~VM_MAYSHARE; return vm_flags; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8a5467ee6265..64e5b4bcd964 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -26,6 +26,7 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/memcontrol.h> +#include <linux/security.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) + if (has_capability(p, CAP_SYS_ADMIN) || + has_capability(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. */ - if (__capable(p, CAP_SYS_RAWIO)) + if (has_capability(p, CAP_SYS_RAWIO)) points /= 4; /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94c6d8988ab3..2970e35fd03f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -7,7 +7,7 @@ * Contains functions related to writing back dirty pages at the * address_space level. * - * 10Apr2002 akpm@zip.com.au + * 10Apr2002 Andrew Morton * Initial version */ @@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) - + zone_page_state(z, NR_INACTIVE) - + zone_page_state(z, NR_ACTIVE); + x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) - + global_page_state(NR_INACTIVE) - + global_page_state(NR_ACTIVE); + x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t end; /* Inclusive */ int scanned = 0; int range_whole = 0; + long nr_to_write = wbc->nr_to_write; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -939,7 +936,7 @@ retry: unlock_page(page); ret = 0; } - if (ret || (--(wbc->nr_to_write) <= 0)) + if (ret || (--nr_to_write <= 0)) done = 1; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -958,11 +955,12 @@ retry: index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + if (!wbc->no_nrwrite_index_update) { + if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) + mapping->writeback_index = index; + wbc->nr_to_write = nr_to_write; + } - if (wbc->range_cont) - wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); @@ -1088,7 +1086,7 @@ int __set_page_dirty_nobuffers(struct page *page) if (!mapping) return 1; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); @@ -1102,7 +1100,7 @@ int __set_page_dirty_nobuffers(struct page *page) radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1258,7 +1256,7 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); if (ret) { radix_tree_tag_clear(&mapping->page_tree, @@ -1269,7 +1267,7 @@ int test_clear_page_writeback(struct page *page) __bdi_writeout_inc(bdi); } } - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestClearPageWriteback(page); } @@ -1287,7 +1285,7 @@ int test_set_page_writeback(struct page *page) struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned long flags; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) { radix_tree_tag_set(&mapping->page_tree, @@ -1300,7 +1298,7 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79ac4afc908c..d0a240fbb8bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -44,7 +44,7 @@ #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> -#include <linux/memcontrol.h> +#include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <asm/tlbflush.h> @@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve; static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ - unsigned long __initdata required_kernelcore; + static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; - unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { - void *pc = page_get_page_cgroup(page); - printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", current->comm, page, (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); - if (pc) { - printk(KERN_EMERG "cgroup:%p\n", pc); - page_reset_bad_cgroup(page); - } + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" KERN_EMERG "Backtrace:\n"); dump_stack(); @@ -264,17 +259,18 @@ static void free_compound_page(struct page *page) __free_pages_ok(page, compound_order(page)); } -static void prep_compound_page(struct page *page, unsigned long order) +void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + struct page *p = page + 1; set_compound_page_dtor(page, free_compound_page); set_compound_order(page, order); __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; - + for (i = 1; i < nr_pages; i++, p++) { + if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) + p = pfn_to_page(page_to_pfn(page) + i); __SetPageTail(p); p->first_page = page; } @@ -284,6 +280,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; + struct page *p = page + 1; if (unlikely(compound_order(page) != order)) bad_page(page); @@ -291,8 +288,9 @@ static void destroy_compound_page(struct page *page, unsigned long order) if (unlikely(!PageHead(page))) bad_page(page); __ClearPageHead(page); - for (i = 1; i < nr_pages; i++) { - struct page *p = page + i; + for (i = 1; i < nr_pages; i++, p++) { + if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) + p = pfn_to_page(page_to_pfn(page) + i); if (unlikely(!PageTail(p) | (p->first_page != page))) @@ -432,8 +430,9 @@ static inline void __free_one_page(struct page *page, buddy = __page_find_buddy(page, page_idx, order); if (!page_is_buddy(page, buddy, order)) - break; /* Move the buddy up one level. */ + break; + /* Our buddy is free, merge with it and move up one order. */ list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); @@ -450,14 +449,16 @@ static inline void __free_one_page(struct page *page, static inline int free_pages_check(struct page *page) { + free_page_mlock(page); if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); + if (PageSwapBacked(page)) + __ClearPageSwapBacked(page); /* * For now, we report if PG_reserved was found set, but do not * clear it, and do not free the page. But we shall soon need @@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) /* * permit the bootmem allocator to evade page validation on high-order frees */ -void __free_pages_bootmem(struct page *page, unsigned int order) +void __meminit __free_pages_bootmem(struct page *page, unsigned int order) { if (order == 0) { __ClearPageReserved(page); @@ -596,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) bad_page(page); @@ -610,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); + 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk +#ifdef CONFIG_UNEVICTABLE_LRU + | 1 << PG_mlocked +#endif + ); set_page_private(page, 0); set_page_refcounted(page); @@ -673,9 +677,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { * Note that start_page and end_pages are not aligned on a pageblock * boundary. If alignment is required, use move_freepages_block() */ -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype) +static int move_freepages(struct zone *zone, + struct page *start_page, struct page *end_page, + int migratetype) { struct page *page; unsigned long order; @@ -693,6 +697,9 @@ int move_freepages(struct zone *zone, #endif for (page = start_page; page <= end_page;) { + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone)); + if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; @@ -714,7 +721,8 @@ int move_freepages(struct zone *zone, return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, int migratetype) +static int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -1429,7 +1437,7 @@ try_next_zone: /* * This is the 'heart' of the zoned buddy allocator. */ -static struct page * +struct page * __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { @@ -1632,22 +1640,7 @@ nopage: got_pg: return page; } - -struct page * -__alloc_pages(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); -} - -struct page * -__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, - struct zonelist *zonelist, nodemask_t *nodemask) -{ - return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); -} - -EXPORT_SYMBOL(__alloc_pages); +EXPORT_SYMBOL(__alloc_pages_internal); /* * Common helper functions. @@ -1711,6 +1704,59 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); +/** + * alloc_pages_exact - allocate an exact number physically-contiguous pages. + * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation + * + * This function is similar to alloc_pages(), except that it allocates the + * minimum number of pages to satisfy the request. alloc_pages() can only + * allocate memory in power-of-two pages. + * + * This function is also limited by MAX_ORDER. + * + * Memory allocated by this function must be released by free_pages_exact(). + */ +void *alloc_pages_exact(size_t size, gfp_t gfp_mask) +{ + unsigned int order = get_order(size); + unsigned long addr; + + addr = __get_free_pages(gfp_mask, order); + if (addr) { + unsigned long alloc_end = addr + (PAGE_SIZE << order); + unsigned long used = addr + PAGE_ALIGN(size); + + split_page(virt_to_page(addr), order); + while (used < alloc_end) { + free_page(used); + used += PAGE_SIZE; + } + } + + return (void *)addr; +} +EXPORT_SYMBOL(alloc_pages_exact); + +/** + * free_pages_exact - release memory allocated via alloc_pages_exact() + * @virt: the value returned by alloc_pages_exact. + * @size: size of allocation, same value as passed to alloc_pages_exact(). + * + * Release the memory allocated by a previous call to alloc_pages_exact. + */ +void free_pages_exact(void *virt, size_t size) +{ + unsigned long addr = (unsigned long)virt; + unsigned long end = addr + PAGE_ALIGN(size); + + while (addr < end) { + free_page(addr); + addr += PAGE_SIZE; + } +} +EXPORT_SYMBOL(free_pages_exact); + static unsigned int nr_free_zone_pages(int offset) { struct zoneref *z; @@ -1816,10 +1862,21 @@ void show_free_areas(void) } } - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" + printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" + " inactive_file:%lu" +//TODO: check/adjust line lengths +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lu" +#endif + " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", - global_page_state(NR_ACTIVE), - global_page_state(NR_INACTIVE), + global_page_state(NR_ACTIVE_ANON), + global_page_state(NR_ACTIVE_FILE), + global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_INACTIVE_FILE), +#ifdef CONFIG_UNEVICTABLE_LRU + global_page_state(NR_UNEVICTABLE), +#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -1842,8 +1899,13 @@ void show_free_areas(void) " min:%lukB" " low:%lukB" " high:%lukB" - " active:%lukB" - " inactive:%lukB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lukB" +#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -1853,8 +1915,13 @@ void show_free_areas(void) K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), - K(zone_page_state(zone, NR_ACTIVE)), - K(zone_page_state(zone, NR_INACTIVE)), + K(zone_page_state(zone, NR_ACTIVE_ANON)), + K(zone_page_state(zone, NR_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ACTIVE_FILE)), + K(zone_page_state(zone, NR_INACTIVE_FILE)), +#ifdef CONFIG_UNEVICTABLE_LRU + K(zone_page_state(zone, NR_UNEVICTABLE)), +#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") @@ -2332,7 +2399,7 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ -/* return values int ....just for stop_machine_run() */ +/* return values int ....just for stop_machine() */ static int __build_all_zonelists(void *dummy) { int nid; @@ -2352,11 +2419,12 @@ void build_all_zonelists(void) if (system_state == SYSTEM_BOOTING) { __build_all_zonelists(NULL); + mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); + stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); @@ -2475,6 +2543,10 @@ static void setup_zone_migrate_reserve(struct zone *zone) continue; page = pfn_to_page(pfn); + /* Watch out for overlapping nodes */ + if (page_to_nid(page) != zone_to_nid(zone)) + continue; + /* Blocks with reserved pages will never free, skip them. */ if (PageReserved(page)) continue; @@ -2534,6 +2606,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, } page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); + mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); @@ -2611,7 +2684,7 @@ static int zone_batchsize(struct zone *zone) return batch; } -inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; @@ -2836,6 +2909,12 @@ __meminit int init_currently_empty_zone(struct zone *zone, zone->zone_start_pfn = zone_start_pfn; + mminit_dprintk(MMINIT_TRACE, "memmap_init", + "Initialising map node %d zone %lu pfns %lu -> %lu\n", + pgdat->node_id, + (unsigned long)zone_idx(zone), + zone_start_pfn, (zone_start_pfn + size)); + zone_init_free_lists(zone); return 0; @@ -2975,7 +3054,8 @@ void __init sparse_memory_present_with_active_regions(int nid) void __init push_node_boundaries(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn) { - printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", + mminit_dprintk(MMINIT_TRACE, "zoneboundary", + "Entering push_node_boundaries(%u, %lu, %lu)\n", nid, start_pfn, end_pfn); /* Initialise the boundary for this node if necessary */ @@ -2993,7 +3073,8 @@ void __init push_node_boundaries(unsigned int nid, static void __meminit account_node_boundary(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { - printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", + mminit_dprintk(MMINIT_TRACE, "zoneboundary", + "Entering account_node_boundary(%u, %lu, %lu)\n", nid, *start_pfn, *end_pfn); /* Return if boundary information has not been provided */ @@ -3050,7 +3131,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, * assumption is made that zones within a node are ordered in monotonic * increasing memory addresses so that the "highest" populated zone is used */ -void __init find_usable_zone_for_movable(void) +static void __init find_usable_zone_for_movable(void) { int zone_index; for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { @@ -3076,7 +3157,7 @@ void __init find_usable_zone_for_movable(void) * highest usable zone for ZONE_MOVABLE. This preserves the assumption that * zones within a node are in order of monotonic increases memory addresses */ -void __meminit adjust_zone_range_for_zone_movable(int nid, +static void __meminit adjust_zone_range_for_zone_movable(int nid, unsigned long zone_type, unsigned long node_start_pfn, unsigned long node_end_pfn, @@ -3137,7 +3218,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -unsigned long __meminit __absent_pages_in_range(int nid, +static unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -3350,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; + pgdat_page_cgroup_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; + enum lru_list l; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -3404,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); - zone->nr_scan_active = 0; - zone->nr_scan_inactive = 0; + for_each_lru(l) { + INIT_LIST_HEAD(&zone->lru[l].list); + zone->lru[l].nr_scan = 0; + } + zone->recent_rotated[0] = 0; + zone->recent_rotated[1] = 0; + zone->recent_scanned[0] = 0; + zone->recent_scanned[1] = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) @@ -3464,10 +3551,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long node_start_pfn, - unsigned long *zholes_size) +void __paginginit free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, unsigned long *zholes_size) { + pg_data_t *pgdat = NODE_DATA(nid); + pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; calculate_node_totalpages(pgdat, zones_size, zholes_size); @@ -3520,10 +3608,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, { int i; - printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " - "%d entries of %d used\n", - nid, start_pfn, end_pfn, - nr_nodemap_entries, MAX_ACTIVE_REGIONS); + mminit_dprintk(MMINIT_TRACE, "memory_register", + "Entering add_active_range(%d, %#lx, %#lx) " + "%d entries of %d used\n", + nid, start_pfn, end_pfn, + nr_nodemap_entries, MAX_ACTIVE_REGIONS); + + mminit_validate_memmodel_limits(&start_pfn, &end_pfn); /* Merge with existing active regions if possible */ for (i = 0; i < nr_nodemap_entries; i++) { @@ -3669,7 +3760,7 @@ static void __init sort_node_map(void) } /* Find the lowest pfn for a node */ -unsigned long __init find_min_pfn_for_node(int nid) +static unsigned long __init find_min_pfn_for_node(int nid) { int i; unsigned long min_pfn = ULONG_MAX; @@ -3698,23 +3789,6 @@ unsigned long __init find_min_pfn_with_active_regions(void) return find_min_pfn_for_node(MAX_NUMNODES); } -/** - * find_max_pfn_with_active_regions - Find the maximum PFN registered - * - * It returns the maximum PFN based on information provided via - * add_active_range(). - */ -unsigned long __init find_max_pfn_with_active_regions(void) -{ - int i; - unsigned long max_pfn = 0; - - for (i = 0; i < nr_nodemap_entries; i++) - max_pfn = max(max_pfn, early_node_map[i].end_pfn); - - return max_pfn; -} - /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. @@ -3741,7 +3815,7 @@ static unsigned long __init early_calculate_totalpages(void) * memory. When they don't, some nodes will have more kernelcore than * others */ -void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) +static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) { int i, nid; unsigned long usable_startpfn; @@ -3904,7 +3978,7 @@ static void check_for_regular_memory(pg_data_t *pgdat) void __init free_area_init_nodes(unsigned long *max_zone_pfn) { unsigned long nid; - enum zone_type i; + int i; /* Sort early_node_map as initialisation assumes it is sorted */ sort_node_map(); @@ -3957,10 +4031,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].end_pfn); /* Initialise every node */ + mminit_verify_pageflags_layout(); setup_nr_node_ids(); for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, pgdat, NULL, + free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), NULL); /* Any memory on that node */ @@ -4025,15 +4100,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -static bootmem_data_t contig_bootmem_data; -struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; - +struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; EXPORT_SYMBOL(contig_page_data); #endif void __init free_area_init(unsigned long *zones_size) { - free_area_init_node(0, NODE_DATA(0), zones_size, + free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } @@ -4163,7 +4236,7 @@ void setup_per_zone_pages_min(void) for_each_zone(zone) { u64 tmp; - spin_lock_irqsave(&zone->lru_lock, flags); + spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone->present_pages; do_div(tmp, lowmem_pages); if (is_highmem(zone)) { @@ -4195,13 +4268,53 @@ void setup_per_zone_pages_min(void) zone->pages_low = zone->pages_min + (tmp >> 2); zone->pages_high = zone->pages_min + (tmp >> 1); setup_zone_migrate_reserve(zone); - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(&zone->lock, flags); } /* update totalreserve_pages */ calculate_totalreserve_pages(); } +/** + * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. + * + * The inactive anon list should be small enough that the VM never has to + * do too much work, but large enough that each inactive page has a chance + * to be referenced again before it is swapped out. + * + * The inactive_anon ratio is the target ratio of ACTIVE_ANON to + * INACTIVE_ANON pages on this zone's LRU, maintained by the + * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of + * the anonymous pages are kept on the inactive list. + * + * total target max + * memory ratio inactive anon + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB + */ +void setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) { + unsigned int gb, ratio; + + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + ratio = int_sqrt(10 * gb); + if (!ratio) + ratio = 1; + + zone->inactive_ratio = ratio; + } +} + /* * Initialise min_free_kbytes. * @@ -4239,6 +4352,7 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 65536; setup_per_zone_pages_min(); setup_per_zone_lowmem_reserve(); + setup_per_zone_inactive_ratio(); return 0; } module_init(init_per_zone_pages_min) @@ -4400,7 +4514,7 @@ void *__init alloc_large_system_hash(const char *tablename, do { size = bucketsize << log2qty; if (flags & HASH_EARLY) - table = alloc_bootmem(size); + table = alloc_bootmem_nopanic(size); else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c new file mode 100644 index 000000000000..f59d797dc5a9 --- /dev/null +++ b/mm/page_cgroup.c @@ -0,0 +1,256 @@ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/bit_spinlock.h> +#include <linux/page_cgroup.h> +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/memory.h> +#include <linux/vmalloc.h> +#include <linux/cgroup.h> + +static void __meminit +__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +{ + pc->flags = 0; + pc->mem_cgroup = NULL; + pc->page = pfn_to_page(pfn); +} +static unsigned long total_usage; + +#if !defined(CONFIG_SPARSEMEM) + + +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ + pgdat->node_page_cgroup = NULL; +} + +struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long offset; + struct page_cgroup *base; + + base = NODE_DATA(page_to_nid(page))->node_page_cgroup; + if (unlikely(!base)) + return NULL; + + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; + return base + offset; +} + +static int __init alloc_node_page_cgroup(int nid) +{ + struct page_cgroup *base, *pc; + unsigned long table_size; + unsigned long start_pfn, nr_pages, index; + + start_pfn = NODE_DATA(nid)->node_start_pfn; + nr_pages = NODE_DATA(nid)->node_spanned_pages; + + table_size = sizeof(struct page_cgroup) * nr_pages; + + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!base) + return -ENOMEM; + for (index = 0; index < nr_pages; index++) { + pc = base + index; + __init_page_cgroup(pc, start_pfn + index); + } + NODE_DATA(nid)->node_page_cgroup = base; + total_usage += table_size; + return 0; +} + +void __init page_cgroup_init(void) +{ + + int nid, fail; + + if (mem_cgroup_subsys.disabled) + return; + + for_each_online_node(nid) { + fail = alloc_node_page_cgroup(nid); + if (fail) + goto fail; + } + printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); + printk(KERN_INFO "please try cgroup_disable=memory option if you" + " don't want\n"); + return; +fail: + printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); + printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); + panic("Out of memory"); +} + +#else /* CONFIG_FLAT_NODE_MEM_MAP */ + +struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); + + return section->page_cgroup + pfn; +} + +int __meminit init_section_page_cgroup(unsigned long pfn) +{ + struct mem_section *section; + struct page_cgroup *base, *pc; + unsigned long table_size; + int nid, index; + + section = __pfn_to_section(pfn); + + if (section->page_cgroup) + return 0; + + nid = page_to_nid(pfn_to_page(pfn)); + + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + if (slab_is_available()) { + base = kmalloc_node(table_size, GFP_KERNEL, nid); + if (!base) + base = vmalloc_node(table_size, nid); + } else { + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + } + + if (!base) { + printk(KERN_ERR "page cgroup allocation failure\n"); + return -ENOMEM; + } + + for (index = 0; index < PAGES_PER_SECTION; index++) { + pc = base + index; + __init_page_cgroup(pc, pfn + index); + } + + section = __pfn_to_section(pfn); + section->page_cgroup = base - pfn; + total_usage += table_size; + return 0; +} +#ifdef CONFIG_MEMORY_HOTPLUG +void __free_page_cgroup(unsigned long pfn) +{ + struct mem_section *ms; + struct page_cgroup *base; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_cgroup) + return; + base = ms->page_cgroup + pfn; + if (is_vmalloc_addr(base)) { + vfree(base); + ms->page_cgroup = NULL; + } else { + struct page *page = virt_to_page(base); + if (!PageReserved(page)) { /* Is bootmem ? */ + kfree(base); + ms->page_cgroup = NULL; + } + } +} + +int online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) +{ + unsigned long start, end, pfn; + int fail = 0; + + start = start_pfn & (PAGES_PER_SECTION - 1); + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_cgroup(pfn); + } + if (!fail) + return 0; + + /* rollback */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_cgroup(pfn); + + return -ENOMEM; +} + +int offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) +{ + unsigned long start, end, pfn; + + start = start_pfn & (PAGES_PER_SECTION - 1); + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_cgroup(pfn); + return 0; + +} + +static int page_cgroup_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + int ret = 0; + switch (action) { + case MEM_GOING_ONLINE: + ret = online_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + offline_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_GOING_OFFLINE: + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + ret = notifier_from_errno(ret); + return ret; +} + +#endif + +void __init page_cgroup_init(void) +{ + unsigned long pfn; + int fail = 0; + + if (mem_cgroup_subsys.disabled) + return; + + for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_cgroup(pfn); + } + if (fail) { + printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); + panic("Out of memory"); + } else { + hotplug_memory_notifier(page_cgroup_callback, 0); + } + printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); + printk(KERN_INFO "please try cgroup_disable=memory option if you don't" + " want\n"); +} + +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ + return; +} + +#endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3444b58033c8..b70a7fec1ff6 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -2,7 +2,6 @@ * linux/mm/page_isolation.c */ -#include <stddef.h> #include <linux/mm.h> #include <linux/page-isolation.h> #include <linux/pageblock-flags.h> @@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pfn; + unsigned long pfn, flags; struct page *page; + struct zone *zone; + int ret; pfn = start_pfn; /* @@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) if (pfn < end_pfn) return -EBUSY; /* Check all pages are free or Marked as ISOLATED */ - if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) - return 0; - return -EBUSY; + zone = page_zone(pfn_to_page(pfn)); + spin_lock_irqsave(&zone->lock, flags); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); + spin_unlock_irqrestore(&zone->lock, flags); + return ret ? 0 : -EBUSY; } diff --git a/mm/pdflush.c b/mm/pdflush.c index 9d834aa4b979..a0a14c4d5072 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -3,7 +3,7 @@ * * Copyright (C) 2002, Linus Torvalds. * - * 09Apr2002 akpm@zip.com.au + * 09Apr2002 Andrew Morton * Initial version * 29Feb2004 kaos@sgi.com * Move worker thread creation to kthread to avoid chewing @@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work) * Thread creation: For how long have there been zero * available threads? */ - if (jiffies - last_empty_jifs > 1 * HZ) { + if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { /* unlocked list_empty() test is OK here */ if (list_empty(&pdflush_list)) { /* unlocked test is OK here */ @@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work) if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { + if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { /* Limit exit rate */ pdf->when_i_went_to_sleep = jiffies; break; /* exeunt */ diff --git a/mm/quicklist.c b/mm/quicklist.c index 3f703f7cb398..8dbb6805ef35 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; static unsigned long max_pages(unsigned long min_pages) { unsigned long node_free_pages, max; - struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + int node = numa_node_id(); + struct zone *zones = NODE_DATA(node)->node_zones; + int num_cpus_on_node; + node_to_cpumask_ptr(cpumask_on_node, node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages) zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); max = node_free_pages / FRACTION_OF_NODE_MEM; + + num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + max /= num_cpus_on_node; + return max(max, min_pages); } diff --git a/mm/readahead.c b/mm/readahead.c index d8723a5f6496..bec83c15a78f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -3,7 +3,7 @@ * * Copyright (C) 2002, Linus Torvalds * - * 09Apr2002 akpm@zip.com.au + * 09Apr2002 Andrew Morton * Initial version. */ @@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) + return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } @@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping, if (hit_readahead_marker) { pgoff_t start; - read_lock_irq(&mapping->tree_lock); - start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); + start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); + rcu_read_unlock(); if (!start || start - offset > max) return 0; diff --git a/mm/rmap.c b/mm/rmap.c index bf0a5b7cfb8e..10993942d6c9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -49,12 +49,51 @@ #include <linux/module.h> #include <linux/kallsyms.h> #include <linux/memcontrol.h> +#include <linux/mmu_notifier.h> #include <asm/tlbflush.h> -struct kmem_cache *anon_vma_cachep; +#include "internal.h" -/* This must be called under the mmap_sem. */ +static struct kmem_cache *anon_vma_cachep; + +static inline struct anon_vma *anon_vma_alloc(void) +{ + return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); +} + +static inline void anon_vma_free(struct anon_vma *anon_vma) +{ + kmem_cache_free(anon_vma_cachep, anon_vma); +} + +/** + * anon_vma_prepare - attach an anon_vma to a memory region + * @vma: the memory region in question + * + * This makes sure the memory mapping described by 'vma' has + * an 'anon_vma' attached to it, so that we can associate the + * anonymous pages mapped into it with that anon_vma. + * + * The common case will be that we already have one, but if + * if not we either need to find an adjacent mapping that we + * can re-use the anon_vma from (very common when the only + * reason for splitting a vma has been mprotect()), or we + * allocate a new one. + * + * Anon-vma allocations are very subtle, because we may have + * optimistically looked up an anon_vma in page_lock_anon_vma() + * and that may actually touch the spinlock even in the newly + * allocated vma (it depends on RCU to make sure that the + * anon_vma isn't actually destroyed). + * + * As a result, we need to do proper anon_vma locking even + * for the new allocation. At the same time, we do not want + * to do any locking for the common case of already having + * an anon_vma. + * + * This must be called with the mmap_sem held for reading. + */ int anon_vma_prepare(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; @@ -62,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma) might_sleep(); if (unlikely(!anon_vma)) { struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated, *locked; + struct anon_vma *allocated; anon_vma = find_mergeable_anon_vma(vma); - if (anon_vma) { - allocated = NULL; - locked = anon_vma; - spin_lock(&locked->lock); - } else { + allocated = NULL; + if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) return -ENOMEM; allocated = anon_vma; - locked = NULL; } + spin_lock(&anon_vma->lock); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); @@ -86,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) } spin_unlock(&mm->page_table_lock); - if (locked) - spin_unlock(&locked->lock); + spin_unlock(&anon_vma->lock); if (unlikely(allocated)) anon_vma_free(allocated); } @@ -138,7 +173,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) anon_vma_free(anon_vma); } -static void anon_vma_ctor(struct kmem_cache *cachep, void *data) +static void anon_vma_ctor(void *data) { struct anon_vma *anon_vma = data; @@ -156,7 +191,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -176,7 +211,7 @@ out: return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -223,10 +258,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) /* * Check that @page is mapped at @address into @mm. * + * If @sync is false, page_check_address may perform a racy check to avoid + * the page table lock when the pte is not present (helpful when reclaiming + * highly shared pages). + * * On success returns with pte mapped and locked. */ pte_t *page_check_address(struct page *page, struct mm_struct *mm, - unsigned long address, spinlock_t **ptlp) + unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; pud_t *pud; @@ -248,7 +287,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ - if (!pte_present(*pte)) { + if (!sync && !pte_present(*pte)) { pte_unmap(pte); return NULL; } @@ -263,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, return NULL; } +/** + * page_mapped_in_vma - check whether a page is really mapped in a VMA + * @page: the page to test + * @vma: the VMA to test + * + * Returns 1 if the page is mapped into the page tables of the VMA, 0 + * if the page is not mapped into the page tables of this VMA. Only + * valid for normal file or anonymous VMAs. + */ +static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address; + pte_t *pte; + spinlock_t *ptl; + + address = vma_address(page, vma); + if (address == -EFAULT) /* out of vma range */ + return 0; + pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); + if (!pte) /* the page is not in this mm */ + return 0; + pte_unmap_unlock(pte, ptl); + + return 1; +} + /* * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. @@ -280,14 +345,21 @@ static int page_referenced_one(struct page *page, if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; + /* + * Don't want to elevate referenced for mlocked page that gets this far, + * in order that it progresses to try_to_unmap and is moved to the + * unevictable list. + */ if (vma->vm_flags & VM_LOCKED) { - referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young(vma, address, pte)) + goto out_unmap; + } + + if (ptep_clear_flush_young_notify(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -296,6 +368,7 @@ static int page_referenced_one(struct page *page, rwsem_is_locked(&mm->mmap_sem)) referenced++; +out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: @@ -385,11 +458,6 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) - == (VM_LOCKED|VM_MAYSHARE)) { - referenced++; - break; - } referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; @@ -421,7 +489,7 @@ int page_referenced(struct page *page, int is_locked, referenced += page_referenced_anon(page, mem_cont); else if (is_locked) referenced += page_referenced_file(page, mem_cont); - else if (TestSetPageLocked(page)) + else if (!trylock_page(page)) referenced++; else { if (page->mapping) @@ -449,7 +517,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) goto out; @@ -457,7 +525,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) pte_t entry; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, address, pte); + entry = ptep_clear_flush_notify(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(mm, address, pte, entry); @@ -576,14 +644,8 @@ void page_add_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); if (atomic_inc_and_test(&page->_mapcount)) __page_set_anon_rmap(page, vma, address); - else { + else __page_check_anon_rmap(page, vma, address); - /* - * We unconditionally charged during prepare, we uncharge here - * This takes care of balancing the reference counts - */ - mem_cgroup_uncharge_page(page); - } } /** @@ -614,12 +676,6 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) __inc_zone_page_state(page, NR_FILE_MAPPED); - else - /* - * We unconditionally charged during prepare, we uncharge here - * This takes care of balancing the reference counts - */ - mem_cgroup_uncharge_page(page); } #ifdef CONFIG_DEBUG_VM @@ -670,6 +726,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) } /* + * Now that the last pte has gone, s390 must transfer dirty + * flag from storage key to struct page. We can usually skip + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. + */ + if ((!PageAnon(page) || PageSwapCache(page)) && + page_test_dirty(page)) { + page_clear_dirty(page); + set_page_dirty(page); + } + if (PageAnon(page)) + mem_cgroup_uncharge_page(page); + __dec_zone_page_state(page, + PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); + /* * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping @@ -678,14 +750,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. */ - if (page_test_dirty(page)) { - page_clear_dirty(page); - set_page_dirty(page); - } - mem_cgroup_uncharge_page(page); - - __dec_zone_page_state(page, - PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); } } @@ -707,7 +771,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (address == -EFAULT) goto out; - pte = page_check_address(page, mm, address, &ptl); + pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -716,15 +780,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young(vma, address, pte)))) { - ret = SWAP_FAIL; - goto out_unmap; - } + if (!migration) { + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_MLOCK; + goto out_unmap; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { + ret = SWAP_FAIL; + goto out_unmap; + } + } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) @@ -801,12 +870,17 @@ out: * For very sparsely populated VMAs this is a little inefficient - chances are * there there won't be many ptes located within the scan cluster. In this case * maybe we could scan further - to the end of the pte page, perhaps. + * + * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can + * acquire it without blocking. If vma locked, mlock the pages in the cluster, + * rather than unmapping them. If we encounter the "check_page" that vmscan is + * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. */ #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) -static void try_to_unmap_cluster(unsigned long cursor, - unsigned int *mapcount, struct vm_area_struct *vma) +static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, + struct vm_area_struct *vma, struct page *check_page) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -818,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; + int ret = SWAP_AGAIN; + int locked_vma = 0; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -828,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor, pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - return; + return ret; pud = pud_offset(pgd, address); if (!pud_present(*pud)) - return; + return ret; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) - return; + return ret; + + /* + * MLOCK_PAGES => feature is configured. + * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, + * keep the sem while scanning the cluster for mlocking pages. + */ + if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { + locked_vma = (vma->vm_flags & VM_LOCKED); + if (!locked_vma) + up_read(&vma->vm_mm->mmap_sem); /* don't need it */ + } pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -849,12 +936,19 @@ static void try_to_unmap_cluster(unsigned long cursor, page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); - if (ptep_clear_flush_young(vma, address, pte)) + if (locked_vma) { + mlock_vma_page(page); /* no-op if already mlocked */ + if (page == check_page) + ret = SWAP_MLOCK; + continue; /* don't unmap */ + } + + if (ptep_clear_flush_young_notify(vma, address, pte)) continue; /* Nuke the page table entry. */ flush_cache_page(vma, address, pte_pfn(*pte)); - pteval = ptep_clear_flush(vma, address, pte); + pteval = ptep_clear_flush_notify(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ if (page->index != linear_page_index(vma, address)) @@ -870,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + if (locked_vma) + up_read(&vma->vm_mm->mmap_sem); + return ret; } -static int try_to_unmap_anon(struct page *page, int migration) +/* + * common handling for pages mapped in VM_LOCKED vmas + */ +static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) +{ + int mlocked = 0; + + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + mlocked++; /* really mlocked the page */ + } + up_read(&vma->vm_mm->mmap_sem); + } + return mlocked; +} + +/** + * try_to_unmap_anon - unmap or unlock anonymous page using the object-based + * rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * This function is only called from try_to_unmap/try_to_munlock for + * anonymous pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. + */ +static int try_to_unmap_anon(struct page *page, int unlock, int migration) { struct anon_vma *anon_vma; struct vm_area_struct *vma; + unsigned int mlocked = 0; int ret = SWAP_AGAIN; + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ + anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - break; + if (MLOCK_PAGES && unlikely(unlock)) { + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) + continue; /* must visit all unlocked vmas */ + ret = SWAP_MLOCK; /* saw at least one mlocked vma */ + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + break; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } } page_unlock_anon_vma(anon_vma); + + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ + return ret; } /** - * try_to_unmap_file - unmap file page using the object-based rmap method - * @page: the page to unmap - * @migration: migration flag + * try_to_unmap_file - unmap/unlock file page using the object-based rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. * - * This function is only called from try_to_unmap for object-based pages. + * This function is only called from try_to_unmap/try_to_munlock for + * object-based pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int migration) +static int try_to_unmap_file(struct page *page, int unlock, int migration) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -913,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration) unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; + unsigned int mlocked = 0; + + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - goto out; + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + goto out; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } } + if (mlocked) + goto out; + if (list_empty(&mapping->i_mmap_nonlinear)) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if ((vma->vm_flags & VM_LOCKED) && !migration) + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; /* leave mlocked == 0 */ + goto out; /* no need to look further */ + } + if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -936,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration) max_nl_size = cursor; } - if (max_nl_size == 0) { /* any nonlinears locked or reserved */ + if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ ret = SWAP_FAIL; goto out; } @@ -960,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if ((vma->vm_flags & VM_LOCKED) && !migration) + if (!MLOCK_PAGES && !migration && + (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { - try_to_unmap_cluster(cursor, &mapcount, vma); + ret = try_to_unmap_cluster(cursor, &mapcount, + vma, page); + if (ret == SWAP_MLOCK) + mlocked = 2; /* to return below */ cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) @@ -986,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration) vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ return ret; } @@ -1001,6 +1192,7 @@ out: * SWAP_SUCCESS - we succeeded in removing all mappings * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable + * SWAP_MLOCK - page is mlocked. */ int try_to_unmap(struct page *page, int migration) { @@ -1009,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration) BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, migration); + ret = try_to_unmap_anon(page, 0, migration); else - ret = try_to_unmap_file(page, migration); - - if (!page_mapped(page)) + ret = try_to_unmap_file(page, 0, migration); + if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; } +#ifdef CONFIG_UNEVICTABLE_LRU +/** + * try_to_munlock - try to munlock a page + * @page: the page to be munlocked + * + * Called from munlock code. Checks all of the VMAs mapping the page + * to make sure nobody else has this page mlocked. The page will be + * returned with PG_mlocked cleared if no other vmas have it mlocked. + * + * Return values are: + * + * SWAP_SUCCESS - no vma's holding page mlocked. + * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem + * SWAP_MLOCK - page is now mlocked. + */ +int try_to_munlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); + + if (PageAnon(page)) + return try_to_unmap_anon(page, 1, 0); + else + return try_to_unmap_file(page, 1, 0); +} +#endif diff --git a/mm/shmem.c b/mm/shmem.c index e2a6ae1a44e9..d38d7e61fcd0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -50,14 +50,12 @@ #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/seq_file.h> +#include <linux/magic.h> #include <asm/uaccess.h> #include <asm/div64.h> #include <asm/pgtable.h> -/* This magic number is used in glibc for posix shared memory */ -#define TMPFS_MAGIC 0x01021994 - #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) @@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = default_unplug_io_fn, }; @@ -922,20 +920,26 @@ found: error = 1; if (!inode) goto out; - /* Precharge page while we can wait, compensate afterwards */ + /* Precharge page using GFP_KERNEL while we can wait */ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) goto out; error = radix_tree_preload(GFP_KERNEL); - if (error) - goto uncharge; + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto out; + } error = 1; spin_lock(&info->lock); ptr = shmem_swp_entry(info, idx, NULL); - if (ptr && ptr->val == entry.val) - error = add_to_page_cache(page, inode->i_mapping, + if (ptr && ptr->val == entry.val) { + error = add_to_page_cache_locked(page, inode->i_mapping, idx, GFP_NOWAIT); + /* does mem_cgroup_uncharge_cache_page on error */ + } else /* we must compensate for our precharge above */ + mem_cgroup_uncharge_cache_page(page); + if (error == -EEXIST) { struct page *filepage = find_get_page(inode->i_mapping, idx); error = 1; @@ -961,8 +965,6 @@ found: shmem_swp_unmap(ptr); spin_unlock(&info->lock); radix_tree_preload_end(); -uncharge: - mem_cgroup_uncharge_page(page); out: unlock_page(page); page_cache_release(page); @@ -1261,7 +1263,7 @@ repeat: } /* We have to do this with page locked to prevent races */ - if (TestSetPageLocked(swappage)) { + if (!trylock_page(swappage)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); wait_on_page_locked(swappage); @@ -1297,8 +1299,8 @@ repeat: SetPageUptodate(filepage); set_page_dirty(filepage); swap_free(swap); - } else if (!(error = add_to_page_cache( - swappage, mapping, idx, GFP_NOWAIT))) { + } else if (!(error = add_to_page_cache_locked(swappage, mapping, + idx, GFP_NOWAIT))) { info->flags |= SHMEM_PAGEIN; shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); @@ -1311,24 +1313,21 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(swappage); + page_cache_release(swappage); if (error == -ENOMEM) { /* allow reclaim from this memory cgroup */ - error = mem_cgroup_cache_charge(swappage, - current->mm, gfp & ~__GFP_HIGHMEM); - if (error) { - page_cache_release(swappage); + error = mem_cgroup_shrink_usage(current->mm, + gfp); + if (error) goto failed; - } - mem_cgroup_uncharge_page(swappage); } - page_cache_release(swappage); goto repeat; } } else if (sgp == SGP_READ && !filepage) { shmem_swp_unmap(entry); filepage = find_get_page(mapping, idx); if (filepage && - (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { + (!PageUptodate(filepage) || !trylock_page(filepage))) { spin_unlock(&info->lock); wait_on_page_locked(filepage); page_cache_release(filepage); @@ -1358,6 +1357,8 @@ repeat: } if (!filepage) { + int ret; + spin_unlock(&info->lock); filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { @@ -1366,6 +1367,7 @@ repeat: error = -ENOMEM; goto failed; } + SetPageSwapBacked(filepage); /* Precharge page while we can wait, compensate after */ error = mem_cgroup_cache_charge(filepage, current->mm, @@ -1386,10 +1388,18 @@ repeat: swap = *entry; shmem_swp_unmap(entry); } - if (error || swap.val || 0 != add_to_page_cache_lru( - filepage, mapping, idx, GFP_NOWAIT)) { + ret = error || swap.val; + if (ret) + mem_cgroup_uncharge_cache_page(filepage); + else + ret = add_to_page_cache_lru(filepage, mapping, + idx, GFP_NOWAIT); + /* + * At add_to_page_cache_lru() failure, uncharge will + * be done automatically. + */ + if (ret) { spin_unlock(&info->lock); - mem_cgroup_uncharge_page(filepage); page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); @@ -1398,7 +1408,6 @@ repeat: goto failed; goto repeat; } - mem_cgroup_uncharge_page(filepage); info->flags |= SHMEM_PAGEIN; } @@ -1468,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) if (!user_shm_lock(inode->i_size, user)) goto out_nomem; info->flags |= VM_LOCKED; + mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & VM_LOCKED) && user) { user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; + mapping_clear_unevictable(file->f_mapping); + scan_mapping_unevictable_pages(file->f_mapping); } retval = 0; + out_nomem: spin_unlock(&info->lock); return retval; @@ -1503,7 +1516,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; inode->i_blocks = 0; - inode->i_mapping->a_ops = &shmem_aops; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_generation = get_seconds(); @@ -1518,6 +1530,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) init_special_inode(inode, mode, dev); break; case S_IFREG: + inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; mpol_shared_policy_init(&info->policy, @@ -1690,26 +1703,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ file_accessed(filp); } -static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +static ssize_t shmem_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - read_descriptor_t desc; + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; - if ((ssize_t) count < 0) - return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - if (!count) - return 0; + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; - desc.written = 0; - desc.count = count; - desc.arg.buf = buf; - desc.error = 0; + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; - do_shmem_file_read(filp, ppos, &desc, file_read_actor); - if (desc.written) - return desc.written; - return desc.error; + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_shmem_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + if (desc.count > 0) + break; + } + return retval; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1907,6 +1932,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return error; } unlock_page(page); + inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); @@ -2330,7 +2356,7 @@ static void shmem_destroy_inode(struct inode *inode) kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -static void init_once(struct kmem_cache *cachep, void *foo) +static void init_once(void *foo) { struct shmem_inode_info *p = (struct shmem_inode_info *) foo; @@ -2369,8 +2395,9 @@ static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, - .read = shmem_file_read, + .read = do_sync_read, .write = do_sync_write, + .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, @@ -2558,6 +2585,7 @@ put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_zero_setup - setup a shared anonymous mapping diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index f5664c5b9eb1..8e5aadd7dcd6 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask) * shmem_permission - permission() inode operation */ int -shmem_permission(struct inode *inode, int mask, struct nameidata *nd) +shmem_permission(struct inode *inode, int mask) { return generic_permission(inode, mask, shmem_check_acl); } diff --git a/mm/slab.c b/mm/slab.c index 052e7d64537e..09187517f9dc 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -95,6 +95,7 @@ #include <linux/init.h> #include <linux/compiler.h> #include <linux/cpuset.h> +#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/kallsyms.h> @@ -406,7 +407,7 @@ struct kmem_cache { unsigned int dflags; /* dynamic flags */ /* constructor func */ - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *obj); /* 5) cache creation/removal */ const char *name; @@ -2137,8 +2138,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) */ struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, - unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + unsigned long flags, void (*ctor)(void *)) { size_t left_over, slab_size, ralign; struct kmem_cache *cachep = NULL, *pc; @@ -2653,7 +2653,7 @@ static void cache_init_objs(struct kmem_cache *cachep, * They must also be threaded. */ if (cachep->ctor && !(cachep->flags & SLAB_POISON)) - cachep->ctor(cachep, objp + obj_offset(cachep)); + cachep->ctor(objp + obj_offset(cachep)); if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) @@ -2669,7 +2669,7 @@ static void cache_init_objs(struct kmem_cache *cachep, cachep->buffer_size / PAGE_SIZE, 0); #else if (cachep->ctor) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #endif slab_bufctl(slabp)[i] = i + 1; } @@ -3093,7 +3093,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, #endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) - cachep->ctor(cachep, objp); + cachep->ctor(objp); #if ARCH_SLAB_MINALIGN if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", @@ -4259,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p) * + further values on SMP and with statistics enabled */ -const struct seq_operations slabinfo_op = { +static const struct seq_operations slabinfo_op = { .start = s_start, .next = s_next, .stop = s_stop, @@ -4316,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, return res; } +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .write = slabinfo_write, + .llseek = seq_lseek, + .release = seq_release, +}; + #ifdef CONFIG_DEBUG_SLAB_LEAK static void *leaks_start(struct seq_file *m, loff_t *pos) @@ -4444,13 +4457,47 @@ static int leaks_show(struct seq_file *m, void *p) return 0; } -const struct seq_operations slabstats_op = { +static const struct seq_operations slabstats_op = { .start = leaks_start, .next = s_next, .stop = s_stop, .show = leaks_show, }; + +static int slabstats_open(struct inode *inode, struct file *file) +{ + unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); + int ret = -ENOMEM; + if (n) { + ret = seq_open(file, &slabstats_op); + if (!ret) { + struct seq_file *m = file->private_data; + *n = PAGE_SIZE / (2 * sizeof(unsigned long)); + m->private = n; + n = NULL; + } + kfree(n); + } + return ret; +} + +static const struct file_operations proc_slabstats_operations = { + .open = slabstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); +#ifdef CONFIG_DEBUG_SLAB_LEAK + proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif + return 0; +} +module_init(slab_proc_init); #endif /** @@ -4473,4 +4520,3 @@ size_t ksize(const void *objp) return obj_size(virt_to_cache(objp)); } -EXPORT_SYMBOL(ksize); diff --git a/mm/slob.c b/mm/slob.c index a3ad6671adf1..cb675d126791 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large); */ static inline int slob_page(struct slob_page *sp) { - return test_bit(PG_active, &sp->flags); + return PageSlobPage((struct page *)sp); } static inline void set_slob_page(struct slob_page *sp) { - __set_bit(PG_active, &sp->flags); + __SetPageSlobPage((struct page *)sp); } static inline void clear_slob_page(struct slob_page *sp) { - __clear_bit(PG_active, &sp->flags); + __ClearPageSlobPage((struct page *)sp); } /* @@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp) */ static inline int slob_page_free(struct slob_page *sp) { - return test_bit(PG_private, &sp->flags); + return PageSlobFree((struct page *)sp); } static void set_slob_page_free(struct slob_page *sp, struct list_head *list) { list_add(&sp->list, list); - __set_bit(PG_private, &sp->flags); + __SetPageSlobFree((struct page *)sp); } static inline void clear_slob_page_free(struct slob_page *sp) { list_del(&sp->list); - __clear_bit(PG_private, &sp->flags); + __ClearPageSlobFree((struct page *)sp); } #define SLOB_UNIT sizeof(slob_t) @@ -514,23 +514,23 @@ size_t ksize(const void *block) return 0; sp = (struct slob_page *)virt_to_page(block); - if (slob_page(sp)) - return ((slob_t *)block - 1)->units + SLOB_UNIT; - else + if (slob_page(sp)) { + int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + unsigned int *m = (unsigned int *)(block - align); + return SLOB_UNITS(*m) * SLOB_UNIT; + } else return sp->page.private; } -EXPORT_SYMBOL(ksize); struct kmem_cache { unsigned int size, align; unsigned long flags; const char *name; - void (*ctor)(struct kmem_cache *, void *); + void (*ctor)(void *); }; struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *c; @@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) b = slob_new_page(flags, get_order(c->size), node); if (c->ctor) - c->ctor(c, b); + c->ctor(b); return b; } diff --git a/mm/slub.c b/mm/slub.c index 35ab38a94b46..7ad489af9561 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -14,6 +14,7 @@ #include <linux/interrupt.h> #include <linux/bitops.h> #include <linux/slab.h> +#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -102,44 +103,12 @@ * the fast path and disables lockless freelists. */ -#define FROZEN (1 << PG_active) - #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG (1 << PG_error) +#define SLABDEBUG 1 #else #define SLABDEBUG 0 #endif -static inline int SlabFrozen(struct page *page) -{ - return page->flags & FROZEN; -} - -static inline void SetSlabFrozen(struct page *page) -{ - page->flags |= FROZEN; -} - -static inline void ClearSlabFrozen(struct page *page) -{ - page->flags &= ~FROZEN; -} - -static inline int SlabDebug(struct page *page) -{ - return page->flags & SLABDEBUG; -} - -static inline void SetSlabDebug(struct page *page) -{ - page->flags |= SLABDEBUG; -} - -static inline void ClearSlabDebug(struct page *page) -{ - page->flags &= ~SLABDEBUG; -} - /* * Issues still to be resolved: * @@ -492,7 +461,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (p > addr + 16) print_section("Bytes b4", p - 16, 16); - print_section("Object", p, min(s->objsize, 128)); + print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) print_section("Redzone", p + s->objsize, @@ -971,7 +940,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, } /* Special debug activities for freeing objects */ - if (!SlabFrozen(page) && !page->freelist) + if (!PageSlubFrozen(page) && !page->freelist) remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); @@ -1044,7 +1013,7 @@ __setup("slub_debug", setup_slub_debug); static unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { /* * Enable debugging if selected on the kernel commandline. @@ -1072,7 +1041,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { return flags; } @@ -1135,7 +1104,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, { setup_object_debug(s, page, object); if (unlikely(s->ctor)) - s->ctor(s, object); + s->ctor(object); } static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1157,7 +1126,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | SLAB_TRACE)) - SetSlabDebug(page); + __SetPageSlubDebug(page); start = page_address(page); @@ -1184,14 +1153,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (unlikely(SlabDebug(page))) { + if (unlikely(SLABDEBUG && PageSlubDebug(page))) { void *p; slab_pad_check(s, page); for_each_object(p, s, page_address(page), page->objects) check_object(s, page, p, 0); - ClearSlabDebug(page); + __ClearPageSlubDebug(page); } mod_zone_page_state(page_zone(page), @@ -1288,7 +1257,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, if (slab_trylock(page)) { list_del(&page->lru); n->nr_partial--; - SetSlabFrozen(page); + __SetPageSlubFrozen(page); return 1; } return 0; @@ -1361,7 +1330,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) n = get_node(s, zone_to_nid(zone)); if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > MIN_PARTIAL) { + n->nr_partial > n->min_partial) { page = get_partial_node(n); if (page) return page; @@ -1398,7 +1367,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) struct kmem_cache_node *n = get_node(s, page_to_nid(page)); struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); - ClearSlabFrozen(page); + __ClearPageSlubFrozen(page); if (page->inuse) { if (page->freelist) { @@ -1406,13 +1375,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { stat(c, DEACTIVATE_FULL); - if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) + if (SLABDEBUG && PageSlubDebug(page) && + (s->flags & SLAB_STORE_USER)) add_full(n, page); } slab_unlock(page); } else { stat(c, DEACTIVATE_EMPTY); - if (n->nr_partial < MIN_PARTIAL) { + if (n->nr_partial < n->min_partial) { /* * Adding an empty slab to the partial slabs in order * to avoid page allocator overhead. This slab needs @@ -1495,15 +1465,7 @@ static void flush_cpu_slab(void *d) static void flush_all(struct kmem_cache *s) { -#ifdef CONFIG_SMP on_each_cpu(flush_cpu_slab, s, 1); -#else - unsigned long flags; - - local_irq_save(flags); - flush_cpu_slab(s); - local_irq_restore(flags); -#endif } /* @@ -1559,7 +1521,7 @@ load_freelist: object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SlabDebug(c->page))) + if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) goto debug; c->freelist = object[c->offset]; @@ -1596,7 +1558,7 @@ new_slab: if (c->page) flush_slab(s, c); slab_lock(new); - SetSlabFrozen(new); + __SetPageSlubFrozen(new); c->page = new; goto load_freelist; } @@ -1682,7 +1644,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, stat(c, FREE_SLOWPATH); slab_lock(page); - if (unlikely(SlabDebug(page))) + if (unlikely(SLABDEBUG && PageSlubDebug(page))) goto debug; checks_ok: @@ -1690,7 +1652,7 @@ checks_ok: page->freelist = object; page->inuse--; - if (unlikely(SlabFrozen(page))) { + if (unlikely(PageSlubFrozen(page))) { stat(c, FREE_FROZEN); goto out_unlock; } @@ -1952,13 +1914,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, #endif } -static void init_kmem_cache_node(struct kmem_cache_node *n) +static void +init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { n->nr_partial = 0; + + /* + * The larger the object size is, the more pages we want on the partial + * list to avoid pounding the page allocator excessively. + */ + n->min_partial = ilog2(s->size); + if (n->min_partial < MIN_PARTIAL) + n->min_partial = MIN_PARTIAL; + else if (n->min_partial > MAX_PARTIAL) + n->min_partial = MAX_PARTIAL; + spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); + atomic_long_set(&n->total_objects, 0); INIT_LIST_HEAD(&n->full); #endif } @@ -2126,7 +2101,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, init_object(kmalloc_caches, n, 1); init_tracking(kmalloc_caches, n); #endif - init_kmem_cache_node(n); + init_kmem_cache_node(n, kmalloc_caches); inc_slabs_node(kmalloc_caches, node, page->objects); /* @@ -2183,7 +2158,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) } s->node[node] = n; - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); } return 1; } @@ -2194,7 +2169,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) { - init_kmem_cache_node(&s->local_node); + init_kmem_cache_node(&s->local_node, s); return 1; } #endif @@ -2325,7 +2300,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, const char *name, size_t size, size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { memset(s, 0, kmem_size); s->name = name; @@ -2339,7 +2314,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, s->refcount = 1; #ifdef CONFIG_NUMA - s->remote_node_defrag_ratio = 100; + s->remote_node_defrag_ratio = 1000; #endif if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) goto error; @@ -2754,7 +2729,6 @@ size_t ksize(const void *object) */ return s->size; } -EXPORT_SYMBOL(ksize); void kfree(const void *x) { @@ -2929,7 +2903,7 @@ static int slab_mem_going_online_callback(void *arg) ret = -ENOMEM; goto out; } - init_kmem_cache_node(n); + init_kmem_cache_node(n, s); s->node[nid] = n; } out: @@ -3081,7 +3055,7 @@ static int slab_unmergeable(struct kmem_cache *s) static struct kmem_cache *find_mergeable(size_t size, size_t align, unsigned long flags, const char *name, - void (*ctor)(struct kmem_cache *, void *)) + void (*ctor)(void *)) { struct kmem_cache *s; @@ -3121,8 +3095,7 @@ static struct kmem_cache *find_mergeable(size_t size, } struct kmem_cache *kmem_cache_create(const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(struct kmem_cache *, void *)) + size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; @@ -3325,12 +3298,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, s->name, page); if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug not set " + if (!PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug not set " "on slab 0x%p\n", s->name, page); } else { - if (SlabDebug(page)) - printk(KERN_ERR "SLUB %s: SlabDebug set on " + if (PageSlubDebug(page)) + printk(KERN_ERR "SLUB %s: SlubDebug set on " "slab 0x%p\n", s->name, page); } } @@ -4087,7 +4060,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, if (err) return err; - if (ratio < 100) + if (ratio <= 100) s->remote_node_defrag_ratio = ratio * 10; return length; @@ -4445,14 +4418,6 @@ __initcall(slab_sysfs_init); * The /proc/slabinfo ABI */ #ifdef CONFIG_SLABINFO - -ssize_t slabinfo_write(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - return -EINVAL; -} - - static void print_slabinfo_header(struct seq_file *m) { seq_puts(m, "slabinfo - version: 2.1\n"); @@ -4520,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p) return 0; } -const struct seq_operations slabinfo_op = { +static const struct seq_operations slabinfo_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; +static int slabinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &slabinfo_op); +} + +static const struct file_operations proc_slabinfo_operations = { + .open = slabinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init slab_proc_init(void) +{ + proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + return 0; +} +module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ diff --git a/mm/sparse.c b/mm/sparse.c index 36511c7b5e2c..39db301b920d 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section) return (section->section_mem_map >> SECTION_NID_SHIFT); } -/* Record a memory area against a node. */ -void __init memory_present(int nid, unsigned long start, unsigned long end) +/* Validate the physical addressing limitations of the model */ +void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, + unsigned long *end_pfn) { - unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); - unsigned long pfn; + unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); /* * Sanity checks - do not allow an architecture to pass * in larger pfns than the maximum scope of sparsemem: */ - if (start >= max_arch_pfn) - return; - if (end >= max_arch_pfn) - end = max_arch_pfn; + if (*start_pfn > max_sparsemem_pfn) { + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *start_pfn = max_sparsemem_pfn; + *end_pfn = max_sparsemem_pfn; + } + + if (*end_pfn > max_sparsemem_pfn) { + mminit_dprintk(MMINIT_WARNING, "pfnvalidation", + "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", + *start_pfn, *end_pfn, max_sparsemem_pfn); + WARN_ON_ONCE(1); + *end_pfn = max_sparsemem_pfn; + } +} + +/* Record a memory area against a node. */ +void __init memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; start &= PAGE_SECTION_MASK; + mminit_validate_memmodel_limits(&start, &end); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { unsigned long section = pfn_to_section_nr(pfn); struct mem_section *ms; @@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, unsigned long pfn; unsigned long nr_pages = 0; + mminit_validate_memmodel_limits(&start_pfn, &end_pfn); for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { if (nid != early_pfn_to_nid(pfn)) continue; @@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void) } #endif /* CONFIG_MEMORY_HOTPLUG */ +#ifdef CONFIG_MEMORY_HOTREMOVE +static unsigned long * __init +sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +{ + unsigned long section_nr; + + /* + * A page may contain usemaps for other sections preventing the + * page being freed and making a section unremovable while + * other sections referencing the usemap retmain active. Similarly, + * a pgdat can prevent a section being removed. If section A + * contains a pgdat and section B contains the usemap, both + * sections become inter-dependent. This allocates usemaps + * from the same section as the pgdat where possible to avoid + * this problem. + */ + section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + return alloc_bootmem_section(usemap_size(), section_nr); +} + +static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +{ + unsigned long usemap_snr, pgdat_snr; + static unsigned long old_usemap_snr = NR_MEM_SECTIONS; + static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; + struct pglist_data *pgdat = NODE_DATA(nid); + int usemap_nid; + + usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + if (usemap_snr == pgdat_snr) + return; + + if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) + /* skip redundant message */ + return; + + old_usemap_snr = usemap_snr; + old_pgdat_snr = pgdat_snr; + + usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); + if (usemap_nid != nid) { + printk(KERN_INFO + "node %d must be removed before remove section %ld\n", + nid, usemap_snr); + return; + } + /* + * There is a circular dependency. + * Some platforms allow un-removable section because they will just + * gather other removable sections for dynamic partitioning. + * Just notify un-removable section's number here. + */ + printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, + pgdat_snr, nid); + printk(KERN_CONT + " have a circular dependency on usemap and pgdat allocations\n"); +} +#else +static unsigned long * __init +sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +{ + return NULL; +} + +static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +{ +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) { unsigned long *usemap; struct mem_section *ms = __nr_to_section(pnum); int nid = sparse_early_nid(ms); - usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); if (usemap) return usemap; + usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + if (usemap) { + check_usemap_section_nr(nid, usemap); + return usemap; + } + /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ nid = 0; @@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) +static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; struct mem_section *ms = __nr_to_section(pnum); diff --git a/mm/swap.c b/mm/swap.c index 45c9f25a8a3b..2152e48a7b8f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,12 +31,13 @@ #include <linux/backing-dev.h> #include <linux/memcontrol.h> +#include "internal.h" + /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; +static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); /* * This path almost never happens for VM activity - pages are normally @@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec) zone = pagezone; spin_lock(&zone->lru_lock); } - if (PageLRU(page) && !PageActive(page)) { - list_move_tail(&page->lru, &zone->inactive_list); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_is_file_cache(page); + list_move_tail(&page->lru, &zone->lru[lru].list); pgmoved++; } } @@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec) void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && - PageLRU(page)) { + !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; @@ -157,12 +159,19 @@ void activate_page(struct page *page) struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(zone, page); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = LRU_BASE + file; + del_page_from_lru_list(zone, page, lru); + SetPageActive(page); - add_page_to_active_list(zone, page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - mem_cgroup_move_lists(page, true); + mem_cgroup_move_lists(page, lru); + + zone->recent_rotated[!!file]++; + zone->recent_scanned[!!file]++; } spin_unlock_irq(&zone->lru_lock); } @@ -176,7 +185,8 @@ void activate_page(struct page *page) */ void mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { + if (!PageActive(page) && !PageUnevictable(page) && + PageReferenced(page) && PageLRU(page)) { activate_page(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { @@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page) EXPORT_SYMBOL(mark_page_accessed); -/** - * lru_cache_add: add a page to the page lists - * @page: the page to add - */ -void lru_cache_add(struct page *page) +void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); if (!pagevec_add(pvec, page)) - __pagevec_lru_add(pvec); + ____pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } -void lru_cache_add_active(struct page *page) +/** + * lru_cache_add_lru - add a page to a page list + * @page: the page to be added to the LRU. + * @lru: the LRU list to which the page is added. + */ +void lru_cache_add_lru(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); + if (PageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); + ClearPageActive(page); + } else if (PageUnevictable(page)) { + VM_BUG_ON(PageActive(page)); + ClearPageUnevictable(page); + } - page_cache_get(page); - if (!pagevec_add(pvec, page)) - __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); + VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); + __lru_cache_add(page, lru); +} + +/** + * add_page_to_unevictable_list - add a page to the unevictable list + * @page: the page to be added to the unevictable list + * + * Add page directly to its zone's unevictable list. To avoid races with + * tasks that might be making the page evictable, through eg. munlock, + * munmap or exit, while it's not on the lru, we want to add the page + * while it's locked or otherwise "invisible" to other tasks. This is + * difficult to do when using the pagevec cache, so bypass that. + */ +void add_page_to_unevictable_list(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + SetPageUnevictable(page); + SetPageLRU(page); + add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); + spin_unlock_irq(&zone->lru_lock); +} + +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * place @page on active or unevictable LRU list, depending on + * page_evictable(). Note that if the page is not evictable, + * it goes directly back onto it's zone's unevictable list. It does + * NOT use a per cpu pagevec. + */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + if (page_evictable(page, vma)) + lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); + else + add_page_to_unevictable_list(page); } /* @@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page) */ static void drain_cpu_pagevecs(int cpu) { + struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); struct pagevec *pvec; + int lru; - pvec = &per_cpu(lru_add_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); - - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + if (pagevec_count(pvec)) + ____pagevec_lru_add(pvec, lru); + } pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -244,7 +299,7 @@ void lru_add_drain(void) put_cpu(); } -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU) static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); @@ -278,9 +333,10 @@ int lru_add_drain_all(void) * Avoid taking zone->lru_lock if possible, but if it is taken, retain it * for the remainder of the operation. * - * The locking in this function is against shrink_cache(): we recheck the - * page count inside the lock to see whether shrink_cache grabbed the page - * via the LRU. If it did, give up: shrink_cache will free it. + * The locking in this function is against shrink_inactive_list(): we recheck + * the page count inside the lock to see whether shrink_inactive_list() + * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() + * will free it. */ void release_pages(struct page **pages, int nr, int cold) { @@ -307,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold) if (PageLRU(page)) { struct zone *pagezone = page_zone(page); + if (pagezone != zone) { if (zone) spin_unlock_irqrestore(&zone->lru_lock, @@ -379,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec) * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void __pagevec_lru_add(struct pagevec *pvec) +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { int i; struct zone *zone = NULL; + VM_BUG_ON(is_unevictable_lru(lru)); for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -394,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - add_page_to_inactive_list(zone, page); + if (is_active_lru(lru)) + SetPageActive(page); + add_page_to_lru_list(zone, page, lru); } if (zone) spin_unlock_irq(&zone->lru_lock); @@ -404,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec) pagevec_reinit(pvec); } -EXPORT_SYMBOL(__pagevec_lru_add); +EXPORT_SYMBOL(____pagevec_lru_add); -void __pagevec_lru_add_active(struct pagevec *pvec) +/* + * Try to drop buffers from the pages in a pagevec + */ +void pagevec_strip(struct pagevec *pvec) { int i; - struct zone *zone = NULL; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); + if (PagePrivate(page) && trylock_page(page)) { + if (PagePrivate(page)) + try_to_release_page(page, 0); + unlock_page(page); } - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(PageActive(page)); - SetPageActive(page); - add_page_to_active_list(zone, page); } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); } -/* - * Try to drop buffers from the pages in a pagevec +/** + * pagevec_swap_free - try to free swap space from the pages in a pagevec + * @pvec: pagevec with swapcache pages to free the swap space of + * + * The caller needs to hold an extra reference to each page and + * not hold the page lock on the pages. This function uses a + * trylock on the page lock so it may not always free the swap + * space associated with a page. */ -void pagevec_strip(struct pagevec *pvec) +void pagevec_swap_free(struct pagevec *pvec) { int i; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && !TestSetPageLocked(page)) { - if (PagePrivate(page)) - try_to_release_page(page, 0); + if (PageSwapCache(page) && trylock_page(page)) { + if (PageSwapCache(page)) + remove_exclusive_swap_page_ref(page); unlock_page(page); } } @@ -493,7 +552,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); */ #define ACCT_THRESHOLD max(16, NR_CPUS * 2) -static DEFINE_PER_CPU(long, committed_space) = 0; +static DEFINE_PER_CPU(long, committed_space); void vm_acct_memory(long pages) { diff --git a/mm/swap_state.c b/mm/swap_state.c index d8aadaf2a0ba..3353c9029cef 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -33,13 +33,13 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), + .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, @@ -56,15 +56,16 @@ static struct { void show_swap_cache_info(void) { - printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", + printk("%lu pages in swap cache\n", total_swapcache_pages); + printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); - printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } /* - * add_to_swap_cache resembles add_to_page_cache on swapper_space, + * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) @@ -74,21 +75,29 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) BUG_ON(!PageLocked(page)); BUG_ON(PageSwapCache(page)); BUG_ON(PagePrivate(page)); + BUG_ON(!PageSwapBacked(page)); error = radix_tree_preload(gfp_mask); if (!error) { - write_lock_irq(&swapper_space.tree_lock); + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + + spin_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); - if (!error) { - page_cache_get(page); - SetPageSwapCache(page); - set_page_private(page, entry.val); + if (likely(!error)) { total_swapcache_pages++; __inc_zone_page_state(page, NR_FILE_PAGES); INC_CACHE_INFO(add_total); } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); radix_tree_preload_end(); + + if (unlikely(error)) { + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } } return error; } @@ -175,9 +184,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); swap_free(entry); page_cache_release(page); @@ -193,7 +202,7 @@ void delete_from_swap_cache(struct page *page) */ static inline void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !TestSetPageLocked(page)) { + if (PageSwapCache(page) && trylock_page(page)) { remove_exclusive_swap_page(page); unlock_page(page); } @@ -294,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * re-using the just freed swap entry for an existing page. * May fail (-ENOMEM) if radix-tree node allocation failed. */ - SetPageLocked(new_page); + __set_page_locked(new_page); + SetPageSwapBacked(new_page); err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); - if (!err) { + if (likely(!err)) { /* * Initiate read into locked page and return. */ - lru_cache_add_active(new_page); + lru_cache_add_anon(new_page); swap_readpage(NULL, new_page); return new_page; } - ClearPageLocked(new_page); + ClearPageSwapBacked(new_page); + __clear_page_locked(new_page); swap_free(entry); } while (err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index bd1bb5920306..90cb67a5417c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -33,17 +33,18 @@ #include <asm/tlbflush.h> #include <linux/swapops.h> -DEFINE_SPINLOCK(swap_lock); -unsigned int nr_swapfiles; +static DEFINE_SPINLOCK(swap_lock); +static unsigned int nr_swapfiles; long total_swap_pages; static int swap_overflow; +static int least_priority; static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -struct swap_list_t swap_list = {-1, -1}; +static struct swap_list_t swap_list = {-1, -1}; static struct swap_info_struct swap_info[MAX_SWAPFILES]; @@ -343,7 +344,7 @@ int can_share_swap_page(struct page *page) * Work out if there are any other processes sharing this * swap cache page. Free it if you can. Return success. */ -int remove_exclusive_swap_page(struct page *page) +static int remove_exclusive_swap_page_count(struct page *page, int count) { int retval; struct swap_info_struct * p; @@ -356,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page) return 0; if (PageWriteback(page)) return 0; - if (page_count(page) != 2) /* 2: us + cache */ + if (page_count(page) != count) /* us + cache + ptes */ return 0; entry.val = page_private(page); @@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page) retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the swapcache lock held.. */ - write_lock_irq(&swapper_space.tree_lock); - if ((page_count(page) == 2) && !PageWriteback(page)) { + spin_lock_irq(&swapper_space.tree_lock); + if ((page_count(page) == count) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); } spin_unlock(&swap_lock); @@ -387,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page) } /* + * Most of the time the page should have two references: one for the + * process and one for the swap cache. + */ +int remove_exclusive_swap_page(struct page *page) +{ + return remove_exclusive_swap_page_count(page, 2); +} + +/* + * The pageout code holds an extra reference to the page. That raises + * the reference count to test for to 2 for a page that is only in the + * swap cache plus 1 for each process that maps the page. + */ +int remove_exclusive_swap_page_ref(struct page *page) +{ + return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); +} + +/* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ @@ -402,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { page = find_get_page(&swapper_space, entry.val); - if (page && unlikely(TestSetPageLocked(page))) { + if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; } @@ -655,8 +675,8 @@ static int unuse_mm(struct mm_struct *mm, if (!down_read_trylock(&mm->mmap_sem)) { /* - * Activate page so shrink_cache is unlikely to unmap its - * ptes while lock is dropped, so swapoff can make progress. + * Activate page so shrink_inactive_list is unlikely to unmap + * its ptes while lock is dropped, so swapoff can make progress. */ activate_page(page); unlock_page(page); @@ -1260,6 +1280,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile) /* just pick something that's safe... */ swap_list.next = swap_list.head; } + if (p->prio < 0) { + for (i = p->next; i >= 0; i = swap_info[i].next) + swap_info[i].prio = p->prio--; + least_priority++; + } nr_swap_pages -= p->pages; total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -1272,9 +1297,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile) if (err) { /* re-insert swap space back into swap_list */ spin_lock(&swap_lock); - for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) + if (p->prio < 0) + p->prio = --least_priority; + prev = -1; + for (i = swap_list.head; i >= 0; i = swap_info[i].next) { if (p->prio >= swap_info[i].prio) break; + prev = i; + } p->next = i; if (prev < 0) swap_list.head = swap_list.next = p - swap_info; @@ -1447,7 +1477,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) unsigned int type; int i, prev; int error; - static int least_priority; union swap_header *swap_header = NULL; int swap_header_version; unsigned int nr_good_pages = 0; @@ -1455,7 +1484,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) sector_t span; unsigned long maxpages = 1; int swapfilesize; - unsigned short *swap_map; + unsigned short *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; int did_down = 0; @@ -1474,22 +1503,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } if (type >= nr_swapfiles) nr_swapfiles = type+1; + memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(&p->extent_list); p->flags = SWP_USED; - p->swap_file = NULL; - p->old_block_size = 0; - p->swap_map = NULL; - p->lowest_bit = 0; - p->highest_bit = 0; - p->cluster_nr = 0; - p->inuse_pages = 0; p->next = -1; - if (swap_flags & SWAP_FLAG_PREFER) { - p->prio = - (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; - } else { - p->prio = --least_priority; - } spin_unlock(&swap_lock); name = getname(specialfile); error = PTR_ERR(name); @@ -1632,19 +1649,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { + swap_map = vmalloc(maxpages * sizeof(short)); + if (!swap_map) { error = -ENOMEM; goto bad_swap; } error = 0; - memset(p->swap_map, 0, maxpages * sizeof(short)); + memset(swap_map, 0, maxpages * sizeof(short)); for (i = 0; i < swap_header->info.nr_badpages; i++) { int page_nr = swap_header->info.badpages[i]; if (page_nr <= 0 || page_nr >= swap_header->info.last_page) error = -EINVAL; else - p->swap_map[page_nr] = SWAP_MAP_BAD; + swap_map[page_nr] = SWAP_MAP_BAD; } nr_good_pages = swap_header->info.last_page - swap_header->info.nr_badpages - @@ -1654,7 +1672,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) } if (nr_good_pages) { - p->swap_map[0] = SWAP_MAP_BAD; + swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; p->pages = nr_good_pages; nr_extents = setup_swap_extents(p, &span); @@ -1672,6 +1690,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) mutex_lock(&swapon_mutex); spin_lock(&swap_lock); + if (swap_flags & SWAP_FLAG_PREFER) + p->prio = + (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + else + p->prio = --least_priority; + p->swap_map = swap_map; p->flags = SWP_ACTIVE; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; @@ -1707,12 +1731,8 @@ bad_swap: destroy_swap_extents(p); bad_swap_2: spin_lock(&swap_lock); - swap_map = p->swap_map; p->swap_file = NULL; - p->swap_map = NULL; p->flags = 0; - if (!(swap_flags & SWAP_FLAG_PREFER)) - ++least_priority; spin_unlock(&swap_lock); vfree(swap_map); if (swap_file) diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index ae532f501943..3e67d575ee6e 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c @@ -65,36 +65,37 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (!dentry) goto put_memory; + error = -ENFILE; + file = get_empty_filp(); + if (!file) + goto put_dentry; + error = -ENOSPC; inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); if (!inode) - goto put_dentry; + goto close_file; d_instantiate(dentry, inode); - error = -ENFILE; - file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &ramfs_file_operations); - if (!file) - goto put_dentry; - + inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ + init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, + &ramfs_file_operations); - /* notify everyone as to the change of file size */ - error = do_truncate(dentry, size, 0, file); - if (error < 0) +#ifndef CONFIG_MMU + error = ramfs_nommu_expand_for_mapping(inode, size); + if (error) goto close_file; - +#endif return file; close_file: put_filp(file); - return ERR_PTR(error); - put_dentry: dput(dentry); put_memory: return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_zero_setup - setup a shared anonymous mapping diff --git a/mm/truncate.c b/mm/truncate.c index b8961cb63414..1229211104f8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -3,7 +3,7 @@ * * Copyright (C) 2002, Linus Torvalds * - * 10Sep2002 akpm@zip.com.au + * 10Sep2002 Andrew Morton * Initial version. */ @@ -18,6 +18,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ +#include "internal.h" /** @@ -103,8 +104,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); + clear_page_mlock(page); remove_from_page_cache(page); - ClearPageUptodate(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ } @@ -128,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; + clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -188,7 +190,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (page_index > next) next = page_index; next++; - if (TestSetPageLocked(page)) + if (!trylock_page(page)) continue; if (PageWriteback(page)) { unlock_page(page); @@ -281,7 +283,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping, pgoff_t index; int lock_failed; - lock_failed = TestSetPageLocked(page); + lock_failed = !trylock_page(page); /* * We really shouldn't be looking at the ->index of an @@ -349,18 +351,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) goto failed; + clear_page_mlock(page); BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - ClearPageUptodate(page); + spin_unlock_irq(&mapping->tree_lock); page_cache_release(page); /* pagecache ref */ return 1; failed: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } @@ -382,7 +384,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EIO if any pages could not be invalidated. + * Returns -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -442,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret2 = do_launder_page(mapping, page); if (ret2 == 0) { if (!invalidate_complete_page2(mapping, page)) - ret2 = -EIO; + ret2 = -EBUSY; } if (ret2 < 0) ret = ret2; diff --git a/mm/util.c b/mm/util.c index 8f18683825bc..cb00b748ce47 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1,7 +1,9 @@ +#include <linux/mm.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> #include <linux/err.h> +#include <linux/sched.h> #include <asm/uaccess.h> /** @@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) EXPORT_SYMBOL(kmemdup); /** - * krealloc - reallocate memory. The contents will remain unchanged. + * __krealloc - like krealloc() but don't free @p. * @p: object to reallocate memory for. * @new_size: how many bytes of memory are required. * @flags: the type of memory to allocate. * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +void *__krealloc(const void *p, size_t new_size, gfp_t flags) { void *ret; size_t ks = 0; - if (unlikely(!new_size)) { - kfree(p); + if (unlikely(!new_size)) return ZERO_SIZE_PTR; - } if (p) ks = ksize(p); @@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; ret = kmalloc_track_caller(new_size, flags); - if (ret && p) { + if (ret && p) memcpy(ret, p, ks); + + return ret; +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { kfree(p); + return ZERO_SIZE_PTR; } + + ret = __krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + return ret; } EXPORT_SYMBOL(krealloc); @@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n) return p; } EXPORT_SYMBOL(strndup_user); + +#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + mm->mmap_base = TASK_UNMAPPED_BASE; + mm->get_unmapped_area = arch_get_unmapped_area; + mm->unmap_area = arch_unmap_area; +} +#endif + +int __attribute__((weak)) get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + struct mm_struct *mm = current->mm; + int ret; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, nr_pages, + write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e45b0f3d125..036536945dd9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -8,26 +8,28 @@ * Numa awareness, Christoph Lameter, SGI, June 2005 */ +#include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> +#include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/debugobjects.h> -#include <linux/vmalloc.h> #include <linux/kallsyms.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/radix-tree.h> +#include <linux/rcupdate.h> +#include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller); +/*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) } while (pte++, addr += PAGE_SIZE, addr != end); } -static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; @@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, } while (pmd++, addr = next, addr != end); } -static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end) +static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; @@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } -void unmap_kernel_range(unsigned long addr, unsigned long size) +static void vunmap_page_range(unsigned long addr, unsigned long end) { pgd_t *pgd; unsigned long next; - unsigned long start = addr; - unsigned long end = addr + size; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); @@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) continue; vunmap_pud_range(pgd, addr, next); } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); -} - -static void unmap_vm_area(struct vm_struct *area) -{ - unmap_kernel_range((unsigned long)area->addr, area->size); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pte_t *pte; + /* + * nr is a running index into the array which helps higher level + * callers keep track of where we're up to. + */ + pte = pte_alloc_kernel(pmd, addr); if (!pte) return -ENOMEM; do { - struct page *page = **pages; - WARN_ON(!pte_none(*pte)); - if (!page) + struct page *page = pages[*nr]; + + if (WARN_ON(!pte_none(*pte))) + return -EBUSY; + if (WARN_ON(!page)) return -ENOMEM; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); - (*pages)++; + (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); return 0; } -static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) +static int vmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pmd_t *pmd; unsigned long next; @@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } -static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) +static int vmap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pud_t *pud; unsigned long next; @@ -141,50 +140,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +/* + * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and + * will have pfns corresponding to the "pages" array. + * + * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + */ +static int vmap_page_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; - unsigned long addr = (unsigned long) area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; - int err; + int err = 0; + int nr = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_pud_range(pgd, addr, next, prot, pages); + err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap((unsigned long) area->addr, end); - return err; + flush_cache_vmap(addr, end); + + if (unlikely(err)) + return err; + return nr; +} + +static inline int is_vmalloc_or_module_addr(const void *x) +{ + /* + * x86-64 and sparc64 put modules in a special place, + * and fall back on vmalloc() if that fails. Others + * just put it in the vmalloc space. + */ +#if defined(CONFIG_MODULES) && defined(MODULES_VADDR) + unsigned long addr = (unsigned long)x; + if (addr >= MODULES_VADDR && addr < MODULES_END) + return 1; +#endif + return is_vmalloc_addr(x); } -EXPORT_SYMBOL_GPL(map_vm_area); /* - * Map a vmalloc()-space virtual address to the physical page. + * Walk a vmap address to the struct page it maps. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; + + /* + * XXX we might need to change this if we add VIRTUAL_BUG_ON for + * architectures that do not vmalloc module space + */ + VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); if (!pgd_none(*pgd)) { - pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { - pmd = pmd_offset(pud, addr); + pmd_t *pmd = pmd_offset(pud, addr); if (!pmd_none(*pmd)) { + pte_t *ptep, pte; + ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) @@ -206,13 +233,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_pfn); -static struct vm_struct * -__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, - unsigned long end, int node, gfp_t gfp_mask, void *caller) + +/*** Global kva allocator ***/ + +#define VM_LAZY_FREE 0x01 +#define VM_LAZY_FREEING 0x02 +#define VM_VM_AREA 0x04 + +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + void *private; + struct rcu_head rcu_head; +}; + +static DEFINE_SPINLOCK(vmap_area_lock); +static struct rb_root vmap_area_root = RB_ROOT; +static LIST_HEAD(vmap_area_list); + +static struct vmap_area *__find_vmap_area(unsigned long addr) { - struct vm_struct **p, *tmp, *area; - unsigned long align = 1; + struct rb_node *n = vmap_area_root.rb_node; + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr > va->va_start) + n = n->rb_right; + else + return va; + } + + return NULL; +} + +static void __insert_vmap_area(struct vmap_area *va) +{ + struct rb_node **p = &vmap_area_root.rb_node; + struct rb_node *parent = NULL; + struct rb_node *tmp; + + while (*p) { + struct vmap_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (va->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&va->rb_node, parent, p); + rb_insert_color(&va->rb_node, &vmap_area_root); + + /* address-sort this list so it is usable like the vmlist */ + tmp = rb_prev(&va->rb_node); + if (tmp) { + struct vmap_area *prev; + prev = rb_entry(tmp, struct vmap_area, rb_node); + list_add_rcu(&va->list, &prev->list); + } else + list_add_rcu(&va->list, &vmap_area_list); +} + +static void purge_vmap_area_lazy(void); + +/* + * Allocate a region of KVA of the specified size and alignment, within the + * vstart and vend. + */ +static struct vmap_area *alloc_vmap_area(unsigned long size, + unsigned long align, + unsigned long vstart, unsigned long vend, + int node, gfp_t gfp_mask) +{ + struct vmap_area *va; + struct rb_node *n; + unsigned long addr; + int purged = 0; + + BUG_ON(size & ~PAGE_MASK); + + addr = ALIGN(vstart, align); + + va = kmalloc_node(sizeof(struct vmap_area), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + +retry: + spin_lock(&vmap_area_lock); + /* XXX: could have a last_hole cache */ + n = vmap_area_root.rb_node; + if (n) { + struct vmap_area *first = NULL; + + do { + struct vmap_area *tmp; + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end >= addr) { + if (!first && tmp->va_start < addr + size) + first = tmp; + n = n->rb_left; + } else { + first = tmp; + n = n->rb_right; + } + } while (n); + + if (!first) + goto found; + + if (first->va_end < addr) { + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + + while (addr + size >= first->va_start && addr + size <= vend) { + addr = ALIGN(first->va_end + PAGE_SIZE, align); + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + } +found: + if (addr + size > vend) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + if (printk_ratelimit()) + printk(KERN_WARNING "vmap allocation failed: " + "use vmalloc=<size> to increase size.\n"); + return ERR_PTR(-EBUSY); + } + + BUG_ON(addr & (align-1)); + + va->va_start = addr; + va->va_end = addr + size; + va->flags = 0; + __insert_vmap_area(va); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void rcu_free_va(struct rcu_head *head) +{ + struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); + + kfree(va); +} + +static void __free_vmap_area(struct vmap_area *va) +{ + BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + rb_erase(&va->rb_node, &vmap_area_root); + RB_CLEAR_NODE(&va->rb_node); + list_del_rcu(&va->list); + + call_rcu(&va->rcu_head, rcu_free_va); +} + +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) +{ + spin_lock(&vmap_area_lock); + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); +} + +/* + * Clear the pagetable entries of a given vmap_area + */ +static void unmap_vmap_area(struct vmap_area *va) +{ + vunmap_page_range(va->va_start, va->va_end); +} + +/* + * lazy_max_pages is the maximum amount of virtual address space we gather up + * before attempting to purge with a TLB flush. + * + * There is a tradeoff here: a larger number will cover more kernel page tables + * and take slightly longer to purge, but it will linearly reduce the number of + * global TLB flushes that must be performed. It would seem natural to scale + * this number up linearly with the number of CPUs (because vmapping activity + * could also scale linearly with the number of CPUs), however it is likely + * that in practice, workloads might be constrained in other ways that mean + * vmap activity will not scale linearly with CPUs. Also, I want to be + * conservative and not introduce a big latency on huge systems, so go with + * a less aggressive log scale. It will still be an improvement over the old + * code, and it will be simple to change the scale factor if we find that it + * becomes a problem on bigger systems. + */ +static unsigned long lazy_max_pages(void) +{ + unsigned int log; + + log = fls(num_online_cpus()); + + return log * (32UL * 1024 * 1024 / PAGE_SIZE); +} + +static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); + +/* + * Purges all lazily-freed vmap areas. + * + * If sync is 0 then don't purge if there is already a purge in progress. + * If force_flush is 1, then flush kernel TLBs between *start and *end even + * if we found no lazy vmap areas to unmap (callers can use this to optimise + * their own TLB flushing). + * Returns with *start = min(*start, lowest purged address) + * *end = max(*end, highest purged address) + */ +static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, + int sync, int force_flush) +{ + static DEFINE_SPINLOCK(purge_lock); + LIST_HEAD(valist); + struct vmap_area *va; + int nr = 0; + + /* + * If sync is 0 but force_flush is 1, we'll go sync anyway but callers + * should not expect such behaviour. This just simplifies locking for + * the case that isn't actually used at the moment anyway. + */ + if (!sync && !force_flush) { + if (!spin_trylock(&purge_lock)) + return; + } else + spin_lock(&purge_lock); + + rcu_read_lock(); + list_for_each_entry_rcu(va, &vmap_area_list, list) { + if (va->flags & VM_LAZY_FREE) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + unmap_vmap_area(va); + list_add_tail(&va->purge_list, &valist); + va->flags |= VM_LAZY_FREEING; + va->flags &= ~VM_LAZY_FREE; + } + } + rcu_read_unlock(); + + if (nr) { + BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + atomic_sub(nr, &vmap_lazy_nr); + } + + if (nr || force_flush) + flush_tlb_kernel_range(*start, *end); + + if (nr) { + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &valist, purge_list) + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); + } + spin_unlock(&purge_lock); +} + +/* + * Kick off a purge of the outstanding lazy areas. + */ +static void purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 0, 0); +} + +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + va->flags |= VM_LAZY_FREE; + atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); + if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + purge_vmap_area_lazy(); +} + +static struct vmap_area *find_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void free_unmap_vmap_area_addr(unsigned long addr) +{ + struct vmap_area *va; + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); +} + + +/*** Per cpu kva allocator ***/ + +/* + * vmap space is limited especially on 32 bit architectures. Ensure there is + * room for at least 16 percpu vmap blocks per CPU. + */ +/* + * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able + * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess + * instead (we just need a rough idea) + */ +#if BITS_PER_LONG == 32 +#define VMALLOC_SPACE (128UL*1024*1024) +#else +#define VMALLOC_SPACE (128UL*1024*1024*1024) +#endif + +#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) +#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ +#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ +#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) +#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ +#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ +#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / NR_CPUS / 16)) + +#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) + +struct vmap_block_queue { + spinlock_t lock; + struct list_head free; + struct list_head dirty; + unsigned int nr_dirty; +}; + +struct vmap_block { + spinlock_t lock; + struct vmap_area *va; + struct vmap_block_queue *vbq; + unsigned long free, dirty; + DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); + DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); + union { + struct { + struct list_head free_list; + struct list_head dirty_list; + }; + struct rcu_head rcu_head; + }; +}; + +/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ +static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); + +/* + * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block + * in the free path. Could get rid of this if we change the API to return a + * "cookie" from alloc, to be passed to free. But no big deal yet. + */ +static DEFINE_SPINLOCK(vmap_block_tree_lock); +static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); + +/* + * We should probably have a fallback mechanism to allocate virtual memory + * out of partially filled vmap blocks. However vmap block sizing should be + * fairly reasonable according to the vmalloc size, so it shouldn't be a + * big problem. + */ + +static unsigned long addr_to_vb_idx(unsigned long addr) +{ + addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); + addr /= VMAP_BLOCK_SIZE; + return addr; +} + +static struct vmap_block *new_vmap_block(gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + struct vmap_area *va; + unsigned long vb_idx; + int node, err; + + node = numa_node_id(); + + vb = kmalloc_node(sizeof(struct vmap_block), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!vb)) + return ERR_PTR(-ENOMEM); + + va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, + VMALLOC_START, VMALLOC_END, + node, gfp_mask); + if (unlikely(IS_ERR(va))) { + kfree(vb); + return ERR_PTR(PTR_ERR(va)); + } + + err = radix_tree_preload(gfp_mask); + if (unlikely(err)) { + kfree(vb); + free_vmap_area(va); + return ERR_PTR(err); + } + + spin_lock_init(&vb->lock); + vb->va = va; + vb->free = VMAP_BBMAP_BITS; + vb->dirty = 0; + bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); + INIT_LIST_HEAD(&vb->free_list); + INIT_LIST_HEAD(&vb->dirty_list); + + vb_idx = addr_to_vb_idx(va->va_start); + spin_lock(&vmap_block_tree_lock); + err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(err); + radix_tree_preload_end(); + + vbq = &get_cpu_var(vmap_block_queue); + vb->vbq = vbq; + spin_lock(&vbq->lock); + list_add(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); + put_cpu_var(vmap_cpu_blocks); + + return vb; +} + +static void rcu_free_vb(struct rcu_head *head) +{ + struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); + + kfree(vb); +} + +static void free_vmap_block(struct vmap_block *vb) +{ + struct vmap_block *tmp; + unsigned long vb_idx; + + spin_lock(&vb->vbq->lock); + if (!list_empty(&vb->free_list)) + list_del(&vb->free_list); + if (!list_empty(&vb->dirty_list)) + list_del(&vb->dirty_list); + spin_unlock(&vb->vbq->lock); + + vb_idx = addr_to_vb_idx(vb->va->va_start); + spin_lock(&vmap_block_tree_lock); + tmp = radix_tree_delete(&vmap_block_tree, vb_idx); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(tmp != vb); + + free_unmap_vmap_area(vb->va); + call_rcu(&vb->rcu_head, rcu_free_vb); +} + +static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + unsigned long addr = 0; + unsigned int order; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + +again: + rcu_read_lock(); + vbq = &get_cpu_var(vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = bitmap_find_free_region(vb->alloc_map, + VMAP_BBMAP_BITS, order); + + if (i >= 0) { + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_init(&vb->free_list); + spin_unlock(&vbq->lock); + } + spin_unlock(&vb->lock); + break; + } + spin_unlock(&vb->lock); + } + put_cpu_var(vmap_cpu_blocks); + rcu_read_unlock(); + + if (!addr) { + vb = new_vmap_block(gfp_mask); + if (IS_ERR(vb)) + return vb; + goto again; + } + + return (void *)addr; +} + +static void vb_free(const void *addr, unsigned long size) +{ + unsigned long offset; + unsigned long vb_idx; + unsigned int order; + struct vmap_block *vb; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + + offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); + + vb_idx = addr_to_vb_idx((unsigned long)addr); + rcu_read_lock(); + vb = radix_tree_lookup(&vmap_block_tree, vb_idx); + rcu_read_unlock(); + BUG_ON(!vb); + + spin_lock(&vb->lock); + bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + if (!vb->dirty) { + spin_lock(&vb->vbq->lock); + list_add(&vb->dirty_list, &vb->vbq->dirty); + spin_unlock(&vb->vbq->lock); + } + vb->dirty += 1UL << order; + if (vb->dirty == VMAP_BBMAP_BITS) { + BUG_ON(vb->free || !list_empty(&vb->free_list)); + spin_unlock(&vb->lock); + free_vmap_block(vb); + } else + spin_unlock(&vb->lock); +} + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int cpu; + int flush = 0; + + for_each_possible_cpu(cpu) { + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + struct vmap_block *vb; + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); + while (i < VMAP_BBMAP_BITS) { + unsigned long s, e; + int j; + j = find_next_zero_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + + s = vb->va->va_start + (i << PAGE_SHIFT); + e = vb->va->va_start + (j << PAGE_SHIFT); + vunmap_page_range(s, e); + flush = 1; + + if (s < start) + start = s; + if (e > end) + end = e; + + i = j; + i = find_next_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + } + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + } + + __purge_vmap_area_lazy(&start, &end, 1, flush); +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/** + * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram + * @mem: the pointer returned by vm_map_ram + * @count: the count passed to that vm_map_ram call (cannot unmap partial) + */ +void vm_unmap_ram(const void *mem, unsigned int count) +{ + unsigned long size = count << PAGE_SHIFT; + unsigned long addr = (unsigned long)mem; + + BUG_ON(!addr); + BUG_ON(addr < VMALLOC_START); + BUG_ON(addr > VMALLOC_END); + BUG_ON(addr & (PAGE_SIZE-1)); + + debug_check_no_locks_freed(mem, size); + + if (likely(count <= VMAP_MAX_ALLOC)) + vb_free(mem, size); + else + free_unmap_vmap_area_addr(addr); +} +EXPORT_SYMBOL(vm_unmap_ram); + +/** + * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) + * @pages: an array of pointers to the pages to be mapped + * @count: number of pages + * @node: prefer to allocate data structures on this node + * @prot: memory protection to use. PAGE_KERNEL for regular RAM + * @returns: a pointer to the address that has been mapped, or NULL on failure + */ +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + unsigned long size = count << PAGE_SHIFT; unsigned long addr; + void *mem; + + if (likely(count <= VMAP_MAX_ALLOC)) { + mem = vb_alloc(size, GFP_KERNEL); + if (IS_ERR(mem)) + return NULL; + addr = (unsigned long)mem; + } else { + struct vmap_area *va; + va = alloc_vmap_area(size, PAGE_SIZE, + VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + if (IS_ERR(va)) + return NULL; + + addr = va->va_start; + mem = (void *)addr; + } + if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + vm_unmap_ram(mem, count); + return NULL; + } + return mem; +} +EXPORT_SYMBOL(vm_map_ram); + +void __init vmalloc_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + INIT_LIST_HEAD(&vbq->dirty); + vbq->nr_dirty = 0; + } +} + +void unmap_kernel_range(unsigned long addr, unsigned long size) +{ + unsigned long end = addr + size; + vunmap_page_range(addr, end); + flush_tlb_kernel_range(addr, end); +} + +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long end = addr + area->size - PAGE_SIZE; + int err; + + err = vmap_page_range(addr, end, prot, *pages); + if (err > 0) { + *pages += err; + err = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(map_vm_area); + +/*** Old vmalloc interfaces ***/ +DEFINE_RWLOCK(vmlist_lock); +struct vm_struct *vmlist; + +static struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long flags, unsigned long start, unsigned long end, + int node, gfp_t gfp_mask, void *caller) +{ + static struct vmap_area *va; + struct vm_struct *area; + struct vm_struct *tmp, **p; + unsigned long align = 1; BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { @@ -225,13 +990,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, align = 1ul << bit; } - addr = ALIGN(start, align); + size = PAGE_ALIGN(size); if (unlikely(!size)) return NULL; area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); - if (unlikely(!area)) return NULL; @@ -240,48 +1004,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, */ size += PAGE_SIZE; - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { - if ((unsigned long)tmp->addr < addr) { - if((unsigned long)tmp->addr + tmp->size >= addr) - addr = ALIGN(tmp->size + - (unsigned long)tmp->addr, align); - continue; - } - if ((size + addr) < addr) - goto out; - if (size + addr <= (unsigned long)tmp->addr) - goto found; - addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); - if (addr > end - size) - goto out; + va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + if (IS_ERR(va)) { + kfree(area); + return NULL; } - if ((size + addr) < addr) - goto out; - if (addr > end - size) - goto out; - -found: - area->next = *p; - *p = area; area->flags = flags; - area->addr = (void *)addr; + area->addr = (void *)va->va_start; area->size = size; area->pages = NULL; area->nr_pages = 0; area->phys_addr = 0; area->caller = caller; + va->private = area; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= area->addr) + break; + } + area->next = *p; + *p = area; write_unlock(&vmlist_lock); return area; - -out: - write_unlock(&vmlist_lock); - kfree(area); - if (printk_ratelimit()) - printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); - return NULL; } struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, @@ -321,39 +1069,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, gfp_mask, __builtin_return_address(0)); } -/* Caller must hold vmlist_lock */ -static struct vm_struct *__find_vm_area(const void *addr) +static struct vm_struct *find_vm_area(const void *addr) { - struct vm_struct *tmp; + struct vmap_area *va; - for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { - if (tmp->addr == addr) - break; - } + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) + return va->private; - return tmp; -} - -/* Caller must hold vmlist_lock */ -static struct vm_struct *__remove_vm_area(const void *addr) -{ - struct vm_struct **p, *tmp; - - for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { - if (tmp->addr == addr) - goto found; - } return NULL; - -found: - unmap_vm_area(tmp); - *p = tmp->next; - - /* - * Remove the guard page. - */ - tmp->size -= PAGE_SIZE; - return tmp; } /** @@ -366,11 +1090,24 @@ found: */ struct vm_struct *remove_vm_area(const void *addr) { - struct vm_struct *v; - write_lock(&vmlist_lock); - v = __remove_vm_area(addr); - write_unlock(&vmlist_lock); - return v; + struct vmap_area *va; + + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) { + struct vm_struct *vm = va->private; + struct vm_struct *tmp, **p; + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) + ; + *p = tmp->next; + write_unlock(&vmlist_lock); + + return vm; + } + return NULL; } static void __vunmap(const void *addr, int deallocate_pages) @@ -381,16 +1118,14 @@ static void __vunmap(const void *addr, int deallocate_pages) return; if ((PAGE_SIZE-1) & (unsigned long)addr) { - printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); - WARN_ON(1); + WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); return; } area = remove_vm_area(addr); if (unlikely(!area)) { - printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); - WARN_ON(1); return; } @@ -482,6 +1217,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); +static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node, void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) { @@ -608,10 +1345,8 @@ void *vmalloc_user(unsigned long size) ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); if (ret) { - write_lock(&vmlist_lock); - area = __find_vm_area(ret); + area = find_vm_area(ret); area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); } return ret; } @@ -691,10 +1426,8 @@ void *vmalloc_32_user(unsigned long size) ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); if (ret) { - write_lock(&vmlist_lock); - area = __find_vm_area(ret); + area = find_vm_area(ret); area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); } return ret; } @@ -795,26 +1528,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, struct vm_struct *area; unsigned long uaddr = vma->vm_start; unsigned long usize = vma->vm_end - vma->vm_start; - int ret; if ((PAGE_SIZE-1) & (unsigned long)addr) return -EINVAL; - read_lock(&vmlist_lock); - area = __find_vm_area(addr); + area = find_vm_area(addr); if (!area) - goto out_einval_locked; + return -EINVAL; if (!(area->flags & VM_USERMAP)) - goto out_einval_locked; + return -EINVAL; if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) - goto out_einval_locked; - read_unlock(&vmlist_lock); + return -EINVAL; addr += pgoff << PAGE_SHIFT; do { struct page *page = vmalloc_to_page(addr); + int ret; + ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; @@ -827,11 +1559,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, /* Prevent "things" like memory migration? VM_flags need a cleanup... */ vma->vm_flags |= VM_RESERVED; - return ret; - -out_einval_locked: - read_unlock(&vmlist_lock); - return -EINVAL; + return 0; } EXPORT_SYMBOL(remap_vmalloc_range); @@ -931,6 +1659,25 @@ static void s_stop(struct seq_file *m, void *p) read_unlock(&vmlist_lock); } +static void show_numa_info(struct seq_file *m, struct vm_struct *v) +{ + if (NUMA_BUILD) { + unsigned int nr, *counters = m->private; + + if (!counters) + return; + + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); + + for (nr = 0; nr < v->nr_pages; nr++) + counters[page_to_nid(v->pages[nr])]++; + + for_each_node_state(nr, N_HIGH_MEMORY) + if (counters[nr]) + seq_printf(m, " N%u=%u", nr, counters[nr]); + } +} + static int s_show(struct seq_file *m, void *p) { struct vm_struct *v = p; @@ -967,15 +1714,46 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_VPAGES) seq_printf(m, " vpages"); + show_numa_info(m, v); seq_putc(m, '\n'); return 0; } -const struct seq_operations vmalloc_op = { +static const struct seq_operations vmalloc_op = { .start = s_start, .next = s_next, .stop = s_stop, .show = s_show, }; + +static int vmalloc_open(struct inode *inode, struct file *file) +{ + unsigned int *ptr = NULL; + int ret; + + if (NUMA_BUILD) + ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + ret = seq_open(file, &vmalloc_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = ptr; + } else + kfree(ptr); + return ret; +} + +static const struct file_operations proc_vmalloc_operations = { + .open = vmalloc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init proc_vmalloc_init(void) +{ + proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); + return 0; +} +module_init(proc_vmalloc_init); #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 967d30ccd92b..3b5860294bb6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -38,6 +38,8 @@ #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/memcontrol.h> +#include <linux/delayacct.h> +#include <linux/sysctl.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -77,7 +79,7 @@ struct scan_control { unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active); + int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -390,17 +392,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } /* - * Attempt to detach a locked page from its ->mapping. If it is dirty or if - * someone else has a ref on the page, abort and return 0. If it was - * successfully detached, return 1. Assumes the caller has a single ref on - * this page. + * Same as remove_mapping, but if the page is removed from the mapping, it + * gets returned with a refcount of 0. */ -int remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); /* * The non racy check for a busy page. * @@ -426,32 +426,131 @@ int remove_mapping(struct address_space *mapping, struct page *page) * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ - if (unlikely(page_count(page) != 2)) + if (!page_freeze_refs(page, 2)) goto cannot_free; - smp_rmb(); - if (unlikely(PageDirty(page))) + /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + if (unlikely(PageDirty(page))) { + page_unfreeze_refs(page, 2); goto cannot_free; + } if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + } else { + __remove_from_page_cache(page); + spin_unlock_irq(&mapping->tree_lock); } - __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); return 1; cannot_free: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } /* + * Attempt to detach a locked page from its ->mapping. If it is dirty or if + * someone else has a ref on the page, abort and return 0. If it was + * successfully detached, return 1. Assumes the caller has a single ref on + * this page. + */ +int remove_mapping(struct address_space *mapping, struct page *page) +{ + if (__remove_mapping(mapping, page)) { + /* + * Unfreezing the refcount with 1 rather than 2 effectively + * drops the pagecache ref for us without requiring another + * atomic operation. + */ + page_unfreeze_refs(page, 1); + return 1; + } + return 0; +} + +/** + * putback_lru_page - put previously isolated page onto appropriate LRU list + * @page: page to be put back to appropriate lru list + * + * Add previously isolated @page to appropriate LRU list. + * Page may still be unevictable for other reasons. + * + * lru_lock must not be held, interrupts must be enabled. + */ +#ifdef CONFIG_UNEVICTABLE_LRU +void putback_lru_page(struct page *page) +{ + int lru; + int active = !!TestClearPageActive(page); + int was_unevictable = PageUnevictable(page); + + VM_BUG_ON(PageLRU(page)); + +redo: + ClearPageUnevictable(page); + + if (page_evictable(page, NULL)) { + /* + * For evictable pages, we can use the cache. + * In event of a race, worst case is we end up with an + * unevictable page on [in]active list. + * We know how to handle that. + */ + lru = active + page_is_file_cache(page); + lru_cache_add_lru(page, lru); + } else { + /* + * Put unevictable pages directly on zone's unevictable + * list. + */ + lru = LRU_UNEVICTABLE; + add_page_to_unevictable_list(page); + } + mem_cgroup_move_lists(page, lru); + + /* + * page's status can change while we move it among lru. If an evictable + * page is on unevictable list, it never be freed. To avoid that, + * check after we added it to the list, again. + */ + if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (!isolate_lru_page(page)) { + put_page(page); + goto redo; + } + /* This means someone else dropped this page from LRU + * So, it will be freed or putback to LRU again. There is + * nothing to do here. + */ + } + + if (was_unevictable && lru != LRU_UNEVICTABLE) + count_vm_event(UNEVICTABLE_PGRESCUED); + else if (!was_unevictable && lru == LRU_UNEVICTABLE) + count_vm_event(UNEVICTABLE_PGCULLED); + + put_page(page); /* drop ref from isolate */ +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +void putback_lru_page(struct page *page) +{ + int lru; + VM_BUG_ON(PageLRU(page)); + + lru = !!TestClearPageActive(page) + page_is_file_cache(page); + lru_cache_add_lru(page, lru); + mem_cgroup_move_lists(page, lru); + put_page(page); +} +#endif /* CONFIG_UNEVICTABLE_LRU */ + + +/* * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, @@ -477,13 +576,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, page = lru_to_page(page_list); list_del(&page->lru); - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; VM_BUG_ON(PageActive(page)); sc->nr_scanned++; + if (unlikely(!page_evictable(page, NULL))) + goto cull_mlocked; + if (!sc->may_swap && page_mapped(page)) goto keep_locked; @@ -520,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) + if (PageAnon(page) && !PageSwapCache(page)) { + switch (try_to_munlock(page)) { + case SWAP_FAIL: /* shouldn't happen */ + case SWAP_AGAIN: + goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; + case SWAP_SUCCESS: + ; /* fall thru'; add to swap cache */ + } if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; + } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -537,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case SWAP_AGAIN: goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -563,7 +677,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. */ - if (TestSetPageLocked(page)) + if (!trylock_page(page)) goto keep; if (PageDirty(page) || PageWriteback(page)) goto keep_locked; @@ -583,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * possible for a page to have PageDirty set, but it is actually * clean (all its buffers are clean). This happens if the * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. + * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * @@ -597,32 +711,64 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PagePrivate(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; - if (!mapping && page_count(page) == 1) - goto free_it; + if (!mapping && page_count(page) == 1) { + unlock_page(page); + if (put_page_testzero(page)) + goto free_it; + else { + /* + * rare race with speculative reference. + * the speculative reference will free + * this page shortly, so we may + * increment nr_reclaimed here (and + * leave it off the LRU). + */ + nr_reclaimed++; + continue; + } + } } - if (!mapping || !remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page)) goto keep_locked; + /* + * At this point, we have no other references and there is + * no way to pick any more up (removed from LRU, removed + * from pagecache). Can use non-atomic bitops now (and + * we obviously don't have to worry about waking up a process + * waiting on the page lock, because there are no references. + */ + __clear_page_locked(page); free_it: - unlock_page(page); nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) - __pagevec_release_nonlru(&freed_pvec); + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } + continue; + +cull_mlocked: + unlock_page(page); + putback_lru_page(page); continue; activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ + if (PageSwapCache(page) && vm_swap_full()) + remove_exclusive_swap_page_ref(page); + VM_BUG_ON(PageActive(page)); SetPageActive(page); pgactivate++; keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) - __pagevec_release_nonlru(&freed_pvec); + __pagevec_free(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -642,7 +788,7 @@ keep: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode) +int __isolate_lru_page(struct page *page, int mode, int file) { int ret = -EINVAL; @@ -658,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; + if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + return ret; + + /* + * When this function is being called for lumpy reclaim, we + * initially look into all LRU pages, active, inactive and + * unevictable; only give shrink_page_list evictable pages. + */ + if (PageUnevictable(page)) + return ret; + ret = -EBUSY; if (likely(get_page_unless_zero(page))) { /* @@ -688,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode) * @scanned: The number of pages that were scanned. * @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode) + unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; unsigned long scan; @@ -710,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON(!PageLRU(page)); - switch (__isolate_lru_page(page, mode)) { + switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); nr_taken++; @@ -753,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; cursor_page = pfn_to_page(pfn); + /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode)) { + switch (__isolate_lru_page(cursor_page, mode, file)) { case 0: list_move(&cursor_page->lru, dst); nr_taken++; @@ -767,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* else it is being freed elsewhere */ list_move(&cursor_page->lru, src); default: - break; + break; /* ! on LRU or wrong list */ } } } @@ -781,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { + int lru = LRU_BASE; if (active) - return isolate_lru_pages(nr, &z->active_list, dst, - scanned, order, mode); - else - return isolate_lru_pages(nr, &z->inactive_list, dst, - scanned, order, mode); + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, + mode, !!file); } /* * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. */ -static unsigned long clear_active_flags(struct list_head *page_list) +static unsigned long clear_active_flags(struct list_head *page_list, + unsigned int *count) { int nr_active = 0; + int lru; struct page *page; - list_for_each_entry(page, page_list, lru) + list_for_each_entry(page, page_list, lru) { + lru = page_is_file_cache(page); if (PageActive(page)) { + lru += LRU_ACTIVE; ClearPageActive(page); nr_active++; } + count[lru]++; + } return nr_active; } +/** + * isolate_lru_page - tries to isolate a page from its LRU list + * @page: page to isolate from its LRU list + * + * Isolates a @page from an LRU list, clears PageLRU and adjusts the + * vmstat statistic corresponding to whatever LRU list the page was on. + * + * Returns 0 if the page was removed from an LRU list. + * Returns -EBUSY if the page was not on an LRU list. + * + * The returned page will have PageLRU() cleared. If it was found on + * the active list, it will have PageActive set. If it was found on + * the unevictable list, it will have the PageUnevictable bit set. That flag + * may need to be cleared by the caller before letting the page go. + * + * The vmstat statistic corresponding to the list on which the page was + * found will be decremented. + * + * Restrictions: + * (1) Must be called with an elevated refcount on the page. This is a + * fundamentnal difference from isolate_lru_pages (which is called + * without a stable reference). + * (2) the lru_lock must not be held. + * (3) interrupts must be enabled. + */ +int isolate_lru_page(struct page *page) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && get_page_unless_zero(page)) { + int lru = page_lru(page); + ret = 0; + ClearPageLRU(page); + + del_page_from_lru_list(zone, page, lru); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + /* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc) + struct zone *zone, struct scan_control *sc, + int priority, int file) { LIST_HEAD(page_list); struct pagevec pvec; @@ -831,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scan; unsigned long nr_freed; unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + int mode = ISOLATE_INACTIVE; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + mode = ISOLATE_BOTH; + else if (sc->order && priority < DEF_PRIORITY - 2) + mode = ISOLATE_BOTH; nr_taken = sc->isolate_pages(sc->swap_cluster_max, - &page_list, &nr_scan, sc->order, - (sc->order > PAGE_ALLOC_COSTLY_ORDER)? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, 0); - nr_active = clear_active_flags(&page_list); + &page_list, &nr_scan, sc->order, mode, + zone, sc->mem_cgroup, 0, file); + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); - __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); - __mod_zone_page_state(zone, NR_INACTIVE, - -(nr_taken - nr_active)); - if (scan_global_lru(sc)) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); + + if (scan_global_lru(sc)) { zone->pages_scanned += nr_scan; + zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; + zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; + zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; + zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + } spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -864,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * The attempt at page out may have made some * of the pages active, mark them inactive again. */ - nr_active = clear_active_flags(&page_list); + nr_active = clear_active_flags(&page_list, count); count_vm_events(PGDEACTIVATE, nr_active); nr_freed += shrink_page_list(&page_list, sc, @@ -889,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * Put back any unfreeable pages. */ while (!list_empty(&page_list)) { + int lru; page = lru_to_page(&page_list); VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else - add_page_to_inactive_list(zone, page); + if (unlikely(!page_evictable(page, NULL))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); + mem_cgroup_move_lists(page, lru); + if (PageActive(page) && scan_global_lru(sc)) { + int file = !!page_is_file_cache(page); + zone->recent_rotated[file]++; + } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -927,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) static inline int zone_is_near_oom(struct zone *zone) { - return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE))*3; -} - -/* - * Determine we should try to reclaim mapped pages. - * This is called only when sc->mem_cgroup is NULL. - */ -static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, - int priority) -{ - long mapped_ratio; - long distress; - long swap_tendency; - long imbalance; - int reclaim_mapped = 0; - int prev_priority; - - if (scan_global_lru(sc) && zone_is_near_oom(zone)) - return 1; - /* - * `distress' is a measure of how much trouble we're having - * reclaiming pages. 0 -> no problems. 100 -> great trouble. - */ - if (scan_global_lru(sc)) - prev_priority = zone->prev_priority; - else - prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); - - distress = 100 >> min(prev_priority, priority); - - /* - * The point of this algorithm is to decide when to start - * reclaiming mapped memory instead of just pagecache. Work out - * how much memory - * is mapped. - */ - if (scan_global_lru(sc)) - mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + - global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; - else - mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); - - /* - * Now decide how much we really want to unmap some pages. The - * mapped ratio is downgraded - just because there's a lot of - * mapped memory doesn't necessarily mean that page reclaim - * isn't succeeding. - * - * The distress ratio is important - we don't want to start - * going oom. - * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; - - /* - * If there's huge imbalance between active and inactive - * (think active 100 times larger than inactive) we should - * become more permissive, or the system will take too much - * cpu before it start swapping during memory pressure. - * Distress is about avoiding early-oom, this is about - * making swappiness graceful despite setting it to low - * values. - * - * Avoid div by zero with nr_inactive+1, and max resulting - * value is vm_total_pages. - */ - if (scan_global_lru(sc)) { - imbalance = zone_page_state(zone, NR_ACTIVE); - imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; - } else - imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); - - /* - * Reduce the effect of imbalance if swappiness is low, - * this means for a swappiness very low, the imbalance - * must be much higher than 100 for this logic to make - * the difference. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= (vm_swappiness + 1); - imbalance /= 100; - - /* - * If not much of the ram is mapped, makes the imbalance - * less relevant, it's high priority we refill the inactive - * list with mapped pages only in presence of high ratio of - * mapped pages. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= mapped_ratio; - imbalance /= 100; - - /* apply imbalance feedback to swap_tendency */ - swap_tendency += imbalance; - - /* - * Now use this metric to decide whether to start moving mapped - * memory onto the inactive list. - */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - - return reclaim_mapped; + return zone->pages_scanned >= (zone_lru_pages(zone) * 3); } /* @@ -1058,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority) + struct scan_control *sc, int priority, int file) { unsigned long pgmoved; int pgdeactivate = 0; unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ - LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ - LIST_HEAD(l_active); /* Pages to go onto the active_list */ + LIST_HEAD(l_inactive); struct page *page; struct pagevec pvec; - int reclaim_mapped = 0; - - if (sc->may_swap) - reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); + enum lru_list lru; lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1); + sc->mem_cgroup, 1, file); /* * zone->pages_scanned is used for detect zone's oom * mem_cgroup remembers nr_scan by itself. */ - if (scan_global_lru(sc)) + if (scan_global_lru(sc)) { zone->pages_scanned += pgscanned; + zone->recent_scanned[!!file] += pgmoved; + } - __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); + if (file) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + else + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); + pgmoved = 0; while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); - if (page_mapped(page)) { - if (!reclaim_mapped || - (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->mem_cgroup)) { - list_add(&page->lru, &l_active); - continue; - } + + if (unlikely(!page_evictable(page, NULL))) { + putback_lru_page(page); + continue; } + + /* page_referenced clears PageReferenced */ + if (page_mapping_inuse(page) && + page_referenced(page, 0, sc->mem_cgroup)) + pgmoved++; + list_add(&page->lru, &l_inactive); } + /* + * Count referenced pages from currently used mappings as + * rotated, even though they are moved to the inactive list. + * This helps balance scan pressure between file and anonymous + * pages in get_scan_ratio. + */ + zone->recent_rotated[!!file] += pgmoved; + + /* + * Move the pages to the [file or anon] inactive list. + */ pagevec_init(&pvec, 1); + pgmoved = 0; + lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); @@ -1114,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, VM_BUG_ON(!PageActive(page)); ClearPageActive(page); - list_move(&page->lru, &zone->inactive_list); - mem_cgroup_move_lists(page, false); + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_move_lists(page, lru); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -1128,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_lock_irq(&zone->lru_lock); } } - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); pgdeactivate += pgmoved; if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); pagevec_strip(&pvec); spin_lock_irq(&zone->lru_lock); } - - pgmoved = 0; - while (!list_empty(&l_active)) { - page = lru_to_page(&l_active); - prefetchw_prev_lru_page(page, &l_active, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - - list_move(&page->lru, &zone->active_list); - mem_cgroup_move_lists(page, true); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); - pgmoved = 0; - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); - __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); + if (vm_swap_full()) + pagevec_swap_free(&pvec); pagevec_release(&pvec); } +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct zone *zone, struct scan_control *sc, int priority) +{ + int file = is_file_lru(lru); + + if (lru == LRU_ACTIVE_FILE) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + + if (lru == LRU_ACTIVE_ANON && + (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); +} + +/* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined + * by looking at the fraction of the pages scanned we did rotate back + * onto the active list instead of evict. + * + * percent[0] specifies how much pressure to put on ram/swap backed + * memory, while percent[1] determines pressure on the file LRUs. + */ +static void get_scan_ratio(struct zone *zone, struct scan_control *sc, + unsigned long *percent) +{ + unsigned long anon, file, free; + unsigned long anon_prio, file_prio; + unsigned long ap, fp; + + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + + /* If we have no swap space, do not bother scanning anon pages. */ + if (nr_swap_pages <= 0) { + percent[0] = 0; + percent[1] = 100; + return; + } + + /* If we have very few page cache pages, force-scan anon pages. */ + if (unlikely(file + free <= zone->pages_high)) { + percent[0] = 100; + percent[1] = 0; + return; + } + + /* + * OK, so we have swap space and a fair amount of page cache + * pages. We use the recently rotated / recently scanned + * ratios to determine how valuable each cache is. + * + * Because workloads change over time (and to avoid overflow) + * we keep these statistics as a floating average, which ends + * up weighing recent references more than old ones. + * + * anon in [0], file in [1] + */ + if (unlikely(zone->recent_scanned[0] > anon / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[0] /= 2; + zone->recent_rotated[0] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + if (unlikely(zone->recent_scanned[1] > file / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[1] /= 2; + zone->recent_rotated[1] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* + * anon recent_rotated[0] + * %anon = 100 * ----------- / ----------------- * IO cost + * anon + file rotate_sum + */ + ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); + ap /= zone->recent_rotated[0] + 1; + + fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); + fp /= zone->recent_rotated[1] + 1; + + /* Normalize to percentages */ + percent[0] = 100 * ap / (ap + fp + 1); + percent[1] = 100 - percent[0]; +} + + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static unsigned long shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { - unsigned long nr_active; - unsigned long nr_inactive; + unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; unsigned long nr_reclaimed = 0; + unsigned long percent[2]; /* anon @ 0; file @ 1 */ + enum lru_list l; - if (scan_global_lru(sc)) { - /* - * Add one to nr_to_scan just to make sure that the kernel - * will slowly sift through the active list. - */ - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; - nr_active = zone->nr_scan_active; - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= sc->swap_cluster_max) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; - - if (nr_active >= sc->swap_cluster_max) - zone->nr_scan_active = 0; - else - nr_active = 0; - } else { - /* - * This reclaim occurs not because zone memory shortage but - * because memory controller hits its limit. - * Then, don't modify zone reclaim related data. - */ - nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, - zone, priority); - - nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, - zone, priority); - } + get_scan_ratio(zone, sc, percent); + for_each_evictable_lru(l) { + if (scan_global_lru(sc)) { + int file = is_file_lru(l); + int scan; - while (nr_active || nr_inactive) { - if (nr_active) { - nr_to_scan = min(nr_active, - (unsigned long)sc->swap_cluster_max); - nr_active -= nr_to_scan; - shrink_active_list(nr_to_scan, zone, sc, priority); + scan = zone_page_state(zone, NR_LRU_BASE + l); + if (priority) { + scan >>= priority; + scan = (scan * percent[file]) / 100; + } + zone->lru[l].nr_scan += scan; + nr[l] = zone->lru[l].nr_scan; + if (nr[l] >= sc->swap_cluster_max) + zone->lru[l].nr_scan = 0; + else + nr[l] = 0; + } else { + /* + * This reclaim occurs not because zone memory shortage + * but because memory controller hits its limit. + * Don't modify zone reclaim related data. + */ + nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, + priority, l); } + } - if (nr_inactive) { - nr_to_scan = min(nr_inactive, + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + for_each_evictable_lru(l) { + if (nr[l]) { + nr_to_scan = min(nr[l], (unsigned long)sc->swap_cluster_max); - nr_inactive -= nr_to_scan; - nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, - sc); + nr[l] -= nr_to_scan; + + nr_reclaimed += shrink_list(l, nr_to_scan, + zone, sc, priority); + } } } + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + else if (!scan_global_lru(sc)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + throttle_vm_writeout(sc->gfp_mask); return nr_reclaimed; } @@ -1286,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, return nr_reclaimed; } - + /* * This is the main entry point to direct page reclaim. * @@ -1316,6 +1556,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + delayacct_freepages_start(); + if (scan_global_lru(sc)) count_vm_event(ALLOCSTALL); /* @@ -1327,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } } @@ -1371,7 +1612,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (sc->nr_scanned && priority < DEF_PRIORITY - 2) congestion_wait(WRITE, HZ/10); } - /* top priority shrink_caches still had more to do? don't OOM, then */ + /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scan_global_lru(sc)) ret = nr_reclaimed; out: @@ -1396,6 +1637,8 @@ out: } else mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + delayacct_freepages_end(); + return ret; } @@ -1516,6 +1759,14 @@ loop_again: priority != DEF_PRIORITY) continue; + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. + */ + if (inactive_anon_is_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, + &sc, priority, 0); + if (!zone_watermark_ok(zone, order, zone->pages_high, 0, 0)) { end_zone = i; @@ -1528,8 +1779,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } /* @@ -1573,8 +1823,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE)) * 6) + (zone_lru_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -1628,7 +1877,7 @@ out: /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity @@ -1722,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } +unsigned long global_lru_pages(void) +{ + return global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_INACTIVE_FILE); +} + #ifdef CONFIG_PM /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages @@ -1735,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, { struct zone *zone; unsigned long nr_to_scan, ret = 0; + enum lru_list l; for_each_zone(zone) { @@ -1744,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; - /* For pass = 0 we don't shrink the active list */ - if (pass > 0) { - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; - if (zone->nr_scan_active >= nr_pages || pass > 3) { - zone->nr_scan_active = 0; + for_each_evictable_lru(l) { + /* For pass = 0, we don't shrink the active list */ + if (pass == 0 && + (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) + continue; + + zone->lru[l].nr_scan += + (zone_page_state(zone, NR_LRU_BASE + l) + >> prio) + 1; + if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_scan = 0; nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_ACTIVE)); - shrink_active_list(nr_to_scan, zone, sc, prio); + zone_page_state(zone, + NR_LRU_BASE + l)); + ret += shrink_list(l, nr_to_scan, zone, + sc, prio); + if (ret >= nr_pages) + return ret; } } - - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; - if (zone->nr_scan_inactive >= nr_pages || pass > 3) { - zone->nr_scan_inactive = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_INACTIVE)); - ret += shrink_inactive_list(nr_to_scan, zone, sc); - if (ret >= nr_pages) - return ret; - } } return ret; } -static unsigned long count_lru_pages(void) -{ - return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); -} - /* * Try to free `nr_pages' of memory, system-wide, and return the number of * freed pages. @@ -1801,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; - lru_pages = count_lru_pages(); + lru_pages = global_lru_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -1844,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - count_lru_pages()); + global_lru_pages()); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -1861,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); } @@ -1940,7 +2191,7 @@ module_init(kswapd_init) int zone_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 -#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ @@ -2089,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return ret; } #endif + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * page_evictable - test whether a page is evictable + * @page: the page to test + * @vma: the VMA in which the page is or will be mapped, may be NULL + * + * Test whether page is evictable--i.e., should be placed on active/inactive + * lists vs unevictable list. The vma argument is !NULL when called from the + * fault path to determine how to instantate a new page. + * + * Reasons page might not be evictable: + * (1) page's mapping marked unevictable + * (2) page is part of an mlocked VMA + * + */ +int page_evictable(struct page *page, struct vm_area_struct *vma) +{ + + if (mapping_unevictable(page_mapping(page))) + return 0; + + if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) + return 0; + + return 1; +} + +static void show_page_path(struct page *page) +{ + char buf[256]; + if (page_is_file_cache(page)) { + struct address_space *mapping = page->mapping; + struct dentry *dentry; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + spin_lock(&mapping->i_mmap_lock); + dentry = d_find_alias(mapping->host); + printk(KERN_INFO "rescued: %s %lu\n", + dentry_path(dentry, buf, 256), pgoff); + spin_unlock(&mapping->i_mmap_lock); + } else { +#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + printk(KERN_INFO "rescued: anon %s\n", + vma->vm_mm->owner->comm); + break; + } + page_unlock_anon_vma(anon_vma); +#endif + } +} + + +/** + * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list + * @page: page to check evictability and move to appropriate lru list + * @zone: zone page is in + * + * Checks a page for evictability and moves the page to the appropriate + * zone lru list. + * + * Restrictions: zone->lru_lock must be held, page must be on LRU and must + * have PageUnevictable set. + */ +static void check_move_unevictable_page(struct page *page, struct zone *zone) +{ + VM_BUG_ON(PageActive(page)); + +retry: + ClearPageUnevictable(page); + if (page_evictable(page, NULL)) { + enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + + show_page_path(page); + + __dec_zone_state(zone, NR_UNEVICTABLE); + list_move(&page->lru, &zone->lru[l].list); + __inc_zone_state(zone, NR_INACTIVE_ANON + l); + __count_vm_event(UNEVICTABLE_PGRESCUED); + } else { + /* + * rotate unevictable list + */ + SetPageUnevictable(page); + list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + if (page_evictable(page, NULL)) + goto retry; + } +} + +/** + * scan_mapping_unevictable_pages - scan an address space for evictable pages + * @mapping: struct address_space to scan for evictable pages + * + * Scan all pages in mapping. Check unevictable pages for + * evictability and move them to the appropriate zone lru list. + */ +void scan_mapping_unevictable_pages(struct address_space *mapping) +{ + pgoff_t next = 0; + pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + struct zone *zone; + struct pagevec pvec; + + if (mapping->nrpages == 0) + return; + + pagevec_init(&pvec, 0); + while (next < end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + int i; + int pg_scanned = 0; + + zone = NULL; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + struct zone *pagezone = page_zone(page); + + pg_scanned++; + if (page_index > next) + next = page_index; + next++; + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + + if (PageLRU(page) && PageUnevictable(page)) + check_move_unevictable_page(page, zone); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); + + count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); + } + +} + +/** + * scan_zone_unevictable_pages - check unevictable list for evictable pages + * @zone - zone of which to scan the unevictable list + * + * Scan @zone's unevictable LRU lists to check for pages that have become + * evictable. Move those that have to @zone's inactive list where they + * become candidates for reclaim, unless shrink_inactive_zone() decides + * to reactivate them. Pages that are still unevictable are rotated + * back onto @zone's unevictable list. + */ +#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ +void scan_zone_unevictable_pages(struct zone *zone) +{ + struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; + unsigned long scan; + unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); + + while (nr_to_scan > 0) { + unsigned long batch_size = min(nr_to_scan, + SCAN_UNEVICTABLE_BATCH_SIZE); + + spin_lock_irq(&zone->lru_lock); + for (scan = 0; scan < batch_size; scan++) { + struct page *page = lru_to_page(l_unevictable); + + if (!trylock_page(page)) + continue; + + prefetchw_prev_lru_page(page, l_unevictable, flags); + + if (likely(PageLRU(page) && PageUnevictable(page))) + check_move_unevictable_page(page, zone); + + unlock_page(page); + } + spin_unlock_irq(&zone->lru_lock); + + nr_to_scan -= batch_size; + } +} + + +/** + * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages + * + * A really big hammer: scan all zones' unevictable LRU lists to check for + * pages that have become evictable. Move those back to the zones' + * inactive list where they become candidates for reclaim. + * This occurs when, e.g., we have unswappable pages on the unevictable lists, + * and we add swap to the system. As such, it runs in the context of a task + * that has possibly/probably made some previously unevictable pages + * evictable. + */ +void scan_all_zones_unevictable_pages(void) +{ + struct zone *zone; + + for_each_zone(zone) { + scan_zone_unevictable_pages(zone); + } +} + +/* + * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of + * all nodes' unevictable lists for evictable pages + */ +unsigned long scan_unevictable_pages; + +int scan_unevictable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write && *(unsigned long *)table->data) + scan_all_zones_unevictable_pages(); + + scan_unevictable_pages = 0; + return 0; +} + +/* + * per node 'scan_unevictable_pages' attribute. On demand re-scan of + * a specified node's per zone unevictable lists for evictable pages. + */ + +static ssize_t read_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "0\n"); /* always zero; should fit... */ +} + +static ssize_t write_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + struct zone *node_zones = NODE_DATA(dev->id)->node_zones; + struct zone *zone; + unsigned long res; + unsigned long req = strict_strtoul(buf, 10, &res); + + if (!req) + return 1; /* zero is no-op */ + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + scan_zone_unevictable_pages(zone); + } + return 1; +} + + +static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, + read_scan_unevictable_node, + write_scan_unevictable_node); + +int scan_unevictable_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +void scan_unevictable_unregister_node(struct node *node) +{ + sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index db9eabb2c5b3..c3ccfda23adc 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -8,11 +8,12 @@ * Copyright (C) 2006 Silicon Graphics, Inc., * Christoph Lameter <christoph@lameter.com> */ - +#include <linux/fs.h> #include <linux/mm.h> #include <linux/err.h> #include <linux/module.h> #include <linux/cpu.h> +#include <linux/vmstat.h> #include <linux/sched.h> #ifdef CONFIG_VM_EVENT_COUNTERS @@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu_mask(cpu, *cpumask) { + for_each_cpu_mask_nr(cpu, *cpumask) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) @@ -383,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) #endif #ifdef CONFIG_PROC_FS - +#include <linux/proc_fs.h> #include <linux/seq_file.h> static char * const migratetype_names[MIGRATE_TYPES] = { @@ -515,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); +#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES + /* + * Ordinarily, memory holes in flatmem still have a valid + * memmap for the PFN range. However, an architecture for + * embedded systems (e.g. ARM) can free up the memmap backing + * holes to save memory on the assumption the memmap is + * never used. The page_zone linkages are then broken even + * though pfn_valid() returns true. Skip the page if the + * linkages are broken. Even if this test passed, the impact + * is that the counters for the movable type are off but + * fragmentation monitoring is likely meaningless on small + * systems. + */ + if (page_zone(page) != zone) + continue; +#endif mtype = get_pageblock_migratetype(page); - count[mtype]++; + if (mtype < MIGRATE_TYPES) + count[mtype]++; } /* Print counts */ @@ -563,20 +581,44 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) return 0; } -const struct seq_operations fragmentation_op = { +static const struct seq_operations fragmentation_op = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = frag_show, }; -const struct seq_operations pagetypeinfo_op = { +static int fragmentation_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fragmentation_op); +} + +static const struct file_operations fragmentation_file_operations = { + .open = fragmentation_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct seq_operations pagetypeinfo_op = { .start = frag_start, .next = frag_next, .stop = frag_stop, .show = pagetypeinfo_show, }; +static int pagetypeinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &pagetypeinfo_op); +} + +static const struct file_operations pagetypeinfo_file_ops = { + .open = pagetypeinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + #ifdef CONFIG_ZONE_DMA #define TEXT_FOR_DMA(xx) xx "_dma", #else @@ -601,8 +643,14 @@ const struct seq_operations pagetypeinfo_op = { static const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", - "nr_inactive", - "nr_active", + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", +#ifdef CONFIG_UNEVICTABLE_LRU + "nr_unevictable", + "nr_mlock", +#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -657,6 +705,16 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif +#ifdef CONFIG_UNEVICTABLE_LRU + "unevictable_pgs_culled", + "unevictable_pgs_scanned", + "unevictable_pgs_rescued", + "unevictable_pgs_mlocked", + "unevictable_pgs_munlocked", + "unevictable_pgs_cleared", + "unevictable_pgs_stranded", + "unevictable_pgs_mlockfreed", +#endif #endif }; @@ -670,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (a: %lu i: %lu)" + "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), @@ -678,7 +736,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->pages_low, zone->pages_high, zone->pages_scanned, - zone->nr_scan_active, zone->nr_scan_inactive, + zone->lru[LRU_ACTIVE_ANON].nr_scan, + zone->lru[LRU_INACTIVE_ANON].nr_scan, + zone->lru[LRU_ACTIVE_FILE].nr_scan, + zone->lru[LRU_INACTIVE_FILE].nr_scan, zone->spanned_pages, zone->present_pages); @@ -715,10 +776,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n all_unreclaimable: %u" "\n prev_priority: %i" - "\n start_pfn: %lu", + "\n start_pfn: %lu" + "\n inactive_ratio: %u", zone_is_all_unreclaimable(zone), zone->prev_priority, - zone->zone_start_pfn); + zone->zone_start_pfn, + zone->inactive_ratio); seq_putc(m, '\n'); } @@ -732,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) return 0; } -const struct seq_operations zoneinfo_op = { +static const struct seq_operations zoneinfo_op = { .start = frag_start, /* iterate over all zones. The same as in * fragmentation. */ .next = frag_next, @@ -740,6 +803,18 @@ const struct seq_operations zoneinfo_op = { .show = zoneinfo_show, }; +static int zoneinfo_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &zoneinfo_op); +} + +static const struct file_operations proc_zoneinfo_file_operations = { + .open = zoneinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; @@ -795,13 +870,24 @@ static void vmstat_stop(struct seq_file *m, void *arg) m->private = NULL; } -const struct seq_operations vmstat_op = { +static const struct seq_operations vmstat_op = { .start = vmstat_start, .next = vmstat_next, .stop = vmstat_stop, .show = vmstat_show, }; +static int vmstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vmstat_op); +} + +static const struct file_operations proc_vmstat_file_operations = { + .open = vmstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SMP @@ -859,9 +945,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, static struct notifier_block __cpuinitdata vmstat_notifier = { &vmstat_cpuup_callback, NULL, 0 }; +#endif static int __init setup_vmstat(void) { +#ifdef CONFIG_SMP int cpu; refresh_zone_stat_thresholds(); @@ -869,7 +957,13 @@ static int __init setup_vmstat(void) for_each_online_cpu(cpu) start_cpu_timer(cpu); +#endif +#ifdef CONFIG_PROC_FS + proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); + proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); + proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); + proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); +#endif return 0; } module_init(setup_vmstat) -#endif |