diff options
author | Mark Brown <broonie@opensource.wolfsonmicro.com> | 2009-10-06 17:01:27 +0200 |
---|---|---|
committer | Mark Brown <broonie@opensource.wolfsonmicro.com> | 2009-10-06 17:01:27 +0200 |
commit | 907bc6c7fc7071b00083fc11e510e47dd93df45d (patch) | |
tree | 0697a608561522c00da9e1814974a2eb051bb96d /mm | |
parent | ASoC: Add virtual enumeration support for DAPM muxes (diff) | |
parent | Merge branch 'for-2.6.32' of git://git.kernel.org/pub/scm/linux/kernel/git/br... (diff) | |
download | linux-907bc6c7fc7071b00083fc11e510e47dd93df45d.tar.xz linux-907bc6c7fc7071b00083fc11e510e47dd93df45d.zip |
Merge branch 'for-2.6.32' into for-2.6.33
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 36 | ||||
-rw-r--r-- | mm/Kconfig.debug | 12 | ||||
-rw-r--r-- | mm/Makefile | 13 | ||||
-rw-r--r-- | mm/allocpercpu.c | 28 | ||||
-rw-r--r-- | mm/backing-dev.c | 434 | ||||
-rw-r--r-- | mm/bootmem.c | 10 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 183 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 268 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 41 | ||||
-rw-r--r-- | mm/internal.h | 10 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 6 | ||||
-rw-r--r-- | mm/kmemleak.c | 707 | ||||
-rw-r--r-- | mm/ksm.c | 1711 | ||||
-rw-r--r-- | mm/madvise.c | 83 | ||||
-rw-r--r-- | mm/memcontrol.c | 739 | ||||
-rw-r--r-- | mm/memory-failure.c | 832 | ||||
-rw-r--r-- | mm/memory.c | 332 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 13 | ||||
-rw-r--r-- | mm/mempolicy.c | 84 | ||||
-rw-r--r-- | mm/mempool.c | 9 | ||||
-rw-r--r-- | mm/migrate.c | 26 | ||||
-rw-r--r-- | mm/mlock.c | 128 | ||||
-rw-r--r-- | mm/mmap.c | 64 | ||||
-rw-r--r-- | mm/mmu_context.c | 58 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 20 | ||||
-rw-r--r-- | mm/mprotect.c | 4 | ||||
-rw-r--r-- | mm/mremap.c | 18 | ||||
-rw-r--r-- | mm/nommu.c | 167 | ||||
-rw-r--r-- | mm/oom_kill.c | 96 | ||||
-rw-r--r-- | mm/page-writeback.c | 251 | ||||
-rw-r--r-- | mm/page_alloc.c | 386 | ||||
-rw-r--r-- | mm/page_cgroup.c | 12 | ||||
-rw-r--r-- | mm/pdflush.c | 269 | ||||
-rw-r--r-- | mm/percpu.c | 1471 | ||||
-rw-r--r-- | mm/quicklist.c | 5 | ||||
-rw-r--r-- | mm/rmap.c | 143 | ||||
-rw-r--r-- | mm/shmem.c | 44 | ||||
-rw-r--r-- | mm/shmem_acl.c | 11 | ||||
-rw-r--r-- | mm/slab.c | 10 | ||||
-rw-r--r-- | mm/slob.c | 7 | ||||
-rw-r--r-- | mm/slub.c | 103 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 8 | ||||
-rw-r--r-- | mm/sparse.c | 9 | ||||
-rw-r--r-- | mm/swap.c | 8 | ||||
-rw-r--r-- | mm/swap_state.c | 144 | ||||
-rw-r--r-- | mm/swapfile.c | 30 | ||||
-rw-r--r-- | mm/truncate.c | 136 | ||||
-rw-r--r-- | mm/vmalloc.c | 561 | ||||
-rw-r--r-- | mm/vmscan.c | 291 | ||||
-rw-r--r-- | mm/vmstat.c | 5 |
52 files changed, 7707 insertions, 2333 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bde..edd300aca173 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MEMORY_HOTREMOVE # config PAGEFLAGS_EXTENDED def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM + depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address @@ -214,6 +214,18 @@ config HAVE_MLOCKED_PAGE_BIT config MMU_NOTIFIER bool +config KSM + bool "Enable KSM for page merging" + depends on MMU + help + Enable Kernel Samepage Merging: KSM periodically scans those areas + of an application's address space that an app has advised may be + mergeable. When it finds pages of identical content, it replaces + the many instances by a single resident page with that content, so + saving memory until one or another app needs to modify the content. + Recommended for use with KVM, or with other duplicative applications. + See Documentation/vm/ksm.txt for more information. + config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" default 4096 @@ -225,13 +237,29 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. +config ARCH_SUPPORTS_MEMORY_FAILURE + bool + +config MEMORY_FAILURE + depends on MMU + depends on ARCH_SUPPORTS_MEMORY_FAILURE + bool "Enable recovery from hardware memory errors" + help + Enables code to recover from some memory failures on systems + with MCA recovery. This allows a system to continue running + even when some of its memory has uncorrected errors. This requires + special hardware support and typically ECC memory. + +config HWPOISON_INJECT + tristate "Poison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index aa99fd1f7109..af7cfb43d2f0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types - of memory corruptions. + of memory corruption. config WANT_PAGE_DEBUG_FLAGS bool @@ -17,11 +17,11 @@ config PAGE_POISONING depends on !HIBERNATION select DEBUG_PAGEALLOC select WANT_PAGE_DEBUG_FLAGS - help + ---help--- Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages(). This results in a large slowdown, - but helps to find certain types of memory corruptions. + but helps to find certain types of memory corruption. - This option cannot enalbe with hibernation. Otherwise, it will get - wrong messages for memory corruption because the free pages are not - saved to the suspend image. + This option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index 5e0bd6426693..ebf849042ed3 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,16 +5,16 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o + vmalloc.o pagewalk.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o pdflush.o \ + maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o $(mmu-y) + page_isolation.o mm_init.o mmu_context.o \ + $(mmu-y) obj-y += init-mm.o -obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HAS_DMA) += dmapool.o @@ -25,6 +25,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o @@ -33,12 +34,14 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA obj-$(CONFIG_SMP) += percpu.o else obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o +obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c index dfdee6a47359..df34ceae0c67 100644 --- a/mm/allocpercpu.c +++ b/mm/allocpercpu.c @@ -5,6 +5,8 @@ */ #include <linux/mm.h> #include <linux/module.h> +#include <linux/bootmem.h> +#include <asm/sections.h> #ifndef cache_line_size #define cache_line_size() L1_CACHE_BYTES @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) kfree(__percpu_disguise(__pdata)); } EXPORT_SYMBOL_GPL(free_percpu); + +/* + * Generic percpu area setup. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; + +EXPORT_SYMBOL(__per_cpu_offset); + +void __init setup_per_cpu_areas(void) +{ + unsigned long size, i; + char *ptr; + unsigned long nr_possible_cpus = num_possible_cpus(); + + /* Copy section for each CPU (we discard the original) */ + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); + + for_each_possible_cpu(i) { + __per_cpu_offset[i] = ptr - __per_cpu_start; + memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + ptr += size; + } +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 493b468a5035..3d3accb1f800 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,8 +1,11 @@ #include <linux/wait.h> #include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> @@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { + .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, @@ -23,6 +27,24 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info); static struct class *bdi_class; +/* + * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as + * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * locking. + */ +DEFINE_SPINLOCK(bdi_lock); +LIST_HEAD(bdi_list); +LIST_HEAD(bdi_pending_list); + +static struct task_struct *sync_supers_tsk; +static struct timer_list sync_supers_timer; + +static int bdi_sync_supers(void *); +static void sync_supers_timer_fn(unsigned long); +static void arm_supers_timer(void); + +static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); + #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -37,9 +59,29 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; + unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; + struct inode *inode; + + /* + * inode lock is enough here, the bdi->wb_list is protected by + * RCU on the reader side + */ + nr_wb = nr_dirty = nr_io = nr_more_io = 0; + spin_lock(&inode_lock); + list_for_each_entry(wb, &bdi->wb_list, list) { + nr_wb++; + list_for_each_entry(inode, &wb->b_dirty, i_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_list) + nr_more_io++; + } + spin_unlock(&inode_lock); get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -49,12 +91,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiReclaimable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n", + "BackgroundThresh: %8lu kB\n" + "WriteBack threads:%8lu\n" + "b_dirty: %8lu\n" + "b_io: %8lu\n" + "b_more_io: %8lu\n" + "bdi_list: %8u\n" + "state: %8lx\n" + "wb_mask: %8lx\n" + "wb_list: %8u\n" + "wb_cnt: %8u\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), - K(dirty_thresh), - K(background_thresh)); + K(bdi_thresh), K(dirty_thresh), + K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, + !list_empty(&bdi->wb_list), bdi->wb_cnt); #undef K return 0; @@ -185,6 +237,13 @@ static int __init default_bdi_init(void) { int err; + sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); + BUG_ON(IS_ERR(sync_supers_tsk)); + + init_timer(&sync_supers_timer); + setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); + arm_supers_timer(); + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -193,6 +252,279 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); +} + +static void bdi_task_init(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct task_struct *tsk = current; + + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&wb->list, &bdi->wb_list); + spin_unlock(&bdi->wb_lock); + + tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(tsk, 0); +} + +static int bdi_start_fn(void *ptr) +{ + struct bdi_writeback *wb = ptr; + struct backing_dev_info *bdi = wb->bdi; + int ret; + + /* + * Add us to the active bdi_list + */ + spin_lock_bh(&bdi_lock); + list_add_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + bdi_task_init(bdi, wb); + + /* + * Clear pending bit and wakeup anybody waiting to tear us down + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + + ret = bdi_writeback_task(wb); + + /* + * Remove us from the list + */ + spin_lock(&bdi->wb_lock); + list_del_rcu(&wb->list); + spin_unlock(&bdi->wb_lock); + + /* + * Flush any work that raced with us exiting. No new work + * will be added, since this bdi isn't discoverable anymore. + */ + if (!list_empty(&bdi->work_list)) + wb_do_writeback(wb, 1); + + wb->task = NULL; + return ret; +} + +int bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + return wb_has_dirty_io(&bdi->wb); +} + +static void bdi_flush_io(struct backing_dev_info *bdi) +{ + struct writeback_control wbc = { + .bdi = bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + .nr_to_write = 1024, + }; + + writeback_inodes_wbc(&wbc); +} + +/* + * kupdated() used to do this. We cannot do it from the bdi_forker_task() + * or we risk deadlocking on ->s_umount. The longer term solution would be + * to implement sync_supers_bdi() or similar and simply do it from the + * bdi writeback tasks individually. + */ +static int bdi_sync_supers(void *unused) +{ + set_user_nice(current, 0); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* + * Do this periodically, like kupdated() did before. + */ + sync_supers(); + } + + return 0; +} + +static void arm_supers_timer(void) +{ + unsigned long next; + + next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; + mod_timer(&sync_supers_timer, round_jiffies_up(next)); +} + +static void sync_supers_timer_fn(unsigned long unused) +{ + wake_up_process(sync_supers_tsk); + arm_supers_timer(); +} + +static int bdi_forker_task(void *ptr) +{ + struct bdi_writeback *me = ptr; + + bdi_task_init(me->bdi, me); + + for (;;) { + struct backing_dev_info *bdi, *tmp; + struct bdi_writeback *wb; + + /* + * Temporary measure, we want to make sure we don't see + * dirty data on the default backing_dev_info + */ + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) + wb_do_writeback(me, 0); + + spin_lock_bh(&bdi_lock); + + /* + * Check if any existing bdi's have dirty data without + * a thread registered. If so, set that up. + */ + list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { + if (bdi->wb.task) + continue; + if (list_empty(&bdi->work_list) && + !bdi_has_dirty_io(bdi)) + continue; + + bdi_add_default_flusher_task(bdi); + } + + set_current_state(TASK_INTERRUPTIBLE); + + if (list_empty(&bdi_pending_list)) { + unsigned long wait; + + spin_unlock_bh(&bdi_lock); + wait = msecs_to_jiffies(dirty_writeback_interval * 10); + schedule_timeout(wait); + try_to_freeze(); + continue; + } + + __set_current_state(TASK_RUNNING); + + /* + * This is our real job - check for pending entries in + * bdi_pending_list, and create the tasks that got added + */ + bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, + bdi_list); + list_del_init(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); + + wb = &bdi->wb; + wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", + dev_name(bdi->dev)); + /* + * If task creation fails, then readd the bdi to + * the pending list and force writeout of the bdi + * from this forker thread. That will free some memory + * and we can try again. + */ + if (IS_ERR(wb->task)) { + wb->task = NULL; + + /* + * Add this 'bdi' to the back, so we get + * a chance to flush other bdi's to free + * memory. + */ + spin_lock_bh(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_pending_list); + spin_unlock_bh(&bdi_lock); + + bdi_flush_io(bdi); + } + } + + return 0; +} + +static void bdi_add_to_pending(struct rcu_head *head) +{ + struct backing_dev_info *bdi; + + bdi = container_of(head, struct backing_dev_info, rcu_head); + INIT_LIST_HEAD(&bdi->bdi_list); + + spin_lock(&bdi_lock); + list_add_tail(&bdi->bdi_list, &bdi_pending_list); + spin_unlock(&bdi_lock); + + /* + * We are now on the pending list, wake up bdi_forker_task() + * to finish the job and add us back to the active bdi_list + */ + wake_up_process(default_backing_dev_info.wb.task); +} + +/* + * Add the default flusher task that gets created for any bdi + * that has dirty data pending writeout + */ +void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) +{ + if (!bdi_cap_writeback_dirty(bdi)) + return; + + if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { + printk(KERN_ERR "bdi %p/%s is not registered!\n", + bdi, bdi->name); + return; + } + + /* + * Check with the helper whether to proceed adding a task. Will only + * abort if we two or more simultanous calls to + * bdi_add_default_flusher_task() occured, further additions will block + * waiting for previous additions to finish. + */ + if (!test_and_set_bit(BDI_pending, &bdi->state)) { + list_del_rcu(&bdi->bdi_list); + + /* + * We must wait for the current RCU period to end before + * moving to the pending list. So schedule that operation + * from an RCU callback. + */ + call_rcu(&bdi->rcu_head, bdi_add_to_pending); + } +} + +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); + + synchronize_rcu(); +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { @@ -211,9 +543,33 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, goto exit; } + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + bdi->dev = dev; - bdi_debug_register(bdi, dev_name(dev)); + /* + * Just start the forker thread for our default backing_dev_info, + * and add other bdi's to the list. They will get a thread created + * on-demand when they need it. + */ + if (bdi_cap_flush_forker(bdi)) { + struct bdi_writeback *wb = &bdi->wb; + + wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", + dev_name(dev)); + if (IS_ERR(wb->task)) { + wb->task = NULL; + ret = -ENOMEM; + + bdi_remove_from_list(bdi); + goto exit; + } + } + + bdi_debug_register(bdi, dev_name(dev)); + set_bit(BDI_registered, &bdi->state); exit: return ret; } @@ -225,9 +581,40 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void bdi_wb_shutdown(struct backing_dev_info *bdi) +{ + struct bdi_writeback *wb; + + if (!bdi_cap_writeback_dirty(bdi)) + return; + + /* + * If setup is pending, wait for that to complete first + */ + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); + + /* + * Make sure nobody finds us on the bdi_list anymore + */ + bdi_remove_from_list(bdi); + + /* + * Finally, kill the kernel threads. We don't need to be RCU + * safe anymore, since the bdi is gone from visibility. + */ + list_for_each_entry(wb, &bdi->wb_list, list) + kthread_stop(wb->task); +} + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + if (!bdi_cap_flush_forker(bdi)) + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; @@ -237,14 +624,26 @@ EXPORT_SYMBOL(bdi_unregister); int bdi_init(struct backing_dev_info *bdi) { - int i; - int err; + int i, err; bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; + spin_lock_init(&bdi->wb_lock); + INIT_RCU_HEAD(&bdi->rcu_head); + INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->wb_list); + INIT_LIST_HEAD(&bdi->work_list); + + bdi_wb_init(&bdi->wb, bdi); + + /* + * Just one thread support for now, hard code mask and count + */ + bdi->wb_mask = 1; + bdi->wb_cnt = 1; for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); @@ -269,6 +668,20 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; + /* + * Splice our entries to the default_backing_dev_info, if this + * bdi disappears + */ + if (bdi_has_dirty_io(bdi)) { + struct bdi_writeback *dst = &default_backing_dev_info.wb; + + spin_lock(&inode_lock); + list_splice(&bdi->wb.b_dirty, &dst->b_dirty); + list_splice(&bdi->wb.b_io, &dst->b_io); + list_splice(&bdi->wb.b_more_io, &dst->b_more_io); + spin_unlock(&inode_lock); + } + bdi_unregister(bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) @@ -283,7 +696,6 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; - void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; @@ -308,18 +720,18 @@ EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested - * @rw: READ or WRITE + * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit * write congestion. If no backing_devs are congested then just wait for the * next write to be completed. */ -long congestion_wait(int rw, long timeout) +long congestion_wait(int sync, long timeout) { long ret; DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); diff --git a/mm/bootmem.c b/mm/bootmem.c index d2a9ce952768..555d5d2731c6 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -12,6 +12,7 @@ #include <linux/pfn.h> #include <linux/bootmem.h> #include <linux/module.h> +#include <linux/kmemleak.h> #include <asm/bug.h> #include <asm/io.h> @@ -335,6 +336,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, { unsigned long start, end; + kmemleak_free_part(__va(physaddr), size); + start = PFN_UP(physaddr); end = PFN_DOWN(physaddr + size); @@ -354,6 +357,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size) { unsigned long start, end; + kmemleak_free_part(__va(addr), size); + start = PFN_UP(addr); end = PFN_DOWN(addr + size); @@ -516,6 +521,11 @@ find_block: region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + start_off); memset(region, 0, size); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(region, size, 0, 0); return region; } diff --git a/mm/dmapool.c b/mm/dmapool.c index b1f0885dda22..3df063706f53 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) unsigned pages = 0; unsigned blocks = 0; + spin_lock_irq(&pool->lock); list_for_each_entry(page, &pool->page_list, page_list) { pages++; blocks += page->in_use; } + spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", diff --git a/mm/filemap.c b/mm/filemap.c index 22396713feb9..ef169f37156d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -39,11 +39,10 @@ /* * FIXME: remove all knowledge of the buffer layer from the core VM */ -#include <linux/buffer_head.h> /* for generic_osync_inode */ +#include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> - /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -59,7 +58,7 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_lock (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock @@ -105,6 +104,10 @@ * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) + * + * (code doesn't rely on that order, so you could switch it around) + * ->tasklist_lock (memory_failure, collect_procs_ao) + * ->i_mmap_lock */ /* @@ -120,6 +123,8 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); /* @@ -307,68 +312,24 @@ int wait_on_page_writeback_range(struct address_space *mapping, } /** - * sync_page_range - write and wait on all pages in the passed range - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Write and wait upon all the pages in the passed range. This is a "data - * integrity" operation. It waits upon in-flight writeout before starting and - * waiting upon new writeout. If there was an IO error, return it. + * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range + * @mapping: address space structure to wait for + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) * - * We need to re-take i_mutex during the generic_osync_inode list walk because - * it is otherwise livelockable. - */ -int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) { - mutex_lock(&inode->i_mutex); - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - mutex_unlock(&inode->i_mutex); - } - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range); - -/** - * sync_page_range_nolock - write & wait on all pages in the passed range without locking - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. * - * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea - * as it forces O_SYNC writers to different parts of the same file - * to be serialised right until io completion. + * This is just a simple wrapper so that callers don't have to convert offsets + * to page indexes themselves */ -int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) +int filemap_fdatawait_range(struct address_space *mapping, loff_t start, + loff_t end) { - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; + return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); } -EXPORT_SYMBOL(sync_page_range_nolock); +EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait - wait for all under-writeback pages to complete @@ -476,6 +437,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; @@ -1648,7 +1611,7 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, }; @@ -2167,20 +2130,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } *ppos = end; } - - /* - * Sync the fs metadata but not the minor inode changes and - * of course not the data as we did direct DMA for the IO. - * i_mutex is held, which protects generic_osync_inode() from - * livelocking. AIO O_DIRECT ops attempt to sync metadata here. - */ out: - if ((written >= 0 || written == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (err < 0) - written = err; - } return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -2272,6 +2222,7 @@ again: pagefault_enable(); flush_dcache_page(page); + mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) @@ -2311,8 +2262,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; ssize_t status; struct iov_iter i; @@ -2322,16 +2271,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (likely(status >= 0)) { written += status; *ppos = pos + status; - - /* - * For now, when the user asks for O_SYNC, we'll actually give - * O_DSYNC - */ - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - } } /* @@ -2347,9 +2286,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_buffered_write); -static ssize_t -__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +/** + * __generic_file_aio_write - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @ppos: position where to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_mutex to be grabbed unless we work on a block device or similar + * object which does not need locking at all. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_mutex. + */ +ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; @@ -2446,51 +2403,37 @@ out: current->backing_dev_info = NULL; return written ? written : err; } +EXPORT_SYMBOL(__generic_file_aio_write); -ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_aio_write_nolock); - +/** + * generic_file_aio_write - write data to a file + * @iocb: IO state structure + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @pos: position in file where to write + * + * This is a wrapper around __generic_file_aio_write() to be used by most + * filesystems. It takes care of syncing the file in case of O_SYNC file + * and acquires i_mutex as needed. + */ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; - err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) ret = err; } return ret; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 427dfe3ce78c..1888b2d71bb8 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -296,7 +296,7 @@ out: } } -static struct vm_operations_struct xip_file_vm_ops = { +static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d0351e31f474..5d7601b02874 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return 1UL << (hstate->order + PAGE_SHIFT); } +EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority @@ -455,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(struct hstate *h) -{ - int nid; - struct page *page = NULL; - - for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - break; - } - } - return page; -} - static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) @@ -640,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) /* * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: + * copy it back to next_nid_to_alloc afterwards: * otherwise there's a window in which a racer might * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really @@ -649,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ -static int hstate_next_node(struct hstate *h) +static int hstate_next_node_to_alloc(struct hstate *h) { int next_nid; - next_nid = next_node(h->hugetlb_next_nid, node_online_map); + next_nid = next_node(h->next_nid_to_alloc, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; + h->next_nid_to_alloc = next_nid; return next_nid; } @@ -666,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h) int next_nid; int ret = 0; - start_nid = h->hugetlb_next_nid; + start_nid = h->next_nid_to_alloc; + next_nid = start_nid; do { - page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, next_nid); if (page) ret = 1; - next_nid = hstate_next_node(h); - } while (!page && h->hugetlb_next_nid != start_nid); + next_nid = hstate_next_node_to_alloc(h); + } while (!page && next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -683,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h) return ret; } +/* + * helper for free_pool_huge_page() - find next node + * from which to free a huge page + */ +static int hstate_next_node_to_free(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->next_nid_to_free, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->next_nid_to_free = next_nid; + return next_nid; +} + +/* + * Free huge page from pool from next node to free. + * Attempt to keep persistent huge pages more or less + * balanced over allowed nodes. + * Called with hugetlb_lock locked. + */ +static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +{ + int start_nid; + int next_nid; + int ret = 0; + + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + /* + * If we're returning unused surplus pages, only examine + * nodes with surplus pages. + */ + if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && + !list_empty(&h->hugepage_freelists[next_nid])) { + struct page *page = + list_entry(h->hugepage_freelists[next_nid].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[next_nid]--; + if (acct_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[next_nid]--; + } + update_and_free_page(h, page); + ret = 1; + } + next_nid = hstate_next_node_to_free(h); + } while (!ret && next_nid != start_nid); + + return ret; +} + static struct page *alloc_buddy_huge_page(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { @@ -854,22 +893,13 @@ free: * When releasing a hugetlb pool reservation, any surplus pages that were * allocated to satisfy the reservation must be explicitly freed if they were * never used. + * Called with hugetlb_lock held. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { - static int nid = -1; - struct page *page; unsigned long nr_pages; - /* - * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - */ - unsigned long remaining_iterations = nr_online_nodes; - /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -879,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h, nr_pages = min(unused_resv_pages, h->surplus_huge_pages); - while (remaining_iterations-- && nr_pages) { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - if (!h->surplus_huge_pages_node[nid]) - continue; - - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; - nr_pages--; - remaining_iterations = nr_online_nodes; - } + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes. Iterate across all nodes until we + * can no longer free unreserved surplus pages. This occurs when + * the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the frees across the + * on-line nodes for us and will handle the hstate accounting. + */ + while (nr_pages--) { + if (!free_pool_huge_page(h, 1)) + break; } } @@ -1007,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(h->hugetlb_next_nid), + NODE_DATA(h->next_nid_to_alloc), huge_page_size(h), huge_page_size(h), 0); + hstate_next_node_to_alloc(h); if (addr) { /* * Use the beginning of the huge page to store the @@ -1019,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - hstate_next_node(h); nr_nodes--; } return 0; @@ -1140,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) */ static int adjust_pool_surplus(struct hstate *h, int delta) { - static int prev_nid; - int nid = prev_nid; + int start_nid, next_nid; int ret = 0; VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= + if (delta < 0) + start_nid = h->next_nid_to_alloc; + else + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + int nid = next_nid; + if (delta < 0) { + next_nid = hstate_next_node_to_alloc(h); + /* + * To shrink on this node, there must be a surplus page + */ + if (!h->surplus_huge_pages_node[nid]) + continue; + } + if (delta > 0) { + next_nid = hstate_next_node_to_free(h); + /* + * Surplus cannot exceed the total number of pages + */ + if (h->surplus_huge_pages_node[nid] >= h->nr_huge_pages_node[nid]) - continue; + continue; + } h->surplus_huge_pages += delta; h->surplus_huge_pages_node[nid] += delta; ret = 1; break; - } while (nid != prev_nid); + } while (next_nid != start_nid); - prev_nid = nid; return ret; } @@ -1226,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) min_count = max(count, min_count); try_to_free_low(h, min_count); while (min_count < persistent_huge_pages(h)) { - struct page *page = dequeue_huge_page(h); - if (!page) + if (!free_pool_huge_page(h, 0)) break; - update_and_free_page(h, page); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, 1)) @@ -1441,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->hugetlb_next_nid = first_node(node_online_map); + h->next_nid_to_alloc = first_node(node_online_map); + h->next_nid_to_free = first_node(node_online_map); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1505,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1516,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) h->max_huge_pages = set_max_huge_pages(h, tmp); @@ -1525,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (hugepages_treat_as_movable) htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; else @@ -1537,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, } int hugetlb_overcommit_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1548,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) { spin_lock(&hugetlb_lock); @@ -1689,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -struct vm_operations_struct hugetlb_vm_ops = { +const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, @@ -1984,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, return find_lock_page(mapping, idx); } +/* + * Return whether there is a pagecache page to back given address within VMA. + * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. + */ +static bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + struct page *page; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + page = find_get_page(mapping, idx); + if (page) + put_page(page); + return page != NULL; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, unsigned int flags) { @@ -2179,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } -static int huge_zeropage_ok(pte_t *ptep, int write, int shared) -{ - if (!ptep || write || shared) - return 0; - else - return huge_pte_none(huge_ptep_get(ptep)); -} - int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, - int write) + unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); - int zeropage_ok = 0; - int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + int absent; struct page *page; /* * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make * sure we get the + * each hugepage. We have to make sure we get the * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); - if (huge_zeropage_ok(pte, write, shared)) - zeropage_ok = 1; + absent = !pte || huge_pte_none(huge_ptep_get(pte)); + + /* + * When coredumping, it suits get_dump_page if we just return + * an error where there's an empty slot with no huge pagecache + * to back it. This way, we avoid allocating a hugepage, and + * the sparse dumpfile avoids allocating disk blocks, but its + * huge holes still show up with zeroes where they need to be. + */ + if (absent && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, vaddr)) { + remainder = 0; + break; + } - if (!pte || - (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || - (write && !pte_write(huge_ptep_get(pte)))) { + if (absent || + ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); - ret = hugetlb_fault(mm, vma, vaddr, write); + ret = hugetlb_fault(mm, vma, vaddr, + (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; - if (!i) - i = -EFAULT; break; } @@ -2234,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - if (zeropage_ok) - pages[i] = ZERO_PAGE(0); - else - pages[i] = mem_map_offset(page, pfn_offset); + pages[i] = mem_map_offset(page, pfn_offset); get_page(pages[i]); } @@ -2261,7 +2311,7 @@ same_page: *length = remainder; *position = vaddr; - return i; + return i ? i : -EFAULT; } void hugetlb_change_protection(struct vm_area_struct *vma, @@ -2370,7 +2420,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= blocks_per_huge_page(h); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c new file mode 100644 index 000000000000..e1d85137f086 --- /dev/null +++ b/mm/hwpoison-inject.c @@ -0,0 +1,41 @@ +/* Inject a hwpoison memory failure on a arbitary pfn */ +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +static struct dentry *hwpoison_dir, *corrupt_pfn; + +static int hwpoison_inject(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); + return __memory_failure(val, 18, 0); +} + +DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); + +static void pfn_inject_exit(void) +{ + if (hwpoison_dir) + debugfs_remove_recursive(hwpoison_dir); +} + +static int pfn_inject_init(void) +{ + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); + if (hwpoison_dir == NULL) + return -ENOMEM; + corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + NULL, &hwpoison_fops); + if (corrupt_pfn == NULL) { + pfn_inject_exit(); + return -ENOMEM; + } + return 0; +} + +module_init(pfn_inject_init); +module_exit(pfn_inject_exit); +MODULE_LICENSE("GPL"); diff --git a/mm/internal.h b/mm/internal.h index f290c4db528b..22ec8d2b0fb8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,8 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +extern unsigned long highest_memmap_pfn; + /* * in mm/vmscan.c: */ @@ -46,7 +48,6 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); @@ -250,13 +251,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -#define GUP_FLAGS_WRITE 0x1 -#define GUP_FLAGS_FORCE 0x2 -#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 -#define GUP_FLAGS_IGNORE_SIGKILL 0x8 - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, + unsigned long start, int len, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas); #define ZONE_RECLAIM_NOSCAN -2 diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc6f523..177a5169bbde 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c96f2c8700aa..4ea4510e2996 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -48,10 +48,10 @@ * scanned. This list is only modified during a scanning episode when the * scan_mutex is held. At the end of a scan, the gray_list is always empty. * Note that the kmemleak_object.use_count is incremented when an object is - * added to the gray_list and therefore cannot be freed - * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs - * file together with modifications to the memory scanning parameters - * including the scan_thread pointer + * added to the gray_list and therefore cannot be freed. This mutex also + * prevents multiple users of the "kmemleak" debugfs file together with + * modifications to the memory scanning parameters including the scan_thread + * pointer * * The kmemleak_object structures have a use_count incremented or decremented * using the get_object()/put_object() functions. When the use_count becomes @@ -92,22 +92,24 @@ #include <linux/string.h> #include <linux/nodemask.h> #include <linux/mm.h> +#include <linux/workqueue.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/atomic.h> +#include <linux/kmemcheck.h> #include <linux/kmemleak.h> /* * Kmemleak configuration and common defines. */ #define MAX_TRACE 16 /* stack trace length */ -#define REPORTS_NR 50 /* maximum number of reported leaks */ #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ -#define MSECS_SCAN_YIELD 10 /* CPU yielding period */ #define SECS_FIRST_SCAN 60 /* delay before the first scan */ #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ +#define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ +#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ #define BYTES_PER_POINTER sizeof(void *) @@ -121,6 +123,9 @@ struct kmemleak_scan_area { size_t length; }; +#define KMEMLEAK_GREY 0 +#define KMEMLEAK_BLACK -1 + /* * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the @@ -159,6 +164,17 @@ struct kmemleak_object { #define OBJECT_REPORTED (1 << 1) /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +/* flag set on newly allocated objects */ +#define OBJECT_NEW (1 << 3) + +/* number of bytes to print per line; must be 16 or 32 */ +#define HEX_ROW_SIZE 16 +/* number of bytes to print at a time (1, 2, 4, 8) */ +#define HEX_GROUP_SIZE 1 +/* include ASCII after the hex output */ +#define HEX_ASCII 1 +/* max number of lines to be printed */ +#define HEX_MAX_LINES 2 /* the list of all allocated objects */ static LIST_HEAD(object_list); @@ -186,22 +202,16 @@ static atomic_t kmemleak_error = ATOMIC_INIT(0); static unsigned long min_addr = ULONG_MAX; static unsigned long max_addr; -/* used for yielding the CPU to other tasks during scanning */ -static unsigned long next_scan_yield; static struct task_struct *scan_thread; -static unsigned long jiffies_scan_yield; +/* used to avoid reporting of recently allocated objects */ static unsigned long jiffies_min_age; +static unsigned long jiffies_last_scan; /* delay between automatic memory scannings */ static signed long jiffies_scan_wait; /* enables or disables the task stacks scanning */ -static int kmemleak_stack_scan; -/* mutex protecting the memory scanning */ +static int kmemleak_stack_scan = 1; +/* protects the memory scanning, parameters and debug/kmemleak file access */ static DEFINE_MUTEX(scan_mutex); -/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ -static DEFINE_MUTEX(kmemleak_mutex); - -/* number of leaks reported (for limitation purposes) */ -static int reported_leaks; /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -215,6 +225,7 @@ static int reported_leaks; enum { KMEMLEAK_ALLOC, KMEMLEAK_FREE, + KMEMLEAK_FREE_PART, KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, @@ -232,11 +243,14 @@ struct early_log { int min_count; /* minimum reference count */ unsigned long offset; /* scan area offset */ size_t length; /* scan area length */ + unsigned long trace[MAX_TRACE]; /* stack trace */ + unsigned int trace_len; /* stack trace length */ }; /* early logging buffer and current position */ -static struct early_log early_log[200]; -static int crt_early_log; +static struct early_log + early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; +static int crt_early_log __initdata; static void kmemleak_disable(void); @@ -259,6 +273,35 @@ static void kmemleak_disable(void); } while (0) /* + * Printing of the objects hex dump to the seq file. The number of lines to be + * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The + * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called + * with the object->lock held. + */ +static void hex_dump_object(struct seq_file *seq, + struct kmemleak_object *object) +{ + const u8 *ptr = (const u8 *)object->pointer; + int i, len, remaining; + unsigned char linebuf[HEX_ROW_SIZE * 5]; + + /* limit the number of lines to HEX_MAX_LINES */ + remaining = len = + min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); + + seq_printf(seq, " hex dump (first %d bytes):\n", len); + for (i = 0; i < len; i += HEX_ROW_SIZE) { + int linelen = min(remaining, HEX_ROW_SIZE); + + remaining -= HEX_ROW_SIZE; + hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE, + HEX_GROUP_SIZE, linebuf, sizeof(linebuf), + HEX_ASCII); + seq_printf(seq, " %s\n", linebuf); + } +} + +/* * Object colors, encoded with count and min_count: * - white - orphan object, not enough references to it (count < min_count) * - gray - not orphan, not marked as false positive (min_count == 0) or @@ -268,23 +311,21 @@ static void kmemleak_disable(void); * Newly created objects don't have any color assigned (object->count == -1) * before the next memory scan when they become white. */ -static int color_white(const struct kmemleak_object *object) +static bool color_white(const struct kmemleak_object *object) { - return object->count != -1 && object->count < object->min_count; + return object->count != KMEMLEAK_BLACK && + object->count < object->min_count; } -static int color_gray(const struct kmemleak_object *object) +static bool color_gray(const struct kmemleak_object *object) { - return object->min_count != -1 && object->count >= object->min_count; + return object->min_count != KMEMLEAK_BLACK && + object->count >= object->min_count; } -/* - * Objects are considered referenced if their color is gray and they have not - * been deleted. - */ -static int referenced_object(struct kmemleak_object *object) +static bool color_black(const struct kmemleak_object *object) { - return (object->flags & OBJECT_ALLOCATED) && color_gray(object); + return object->min_count == KMEMLEAK_BLACK; } /* @@ -292,45 +333,32 @@ static int referenced_object(struct kmemleak_object *object) * not be deleted and have a minimum age to avoid false positives caused by * pointers temporarily stored in CPU registers. */ -static int unreferenced_object(struct kmemleak_object *object) +static bool unreferenced_object(struct kmemleak_object *object) { return (object->flags & OBJECT_ALLOCATED) && color_white(object) && - time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); + time_before_eq(object->jiffies + jiffies_min_age, + jiffies_last_scan); } /* - * Printing of the (un)referenced objects information, either to the seq file - * or to the kernel log. The print_referenced/print_unreferenced functions - * must be called with the object->lock held. + * Printing of the unreferenced objects information to the seq file. The + * print_unreferenced function must be called with the object->lock held. */ -#define print_helper(seq, x...) do { \ - struct seq_file *s = (seq); \ - if (s) \ - seq_printf(s, x); \ - else \ - pr_info(x); \ -} while (0) - -static void print_referenced(struct kmemleak_object *object) -{ - pr_info("referenced object 0x%08lx (size %zu)\n", - object->pointer, object->size); -} - static void print_unreferenced(struct seq_file *seq, struct kmemleak_object *object) { int i; - print_helper(seq, "unreferenced object 0x%08lx (size %zu):\n", - object->pointer, object->size); - print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); - print_helper(seq, " backtrace:\n"); + seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + object->pointer, object->size); + seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", + object->comm, object->pid, object->jiffies); + hex_dump_object(seq, object); + seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { void *ptr = (void *)object->trace[i]; - print_helper(seq, " [<%p>] %pS\n", ptr, ptr); + seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); } } @@ -352,6 +380,7 @@ static void dump_object_info(struct kmemleak_object *object) object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); + pr_notice(" flags = 0x%lx\n", object->flags); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -456,21 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) } /* + * Save stack trace to the given array of MAX_TRACE size. + */ +static int __save_stack_trace(unsigned long *trace) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + + return stack_trace.nr_entries; +} + +/* * Create the metadata (struct kmemleak_object) corresponding to an allocated * memory block and add it to the object_list and object_tree_root. */ -static void create_object(unsigned long ptr, size_t size, int min_count, - gfp_t gfp) +static struct kmemleak_object *create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; struct prio_tree_node *node; - struct stack_trace trace; object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); if (!object) { kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); - return; + return NULL; } INIT_LIST_HEAD(&object->object_list); @@ -478,7 +522,7 @@ static void create_object(unsigned long ptr, size_t size, int min_count, INIT_HLIST_HEAD(&object->area_list); spin_lock_init(&object->lock); atomic_set(&object->use_count, 1); - object->flags = OBJECT_ALLOCATED; + object->flags = OBJECT_ALLOCATED | OBJECT_NEW; object->pointer = ptr; object->size = size; object->min_count = min_count; @@ -504,18 +548,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count, } /* kernel backtrace */ - trace.max_entries = MAX_TRACE; - trace.nr_entries = 0; - trace.entries = object->trace; - trace.skip = 1; - save_stack_trace(&trace); - object->trace_len = trace.nr_entries; + object->trace_len = __save_stack_trace(object->trace); INIT_PRIO_TREE_NODE(&object->tree_node); object->tree_node.start = ptr; object->tree_node.last = ptr + size - 1; write_lock_irqsave(&kmemleak_lock, flags); + min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); node = prio_tree_insert(&object_tree_root, &object->tree_node); @@ -526,101 +566,157 @@ static void create_object(unsigned long ptr, size_t size, int min_count, * random memory blocks. */ if (node != &object->tree_node) { - unsigned long flags; - kmemleak_stop("Cannot insert 0x%lx into the object search tree " "(already existing)\n", ptr); object = lookup_object(ptr, 1); - spin_lock_irqsave(&object->lock, flags); + spin_lock(&object->lock); dump_object_info(object); - spin_unlock_irqrestore(&object->lock, flags); + spin_unlock(&object->lock); goto out; } list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); + return object; } /* * Remove the metadata (struct kmemleak_object) for a memory block from the * object_list and object_tree_root and decrement its use_count. */ -static void delete_object(unsigned long ptr) +static void __delete_object(struct kmemleak_object *object) { unsigned long flags; - struct kmemleak_object *object; write_lock_irqsave(&kmemleak_lock, flags); - object = lookup_object(ptr, 0); - if (!object) { - kmemleak_warn("Freeing unknown object at 0x%08lx\n", - ptr); - write_unlock_irqrestore(&kmemleak_lock, flags); - return; - } prio_tree_remove(&object_tree_root, &object->tree_node); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); WARN_ON(!(object->flags & OBJECT_ALLOCATED)); - WARN_ON(atomic_read(&object->use_count) < 1); + WARN_ON(atomic_read(&object->use_count) < 2); /* * Locking here also ensures that the corresponding memory block * cannot be freed when it is being scanned. */ spin_lock_irqsave(&object->lock, flags); - if (object->flags & OBJECT_REPORTED) - print_referenced(object); object->flags &= ~OBJECT_ALLOCATED; spin_unlock_irqrestore(&object->lock, flags); put_object(object); } /* - * Make a object permanently as gray-colored so that it can no longer be - * reported as a leak. This is used in general to mark a false positive. + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. */ -static void make_gray_object(unsigned long ptr) +static void delete_object_full(unsigned long ptr) { - unsigned long flags; struct kmemleak_object *object; object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("Graying unknown object at 0x%08lx\n", ptr); +#ifdef DEBUG + kmemleak_warn("Freeing unknown object at 0x%08lx\n", + ptr); +#endif return; } - - spin_lock_irqsave(&object->lock, flags); - object->min_count = 0; - spin_unlock_irqrestore(&object->lock, flags); + __delete_object(object); put_object(object); } /* - * Mark the object as black-colored so that it is ignored from scans and - * reporting. + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. If the memory block is partially freed, the function may create + * additional metadata for the remaining parts of the block. */ -static void make_black_object(unsigned long ptr) +static void delete_object_part(unsigned long ptr, size_t size) { - unsigned long flags; struct kmemleak_object *object; + unsigned long start, end; - object = find_and_get_object(ptr, 0); + object = find_and_get_object(ptr, 1); if (!object) { - kmemleak_warn("Blacking unknown object at 0x%08lx\n", ptr); +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx " + "(size %zu)\n", ptr, size); +#endif return; } + __delete_object(object); + + /* + * Create one or two objects that may result from the memory block + * split. Note that partial freeing is only done by free_bootmem() and + * this happens before kmemleak_init() is called. The path below is + * only executed during early log recording in kmemleak_init(), so + * GFP_KERNEL is enough. + */ + start = object->pointer; + end = object->pointer + object->size; + if (ptr > start) + create_object(start, ptr - start, object->min_count, + GFP_KERNEL); + if (ptr + size < end) + create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL); + + put_object(object); +} + +static void __paint_it(struct kmemleak_object *object, int color) +{ + object->min_count = color; + if (color == KMEMLEAK_BLACK) + object->flags |= OBJECT_NO_SCAN; +} + +static void paint_it(struct kmemleak_object *object, int color) +{ + unsigned long flags; spin_lock_irqsave(&object->lock, flags); - object->min_count = -1; + __paint_it(object, color); spin_unlock_irqrestore(&object->lock, flags); +} + +static void paint_ptr(unsigned long ptr, int color) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Trying to color unknown object " + "at 0x%08lx as %s\n", ptr, + (color == KMEMLEAK_GREY) ? "Grey" : + (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); + return; + } + paint_it(object, color); put_object(object); } /* + * Make a object permanently as gray-colored so that it can no longer be + * reported as a leak. This is used in general to mark a false positive. + */ +static void make_gray_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_GREY); +} + +/* + * Mark the object as black-colored so that it is ignored from scans and + * reporting. + */ +static void make_black_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_BLACK); +} + +/* * Add a scanning area to the object. If at least one such area is added, * kmemleak will only scan these ranges rather than the whole memory block. */ @@ -689,14 +785,16 @@ static void object_no_scan(unsigned long ptr) * Log an early kmemleak_* call to the early_log buffer. These calls will be * processed later once kmemleak is fully initialized. */ -static void log_early(int op_type, const void *ptr, size_t size, - int min_count, unsigned long offset, size_t length) +static void __init log_early(int op_type, const void *ptr, size_t size, + int min_count, unsigned long offset, size_t length) { unsigned long flags; struct early_log *log; if (crt_early_log >= ARRAY_SIZE(early_log)) { - kmemleak_stop("Early log buffer exceeded\n"); + pr_warning("Early log buffer exceeded, " + "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n"); + kmemleak_disable(); return; } @@ -712,16 +810,45 @@ static void log_early(int op_type, const void *ptr, size_t size, log->min_count = min_count; log->offset = offset; log->length = length; + if (op_type == KMEMLEAK_ALLOC) + log->trace_len = __save_stack_trace(log->trace); crt_early_log++; local_irq_restore(flags); } /* + * Log an early allocated block and populate the stack trace. + */ +static void early_alloc(struct early_log *log) +{ + struct kmemleak_object *object; + unsigned long flags; + int i; + + if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + return; + + /* + * RCU locking needed to ensure object is not freed via put_object(). + */ + rcu_read_lock(); + object = create_object((unsigned long)log->ptr, log->size, + log->min_count, GFP_KERNEL); + spin_lock_irqsave(&object->lock, flags); + for (i = 0; i < log->trace_len; i++) + object->trace[i] = log->trace[i]; + object->trace_len = log->trace_len; + spin_unlock_irqrestore(&object->lock, flags); + rcu_read_unlock(); +} + +/* * Memory allocation function callback. This function is called from the * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, * vmalloc etc.). */ -void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) +void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); @@ -736,22 +863,37 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); * Memory freeing function callback. This function is called from the kernel * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). */ -void kmemleak_free(const void *ptr) +void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - delete_object((unsigned long)ptr); + delete_object_full((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); /* + * Partial memory freeing function callback. This function is usually called + * from bootmem allocator when (part of) a memory block is freed. + */ +void __ref kmemleak_free_part(const void *ptr, size_t size) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + delete_object_part((unsigned long)ptr, size); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_part); + +/* * Mark an already allocated memory block as a false positive. This will cause * the block to no longer be reported as leak and always be scanned. */ -void kmemleak_not_leak(const void *ptr) +void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -767,7 +909,7 @@ EXPORT_SYMBOL(kmemleak_not_leak); * corresponding block is not a leak and does not contain any references to * other allocated memory blocks. */ -void kmemleak_ignore(const void *ptr) +void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -781,8 +923,8 @@ EXPORT_SYMBOL(kmemleak_ignore); /* * Limit the range to be scanned in an allocated memory block. */ -void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, - gfp_t gfp) +void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, + size_t length, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -796,7 +938,7 @@ EXPORT_SYMBOL(kmemleak_scan_area); /* * Inform kmemleak not to scan the given memory block. */ -void kmemleak_no_scan(const void *ptr) +void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); @@ -808,21 +950,6 @@ void kmemleak_no_scan(const void *ptr) EXPORT_SYMBOL(kmemleak_no_scan); /* - * Yield the CPU so that other tasks get a chance to run. The yielding is - * rate-limited to avoid excessive number of calls to the schedule() function - * during memory scanning. - */ -static void scan_yield(void) -{ - might_sleep(); - - if (time_is_before_eq_jiffies(next_scan_yield)) { - schedule(); - next_scan_yield = jiffies + jiffies_scan_yield; - } -} - -/* * Memory scanning is a long process and it needs to be interruptable. This * function checks whether such interrupt condition occured. */ @@ -848,28 +975,28 @@ static int scan_should_stop(void) * found to the gray list. */ static void scan_block(void *_start, void *_end, - struct kmemleak_object *scanned) + struct kmemleak_object *scanned, int allow_resched) { unsigned long *ptr; unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); for (ptr = start; ptr < end; ptr++) { - unsigned long flags; - unsigned long pointer = *ptr; struct kmemleak_object *object; + unsigned long flags; + unsigned long pointer; + if (allow_resched) + cond_resched(); if (scan_should_stop()) break; - /* - * When scanning a memory block with a corresponding - * kmemleak_object, the CPU yielding is handled in the calling - * code since it holds the object->lock to avoid the block - * freeing. - */ - if (!scanned) - scan_yield(); + /* don't scan uninitialized memory */ + if (!kmemcheck_is_obj_initialized((unsigned long)ptr, + BYTES_PER_POINTER)) + continue; + + pointer = *ptr; object = find_and_get_object(pointer, 1); if (!object) @@ -929,14 +1056,25 @@ static void scan_object(struct kmemleak_object *object) if (!(object->flags & OBJECT_ALLOCATED)) /* already freed object */ goto out; - if (hlist_empty(&object->area_list)) - scan_block((void *)object->pointer, - (void *)(object->pointer + object->size), object); - else + if (hlist_empty(&object->area_list)) { + void *start = (void *)object->pointer; + void *end = (void *)(object->pointer + object->size); + + while (start < end && (object->flags & OBJECT_ALLOCATED) && + !(object->flags & OBJECT_NO_SCAN)) { + scan_block(start, min(start + MAX_SCAN_SIZE, end), + object, 0); + start += MAX_SCAN_SIZE; + + spin_unlock_irqrestore(&object->lock, flags); + cond_resched(); + spin_lock_irqsave(&object->lock, flags); + } + } else hlist_for_each_entry(area, elem, &object->area_list, node) scan_block((void *)(object->pointer + area->offset), (void *)(object->pointer + area->offset - + area->length), object); + + area->length), object, 0); out: spin_unlock_irqrestore(&object->lock, flags); } @@ -950,8 +1088,11 @@ static void kmemleak_scan(void) { unsigned long flags; struct kmemleak_object *object, *tmp; - struct task_struct *task; int i; + int new_leaks = 0; + int gray_list_pass = 0; + + jiffies_last_scan = jiffies; /* prepare the kmemleak_object's */ rcu_read_lock(); @@ -970,6 +1111,7 @@ static void kmemleak_scan(void) #endif /* reset the reference count (whiten the object) */ object->count = 0; + object->flags &= ~OBJECT_NEW; if (color_gray(object) && get_object(object)) list_add_tail(&object->gray_list, &gray_list); @@ -978,14 +1120,14 @@ static void kmemleak_scan(void) rcu_read_unlock(); /* data/bss scanning */ - scan_block(_sdata, _edata, NULL); - scan_block(__bss_start, __bss_stop, NULL); + scan_block(_sdata, _edata, NULL, 1); + scan_block(__bss_start, __bss_stop, NULL, 1); #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) scan_block(__per_cpu_start + per_cpu_offset(i), - __per_cpu_end + per_cpu_offset(i), NULL); + __per_cpu_end + per_cpu_offset(i), NULL, 1); #endif /* @@ -1007,19 +1149,21 @@ static void kmemleak_scan(void) /* only scan if page is in use */ if (page_count(page) == 0) continue; - scan_block(page, page + 1, NULL); + scan_block(page, page + 1, NULL, 1); } } /* - * Scanning the task stacks may introduce false negatives and it is - * not enabled by default. + * Scanning the task stacks (may introduce false negatives). */ if (kmemleak_stack_scan) { + struct task_struct *p, *g; + read_lock(&tasklist_lock); - for_each_process(task) - scan_block(task_stack_page(task), - task_stack_page(task) + THREAD_SIZE, NULL); + do_each_thread(g, p) { + scan_block(task_stack_page(p), task_stack_page(p) + + THREAD_SIZE, NULL, 0); + } while_each_thread(g, p); read_unlock(&tasklist_lock); } @@ -1031,9 +1175,10 @@ static void kmemleak_scan(void) * kmemleak objects cannot be freed from outside the loop because their * use_count was increased. */ +repeat: object = list_entry(gray_list.next, typeof(*object), gray_list); while (&object->gray_list != &gray_list) { - scan_yield(); + cond_resched(); /* may add new objects to the list */ if (!scan_should_stop()) @@ -1048,7 +1193,59 @@ static void kmemleak_scan(void) object = tmp; } + + if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) + goto scan_end; + + /* + * Check for new objects allocated during this scanning and add them + * to the gray list. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_NEW) && !color_black(object) && + get_object(object)) { + object->flags &= ~OBJECT_NEW; + list_add_tail(&object->gray_list, &gray_list); + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (!list_empty(&gray_list)) + goto repeat; + +scan_end: WARN_ON(!list_empty(&gray_list)); + + /* + * If scanning was stopped or new objects were being allocated at a + * higher rate than gray list scanning, do not report any new + * unreferenced objects. + */ + if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) + return; + + /* + * Scanning result reporting. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if (unreferenced_object(object) && + !(object->flags & OBJECT_REPORTED)) { + object->flags |= OBJECT_REPORTED; + new_leaks++; + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); + + if (new_leaks) + pr_info("%d new suspected memory leaks (see " + "/sys/kernel/debug/kmemleak)\n", new_leaks); + } /* @@ -1060,6 +1257,7 @@ static int kmemleak_scan_thread(void *arg) static int first_run = 1; pr_info("Automatic memory scanning thread started\n"); + set_user_nice(current, 10); /* * Wait before the first scan to allow the system to fully initialize. @@ -1070,36 +1268,12 @@ static int kmemleak_scan_thread(void *arg) } while (!kthread_should_stop()) { - struct kmemleak_object *object; signed long timeout = jiffies_scan_wait; mutex_lock(&scan_mutex); - kmemleak_scan(); - reported_leaks = 0; - - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) { - unsigned long flags; - - if (reported_leaks >= REPORTS_NR) - break; - spin_lock_irqsave(&object->lock, flags); - if (!(object->flags & OBJECT_REPORTED) && - unreferenced_object(object)) { - print_unreferenced(NULL, object); - object->flags |= OBJECT_REPORTED; - reported_leaks++; - } else if ((object->flags & OBJECT_REPORTED) && - referenced_object(object)) { - print_referenced(object); - object->flags &= ~OBJECT_REPORTED; - } - spin_unlock_irqrestore(&object->lock, flags); - } - rcu_read_unlock(); - mutex_unlock(&scan_mutex); + /* wait before the next scan */ while (timeout && !kthread_should_stop()) timeout = schedule_timeout_interruptible(timeout); @@ -1112,9 +1286,9 @@ static int kmemleak_scan_thread(void *arg) /* * Start the automatic memory scanning thread. This function must be called - * with the kmemleak_mutex held. + * with the scan_mutex held. */ -void start_scan_thread(void) +static void start_scan_thread(void) { if (scan_thread) return; @@ -1127,9 +1301,9 @@ void start_scan_thread(void) /* * Stop the automatic memory scanning thread. This function must be called - * with the kmemleak_mutex held. + * with the scan_mutex held. */ -void stop_scan_thread(void) +static void stop_scan_thread(void) { if (scan_thread) { kthread_stop(scan_thread); @@ -1146,13 +1320,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) { struct kmemleak_object *object; loff_t n = *pos; + int err; - if (!n) { - kmemleak_scan(); - reported_leaks = 0; - } - if (reported_leaks >= REPORTS_NR) - return NULL; + err = mutex_lock_interruptible(&scan_mutex); + if (err < 0) + return ERR_PTR(err); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { @@ -1163,7 +1335,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) } object = NULL; out: - rcu_read_unlock(); return object; } @@ -1178,17 +1349,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct list_head *n = &prev_obj->object_list; ++(*pos); - if (reported_leaks >= REPORTS_NR) - goto out; - rcu_read_lock(); list_for_each_continue_rcu(n, &object_list) { next_obj = list_entry(n, struct kmemleak_object, object_list); if (get_object(next_obj)) break; } - rcu_read_unlock(); -out: + put_object(prev_obj); return next_obj; } @@ -1198,8 +1365,16 @@ out: */ static void kmemleak_seq_stop(struct seq_file *seq, void *v) { - if (v) - put_object(v); + if (!IS_ERR(v)) { + /* + * kmemleak_seq_start may return ERR_PTR if the scan_mutex + * waiting was interrupted, so only release it if !IS_ERR. + */ + rcu_read_unlock(); + mutex_unlock(&scan_mutex); + if (v) + put_object(v); + } } /* @@ -1211,11 +1386,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v) unsigned long flags; spin_lock_irqsave(&object->lock, flags); - if (!unreferenced_object(object)) - goto out; - print_unreferenced(seq, object); - reported_leaks++; -out: + if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) + print_unreferenced(seq, object); spin_unlock_irqrestore(&object->lock, flags); return 0; } @@ -1229,43 +1401,58 @@ static const struct seq_operations kmemleak_seq_ops = { static int kmemleak_open(struct inode *inode, struct file *file) { - int ret = 0; - if (!atomic_read(&kmemleak_enabled)) return -EBUSY; - ret = mutex_lock_interruptible(&kmemleak_mutex); - if (ret < 0) - goto out; - if (file->f_mode & FMODE_READ) { - ret = mutex_lock_interruptible(&scan_mutex); - if (ret < 0) - goto kmemleak_unlock; - ret = seq_open(file, &kmemleak_seq_ops); - if (ret < 0) - goto scan_unlock; - } - return ret; - -scan_unlock: - mutex_unlock(&scan_mutex); -kmemleak_unlock: - mutex_unlock(&kmemleak_mutex); -out: - return ret; + return seq_open(file, &kmemleak_seq_ops); } static int kmemleak_release(struct inode *inode, struct file *file) { - int ret = 0; + return seq_release(inode, file); +} - if (file->f_mode & FMODE_READ) { - seq_release(inode, file); - mutex_unlock(&scan_mutex); +static int dump_str_object_info(const char *str) +{ + unsigned long flags; + struct kmemleak_object *object; + unsigned long addr; + + addr= simple_strtoul(str, NULL, 0); + object = find_and_get_object(addr, 0); + if (!object) { + pr_info("Unknown object at 0x%08lx\n", addr); + return -EINVAL; } - mutex_unlock(&kmemleak_mutex); - return ret; + spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + return 0; +} + +/* + * We use grey instead of black to ensure we can do future scans on the same + * objects. If we did not do future scans these black objects could + * potentially contain references to newly allocated objects in the future and + * we'd end up with false positives. + */ +static void kmemleak_clear(void) +{ + struct kmemleak_object *object; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) + __paint_it(object, KMEMLEAK_GREY); + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); } /* @@ -1278,21 +1465,27 @@ static int kmemleak_release(struct inode *inode, struct file *file) * scan=off - stop the automatic memory scanning thread * scan=... - set the automatic memory scanning period in seconds (0 to * disable it) + * scan - trigger a memory scan + * clear - mark all current reported unreferenced kmemleak objects as + * grey to ignore printing them + * dump=... - dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) { char buf[64]; int buf_size; - - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; + int ret; buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; buf[buf_size] = 0; + ret = mutex_lock_interruptible(&scan_mutex); + if (ret < 0) + return ret; + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1305,18 +1498,28 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, stop_scan_thread(); else if (strncmp(buf, "scan=", 5) == 0) { unsigned long secs; - int err; - err = strict_strtoul(buf + 5, 0, &secs); - if (err < 0) - return err; + ret = strict_strtoul(buf + 5, 0, &secs); + if (ret < 0) + goto out; stop_scan_thread(); if (secs) { jiffies_scan_wait = msecs_to_jiffies(secs * 1000); start_scan_thread(); } - } else - return -EINVAL; + } else if (strncmp(buf, "scan", 4) == 0) + kmemleak_scan(); + else if (strncmp(buf, "clear", 5) == 0) + kmemleak_clear(); + else if (strncmp(buf, "dump=", 5) == 0) + ret = dump_str_object_info(buf + 5); + else + ret = -EINVAL; + +out: + mutex_unlock(&scan_mutex); + if (ret < 0) + return ret; /* ignore the rest of the buffer, only one command at a time */ *ppos += size; @@ -1336,36 +1539,21 @@ static const struct file_operations kmemleak_fops = { * Perform the freeing of the kmemleak internal objects after waiting for any * current memory scan to complete. */ -static int kmemleak_cleanup_thread(void *arg) +static void kmemleak_do_cleanup(struct work_struct *work) { struct kmemleak_object *object; - mutex_lock(&kmemleak_mutex); + mutex_lock(&scan_mutex); stop_scan_thread(); - mutex_unlock(&kmemleak_mutex); - mutex_lock(&scan_mutex); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) - delete_object(object->pointer); + delete_object_full(object->pointer); rcu_read_unlock(); mutex_unlock(&scan_mutex); - - return 0; } -/* - * Start the clean-up thread. - */ -static void kmemleak_cleanup(void) -{ - struct task_struct *cleanup_thread; - - cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, - "kmemleak-clean"); - if (IS_ERR(cleanup_thread)) - pr_warning("Failed to create the clean-up thread\n"); -} +static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); /* * Disable kmemleak. No memory allocation/freeing will be traced once this @@ -1383,7 +1571,7 @@ static void kmemleak_disable(void) /* check whether it is too early for a kernel thread */ if (atomic_read(&kmemleak_initialized)) - kmemleak_cleanup(); + schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); } @@ -1411,7 +1599,6 @@ void __init kmemleak_init(void) int i; unsigned long flags; - jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); @@ -1437,12 +1624,14 @@ void __init kmemleak_init(void) switch (log->op_type) { case KMEMLEAK_ALLOC: - kmemleak_alloc(log->ptr, log->size, log->min_count, - GFP_KERNEL); + early_alloc(log); break; case KMEMLEAK_FREE: kmemleak_free(log->ptr); break; + case KMEMLEAK_FREE_PART: + kmemleak_free_part(log->ptr, log->size); + break; case KMEMLEAK_NOT_LEAK: kmemleak_not_leak(log->ptr); break; @@ -1478,7 +1667,7 @@ static int __init kmemleak_late_init(void) * after setting kmemleak_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. */ - kmemleak_cleanup(); + schedule_work(&cleanup_work); return -ENOMEM; } @@ -1486,9 +1675,9 @@ static int __init kmemleak_late_init(void) &kmemleak_fops); if (!dentry) pr_warning("Failed to create the debugfs kmemleak file\n"); - mutex_lock(&kmemleak_mutex); + mutex_lock(&scan_mutex); start_scan_thread(); - mutex_unlock(&kmemleak_mutex); + mutex_unlock(&scan_mutex); pr_info("Kernel memory leak detector initialized\n"); diff --git a/mm/ksm.c b/mm/ksm.c new file mode 100644 index 000000000000..f7edac356f46 --- /dev/null +++ b/mm/ksm.c @@ -0,0 +1,1711 @@ +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork() + * + * Copyright (C) 2008-2009 Red Hat, Inc. + * Authors: + * Izik Eidus + * Andrea Arcangeli + * Chris Wright + * Hugh Dickins + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/spinlock.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/wait.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/mmu_notifier.h> +#include <linux/swap.h> +#include <linux/ksm.h> + +#include <asm/tlbflush.h> + +/* + * A few notes about the KSM scanning process, + * to make it easier to understand the data structures below: + * + * In order to reduce excessive scanning, KSM sorts the memory pages by their + * contents into a data structure that holds pointers to the pages' locations. + * + * Since the contents of the pages may change at any moment, KSM cannot just + * insert the pages into a normal sorted tree and expect it to find anything. + * Therefore KSM uses two data structures - the stable and the unstable tree. + * + * The stable tree holds pointers to all the merged pages (ksm pages), sorted + * by their contents. Because each such page is write-protected, searching on + * this tree is fully assured to be working (except when pages are unmapped), + * and therefore this tree is called the stable tree. + * + * In addition to the stable tree, KSM uses a second data structure called the + * unstable tree: this tree holds pointers to pages which have been found to + * be "unchanged for a period of time". The unstable tree sorts these pages + * by their contents, but since they are not write-protected, KSM cannot rely + * upon the unstable tree to work correctly - the unstable tree is liable to + * be corrupted as its contents are modified, and so it is called unstable. + * + * KSM solves this problem by several techniques: + * + * 1) The unstable tree is flushed every time KSM completes scanning all + * memory areas, and then the tree is rebuilt again from the beginning. + * 2) KSM will only insert into the unstable tree, pages whose hash value + * has not changed since the previous scan of all memory areas. + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the + * colors of the nodes and not on their contents, assuring that even when + * the tree gets "corrupted" it won't get out of balance, so scanning time + * remains the same (also, searching and inserting nodes in an rbtree uses + * the same algorithm, so we have no overhead when we flush and rebuild). + * 4) KSM never flushes the stable tree, which means that even if it were to + * take 10 attempts to find a page in the unstable tree, once it is found, + * it is secured in the stable tree. (When we scan a new page, we first + * compare it against the stable tree, and then against the unstable tree.) + */ + +/** + * struct mm_slot - ksm information per mm that is being scanned + * @link: link to the mm_slots hash list + * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * @rmap_list: head for this mm_slot's list of rmap_items + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node link; + struct list_head mm_list; + struct list_head rmap_list; + struct mm_struct *mm; +}; + +/** + * struct ksm_scan - cursor for scanning + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * @rmap_item: the current rmap that we are scanning inside the rmap_list + * @seqnr: count of completed full scans (needed when removing unstable node) + * + * There is only the one ksm_scan instance of this cursor structure. + */ +struct ksm_scan { + struct mm_slot *mm_slot; + unsigned long address; + struct rmap_item *rmap_item; + unsigned long seqnr; +}; + +/** + * struct rmap_item - reverse mapping item for virtual addresses + * @link: link into mm_slot's rmap_list (rmap_list is per mm) + * @mm: the memory structure this rmap_item is pointing into + * @address: the virtual address this rmap_item tracks (+ flags in low bits) + * @oldchecksum: previous checksum of the page at that virtual address + * @node: rb_node of this rmap_item in either unstable or stable tree + * @next: next rmap_item hanging off the same node of the stable tree + * @prev: previous rmap_item hanging off the same node of the stable tree + */ +struct rmap_item { + struct list_head link; + struct mm_struct *mm; + unsigned long address; /* + low bits used for flags below */ + union { + unsigned int oldchecksum; /* when unstable */ + struct rmap_item *next; /* when stable */ + }; + union { + struct rb_node node; /* when tree node */ + struct rmap_item *prev; /* in stable list */ + }; +}; + +#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ +#define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ +#define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ + +/* The stable and unstable tree heads */ +static struct rb_root root_stable_tree = RB_ROOT; +static struct rb_root root_unstable_tree = RB_ROOT; + +#define MM_SLOTS_HASH_HEADS 1024 +static struct hlist_head *mm_slots_hash; + +static struct mm_slot ksm_mm_head = { + .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), +}; +static struct ksm_scan ksm_scan = { + .mm_slot = &ksm_mm_head, +}; + +static struct kmem_cache *rmap_item_cache; +static struct kmem_cache *mm_slot_cache; + +/* The number of nodes in the stable tree */ +static unsigned long ksm_pages_shared; + +/* The number of page slots additionally sharing those nodes */ +static unsigned long ksm_pages_sharing; + +/* The number of nodes in the unstable tree */ +static unsigned long ksm_pages_unshared; + +/* The number of rmap_items in use: to calculate pages_volatile */ +static unsigned long ksm_rmap_items; + +/* Limit on the number of unswappable pages used */ +static unsigned long ksm_max_kernel_pages; + +/* Number of pages ksmd should scan in one batch */ +static unsigned int ksm_thread_pages_to_scan = 100; + +/* Milliseconds ksmd should sleep between batches */ +static unsigned int ksm_thread_sleep_millisecs = 20; + +#define KSM_RUN_STOP 0 +#define KSM_RUN_MERGE 1 +#define KSM_RUN_UNMERGE 2 +static unsigned int ksm_run = KSM_RUN_STOP; + +static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); +static DEFINE_MUTEX(ksm_thread_mutex); +static DEFINE_SPINLOCK(ksm_mmlist_lock); + +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ + sizeof(struct __struct), __alignof__(struct __struct),\ + (__flags), NULL) + +static void __init ksm_init_max_kernel_pages(void) +{ + ksm_max_kernel_pages = nr_free_buffer_pages() / 4; +} + +static int __init ksm_slab_init(void) +{ + rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); + if (!rmap_item_cache) + goto out; + + mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); + if (!mm_slot_cache) + goto out_free; + + return 0; + +out_free: + kmem_cache_destroy(rmap_item_cache); +out: + return -ENOMEM; +} + +static void __init ksm_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + kmem_cache_destroy(rmap_item_cache); + mm_slot_cache = NULL; +} + +static inline struct rmap_item *alloc_rmap_item(void) +{ + struct rmap_item *rmap_item; + + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + if (rmap_item) + ksm_rmap_items++; + return rmap_item; +} + +static inline void free_rmap_item(struct rmap_item *rmap_item) +{ + ksm_rmap_items--; + rmap_item->mm = NULL; /* debug safety */ + kmem_cache_free(rmap_item_cache, rmap_item); +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); +} + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, link) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + INIT_LIST_HEAD(&mm_slot->rmap_list); + hlist_add_head(&mm_slot->link, bucket); +} + +static inline int in_stable_tree(struct rmap_item *rmap_item) +{ + return rmap_item->address & STABLE_FLAG; +} + +/* + * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's + * page tables after it has passed through ksm_exit() - which, if necessary, + * takes mmap_sem briefly to serialize against them. ksm_exit() does not set + * a special flag: they can just back out as soon as mm_users goes to zero. + * ksm_test_exit() is used throughout to make this test for exit: in some + * places for correctness, in some places just to avoid unnecessary work. + */ +static inline bool ksm_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +/* + * We use break_ksm to break COW on a ksm page: it's a stripped down + * + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * put_page(page); + * + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * in case the application has unmapped and remapped mm,addr meanwhile. + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + */ +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + int ret = 0; + + do { + cond_resched(); + page = follow_page(vma, addr, FOLL_GET); + if (!page) + break; + if (PageKsm(page)) + ret = handle_mm_fault(vma->vm_mm, vma, addr, + FAULT_FLAG_WRITE); + else + ret = VM_FAULT_WRITE; + put_page(page); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); + /* + * We must loop because handle_mm_fault() may back out if there's + * any difficulty e.g. if pte accessed bit gets updated concurrently. + * + * VM_FAULT_WRITE is what we have been hoping for: it indicates that + * COW has been broken, even if the vma does not permit VM_WRITE; + * but note that a concurrent fault might break PageKsm for us. + * + * VM_FAULT_SIGBUS could occur if we race with truncation of the + * backing file, which also invalidates anonymous pages: that's + * okay, that truncation will have unmapped the PageKsm for us. + * + * VM_FAULT_OOM: at the time of writing (late July 2009), setting + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the + * current task has TIF_MEMDIE set, and will be OOM killed on return + * to user; and ksmd, having no mm, would never be chosen for that. + * + * But if the mm is in a limited mem_cgroup, then the fault may fail + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and + * even ksmd can fail in this way - though it's usually breaking ksm + * just to undo a merge it made a moment before, so unlikely to oom. + * + * That's a pity: we might therefore have more kernel pages allocated + * than we're counting as nodes in the stable tree; but ksm_do_scan + * will retry to break_cow on each pass, so should recover the page + * in due course. The important thing is to not let VM_MERGEABLE + * be cleared while any such pages might remain in the area. + */ + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; +} + +static void break_cow(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + goto out; + break_ksm(vma, addr); +out: + up_read(&mm->mmap_sem); +} + +static struct page *get_mergeable_page(struct rmap_item *rmap_item) +{ + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; + struct vm_area_struct *vma; + struct page *page; + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + goto out; + + page = follow_page(vma, addr, FOLL_GET); + if (!page) + goto out; + if (PageAnon(page)) { + flush_anon_page(vma, page, addr); + flush_dcache_page(page); + } else { + put_page(page); +out: page = NULL; + } + up_read(&mm->mmap_sem); + return page; +} + +/* + * get_ksm_page: checks if the page at the virtual address in rmap_item + * is still PageKsm, in which case we can trust the content of the page, + * and it returns the gotten page; but NULL if the page has been zapped. + */ +static struct page *get_ksm_page(struct rmap_item *rmap_item) +{ + struct page *page; + + page = get_mergeable_page(rmap_item); + if (page && !PageKsm(page)) { + put_page(page); + page = NULL; + } + return page; +} + +/* + * Removing rmap_item from stable or unstable tree. + * This function will clean the information from the stable/unstable tree. + */ +static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +{ + if (in_stable_tree(rmap_item)) { + struct rmap_item *next_item = rmap_item->next; + + if (rmap_item->address & NODE_FLAG) { + if (next_item) { + rb_replace_node(&rmap_item->node, + &next_item->node, + &root_stable_tree); + next_item->address |= NODE_FLAG; + ksm_pages_sharing--; + } else { + rb_erase(&rmap_item->node, &root_stable_tree); + ksm_pages_shared--; + } + } else { + struct rmap_item *prev_item = rmap_item->prev; + + BUG_ON(prev_item->next != rmap_item); + prev_item->next = next_item; + if (next_item) { + BUG_ON(next_item->prev != rmap_item); + next_item->prev = rmap_item->prev; + } + ksm_pages_sharing--; + } + + rmap_item->next = NULL; + + } else if (rmap_item->address & NODE_FLAG) { + unsigned char age; + /* + * Usually ksmd can and must skip the rb_erase, because + * root_unstable_tree was already reset to RB_ROOT. + * But be careful when an mm is exiting: do the rb_erase + * if this rmap_item was inserted by this scan, rather + * than left over from before. + */ + age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); + BUG_ON(age > 1); + if (!age) + rb_erase(&rmap_item->node, &root_unstable_tree); + ksm_pages_unshared--; + } + + rmap_item->address &= PAGE_MASK; + + cond_resched(); /* we're called from many long loops */ +} + +static void remove_trailing_rmap_items(struct mm_slot *mm_slot, + struct list_head *cur) +{ + struct rmap_item *rmap_item; + + while (cur != &mm_slot->rmap_list) { + rmap_item = list_entry(cur, struct rmap_item, link); + cur = cur->next; + remove_rmap_item_from_tree(rmap_item); + list_del(&rmap_item->link); + free_rmap_item(rmap_item); + } +} + +/* + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. + */ +static int unmerge_ksm_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long addr; + int err = 0; + + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { + if (ksm_test_exit(vma->vm_mm)) + break; + if (signal_pending(current)) + err = -ERESTARTSYS; + else + err = break_ksm(vma, addr); + } + return err; +} + +#ifdef CONFIG_SYSFS +/* + * Only called through the sysfs control interface: + */ +static int unmerge_and_remove_all_rmap_items(void) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int err = 0; + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, + struct mm_slot, mm_list); + spin_unlock(&ksm_mmlist_lock); + + for (mm_slot = ksm_scan.mm_slot; + mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (ksm_test_exit(mm)) + break; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + continue; + err = unmerge_ksm_pages(vma, + vma->vm_start, vma->vm_end); + if (err) + goto error; + } + + remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_test_exit(mm)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } + } + + ksm_scan.seqnr = 0; + return 0; + +error: + up_read(&mm->mmap_sem); + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = &ksm_mm_head; + spin_unlock(&ksm_mmlist_lock); + return err; +} +#endif /* CONFIG_SYSFS */ + +static u32 calc_checksum(struct page *page) +{ + u32 checksum; + void *addr = kmap_atomic(page, KM_USER0); + checksum = jhash2(addr, PAGE_SIZE / 4, 17); + kunmap_atomic(addr, KM_USER0); + return checksum; +} + +static int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1, KM_USER0); + addr2 = kmap_atomic(page2, KM_USER1); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2, KM_USER1); + kunmap_atomic(addr1, KM_USER0); + return ret; +} + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + +static int write_protect_page(struct vm_area_struct *vma, struct page *page, + pte_t *orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr; + pte_t *ptep; + spinlock_t *ptl; + int swapped; + int err = -EFAULT; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + ptep = page_check_address(page, mm, addr, &ptl, 0); + if (!ptep) + goto out; + + if (pte_write(*ptep)) { + pte_t entry; + + swapped = PageSwapCache(page); + flush_cache_page(vma, addr, page_to_pfn(page)); + /* + * Ok this is tricky, when get_user_pages_fast() run it doesnt + * take any lock, therefore the check that we are going to make + * with the pagecount against the mapcount is racey and + * O_DIRECT can happen right after the check. + * So we clear the pte and flush the tlb before the check + * this assure us that no O_DIRECT can happen after the check + * or in the middle of the check. + */ + entry = ptep_clear_flush(vma, addr, ptep); + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { + set_pte_at_notify(mm, addr, ptep, entry); + goto out_unlock; + } + entry = pte_wrprotect(entry); + set_pte_at_notify(mm, addr, ptep, entry); + } + *orig_pte = *ptep; + err = 0; + +out_unlock: + pte_unmap_unlock(ptep, ptl); +out: + return err; +} + +/** + * replace_page - replace page in vma by new ksm page + * @vma: vma that holds the pte pointing to oldpage + * @oldpage: the page we are replacing by newpage + * @newpage: the ksm page we replace oldpage by + * @orig_pte: the original value of the pte + * + * Returns 0 on success, -EFAULT on failure. + */ +static int replace_page(struct vm_area_struct *vma, struct page *oldpage, + struct page *newpage, pte_t orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + unsigned long addr; + pgprot_t prot; + int err = -EFAULT; + + prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); + + addr = page_address_in_vma(oldpage, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_same(*ptep, orig_pte)) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + + get_page(newpage); + page_add_ksm_rmap(newpage); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); + + page_remove_rmap(oldpage); + put_page(oldpage); + + pte_unmap_unlock(ptep, ptl); + err = 0; +out: + return err; +} + +/* + * try_to_merge_one_page - take two pages and merge them into one + * @vma: the vma that hold the pte pointing into oldpage + * @oldpage: the page that we want to replace with newpage + * @newpage: the page that we want to map instead of oldpage + * + * Note: + * oldpage should be a PageAnon page, while newpage should be a PageKsm page, + * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_one_page(struct vm_area_struct *vma, + struct page *oldpage, + struct page *newpage) +{ + pte_t orig_pte = __pte(0); + int err = -EFAULT; + + if (!(vma->vm_flags & VM_MERGEABLE)) + goto out; + + if (!PageAnon(oldpage)) + goto out; + + get_page(newpage); + get_page(oldpage); + + /* + * We need the page lock to read a stable PageSwapCache in + * write_protect_page(). We use trylock_page() instead of + * lock_page() because we don't want to wait here - we + * prefer to continue scanning and merging different pages, + * then come back to this page when it is unlocked. + */ + if (!trylock_page(oldpage)) + goto out_putpage; + /* + * If this anonymous page is mapped only here, its pte may need + * to be write-protected. If it's mapped elsewhere, all of its + * ptes are necessarily already write-protected. But in either + * case, we need to lock and check page_count is not raised. + */ + if (write_protect_page(vma, oldpage, &orig_pte)) { + unlock_page(oldpage); + goto out_putpage; + } + unlock_page(oldpage); + + if (pages_identical(oldpage, newpage)) + err = replace_page(vma, oldpage, newpage, orig_pte); + +out_putpage: + put_page(oldpage); + put_page(newpage); +out: + return err; +} + +/* + * try_to_merge_with_ksm_page - like try_to_merge_two_pages, + * but no new kernel page is allocated: kpage must already be a ksm page. + */ +static int try_to_merge_with_ksm_page(struct mm_struct *mm1, + unsigned long addr1, + struct page *page1, + struct page *kpage) +{ + struct vm_area_struct *vma; + int err = -EFAULT; + + down_read(&mm1->mmap_sem); + if (ksm_test_exit(mm1)) + goto out; + + vma = find_vma(mm1, addr1); + if (!vma || vma->vm_start > addr1) + goto out; + + err = try_to_merge_one_page(vma, page1, kpage); +out: + up_read(&mm1->mmap_sem); + return err; +} + +/* + * try_to_merge_two_pages - take two identical pages and prepare them + * to be merged into one page. + * + * This function returns 0 if we successfully mapped two identical pages + * into one page, -EFAULT otherwise. + * + * Note that this function allocates a new kernel page: if one of the pages + * is already a ksm page, try_to_merge_with_ksm_page should be used. + */ +static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, + struct page *page1, struct mm_struct *mm2, + unsigned long addr2, struct page *page2) +{ + struct vm_area_struct *vma; + struct page *kpage; + int err = -EFAULT; + + /* + * The number of nodes in the stable tree + * is the number of kernel pages that we hold. + */ + if (ksm_max_kernel_pages && + ksm_max_kernel_pages <= ksm_pages_shared) + return err; + + kpage = alloc_page(GFP_HIGHUSER); + if (!kpage) + return err; + + down_read(&mm1->mmap_sem); + if (ksm_test_exit(mm1)) { + up_read(&mm1->mmap_sem); + goto out; + } + vma = find_vma(mm1, addr1); + if (!vma || vma->vm_start > addr1) { + up_read(&mm1->mmap_sem); + goto out; + } + + copy_user_highpage(kpage, page1, addr1, vma); + err = try_to_merge_one_page(vma, page1, kpage); + up_read(&mm1->mmap_sem); + + if (!err) { + err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); + /* + * If that fails, we have a ksm page with only one pte + * pointing to it: so break it. + */ + if (err) + break_cow(mm1, addr1); + } +out: + put_page(kpage); + return err; +} + +/* + * stable_tree_search - search page inside the stable tree + * @page: the page that we are searching identical pages to. + * @page2: pointer into identical page that we are holding inside the stable + * tree that we have found. + * @rmap_item: the reverse mapping item + * + * This function checks if there is a page inside the stable tree + * with identical content to the page that we are scanning right now. + * + * This function return rmap_item pointer to the identical item if found, + * NULL otherwise. + */ +static struct rmap_item *stable_tree_search(struct page *page, + struct page **page2, + struct rmap_item *rmap_item) +{ + struct rb_node *node = root_stable_tree.rb_node; + + while (node) { + struct rmap_item *tree_rmap_item, *next_rmap_item; + int ret; + + tree_rmap_item = rb_entry(node, struct rmap_item, node); + while (tree_rmap_item) { + BUG_ON(!in_stable_tree(tree_rmap_item)); + cond_resched(); + page2[0] = get_ksm_page(tree_rmap_item); + if (page2[0]) + break; + next_rmap_item = tree_rmap_item->next; + remove_rmap_item_from_tree(tree_rmap_item); + tree_rmap_item = next_rmap_item; + } + if (!tree_rmap_item) + return NULL; + + ret = memcmp_pages(page, page2[0]); + + if (ret < 0) { + put_page(page2[0]); + node = node->rb_left; + } else if (ret > 0) { + put_page(page2[0]); + node = node->rb_right; + } else { + return tree_rmap_item; + } + } + + return NULL; +} + +/* + * stable_tree_insert - insert rmap_item pointing to new ksm page + * into the stable tree. + * + * @page: the page that we are searching identical page to inside the stable + * tree. + * @rmap_item: pointer to the reverse mapping item. + * + * This function returns rmap_item if success, NULL otherwise. + */ +static struct rmap_item *stable_tree_insert(struct page *page, + struct rmap_item *rmap_item) +{ + struct rb_node **new = &root_stable_tree.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct rmap_item *tree_rmap_item, *next_rmap_item; + struct page *tree_page; + int ret; + + tree_rmap_item = rb_entry(*new, struct rmap_item, node); + while (tree_rmap_item) { + BUG_ON(!in_stable_tree(tree_rmap_item)); + cond_resched(); + tree_page = get_ksm_page(tree_rmap_item); + if (tree_page) + break; + next_rmap_item = tree_rmap_item->next; + remove_rmap_item_from_tree(tree_rmap_item); + tree_rmap_item = next_rmap_item; + } + if (!tree_rmap_item) + return NULL; + + ret = memcmp_pages(page, tree_page); + put_page(tree_page); + + parent = *new; + if (ret < 0) + new = &parent->rb_left; + else if (ret > 0) + new = &parent->rb_right; + else { + /* + * It is not a bug that stable_tree_search() didn't + * find this node: because at that time our page was + * not yet write-protected, so may have changed since. + */ + return NULL; + } + } + + rmap_item->address |= NODE_FLAG | STABLE_FLAG; + rmap_item->next = NULL; + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &root_stable_tree); + + ksm_pages_shared++; + return rmap_item; +} + +/* + * unstable_tree_search_insert - search and insert items into the unstable tree. + * + * @page: the page that we are going to search for identical page or to insert + * into the unstable tree + * @page2: pointer into identical page that was found inside the unstable tree + * @rmap_item: the reverse mapping item of page + * + * This function searches for a page in the unstable tree identical to the + * page currently being scanned; and if no identical page is found in the + * tree, we insert rmap_item as a new object into the unstable tree. + * + * This function returns pointer to rmap_item found to be identical + * to the currently scanned page, NULL otherwise. + * + * This function does both searching and inserting, because they share + * the same walking algorithm in an rbtree. + */ +static struct rmap_item *unstable_tree_search_insert(struct page *page, + struct page **page2, + struct rmap_item *rmap_item) +{ + struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct rmap_item *tree_rmap_item; + int ret; + + tree_rmap_item = rb_entry(*new, struct rmap_item, node); + page2[0] = get_mergeable_page(tree_rmap_item); + if (!page2[0]) + return NULL; + + /* + * Don't substitute an unswappable ksm page + * just for one good swappable forked page. + */ + if (page == page2[0]) { + put_page(page2[0]); + return NULL; + } + + ret = memcmp_pages(page, page2[0]); + + parent = *new; + if (ret < 0) { + put_page(page2[0]); + new = &parent->rb_left; + } else if (ret > 0) { + put_page(page2[0]); + new = &parent->rb_right; + } else { + return tree_rmap_item; + } + } + + rmap_item->address |= NODE_FLAG; + rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &root_unstable_tree); + + ksm_pages_unshared++; + return NULL; +} + +/* + * stable_tree_append - add another rmap_item to the linked list of + * rmap_items hanging off a given node of the stable tree, all sharing + * the same ksm page. + */ +static void stable_tree_append(struct rmap_item *rmap_item, + struct rmap_item *tree_rmap_item) +{ + rmap_item->next = tree_rmap_item->next; + rmap_item->prev = tree_rmap_item; + + if (tree_rmap_item->next) + tree_rmap_item->next->prev = rmap_item; + + tree_rmap_item->next = rmap_item; + rmap_item->address |= STABLE_FLAG; + + ksm_pages_sharing++; +} + +/* + * cmp_and_merge_page - first see if page can be merged into the stable tree; + * if not, compare checksum to previous and if it's the same, see if page can + * be inserted into the unstable tree, or merged with a page already there and + * both transferred to the stable tree. + * + * @page: the page that we are searching identical page to. + * @rmap_item: the reverse mapping into the virtual address of this page + */ +static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) +{ + struct page *page2[1]; + struct rmap_item *tree_rmap_item; + unsigned int checksum; + int err; + + if (in_stable_tree(rmap_item)) + remove_rmap_item_from_tree(rmap_item); + + /* We first start with searching the page inside the stable tree */ + tree_rmap_item = stable_tree_search(page, page2, rmap_item); + if (tree_rmap_item) { + if (page == page2[0]) /* forked */ + err = 0; + else + err = try_to_merge_with_ksm_page(rmap_item->mm, + rmap_item->address, + page, page2[0]); + put_page(page2[0]); + + if (!err) { + /* + * The page was successfully merged: + * add its rmap_item to the stable tree. + */ + stable_tree_append(rmap_item, tree_rmap_item); + } + return; + } + + /* + * A ksm page might have got here by fork, but its other + * references have already been removed from the stable tree. + * Or it might be left over from a break_ksm which failed + * when the mem_cgroup had reached its limit: try again now. + */ + if (PageKsm(page)) + break_cow(rmap_item->mm, rmap_item->address); + + /* + * In case the hash value of the page was changed from the last time we + * have calculated it, this page to be changed frequely, therefore we + * don't want to insert it to the unstable tree, and we don't want to + * waste our time to search if there is something identical to it there. + */ + checksum = calc_checksum(page); + if (rmap_item->oldchecksum != checksum) { + rmap_item->oldchecksum = checksum; + return; + } + + tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); + if (tree_rmap_item) { + err = try_to_merge_two_pages(rmap_item->mm, + rmap_item->address, page, + tree_rmap_item->mm, + tree_rmap_item->address, page2[0]); + /* + * As soon as we merge this page, we want to remove the + * rmap_item of the page we have merged with from the unstable + * tree, and insert it instead as new node in the stable tree. + */ + if (!err) { + rb_erase(&tree_rmap_item->node, &root_unstable_tree); + tree_rmap_item->address &= ~NODE_FLAG; + ksm_pages_unshared--; + + /* + * If we fail to insert the page into the stable tree, + * we will have 2 virtual addresses that are pointing + * to a ksm page left outside the stable tree, + * in which case we need to break_cow on both. + */ + if (stable_tree_insert(page2[0], tree_rmap_item)) + stable_tree_append(rmap_item, tree_rmap_item); + else { + break_cow(tree_rmap_item->mm, + tree_rmap_item->address); + break_cow(rmap_item->mm, rmap_item->address); + } + } + + put_page(page2[0]); + } +} + +static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, + struct list_head *cur, + unsigned long addr) +{ + struct rmap_item *rmap_item; + + while (cur != &mm_slot->rmap_list) { + rmap_item = list_entry(cur, struct rmap_item, link); + if ((rmap_item->address & PAGE_MASK) == addr) { + if (!in_stable_tree(rmap_item)) + remove_rmap_item_from_tree(rmap_item); + return rmap_item; + } + if (rmap_item->address > addr) + break; + cur = cur->next; + remove_rmap_item_from_tree(rmap_item); + list_del(&rmap_item->link); + free_rmap_item(rmap_item); + } + + rmap_item = alloc_rmap_item(); + if (rmap_item) { + /* It has already been zeroed */ + rmap_item->mm = mm_slot->mm; + rmap_item->address = addr; + list_add_tail(&rmap_item->link, cur); + } + return rmap_item; +} + +static struct rmap_item *scan_get_next_rmap_item(struct page **page) +{ + struct mm_struct *mm; + struct mm_slot *slot; + struct vm_area_struct *vma; + struct rmap_item *rmap_item; + + if (list_empty(&ksm_mm_head.mm_list)) + return NULL; + + slot = ksm_scan.mm_slot; + if (slot == &ksm_mm_head) { + root_unstable_tree = RB_ROOT; + + spin_lock(&ksm_mmlist_lock); + slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); + ksm_scan.mm_slot = slot; + spin_unlock(&ksm_mmlist_lock); +next_mm: + ksm_scan.address = 0; + ksm_scan.rmap_item = list_entry(&slot->rmap_list, + struct rmap_item, link); + } + + mm = slot->mm; + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + vma = NULL; + else + vma = find_vma(mm, ksm_scan.address); + + for (; vma; vma = vma->vm_next) { + if (!(vma->vm_flags & VM_MERGEABLE)) + continue; + if (ksm_scan.address < vma->vm_start) + ksm_scan.address = vma->vm_start; + if (!vma->anon_vma) + ksm_scan.address = vma->vm_end; + + while (ksm_scan.address < vma->vm_end) { + if (ksm_test_exit(mm)) + break; + *page = follow_page(vma, ksm_scan.address, FOLL_GET); + if (*page && PageAnon(*page)) { + flush_anon_page(vma, *page, ksm_scan.address); + flush_dcache_page(*page); + rmap_item = get_next_rmap_item(slot, + ksm_scan.rmap_item->link.next, + ksm_scan.address); + if (rmap_item) { + ksm_scan.rmap_item = rmap_item; + ksm_scan.address += PAGE_SIZE; + } else + put_page(*page); + up_read(&mm->mmap_sem); + return rmap_item; + } + if (*page) + put_page(*page); + ksm_scan.address += PAGE_SIZE; + cond_resched(); + } + } + + if (ksm_test_exit(mm)) { + ksm_scan.address = 0; + ksm_scan.rmap_item = list_entry(&slot->rmap_list, + struct rmap_item, link); + } + /* + * Nuke all the rmap_items that are above this current rmap: + * because there were no VM_MERGEABLE vmas with such addresses. + */ + remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_scan.address == 0) { + /* + * We've completed a full scan of all vmas, holding mmap_sem + * throughout, and found no VM_MERGEABLE: so do the same as + * __ksm_exit does to remove this mm from all our lists now. + * This applies either when cleaning up after __ksm_exit + * (but beware: we can reach here even before __ksm_exit), + * or when all VM_MERGEABLE areas have been unmapped (and + * mmap_sem then protects against race with MADV_MERGEABLE). + */ + hlist_del(&slot->link); + list_del(&slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } + + /* Repeat until we've completed scanning the whole list */ + slot = ksm_scan.mm_slot; + if (slot != &ksm_mm_head) + goto next_mm; + + ksm_scan.seqnr++; + return NULL; +} + +/** + * ksm_do_scan - the ksm scanner main worker function. + * @scan_npages - number of pages we want to scan before we return. + */ +static void ksm_do_scan(unsigned int scan_npages) +{ + struct rmap_item *rmap_item; + struct page *page; + + while (scan_npages--) { + cond_resched(); + rmap_item = scan_get_next_rmap_item(&page); + if (!rmap_item) + return; + if (!PageKsm(page) || !in_stable_tree(rmap_item)) + cmp_and_merge_page(page, rmap_item); + else if (page_mapcount(page) == 1) { + /* + * Replace now-unshared ksm page by ordinary page. + */ + break_cow(rmap_item->mm, rmap_item->address); + remove_rmap_item_from_tree(rmap_item); + rmap_item->oldchecksum = calc_checksum(page); + } + put_page(page); + } +} + +static int ksmd_should_run(void) +{ + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); +} + +static int ksm_scan_thread(void *nothing) +{ + set_user_nice(current, 5); + + while (!kthread_should_stop()) { + mutex_lock(&ksm_thread_mutex); + if (ksmd_should_run()) + ksm_do_scan(ksm_thread_pages_to_scan); + mutex_unlock(&ksm_thread_mutex); + + if (ksmd_should_run()) { + schedule_timeout_interruptible( + msecs_to_jiffies(ksm_thread_sleep_millisecs)); + } else { + wait_event_interruptible(ksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); + } + } + return 0; +} + +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + struct mm_struct *mm = vma->vm_mm; + int err; + + switch (advice) { + case MADV_MERGEABLE: + /* + * Be somewhat over-protective for now! + */ + if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return 0; /* just ignore the advice */ + + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } + + *vm_flags |= VM_MERGEABLE; + break; + + case MADV_UNMERGEABLE: + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_ksm_pages(vma, start, end); + if (err) + return err; + } + + *vm_flags &= ~VM_MERGEABLE; + break; + } + + return 0; +} + +int __ksm_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int needs_wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* Check ksm_run too? Would need tighter locking */ + needs_wakeup = list_empty(&ksm_mm_head.mm_list); + + spin_lock(&ksm_mmlist_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little; when fork is followed by immediate exec, we don't + * want ksmd to waste time setting up and tearing down an rmap_list. + */ + list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + set_bit(MMF_VM_MERGEABLE, &mm->flags); + atomic_inc(&mm->mm_count); + + if (needs_wakeup) + wake_up_interruptible(&ksm_thread_wait); + + return 0; +} + +void __ksm_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int easy_to_free = 0; + + /* + * This process is exiting: if it's straightforward (as is the + * case when ksmd was never running), free mm_slot immediately. + * But if it's at the cursor or has rmap_items linked to it, use + * mmap_sem to synchronize with any break_cows before pagetables + * are freed, and leave the mm_slot on the list for ksmd to free. + * Beware: ksm may already have noticed it exiting and freed the slot. + */ + + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && ksm_scan.mm_slot != mm_slot) { + if (list_empty(&mm_slot->rmap_list)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + easy_to_free = 1; + } else { + list_move(&mm_slot->mm_list, + &ksm_scan.mm_slot->mm_list); + } + } + spin_unlock(&ksm_mmlist_lock); + + if (easy_to_free) { + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mmdrop(mm); + } else if (mm_slot) { + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } +} + +#ifdef CONFIG_SYSFS +/* + * This all compiles without CONFIG_SYSFS, but is a waste of space. + */ + +#define KSM_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define KSM_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); +} + +static ssize_t sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_thread_sleep_millisecs = msecs; + + return count; +} +KSM_ATTR(sleep_millisecs); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); +} + +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long nr_pages; + + err = strict_strtoul(buf, 10, &nr_pages); + if (err || nr_pages > UINT_MAX) + return -EINVAL; + + ksm_thread_pages_to_scan = nr_pages; + + return count; +} +KSM_ATTR(pages_to_scan); + +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_run); +} + +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long flags; + + err = strict_strtoul(buf, 10, &flags); + if (err || flags > UINT_MAX) + return -EINVAL; + if (flags > KSM_RUN_UNMERGE) + return -EINVAL; + + /* + * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. + * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, + * breaking COW to free the unswappable pages_shared (but leaves + * mm_slots on the list for when ksmd may be set running again). + */ + + mutex_lock(&ksm_thread_mutex); + if (ksm_run != flags) { + ksm_run = flags; + if (flags & KSM_RUN_UNMERGE) { + current->flags |= PF_OOM_ORIGIN; + err = unmerge_and_remove_all_rmap_items(); + current->flags &= ~PF_OOM_ORIGIN; + if (err) { + ksm_run = KSM_RUN_STOP; + count = err; + } + } + } + mutex_unlock(&ksm_thread_mutex); + + if (flags & KSM_RUN_MERGE) + wake_up_interruptible(&ksm_thread_wait); + + return count; +} +KSM_ATTR(run); + +static ssize_t max_kernel_pages_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long nr_pages; + + err = strict_strtoul(buf, 10, &nr_pages); + if (err) + return -EINVAL; + + ksm_max_kernel_pages = nr_pages; + + return count; +} + +static ssize_t max_kernel_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_max_kernel_pages); +} +KSM_ATTR(max_kernel_pages); + +static ssize_t pages_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_shared); +} +KSM_ATTR_RO(pages_shared); + +static ssize_t pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_sharing); +} +KSM_ATTR_RO(pages_sharing); + +static ssize_t pages_unshared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_unshared); +} +KSM_ATTR_RO(pages_unshared); + +static ssize_t pages_volatile_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + long ksm_pages_volatile; + + ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared + - ksm_pages_sharing - ksm_pages_unshared; + /* + * It was not worth any locking to calculate that statistic, + * but it might therefore sometimes be negative: conceal that. + */ + if (ksm_pages_volatile < 0) + ksm_pages_volatile = 0; + return sprintf(buf, "%ld\n", ksm_pages_volatile); +} +KSM_ATTR_RO(pages_volatile); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_scan.seqnr); +} +KSM_ATTR_RO(full_scans); + +static struct attribute *ksm_attrs[] = { + &sleep_millisecs_attr.attr, + &pages_to_scan_attr.attr, + &run_attr.attr, + &max_kernel_pages_attr.attr, + &pages_shared_attr.attr, + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, + &full_scans_attr.attr, + NULL, +}; + +static struct attribute_group ksm_attr_group = { + .attrs = ksm_attrs, + .name = "ksm", +}; +#endif /* CONFIG_SYSFS */ + +static int __init ksm_init(void) +{ + struct task_struct *ksm_thread; + int err; + + ksm_init_max_kernel_pages(); + + err = ksm_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) + goto out_free1; + + ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); + if (IS_ERR(ksm_thread)) { + printk(KERN_ERR "ksm: creating kthread failed\n"); + err = PTR_ERR(ksm_thread); + goto out_free2; + } + +#ifdef CONFIG_SYSFS + err = sysfs_create_group(mm_kobj, &ksm_attr_group); + if (err) { + printk(KERN_ERR "ksm: register sysfs failed\n"); + kthread_stop(ksm_thread); + goto out_free2; + } +#endif /* CONFIG_SYSFS */ + + return 0; + +out_free2: + mm_slots_hash_free(); +out_free1: + ksm_slab_free(); +out: + return err; +} +module_init(ksm_init) diff --git a/mm/madvise.c b/mm/madvise.c index 76eb4193acdd..35b1479b7c9d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -11,6 +11,7 @@ #include <linux/mempolicy.h> #include <linux/hugetlb.h> #include <linux/sched.h> +#include <linux/ksm.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma, struct mm_struct * mm = vma->vm_mm; int error = 0; pgoff_t pgoff; - int new_flags = vma->vm_flags; + unsigned long new_flags = vma->vm_flags; switch (behavior) { case MADV_NORMAL: @@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } new_flags &= ~VM_DONTCOPY; break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -207,41 +218,46 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages(current, current->mm, start, 1, + 0, 0, &p, NULL); + if (ret != 1) + return ret; + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + __memory_failure(page_to_pfn(p), 0, 1); + put_page(p); + } + return ret; +} +#endif + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - long error; - switch (behavior) { - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - break; - } - case MADV_DONTFORK: - case MADV_NORMAL: - case MADV_SEQUENTIAL: - case MADV_RANDOM: - error = madvise_behavior(vma, prev, start, end, behavior); - break; case MADV_REMOVE: - error = madvise_remove(vma, prev, start, end); - break; - + return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: - error = madvise_willneed(vma, prev, start, end); - break; - + return madvise_willneed(vma, prev, start, end); case MADV_DONTNEED: - error = madvise_dontneed(vma, prev, start, end); - break; - + return madvise_dontneed(vma, prev, start, end); default: - BUG(); - break; + return madvise_behavior(vma, prev, start, end, behavior); } - return error; } static int @@ -256,12 +272,17 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif return 1; default: return 0; } } + /* * The madvise(2) system call. * @@ -286,6 +307,12 @@ madvise_behavior_valid(int behavior) * so the kernel can free resources associated with it. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * * return values: * zero - success @@ -307,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON) + return madvise_hwpoison(start, start+len_in); +#endif if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2fa20dadf40..f99f5991d6bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -29,6 +29,7 @@ #include <linux/rcupdate.h> #include <linux/limits.h> #include <linux/mutex.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/spinlock.h> @@ -43,6 +44,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 +struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ @@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ #endif static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ +#define SOFTLIMIT_EVENTS_THRESH (1000) /* * Statistics for memory cgroup. @@ -66,6 +69,8 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ + MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, }; @@ -78,6 +83,20 @@ struct mem_cgroup_stat { struct mem_cgroup_stat_cpu cpustat[0]; }; +static inline void +__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, + enum mem_cgroup_stat_index idx) +{ + stat->count[idx] = 0; +} + +static inline s64 +__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, + enum mem_cgroup_stat_index idx) +{ + return stat->count[idx]; +} + /* * For accounting under irq disable, no need for increment preempt count. */ @@ -117,6 +136,12 @@ struct mem_cgroup_per_zone { unsigned long count[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; + struct rb_node tree_node; /* RB tree node */ + unsigned long long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; + struct mem_cgroup *mem; /* Back pointer, we cannot */ + /* use container_of */ }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -130,6 +155,26 @@ struct mem_cgroup_lru_info { }; /* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { + struct rb_root rb_root; + spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + +/* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, @@ -186,6 +231,13 @@ struct mem_cgroup { struct mem_cgroup_stat stat; }; +/* + * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft + * limit reclaim to prevent infinite loops, if they ever occur. + */ +#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) + enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, @@ -200,13 +252,8 @@ enum charge_type { #define PCGF_CACHE (1UL << PCG_CACHE) #define PCGF_USED (1UL << PCG_USED) #define PCGF_LOCK (1UL << PCG_LOCK) -static const unsigned long -pcg_default_flags[NR_CHARGE_TYPE] = { - PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ - PCGF_USED | PCGF_LOCK, /* Anon */ - PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ - 0, /* FORCE */ -}; +/* Not used, but added here for completeness */ +#define PCGF_ACCT (1UL << PCG_ACCT) /* for encoding cft->private value on file */ #define _MEM (0) @@ -215,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = { #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +/* + * Reclaim flags for mem_cgroup_hierarchical_reclaim + */ +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 +#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) +#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 +#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) +#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 +#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) + static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); +static struct mem_cgroup_per_zone * +mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +{ + return &mem->info.nodeinfo[nid]->zoneinfo[zid]; +} + +static struct mem_cgroup_per_zone * +page_cgroup_zoneinfo(struct page_cgroup *pc) +{ + struct mem_cgroup *mem = pc->mem_cgroup; + int nid = page_cgroup_nid(pc); + int zid = page_cgroup_zid(pc); + + if (!mem) + return NULL; + + return mem_cgroup_zoneinfo(mem, nid, zid); +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_node_zone(int nid, int zid) +{ + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_from_page(struct page *page) +{ + int nid = page_to_nid(page); + int zid = page_zonenum(page); + + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static void +__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) +{ + struct rb_node **p = &mctz->rb_root.rb_node; + struct rb_node *parent = NULL; + struct mem_cgroup_per_zone *mz_node; + + if (mz->on_tree) + return; + + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; + while (*p) { + parent = *p; + mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + tree_node); + if (mz->usage_in_excess < mz_node->usage_in_excess) + p = &(*p)->rb_left; + /* + * We can't avoid mem cgroups that are over their soft + * limit by the same amount + */ + else if (mz->usage_in_excess >= mz_node->usage_in_excess) + p = &(*p)->rb_right; + } + rb_link_node(&mz->tree_node, parent, p); + rb_insert_color(&mz->tree_node, &mctz->rb_root); + mz->on_tree = true; +} + +static void +__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + if (!mz->on_tree) + return; + rb_erase(&mz->tree_node, &mctz->rb_root); + mz->on_tree = false; +} + +static void +mem_cgroup_remove_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + spin_lock(&mctz->lock); + __mem_cgroup_remove_exceeded(mem, mz, mctz); + spin_unlock(&mctz->lock); +} + +static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) +{ + bool ret = false; + int cpu; + s64 val; + struct mem_cgroup_stat_cpu *cpustat; + + cpu = get_cpu(); + cpustat = &mem->stat.cpustat[cpu]; + val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); + if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { + __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); + ret = true; + } + put_cpu(); + return ret; +} + +static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) +{ + unsigned long long excess; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + int nid = page_to_nid(page); + int zid = page_zonenum(page); + mctz = soft_limit_tree_from_page(page); + + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; mem; mem = parent_mem_cgroup(mem)) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + excess = res_counter_soft_limit_excess(&mem->res); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + spin_lock(&mctz->lock); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(mem, mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + } + } +} + +static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) +{ + int node, zone; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + + for_each_node_state(node, N_POSSIBLE) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + mz = mem_cgroup_zoneinfo(mem, node, zone); + mctz = soft_limit_tree_node_zone(node, zone); + mem_cgroup_remove_exceeded(mem, mz, mctz); + } + } +} + +static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) +{ + return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; +} + +static struct mem_cgroup_per_zone * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct rb_node *rightmost = NULL; + struct mem_cgroup_per_zone *mz; + +retry: + mz = NULL; + rightmost = rb_last(&mctz->rb_root); + if (!rightmost) + goto done; /* Nothing to reclaim from */ + + mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + /* + * Remove the node now but someone else can add it back, + * we will to add it back at the end of reclaim to its correct + * position in the tree. + */ + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + if (!res_counter_soft_limit_excess(&mz->mem->res) || + !css_tryget(&mz->mem->css)) + goto retry; +done: + return mz; +} + +static struct mem_cgroup_per_zone * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct mem_cgroup_per_zone *mz; + + spin_lock(&mctz->lock); + mz = __mem_cgroup_largest_soft_limit_node(mctz); + spin_unlock(&mctz->lock); + return mz; +} + +static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, + bool charge) +{ + int val = (charge) ? 1 : -1; + struct mem_cgroup_stat *stat = &mem->stat; + struct mem_cgroup_stat_cpu *cpustat; + int cpu = get_cpu(); + + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); + put_cpu(); +} + static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, struct page_cgroup *pc, bool charge) { - int val = (charge)? 1 : -1; + int val = (charge) ? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; struct mem_cgroup_stat_cpu *cpustat; int cpu = get_cpu(); @@ -240,28 +509,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, else __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); put_cpu(); } -static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) -{ - return &mem->info.nodeinfo[nid]->zoneinfo[zid]; -} - -static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct page_cgroup *pc) -{ - struct mem_cgroup *mem = pc->mem_cgroup; - int nid = page_cgroup_nid(pc); - int zid = page_cgroup_zid(pc); - - if (!mem) - return NULL; - - return mem_cgroup_zoneinfo(mem, nid, zid); -} - static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { @@ -354,6 +605,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, return ret; } +static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) +{ + return (mem == root_mem_cgroup); +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -371,22 +627,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) { struct page_cgroup *pc; - struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); /* can happen while we handle swapcache. */ - if (list_empty(&pc->lru) || !pc->mem_cgroup) + if (!TestClearPageCgroupAcctLRU(pc)) return; + VM_BUG_ON(!pc->mem_cgroup); /* * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. */ mz = page_cgroup_zoneinfo(pc); - mem = pc->mem_cgroup; MEM_CGROUP_ZSTAT(mz, lru) -= 1; + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); return; } @@ -410,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) * For making pc->mem_cgroup visible, insert smp_rmb() here. */ smp_rmb(); - /* unused page is not rotated. */ - if (!PageCgroupUsed(pc)) + /* unused or root page is not rotated. */ + if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -425,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); + VM_BUG_ON(PageCgroupAcctLRU(pc)); /* * Used bit is set without atomic ops but after smp_wmb(). * For making pc->mem_cgroup visible, insert smp_rmb() here. @@ -435,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) mz = page_cgroup_zoneinfo(pc); MEM_CGROUP_ZSTAT(mz, lru) += 1; + SetPageCgroupAcctLRU(pc); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; list_add(&pc->lru, &mz->lists[lru]); } @@ -469,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && list_empty(&pc->lru)) + if (PageLRU(page) && !PageCgroupAcctLRU(pc)) mem_cgroup_add_lru_list(page, page_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -648,7 +910,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int nid = z->zone_pgdat->node_id; int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * !!file + !!active; + int lru = LRU_FILE * file + active; int ret; BUG_ON(!mem_cont); @@ -855,28 +1117,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * If shrink==true, for avoiding to free too much, this returns immedieately. */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, - gfp_t gfp_mask, bool noswap, bool shrink) + struct zone *zone, + gfp_t gfp_mask, + unsigned long reclaim_options) { struct mem_cgroup *victim; int ret, total = 0; int loop = 0; + bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; + bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; + bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; + unsigned long excess = mem_cgroup_get_excess(root_mem); /* If memsw_is_minimum==1, swap-out is of-no-use. */ if (root_mem->memsw_is_minimum) noswap = true; - while (loop < 2) { + while (1) { victim = mem_cgroup_select_victim(root_mem); - if (victim == root_mem) + if (victim == root_mem) { loop++; + if (loop >= 2) { + /* + * If we have not been able to reclaim + * anything, it might because there are + * no reclaimable pages under this hierarchy + */ + if (!check_soft || !total) { + css_put(&victim->css); + break; + } + /* + * We want to do more targetted reclaim. + * excess >> 2 is not to excessive so as to + * reclaim too much, nor too less that we keep + * coming back to reclaim from this cgroup + */ + if (total >= (excess >> 2) || + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { + css_put(&victim->css); + break; + } + } + } if (!mem_cgroup_local_usage(&victim->stat)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; } /* we use swappiness of local cgroup */ - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, - get_swappiness(victim)); + if (check_soft) + ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, + noswap, get_swappiness(victim), zone, + zone->zone_pgdat->node_id); + else + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, + noswap, get_swappiness(victim)); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -886,7 +1182,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (shrink) return ret; total += ret; - if (mem_cgroup_check_under_limit(root_mem)) + if (check_soft) { + if (res_counter_check_under_soft_limit(&root_mem->res)) + return total; + } else if (mem_cgroup_check_under_limit(root_mem)) return 1 + total; } return total; @@ -965,7 +1264,7 @@ done: */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcg, - bool oom) + bool oom, struct page *page) { struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -996,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, VM_BUG_ON(css_is_removed(&mem->css)); while (1) { - int ret; - bool noswap = false; + int ret = 0; + unsigned long flags = 0; + if (mem_cgroup_is_root(mem)) + goto done; ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); if (likely(!ret)) { if (!do_swap_account) @@ -1009,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, break; /* mem+swap counter fails */ res_counter_uncharge(&mem->res, PAGE_SIZE); - noswap = true; + flags |= MEM_CGROUP_RECLAIM_NOSWAP; mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); } else @@ -1020,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) goto nomem; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, - noswap, false); + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, + gfp_mask, flags); if (ret) continue; @@ -1046,13 +1347,19 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto nomem; } } + /* + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. + */ + if (mem_cgroup_soft_limit_check(mem)) + mem_cgroup_update_tree(mem, page); +done: return 0; nomem: css_put(&mem->css); return -ENOMEM; } - /* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). The caller must check css_is_removed() or some if @@ -1119,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } css_put(&mem->css); return; } + pc->mem_cgroup = mem; + /* + * We access a page_cgroup asynchronously without lock_page_cgroup(). + * Especially when a page_cgroup is taken from a page, pc->mem_cgroup + * is accessed after testing USED bit. To make pc->mem_cgroup visible + * before USED bit, we need memory barrier here. + * See mem_cgroup_add_lru_list(), etc. + */ smp_wmb(); - pc->flags = pcg_default_flags[ctype]; + switch (ctype) { + case MEM_CGROUP_CHARGE_TYPE_CACHE: + case MEM_CGROUP_CHARGE_TYPE_SHMEM: + SetPageCgroupCache(pc); + SetPageCgroupUsed(pc); + break; + case MEM_CGROUP_CHARGE_TYPE_MAPPED: + ClearPageCgroupCache(pc); + SetPageCgroupUsed(pc); + break; + default: + break; + } mem_cgroup_charge_statistics(mem, pc, true); @@ -1178,7 +1507,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, if (pc->mem_cgroup != from) goto out; - res_counter_uncharge(&from->res, PAGE_SIZE); + if (!mem_cgroup_is_root(from)) + res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); page = pc->page; @@ -1197,7 +1527,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, 1); } - if (do_swap_account) + if (do_swap_account && !mem_cgroup_is_root(from)) res_counter_uncharge(&from->memsw, PAGE_SIZE); css_put(&from->css); @@ -1207,6 +1537,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ret = 0; out: unlock_page_cgroup(pc); + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and it's garanteed that + * "to" is never removed. So, we don't check rmdir status here. + */ return ret; } @@ -1232,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); if (ret || !parent) return ret; @@ -1262,9 +1598,11 @@ uncharge: /* drop extra refcnt by try_charge() */ css_put(&parent->css); /* uncharge if move fails */ - res_counter_uncharge(&parent->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&parent->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(parent)) { + res_counter_uncharge(&parent->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&parent->memsw, PAGE_SIZE); + } return ret; } @@ -1289,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, prefetchw(pc); mem = memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); if (ret || !mem) return ret; @@ -1408,14 +1746,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, page); } static void @@ -1428,6 +1766,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); __mem_cgroup_commit_charge(ptr, pc, ctype); @@ -1452,13 +1791,19 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * This recorded memcg can be obsolete one. So, avoid * calling css_tryget */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ - + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&ptr->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) @@ -1473,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } css_put(&mem->css); } @@ -1527,9 +1874,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account && + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) + mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); ClearPageCgroupUsed(pc); @@ -1543,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); + if (mem_cgroup_soft_limit_check(mem)) + mem_cgroup_update_tree(mem, page); /* at swapout, this memcg will be accessed to record to swap */ if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) css_put(&mem->css); @@ -1618,7 +1972,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. * This memcg can be obsolete one. We avoid calling css_tryget */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } rcu_read_unlock(); @@ -1647,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) unlock_page_cgroup(pc); if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + page); css_put(&mem->css); } *ptr = mem; @@ -1664,7 +2021,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, if (!mem) return; - + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { target = oldpage; @@ -1704,6 +2061,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, */ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) mem_cgroup_uncharge_page(target); + /* + * At migration, we may charge account against cgroup which has no tasks + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&mem->css); } /* @@ -1781,8 +2144,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, - false, true); + progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, + GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -1834,7 +2198,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); + mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -1845,6 +2211,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, int nid, + int zid) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(nid, zid); + /* + * This loop can run a while, specially if mem_cgroup's continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, + gfp_mask, + MEM_CGROUP_RECLAIM_SOFT); + nr_reclaimed += reclaimed; + spin_lock(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) { + do { + /* + * Loop until we find yet another one. + * + * By the time we get the soft_limit lock + * again, someone might have aded the + * group back on the RB tree. Iterate to + * make sure we get a different mem. + * mem_cgroup_largest_soft_limit_node returns + * NULL if no other cgroup is present on + * the tree + */ + next_mz = + __mem_cgroup_largest_soft_limit_node(mctz); + if (next_mz == mz) { + css_put(&next_mz->mem->css); + next_mz = NULL; + } else /* next_mz == NULL or other memcg */ + break; + } while (1); + } + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->mem->res); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + css_put(&mz->mem->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. + */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->mem->css); + return nr_reclaimed; +} + /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. @@ -1973,7 +2430,7 @@ try_to_free: if (!progress) { nr_retries--; /* maybe some writeback is necessary */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } } @@ -2029,20 +2486,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, return retval; } +struct mem_cgroup_idx_data { + s64 val; + enum mem_cgroup_stat_index idx; +}; + +static int +mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) +{ + struct mem_cgroup_idx_data *d = data; + d->val += mem_cgroup_read_stat(&mem->stat, d->idx); + return 0; +} + +static void +mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx, s64 *val) +{ + struct mem_cgroup_idx_data d; + d.idx = idx; + d.val = 0; + mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); + *val = d.val; +} + static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - u64 val = 0; + u64 idx_val, val; int type, name; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - val = res_counter_read_u64(&mem->res, name); + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_CACHE, &idx_val); + val = idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_RSS, &idx_val); + val += idx_val; + val <<= PAGE_SHIFT; + } else + val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - val = res_counter_read_u64(&mem->memsw, name); + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_CACHE, &idx_val); + val = idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_RSS, &idx_val); + val += idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_SWAPOUT, &idx_val); + val <<= PAGE_SHIFT; + } else + val = res_counter_read_u64(&mem->memsw, name); break; default: BUG(); @@ -2066,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, name = MEMFILE_ATTR(cft->private); switch (name) { case RES_LIMIT: + if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ + ret = -EINVAL; + break; + } /* This function does all necessary parse...reuse it */ ret = res_counter_memparse_write_strategy(buffer, &val); if (ret) @@ -2075,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else ret = mem_cgroup_resize_memsw_limit(memcg, val); break; + case RES_SOFT_LIMIT: + ret = res_counter_memparse_write_strategy(buffer, &val); + if (ret) + break; + /* + * For memsw, soft limits are hard to implement in terms + * of semantics, for now, we support soft limits for + * control without swap + */ + if (type == _MEM) + ret = res_counter_set_soft_limit(&memcg->res, val); + else + ret = -EINVAL; + break; default: ret = -EINVAL; /* should be BUG() ? */ break; @@ -2132,6 +2651,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) res_counter_reset_failcnt(&mem->memsw); break; } + return 0; } @@ -2143,6 +2663,7 @@ enum { MCS_MAPPED_FILE, MCS_PGPGIN, MCS_PGPGOUT, + MCS_SWAP, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -2164,6 +2685,7 @@ struct { {"mapped_file", "total_mapped_file"}, {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, + {"swap", "total_swap"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -2188,6 +2710,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_PGPGIN] += val; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); s->stat[MCS_PGPGOUT] += val; + if (do_swap_account) { + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); + s->stat[MCS_SWAP] += val * PAGE_SIZE; + } /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); @@ -2219,8 +2745,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) + for (i = 0; i < NR_MCS_STAT; i++) { + if (i == MCS_SWAP && !do_swap_account) + continue; cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); + } /* Hierarchical information */ { @@ -2233,9 +2762,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_total_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) + for (i = 0; i < NR_MCS_STAT; i++) { + if (i == MCS_SWAP && !do_swap_account) + continue; cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); - + } #ifdef CONFIG_DEBUG_VM cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); @@ -2328,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_read, }, { + .name = "soft_limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, + { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, @@ -2421,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) mz = &pn->zoneinfo[zone]; for_each_lru(l) INIT_LIST_HEAD(&mz->lists[l]); + mz->usage_in_excess = 0; + mz->on_tree = false; + mz->mem = mem; } return 0; } @@ -2466,6 +3006,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) { int node; + mem_cgroup_remove_from_trees(mem); free_css_id(&mem_cgroup_subsys, &mem->css); for_each_node_state(node, N_POSSIBLE) @@ -2514,6 +3055,31 @@ static void __init enable_swap_cgroup(void) } #endif +static int mem_cgroup_soft_limit_tree_init(void) +{ + struct mem_cgroup_tree_per_node *rtpn; + struct mem_cgroup_tree_per_zone *rtpz; + int tmp, node, zone; + + for_each_node_state(node, N_POSSIBLE) { + tmp = node; + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); + if (!rtpn) + return 1; + + soft_limit_tree.rb_tree_per_node[node] = rtpn; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + } + return 0; +} + static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { @@ -2528,10 +3094,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; + /* root ? */ if (cont->parent == NULL) { enable_swap_cgroup(); parent = NULL; + root_mem_cgroup = mem; + if (mem_cgroup_soft_limit_tree_init()) + goto free_out; + } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; @@ -2560,6 +3131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &mem->css; free_out: __mem_cgroup_free(mem); + root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -2595,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p) + struct task_struct *p, + bool threadgroup) { mutex_lock(&memcg_tasklist); /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c new file mode 100644 index 000000000000..729d4b15b645 --- /dev/null +++ b/mm/memory-failure.c @@ -0,0 +1,832 @@ +/* + * Copyright (C) 2008, 2009 Intel Corporation + * Authors: Andi Kleen, Fengguang Wu + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + * + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a 2bit ECC memory or cache + * failure. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronous to other VM + * users, because memory failures could happen anytime and anywhere, + * possibly violating some of their assumptions. This is why this code + * has to be extremely careful. Generally it tries to use normal locking + * rules, as in get the standard locks, even if that means the + * error handling takes potentially a long time. + * + * The operation to map back from RMAP chains to processes has to walk + * the complete process list and has non linear complexity with the number + * mappings. In short it can be quite slow. But since memory corruptions + * are rare we hope to get away with this. + */ + +/* + * Notebook: + * - hugetlb needs more code + * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages + * - pass bad pages to kdump next kernel + */ +#define DEBUG 1 /* remove me in 2.6.34 */ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/sched.h> +#include <linux/rmap.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/backing-dev.h> +#include "internal.h" + +int sysctl_memory_failure_early_kill __read_mostly = 0; + +int sysctl_memory_failure_recovery __read_mostly = 1; + +atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); + +/* + * Send all the processes who have the page mapped an ``action optional'' + * signal. + */ +static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, + unsigned long pfn) +{ + struct siginfo si; + int ret; + + printk(KERN_ERR + "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", + pfn, t->comm, t->pid); + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_MCEERR_AO; + si.si_addr = (void *)addr; +#ifdef __ARCH_SI_TRAPNO + si.si_trapno = trapno; +#endif + si.si_addr_lsb = PAGE_SHIFT; + /* + * Don't use force here, it's convenient if the signal + * can be temporarily blocked. + * This could cause a loop when the user sets SIGBUS + * to SIG_IGN, but hopefully noone will do that? + */ + ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + if (ret < 0) + printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); + return ret; +} + +/* + * Kill all processes that have a poisoned page mapped and then isolate + * the page. + * + * General strategy: + * Find all processes having the page mapped and kill them. + * But we keep a page reference around so that the page is not + * actually freed yet. + * Then stash the page away + * + * There's no convenient way to get back to mapped processes + * from the VMAs. So do a brute-force search over all + * running processes. + * + * Remember that machine checks are not common (or rather + * if they are common you have other problems), so this shouldn't + * be a performance issue. + * + * Also there are some races possible while we get from the + * error detection to actually handle it. + */ + +struct to_kill { + struct list_head nd; + struct task_struct *tsk; + unsigned long addr; + unsigned addr_valid:1; +}; + +/* + * Failure handling: if we can't find or can't kill a process there's + * not much we can do. We just print a message and ignore otherwise. + */ + +/* + * Schedule a process for later kill. + * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * TBD would GFP_NOIO be enough? + */ +static void add_to_kill(struct task_struct *tsk, struct page *p, + struct vm_area_struct *vma, + struct list_head *to_kill, + struct to_kill **tkc) +{ + struct to_kill *tk; + + if (*tkc) { + tk = *tkc; + *tkc = NULL; + } else { + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); + if (!tk) { + printk(KERN_ERR + "MCE: Out of memory while machine check handling\n"); + return; + } + } + tk->addr = page_address_in_vma(p, vma); + tk->addr_valid = 1; + + /* + * In theory we don't have to kill when the page was + * munmaped. But it could be also a mremap. Since that's + * likely very rare kill anyways just out of paranoia, but use + * a SIGKILL because the error is not contained anymore. + */ + if (tk->addr == -EFAULT) { + pr_debug("MCE: Unable to find user space address %lx in %s\n", + page_to_pfn(p), tsk->comm); + tk->addr_valid = 0; + } + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Kill the processes that have been collected earlier. + * + * Only do anything when DOIT is set, otherwise just free the list + * (this is used for clean pages which do not need killing) + * Also when FAIL is set do a force kill because something went + * wrong earlier. + */ +static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, + int fail, unsigned long pfn) +{ + struct to_kill *tk, *next; + + list_for_each_entry_safe (tk, next, to_kill, nd) { + if (doit) { + /* + * In case something went wrong with munmaping + * make sure the process doesn't catch the + * signal and then access the memory. Just kill it. + * the signal handlers + */ + if (fail || tk->addr_valid == 0) { + printk(KERN_ERR + "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); + force_sig(SIGKILL, tk->tsk); + } + + /* + * In theory the process could have mapped + * something else on the address in-between. We could + * check for that, but we need to tell the + * process anyways. + */ + else if (kill_proc_ao(tk->tsk, tk->addr, trapno, + pfn) < 0) + printk(KERN_ERR + "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); + } + put_task_struct(tk->tsk); + kfree(tk); + } +} + +static int task_early_kill(struct task_struct *tsk) +{ + if (!tsk->mm) + return 0; + if (tsk->flags & PF_MCE_PROCESS) + return !!(tsk->flags & PF_MCE_EARLY); + return sysctl_memory_failure_early_kill; +} + +/* + * Collect processes when the error hit an anonymous page. + */ +static void collect_procs_anon(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct anon_vma *av; + + read_lock(&tasklist_lock); + av = page_lock_anon_vma(page); + if (av == NULL) /* Not actually mapped anymore */ + goto out; + for_each_process (tsk) { + if (!task_early_kill(tsk)) + continue; + list_for_each_entry (vma, &av->head, anon_vma_node) { + if (!page_mapped_in_vma(page, vma)) + continue; + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + page_unlock_anon_vma(av); +out: + read_unlock(&tasklist_lock); +} + +/* + * Collect processes when the error hit a file mapped page. + */ +static void collect_procs_file(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct prio_tree_iter iter; + struct address_space *mapping = page->mapping; + + /* + * A note on the locking order between the two locks. + * We don't rely on this particular order. + * If you have some other code that needs a different order + * feel free to switch them around. Or add a reverse link + * from mm_struct to task_struct, then this could be all + * done without taking tasklist_lock and looping over all tasks. + */ + + read_lock(&tasklist_lock); + spin_lock(&mapping->i_mmap_lock); + for_each_process(tsk) { + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!task_early_kill(tsk)) + continue; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + pgoff) { + /* + * Send early kill signal to tasks where a vma covers + * the page but the corrupted page is not necessarily + * mapped it in its pte. + * Assume applications who requested early kill want + * to be informed of all such data corruptions. + */ + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + spin_unlock(&mapping->i_mmap_lock); + read_unlock(&tasklist_lock); +} + +/* + * Collect the processes who have the corrupted page mapped to kill. + * This is done in two steps for locking reasons. + * First preallocate one tokill structure outside the spin locks, + * so that we can kill at least one process reasonably reliable. + */ +static void collect_procs(struct page *page, struct list_head *tokill) +{ + struct to_kill *tk; + + if (!page->mapping) + return; + + tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); + if (!tk) + return; + if (PageAnon(page)) + collect_procs_anon(page, tokill, &tk); + else + collect_procs_file(page, tokill, &tk); + kfree(tk); +} + +/* + * Error handlers for various types of pages. + */ + +enum outcome { + FAILED, /* Error handling failed */ + DELAYED, /* Will be handled later */ + IGNORED, /* Error safely ignored */ + RECOVERED, /* Successfully recovered */ +}; + +static const char *action_name[] = { + [FAILED] = "Failed", + [DELAYED] = "Delayed", + [IGNORED] = "Ignored", + [RECOVERED] = "Recovered", +}; + +/* + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we + * could be more sophisticated. + */ +static int me_kernel(struct page *p, unsigned long pfn) +{ + return DELAYED; +} + +/* + * Already poisoned page. + */ +static int me_ignore(struct page *p, unsigned long pfn) +{ + return IGNORED; +} + +/* + * Page in unknown state. Do nothing. + */ +static int me_unknown(struct page *p, unsigned long pfn) +{ + printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + return FAILED; +} + +/* + * Free memory + */ +static int me_free(struct page *p, unsigned long pfn) +{ + return DELAYED; +} + +/* + * Clean (or cleaned) page cache page. + */ +static int me_pagecache_clean(struct page *p, unsigned long pfn) +{ + int err; + int ret = FAILED; + struct address_space *mapping; + + if (!isolate_lru_page(p)) + page_cache_release(p); + + /* + * For anonymous pages we're done the only reference left + * should be the one m_f() holds. + */ + if (PageAnon(p)) + return RECOVERED; + + /* + * Now truncate the page in the page cache. This is really + * more like a "temporary hole punch" + * Don't do this for block devices when someone else + * has a reference, because it could be file system metadata + * and that's not safe to truncate. + */ + mapping = page_mapping(p); + if (!mapping) { + /* + * Page has been teared down in the meanwhile + */ + return FAILED; + } + + /* + * Truncation is a bit tricky. Enable it per file system for now. + * + * Open: to take i_mutex or not for this? Right now we don't. + */ + if (mapping->a_ops->error_remove_page) { + err = mapping->a_ops->error_remove_page(mapping, p); + if (err != 0) { + printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_debug("MCE %#lx: failed to release buffers\n", pfn); + } else { + ret = RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = RECOVERED; + else + printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", + pfn); + } + return ret; +} + +/* + * Dirty cache page page + * Issues: when the error hit a hole page the error is not properly + * propagated. + */ +static int me_pagecache_dirty(struct page *p, unsigned long pfn) +{ + struct address_space *mapping = page_mapping(p); + + SetPageError(p); + /* TBD: print more information about the file. */ + if (mapping) { + /* + * IO error will be reported by write(), fsync(), etc. + * who check the mapping. + * This way the application knows that something went + * wrong with its dirty file data. + * + * There's one open issue: + * + * The EIO will be only reported on the next IO + * operation and then cleared through the IO map. + * Normally Linux has two mechanisms to pass IO error + * first through the AS_EIO flag in the address space + * and then through the PageError flag in the page. + * Since we drop pages on memory failure handling the + * only mechanism open to use is through AS_AIO. + * + * This has the disadvantage that it gets cleared on + * the first operation that returns an error, while + * the PageError bit is more sticky and only cleared + * when the page is reread or dropped. If an + * application assumes it will always get error on + * fsync, but does other operations on the fd before + * and the page is dropped inbetween then the error + * will not be properly reported. + * + * This can already happen even without hwpoisoned + * pages: first on metadata IO errors (which only + * report through AS_EIO) or when the page is dropped + * at the wrong time. + * + * So right now we assume that the application DTRT on + * the first EIO, but we're not worse than other parts + * of the kernel. + */ + mapping_set_error(mapping, EIO); + } + + return me_pagecache_clean(p, pfn); +} + +/* + * Clean and dirty swap cache. + * + * Dirty swap cache page is tricky to handle. The page could live both in page + * cache and swap cache(ie. page is freshly swapped in). So it could be + * referenced concurrently by 2 types of PTEs: + * normal PTEs and swap PTEs. We try to handle them consistently by calling + * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, + * and then + * - clear dirty bit to prevent IO + * - remove from LRU + * - but keep in the swap cache, so that when we return to it on + * a later page fault, we know the application is accessing + * corrupted data and shall be killed (we installed simple + * interception code in do_swap_page to catch it). + * + * Clean swap cache pages can be directly isolated. A later page fault will + * bring in the known good data from disk. + */ +static int me_swapcache_dirty(struct page *p, unsigned long pfn) +{ + int ret = FAILED; + + ClearPageDirty(p); + /* Trigger EIO in shmem: */ + ClearPageUptodate(p); + + if (!isolate_lru_page(p)) { + page_cache_release(p); + ret = DELAYED; + } + + return ret; +} + +static int me_swapcache_clean(struct page *p, unsigned long pfn) +{ + int ret = FAILED; + + if (!isolate_lru_page(p)) { + page_cache_release(p); + ret = RECOVERED; + } + delete_from_swap_cache(p); + return ret; +} + +/* + * Huge pages. Needs work. + * Issues: + * No rmap support so we cannot find the original mapper. In theory could walk + * all MMs and look for the mappings, but that would be non atomic and racy. + * Need rmap for hugepages for this. Alternatively we could employ a heuristic, + * like just walking the current process and hoping it has it mapped (that + * should be usually true for the common "shared database cache" case) + * Should handle free huge pages and dequeue them too, but this needs to + * handle huge page accounting correctly. + */ +static int me_huge_page(struct page *p, unsigned long pfn) +{ + return FAILED; +} + +/* + * Various page states we can handle. + * + * A page state is defined by its current page->flags bits. + * The table matches them in order and calls the right handler. + * + * This is quite tricky because we can access page at any time + * in its live cycle, so all accesses have to be extremly careful. + * + * This is not complete. More states could be added. + * For any missing state don't attempt recovery. + */ + +#define dirty (1UL << PG_dirty) +#define sc (1UL << PG_swapcache) +#define unevict (1UL << PG_unevictable) +#define mlock (1UL << PG_mlocked) +#define writeback (1UL << PG_writeback) +#define lru (1UL << PG_lru) +#define swapbacked (1UL << PG_swapbacked) +#define head (1UL << PG_head) +#define tail (1UL << PG_tail) +#define compound (1UL << PG_compound) +#define slab (1UL << PG_slab) +#define buddy (1UL << PG_buddy) +#define reserved (1UL << PG_reserved) + +static struct page_state { + unsigned long mask; + unsigned long res; + char *msg; + int (*action)(struct page *p, unsigned long pfn); +} error_states[] = { + { reserved, reserved, "reserved kernel", me_ignore }, + { buddy, buddy, "free kernel", me_free }, + + /* + * Could in theory check if slab page is free or if we can drop + * currently unused objects without touching them. But just + * treat it as standard kernel for now. + */ + { slab, slab, "kernel slab", me_kernel }, + +#ifdef CONFIG_PAGEFLAGS_EXTENDED + { head, head, "huge", me_huge_page }, + { tail, tail, "huge", me_huge_page }, +#else + { compound, compound, "huge", me_huge_page }, +#endif + + { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "swapcache", me_swapcache_clean }, + + { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, + { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT + { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, + { mlock, mlock, "mlocked LRU", me_pagecache_clean }, +#endif + + { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru, "clean LRU", me_pagecache_clean }, + { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, + + /* + * Catchall entry: must be at end. + */ + { 0, 0, "unknown page state", me_unknown }, +}; + +#undef lru + +static void action_result(unsigned long pfn, char *msg, int result) +{ + struct page *page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + + printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", + pfn, + page && PageDirty(page) ? "dirty " : "", + msg, action_name[result]); +} + +static int page_action(struct page_state *ps, struct page *p, + unsigned long pfn, int ref) +{ + int result; + + result = ps->action(p, pfn); + action_result(pfn, ps->msg, result); + if (page_count(p) != 1 + ref) + printk(KERN_ERR + "MCE %#lx: %s page still referenced by %d users\n", + pfn, ps->msg, page_count(p) - 1); + + /* Could do more checks here if page looks ok */ + /* + * Could adjust zone counters here to correct for the missing page. + */ + + return result == RECOVERED ? 0 : -EBUSY; +} + +#define N_UNMAP_TRIES 5 + +/* + * Do all that is necessary to remove user space mappings. Unmap + * the pages and send SIGBUS to the processes if the data was dirty. + */ +static void hwpoison_user_mappings(struct page *p, unsigned long pfn, + int trapno) +{ + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + struct address_space *mapping; + LIST_HEAD(tokill); + int ret; + int i; + int kill = 1; + + if (PageReserved(p) || PageCompound(p) || PageSlab(p)) + return; + + if (!PageLRU(p)) + lru_add_drain_all(); + + /* + * This check implies we don't kill processes if their pages + * are in the swap cache early. Those are always late kills. + */ + if (!page_mapped(p)) + return; + + if (PageSwapCache(p)) { + printk(KERN_ERR + "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + ttu |= TTU_IGNORE_HWPOISON; + } + + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + */ + mapping = page_mapping(p); + if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(p)) { + SetPageDirty(p); + } else { + kill = 0; + ttu |= TTU_IGNORE_HWPOISON; + printk(KERN_INFO + "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + /* + * First collect all the processes that have the page + * mapped in dirty form. This has to be done before try_to_unmap, + * because ttu takes the rmap data structures down. + * + * Error handling: We ignore errors here because + * there's nothing that can be done. + */ + if (kill) + collect_procs(p, &tokill); + + /* + * try_to_unmap can fail temporarily due to races. + * Try a few times (RED-PEN better strategy?) + */ + for (i = 0; i < N_UNMAP_TRIES; i++) { + ret = try_to_unmap(p, ttu); + if (ret == SWAP_SUCCESS) + break; + pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); + } + + if (ret != SWAP_SUCCESS) + printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(p)); + + /* + * Now that the dirty bit has been propagated to the + * struct page and all unmaps done we can decide if + * killing is needed or not. Only kill when the page + * was dirty, otherwise the tokill list is merely + * freed. When there was a problem unmapping earlier + * use a more force-full uncatchable kill to prevent + * any accesses to the poisoned memory. + */ + kill_procs_ao(&tokill, !!PageDirty(p), trapno, + ret != SWAP_SUCCESS, pfn); +} + +int __memory_failure(unsigned long pfn, int trapno, int ref) +{ + struct page_state *ps; + struct page *p; + int res; + + if (!sysctl_memory_failure_recovery) + panic("Memory failure from trap %d on page %lx", trapno, pfn); + + if (!pfn_valid(pfn)) { + action_result(pfn, "memory outside kernel control", IGNORED); + return -EIO; + } + + p = pfn_to_page(pfn); + if (TestSetPageHWPoison(p)) { + action_result(pfn, "already hardware poisoned", IGNORED); + return 0; + } + + atomic_long_add(1, &mce_bad_pages); + + /* + * We need/can do nothing about count=0 pages. + * 1) it's a free page, and therefore in safe hand: + * prep_new_page() will be the gate keeper. + * 2) it's part of a non-compound high order page. + * Implies some kernel user: cannot stop them from + * R/W the page; let's pray that the page has been + * used and will be freed some time later. + * In fact it's dangerous to directly bump up page count from 0, + * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + */ + if (!get_page_unless_zero(compound_head(p))) { + action_result(pfn, "free or high order kernel", IGNORED); + return PageBuddy(compound_head(p)) ? 0 : -EBUSY; + } + + /* + * Lock the page and wait for writeback to finish. + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ + lock_page_nosync(p); + wait_on_page_writeback(p); + + /* + * Now take care of user space mappings. + */ + hwpoison_user_mappings(p, pfn, trapno); + + /* + * Torn down by someone else? + */ + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + action_result(pfn, "already truncated LRU", IGNORED); + res = 0; + goto out; + } + + res = -EBUSY; + for (ps = error_states;; ps++) { + if ((p->flags & ps->mask) == ps->res) { + res = page_action(ps, p, pfn, ref); + break; + } + } +out: + unlock_page(p); + return res; +} +EXPORT_SYMBOL_GPL(__memory_failure); + +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +void memory_failure(unsigned long pfn, int trapno) +{ + __memory_failure(pfn, trapno, 0); +} diff --git a/mm/memory.c b/mm/memory.c index f46ac18ba231..7e91b5f9f690 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -45,6 +45,7 @@ #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/module.h> #include <linux/delayacct.h> @@ -56,6 +57,7 @@ #include <linux/swapops.h> #include <linux/elf.h> +#include <asm/io.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s) } __setup("norandmaps", disable_randmaps); +unsigned long zero_pfn __read_mostly; +unsigned long highest_memmap_pfn __read_mostly; + +/* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +static int __init init_zero_pfn(void) +{ + zero_pfn = page_to_pfn(ZERO_PAGE(0)); + return 0; +} +core_initcall(init_zero_pfn); /* * If a p?d_bad entry is found while walking page tables, report @@ -135,11 +149,12 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_free_tlb(tlb, token); + pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } @@ -157,7 +172,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - free_pte_range(tlb, pmd); + free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -173,7 +188,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -206,7 +221,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* @@ -282,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr = vma->vm_start; /* - * Hide vma from rmap and vmtruncate before freeing pgtables + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); @@ -441,6 +457,20 @@ static inline int is_cow_mapping(unsigned int flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +#ifndef is_zero_pfn +static inline int is_zero_pfn(unsigned long pfn) +{ + return pfn == zero_pfn; +} +#endif + +#ifndef my_zero_pfn +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return zero_pfn; +} +#endif + /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -496,7 +526,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; - if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (!is_zero_pfn(pfn)) print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -518,6 +550,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } + if (is_zero_pfn(pfn)) + return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -595,8 +629,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page, vma, addr); - rss[!!PageAnon(page)]++; + page_dup_rmap(page); + rss[PageAnon(page)]++; } out_set_pte: @@ -1141,9 +1175,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; + page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) - goto bad_page; + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } if (flags & FOLL_GET) get_page(page); @@ -1171,65 +1210,46 @@ no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return page; - /* Fall through to ZERO_PAGE handling */ + no_page_table: /* * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate page tables. + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. */ - if (flags & FOLL_ANON) { - page = ZERO_PAGE(0); - if (flags & FOLL_GET) - get_page(page); - BUG_ON(flags & FOLL_WRITE); - } + if ((flags & FOLL_DUMP) && + (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); return page; } -/* Can we do the FOLL_ANON optimization? */ -static inline int use_zero_page(struct vm_area_struct *vma) -{ - /* - * We don't want to optimize FOLL_ANON for make_pages_present() - * when it tries to page in a VM_LOCKED region. As to VM_SHARED, - * we want to get the page from the page tables to make sure - * that we serialize and update with any other user of that - * mapping. - */ - if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) - return 0; - /* - * And if we have a fault routine, it's not an anonymous region. - */ - return !vma->vm_ops || !vma->vm_ops->fault; -} - - - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, unsigned int gup_flags, + struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags = 0; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); - int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); + unsigned long vm_flags; - if (len <= 0) + if (nr_pages <= 0) return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + /* * Require read or write permissions. - * If 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (gup_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (gup_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { struct vm_area_struct *vma; - unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -1241,7 +1261,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pte_t *pte; /* user gate pages are read-only */ - if (!ignore && write) + if (gup_flags & FOLL_WRITE) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1269,44 +1289,32 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = gate_vma; i++; start += PAGE_SIZE; - len--; + nr_pages--; continue; } if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - (!ignore && !(vm_flags & vma->vm_flags))) + !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &len, i, write); + &start, &nr_pages, i, gup_flags); continue; } - foll_flags = FOLL_TOUCH; - if (pages) - foll_flags |= FOLL_GET; - if (!write && use_zero_page(vma)) - foll_flags |= FOLL_ANON; - do { struct page *page; + unsigned int foll_flags = gup_flags; /* * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory, unless - * current is handling munlock--e.g., on exit. In - * that case, we are not allocating memory. Rather, - * we're only unlocking already resident/mapped pages. + * pages and potentially allocating memory. */ - if (unlikely(!ignore_sigkill && - fatal_signal_pending(current))) + if (unlikely(fatal_signal_pending(current))) return i ? i : -ERESTARTSYS; - if (write) - foll_flags |= FOLL_WRITE; - cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; @@ -1318,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - else if (ret & VM_FAULT_SIGBUS) + if (ret & + (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } @@ -1357,9 +1366,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = vma; i++; start += PAGE_SIZE; - len--; - } while (len && start < vma->vm_end); - } while (len); + nr_pages--; + } while (nr_pages && start < vma->vm_end); + } while (nr_pages); return i; } @@ -1368,7 +1377,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * @tsk: task_struct of target task * @mm: mm_struct of target mm * @start: starting user address - * @len: number of pages from start to pin + * @nr_pages: number of pages from start to pin * @write: whether pages will be written to by the caller * @force: whether to force write access even if user mapping is * readonly. This will result in the page being COWed even @@ -1380,7 +1389,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * Or NULL if the caller does not require them. * * Returns number of pages pinned. This may be fewer than the number - * requested. If len is 0 or negative, returns 0. If no pages + * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. Each page returned must be released * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. @@ -1414,23 +1423,50 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * See also get_user_pages_fast, for performance critical applications. */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - int flags = 0; + int flags = FOLL_TOUCH; + if (pages) + flags |= FOLL_GET; if (write) - flags |= GUP_FLAGS_WRITE; + flags |= FOLL_WRITE; if (force) - flags |= GUP_FLAGS_FORCE; + flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } - EXPORT_SYMBOL(get_user_pages); +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ + pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { @@ -1608,7 +1644,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather - * than insert_pfn). + * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP + * without pte special, it would there be refcounted as a normal page. */ if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { struct page *page; @@ -1974,7 +2011,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page)) { + if (PageAnon(old_page) && !PageKsm(old_page)) { if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); @@ -2075,10 +2112,19 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; - VM_BUG_ON(old_page == ZERO_PAGE(0)); - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!new_page) - goto oom; + + if (is_zero_pfn(pte_pfn(orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, address); + if (!new_page) + goto oom; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address, vma); + } + __SetPageUptodate(new_page); + /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. @@ -2088,8 +2134,6 @@ gotten: clear_page_mlock(old_page); unlock_page(old_page); } - cow_user_page(new_page, old_page, address, vma); - __SetPageUptodate(new_page); if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2115,9 +2159,14 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, address, page_table); + ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); - set_pte_at(mm, address, page_table, entry); + /* + * We call the notify macro here because, when using secondary + * mmu page tables (such as kvm shadow page tables), we want the + * new page to be mapped directly into the secondary page table. + */ + set_pte_at_notify(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { /* @@ -2360,7 +2409,7 @@ restart: * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE - * boundary. Note that this is different from vmtruncate(), which + * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded @@ -2411,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @offset: file offset to start truncating - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - if (inode->i_size < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - } else { - struct address_space *mapping = inode->i_mapping; - - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. - */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - i_size_write(inode, offset); - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; - -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -} -EXPORT_SYMBOL(vmtruncate); - int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) { struct address_space *mapping = inode->i_mapping; @@ -2512,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); - if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + } else if (is_hwpoison_entry(entry)) { + ret = VM_FAULT_HWPOISON; + } else { + print_bad_pte(vma, address, orig_pte, NULL); + ret = VM_FAULT_OOM; + } goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); @@ -2537,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + } else if (PageHWPoison(page)) { + ret = VM_FAULT_HWPOISON; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto out; } lock_page(page); @@ -2625,6 +2628,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; + if (!(flags & FAULT_FLAG_WRITE)) { + entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), + vma->vm_page_prot)); + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + if (!pte_none(*page_table)) + goto unlock; + goto setpte; + } + /* Allocate our own private page. */ pte_unmap(page_table); @@ -2639,13 +2652,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto release; + inc_mm_counter(mm, anon_rss); page_add_new_anon_rmap(page, vma, address); +setpte: set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ @@ -2700,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; + if (unlikely(PageHWPoison(vmf.page))) { + if (ret & VM_FAULT_LOCKED) + unlock_page(vmf.page); + return VM_FAULT_HWPOISON; + } + /* * For consistency in subsequent calls, make the faulted page always * locked. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4412a676c88..821dee596377 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -339,8 +339,11 @@ EXPORT_SYMBOL_GPL(__remove_pages); void online_page(struct page *page) { + unsigned long pfn = page_to_pfn(page); + totalram_pages++; - num_physpages++; + if (pfn >= num_physpages) + num_physpages = pfn + 1; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) @@ -410,7 +413,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (!populated_zone(zone)) need_zonelists_rebuild = 1; - ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, + ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { printk(KERN_DEBUG "online_pages %lx at %lx failed\n", @@ -422,6 +425,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; + zone_pcp_update(zone); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { @@ -701,7 +705,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, static void offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { - walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, + walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, offline_isolated_pages_cb); } @@ -727,7 +731,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) long offlined = 0; int ret; - ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, check_pages_isolated_cb); if (ret < 0) offlined = (long)ret; @@ -831,7 +835,6 @@ repeat: zone->present_pages -= offlined_pages; zone->zone_pgdat->node_present_pages -= offlined_pages; totalram_pages -= offlined_pages; - num_physpages -= offlined_pages; setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4da63a..7dd9d9f80694 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_semaphore for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) { - nodemask_t cpuset_context_nmask; int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; + /* Check N_HIGH_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); + nodes_and(nsc->mask2, *nodes, nsc->mask1); + if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else @@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) cpuset_current_mems_allowed; } - ret = mpol_ops[pol->mode].create(pol, - nodes ? &cpuset_context_nmask : NULL); + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } @@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, { struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + NODEMASK_SCRATCH(scratch); int ret; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); + if (!scratch) + return -ENOMEM; + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } /* * prevent changing our mempolicy while show_numa_maps() * is using it. @@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, if (mm) down_write(&mm->mmap_sem); task_lock(current); - ret = mpol_set_nodemask(new, nodes); + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); if (mm) up_write(&mm->mmap_sem); mpol_put(new); - return ret; + goto out; } old = current->mempolicy; current->mempolicy = new; @@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, up_write(&mm->mmap_sem); mpol_put(old); - return 0; + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; } /* @@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) return err; } - down_write(&mm->mmap_sem); - task_lock(current); - err = mpol_set_nodemask(new, nmask); - task_unlock(current); + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + } if (err) { - up_write(&mm->mmap_sem); mpol_put(new); return err; } @@ -1891,6 +1911,7 @@ restart: * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + if (!scratch) + return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) { mpol_put(mpol); /* drop our ref on sb mpol */ + NODEMASK_SCRATCH_FREE(scratch); return; /* no valid nodemask intersection */ } task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); mpol_put(mpol); /* drop our ref on sb mpol */ if (ret) { + NODEMASK_SCRATCH_FREE(scratch); mpol_put(new); return; } @@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ mpol_put(new); /* drop initial ref */ + NODEMASK_SCRATCH_FREE(scratch); } } @@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) err = 1; else { int ret; - - task_lock(current); - ret = mpol_set_nodemask(new, &nodes); - task_unlock(current); - if (ret) + NODEMASK_SCRATCH(scratch); + if (scratch) { + task_lock(current); + ret = mpol_set_nodemask(new, &nodes, scratch); + task_unlock(current); + } else + ret = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + if (ret) { err = 1; - else if (no_context) { + mpol_put(new); + } else if (no_context) { /* save for contextualization */ new->w.user_nodemask = nodes; } diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b4bb66..1a3bc3d4d554 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -303,18 +303,11 @@ EXPORT_SYMBOL(mempool_free_slab); */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t)(long)pool_data; + size_t size = (size_t)pool_data; return kmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); -void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t) pool_data; - return kzalloc(size, gfp_mask); -} -EXPORT_SYMBOL(mempool_kzalloc); - void mempool_kfree(void *element, void *pool_data) { kfree(element); diff --git a/mm/migrate.c b/mm/migrate.c index 939888f9ddab..1a4bf4813780 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); count++; } @@ -147,7 +149,7 @@ out: static void remove_file_migration_ptes(struct page *old, struct page *new) { struct vm_area_struct *vma; - struct address_space *mapping = page_mapping(new); + struct address_space *mapping = new->mapping; struct prio_tree_iter iter; pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + !!page_has_private(page); + expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - + if (PageSwapBacked(page)) { + __dec_zone_page_state(page, NR_SHMEM); + __inc_zone_page_state(newpage, NR_SHMEM); + } spin_unlock_irq(&mapping->tree_lock); return 0; @@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * needs to be effective. */ try_to_free_buffers(page); + goto rcu_unlock; } - goto rcu_unlock; + goto skip_unmap; } /* Establish migration ptes or remove ptes */ - try_to_unmap(page, 1); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); +skip_unmap: if (!page_mapped(page)) rc = move_to_new_page(newpage, page); @@ -693,6 +700,8 @@ unlock: * restored. */ list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); } @@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from, struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; int rc; + unsigned long flags; + + local_irq_save(flags); + list_for_each_entry(page, from, lru) + __inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + local_irq_restore(flags); if (!swapwrite) current->flags |= PF_SWAPWRITE; diff --git a/mm/mlock.c b/mm/mlock.c index 45eb650b9654..bd6f0e466f6c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -139,49 +139,36 @@ static void munlock_vma_page(struct page *page) } /** - * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. + * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address - * @mlock: 0 indicate munlock, otherwise mlock. * - * If @mlock == 0, unlock an mlocked range; - * else mlock the range of pages. This takes care of making the pages present , - * too. + * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * * vma->vm_mm->mmap_sem must be held for at least read. */ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) + unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; int ret = 0; - int gup_flags = 0; + int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON(start < vma->vm_start); VM_BUG_ON(end > vma->vm_end); - VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && - (atomic_read(&mm->mm_users) != 0)); - - /* - * mlock: don't page populate if vma has PROT_NONE permission. - * munlock: always do munlock although the vma has PROT_NONE - * permission, or SIGKILL is pending. - */ - if (!mlock) - gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | - GUP_FLAGS_IGNORE_SIGKILL; + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + gup_flags = FOLL_TOUCH | FOLL_GET; if (vma->vm_flags & VM_WRITE) - gup_flags |= GUP_FLAGS_WRITE; + gup_flags |= FOLL_WRITE; while (nr_pages > 0) { int i; @@ -201,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, * This can happen for, e.g., VM_NONLINEAR regions before * a page has been allocated and mapped at a given offset, * or for addresses that map beyond end of a file. - * We'll mlock the the pages if/when they get faulted in. + * We'll mlock the pages if/when they get faulted in. */ if (ret < 0) break; - if (ret == 0) { - /* - * We know the vma is there, so the only time - * we cannot get a single page should be an - * error (ret < 0) case. - */ - WARN_ON(1); - break; - } lru_add_drain(); /* push cached pages to LRU */ for (i = 0; i < ret; i++) { struct page *page = pages[i]; - lock_page(page); - /* - * Because we lock page here and migration is blocked - * by the elevated reference, we need only check for - * page truncation (file-cache only). - */ if (page->mapping) { - if (mlock) + /* + * That preliminary check is mainly to avoid + * the pointless overhead of lock_page on the + * ZERO_PAGE: which might bounce very badly if + * there is contention. However, we're still + * dirtying its cacheline with get/put_page: + * we'll add another __get_user_pages flag to + * avoid it if that case turns out to matter. + */ + lock_page(page); + /* + * Because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) mlock_vma_page(page); - else - munlock_vma_page(page); + unlock_page(page); } - unlock_page(page); - put_page(page); /* ref from get_user_pages() */ - - /* - * here we assume that get_user_pages() has given us - * a list of virtually contiguous pages. - */ - addr += PAGE_SIZE; /* for next get_user_pages() */ - nr_pages--; + put_page(page); /* ref from get_user_pages() */ } + + addr += ret * PAGE_SIZE; + nr_pages -= ret; ret = 0; } - return ret; /* count entire vma as locked_vm */ + return ret; /* 0 or negative error code */ } /* @@ -289,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end, 1); + __mlock_vma_pages_range(vma, start, end); /* Hide errors from mmap() and other callers */ return 0; @@ -310,7 +291,6 @@ no_mlock: return nr_pages; /* error or pages NOT mlocked */ } - /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -330,10 +310,38 @@ no_mlock: * free them. This will result in freeing mlocked pages. */ void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end) { + unsigned long addr; + + lru_add_drain(); vma->vm_flags &= ~VM_LOCKED; - __mlock_vma_pages_range(vma, start, end, 0); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + struct page *page; + /* + * Although FOLL_DUMP is intended for get_dump_page(), + * it just so happens that its special treatment of the + * ZERO_PAGE (returning an error instead of doing get_page) + * suits munlock very well (and if somehow an abnormal page + * has sneaked into the range, we won't oops here: great). + */ + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + if (page && !IS_ERR(page)) { + lock_page(page); + /* + * Like in __mlock_vma_pages_range(), + * because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } + cond_resched(); + } } /* @@ -400,18 +408,14 @@ success: * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - vma->vm_flags = newflags; if (lock) { - ret = __mlock_vma_pages_range(vma, start, end, 1); - - if (ret > 0) { - mm->locked_vm -= ret; - ret = 0; - } else - ret = __mlock_posix_error_return(ret); /* translate if needed */ + vma->vm_flags = newflags; + ret = __mlock_vma_pages_range(vma, start, end); + if (ret < 0) + ret = __mlock_posix_error_return(ret); } else { - __mlock_vma_pages_range(vma, start, end, 0); + munlock_vma_pages_range(vma, start, end); } out: diff --git a/mm/mmap.c b/mm/mmap.c index 34579b23ebd5..73f5e4b64010 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,7 +28,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to @@ -573,9 +570,9 @@ again: remove_next = 1 + (end > next->vm_end); /* * When changing only vma->vm_end, we don't really need - * anon_vma lock: but is that case worth optimizing out? + * anon_vma lock. */ - if (vma->anon_vma) + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) anon_vma = vma->anon_vma; if (anon_vma) { spin_lock(&anon_vma->lock); @@ -659,9 +656,6 @@ again: remove_next = 1 + (end > next->vm_end); validate_mm(mm); } -/* Flags that can be inherited from an existing mapping when merging */ -#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) - /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -669,7 +663,8 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) + /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ + if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) return 0; if (vma->vm_file != file) return 0; @@ -908,7 +903,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, #endif /* CONFIG_PROC_FS */ /* - * The caller must hold down_write(current->mm->mmap_sem). + * The caller must hold down_write(¤t->mm->mmap_sem). */ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, @@ -954,6 +949,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (mm->map_count > sysctl_max_map_count) return -ENOMEM; + if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + if (file) + return -EINVAL; + + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + /* Obtain the address to map to. we verify (or select) it and ensure * that it represents a valid section of the address space. */ @@ -968,11 +981,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - if (flags & MAP_LOCKED) { + if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; - vm_flags |= VM_LOCKED; - } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { @@ -1198,21 +1209,21 @@ munmap_back: goto unmap_and_free_vma; if (vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + pgoff = vma->vm_pgoff; + vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) goto free_vma; } - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - */ - addr = vma->vm_start; - pgoff = vma->vm_pgoff; - vm_flags = vma->vm_flags; - if (vma_wants_writenotify(vma)) vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); @@ -1223,7 +1234,7 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: - perf_counter_mmap(vma); + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); @@ -2114,6 +2125,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); @@ -2270,7 +2282,7 @@ static void special_mapping_close(struct vm_area_struct *vma) { } -static struct vm_operations_struct special_mapping_vmops = { +static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, }; @@ -2311,7 +2323,7 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; - perf_counter_mmap(vma); + perf_event_mmap(vma); return 0; } diff --git a/mm/mmu_context.c b/mm/mmu_context.c new file mode 100644 index 000000000000..ded9081f4021 --- /dev/null +++ b/mm/mmu_context.c @@ -0,0 +1,58 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * + * See ../COPYING for licensing terms. + */ + +#include <linux/mm.h> +#include <linux/mmu_context.h> +#include <linux/sched.h> + +#include <asm/mmu_context.h> + +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * Called by the retry thread execute retries within the + * iocb issuer's mm context, so that copy_from/to_user + * operations work seamlessly for aio. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + active_mm = tsk->active_mm; + if (active_mm != mm) { + atomic_inc(&mm->mm_count); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); + + if (active_mm != mm) + mmdrop(active_mm); +} + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5f4ef0250bee..7e33f2cb3c77 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, + pte_t pte) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->change_pte) + mn->ops->change_pte(mn, mm, address, pte); + /* + * Some drivers don't have change_pte, + * so we must call invalidate_page in that case. + */ + else if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } + rcu_read_unlock(); +} + void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address) { diff --git a/mm/mprotect.c b/mm/mprotect.c index d80311baeb2d..8bc969d8112d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,7 +23,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; - perf_counter_mmap(vma); + perf_event_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) diff --git a/mm/mremap.c b/mm/mremap.c index a39b7b91be46..97bff2547719 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -11,6 +11,7 @@ #include <linux/hugetlb.h> #include <linux/slab.h> #include <linux/shm.h> +#include <linux/ksm.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/capability.h> @@ -85,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock vmtruncate out, - * since it might clean the dst vma before the src vma, + * moving file-based ptes, we must lock truncate_pagecache + * out, since it might clean the dst vma before the src vma, * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; @@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; + int err; /* * We'd prefer to avoid failure later on in do_munmap: @@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + /* + * Advise KSM to break any KSM pages in the area to be moved: + * it would be confusing if they were to turn up at the new + * location, where they happen to coincide with different KSM + * pages recently unmapped. But leave vma->vm_flags as it was, + * so KSM can come around to merge on vma and new_vma afterwards. + */ + err = ksm_madvise(vma, old_addr, old_addr + old_len, + MADV_UNMERGEABLE, &vm_flags); + if (err) + return err; + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); if (!new_vma) diff --git a/mm/nommu.c b/mm/nommu.c index 2fd2ad5da98e..5189b5aed8c0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -33,6 +33,7 @@ #include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include "internal.h" static inline __attribute__((format(printf, 1, 2))) @@ -56,12 +57,11 @@ void no_printk(const char *fmt, ...) no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) #endif -#include "internal.h" - void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; +unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ @@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); @@ -82,50 +79,10 @@ static struct kmem_cache *vm_region_jar; struct rb_root nommu_region_tree = RB_ROOT; DECLARE_RWSEM(nommu_region_sem); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { }; /* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - i_size_write(inode, offset); - - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; - i_size_write(inode, offset); - -out_truncate: - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out: - return -EFBIG; -} - -EXPORT_SYMBOL(vmtruncate); - -/* * Return the total memory allocated for this pointer, not * just what the caller asked for. * @@ -173,30 +130,29 @@ unsigned int kobjsize(const void *objp) } int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, unsigned int foll_flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); /* calculate required read or write permissions. - * - if 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - for (i = 0; i < len; i++) { + for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) goto finish_or_fault; /* protect what we can, including chardevs */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP) || - (!ignore && !(vm_flags & vma->vm_flags))) + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) goto finish_or_fault; if (pages) { @@ -215,7 +171,6 @@ finish_or_fault: return i ? : -EFAULT; } - /* * get a list of pages in an address range belonging to the specified process * and indicate the VMA that covers each page @@ -224,22 +179,41 @@ finish_or_fault: * - don't permit access to VMAs that don't support it, such as I/O mappings */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = 0; if (write) - flags |= GUP_FLAGS_WRITE; + flags |= FOLL_WRITE; if (force) - flags |= GUP_FLAGS_FORCE; + flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } EXPORT_SYMBOL(get_user_pages); +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + *pfn = address >> PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(follow_pfn); + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; @@ -611,6 +585,22 @@ static void put_nommu_region(struct vm_region *region) } /* + * update protection on a vma + */ +static void protect_vma(struct vm_area_struct *vma, unsigned long flags) +{ +#ifdef CONFIG_MPU + struct mm_struct *mm = vma->vm_mm; + long start = vma->vm_start & PAGE_MASK; + while (start < vma->vm_end) { + protect_page(mm, start, flags); + start += PAGE_SIZE; + } + update_protections(mm); +#endif +} + +/* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous * page @@ -629,6 +619,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; vma->vm_mm = mm; + protect_vma(vma, vma->vm_flags); + /* add the VMA to the mapping */ if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -691,6 +683,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) kenter("%p", vma); + protect_vma(vma, 0); + mm->map_count--; if (mm->mmap_cache == vma) mm->mmap_cache = NULL; @@ -832,7 +826,7 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED || addr) { + if (flags & MAP_FIXED) { printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n", current->pid); @@ -903,6 +897,10 @@ static int validate_mmap_request(struct file *file, if (!file->f_op->read) capabilities &= ~BDI_CAP_MAP_COPY; + /* The file shall have been opened with read permission. */ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (flags & MAP_SHARED) { /* do checks for writing, appending and locking */ if ((prot & PROT_WRITE) && @@ -1036,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1053,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) */ static int do_mmap_private(struct vm_area_struct *vma, struct vm_region *region, - unsigned long len) + unsigned long len, + unsigned long capabilities) { struct page *pages; unsigned long total, point, n, rlen; @@ -1064,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (vma->vm_file) { + if (capabilities & BDI_CAP_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1183,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - if (!(flags & MAP_FIXED)) - addr = round_hint_to_min(addr); - /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, @@ -1195,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file, return ret; } + /* we ignore the address hint */ + addr = 0; + /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); @@ -1308,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (file && file->f_op->get_unmapped_area) { + if (capabilities & BDI_CAP_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR((void *) addr)) { @@ -1333,14 +1332,15 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = region; - /* set up the mapping */ + /* set up the mapping + * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, region, len); + ret = do_mmap_private(vma, region, len, capabilities); if (ret < 0) - goto error_put_region; - + goto error_just_free; add_nommu_region(region); /* okay... we have a mapping; now we have to register it */ @@ -1359,19 +1359,6 @@ share: kleave(" = %lx", result); return result; -error_put_region: - __put_nommu_region(region); - if (vma) { - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } - kmem_cache_free(vm_area_cachep, vma); - } - kleave(" = %d [pr]", ret); - return ret; - error_just_free: up_write(&nommu_region_sem); error: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a99..ea2147dabba6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks; static DEFINE_SPINLOCK(zone_scan_lock); /* #define DEBUG */ +/* + * Is all threads of the target process nodes overlap ours? + */ +static int has_intersects_mems_allowed(struct task_struct *tsk) +{ + struct task_struct *t; + + t = tsk; + do { + if (cpuset_mems_allowed_intersects(current, t)) + return 1; + t = next_thread(t); + } while (t != tsk); + + return 0; +} + /** * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate @@ -58,7 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; - int oom_adj; + int oom_adj = p->signal->oom_adj; + struct task_cputime task_time; + unsigned long utime; + unsigned long stime; + + if (oom_adj == OOM_DISABLE) + return 0; task_lock(p); mm = p->mm; @@ -66,11 +89,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } /* * The memory size of the process is the basis for the badness. @@ -85,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) /* * swapoff can easily use up all memory, so kill those first. */ - if (p->flags & PF_SWAPOFF) + if (p->flags & PF_OOM_ORIGIN) return ULONG_MAX; /* @@ -108,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * of seconds. There is no particular reason for this other than * that it turned out to work very well in practice. */ - cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) - >> (SHIFT_HZ + 3); + thread_group_cputime(p, &task_time); + utime = cputime_to_jiffies(task_time.utime); + stime = cputime_to_jiffies(task_time.stime); + cpu_time = (utime + stime) >> (SHIFT_HZ + 3); + if (uptime >= p->start_time.tv_sec) run_time = (uptime - p->start_time.tv_sec) >> 10; @@ -150,7 +171,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * because p may have allocated or otherwise mapped memory on * this node before. However it will be less likely. */ - if (!cpuset_mems_allowed_intersects(current, p)) + if (!has_intersects_mems_allowed(p)) points /= 8; /* @@ -206,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, static struct task_struct *select_bad_process(unsigned long *ppoints, struct mem_cgroup *mem) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; struct timespec uptime; *ppoints = 0; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) { + for_each_process(p) { unsigned long points; /* @@ -257,12 +278,15 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } + if (p->signal->oom_adj == OOM_DISABLE) + continue; + points = badness(p, uptime.tv_sec); - if (points > *ppoints) { + if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } - } while_each_thread(g, p); + } return chosen; } @@ -307,7 +331,8 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); + get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -325,8 +350,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); return; + } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -345,27 +373,18 @@ static void __oom_kill_task(struct task_struct *p, int verbose) static int oom_kill_task(struct task_struct *p) { - struct mm_struct *mm; - struct task_struct *g, *q; - - task_lock(p); - mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + if (!p->mm || p->signal->oom_adj == OOM_DISABLE) return 1; - } - task_unlock(p); - __oom_kill_task(p, 1); - /* - * kill all processes that share the ->mm (i.e. all threads), - * but are in a different thread group. Don't let them have access - * to memory reserves though, otherwise we might deplete all memory. - */ - do_each_thread(g, q) { - if (q->mm == mm && !same_thread_group(q, p)) - force_sig(SIGKILL, q); - } while_each_thread(g, q); + __oom_kill_task(p, 1); return 0; } @@ -377,11 +396,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - task_lock(current); printk(KERN_WARNING "%s invoked oom-killer: " "gfp_mask=0x%x, order=%d, oom_adj=%d\n", current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); + current->signal->oom_adj); + task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -394,9 +413,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { + if (p->flags & PF_EXITING) { __oom_kill_task(p, 0); return 0; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea4935b..a3b14090b1fb 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,15 +36,6 @@ #include <linux/pagevec.h> /* - * The maximum number of pages to writeout in a single bdflush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 - -/* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited * will look to see if it needs to force writeback or throttling. */ @@ -53,18 +44,21 @@ static long ratelimit_pages = 32; /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * It should be somewhat larger than dirtied pages to ensure that reasonably * large amounts of I/O are submitted. */ -static inline long sync_writeback_pages(void) +static inline long sync_writeback_pages(unsigned long dirtied) { - return ratelimit_pages + ratelimit_pages / 2; + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; } /* The following parameters are exported via /proc/sys/vm */ /* - * Start background writeback (via pdflush) at this percentage + * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; @@ -117,8 +111,6 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -static void background_writeout(unsigned long _min_pages); - /* * Scale the writeback cache size proportional to the relative writeout speeds. * @@ -166,37 +158,37 @@ static void update_completion_period(void) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { update_completion_period(); vm_dirty_bytes = 0; @@ -206,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, int dirty_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { update_completion_period(); vm_dirty_ratio = 0; @@ -320,15 +312,13 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) /* * */ -static DEFINE_SPINLOCK(bdi_lock); static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; - unsigned long flags; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { @@ -340,27 +330,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) ret = -EINVAL; } } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { - unsigned long flags; int ret = 0; if (max_ratio > 100) return -EINVAL; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock_bh(&bdi_lock); return ret; } @@ -394,7 +383,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -418,7 +408,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -487,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; @@ -498,7 +489,7 @@ static void balance_dirty_pages(struct address_space *mapping) unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); + unsigned long pause = 1; struct backing_dev_info *bdi = mapping->backing_dev_info; @@ -541,9 +532,12 @@ static void balance_dirty_pages(struct address_space *mapping) * filesystems (i.e. NFS) in which data may have been * written to the server's write cache, but has not yet * been flushed to permanent storage. + * Only move pages to writeback if this bdi is over its + * threshold otherwise wait until the disk writes catch + * up. */ - if (bdi_nr_reclaimable) { - writeback_inodes(&wbc); + if (bdi_nr_reclaimable > bdi_thresh) { + writeback_inodes_wbc(&wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); @@ -572,7 +566,15 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - congestion_wait(WRITE, HZ/10); + schedule_timeout_interruptible(pause); + + /* + * Increase the delay for each loop, up to our previous + * default of taking a 100ms nap. + */ + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -580,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ + return; /* * In laptop mode, we wait until hitting the higher threshold before @@ -591,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && (global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS) + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - pdflush_operation(background_writeout, 0); + bdi_start_writeback(bdi, NULL, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -607,6 +609,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -624,7 +628,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -637,12 +640,13 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); @@ -666,7 +670,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The caller might hold locks which can prevent IO completion @@ -678,153 +682,35 @@ void throttle_vm_writeout(gfp_t gfp_mask) } } -/* - * writeback at least _min_pages, and keep writing until the amount of dirty - * memory is less than the background threshold, or until we're all clean. - */ -static void background_writeout(unsigned long _min_pages) -{ - long min_pages = _min_pages; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 0, - .nonblocking = 1, - .range_cyclic = 1, - }; - - for ( ; ; ) { - unsigned long background_thresh; - unsigned long dirty_thresh; - - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) < background_thresh - && min_pages <= 0) - break; - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - wbc.pages_skipped = 0; - writeback_inodes(&wbc); - min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; - } - } -} - -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. Returns 0 if a pdflush thread was dispatched. Returns - * -1 if all pdflush threads were busy. - */ -int wakeup_pdflush(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - return pdflush_operation(background_writeout, nr_pages); -} - -static void wb_timer_fn(unsigned long unused); static void laptop_timer_fn(unsigned long unused); -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* - * Periodic writeback of "old" data. - * - * Define "old": the first time one of an inode's pages is dirtied, we mark the - * dirtying-time in the inode's address_space. So this periodic writeback code - * just walks the superblock inode list, writing back any inodes which are - * older than a specific point in time. - * - * Try to run once per dirty_writeback_interval. But if a writeback event - * takes longer than a dirty_writeback_interval interval, then leave a - * one-second gap. - * - * older_than_this takes precedence over nr_to_write. So we'll only write back - * all dirty pages if they are all attached to "old" mappings. - */ -static void wb_kupdate(unsigned long arg) -{ - unsigned long oldest_jif; - unsigned long start_jif; - unsigned long next_jif; - long nr_to_write; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = &oldest_jif, - .nr_to_write = 0, - .nonblocking = 1, - .for_kupdate = 1, - .range_cyclic = 1, - }; - - sync_supers(); - - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - start_jif = jiffies; - next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); - nr_to_write = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - while (nr_to_write > 0) { - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - writeback_inodes(&wbc); - if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; /* All the old data is written */ - } - nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - } - if (time_before(next_jif, jiffies + HZ)) - next_jif = jiffies + HZ; - if (dirty_writeback_interval) - mod_timer(&wb_timer, next_jif); -} - -/* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + - msecs_to_jiffies(dirty_writeback_interval * 10)); - else - del_timer(&wb_timer); + proc_dointvec(table, write, buffer, length, ppos); return 0; } -static void wb_timer_fn(unsigned long unused) +static void do_laptop_sync(struct work_struct *work) { - if (pdflush_operation(wb_kupdate, 0) < 0) - mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ -} - -static void laptop_flush(unsigned long unused) -{ - sys_sync(); + wakeup_flusher_threads(0); + kfree(work); } static void laptop_timer_fn(unsigned long unused) { - pdflush_operation(laptop_flush, 0); + struct work_struct *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + INIT_WORK(work, do_laptop_sync); + schedule_work(work); + } } /* @@ -907,8 +793,6 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, - jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); @@ -1142,12 +1026,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) if (wbc->nr_to_write <= 0) return 0; - wbc->for_writepages = 1; if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else ret = generic_writepages(mapping, wbc); - wbc->for_writepages = 0; return ret; } @@ -1271,6 +1153,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) EXPORT_SYMBOL(redirty_page_for_writepage); /* + * Dirty a page. + * + * For pages with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d714f8fb303..bf720550b44d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,6 +48,7 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> +#include <trace/events/kmem.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -71,7 +72,6 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -123,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = { int min_free_kbytes = 1024; -unsigned long __meminitdata nr_kernel_pages; -unsigned long __meminitdata nr_all_pages; +static unsigned long __meminitdata nr_kernel_pages; +static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_ARCH_POPULATES_NODE_MAP @@ -234,6 +234,12 @@ static void bad_page(struct page *page) static unsigned long nr_shown; static unsigned long nr_unshown; + /* Don't complain about poisoned pages */ + if (PageHWPoison(page)) { + __ClearPageBuddy(page); + return; + } + /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -510,7 +516,7 @@ static inline int free_pages_check(struct page *page) } /* - * Frees a list of pages. + * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -520,22 +526,42 @@ static inline int free_pages_check(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static void free_pages_bulk(struct zone *zone, int count, - struct list_head *list, int order) +static void free_pcppages_bulk(struct zone *zone, int count, + struct per_cpu_pages *pcp) { + int migratetype = 0; + int batch_free = 0; + spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); - while (count--) { + __mod_zone_page_state(zone, NR_FREE_PAGES, count); + while (count) { struct page *page; + struct list_head *list; - VM_BUG_ON(list_empty(list)); - page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_one_page list manipulates */ - list_del(&page->lru); - __free_one_page(page, zone, order, page_private(page)); + /* + * Remove pages from lists in a round-robin fashion. A + * batch_free count is maintained that is incremented when an + * empty list is encountered. This is so more pages are freed + * off fuller lists instead of spinning excessively around empty + * lists + */ + do { + batch_free++; + if (++migratetype == MIGRATE_PCPTYPES) + migratetype = 0; + list = &pcp->lists[migratetype]; + } while (list_empty(list)); + + do { + page = list_entry(list->prev, struct page, lru); + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + __free_one_page(page, zone, 0, migratetype); + trace_mm_page_pcpu_drain(page, 0, migratetype); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } @@ -557,7 +583,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; - int wasMlocked = TestClearPageMlocked(page); + int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, order); @@ -646,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static inline int check_new_page(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -655,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) bad_page(page); return 1; } + return 0; +} + +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + if (unlikely(check_new_page(p))) + return 1; + } set_page_private(page, 0); set_page_refcounted(page); @@ -783,6 +821,17 @@ static int move_freepages_block(struct zone *zone, struct page *page, return move_freepages(zone, start_page, end_page, migratetype); } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) @@ -817,13 +866,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * agressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE) { + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { unsigned long pages; pages = move_freepages_block(zone, page, start_migratetype); /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1))) + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_migratetype); @@ -834,11 +885,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - if (current_order == pageblock_order) - set_pageblock_migratetype(page, + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) + change_pageblock_range(page, current_order, start_migratetype); expand(zone, page, order, current_order, area, migratetype); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype); + return page; } } @@ -872,6 +928,7 @@ retry_reserve: } } + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } @@ -882,7 +939,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, int cold) { int i; @@ -901,7 +958,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - list_add(&page->lru, list); + if (likely(cold == 0)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); set_page_private(page, migratetype); list = &page->lru; } @@ -929,7 +989,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->batch; else to_drain = pcp->count; - free_pages_bulk(zone, to_drain, &pcp->list, 0); + free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; local_irq_restore(flags); } @@ -955,7 +1015,7 @@ static void drain_pages(unsigned int cpu) pcp = &pset->pcp; local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; local_irq_restore(flags); } @@ -1021,7 +1081,8 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; - int wasMlocked = TestClearPageMlocked(page); + int migratetype; + int wasMlocked = __TestClearPageMlocked(page); kmemcheck_free_shadow(page, 0); @@ -1038,35 +1099,49 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; - set_page_private(page, get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_page_private(page, migratetype); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_event(PGFREE); + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Free ISOLATE pages back to the allocator because they are being + * offlined but treat RESERVE as movable pages so we can get those + * areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ + if (migratetype >= MIGRATE_PCPTYPES) { + if (unlikely(migratetype == MIGRATE_ISOLATE)) { + free_one_page(zone, page, 0, migratetype); + goto out; + } + migratetype = MIGRATE_MOVABLE; + } + if (cold) - list_add_tail(&page->lru, &pcp->list); + list_add_tail(&page->lru, &pcp->lists[migratetype]); else - list_add(&page->lru, &pcp->list); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->batch, pcp); pcp->count -= pcp->batch; } + +out: local_irq_restore(flags); put_cpu(); } void free_hot_page(struct page *page) { + trace_mm_page_free_direct(page, 0); free_hot_cold_page(page, 0); } -void free_cold_page(struct page *page) -{ - free_hot_cold_page(page, 1); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] @@ -1114,33 +1189,23 @@ again: cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; + struct list_head *list; pcp = &zone_pcp(zone, cpu)->pcp; + list = &pcp->lists[migratetype]; local_irq_save(flags); - if (!pcp->count) { - pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); - if (unlikely(!pcp->count)) + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) goto failed; } - /* Find a page of the appropriate migrate type */ - if (cold) { - list_for_each_entry_reverse(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } else { - list_for_each_entry(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } - - /* Allocate more to the pcp list if necessary */ - if (unlikely(&page->lru == &pcp->list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); - page = list_entry(pcp->list.next, struct page, lru); - } + if (cold) + page = list_entry(list->prev, struct page, lru); + else + page = list_entry(list->next, struct page, lru); list_del(&page->lru); pcp->count--; @@ -1620,10 +1685,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - - /* - * The task's cpuset might have expanded its set of allowable nodes - */ p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; @@ -1666,7 +1727,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1740,8 +1801,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * be using allocators in order of preference for an area that is * too large. */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); return NULL; + } /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1756,6 +1819,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, wake_all_kswapd(order, zonelist, high_zoneidx); +restart: /* * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according @@ -1763,7 +1827,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ alloc_flags = gfp_to_alloc_flags(gfp_mask); -restart: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -1789,6 +1852,10 @@ rebalance: if (p->flags & PF_MEMALLOC) goto nopage; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -1831,7 +1898,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -1894,6 +1961,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -1903,44 +1971,41 @@ EXPORT_SYMBOL(__alloc_pages_nodemask); */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { - struct page * page; + struct page *page; + + /* + * __get_free_pages() returns a 32-bit address, which cannot represent + * a highmem page + */ + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + page = alloc_pages(gfp_mask, order); if (!page) return 0; return (unsigned long) page_address(page); } - EXPORT_SYMBOL(__get_free_pages); unsigned long get_zeroed_page(gfp_t gfp_mask) { - struct page * page; - - /* - * get_zeroed_page() returns a 32-bit address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask | __GFP_ZERO, 0); - if (page) - return (unsigned long) page_address(page); - return 0; + return __get_free_pages(gfp_mask | __GFP_ZERO, 0); } - EXPORT_SYMBOL(get_zeroed_page); void __pagevec_free(struct pagevec *pvec) { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { + trace_mm_pagevec_free(pvec->pages[i], pvec->cold); free_hot_cold_page(pvec->pages[i], pvec->cold); + } } void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { + trace_mm_page_free_direct(page, order); if (order == 0) free_hot_page(page); else @@ -1983,7 +2048,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); - split_page(virt_to_page(addr), order); + split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; @@ -2115,23 +2180,28 @@ void show_free_areas(void) } } - printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" - " inactive_file:%lu" + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu" - " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", + " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" + " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_ISOLATED_ANON), + global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_FILE), + global_page_state(NR_ISOLATED_FILE), global_page_state(NR_UNEVICTABLE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), + nr_blockdev_pages(), global_page_state(NR_FREE_PAGES), - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_page_state(NR_SLAB_RECLAIMABLE), + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), + global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); @@ -2149,7 +2219,21 @@ void show_free_areas(void) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" " present:%lukB" + " mlocked:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " mapped:%lukB" + " shmem:%lukB" + " slab_reclaimable:%lukB" + " slab_unreclaimable:%lukB" + " kernel_stack:%lukB" + " pagetables:%lukB" + " unstable:%lukB" + " bounce:%lukB" + " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", @@ -2163,7 +2247,22 @@ void show_free_areas(void) K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone_page_state(zone, NR_UNEVICTABLE)), + K(zone_page_state(zone, NR_ISOLATED_ANON)), + K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_FILE_DIRTY)), + K(zone_page_state(zone, NR_WRITEBACK)), + K(zone_page_state(zone, NR_FILE_MAPPED)), + K(zone_page_state(zone, NR_SHMEM)), + K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), + K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), + zone_page_state(zone, NR_KERNEL_STACK) * + THREAD_SIZE / 1024, + K(zone_page_state(zone, NR_PAGETABLE)), + K(zone_page_state(zone, NR_UNSTABLE_NFS)), + K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") ); @@ -2292,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, + void __user *buffer, size_t *length, loff_t *ppos) { char saved_string[NUMA_ZONELIST_ORDER_LEN]; @@ -2301,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, if (write) strncpy(saved_string, (char*)table->data, NUMA_ZONELIST_ORDER_LEN); - ret = proc_dostring(table, write, file, buffer, length, ppos); + ret = proc_dostring(table, write, buffer, length, ppos); if (ret) return ret; if (write) { @@ -2533,7 +2632,6 @@ static void build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); - memset(node_load, 0, sizeof(node_load)); memset(node_order, 0, sizeof(node_order)); j = 0; @@ -2642,6 +2740,9 @@ static int __build_all_zonelists(void *dummy) { int nid; +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -2768,7 +2869,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) { unsigned long start_pfn, pfn, end_pfn; struct page *page; - unsigned long reserve, block_migratetype; + unsigned long block_migratetype; + int reserve; /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; @@ -2776,6 +2878,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; + /* + * Reserve blocks are generally in place to help high-order atomic + * allocations that are short-lived. A min_free_kbytes value that + * would result in more than 2 reserve blocks for atomic allocations + * is assumed to be in place to help anti-fragmentation for the + * future allocation of hugepages at runtime. + */ + reserve = min(2, reserve); + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) continue; @@ -2946,6 +3057,7 @@ static int zone_batchsize(struct zone *zone) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; + int migratetype; memset(p, 0, sizeof(*p)); @@ -2953,7 +3065,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp->count = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); - INIT_LIST_HEAD(&pcp->list); + for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) + INIT_LIST_HEAD(&pcp->lists[migratetype]); } /* @@ -3131,6 +3244,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) return 0; } +static int __zone_pcp_update(void *data) +{ + struct zone *zone = data; + int cpu; + unsigned long batch = zone_batchsize(zone), flags; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + pset = zone_pcp(zone, cpu); + pcp = &pset->pcp; + + local_irq_save(flags); + free_pcppages_bulk(zone, pcp->count, pcp); + setup_pageset(pset, batch); + local_irq_restore(flags); + } + return 0; +} + +void zone_pcp_update(struct zone *zone) +{ + stop_machine(__zone_pcp_update, zone, NULL); +} + static __meminit void zone_pcp_init(struct zone *zone) { int cpu; @@ -3705,7 +3844,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_saved_scan = 0; + zone->reclaim_stat.nr_saved_scan[l] = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; @@ -4032,6 +4171,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); @@ -4059,7 +4200,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) - return; + goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ find_usable_zone_for_movable(); @@ -4158,6 +4299,10 @@ restart: for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_HIGH_MEMORY] = saved_node_state; } /* Any regular memory on that node ? */ @@ -4242,11 +4387,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].start_pfn, early_node_map[i].end_pfn); - /* - * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init - * that node_mask, clear it at first - */ - nodes_clear(node_states[N_HIGH_MEMORY]); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); @@ -4493,7 +4633,7 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } -/** +/* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4584,9 +4724,9 @@ module_init(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (write) setup_per_zone_wmarks(); return 0; @@ -4594,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4610,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, } int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4636,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, file, buffer, length, ppos); + proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); return 0; } @@ -4650,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; unsigned int cpu; int ret; - ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { @@ -4716,7 +4856,14 @@ void *__init alloc_large_system_hash(const char *tablename, numentries <<= (PAGE_SHIFT - scale); /* Make sure we've got at least a 0-order allocation.. */ - if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + if (unlikely(flags & HASH_SMALL)) { + /* Makes no sense without HASH_EARLY */ + WARN_ON(!(flags & HASH_EARLY)); + if (!(numentries >> *_hash_shift)) { + numentries = 1UL << *_hash_shift; + BUG_ON(!numentries); + } + } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); @@ -4744,8 +4891,10 @@ void *__init alloc_large_system_hash(const char *tablename, * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) + if (get_order(size) < MAX_ORDER) { table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -4763,16 +4912,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; - /* - * If hashdist is set, the table allocation is done with __vmalloc() - * which invokes the kmemleak_alloc() callback. This function may also - * be called before the slab and kmemleak are initialised when - * kmemleak simply buffers the request to be executed later - * (GFP_ATOMIC flag ignored in this case). - */ - if (!hashdist) - kmemleak_alloc(table, size, 1, GFP_ATOMIC); - return table; } @@ -4866,13 +5005,16 @@ int set_migratetype_isolate(struct page *page) struct zone *zone; unsigned long flags; int ret = -EBUSY; + int zone_idx; zone = page_zone(page); + zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); /* * In future, more migrate types will be able to be isolation target. */ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && + zone_idx != ZONE_MOVABLE) goto out; set_pageblock_migratetype(page, MIGRATE_ISOLATE); move_freepages_block(zone, page, MIGRATE_ISOLATE); diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f22b4ebbd8dc..3d535d594826 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -116,10 +116,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; VM_BUG_ON(!slab_is_available()); - base = kmalloc_node(table_size, + if (node_state(nid, N_HIGH_MEMORY)) { + base = kmalloc_node(table_size, GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); + if (!base) + base = vmalloc_node(table_size, nid); + } else { + base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); + if (!base) + base = vmalloc(table_size); + } } else { /* * We don't have to allocate page_cgroup again, but diff --git a/mm/pdflush.c b/mm/pdflush.c deleted file mode 100644 index 235ac440c44e..000000000000 --- a/mm/pdflush.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * mm/pdflush.c - worker threads for writing back filesystem data - * - * Copyright (C) 2002, Linus Torvalds. - * - * 09Apr2002 Andrew Morton - * Initial version - * 29Feb2004 kaos@sgi.com - * Move worker thread creation to kthread to avoid chewing - * up stack space with nested calls to kernel_thread. - */ - -#include <linux/sched.h> -#include <linux/list.h> -#include <linux/signal.h> -#include <linux/spinlock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/fs.h> /* Needed by writeback.h */ -#include <linux/writeback.h> /* Prototypes pdflush_operation() */ -#include <linux/kthread.h> -#include <linux/cpuset.h> -#include <linux/freezer.h> - - -/* - * Minimum and maximum number of pdflush instances - */ -#define MIN_PDFLUSH_THREADS 2 -#define MAX_PDFLUSH_THREADS 8 - -static void start_one_pdflush_thread(void); - - -/* - * The pdflush threads are worker threads for writing back dirty data. - * Ideally, we'd like one thread per active disk spindle. But the disk - * topology is very hard to divine at this level. Instead, we take - * care in various places to prevent more than one pdflush thread from - * performing writeback against a single filesystem. pdflush threads - * have the PF_FLUSHER flag set in current->flags to aid in this. - */ - -/* - * All the pdflush threads. Protected by pdflush_lock - */ -static LIST_HEAD(pdflush_list); -static DEFINE_SPINLOCK(pdflush_lock); - -/* - * The count of currently-running pdflush threads. Protected - * by pdflush_lock. - * - * Readable by sysctl, but not writable. Published to userspace at - * /proc/sys/vm/nr_pdflush_threads. - */ -int nr_pdflush_threads = 0; - -/* - * The time at which the pdflush thread pool last went empty - */ -static unsigned long last_empty_jifs; - -/* - * The pdflush thread. - * - * Thread pool management algorithm: - * - * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. - * - * - If there have been no idle pdflush instances for 1 second, create - * a new one. - * - * - If the least-recently-went-to-sleep pdflush thread has been asleep - * for more than one second, terminate a thread. - */ - -/* - * A structure for passing work to a pdflush thread. Also for passing - * state information between pdflush threads. Protected by pdflush_lock. - */ -struct pdflush_work { - struct task_struct *who; /* The thread */ - void (*fn)(unsigned long); /* A callback function */ - unsigned long arg0; /* An argument to the callback */ - struct list_head list; /* On pdflush_list, when idle */ - unsigned long when_i_went_to_sleep; -}; - -static int __pdflush(struct pdflush_work *my_work) -{ - current->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - my_work->fn = NULL; - my_work->who = current; - INIT_LIST_HEAD(&my_work->list); - - spin_lock_irq(&pdflush_lock); - for ( ; ; ) { - struct pdflush_work *pdf; - - set_current_state(TASK_INTERRUPTIBLE); - list_move(&my_work->list, &pdflush_list); - my_work->when_i_went_to_sleep = jiffies; - spin_unlock_irq(&pdflush_lock); - schedule(); - try_to_freeze(); - spin_lock_irq(&pdflush_lock); - if (!list_empty(&my_work->list)) { - /* - * Someone woke us up, but without removing our control - * structure from the global list. swsusp will do this - * in try_to_freeze()->refrigerator(). Handle it. - */ - my_work->fn = NULL; - continue; - } - if (my_work->fn == NULL) { - printk("pdflush: bogus wakeup\n"); - continue; - } - spin_unlock_irq(&pdflush_lock); - - (*my_work->fn)(my_work->arg0); - - spin_lock_irq(&pdflush_lock); - - /* - * Thread creation: For how long have there been zero - * available threads? - * - * To throttle creation, we reset last_empty_jifs. - */ - if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list)) { - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); - } - } - } - - my_work->fn = NULL; - - /* - * Thread destruction: For how long has the sleepiest - * thread slept? - */ - if (list_empty(&pdflush_list)) - continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) - continue; - pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { - /* Limit exit rate */ - pdf->when_i_went_to_sleep = jiffies; - break; /* exeunt */ - } - } - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - return 0; -} - -/* - * Of course, my_work wants to be just a local in __pdflush(). It is - * separated out in this manner to hopefully prevent the compiler from - * performing unfortunate optimisations against the auto variables. Because - * these are visible to other tasks and CPUs. (No problem has actually - * been observed. This is just paranoia). - */ -static int pdflush(void *dummy) -{ - struct pdflush_work my_work; - cpumask_var_t cpus_allowed; - - /* - * Since the caller doesn't even check kthread_run() worked, let's not - * freak out too much if this fails. - */ - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); - return 0; - } - - /* - * pdflush can spend a lot of time doing encryption via dm-crypt. We - * don't want to do that at keventd's priority. - */ - set_user_nice(current, 0); - - /* - * Some configs put our parent kthread in a limited cpuset, - * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. - * Our needs are more modest - cut back to our cpusets cpus_allowed. - * This is needed as pdflush's are dynamically created and destroyed. - * The boottime pdflush's are easily placed w/o these 2 lines. - */ - cpuset_cpus_allowed(current, cpus_allowed); - set_cpus_allowed_ptr(current, cpus_allowed); - free_cpumask_var(cpus_allowed); - - return __pdflush(&my_work); -} - -/* - * Attempt to wake up a pdflush thread, and get it to do some work for you. - * Returns zero if it indeed managed to find a worker thread, and passed your - * payload to it. - */ -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) -{ - unsigned long flags; - int ret = 0; - - BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ - - spin_lock_irqsave(&pdflush_lock, flags); - if (list_empty(&pdflush_list)) { - ret = -1; - } else { - struct pdflush_work *pdf; - - pdf = list_entry(pdflush_list.next, struct pdflush_work, list); - list_del_init(&pdf->list); - if (list_empty(&pdflush_list)) - last_empty_jifs = jiffies; - pdf->fn = fn; - pdf->arg0 = arg0; - wake_up_process(pdf->who); - } - spin_unlock_irqrestore(&pdflush_lock, flags); - - return ret; -} - -static void start_one_pdflush_thread(void) -{ - struct task_struct *k; - - k = kthread_run(pdflush, NULL, "pdflush"); - if (unlikely(IS_ERR(k))) { - spin_lock_irq(&pdflush_lock); - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - } -} - -static int __init pdflush_init(void) -{ - int i; - - /* - * Pre-set nr_pdflush_threads... If we fail to create, - * the count will be decremented. - */ - nr_pdflush_threads = MIN_PDFLUSH_THREADS; - - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) - start_one_pdflush_thread(); - return 0; -} - -module_init(pdflush_init); diff --git a/mm/percpu.c b/mm/percpu.c index c0b2c1a76e81..4a048abad043 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,13 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of boot-time determined number of units and the + * first chunk is used for static percpu variables in the kernel image + * (special boot time alloc/init handling necessary as these areas + * need to be brought up before allocation services are running). + * Unit grows as necessary and all units grow or shrink in unison. + * When a chunk is filled up, another chunk is allocated. ie. in + * vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -22,11 +23,13 @@ * * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers pcpu_unit_size apart. + * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to + * cpus. On NUMA, the mapping can be non-linear and even sparse. + * Percpu access can be done by configuring percpu base registers + * according to cpu to unit mapping and pcpu_unit_size. * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists + * There are usually many small percpu allocations many of them being + * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is * guaranteed to be eqaul to or larger than the maximum contiguous @@ -43,7 +46,7 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be @@ -55,7 +58,9 @@ #include <linux/bitmap.h> #include <linux/bootmem.h> +#include <linux/err.h> #include <linux/list.h> +#include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> @@ -89,25 +94,38 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ - struct vm_struct *vm; /* mapped vmalloc region */ + void *base_addr; /* base address of this chunk */ int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + struct vm_struct **vms; /* mapped vmalloc regions */ bool immutable; /* no [de]population allowed */ - struct page **page; /* points to page array */ - struct page *page_ar[]; /* #cpus * UNIT_PAGES */ + unsigned long populated[]; /* populated bitmap */ }; static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; -static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_units __read_mostly; +static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; +/* cpus with the lowest and highest unit numbers */ +static unsigned int pcpu_first_unit_cpu __read_mostly; +static unsigned int pcpu_last_unit_cpu __read_mostly; + /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ + +/* group information, used for vm allocation */ +static int pcpu_nr_groups __read_mostly; +static const unsigned long *pcpu_group_offsets __read_mostly; +static const size_t *pcpu_group_sizes __read_mostly; + /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different @@ -129,9 +147,9 @@ static int pcpu_reserved_chunk_limit; * Synchronization rules. * * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks and chunk->page arrays. - * The latter is a spinlock and protects the index data structures - - * chunk slots, chunks and area maps in chunks. + * protects allocation/reclaim paths, chunks, populated bitmap and + * vmalloc mapping. The latter is a spinlock and protects the index + * data structures - chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -178,26 +196,23 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) static int pcpu_page_idx(unsigned int cpu, int page_idx) { - return cpu * pcpu_unit_pages + page_idx; -} - -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) -{ - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) { - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); + + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); } /* set the pointer to a chunk in a page struct */ @@ -212,6 +227,34 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) return (struct pcpu_chunk *)page->index; } +static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_zero_bit(chunk->populated, end, *rs); + *re = find_next_bit(chunk->populated, end, *rs + 1); +} + +static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) +{ + *rs = find_next_bit(chunk->populated, end, *rs); + *re = find_next_zero_bit(chunk->populated, end, *rs + 1); +} + +/* + * (Un)populated page region iterators. Iterate over (un)populated + * page regions betwen @start and @end in @chunk. @rs and @re should + * be integer variables and will be set to start and end page index of + * the current region. + */ +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -287,16 +330,24 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { - void *first_start = pcpu_first_chunk->vm->addr; + void *first_start = pcpu_first_chunk->base_addr; /* is it in the first chunk? */ - if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + if (addr >= first_start && addr < first_start + pcpu_unit_size) { /* is it in the reserved area? */ if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; return pcpu_first_chunk; } + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(vmalloc_to_page(addr)); } @@ -545,126 +596,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) } /** - * pcpu_unmap - unmap pages out of a pcpu_chunk + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap * @chunk: chunk of interest - * @page_start: page index of the first page to unmap - * @page_end: page index of the last page to unmap + 1 - * @flush: whether to flush cache and tlb or not + * @bitmapp: output parameter for bitmap + * @may_alloc: may allocate the array * - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. + * Returns pointer to array of pointers to struct page and bitmap, + * both of which can be indexed with pcpu_page_idx(). The returned + * array is cleared to zero and *@bitmapp is copied from + * @chunk->populated. Note that there is only one array and bitmap + * and access exclusion is the caller's responsibility. + * + * CONTEXT: + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. + * Otherwise, don't care. + * + * RETURNS: + * Pointer to temp pages array on success, NULL on failure. */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush) +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, + unsigned long **bitmapp, + bool may_alloc) { - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; + static struct page **pages; + static unsigned long *bitmap; + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * + sizeof(unsigned long); + + if (!pages || !bitmap) { + if (may_alloc && !pages) + pages = pcpu_mem_alloc(pages_size); + if (may_alloc && !bitmap) + bitmap = pcpu_mem_alloc(bitmap_size); + if (!pages || !bitmap) + return NULL; + } - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + memset(pages, 0, pages_size); + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. - */ - if (flush) - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); + *bitmapp = bitmap; + return pages; +} - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); - - /* ditto as flush_cache_vunmap() */ - if (flush) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start and @page_end) in @pages for all units. + * The pages were allocated for @chunk. + */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } } /** - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk - * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not - * - * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. - * - * CONTEXT: - * pcpu_alloc_mutex. + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; unsigned int cpu; int i; - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) { + pcpu_free_pages(chunk, pages, populated, + page_start, page_end); + return -ENOMEM; + } + } + } + return 0; +} - if (!*pagep) - continue; +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} - __free_page(*pagep); +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); +} + +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @populated: populated bitmap + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. + */ +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; - unmap_start = unmap_start < 0 ? i : unmap_start; - unmap_end = i + 1; + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); } - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); + for (i = page_start; i < page_end; i++) + __clear_bit(i, populated); } /** - * pcpu_map - map pages into a pcpu_chunk + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +static int __pcpu_map_pages(unsigned long addr, struct page **pages, + int nr_pages) +{ + return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, + PAGE_KERNEL, pages); +} + +/** + * pcpu_map_pages - map pages into a pcpu_chunk * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @populated: populated bitmap * @page_start: page index of the first page to map * @page_end: page index of the last page to map + 1 * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting corresponding bits in + * @chunk->populated bitmap and whatever is necessary for reverse + * lookup (addr -> chunk). */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); + unsigned int cpu, tcpu; + int i, err; for_each_possible_cpu(cpu) { - err = map_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT, - PAGE_KERNEL, - pcpu_chunk_pagep(chunk, cpu, page_start)); + err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], + page_end - page_start); if (err < 0) - return err; + goto err; + } + + /* mapping successful, link chunk and mark populated */ + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + __set_bit(i, populated); } - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); return 0; + +err: + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate in bytes + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + struct page **pages; + unsigned long *populated; + int rs, re; + + /* quick path, check whether it's empty already */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + return; + break; + } + + /* immutable chunks can't be depopulated */ + WARN_ON(chunk->immutable); + + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. + */ + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + BUG_ON(!pages); + + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + + /* no need to flush tlb, vmalloc will handle it lazily */ + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); } /** @@ -681,58 +933,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) { - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; int page_start = PFN_DOWN(off); int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); + int free_end = page_start, unmap_end = page_start; + struct page **pages; + unsigned long *populated; unsigned int cpu; - int i; + int rs, re, rc; - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } + /* quick path, check whether all pages are already there */ + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { + if (rs == page_start && re == page_end) + goto clear; + break; + } - map_start = map_start < 0 ? i : map_start; - map_end = i + 1; + /* need to allocate and map pages, this chunk can't be immutable */ + WARN_ON(chunk->immutable); - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + if (!pages) + return -ENOMEM; - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - pcpu_set_page_chunk(*pagep, chunk); - } + /* alloc and map */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_free; + free_end = re; } - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_map_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_unmap; + unmap_end = re; + } + pcpu_post_map_flush(chunk, page_start, page_end); + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +clear: for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); - + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; + +err_unmap: + pcpu_pre_unmap_flush(chunk, page_start, unmap_end); + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); +err_free: + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + return rc; } static void free_pcpu_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - if (chunk->vm) - free_vm_area(chunk->vm); + if (chunk->vms) + pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups); pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } @@ -748,10 +1010,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); chunk->map_alloc = PCPU_DFL_MAP_ALLOC; chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); - if (!chunk->vm) { + chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, + pcpu_nr_groups, pcpu_atom_size, + GFP_KERNEL); + if (!chunk->vms) { free_pcpu_chunk(chunk); return NULL; } @@ -759,6 +1022,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; + chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0]; return chunk; } @@ -779,7 +1043,9 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) */ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { + static int warn_limit = 10; struct pcpu_chunk *chunk; + const char *err; int slot, off; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { @@ -795,11 +1061,14 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) + pcpu_extend_area_map(chunk) < 0) { + err = "failed to extend area map of reserved chunk"; goto fail_unlock; + } off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; + err = "alloc from reserved chunk failed"; goto fail_unlock; } @@ -816,6 +1085,7 @@ restart: case 1: goto restart; /* pcpu_lock dropped, restart */ default: + err = "failed to extend area map"; goto fail_unlock; } @@ -829,8 +1099,10 @@ restart: spin_unlock_irq(&pcpu_lock); chunk = alloc_pcpu_chunk(); - if (!chunk) + if (!chunk) { + err = "failed to allocate new chunk"; goto fail_unlock_mutex; + } spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); @@ -843,17 +1115,26 @@ area_found: if (pcpu_populate_chunk(chunk, off, size)) { spin_lock_irq(&pcpu_lock); pcpu_free_area(chunk, off); + err = "failed to populate"; goto fail_unlock; } mutex_unlock(&pcpu_alloc_mutex); - return __addr_to_pcpu_ptr(chunk->vm->addr + off); + /* return address relative to base address */ + return __addr_to_pcpu_ptr(chunk->base_addr + off); fail_unlock: spin_unlock_irq(&pcpu_lock); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); + if (warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " + "%s\n", size, align, err); + dump_stack(); + if (!--warn_limit) + pr_info("PERCPU: limit reached, disable warning\n"); + } return NULL; } @@ -926,12 +1207,13 @@ static void pcpu_reclaim(struct work_struct *work) } spin_unlock_irq(&pcpu_lock); - mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); free_pcpu_chunk(chunk); } + + mutex_unlock(&pcpu_alloc_mutex); } /** @@ -956,7 +1238,7 @@ void free_percpu(void *ptr) spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); - off = addr - chunk->vm->addr; + off = addr - chunk->base_addr; pcpu_free_area(chunk, off); @@ -975,30 +1257,299 @@ void free_percpu(void *ptr) } EXPORT_SYMBOL_GPL(free_percpu); +static inline size_t pcpu_calc_fc_sizes(size_t static_size, + size_t reserved_size, + ssize_t *dyn_sizep) +{ + size_t size_sum; + + size_sum = PFN_ALIGN(static_size + reserved_size + + (*dyn_sizep >= 0 ? *dyn_sizep : 0)); + if (*dyn_sizep != 0) + *dyn_sizep = size_sum - static_size - reserved_size; + + return size_sum; +} + /** - * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer - * @static_size: the size of static percpu area in bytes + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. + */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. + */ +struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int group_cnt_max = 0, nr_groups = 1, nr_units = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs, group, unit; + unsigned int cpu, tcpu; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; + + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_map)); + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of atom_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, atom_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) + break; + if (group_map[tcpu] == group && cpu_distance_fn && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + nr_groups = max(nr_groups, group + 1); + goto next_group; + } + } + group_map[cpu] = group; + group_cnt[group]++; + group_cnt_max = max(group_cnt_max, group_cnt[group]); + } + + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; + + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; + + for (group = 0; group < nr_groups; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 25%. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; + } + upa = best_upa; + + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); + + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; + + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } + + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; + + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; + + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; +} + +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. + */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) +{ + int group_width = 1, cpu_width = 1, width; + char empty_str[] = "--------"; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + + v = num_possible_cpus(); + while (v /= 10) + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; + + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); + + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); + + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { + printk("\n"); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); + } + } + printk("\n"); +} + +/** + * pcpu_setup_first_chunk - initialize the first percpu chunk + * @ai: pcpu_alloc_info describing how to percpu area is shaped + * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. - * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. - * - * @reserved_size, if non-zero, specifies the amount of bytes to + * setup path. + * + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1006,22 +1557,29 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size, if non-negative, determines the number of bytes - * available for dynamic allocation in the first chunk. Specifying - * non-negative value makes percpu leave alone the area beyond - * @static_size + @reserved_size + @dyn_size. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * - * @unit_size, if non-negative, specifies unit size and must be - * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + if non-negative, @dyn_size. + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. + * + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. + * + * The caller should have mapped the first chunk at @base_addr and + * copied static data to each unit. * * If the first chunk ends up with both reserved and dynamic areas, it * is served by two chunks - one to serve the core static and reserved @@ -1031,49 +1589,98 @@ EXPORT_SYMBOL_GPL(free_percpu); * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { - static struct vm_struct first_vm; + static char cpus_buf[4096] __initdata; static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0); + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *group_offsets; + size_t *group_sizes; + unsigned long *unit_off; unsigned int cpu; - int nr_pages; - int err, i; + int *unit_map; + int group, unit, i; + + cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); - /* santiy checks */ +#define PCPU_SETUP_BUG_ON(cond) do { \ + if (unlikely(cond)) { \ + pr_emerg("PERCPU: failed to initialize, %s", #cond); \ + pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ + pcpu_dump_alloc_info(KERN_EMERG, ai); \ + BUG(); \ + } \ +} while (0) + + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); - if (unit_size >= 0) { - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); - } else - BUG_ON(base_addr); - BUG_ON(base_addr && populate_pte_fn); - - if (unit_size >= 0) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(size_sum)); + PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); + PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); + PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + + /* process group information and build config tables accordingly */ + group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); + group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = UINT_MAX; + pcpu_first_unit_cpu = NR_CPUS; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + group_offsets[group] = gi->base_offset; + group_sizes[group] = gi->nr_units * ai->unit_size; + + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; - pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); + PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); + PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); + + unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + + if (pcpu_first_unit_cpu == NR_CPUS) + pcpu_first_unit_cpu = cpu; + } + } + pcpu_last_unit_cpu = cpu; + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); - if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size - reserved_size; + /* we're done parsing the input, undefine BUG macro and dump config */ +#undef PCPU_SETUP_BUG_ON + pcpu_dump_alloc_info(KERN_INFO, ai); + + pcpu_nr_groups = ai->nr_groups; + pcpu_group_offsets = group_offsets; + pcpu_group_sizes = group_sizes; + pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; + + /* determine basic parameters */ + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; + pcpu_atom_size = ai->atom_size; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); /* * Allocate chunk slots. The additional last slot is for @@ -1093,181 +1700,367 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); - schunk->vm = &first_vm; + schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - schunk->page = schunk->page_ar; + schunk->immutable = true; + bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); - dchunk->vm = &first_vm; + dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->page = schunk->page_ar; /* share page map with schunk */ + dchunk->immutable = true; + bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* allocate vm address */ - first_vm.flags = VM_ALLOC; - first_vm.size = pcpu_chunk_size; - - if (!base_addr) - vm_area_register_early(&first_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the first chunks can't - * be mapped or unmapped by percpu and are marked - * immutable. - */ - first_vm.addr = base_addr; - schunk->immutable = true; - if (dchunk) - dchunk->immutable = true; - } - - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); - - if (!page) - break; - *pcpu_chunk_pagep(schunk, cpu, i) = page; - } - - BUG_ON(i < PFN_UP(static_size)); - - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } - - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(schunk, - cpu, i)); - - err = pcpu_map(schunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); - } - /* link the first chunk in */ pcpu_first_chunk = dchunk ?: schunk; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); - return pcpu_unit_size; + pcpu_base_addr = base_addr; + return 0; } -/* - * Embedding first chunk setup helper. - */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", +}; -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; - if (off >= pcpue_size) - return NULL; +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); + return 0; } +early_param("percpu_alloc", percpu_alloc_setup); +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * specified to fill page alignment. When @dyn_size is auto, + * @dyn_size is just big enough to fill page alignment after static + * and reserved areas. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { - unsigned int cpu; + void *base = (void *)ULONG_MAX; + void **areas = NULL; + struct pcpu_alloc_info *ai; + size_t size_sum, areas_size, max_distance; + int group, i, rc; + + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); + if (IS_ERR(ai)) + return PTR_ERR(ai); + + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); + + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { + rc = -ENOMEM; + goto out_free; + } - /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; - - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - - pcpue_ptr = __alloc_bootmem_nopanic( - num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) - return -ENOMEM; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; - /* return the leftover and copy */ - for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + base = min(ptr, base); - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } + } + + /* base address is now known, determine group base offsets */ + max_distance = 0; + for (group = 0; group < ai->nr_groups; group++) { + ai->groups[group].base_offset = areas[group] - base; + max_distance = max(max_distance, ai->groups[group].base_offset); + } + max_distance += ai->unit_size; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + pr_warning("PERCPU: max_distance=0x%lx too large for vmalloc " + "space 0x%lx\n", + max_distance, VMALLOC_END - VMALLOC_START); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free; +#endif + } + + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); + + rc = pcpu_setup_first_chunk(ai, base); + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: + pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); + return rc; +} +#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || + !CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +/** + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + static struct vm_struct vm; + struct pcpu_alloc_info *ai; + char psize_str[16]; + int unit_pages; + size_t pages_size; + struct page **pages; + int unit, i, j, rc; + + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + + ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); + pages = alloc_bootmem(pages_size); + + /* allocate pages */ + j = 0; + for (unit = 0; unit < num_possible_cpus(); unit++) + for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); + goto enomem; + } + pages[j++] = virt_to_page(ptr); + } + + /* allocate vm area, map the pages and copy static data */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * ai->unit_size; + vm_area_register_early(&vm, PAGE_SIZE); + + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned long unit_addr = + (unsigned long)vm.addr + unit * ai->unit_size; + + for (i = 0; i < unit_pages; i++) + populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); + + /* pte already populated, the following shouldn't fail */ + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); + + /* + * FIXME: Archs with virtual cache should flush local + * cache for the linear mapping here - something + * equivalent to flush_cache_vmap() on the local cpu. + * flush_cache_vmap() can't be used as most supporting + * data structures are not set up yet. + */ + + /* copy static data */ + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); + + rc = pcpu_setup_first_chunk(ai, vm.addr); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pages[j]), PAGE_SIZE); + rc = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); + return rc; +} +#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ + +/* + * Generic percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); + +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); + if (rc < 0) + panic("Failed to initialized percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d1b4ff..6633965bb27b 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -19,7 +19,7 @@ #include <linux/module.h> #include <linux/quicklist.h> -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 @@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages) int node = numa_node_id(); struct zone *zones = NODE_DATA(node)->node_zones; int num_cpus_on_node; - const struct cpumask *cpumask_on_node = cpumask_of_node(node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages) max = node_free_pages / FRACTION_OF_NODE_MEM; - num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); max /= num_cpus_on_node; return max(max, min_pages); diff --git a/mm/rmap.c b/mm/rmap.c index 836c6c63e1f2..dd43373a483f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -36,6 +36,11 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) + * + * (code doesn't rely on that order so it could be switched around) + * ->tasklist_lock + * anon_vma->lock (memory_failure, collect_procs_anon) + * pte map lock */ #include <linux/mm.h> @@ -191,7 +196,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -211,7 +216,7 @@ out: return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -237,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) } /* - * At what user virtual address is page expected in vma? checking that the - * page matches the vma: currently only used on anon pages, by unuse_vma; + * At what user virtual address is page expected in vma? + * checking that the page matches the vma. */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * if the page is not mapped into the page tables of this VMA. Only * valid for normal file or anonymous VMAs. */ -static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { unsigned long address; pte_t *pte; @@ -358,6 +363,7 @@ static int page_referenced_one(struct page *page, */ if (vma->vm_flags & VM_LOCKED) { *mapcount = 1; /* break early from loop */ + *vm_flags |= VM_LOCKED; goto out_unmap; } @@ -709,27 +715,6 @@ void page_add_file_rmap(struct page *page) } } -#ifdef CONFIG_DEBUG_VM -/** - * page_dup_rmap - duplicate pte mapping to a page - * @page: the page to add the mapping to - * @vma: the vm area being duplicated - * @address: the user virtual address mapped - * - * For copy_page_range only: minimal extract from page_add_file_rmap / - * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's - * quicker. - * - * The caller needs to hold the pte lock. - */ -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) -{ - if (PageAnon(page)) - __page_check_anon_rmap(page, vma, address); - atomic_inc(&page->_mapcount); -} -#endif - /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from @@ -738,34 +723,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long */ void page_remove_rmap(struct page *page) { - if (atomic_add_negative(-1, &page->_mapcount)) { - /* - * Now that the last pte has gone, s390 must transfer dirty - * flag from storage key to struct page. We can usually skip - * this if the page is anon, so about to be freed; but perhaps - * not if it's in swapcache - there might be another pte slot - * containing the swap entry, but page not yet written to swap. - */ - if ((!PageAnon(page) || PageSwapCache(page)) && - page_test_dirty(page)) { - page_clear_dirty(page); - set_page_dirty(page); - } - if (PageAnon(page)) - mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, - PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); - mem_cgroup_update_mapped_file_stat(page, -1); - /* - * It would be tidy to reset the PageAnon mapping here, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, - * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. - */ + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + return; + + /* + * Now that the last pte has gone, s390 must transfer dirty + * flag from storage key to struct page. We can usually skip + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. + */ + if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { + page_clear_dirty(page); + set_page_dirty(page); + } + if (PageAnon(page)) { + mem_cgroup_uncharge_page(page); + __dec_zone_page_state(page, NR_ANON_PAGES); + } else { + __dec_zone_page_state(page, NR_FILE_MAPPED); } + mem_cgroup_update_mapped_file_stat(page, -1); + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ } /* @@ -773,7 +761,7 @@ void page_remove_rmap(struct page *page) * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - int migration) + enum ttu_flags flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -795,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration) { + if (!(flags & TTU_IGNORE_MLOCK)) { if (vma->vm_flags & VM_LOCKED) { ret = SWAP_MLOCK; goto out_unmap; } + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; @@ -817,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageAnon(page)) { + if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageAnon(page)) + dec_mm_counter(mm, anon_rss); + else + dec_mm_counter(mm, file_rss); + set_pte_at(mm, address, pte, + swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; if (PageSwapCache(page)) { @@ -839,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(!migration); + BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else if (PAGE_MIGRATION && migration) { + } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -1013,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_anon(struct page *page, int unlock, int migration) +static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; struct vm_area_struct *vma; unsigned int mlocked = 0; int ret = SWAP_AGAIN; + int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1034,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) continue; /* must visit all unlocked vmas */ ret = SWAP_MLOCK; /* saw at least one mlocked vma */ } else { - ret = try_to_unmap_one(page, vma, migration); + ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -1058,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) /** * try_to_unmap_file - unmap/unlock file page using the object-based rmap method * @page: the page to unmap/unlock - * @unlock: request for unlock rather than unmap [unlikely] - * @migration: unmapping for migration - ignored if @unlock + * @flags: action and flags * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. @@ -1071,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int unlock, int migration) +static int try_to_unmap_file(struct page *page, enum ttu_flags flags) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1083,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) unsigned long max_nl_size = 0; unsigned int mapcount; unsigned int mlocked = 0; + int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1095,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { - ret = try_to_unmap_one(page, vma, migration); + ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -1120,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) ret = SWAP_MLOCK; /* leave mlocked == 0 */ goto out; /* no need to look further */ } - if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) + if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && + (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -1154,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (!MLOCK_PAGES && !migration && + if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; @@ -1194,7 +1193,7 @@ out: /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped - * @migration: migration flag + * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. @@ -1205,16 +1204,16 @@ out: * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. */ -int try_to_unmap(struct page *page, int migration) +int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, 0, migration); + ret = try_to_unmap_anon(page, flags); else - ret = try_to_unmap_file(page, 0, migration); + ret = try_to_unmap_file(page, flags); if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1239,8 +1238,8 @@ int try_to_munlock(struct page *page) VM_BUG_ON(!PageLocked(page) || PageLRU(page)); if (PageAnon(page)) - return try_to_unmap_anon(page, 1, 0); + return try_to_unmap_anon(page, TTU_MUNLOCK); else - return try_to_unmap_file(page, 1, 0); + return try_to_unmap_file(page, TTU_MUNLOCK); } diff --git a/mm/shmem.c b/mm/shmem.c index d713239ce2ce..356dd99566ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -49,7 +49,6 @@ static struct vfsmount *shm_mnt; #include <linux/backing-dev.h> #include <linux/shmem_fs.h> #include <linux/writeback.h> -#include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/security.h> #include <linux/swapops.h> @@ -219,7 +218,7 @@ static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; -static struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ @@ -1047,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * sync from ever calling shmem_writepage; but a stacking filesystem * may use the ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for pdflush or sync. However, in those cases, we do still - * want to check if there's a redundant swappage to be discarded. + * and not for the writeback threads or sync. However, in those cases, + * we do still want to check if there's a redundant swappage to be + * discarded. */ if (wbc->for_reclaim) swap = get_swap_page(); @@ -1097,6 +1097,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ swapcache_free(swap, NULL); redirty: set_page_dirty(page); @@ -1630,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - unlock_page(page); set_page_dirty(page); + unlock_page(page); page_cache_release(page); return copied; @@ -1968,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s iput(inode); return error; } - unlock_page(page); inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); kunmap_atomic(kaddr, KM_USER0); set_page_dirty(page); + unlock_page(page); page_cache_release(page); } if (dir->i_mode & S_ISGID) @@ -2298,8 +2302,7 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -static int shmem_fill_super(struct super_block *sb, - void *data, int silent) +int shmem_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root; @@ -2307,17 +2310,14 @@ static int shmem_fill_super(struct super_block *sb, int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ - sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), + sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return -ENOMEM; - sbinfo->max_blocks = 0; - sbinfo->max_inodes = 0; sbinfo->mode = S_IRWXUGO | S_ISVTX; sbinfo->uid = current_fsuid(); sbinfo->gid = current_fsgid(); - sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS @@ -2421,6 +2421,7 @@ static const struct address_space_operations shmem_aops = { .write_end = shmem_write_end, #endif .migratepage = migrate_page, + .error_remove_page = generic_error_remove_page, }; static const struct file_operations shmem_file_operations = { @@ -2446,7 +2447,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2469,7 +2470,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2480,7 +2481,7 @@ static const struct inode_operations shmem_special_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = shmem_check_acl, #endif }; @@ -2497,7 +2498,7 @@ static const struct super_operations shmem_ops = { .put_super = shmem_put_super, }; -static struct vm_operations_struct shmem_vm_ops = { +static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, @@ -2519,7 +2520,7 @@ static struct file_system_type tmpfs_fs_type = { .kill_sb = kill_litter_super, }; -static int __init init_tmpfs(void) +int __init init_tmpfs(void) { int error; @@ -2576,7 +2577,7 @@ static struct file_system_type tmpfs_fs_type = { .kill_sb = kill_litter_super, }; -static int __init init_tmpfs(void) +int __init init_tmpfs(void) { BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); @@ -2591,6 +2592,11 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } +int shmem_lock(struct file *file, int lock, struct user_struct *user) +{ + return 0; +} + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) @@ -2687,5 +2693,3 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_ops = &shmem_vm_ops; return 0; } - -module_init(init_tmpfs) diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 606a8e757a42..df2c87fdae50 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c @@ -157,7 +157,7 @@ shmem_acl_init(struct inode *inode, struct inode *dir) /** * shmem_check_acl - check_acl() callback for generic_permission() */ -static int +int shmem_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); @@ -169,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask) } return -EAGAIN; } - -/** - * shmem_permission - permission() inode operation - */ -int -shmem_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, shmem_check_acl); -} diff --git a/mm/slab.c b/mm/slab.c index e74a16e4ced6..7dfa481c96ba 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void) * Fragmentation resistance on low memory - only use bigger * page orders on machines with more than 32MB of memory. */ - if (num_physpages > (32 << 20) >> PAGE_SHIFT) + if (totalram_pages > (32 << 20) >> PAGE_SHIFT) slab_break_gfp_order = BREAK_GFP_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated @@ -1544,9 +1544,6 @@ void __init kmem_cache_init(void) } g_cpucache_up = EARLY; - - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); } void __init kmem_cache_init_late(void) @@ -1563,6 +1560,9 @@ void __init kmem_cache_init_late(void) /* Done! */ g_cpucache_up = FULL; + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* * Register a cpu startup notifier callback that initializes * cpu_cache_get for all new cpus @@ -2547,7 +2547,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) } if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) - synchronize_rcu(); + rcu_barrier(); __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); diff --git a/mm/slob.c b/mm/slob.c index c78742defdc6..837ebd64cc34 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -595,6 +595,8 @@ EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *c) { kmemleak_free(c); + if (c->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); slob_free(c, sizeof(struct kmem_cache)); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -690,3 +692,8 @@ void __init kmem_cache_init(void) { slob_ready = 1; } + +void __init kmem_cache_init_late(void) +{ + /* Nothing to do */ +} diff --git a/mm/slub.c b/mm/slub.c index 819f056b39c6..4996fc719552 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -21,7 +21,6 @@ #include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> -#include <linux/kmemleak.h> #include <linux/mempolicy.h> #include <linux/ctype.h> #include <linux/debugobjects.h> @@ -142,6 +141,13 @@ SLAB_POISON | SLAB_STORE_USER) /* + * Debugging flags that require metadata to be stored in the slab. These get + * disabled when slub_debug=O is used and a cache's min order increases with + * metadata. + */ +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) + +/* * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ @@ -326,6 +332,7 @@ static int slub_debug; #endif static char *slub_debug_slabs; +static int disable_higher_order_debug; /* * Object debugging @@ -647,7 +654,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); print_section("Padding", end - remainder, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, start, end); + restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; } @@ -977,6 +984,15 @@ static int __init setup_slub_debug(char *str) */ goto check_slabs; + if (tolower(*str) == 'o') { + /* + * Avoid enabling debugging on caches if its minimum order + * would increase as a result. + */ + disable_higher_order_debug = 1; + goto out; + } + slub_debug = 0; if (*str == '-') /* @@ -1027,8 +1043,8 @@ static unsigned long kmem_cache_flags(unsigned long objsize, * Enable debugging if selected on the kernel commandline. */ if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) - flags |= slub_debug; + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + flags |= slub_debug; return flags; } @@ -1055,6 +1071,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, } #define slub_debug 0 +#define disable_higher_order_debug 0 + static inline unsigned long slabs_node(struct kmem_cache *s, int node) { return 0; } static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) @@ -1110,8 +1128,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) } if (kmemcheck_enabled - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) - { + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); @@ -1561,6 +1578,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) "default order: %d, min order: %d\n", s->name, s->objsize, s->size, oo_order(s->oo), oo_order(s->min)); + if (oo_order(s->min) > get_order(s->objsize)) + printk(KERN_WARNING " %s debugging increased min order, use " + "slub_debug=O to disable.\n", s->name); + for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); unsigned long nr_slabs; @@ -2002,7 +2023,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects --; + min_objects--; } /* @@ -2092,8 +2113,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) */ #define NR_KMEM_CACHE_CPU 100 -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; +static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], + kmem_cache_cpu); static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); @@ -2401,6 +2422,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * on bootup. */ align = calculate_alignment(flags, align, s->objsize); + s->align = align; /* * SLUB stores one object immediately after another beginning from @@ -2453,6 +2475,18 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (!calculate_sizes(s, -1)) goto error; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->objsize)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s, -1)) + goto error; + } + } /* * The larger the object size is, the more pages we want on the partial @@ -2605,6 +2639,8 @@ void kmem_cache_destroy(struct kmem_cache *s) "still has objects.\n", s->name, __func__); dump_stack(); } + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); sysfs_slab_remove(s); } else up_write(&slub_lock); @@ -2789,6 +2825,11 @@ static s8 size_index[24] = { 2 /* 192 */ }; +static inline int size_index_elem(size_t bytes) +{ + return (bytes - 1) / 8; +} + static struct kmem_cache *get_slab(size_t size, gfp_t flags) { int index; @@ -2797,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) if (!size) return ZERO_SIZE_PTR; - index = size_index[(size - 1) / 8]; + index = size_index[size_index_elem(size)]; } else index = fls(size - 1); @@ -2833,13 +2874,15 @@ EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; + void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; page = alloc_pages_node(node, flags, get_order(size)); if (page) - return page_address(page); - else - return NULL; + ptr = page_address(page); + + kmemleak_alloc(ptr, size, 1, flags); + return ptr; } #ifdef CONFIG_NUMA @@ -2924,6 +2967,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); + kmemleak_free(x); put_page(page); return; } @@ -3152,10 +3196,12 @@ void __init kmem_cache_init(void) slab_state = PARTIAL; /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 64) { + if (KMALLOC_MIN_SIZE <= 32) { create_kmalloc_cache(&kmalloc_caches[1], "kmalloc-96", 96, GFP_NOWAIT); caches++; + } + if (KMALLOC_MIN_SIZE <= 64) { create_kmalloc_cache(&kmalloc_caches[2], "kmalloc-192", 192, GFP_NOWAIT); caches++; @@ -3182,17 +3228,28 @@ void __init kmem_cache_init(void) BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) - size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { + int elem = size_index_elem(i); + if (elem >= ARRAY_SIZE(size_index)) + break; + size_index[elem] = KMALLOC_SHIFT_LOW; + } - if (KMALLOC_MIN_SIZE == 128) { + if (KMALLOC_MIN_SIZE == 64) { + /* + * The 96 byte size cache is not used if the alignment + * is 64 byte. + */ + for (i = 64 + 8; i <= 96; i += 8) + size_index[size_index_elem(i)] = 7; + } else if (KMALLOC_MIN_SIZE == 128) { /* * The 192 byte sized cache is not used if the alignment * is 128 byte. Redirect kmalloc to use the 256 byte cache * instead. */ for (i = 128 + 8; i <= 192; i += 8) - size_index[(i - 1) / 8] = 8; + size_index[size_index_elem(i)] = 8; } slab_state = UP; @@ -3288,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, { struct kmem_cache *s; + if (WARN_ON(!name)) + return NULL; + down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { @@ -4539,8 +4599,11 @@ static int sysfs_slab_add(struct kmem_cache *s) } err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) + if (err) { + kobject_del(&s->kobj); + kobject_put(&s->kobj); return err; + } kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ @@ -4722,7 +4785,7 @@ static const struct file_operations proc_slabinfo_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); return 0; } module_init(slab_proc_init); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a13ea6401ae7..d9714bdcb4a3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. */ if (slab_is_available()) { - struct page *page = alloc_pages_node(node, + struct page *page; + + if (node_state(node, N_HIGH_MEMORY)) + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, get_order(size)); + else + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(size)); if (page) return page_address(page); return NULL; diff --git a/mm/sparse.c b/mm/sparse.c index da432d9f0ae8..6ce4aab69e99 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) - section = kmalloc_node(array_size, GFP_KERNEL, nid); - else + if (slab_is_available()) { + if (node_state(nid, N_HIGH_MEMORY)) + section = kmalloc_node(array_size, GFP_KERNEL, nid); + else + section = kmalloc(array_size, GFP_KERNEL); + } else section = alloc_bootmem_node(NODE_DATA(nid), array_size); if (section) diff --git a/mm/swap.c b/mm/swap.c index cb29ae5d33ab..308e57d8d7ed 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec) spin_lock(&zone->lru_lock); } if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_is_file_cache(page); + int lru = page_lru_base_type(page); list_move_tail(&page->lru, &zone->lru[lru].list); pgmoved++; } @@ -181,7 +181,7 @@ void activate_page(struct page *page) spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { int file = page_is_file_cache(page); - int lru = LRU_BASE + file; + int lru = page_lru_base_type(page); del_page_from_lru_list(zone, page, lru); SetPageActive(page); @@ -189,7 +189,7 @@ void activate_page(struct page *page) add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(zone, page, !!file, 1); + update_page_reclaim_stat(zone, page, file, 1); } spin_unlock_irq(&zone->lru_lock); } @@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); */ void __init swap_setup(void) { - unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP bdi_init(swapper_space.backing_dev_info); diff --git a/mm/swap_state.c b/mm/swap_state.c index 42cd38eba79f..6d1daeb1cb4a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { + .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; @@ -66,10 +67,10 @@ void show_swap_cache_info(void) } /* - * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +static int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -77,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) VM_BUG_ON(PageSwapCache(page)); VM_BUG_ON(!PageSwapBacked(page)); + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + + spin_lock_irq(&swapper_space.tree_lock); + error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + if (likely(!error)) { + total_swapcache_pages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); + } + spin_unlock_irq(&swapper_space.tree_lock); + + if (unlikely(error)) { + /* + * Only the context which have set SWAP_HAS_CACHE flag + * would call add_to_swap_cache(). + * So add_to_swap_cache() doesn't returns -EEXIST. + */ + VM_BUG_ON(error == -EEXIST); + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } + + return error; +} + + +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +{ + int error; + error = radix_tree_preload(gfp_mask); if (!error) { - page_cache_get(page); - SetPageSwapCache(page); - set_page_private(page, entry.val); - - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, - entry.val, page); - if (likely(!error)) { - total_swapcache_pages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); - } - spin_unlock_irq(&swapper_space.tree_lock); + error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); - - if (unlikely(error)) { - set_page_private(page, 0UL); - ClearPageSwapCache(page); - page_cache_release(page); - } } return error; } @@ -136,38 +152,34 @@ int add_to_swap(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageUptodate(page)); - for (;;) { - entry = get_swap_page(); - if (!entry.val) - return 0; + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Radix-tree node allocations from PF_MEMALLOC contexts could + * completely exhaust the page allocator. __GFP_NOMEMALLOC + * stops emergency reserves from being allocated. + * + * TODO: this could cause a theoretical memory reclaim + * deadlock in the swap out path. + */ + /* + * Add it to the swap cache and mark it dirty + */ + err = add_to_swap_cache(page, entry, + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); + + if (!err) { /* Success */ + SetPageDirty(page); + return 1; + } else { /* -ENOMEM radix-tree allocation failure */ /* - * Radix-tree node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - /* - * Add it to the swap cache and mark it dirty + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. */ - err = add_to_swap_cache(page, entry, - __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - switch (err) { - case 0: /* Success */ - SetPageDirty(page); - return 1; - case -EEXIST: - /* Raced with "speculative" read_swap_cache_async */ - swapcache_free(entry, NULL); - continue; - default: - /* -ENOMEM radix-tree allocation failure */ - swapcache_free(entry, NULL); - return 0; - } + swapcache_free(entry, NULL); + return 0; } } @@ -289,26 +301,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* + * call radix_tree_preload() while we can wait. + */ + err = radix_tree_preload(gfp_mask & GFP_KERNEL); + if (err) + break; + + /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) /* seems racy */ + if (err == -EEXIST) { /* seems racy */ + radix_tree_preload_end(); continue; - if (err) /* swp entry is obsolete ? */ + } + if (err) { /* swp entry is obsolete ? */ + radix_tree_preload_end(); break; + } - /* - * Associate the page with swap entry in the swap cache. - * May fail (-EEXIST) if there is already a page associated - * with this entry in the swap cache: added by a racing - * read_swap_cache_async, or add_to_swap or shmem_writepage - * re-using the just freed swap entry for an existing page. - * May fail (-ENOMEM) if radix-tree node allocation failed. - */ + /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __set_page_locked(new_page); SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); + err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { + radix_tree_preload_end(); /* * Initiate read into locked page and return. */ @@ -316,8 +333,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, swap_readpage(new_page); return new_page; } + radix_tree_preload_end(); ClearPageSwapBacked(new_page); __clear_page_locked(new_page); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ swapcache_free(entry, NULL); } while (err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index d1ade1a48ee7..a1bc6b9af9a2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -161,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si) } err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL); + nr_blocks, GFP_KERNEL, + DISCARD_FL_BARRIER); if (err) break; @@ -200,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO)) + nr_blocks, GFP_NOIO, + DISCARD_FL_BARRIER)) break; } @@ -697,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry) struct swap_info_struct *p; struct page *page = NULL; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return 1; p = swap_info_get(entry); @@ -753,7 +755,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) if (!bdev) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); return i; @@ -765,7 +767,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); @@ -1573,9 +1575,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->flags &= ~SWP_WRITEOK; spin_unlock(&swap_lock); - current->flags |= PF_SWAPOFF; + current->flags |= PF_OOM_ORIGIN; err = try_to_unuse(type); - current->flags &= ~PF_SWAPOFF; + current->flags &= ~PF_OOM_ORIGIN; if (err) { /* re-insert swap space back into swap_list */ @@ -1972,12 +1974,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + if (p->bdev) { + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + p->cluster_next = 1 + (random32() % p->highest_bit); + } + if (discard_swap(p) == 0) + p->flags |= SWP_DISCARDABLE; } - if (discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; mutex_lock(&swapon_mutex); spin_lock(&swap_lock); @@ -2083,7 +2087,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache) int count; bool has_cache; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return -EINVAL; type = swp_type(entry); diff --git a/mm/truncate.c b/mm/truncate.c index ccc3ecf7cb98..450cebdabfc0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page); * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void +static int truncate_complete_page(struct address_space *mapping, struct page *page) { if (page->mapping != mapping) - return; + return -EIO; if (page_has_private(page)) do_invalidatepage(page, 0); @@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ + return 0; } /* @@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return ret; } +int truncate_inode_page(struct address_space *mapping, struct page *page) +{ + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + return truncate_complete_page(mapping, page); +} + +/* + * Used to get rid of pages on hardware memory corruption. + */ +int generic_error_remove_page(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return -EINVAL; + /* + * Only punch for normal data pages for now. + * Handling other types like directories would need more auditing. + */ + if (!S_ISREG(mapping->host->i_mode)) + return -EIO; + return truncate_inode_page(mapping, page); +} +EXPORT_SYMBOL(generic_error_remove_page); + +/* + * Safely invalidate one page from its pagecache mapping. + * It only drops clean, unused pages. The page must be locked. + * + * Returns 1 if the page is successfully invalidated, otherwise 0. + */ +int invalidate_inode_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + if (!mapping) + return 0; + if (PageDirty(page) || PageWriteback(page)) + return 0; + if (page_mapped(page)) + return 0; + return invalidate_complete_page(mapping, page); +} + /** * truncate_inode_pages - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate @@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } - truncate_complete_page(mapping, page); + truncate_inode_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping, break; lock_page(page); wait_on_page_writeback(page); - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } + truncate_inode_page(mapping, page); if (page->index > next) next = page->index; next++; - truncate_complete_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (lock_failed) continue; - if (PageDirty(page) || PageWriteback(page)) - goto unlock; - if (page_mapped(page)) - goto unlock; - ret += invalidate_complete_page(mapping, page); -unlock: + ret += invalidate_inode_page(page); + unlock_page(page); if (next > end) break; @@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping) return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); + +/** + * truncate_pagecache - unmap and remove pagecache that has been truncated + * @inode: inode + * @old: old file offset + * @new: new file offset + * + * inode's new i_size must already be written before truncate_pagecache + * is called. + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +{ + if (new < old) { + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + } +} +EXPORT_SYMBOL(truncate_pagecache); + +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode *inode, loff_t offset) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, offset); + if (error) + return error; + oldsize = inode->i_size; + i_size_write(inode, offset); + truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) + inode->i_op->truncate(inode); + + return error; +} +EXPORT_SYMBOL(vmtruncate); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f8189a4b3e13..69511e663234 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -25,7 +25,7 @@ #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> - +#include <linux/highmem.h> #include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> @@ -168,11 +168,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, next = pgd_addr_end(addr, end); err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); if (err) - break; + return err; } while (pgd++, addr = next, addr != end); - if (unlikely(err)) - return err; return nr; } @@ -186,7 +184,7 @@ static int vmap_page_range(unsigned long start, unsigned long end, return ret; } -static inline int is_vmalloc_or_module_addr(const void *x) +int is_vmalloc_or_module_addr(const void *x) { /* * ARM, x86-64 and sparc64 put modules in a special place, @@ -265,6 +263,7 @@ struct vmap_area { static DEFINE_SPINLOCK(vmap_area_lock); static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -431,6 +430,15 @@ static void __free_vmap_area(struct vmap_area *va) RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); + /* + * Track the highest possible candidate for pcpu area + * allocation. Areas outside of vmalloc area can be returned + * here too, consider only end addresses which fall inside + * vmalloc area proper. + */ + if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) + vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + call_rcu(&va->rcu_head, rcu_free_va); } @@ -1038,6 +1046,9 @@ void __init vmalloc_init(void) va->va_end = va->va_start + tmp->size; __insert_vmap_area(va); } + + vmap_area_pcpu_hole = VMALLOC_END; + vmap_initialized = true; } @@ -1122,13 +1133,34 @@ EXPORT_SYMBOL_GPL(map_vm_area); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, void *caller) +{ + struct vm_struct *tmp, **p; + + vm->flags = flags; + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + vm->caller = caller; + va->private = vm; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= vm->addr) + break; + } + vm->next = *p; + *p = vm; + write_unlock(&vmlist_lock); +} + static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) { static struct vmap_area *va; struct vm_struct *area; - struct vm_struct *tmp, **p; unsigned long align = 1; BUG_ON(in_interrupt()); @@ -1147,7 +1179,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (unlikely(!size)) return NULL; - area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; @@ -1162,25 +1194,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - area->flags = flags; - area->addr = (void *)va->va_start; - area->size = size; - area->pages = NULL; - area->nr_pages = 0; - area->phys_addr = 0; - area->caller = caller; - va->private = area; - va->flags |= VM_VM_AREA; - - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= area->addr) - break; - } - area->next = *p; - *p = area; - write_unlock(&vmlist_lock); - + insert_vmalloc_vm(area, va, flags, caller); return area; } @@ -1256,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; struct vm_struct *tmp, **p; - - vmap_debug_free_range(va->va_start, va->va_end); - free_unmap_vmap_area(va); - vm->size -= PAGE_SIZE; - + /* + * remove from list and disallow access to this vm_struct + * before unmap. (address range confliction is maintained by + * vmap.) + */ write_lock(&vmlist_lock); for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) ; *p = tmp->next; write_unlock(&vmlist_lock); + vmap_debug_free_range(va->va_start, va->va_end); + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + return vm; } return NULL; @@ -1368,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > num_physpages) + if (count > totalram_pages) return NULL; area = get_vm_area_caller((count << PAGE_SHIFT), flags, @@ -1475,7 +1493,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, @@ -1625,10 +1643,120 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user); +/* + * small helper routine , copy contents to buf from addr. + * If the page is not present, fill zero. + */ + +static int aligned_vread(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p, KM_USER0); + memcpy(buf, map + offset, length); + kunmap_atomic(map, KM_USER0); + } else + memset(buf, 0, length); + + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +static int aligned_vwrite(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p, KM_USER0); + memcpy(map + offset, buf, length); + kunmap_atomic(map, KM_USER0); + } + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +/** + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be increased. + * (same number to @count). Returns 0 if [addr...addr+count) doesn't + * includes any intersect with alive vmalloc area. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. + * @buf should be kernel's buffer. Because this function uses KM_USER0, + * the caller should guarantee KM_USER0 is not used. + * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + */ + long vread(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; char *vaddr, *buf_start = buf; + unsigned long buflen = count; unsigned long n; /* Don't allow overflow */ @@ -1636,7 +1764,7 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { + for (tmp = vmlist; count && tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; if (addr >= vaddr + tmp->size - PAGE_SIZE) continue; @@ -1649,32 +1777,72 @@ long vread(char *buf, char *addr, unsigned long count) count--; } n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); + if (n > count) + n = count; + if (!(tmp->flags & VM_IOREMAP)) + aligned_vread(buf, addr, n); + else /* IOREMAP area is treated as memory hole */ + memset(buf, 0, n); + buf += n; + addr += n; + count -= n; } finished: read_unlock(&vmlist_lock); - return buf - buf_start; + + if (buf == buf_start) + return 0; + /* zero-fill memory holes */ + if (buf != buf_start + buflen) + memset(buf, 0, buflen - (buf - buf_start)); + + return buflen; } +/** + * vwrite() - write vmalloc area in a safe way. + * @buf: buffer for source data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be incresed. + * (same number to @count). + * If [addr...addr+count) doesn't includes any intersect with valid + * vmalloc area, returns 0. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from a buffer to the given addr. If specified range of + * [addr...addr+count) includes some valid address, data is copied from + * proper area of @buf. If there are memory holes, no copy to hole. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. + * @buf should be kernel's buffer. Because this function uses KM_USER0, + * the caller should guarantee KM_USER0 is not used. + * + * Note: In usual ops, vwrite() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + * The caller should guarantee KM_USER1 is not used. + */ + long vwrite(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; + char *vaddr; + unsigned long n, buflen; + int copied = 0; /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; + buflen = count; read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { + for (tmp = vmlist; count && tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; if (addr >= vaddr + tmp->size - PAGE_SIZE) continue; @@ -1686,18 +1854,21 @@ long vwrite(char *buf, char *addr, unsigned long count) count--; } n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *addr = *buf; - buf++; - addr++; - count--; - } while (--n > 0); + if (n > count) + n = count; + if (!(tmp->flags & VM_IOREMAP)) { + aligned_vwrite(buf, addr, n); + copied++; + } + buf += n; + addr += n; + count -= n; } finished: read_unlock(&vmlist_lock); - return buf - buf_start; + if (!copied) + return 0; + return buflen; } /** @@ -1818,6 +1989,286 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +static struct vmap_area *node_to_va(struct rb_node *n) +{ + return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; +} + +/** + * pvm_find_next_prev - find the next and prev vmap_area surrounding @end + * @end: target address + * @pnext: out arg for the next vmap_area + * @pprev: out arg for the previous vmap_area + * + * Returns: %true if either or both of next and prev are found, + * %false if no vmap_area exists + * + * Find vmap_areas end addresses of which enclose @end. ie. if not + * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + */ +static bool pvm_find_next_prev(unsigned long end, + struct vmap_area **pnext, + struct vmap_area **pprev) +{ + struct rb_node *n = vmap_area_root.rb_node; + struct vmap_area *va = NULL; + + while (n) { + va = rb_entry(n, struct vmap_area, rb_node); + if (end < va->va_end) + n = n->rb_left; + else if (end > va->va_end) + n = n->rb_right; + else + break; + } + + if (!va) + return false; + + if (va->va_end > end) { + *pnext = va; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } else { + *pprev = va; + *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); + } + return true; +} + +/** + * pvm_determine_end - find the highest aligned address between two vmap_areas + * @pnext: in/out arg for the next vmap_area + * @pprev: in/out arg for the previous vmap_area + * @align: alignment + * + * Returns: determined end address + * + * Find the highest aligned address between *@pnext and *@pprev below + * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned + * down address is between the end addresses of the two vmap_areas. + * + * Please note that the address returned by this function may fall + * inside *@pnext vmap_area. The caller is responsible for checking + * that. + */ +static unsigned long pvm_determine_end(struct vmap_area **pnext, + struct vmap_area **pprev, + unsigned long align) +{ + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long addr; + + if (*pnext) + addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); + else + addr = vmalloc_end; + + while (*pprev && (*pprev)->va_end > addr) { + *pnext = *pprev; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } + + return addr; +} + +/** + * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator + * @offsets: array containing offset of each area + * @sizes: array containing size of each area + * @nr_vms: the number of areas to allocate + * @align: alignment, all entries in @offsets and @sizes must be aligned to this + * @gfp_mask: allocation mask + * + * Returns: kmalloc'd vm_struct pointer array pointing to allocated + * vm_structs on success, %NULL on failure + * + * Percpu allocator wants to use congruent vm areas so that it can + * maintain the offsets among percpu areas. This function allocates + * congruent vmalloc areas for it. These areas tend to be scattered + * pretty far, distance between two areas easily going up to + * gigabytes. To avoid interacting with regular vmallocs, these areas + * are allocated from top. + * + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans areas from the end looking for + * matching slot. While scanning, if any of the areas overlaps with + * existing vmap_area, the base address is pulled down to fit the + * area. Scanning is repeated till all the areas fit and then all + * necessary data structres are inserted and the result is returned. + */ +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask) +{ + const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + struct vmap_area **vas, *prev, *next; + struct vm_struct **vms; + int area, area2, last_area, term_area; + unsigned long base, start, end, last_end; + bool purged = false; + + gfp_mask &= GFP_RECLAIM_MASK; + + /* verify parameters and allocate data structures */ + BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + for (last_area = 0, area = 0; area < nr_vms; area++) { + start = offsets[area]; + end = start + sizes[area]; + + /* is everything aligned properly? */ + BUG_ON(!IS_ALIGNED(offsets[area], align)); + BUG_ON(!IS_ALIGNED(sizes[area], align)); + + /* detect the area with the highest address */ + if (start > offsets[last_area]) + last_area = area; + + for (area2 = 0; area2 < nr_vms; area2++) { + unsigned long start2 = offsets[area2]; + unsigned long end2 = start2 + sizes[area2]; + + if (area2 == area) + continue; + + BUG_ON(start2 >= start && start2 < end); + BUG_ON(end2 <= end && end2 > start); + } + } + last_end = offsets[last_area] + sizes[last_area]; + + if (vmalloc_end - vmalloc_start < last_end) { + WARN_ON(true); + return NULL; + } + + vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); + vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + if (!vas || !vms) + goto err_free; + + for (area = 0; area < nr_vms; area++) { + vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); + vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + if (!vas[area] || !vms[area]) + goto err_free; + } +retry: + spin_lock(&vmap_area_lock); + + /* start scanning - we scan from the top, begin with the last area */ + area = term_area = last_area; + start = offsets[area]; + end = start + sizes[area]; + + if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { + base = vmalloc_end - last_end; + goto found; + } + base = pvm_determine_end(&next, &prev, align) - end; + + while (true) { + BUG_ON(next && next->va_end <= base + end); + BUG_ON(prev && prev->va_end > base + end); + + /* + * base might have underflowed, add last_end before + * comparing. + */ + if (base + last_end < vmalloc_start + last_end) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + goto retry; + } + goto err_free; + } + + /* + * If next overlaps, move base downwards so that it's + * right below next and then recheck. + */ + if (next && next->va_start < base + end) { + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * If prev overlaps, shift down next and prev and move + * base so that it's right below new next and then + * recheck. + */ + if (prev && prev->va_end > base + start) { + next = prev; + prev = node_to_va(rb_prev(&next->rb_node)); + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * This area fits, move on to the previous one. If + * the previous one is the terminal one, we're done. + */ + area = (area + nr_vms - 1) % nr_vms; + if (area == term_area) + break; + start = offsets[area]; + end = start + sizes[area]; + pvm_find_next_prev(base + end, &next, &prev); + } +found: + /* we've found a fitting base, insert all va's */ + for (area = 0; area < nr_vms; area++) { + struct vmap_area *va = vas[area]; + + va->va_start = base + offsets[area]; + va->va_end = va->va_start + sizes[area]; + __insert_vmap_area(va); + } + + vmap_area_pcpu_hole = base + offsets[last_area]; + + spin_unlock(&vmap_area_lock); + + /* insert all vm's */ + for (area = 0; area < nr_vms; area++) + insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); + + kfree(vas); + return vms; + +err_free: + for (area = 0; area < nr_vms; area++) { + if (vas) + kfree(vas[area]); + if (vms) + kfree(vms[area]); + } + kfree(vas); + kfree(vms); + return NULL; +} + +/** + * pcpu_free_vm_areas - free vmalloc areas for percpu allocator + * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() + * @nr_vms: the number of allocated areas + * + * Free vm_structs and the array allocated by pcpu_get_vm_areas(). + */ +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) +{ + int i; + + for (i = 0; i < nr_vms; i++) + free_vm_area(vms[i]); + kfree(vms); +} #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) diff --git a/mm/vmscan.c b/mm/vmscan.c index 54155268dfca..64e438898832 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -148,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, return &zone->reclaim_stat; } -static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, - enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct zone *zone, + struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); @@ -286,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!page_has_private(page) == 2; + /* + * A freeable page cache page is referenced only by the caller + * that isolated the page, the page cache radix tree and + * optional buffer heads at page->private. + */ + return page_count(page) - page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -361,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. - * See swapfile.c:page_queue_congested(). */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; @@ -531,7 +535,7 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_is_file_cache(page); + lru = active + page_lru_base_type(page); lru_cache_add_lru(page, lru); } else { /* @@ -630,9 +634,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, referenced = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); - /* In active use or really unfreeable? Activate it. */ + /* + * In active use or really unfreeable? Activate it. + * If page which have PG_mlocked lost isoltation race, + * try_to_unmap moves it to unevictable list + */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + referenced && page_mapping_inuse(page) + && !(vm_flags & VM_LOCKED)) goto activate_locked; /* @@ -654,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, 0)) { + switch (try_to_unmap(page, TTU_UNMAP)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -816,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; - if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) return ret; /* @@ -930,6 +939,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; + + /* + * If we don't have enough swap space, reclaiming of + * anon page which don't already have a swap slot is + * pointless. + */ + if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + !PageSwapCache(cursor_page)) + continue; + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); @@ -956,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr, if (file) lru += LRU_FILE; return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, !!file); + mode, file); } /* @@ -971,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { - lru = page_is_file_cache(page); + lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); @@ -1029,6 +1048,31 @@ int isolate_lru_page(struct page *page) } /* + * Are there way too many processes in the direct reclaim path already? + */ +static int too_many_isolated(struct zone *zone, int file, + struct scan_control *sc) +{ + unsigned long inactive, isolated; + + if (current_is_kswapd()) + return 0; + + if (!scanning_global_lru(sc)) + return 0; + + if (file) { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } + + return isolated > inactive; +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1043,6 +1087,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int lumpy_reclaim = 0; + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(WRITE, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + /* * If we need a large contiguous chunk of memory, or have * trouble getting a small set of contiguous pages, we @@ -1067,10 +1119,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; + unsigned long nr_anon; + unsigned long nr_file; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); + + if (scanning_global_lru(sc)) { + zone->pages_scanned += nr_scan; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scan); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scan); + } + + if (nr_taken == 0) + goto done; + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); @@ -1083,8 +1151,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_INACTIVE_ANON, -count[LRU_INACTIVE_ANON]); - if (scanning_global_lru(sc)) - zone->pages_scanned += nr_scan; + nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; @@ -1104,7 +1174,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, */ if (nr_freed < nr_taken && !current_is_kswapd() && lumpy_reclaim) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The attempt at page out may have made some @@ -1118,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, } nr_reclaimed += nr_freed; + local_irq_disable(); - if (current_is_kswapd()) { - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_freed); - } else if (scanning_global_lru(sc)) - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); - if (nr_taken == 0) - goto done; - spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. @@ -1148,8 +1212,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); - if (PageActive(page)) { - int file = !!page_is_file_cache(page); + if (is_active_lru(lru)) { + int file = is_file_lru(lru); reclaim_stat->recent_rotated[file]++; } if (!pagevec_add(&pvec, page)) { @@ -1158,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, spin_lock_irq(&zone->lru_lock); } } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + } while (nr_scanned < max_scan); - spin_unlock(&zone->lru_lock); + done: - local_irq_enable(); + spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); return nr_reclaimed; } @@ -1210,15 +1277,10 @@ static void move_active_pages_to_lru(struct zone *zone, while (!list_empty(list)) { page = lru_to_page(list); - prefetchw_prev_lru_page(page, list, flags); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - if (!is_active_lru(lru)) - ClearPageActive(page); /* we are de-activating */ - list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); pgmoved++; @@ -1239,7 +1301,7 @@ static void move_active_pages_to_lru(struct zone *zone, static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { - unsigned long pgmoved; + unsigned long nr_taken; unsigned long pgscanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ @@ -1247,10 +1309,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_inactive); struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_rotated = 0; lru_add_drain(); spin_lock_irq(&zone->lru_lock); - pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, + nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, sc->mem_cgroup, 1, file); /* @@ -1260,16 +1323,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, if (scanning_global_lru(sc)) { zone->pages_scanned += pgscanned; } - reclaim_stat->recent_scanned[!!file] += pgmoved; + reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1283,7 +1346,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - pgmoved++; + nr_rotated++; /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -1299,6 +1362,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } } + ClearPageActive(page); /* we are de-activating */ list_add(&page->lru, &l_inactive); } @@ -1312,13 +1376,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * helps balance scan pressure between file and anonymous pages in * get_scan_ratio. */ - reclaim_stat->recent_rotated[!!file] += pgmoved; + reclaim_stat->recent_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); move_active_pages_to_lru(zone, &l_inactive, LRU_BASE + file * LRU_FILE); - + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); } @@ -1424,10 +1488,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); @@ -1521,6 +1585,7 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int noswap = 0; /* If we have no swap space, do not bother scanning anon pages. */ @@ -1535,17 +1600,14 @@ static void shrink_zone(int priority, struct zone *zone, int file = is_file_lru(l); unsigned long scan; - scan = zone_nr_pages(zone, sc, l); + scan = zone_nr_lru_pages(zone, sc, l); if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) - nr[l] = nr_scan_try_batch(scan, - &zone->lru[l].nr_saved_scan, - swap_cluster_max); - else - nr[l] = scan; + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l], + swap_cluster_max); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1647,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed @@ -1680,7 +1742,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } } @@ -1715,13 +1777,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (total_scanned > sc->swap_cluster_max + sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scanning_global_lru(sc)) @@ -1774,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + unsigned int swappiness, + struct zone *zone, int nid) +{ + struct scan_control sc = { + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = swappiness, + .order = 0, + .mem_cgroup = mem, + .isolate_pages = mem_cgroup_isolate_pages, + }; + nodemask_t nm = nodemask_of_node(nid); + + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + sc.nodemask = &nm; + sc.nr_reclaimed = 0; + sc.nr_scanned = 0; + /* + * NOTE: Although we can get the priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * if we don't reclaim here, the shrink_zone from balance_pgdat + * will pick up pages from other mem cgroup's as well. We hack + * the priority and make it zero. + */ + shrink_zone(0, zone, &sc); + return sc.nr_reclaimed; +} + unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, unsigned int swappiness) { + struct zonelist *zonelist; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, @@ -1790,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; - struct zonelist *zonelist; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -1897,7 +1992,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } /* @@ -1912,6 +2007,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + int nid, zid; if (!populated_zone(zone)) continue; @@ -1926,6 +2022,15 @@ loop_again: temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); + + nid = pgdat->node_id; + zid = zone_idx(zone); + /* + * Call soft limit reclaim before calling shrink_zone. + * For now we ignore the return value + */ + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, + nid, zid); /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. @@ -1941,7 +2046,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_lru_pages(zone) * 6)) + (zone_reclaimable_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -1960,7 +2065,7 @@ loop_again: * another pass across the zones. */ if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * We do this so kswapd doesn't build up large priorities for @@ -2108,12 +2213,39 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } -unsigned long global_lru_pages(void) +/* + * The reclaimable count would be mostly accurate. + * The less reclaimable pages may be + * - mlocked pages, which will be moved to unevictable list when encountered + * - mapped pages, which may require several travels to be reclaimed + * - dirty pages, which is not "instantly" reclaimable + */ +unsigned long global_reclaimable_pages(void) { - return global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_INACTIVE_FILE); + int nr; + + nr = global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON); + + return nr; +} + +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; } #ifdef CONFIG_HIBERNATION @@ -2128,6 +2260,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, { struct zone *zone; unsigned long nr_reclaimed = 0; + struct zone_reclaim_stat *reclaim_stat; for_each_populated_zone(zone) { enum lru_list l; @@ -2144,11 +2277,14 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { + reclaim_stat = get_reclaim_stat(zone, sc); + reclaim_stat->nr_saved_scan[l] += + (lru_pages >> prio) + 1; + if (reclaim_stat->nr_saved_scan[l] + >= nr_pages || pass > 3) { unsigned long nr_to_scan; - zone->lru[l].nr_saved_scan = 0; + reclaim_stat->nr_saved_scan[l] = 0; nr_to_scan = min(nr_pages, lru_pages); nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); @@ -2185,7 +2321,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; - lru_pages = global_lru_pages(); + lru_pages = global_reclaimable_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -2227,13 +2363,13 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); + global_reclaimable_pages()); sc.nr_reclaimed += reclaim_state.reclaimed_slab; if (sc.nr_reclaimed >= nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); } } @@ -2244,7 +2380,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!sc.nr_reclaimed) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, + global_reclaimable_pages()); sc.nr_reclaimed += reclaim_state.reclaimed_slab; } while (sc.nr_reclaimed < nr_pages && reclaim_state.reclaimed_slab > 0); @@ -2564,7 +2701,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { - enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); @@ -2707,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void) unsigned long scan_unevictable_pages; int scan_unevictable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write && *(unsigned long *)table->data) scan_all_zones_unevictable_pages(); diff --git a/mm/vmstat.c b/mm/vmstat.c index 138bed53706e..c81321f9feec 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -639,11 +639,14 @@ static const char * const vmstat_text[] = { "nr_slab_reclaimable", "nr_slab_unreclaimable", "nr_page_table_pages", + "nr_kernel_stack", "nr_unstable", "nr_bounce", "nr_vmscan_write", "nr_writeback_temp", - + "nr_isolated_anon", + "nr_isolated_file", + "nr_shmem", #ifdef CONFIG_NUMA "numa_hit", "numa_miss", |