Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 23
-rw-r--r-- | mm/Makefile | 1
-rw-r--r-- | mm/cleancache.c | 244
-rw-r--r-- | mm/filemap.c | 12
-rw-r--r-- | mm/fremap.c | 2
-rw-r--r-- | mm/hugetlb.c | 4
-rw-r--r-- | mm/memcontrol.c | 364
-rw-r--r-- | mm/memory.c | 4
-rw-r--r-- | mm/mlock.c | 8
-rw-r--r-- | mm/mmap.c | 8
-rw-r--r-- | mm/page_alloc.c | 4
-rw-r--r-- | mm/page_cgroup.c | 28
-rw-r--r-- | mm/shmem.c | 11
-rw-r--r-- | mm/truncate.c | 6
-rw-r--r-- | mm/vmalloc.c | 4
-rw-r--r-- | mm/vmscan.c | 104
16 files changed, 723 insertions, 104 deletions
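A note on how the pieces below fit together: the new frontend in mm/cleancache.c is inert until a backend (such as zcache or the Xen tmem driver) hands it a filled-in struct cleancache_ops via cleancache_register_ops(). The fragment that follows is an illustrative sketch only, not part of this patch; the "ex_" names and the do-nothing bodies are hypothetical, and the ops signatures are inferred from the call sites visible in the mm/cleancache.c diff below.

/*
 * Illustrative only: a minimal, hypothetical no-op cleancache backend
 * showing how a real backend would register with the frontend added
 * by this series. It accepts every pool and caches nothing.
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/cleancache.h>

static int ex_init_fs(size_t pagesize)
{
        return 0;               /* pool id; a real backend allocates one per fs */
}

static int ex_init_shared_fs(char *uuid, size_t pagesize)
{
        return 0;               /* shared pool id for clustered filesystems */
}

static int ex_get_page(int pool_id, struct cleancache_filekey key,
                       pgoff_t index, struct page *page)
{
        return -1;              /* miss: the caller falls back to a real disk read */
}

static void ex_put_page(int pool_id, struct cleancache_filekey key,
                        pgoff_t index, struct page *page)
{
        /* a real backend copies the clean page into transcendent memory here */
}

static void ex_flush_page(int pool_id, struct cleancache_filekey key,
                          pgoff_t index)
{
        /* drop any copy of (pool_id, key, index) so a later get fails */
}

static void ex_flush_inode(int pool_id, struct cleancache_filekey key)
{
        /* drop all pages cached for this inode key */
}

static void ex_flush_fs(int pool_id)
{
        /* surrender the pool at unmount time */
}

static struct cleancache_ops ex_ops = {
        .init_fs        = ex_init_fs,
        .init_shared_fs = ex_init_shared_fs,
        .get_page       = ex_get_page,
        .put_page       = ex_put_page,
        .flush_page     = ex_flush_page,
        .flush_inode    = ex_flush_inode,
        .flush_fs       = ex_flush_fs,
};

static int __init ex_cleancache_init(void)
{
        struct cleancache_ops old = cleancache_register_ops(&ex_ops);

        /* a non-default 'old' means another backend had already registered */
        (void)old;
        return 0;
}
module_init(ex_cleancache_init);
MODULE_LICENSE("GPL");

Registration also sets the cleancache_enabled flag, which is what keeps the hooks added to filemap.c and truncate.c in this series cheap when no backend is loaded.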
diff --git a/mm/Kconfig b/mm/Kconfig index e9c0c61f2ddd..8ca47a5ee9c8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -347,3 +347,26 @@ config NEED_PER_CPU_KM depends on !SMP bool default y + +config CLEANCACHE + bool "Enable cleancache driver to cache clean pages if tmem is present" + default n + help + Cleancache can be thought of as a page-granularity victim cache + for clean pages that the kernel's pageframe replacement algorithm + (PFRA) would like to keep around, but can't since there isn't enough + memory. So when the PFRA "evicts" a page, it first attempts to use + cleancacne code to put the data contained in that page into + "transcendent memory", memory that is not directly accessible or + addressable by the kernel and is of unknown and possibly + time-varying size. And when a cleancache-enabled + filesystem wishes to access a page in a file on disk, it first + checks cleancache to see if it already contains it; if it does, + the page is copied into the kernel and a disk access is avoided. + When a transcendent memory driver is available (such as zcache or + Xen transcendent memory), a significant I/O reduction + may be achieved. When none is available, all cleancache calls + are reduced to a single pointer-compare-against-NULL resulting + in a negligible performance hit. + + If unsure, say Y to enable cleancache diff --git a/mm/Makefile b/mm/Makefile index 42a8326c3e3d..836e4163c1bf 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -49,3 +49,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o +obj-$(CONFIG_CLEANCACHE) += cleancache.o diff --git a/mm/cleancache.c b/mm/cleancache.c new file mode 100644 index 000000000000..bcaae4c2a770 --- /dev/null +++ b/mm/cleancache.c @@ -0,0 +1,244 @@ +/* + * Cleancache frontend + * + * This code provides the generic "frontend" layer to call a matching + * "backend" driver implementation of cleancache. See + * Documentation/vm/cleancache.txt for more information. + * + * Copyright (C) 2009-2010 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/exportfs.h> +#include <linux/mm.h> +#include <linux/cleancache.h> + +/* + * This global enablement flag may be read thousands of times per second + * by cleancache_get/put/flush even on systems where cleancache_ops + * is not claimed (e.g. cleancache is config'ed on but remains + * disabled), so is preferred to the slower alternative: a function + * call that checks a non-global. + */ +int cleancache_enabled; +EXPORT_SYMBOL(cleancache_enabled); + +/* + * cleancache_ops is set by cleancache_ops_register to contain the pointers + * to the cleancache "backend" implementation functions. 
+ */ +static struct cleancache_ops cleancache_ops; + +/* useful stats available in /sys/kernel/mm/cleancache */ +static unsigned long cleancache_succ_gets; +static unsigned long cleancache_failed_gets; +static unsigned long cleancache_puts; +static unsigned long cleancache_flushes; + +/* + * register operations for cleancache, returning previous thus allowing + * detection of multiple backends and possible nesting + */ +struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops) +{ + struct cleancache_ops old = cleancache_ops; + + cleancache_ops = *ops; + cleancache_enabled = 1; + return old; +} +EXPORT_SYMBOL(cleancache_register_ops); + +/* Called by a cleancache-enabled filesystem at time of mount */ +void __cleancache_init_fs(struct super_block *sb) +{ + sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE); +} +EXPORT_SYMBOL(__cleancache_init_fs); + +/* Called by a cleancache-enabled clustered filesystem at time of mount */ +void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) +{ + sb->cleancache_poolid = + (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE); +} +EXPORT_SYMBOL(__cleancache_init_shared_fs); + +/* + * If the filesystem uses exportable filehandles, use the filehandle as + * the key, else use the inode number. + */ +static int cleancache_get_key(struct inode *inode, + struct cleancache_filekey *key) +{ + int (*fhfn)(struct dentry *, __u32 *fh, int *, int); + int len = 0, maxlen = CLEANCACHE_KEY_MAX; + struct super_block *sb = inode->i_sb; + + key->u.ino = inode->i_ino; + if (sb->s_export_op != NULL) { + fhfn = sb->s_export_op->encode_fh; + if (fhfn) { + struct dentry d; + d.d_inode = inode; + len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0); + if (len <= 0 || len == 255) + return -1; + if (maxlen > CLEANCACHE_KEY_MAX) + return -1; + } + } + return 0; +} + +/* + * "Get" data from cleancache associated with the poolid/inode/index + * that were specified when the data was put to cleanache and, if + * successful, use it to fill the specified page with data and return 0. + * The pageframe is unchanged and returns -1 if the get fails. + * Page must be locked by caller. + */ +int __cleancache_get_page(struct page *page) +{ + int ret = -1; + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id < 0) + goto out; + + if (cleancache_get_key(page->mapping->host, &key) < 0) + goto out; + + ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page); + if (ret == 0) + cleancache_succ_gets++; + else + cleancache_failed_gets++; +out: + return ret; +} +EXPORT_SYMBOL(__cleancache_get_page); + +/* + * "Put" data from a page to cleancache and associate it with the + * (previously-obtained per-filesystem) poolid and the page's, + * inode and page index. Page must be locked. Note that a put_page + * always "succeeds", though a subsequent get_page may succeed or fail. + */ +void __cleancache_put_page(struct page *page) +{ + int pool_id; + struct cleancache_filekey key = { .u.key = { 0 } }; + + VM_BUG_ON(!PageLocked(page)); + pool_id = page->mapping->host->i_sb->cleancache_poolid; + if (pool_id >= 0 && + cleancache_get_key(page->mapping->host, &key) >= 0) { + (*cleancache_ops.put_page)(pool_id, key, page->index, page); + cleancache_puts++; + } +} +EXPORT_SYMBOL(__cleancache_put_page); + +/* + * Flush any data from cleancache associated with the poolid and the + * page's inode and page index so that a subsequent "get" will fail. 
+ */ +void __cleancache_flush_page(struct address_space *mapping, struct page *page) +{ + /* careful... page->mapping is NULL sometimes when this is called */ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (pool_id >= 0) { + VM_BUG_ON(!PageLocked(page)); + if (cleancache_get_key(mapping->host, &key) >= 0) { + (*cleancache_ops.flush_page)(pool_id, key, page->index); + cleancache_flushes++; + } + } +} +EXPORT_SYMBOL(__cleancache_flush_page); + +/* + * Flush all data from cleancache associated with the poolid and the + * mappings's inode so that all subsequent gets to this poolid/inode + * will fail. + */ +void __cleancache_flush_inode(struct address_space *mapping) +{ + int pool_id = mapping->host->i_sb->cleancache_poolid; + struct cleancache_filekey key = { .u.key = { 0 } }; + + if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) + (*cleancache_ops.flush_inode)(pool_id, key); +} +EXPORT_SYMBOL(__cleancache_flush_inode); + +/* + * Called by any cleancache-enabled filesystem at time of unmount; + * note that pool_id is surrendered and may be reutrned by a subsequent + * cleancache_init_fs or cleancache_init_shared_fs + */ +void __cleancache_flush_fs(struct super_block *sb) +{ + if (sb->cleancache_poolid >= 0) { + int old_poolid = sb->cleancache_poolid; + sb->cleancache_poolid = -1; + (*cleancache_ops.flush_fs)(old_poolid); + } +} +EXPORT_SYMBOL(__cleancache_flush_fs); + +#ifdef CONFIG_SYSFS + +/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */ + +#define CLEANCACHE_SYSFS_RO(_name) \ + static ssize_t cleancache_##_name##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%lu\n", cleancache_##_name); \ + } \ + static struct kobj_attribute cleancache_##_name##_attr = { \ + .attr = { .name = __stringify(_name), .mode = 0444 }, \ + .show = cleancache_##_name##_show, \ + } + +CLEANCACHE_SYSFS_RO(succ_gets); +CLEANCACHE_SYSFS_RO(failed_gets); +CLEANCACHE_SYSFS_RO(puts); +CLEANCACHE_SYSFS_RO(flushes); + +static struct attribute *cleancache_attrs[] = { + &cleancache_succ_gets_attr.attr, + &cleancache_failed_gets_attr.attr, + &cleancache_puts_attr.attr, + &cleancache_flushes_attr.attr, + NULL, +}; + +static struct attribute_group cleancache_attr_group = { + .attrs = cleancache_attrs, + .name = "cleancache", +}; + +#endif /* CONFIG_SYSFS */ + +static int __init init_cleancache(void) +{ +#ifdef CONFIG_SYSFS + int err; + + err = sysfs_create_group(mm_kobj, &cleancache_attr_group); +#endif /* CONFIG_SYSFS */ + return 0; +} +module_init(init_cleancache) diff --git a/mm/filemap.c b/mm/filemap.c index 68e782b3d3de..bcdc393b6580 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -34,6 +34,7 @@ #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> #include <linux/mm_inline.h> /* for page_is_file_cache() */ +#include <linux/cleancache.h> #include "internal.h" /* @@ -118,6 +119,16 @@ void __delete_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + /* + * if we're uptodate, flush out into the cleancache, otherwise + * invalidate any existing cleancache entries. 
We can't leave + * stale data around in the cleancache once our page is gone + */ + if (PageUptodate(page) && PageMappedToDisk(page)) + cleancache_put_page(page); + else + cleancache_flush_page(mapping, page); + radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; @@ -1650,6 +1661,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: page = find_get_page(mapping, offset); diff --git a/mm/fremap.c b/mm/fremap.c index 7f4123056e06..b8e0e2d468af 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -224,7 +224,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, /* * drop PG_Mlocked flag for over-mapped range */ - unsigned int saved_flags = vma->vm_flags; + vm_flags_t saved_flags = vma->vm_flags; munlock_vma_pages_range(vma, start, start + size); vma->vm_flags = saved_flags; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5fd68b95c671..f33bb319b73f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2833,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, - int acctflag) + vm_flags_t vm_flags) { long ret, chg; struct hstate *h = hstate_inode(inode); @@ -2843,7 +2843,7 @@ int hugetlb_reserve_pages(struct inode *inode, * attempt will be made for VM_NORESERVE to allocate a page * and filesystem quota without using reserves */ - if (acctflag & VM_NORESERVE) + if (vm_flags & VM_NORESERVE) return 0; /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d5fd3dcd3f2e..bd9052a5d3ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -94,6 +94,8 @@ enum mem_cgroup_events_index { MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ MEM_CGROUP_EVENTS_NSTATS, }; /* @@ -231,6 +233,11 @@ struct mem_cgroup { * reclaimed from. */ int last_scanned_child; + int last_scanned_node; +#if MAX_NUMNODES > 1 + nodemask_t scan_nodes; + unsigned long next_scan_node_update; +#endif /* * Should the accounting and control be hierarchical, per subtree? 
*/ @@ -585,6 +592,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } +void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); +} + +void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +{ + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); +} + static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, enum mem_cgroup_events_index idx) { @@ -624,18 +641,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, preempt_enable(); } +static unsigned long +mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) +{ + struct mem_cgroup_per_zone *mz; + u64 total = 0; + int zid; + + for (zid = 0; zid < MAX_NR_ZONES; zid++) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + total += MEM_CGROUP_ZSTAT(mz, idx); + } + return total; +} static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { - int nid, zid; - struct mem_cgroup_per_zone *mz; + int nid; u64 total = 0; for_each_online_node(nid) - for (zid = 0; zid < MAX_NR_ZONES; zid++) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - total += MEM_CGROUP_ZSTAT(mz, idx); - } + total += mem_cgroup_get_zonestat_node(mem, nid, idx); return total; } @@ -813,6 +839,33 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) return (mem == root_mem_cgroup); } +void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) +{ + struct mem_cgroup *mem; + + if (!mm) + return; + + rcu_read_lock(); + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!mem)) + goto out; + + switch (idx) { + case PGMAJFAULT: + mem_cgroup_pgmajfault(mem, 1); + break; + case PGFAULT: + mem_cgroup_pgfault(mem, 1); + break; + default: + BUG(); + } +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL(mem_cgroup_count_vm_event); + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. 
@@ -1064,9 +1117,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) return (active > inactive); } -unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, - struct zone *zone, - enum lru_list lru) +unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, + struct zone *zone, + enum lru_list lru) { int nid = zone_to_nid(zone); int zid = zone_idx(zone); @@ -1075,6 +1128,93 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, return MEM_CGROUP_ZSTAT(mz, lru); } +#ifdef CONFIG_NUMA +static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + unsigned long ret; + + ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) + + mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE); + + return ret; +} + +static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_file_lru_pages(memcg, nid); + + return total; +} + +static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + unsigned long ret; + + ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + + mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); + + return ret; +} + +static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid); + + return total; +} + +static unsigned long +mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid) +{ + return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE); +} + +static unsigned long +mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid); + + return total; +} + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + enum lru_list l; + u64 total = 0; + + for_each_lru(l) + total += mem_cgroup_get_zonestat_node(memcg, nid, l); + + return total; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg) +{ + u64 total = 0; + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) + total += mem_cgroup_node_nr_lru_pages(memcg, nid); + + return total; +} +#endif /* CONFIG_NUMA */ + struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { @@ -1418,6 +1558,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) return ret; } +#if MAX_NUMNODES > 1 + +/* + * Always updating the nodemask is not very good - even if we have an empty + * list or the wrong list here, we can start from some node and traverse all + * nodes based on the zonelist. So update the list loosely once per 10 secs. 
+ * + */ +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +{ + int nid; + + if (time_after(mem->next_scan_node_update, jiffies)) + return; + + mem->next_scan_node_update = jiffies + 10*HZ; + /* make a nodemask where this memcg uses memory from */ + mem->scan_nodes = node_states[N_HIGH_MEMORY]; + + for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { + + if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || + mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) + continue; + + if (total_swap_pages && + (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) || + mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON))) + continue; + node_clear(nid, mem->scan_nodes); + } +} + +/* + * Selecting a node where we start reclaim from. Because what we need is just + * reducing usage counter, start from anywhere is O,K. Considering + * memory reclaim from current node, there are pros. and cons. + * + * Freeing memory from current node means freeing memory from a node which + * we'll use or we've used. So, it may make LRU bad. And if several threads + * hit limits, it will see a contention on a node. But freeing from remote + * node means more costs for memory reclaim because of memory latency. + * + * Now, we use round-robin. Better algorithm is welcomed. + */ +int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +{ + int node; + + mem_cgroup_may_update_nodemask(mem); + node = mem->last_scanned_node; + + node = next_node(node, mem->scan_nodes); + if (node == MAX_NUMNODES) + node = first_node(mem->scan_nodes); + /* + * We call this when we hit limit, not when pages are added to LRU. + * No LRU may hold pages because all pages are UNEVICTABLE or + * memcg is too small and all pages are not on LRU. In that case, + * we use curret node. + */ + if (unlikely(node == MAX_NUMNODES)) + node = numa_node_id(); + + mem->last_scanned_node = node; + return node; +} + +#else +int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +{ + return 0; +} +#endif + /* * Scan the hierarchy if needed to reclaim memory. 
We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1433,7 +1648,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, struct zone *zone, gfp_t gfp_mask, - unsigned long reclaim_options) + unsigned long reclaim_options, + unsigned long *total_scanned) { struct mem_cgroup *victim; int ret, total = 0; @@ -1442,6 +1658,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; unsigned long excess; + unsigned long nr_scanned; excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; @@ -1484,10 +1701,12 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, continue; } /* we use swappiness of local cgroup */ - if (check_soft) + if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, get_swappiness(victim), zone); - else + noswap, get_swappiness(victim), zone, + &nr_scanned); + *total_scanned += nr_scanned; + } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, get_swappiness(victim)); css_put(&victim->css); @@ -1503,7 +1722,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!res_counter_soft_limit_excess(&root_mem->res)) return total; } else if (mem_cgroup_margin(root_mem)) - return 1 + total; + return total; } return total; } @@ -1928,7 +2147,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags, NULL); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) return CHARGE_RETRY; /* @@ -3211,7 +3430,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, - MEM_CGROUP_RECLAIM_SHRINK); + MEM_CGROUP_RECLAIM_SHRINK, + NULL); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -3271,7 +3491,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, MEM_CGROUP_RECLAIM_NOSWAP | - MEM_CGROUP_RECLAIM_SHRINK); + MEM_CGROUP_RECLAIM_SHRINK, + NULL); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? 
*/ if (curusage >= oldusage) @@ -3285,7 +3506,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, } unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, + unsigned long *total_scanned) { unsigned long nr_reclaimed = 0; struct mem_cgroup_per_zone *mz, *next_mz = NULL; @@ -3293,6 +3515,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, int loop = 0; struct mem_cgroup_tree_per_zone *mctz; unsigned long long excess; + unsigned long nr_scanned; if (order > 0) return 0; @@ -3311,10 +3534,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, if (!mz) break; + nr_scanned = 0; reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, gfp_mask, - MEM_CGROUP_RECLAIM_SOFT); + MEM_CGROUP_RECLAIM_SOFT, + &nr_scanned); nr_reclaimed += reclaimed; + *total_scanned += nr_scanned; spin_lock(&mctz->lock); /* @@ -3337,10 +3563,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, */ next_mz = __mem_cgroup_largest_soft_limit_node(mctz); - if (next_mz == mz) { + if (next_mz == mz) css_put(&next_mz->mem->css); - next_mz = NULL; - } else /* next_mz == NULL or other memcg */ + else /* next_mz == NULL or other memcg */ break; } while (1); } @@ -3772,6 +3997,8 @@ enum { MCS_PGPGIN, MCS_PGPGOUT, MCS_SWAP, + MCS_PGFAULT, + MCS_PGMAJFAULT, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -3794,6 +4021,8 @@ struct { {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, {"swap", "total_swap"}, + {"pgfault", "total_pgfault"}, + {"pgmajfault", "total_pgmajfault"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -3822,6 +4051,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); + s->stat[MCS_PGFAULT] += val; + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); + s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); @@ -3845,6 +4078,51 @@ mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) mem_cgroup_get_local_stat(iter, s); } +#ifdef CONFIG_NUMA +static int mem_control_numa_stat_show(struct seq_file *m, void *arg) +{ + int nid; + unsigned long total_nr, file_nr, anon_nr, unevictable_nr; + unsigned long node_nr; + struct cgroup *cont = m->private; + struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); + + total_nr = mem_cgroup_nr_lru_pages(mem_cont); + seq_printf(m, "total=%lu", total_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + file_nr = mem_cgroup_nr_file_lru_pages(mem_cont); + seq_printf(m, "file=%lu", file_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont); + seq_printf(m, "anon=%lu", anon_nr); + for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont); + seq_printf(m, "unevictable=%lu", unevictable_nr); + 
for_each_node_state(nid, N_HIGH_MEMORY) { + node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont, + nid); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + return 0; +} +#endif /* CONFIG_NUMA */ + static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, struct cgroup_map_cb *cb) { @@ -3855,6 +4133,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); + for (i = 0; i < NR_MCS_STAT; i++) { if (i == MCS_SWAP && !do_swap_account) continue; @@ -4278,6 +4557,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, return 0; } +#ifdef CONFIG_NUMA +static const struct file_operations mem_control_numa_stat_file_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int mem_control_numa_stat_open(struct inode *unused, struct file *file) +{ + struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; + + file->f_op = &mem_control_numa_stat_file_operations; + return single_open(file, mem_control_numa_stat_show, cont); +} +#endif /* CONFIG_NUMA */ + static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4341,6 +4636,12 @@ static struct cftype mem_cgroup_files[] = { .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, +#ifdef CONFIG_NUMA + { + .name = "numa_stat", + .open = mem_control_numa_stat_open, + }, +#endif }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4596,6 +4897,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) res_counter_init(&mem->memsw, NULL); } mem->last_scanned_child = 0; + mem->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&mem->oom_notify); if (parent) @@ -4953,8 +5255,7 @@ static void mem_cgroup_clear_mc(void) static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { int ret = 0; struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); @@ -4993,8 +5294,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { mem_cgroup_clear_mc(); } @@ -5112,8 +5412,7 @@ retry: static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { struct mm_struct *mm; @@ -5131,22 +5430,19 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, #else /* !CONFIG_MMU */ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { return 0; } static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgroup, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p, - bool threadgroup) + struct task_struct *p) { } #endif diff --git a/mm/memory.c b/mm/memory.c index b73f677f0bb1..6953d3926e01 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -730,7 +730,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE); } -static inline int is_cow_mapping(unsigned int flags) +static inline int is_cow_mapping(vm_flags_t flags) { return (flags & (VM_SHARED | 
VM_MAYWRITE)) == VM_MAYWRITE; } @@ -2874,6 +2874,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing @@ -3413,6 +3414,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, __set_current_state(TASK_RUNNING); count_vm_event(PGFAULT); + mem_cgroup_count_vm_event(mm, PGFAULT); /* do counter updates before entering really critical section. */ check_sync_rss_stat(current); diff --git a/mm/mlock.c b/mm/mlock.c index 516b2c2ddd5a..048260c4e02e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -307,13 +307,13 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * For vmas that pass the filters, merge/split as appropriate. */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, - unsigned long start, unsigned long end, unsigned int newflags) + unsigned long start, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; int nr_pages; int ret = 0; - int lock = newflags & VM_LOCKED; + int lock = !!(newflags & VM_LOCKED); if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm)) @@ -385,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on) prev = vma; for (nstart = start ; ; ) { - unsigned int newflags; + vm_flags_t newflags; /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ @@ -524,7 +524,7 @@ static int do_mlockall(int flags) goto out; for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { - unsigned int newflags; + vm_flags_t newflags; newflags = vma->vm_flags | VM_LOCKED; if (!(flags & MCL_CURRENT)) diff --git a/mm/mmap.c b/mm/mmap.c index ac2631b7477f..bbdc9af5e117 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -960,7 +960,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, { struct mm_struct * mm = current->mm; struct inode *inode; - unsigned int vm_flags; + vm_flags_t vm_flags; int error; unsigned long reqprot = prot; @@ -1165,7 +1165,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) */ int vma_wants_writenotify(struct vm_area_struct *vma) { - unsigned int vm_flags = vma->vm_flags; + vm_flags_t vm_flags = vma->vm_flags; /* If it was private or non-writable, the write bit is already clear */ if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) @@ -1193,7 +1193,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma) * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. 
*/ -static inline int accountable_mapping(struct file *file, unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM @@ -1207,7 +1207,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags) unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff) + vm_flags_t vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a00f17c3bf4..a4e1db3f1981 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4323,10 +4323,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->zone_pgdat = pgdat; zone_pcp_init(zone); - for_each_lru(l) { + for_each_lru(l) INIT_LIST_HEAD(&zone->lru[l].list); - zone->reclaim_stat.nr_saved_scan[l] = 0; - } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; zone->reclaim_stat.recent_scanned[0] = 0; diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 2daadc322ba6..74ccff61d1be 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -130,7 +130,7 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc) return page; } -static void *__init_refok alloc_page_cgroup(size_t size, int nid) +static void *__meminit alloc_page_cgroup(size_t size, int nid) { void *addr = NULL; @@ -162,7 +162,7 @@ static void free_page_cgroup(void *addr) } #endif -static int __init_refok init_section_page_cgroup(unsigned long pfn) +static int __meminit init_section_page_cgroup(unsigned long pfn) { struct page_cgroup *base, *pc; struct mem_section *section; @@ -475,7 +475,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) if (!do_swap_account) return 0; - length = ((max_pages/SC_PER_PAGE) + 1); + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); array = vmalloc(array_size); @@ -492,8 +492,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) /* memory shortage */ ctrl->map = NULL; ctrl->length = 0; - vfree(array); mutex_unlock(&swap_cgroup_mutex); + vfree(array); goto nomem; } mutex_unlock(&swap_cgroup_mutex); @@ -508,7 +508,8 @@ nomem: void swap_cgroup_swapoff(int type) { - int i; + struct page **map; + unsigned long i, length; struct swap_cgroup_ctrl *ctrl; if (!do_swap_account) @@ -516,17 +517,20 @@ void swap_cgroup_swapoff(int type) mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; - if (ctrl->map) { - for (i = 0; i < ctrl->length; i++) { - struct page *page = ctrl->map[i]; + map = ctrl->map; + length = ctrl->length; + ctrl->map = NULL; + ctrl->length = 0; + mutex_unlock(&swap_cgroup_mutex); + + if (map) { + for (i = 0; i < length; i++) { + struct page *page = map[i]; if (page) __free_page(page); } - vfree(ctrl->map); - ctrl->map = NULL; - ctrl->length = 0; + vfree(map); } - mutex_unlock(&swap_cgroup_mutex); } #endif diff --git a/mm/shmem.c b/mm/shmem.c index 69edb45a9f28..1acfb2687bfa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1305,12 +1305,10 @@ repeat: swappage = lookup_swap_cache(swap); if (!swappage) { shmem_swp_unmap(entry); + spin_unlock(&info->lock); /* here we actually do the io */ - if (type && !(*type & VM_FAULT_MAJOR)) { - __count_vm_event(PGMAJFAULT); + if (type) *type |= VM_FAULT_MAJOR; - } - spin_unlock(&info->lock); swappage = shmem_swapin(swap, gfp, info, idx); if (!swappage) { spin_lock(&info->lock); @@ -1549,7 +1547,10 @@ 
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - + if (ret & VM_FAULT_MAJOR) { + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + } return ret | VM_FAULT_LOCKED; } diff --git a/mm/truncate.c b/mm/truncate.c index a95667529135..3a29a6180212 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,6 +19,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ +#include <linux/cleancache.h> #include "internal.h" @@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { zero_user_segment(page, partial, PAGE_CACHE_SIZE); + cleancache_flush_page(page->mapping, page); if (page_has_private(page)) do_invalidatepage(page, partial); } @@ -214,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t next; int i; + cleancache_flush_inode(mapping); if (mapping->nrpages == 0) return; @@ -291,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); } + cleancache_flush_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -440,6 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int did_range_unmap = 0; int wrapped = 0; + cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); next = start; while (next <= end && !wrapped && @@ -498,6 +503,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, mem_cgroup_uncharge_end(); cond_resched(); } + cleancache_flush_inode(mapping); return ret; } EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b5ccf3158d82..1d34d75366a7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2153,10 +2153,6 @@ struct vm_struct *alloc_vm_area(size_t size) return NULL; } - /* Make sure the pagetables are constructed in process kernel - mappings */ - vmalloc_sync_all(); - return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7e0116150dc7..faa0a088f9cc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -173,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) - return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); + return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru); return zone_page_state(zone, NR_LRU_BASE + lru); } @@ -1718,26 +1718,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, } /* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. - */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan) -{ - unsigned long nr; - - *nr_saved_scan += nr_to_scan; - nr = *nr_saved_scan; - - if (nr >= SWAP_CLUSTER_MAX) - *nr_saved_scan = 0; - else - nr = 0; - - return nr; -} - -/* * Determine how aggressively the anon and file LRU lists should be * scanned. 
The relative value of each set of LRU lists is determined * by looking at the fraction of the pages scanned we did rotate back @@ -1755,6 +1735,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, u64 fraction[2], denominator; enum lru_list l; int noswap = 0; + int force_scan = 0; + + + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + + if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { + /* kswapd does zone balancing and need to scan this zone */ + if (scanning_global_lru(sc) && current_is_kswapd()) + force_scan = 1; + /* memcg may have small limit and need to avoid priority drop */ + if (!scanning_global_lru(sc)) + force_scan = 1; + } /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || (nr_swap_pages <= 0)) { @@ -1765,11 +1761,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, goto out; } - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); - if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, @@ -1836,8 +1827,23 @@ out: scan >>= priority; scan = div64_u64(scan * fraction[file], denominator); } - nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l]); + + /* + * If zone is small or memcg is small, nr[l] can be 0. + * This results no-scan on this priority and priority drop down. + * For global direct reclaim, it can visit next zone and tend + * not to have problems. For global kswapd, it's for zone + * balancing and it need to scan a small amounts. When using + * memcg, priority drop can cause big latency. So, it's better + * to scan small amount. See may_noscan above. + */ + if (!scan && force_scan) { + if (file) + scan = SWAP_CLUSTER_MAX; + else if (!noswap) + scan = SWAP_CLUSTER_MAX; + } + nr[l] = scan; } } @@ -1977,11 +1983,14 @@ restart: * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. 
*/ -static void shrink_zones(int priority, struct zonelist *zonelist, +static unsigned long shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; + unsigned long total_scanned = 0; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -1998,8 +2007,17 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; /* Let kswapd poll it */ } + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + total_scanned += nr_soft_scanned; + shrink_zone(priority, zone, sc); } + + return total_scanned; } static bool zone_reclaimable(struct zone *zone) @@ -2064,7 +2082,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - shrink_zones(priority, zonelist, sc); + total_scanned += shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -2171,9 +2189,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, unsigned int swappiness, - struct zone *zone) + struct zone *zone, + unsigned long *nr_scanned) { struct scan_control sc = { + .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_writepage = !laptop_mode, .may_unmap = 1, @@ -2182,6 +2202,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .order = 0, .mem_cgroup = mem, }; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2200,6 +2221,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } @@ -2210,6 +2232,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, { struct zonelist *zonelist; unsigned long nr_reclaimed; + int nid; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, @@ -2226,7 +2249,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .gfp_mask = sc.gfp_mask, }; - zonelist = NODE_DATA(numa_node_id())->node_zonelists; + /* + * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't + * take care of from where we get pages. So the node where we start the + * scan does not need to be the current node. + */ + nid = mem_cgroup_select_victim_node(mem_cont); + + zonelist = NODE_DATA(nid)->node_zonelists; trace_mm_vmscan_memcg_reclaim_begin(0, sc.may_writepage, @@ -2347,6 +2377,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; + unsigned long nr_soft_reclaimed; + unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, @@ -2439,11 +2471,15 @@ loop_again: sc.nr_scanned = 0; + nr_soft_scanned = 0; /* * Call soft limit reclaim before calling shrink_zone. 
- * For now we ignore the return value */ - mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + order, sc.gfp_mask, + &nr_soft_scanned); + sc.nr_reclaimed += nr_soft_reclaimed; + total_scanned += nr_soft_scanned; /* * We put equal pressure on every zone, unless |