From 0f8975ec4db2c8b5bd111b211292ca9be0feb6b8 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov <xemul@parallels.com> Date: Wed, 3 Jul 2013 15:01:20 -0700 Subject: mm: soft-dirty bits for user memory changes tracking The soft-dirty is a bit on a PTE which helps to track which pages a task writes to. In order to do this tracking one should 1. Clear soft-dirty bits from PTEs ("echo 4 > /proc/PID/clear_refs) 2. Wait some time. 3. Read soft-dirty bits (55'th in /proc/PID/pagemap2 entries) To do this tracking, the writable bit is cleared from PTEs when the soft-dirty bit is. Thus, after this, when the task tries to modify a page at some virtual address the #PF occurs and the kernel sets the soft-dirty bit on the respective PTE. Note, that although all the task's address space is marked as r/o after the soft-dirty bits clear, the #PF-s that occur after that are processed fast. This is so, since the pages are still mapped to physical memory, and thus all the kernel does is finds this fact out and puts back writable, dirty and soft-dirty bits on the PTE. Another thing to note, is that when mremap moves PTEs they are marked with soft-dirty as well, since from the user perspective mremap modifies the virtual memory at mremap's new address. Signed-off-by: Pavel Emelyanov <xemul@parallels.com> Cc: Matt Mackall <mpm@selenic.com> Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com> Cc: Glauber Costa <glommer@parallels.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/Kconfig | 12 ++++++++++++ mm/huge_memory.c | 2 +- mm/mremap.c | 2 +- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index f5e698e30d4a..7e28ecfa8aa4 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -477,3 +477,15 @@ config FRONTSWAP and swap data is stored as normal on the matching swap device. If unsure, say Y to enable frontswap. + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 362c329b83fe..d8b3b850150c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1429,7 +1429,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (ret == 1) { pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(&mm->page_table_lock); } out: diff --git a/mm/mremap.c b/mm/mremap.c index 463a25705ac6..3708655378e9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); - set_pte_at(mm, new_addr, new_pte, pte); + set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte)); } arch_leave_lazy_mmu_mode(); -- cgit v1.2.3 From ffbdccf5e1facd18b54429a749667fb185c10f20 Mon Sep 17 00:00:00 2001 From: David Rientjes <rientjes@google.com> Date: Wed, 3 Jul 2013 15:01:23 -0700 Subject: mm, memcg: don't take task_lock in task_in_mem_cgroup For processes that have detached their mm's, task_in_mem_cgroup() unnecessarily takes task_lock() when rcu_read_lock() is all that is necessary to call mem_cgroup_from_task(). While we're here, switch task_in_mem_cgroup() to return bool. Signed-off-by: David Rientjes <rientjes@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Michal Hocko <mhocko@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/memcontrol.h | 9 +++++---- mm/memcontrol.c | 11 ++++++----- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d6183f06d8c1..7b4d9d79570b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -77,7 +77,8 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page); bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, struct mem_cgroup *memcg); -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg); extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); @@ -273,10 +274,10 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return true; } -static inline int task_in_mem_cgroup(struct task_struct *task, - const struct mem_cgroup *memcg) +static inline bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - return 1; + return true; } static inline struct cgroup_subsys_state diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 194721839cf5..4748966b1511 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return ret; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - int ret; struct mem_cgroup *curr = NULL; struct task_struct *p; + bool ret; p = find_lock_task_mm(task); if (p) { @@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) * killer still needs to detect if they have already been oom * killed to prevent needlessly killing additional tasks. */ - task_lock(task); + rcu_read_lock(); curr = mem_cgroup_from_task(task); if (curr) css_get(&curr->css); - task_unlock(task); + rcu_read_unlock(); } if (!curr) - return 0; + return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is -- cgit v1.2.3 From b430e9d1c6d416306d44dbf3aa3148be7af78abc Mon Sep 17 00:00:00 2001 From: Minchan Kim <minchan@kernel.org> Date: Wed, 3 Jul 2013 15:01:24 -0700 Subject: mm: remove compressed copy from zram in-memory Swap subsystem does lazy swap slot free with expecting the page would be swapped out again so we can avoid unnecessary write. But the problem in in-memory swap(ex, zram) is that it consumes memory space until vm_swap_full(ie, used half of all of swap device) condition meet. It could be bad if we use multiple swap device, small in-memory swap and big storage swap or in-memory swap alone. This patch makes swap subsystem free swap slot as soon as swap-read is completed and make the swapcache page dirty so the page should be written out the swap device to reclaim it. It means we never lose it. I tested this patch with kernel compile workload. 1. before compile time : 9882.42 zram max wasted space by fragmentation: 13471881 byte memory space consumed by zram: 174227456 byte the number of slot free notify: 206684 2. after compile time : 9653.90 zram max wasted space by fragmentation: 11805932 byte memory space consumed by zram: 154001408 byte the number of slot free notify: 426972 [akpm@linux-foundation.org: tweak comment text] [artem.savkov@gmail.com: fix BUG due to non-swapcache pages in end_swap_bio_read()] [akpm@linux-foundation.org: invert unlikely() test, augment comment, 80-col cleanup] Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com> Signed-off-by: Minchan Kim <minchan@kernel.org> Signed-off-by: Artem Savkov <artem.savkov@gmail.com> Cc: Hugh Dickins <hughd@google.com> Cc: Seth Jennings <sjenning@linux.vnet.ibm.com> Cc: Nitin Gupta <ngupta@vflare.org> Cc: Konrad Rzeszutek Wilk <konrad@darnok.org> Cc: Shaohua Li <shli@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_io.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_io.c b/mm/page_io.c index a8a3ef45fed7..ba05b64e5d8d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -21,6 +21,7 @@ #include <linux/writeback.h> #include <linux/frontswap.h> #include <linux/aio.h> +#include <linux/blkdev.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err) imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), (unsigned long long)bio->bi_sector); - } else { - SetPageUptodate(page); + goto out; } + + SetPageUptodate(page); + + /* + * There is no guarantee that the page is in swap cache - the software + * suspend code (at least) uses end_swap_bio_read() against a non- + * swapcache page. So we must check PG_swapcache before proceeding with + * this optimization. + */ + if (likely(PageSwapCache(page))) { + struct swap_info_struct *sis; + + sis = page_swap_info(page); + if (sis->flags & SWP_BLKDEV) { + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the as swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. + */ + struct gendisk *disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } + } + } + +out: unlock_page(page); bio_put(bio); } -- cgit v1.2.3 From d6e932177090463e5c709e9e61bbd705a33a1609 Mon Sep 17 00:00:00 2001 From: Libin <huawei.libin@huawei.com> Date: Wed, 3 Jul 2013 15:01:26 -0700 Subject: mm: use vma_pages() to replace (vm_end - vm_start) >> PAGE_SHIFT (*->vm_end - *->vm_start) >> PAGE_SHIFT operation is implemented as a inline funcion vma_pages() in linux/mm.h, so using it. Signed-off-by: Libin <huawei.libin@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memory.c | 2 +- mm/mmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 95d0cce63583..a101bbcacfd7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2904,7 +2904,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, details->first_index, details->last_index) { vba = vma->vm_pgoff; - vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + vea = vba + vma_pages(vma) - 1; /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ zba = details->first_index; if (zba < vba) diff --git a/mm/mmap.c b/mm/mmap.c index f681e1842fad..8468ffd05bae 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, if (is_mergeable_vma(vma, file, vm_flags) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; - vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + vm_pglen = vma_pages(vma); if (vma->vm_pgoff + vm_pglen == vm_pgoff) return 1; } -- cgit v1.2.3 From 4008bab7b3969ad9f9dd1d02096a3f0aa5610bd2 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:28 -0700 Subject: mm/page_alloc: factor out setting of pcp->high and pcp->batch "Problems" with the current code: 1: there is a lack of synchronization in setting ->high and ->batch in percpu_pagelist_fraction_sysctl_handler() 2: stop_machine() in zone_pcp_update() is unnecissary. 3: zone_pcp_update() does not consider the case where percpu_pagelist_fraction is non-zero To fix: 1: add memory barriers, a safe ->batch value, an update side mutex when updating ->high and ->batch, and use ACCESS_ONCE() for ->batch users that expect a stable value. 2: avoid draining pages in zone_pcp_update(), rely upon the memory barriers added to fix #1 3: factor out quite a few functions, and then call the appropriate one. Note that it results in a change to the behavior of zone_pcp_update(), which is used by memory_hotplug. I'm rather certain that I've diserned (and preserved) the essential behavior (changing ->high and ->batch), and only eliminated unneeded actions (draining the per cpu pages), but this may not be the case. Further note that the draining of pages that previously took place in zone_pcp_update() occured after repeated draining when attempting to offline a page, and after the offline has "succeeded". It appears that the draining was added to zone_pcp_update() to avoid refactoring setup_pageset() into 2 funtions. This patch: Creates pageset_set_batch() for use in setup_pageset(). pageset_set_batch() imitates the functionality of setup_pagelist_highmark(), but uses the boot time (percpu_pagelist_fraction == 0) calculations for determining ->high based on ->batch. Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c3edb624fccf..d4bcc20ab6f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4032,6 +4032,14 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } +/* a companion to setup_pagelist_highmark() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp = &p->pcp; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); +} + static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; @@ -4041,8 +4049,7 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); + pageset_set_batch(p, batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } @@ -4051,7 +4058,6 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ - static void setup_pagelist_highmark(struct per_cpu_pageset *p, unsigned long high) { -- cgit v1.2.3 From c8e251fadc6220261f6e0c6b8a4f1cdf27626165 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:29 -0700 Subject: mm/page_alloc: prevent concurrent updaters of pcp ->batch and ->high Because we are going to rely upon a careful transision between old and new ->high and ->batch values using memory barriers and will remove stop_machine(), we need to prevent multiple updaters from interweaving their memory writes. Add a simple mutex to protect both update loops. Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d4bcc20ab6f0..8d4335779633 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -65,6 +65,9 @@ #include <asm/div64.h> #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -5557,6 +5560,8 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret < 0)) return ret; + + mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { for_each_possible_cpu(cpu) { unsigned long high; @@ -5565,6 +5570,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, per_cpu_ptr(zone->pageset, cpu), high); } } + mutex_unlock(&pcp_batch_high_lock); return 0; } @@ -6078,7 +6084,9 @@ static int __meminit __zone_pcp_update(void *data) void __meminit zone_pcp_update(struct zone *zone) { + mutex_lock(&pcp_batch_high_lock); stop_machine(__zone_pcp_update, zone, NULL); + mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From 8d7a8fa97abeb4fd6b3975d32c9f859875157770 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:31 -0700 Subject: mm/page_alloc: insert memory barriers to allow async update of pcp batch and high Introduce pageset_update() to perform a safe transision from one set of pcp->{batch,high} to a new set using memory barriers. This ensures that batch is always set to a safe value (1) prior to updating high, and ensure that high is fully updated before setting the real value of batch. It avoids ->batch ever rising above ->high. Suggested by Gilad Ben-Yossef in these threads: https://lkml.org/lkml/2013/4/9/23 https://lkml.org/lkml/2013/4/10/49 Also reproduces his proposed comment. Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Reviewed-by: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8d4335779633..eaaef2a09424 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4035,12 +4035,37 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher then ->high. + * The following function updates them in a safe manner without read side + * locking. + * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (acording the the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). + */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + /* start with a fail safe value for batch */ + pcp->batch = 1; + smp_wmb(); + + /* Update high, then batch, in order */ + pcp->high = high; + smp_wmb(); + + pcp->batch = batch; +} + /* a companion to setup_pagelist_highmark() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { - struct per_cpu_pages *pcp = &p->pcp; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); + pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) @@ -4064,13 +4089,11 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) static void setup_pagelist_highmark(struct per_cpu_pageset *p, unsigned long high) { - struct per_cpu_pages *pcp; + unsigned long batch = max(1UL, high / 4); + if ((high / 4) > (PAGE_SHIFT * 8)) + batch = PAGE_SHIFT * 8; - pcp = &p->pcp; - pcp->high = high; - pcp->batch = max(1UL, high/4); - if ((high/4) > (PAGE_SHIFT * 8)) - pcp->batch = PAGE_SHIFT * 8; + pageset_update(&p->pcp, high, batch); } static void __meminit setup_zone_pageset(struct zone *zone) -- cgit v1.2.3 From 998d39cb236fe464af86a3492a24d2f67ee1efc2 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:32 -0700 Subject: mm/page_alloc: protect pcp->batch accesses with ACCESS_ONCE pcp->batch could change at any point, avoid relying on it being a stable value. Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaaef2a09424..97b8f861e63d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1182,10 +1182,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain; + unsigned long batch; local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; + batch = ACCESS_ONCE(pcp->batch); + if (pcp->count >= batch) + to_drain = batch; else to_drain = pcp->count; if (to_drain > 0) { @@ -1353,8 +1355,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); - pcp->count -= pcp->batch; + unsigned long batch = ACCESS_ONCE(pcp->batch); + free_pcppages_bulk(zone, batch, pcp); + pcp->count -= batch; } out: -- cgit v1.2.3 From 0a647f3811d6af56405a819341ceac23e31d4572 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:33 -0700 Subject: mm/page_alloc: convert zone_pcp_update() to rely on memory barriers instead of stop_machine() zone_pcp_update()'s goal is to adjust the ->high and ->mark members of a percpu pageset based on a zone's ->managed_pages. We don't need to drain the entire percpu pageset just to modify these fields. This lets us avoid calling setup_pageset() (and the draining required to call it) and instead allows simply setting the fields' values (with some attention paid to memory barriers to prevent the relationship between ->batch and ->high from being thrown off). This does change the behavior of zone_pcp_update() as the percpu pagesets will not be drained when zone_pcp_update() is called (they will end up being shrunk, not completely drained, later when a 0-order page is freed in free_hot_cold_page()). Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 33 +++++++++------------------------ 1 file changed, 9 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97b8f861e63d..8125263be60f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6085,33 +6085,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) #endif #ifdef CONFIG_MEMORY_HOTPLUG -static int __meminit __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); - drain_zonestat(zone, pset); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. + */ void __meminit zone_pcp_update(struct zone *zone) { + unsigned cpu; + unsigned long batch; mutex_lock(&pcp_batch_high_lock); - stop_machine(__zone_pcp_update, zone, NULL); + batch = zone_batchsize(zone); + for_each_possible_cpu(cpu) + pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From 22a7f12b1606327f0e11fcdf9043ae00bf9917df Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:34 -0700 Subject: mm/page_alloc: when handling percpu_pagelist_fraction, don't unneedly recalulate high Simply moves calculation of the new 'high' value outside the for_each_possible_cpu() loop, as it does not depend on the cpu. Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8125263be60f..386de0f11bea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5575,7 +5575,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist * can have before it gets flushed back to buddy allocator. */ - int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5589,12 +5588,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - for_each_possible_cpu(cpu) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned long high; + high = zone->managed_pages / percpu_pagelist_fraction; + for_each_possible_cpu(cpu) setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); - } + per_cpu_ptr(zone->pageset, cpu), high); } mutex_unlock(&pcp_batch_high_lock); return 0; -- cgit v1.2.3 From 88c90dbccaaed35991b5336fec84294de1d23538 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:35 -0700 Subject: mm/page_alloc: factor setup_pageset() into pageset_init() and pageset_set_batch() Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 386de0f11bea..a235149d9406 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4071,7 +4071,7 @@ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; @@ -4080,11 +4080,16 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pageset_set_batch(p, batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_init(p); + pageset_set_batch(p, batch); +} + /* * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. -- cgit v1.2.3 From dd1895e2c5c9ed3a791d1d8eb4a6a3e241ec9d6e Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:36 -0700 Subject: mm/page_alloc: relocate comment to be directly above code it refers to. Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a235149d9406..2793ce50f316 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3711,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG if (zone) setup_zone_pageset(zone); #endif + /* we have to stop all cpus to guarantee there is no user + of zonelist */ stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } -- cgit v1.2.3 From 56cef2b85c28d81efd39f2eeaddce28678756fe3 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:38 -0700 Subject: mm/page_alloc: factor zone_pageset_init() out of setup_zone_pageset() Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2793ce50f316..ee6fe7faabad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4104,22 +4104,25 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + pageset_init(pcp); + if (percpu_pagelist_fraction) + setup_pagelist_highmark(pcp, + (zone->managed_pages / + percpu_pagelist_fraction)); + else + pageset_set_batch(pcp, zone_batchsize(zone)); +} + static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; - zone->pageset = alloc_percpu(struct per_cpu_pageset); - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - setup_pageset(pcp, zone_batchsize(zone)); - - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->managed_pages / - percpu_pagelist_fraction)); - } + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); } /* -- cgit v1.2.3 From 737af4c0110fc69a81dc7464a74a4113f7645255 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:39 -0700 Subject: mm/page_alloc: in zone_pcp_update(), uze zone_pageset_init() Previously, zone_pcp_update() called pageset_set_batch() directly, essentially assuming that percpu_pagelist_fraction == 0. Correct this by calling zone_pageset_init(), which chooses the appropriate ->batch and ->high calculations. Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ee6fe7faabad..c7344d17660b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6098,11 +6098,9 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) void __meminit zone_pcp_update(struct zone *zone) { unsigned cpu; - unsigned long batch; mutex_lock(&pcp_batch_high_lock); - batch = zone_batchsize(zone); for_each_possible_cpu(cpu) - pageset_set_batch(per_cpu_ptr(zone->pageset, cpu), batch); + zone_pageset_init(zone, cpu); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From 3664033c56f211a3dcf28d9d68c604ed447d8d79 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:40 -0700 Subject: mm/page_alloc: rename setup_pagelist_highmark() to match naming of pageset_set_batch() Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Gilad Ben-Yossef <gilad@benyossef.com> Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Pekka Enberg <penberg@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7344d17660b..03a3f943d98e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4065,7 +4065,7 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, pcp->batch = batch; } -/* a companion to setup_pagelist_highmark() */ +/* a companion to pageset_set_high() */ static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) { pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); @@ -4091,10 +4091,10 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) } /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { unsigned long batch = max(1UL, high / 4); @@ -4110,7 +4110,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_init(pcp); if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, + pageset_set_high(pcp, (zone->managed_pages / percpu_pagelist_fraction)); else @@ -5599,8 +5599,8 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, unsigned long high; high = zone->managed_pages / percpu_pagelist_fraction; for_each_possible_cpu(cpu) - setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); + pageset_set_high(per_cpu_ptr(zone->pageset, cpu), + high); } mutex_unlock(&pcp_batch_high_lock); return 0; -- cgit v1.2.3 From 169f6c1999ca6d0c5e06e8d810817ed3d1ebf017 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:01:41 -0700 Subject: mm/page_alloc: don't re-init pageset in zone_pcp_update() When memory hotplug is triggered, we call pageset_init() on per-cpu-pagesets which both contain pages and are in use, causing both the leakage of those pages and (potentially) bad behaviour if a page is allocated from a pageset while it is being cleared. Avoid this by factoring out pageset_set_high_and_batch() (which contains all needed logic too set a pageset's ->high and ->batch inrespective of system state) from zone_pageset_init() and using the new pageset_set_high_and_batch() instead of zone_pageset_init() in zone_pcp_update(). Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a3f943d98e..fab9506273be 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4104,11 +4104,9 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit zone_pageset_init(struct zone *zone, int cpu) +static void __meminit pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - - pageset_init(pcp); if (percpu_pagelist_fraction) pageset_set_high(pcp, (zone->managed_pages / @@ -4117,6 +4115,14 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu) pageset_set_batch(pcp, zone_batchsize(zone)); } +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + + pageset_init(pcp); + pageset_set_high_and_batch(zone, pcp); +} + static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; @@ -6100,7 +6106,8 @@ void __meminit zone_pcp_update(struct zone *zone) unsigned cpu; mutex_lock(&pcp_batch_high_lock); for_each_possible_cpu(cpu) - zone_pageset_init(zone, cpu); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); mutex_unlock(&pcp_batch_high_lock); } #endif -- cgit v1.2.3 From 75485363ce8552698bfb9970d901f755d5713cca Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:42 -0700 Subject: mm: vmscan: limit the number of pages kswapd reclaims at each priority This series does not fix all the current known problems with reclaim but it addresses one important swapping bug when there is background IO. Changelog since V3 - Drop the slab shrink changes in light of Glaubers series and discussions highlighted that there were a number of potential problems with the patch. (mel) - Rebased to 3.10-rc1 Changelog since V2 - Preserve ratio properly for proportional scanning (kamezawa) Changelog since V1 - Rename ZONE_DIRTY to ZONE_TAIL_LRU_DIRTY (andi) - Reformat comment in shrink_page_list (andi) - Clarify some comments (dhillf) - Rework how the proportional scanning is preserved - Add PageReclaim check before kswapd starts writeback - Reset sc.nr_reclaimed on every full zone scan Kswapd and page reclaim behaviour has been screwy in one way or the other for a long time. Very broadly speaking it worked in the far past because machines were limited in memory so it did not have that many pages to scan and it stalled congestion_wait() frequently to prevent it going completely nuts. In recent times it has behaved very unsatisfactorily with some of the problems compounded by the removal of stall logic and the introduction of transparent hugepage support with high-order reclaims. There are many variations of bugs that are rooted in this area. One example is reports of a large copy operations or backup causing the machine to grind to a halt or applications pushed to swap. Sometimes in low memory situations a large percentage of memory suddenly gets reclaimed. In other cases an application starts and kswapd hits 100% CPU usage for prolonged periods of time and so on. There is now talk of introducing features like an extra free kbytes tunable to work around aspects of the problem instead of trying to deal with it. It's compounded by the problem that it can be very workload and machine specific. This series aims at addressing some of the worst of these problems without attempting to fundmentally alter how page reclaim works. Patches 1-2 limits the number of pages kswapd reclaims while still obeying the anon/file proportion of the LRUs it should be scanning. Patches 3-4 control how and when kswapd raises its scanning priority and deletes the scanning restart logic which is tricky to follow. Patch 5 notes that it is too easy for kswapd to reach priority 0 when scanning and then reclaim the world. Down with that sort of thing. Patch 6 notes that kswapd starts writeback based on scanning priority which is not necessarily related to dirty pages. It will have kswapd writeback pages if a number of unqueued dirty pages have been recently encountered at the tail of the LRU. Patch 7 notes that sometimes kswapd should stall waiting on IO to complete to reduce LRU churn and the likelihood that it'll reclaim young clean pages or push applications to swap. It will cause kswapd to block on IO if it detects that pages being reclaimed under writeback are recycling through the LRU before the IO completes. Patchies 8-9 are cosmetic but balance_pgdat() is easier to follow after they are applied. This was tested using memcached+memcachetest while some background IO was in progress as implemented by the parallel IO tests implement in MM Tests. memcachetest benchmarks how many operations/second memcached can service and it is run multiple times. It starts with no background IO and then re-runs the test with larger amounts of IO in the background to roughly simulate a large copy in progress. The expectation is that the IO should have little or no impact on memcachetest which is running entirely in memory. 3.10.0-rc1 3.10.0-rc1 vanilla lessdisrupt-v4 Ops memcachetest-0M 22155.00 ( 0.00%) 22180.00 ( 0.11%) Ops memcachetest-715M 22720.00 ( 0.00%) 22355.00 ( -1.61%) Ops memcachetest-2385M 3939.00 ( 0.00%) 23450.00 (495.33%) Ops memcachetest-4055M 3628.00 ( 0.00%) 24341.00 (570.92%) Ops io-duration-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops io-duration-715M 12.00 ( 0.00%) 7.00 ( 41.67%) Ops io-duration-2385M 118.00 ( 0.00%) 21.00 ( 82.20%) Ops io-duration-4055M 162.00 ( 0.00%) 36.00 ( 77.78%) Ops swaptotal-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-715M 140134.00 ( 0.00%) 18.00 ( 99.99%) Ops swaptotal-2385M 392438.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-4055M 449037.00 ( 0.00%) 27864.00 ( 93.79%) Ops swapin-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-715M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-2385M 148031.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-4055M 135109.00 ( 0.00%) 0.00 ( 0.00%) Ops minorfaults-0M 1529984.00 ( 0.00%) 1530235.00 ( -0.02%) Ops minorfaults-715M 1794168.00 ( 0.00%) 1613750.00 ( 10.06%) Ops minorfaults-2385M 1739813.00 ( 0.00%) 1609396.00 ( 7.50%) Ops minorfaults-4055M 1754460.00 ( 0.00%) 1614810.00 ( 7.96%) Ops majorfaults-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops majorfaults-715M 185.00 ( 0.00%) 180.00 ( 2.70%) Ops majorfaults-2385M 24472.00 ( 0.00%) 101.00 ( 99.59%) Ops majorfaults-4055M 22302.00 ( 0.00%) 229.00 ( 98.97%) Note how the vanilla kernels performance collapses when there is enough IO taking place in the background. This drop in performance is part of what users complain of when they start backups. Note how the swapin and major fault figures indicate that processes were being pushed to swap prematurely. With the series applied, there is no noticable performance drop and while there is still some swap activity, it's tiny. 20 iterations of this test were run in total and averaged. Every 5 iterations, additional IO was generated in the background using dd to measure how the workload was impacted. The 0M, 715M, 2385M and 4055M subblock refer to the amount of IO going on in the background at each iteration. So memcachetest-2385M is reporting how many transactions/second memcachetest recorded on average over 5 iterations while there was 2385M of IO going on in the ground. There are six blocks of information reported here memcachetest is the transactions/second reported by memcachetest. In the vanilla kernel note that performance drops from around 22K/sec to just under 4K/second when there is 2385M of IO going on in the background. This is one type of performance collapse users complain about if a large cp or backup starts in the background io-duration refers to how long it takes for the background IO to complete. It's showing that with the patched kernel that the IO completes faster while not interfering with the memcache workload swaptotal is the total amount of swap traffic. With the patched kernel, the total amount of swapping is much reduced although it is still not zero. swapin in this case is an indication as to whether we are swap trashing. The closer the swapin/swapout ratio is to 1, the worse the trashing is. Note with the patched kernel that there is no swapin activity indicating that all the pages swapped were really inactive unused pages. minorfaults are just minor faults. An increased number of minor faults can indicate that page reclaim is unmapping the pages but not swapping them out before they are faulted back in. With the patched kernel, there is only a small change in minor faults majorfaults are just major faults in the target workload and a high number can indicate that a workload is being prematurely swapped. With the patched kernel, major faults are much reduced. As there are no swapin's recorded so it's not being swapped. The likely explanation is that that libraries or configuration files used by the workload during startup get paged out by the background IO. Overall with the series applied, there is no noticable performance drop due to background IO and while there is still some swap activity, it's tiny and the lack of swapins imply that the swapped pages were inactive and unused. 3.10.0-rc1 3.10.0-rc1 vanilla lessdisrupt-v4 Page Ins 1234608 101892 Page Outs 12446272 11810468 Swap Ins 283406 0 Swap Outs 698469 27882 Direct pages scanned 0 136480 Kswapd pages scanned 6266537 5369364 Kswapd pages reclaimed 1088989 930832 Direct pages reclaimed 0 120901 Kswapd efficiency 17% 17% Kswapd velocity 5398.371 4635.115 Direct efficiency 100% 88% Direct velocity 0.000 117.817 Percentage direct scans 0% 2% Page writes by reclaim 1655843 4009929 Page writes file 957374 3982047 Page writes anon 698469 27882 Page reclaim immediate 5245 1745 Page rescued immediate 0 0 Slabs scanned 33664 25216 Direct inode steals 0 0 Kswapd inode steals 19409 778 Kswapd skipped wait 0 0 THP fault alloc 35 30 THP collapse alloc 472 401 THP splits 27 22 THP fault fallback 0 0 THP collapse fail 0 1 Compaction stalls 0 4 Compaction success 0 0 Compaction failures 0 4 Page migrate success 0 0 Page migrate failure 0 0 Compaction pages isolated 0 0 Compaction migrate scanned 0 0 Compaction free scanned 0 0 Compaction cost 0 0 NUMA PTE updates 0 0 NUMA hint faults 0 0 NUMA hint local faults 0 0 NUMA pages migrated 0 0 AutoNUMA cost 0 0 Unfortunately, note that there is a small amount of direct reclaim due to kswapd no longer reclaiming the world. ftrace indicates that the direct reclaim stalls are mostly harmless with the vast bulk of the stalls incurred by dd 23 tclsh-3367 38 memcachetest-13733 49 memcachetest-12443 57 tee-3368 1541 dd-13826 1981 dd-12539 A consequence of the direct reclaim for dd is that the processes for the IO workload may show a higher system CPU usage. There is also a risk that kswapd not reclaiming the world may mean that it stays awake balancing zones, does not stall on the appropriate events and continually scans pages it cannot reclaim consuming CPU. This will be visible as continued high CPU usage but in my own tests I only saw a single spike lasting less than a second and I did not observe any problems related to reclaim while running the series on my desktop. This patch: The number of pages kswapd can reclaim is bound by the number of pages it scans which is related to the size of the zone and the scanning priority. In many cases the priority remains low because it's reset every SWAP_CLUSTER_MAX reclaimed pages but in the event kswapd scans a large number of pages it cannot reclaim, it will raise the priority and potentially discard a large percentage of the zone as sc->nr_to_reclaim is ULONG_MAX. The user-visible effect is a reclaim "spike" where a large percentage of memory is suddenly freed. It would be bad enough if this was just unused memory but because of how anon/file pages are balanced it is possible that applications get pushed to swap unnecessarily. This patch limits the number of pages kswapd will reclaim to the high watermark. Reclaim will still overshoot due to it not being a hard limit as shrink_lruvec() will ignore the sc.nr_to_reclaim at DEF_PRIORITY but it prevents kswapd reclaiming the world at higher priorities. The number of pages it reclaims is not adjusted for high-order allocations as kswapd will reclaim excessively if it is to balance zones for high-order allocations. Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Rik van Riel <riel@redhat.com> Reviewed-by: Michal Hocko <mhocko@suse.cz> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index fa6a85378ee4..cdbc0699ea21 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2600,6 +2600,32 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, return pgdat_balanced(pgdat, order, classzone_idx); } +/* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. + */ +static void kswapd_shrink_zone(struct zone *zone, + struct scan_control *sc, + unsigned long lru_pages) +{ + unsigned long nr_slab; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + + /* Reclaim above the high watermark. */ + sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + shrink_zone(zone, sc); + + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). @@ -2627,24 +2653,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, bool pgdat_is_balanced = false; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .may_unmap = 1, .may_swap = 1, - /* - * kswapd doesn't want to be bailed out while reclaim. because - * we want to put equal scanning pressure on each zone. - */ - .nr_to_reclaim = ULONG_MAX, .order = order, .target_mem_cgroup = NULL, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; loop_again: sc.priority = DEF_PRIORITY; sc.nr_reclaimed = 0; @@ -2716,7 +2733,7 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab, testorder; + int testorder; unsigned long balance_gap; if (!populated_zone(zone)) @@ -2764,16 +2781,8 @@ loop_again: if ((buffer_heads_over_limit && is_highmem_idx(i)) || !zone_balanced(zone, testorder, - balance_gap, end_zone)) { - shrink_zone(zone, &sc); - - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - } + balance_gap, end_zone)) + kswapd_shrink_zone(zone, &sc, lru_pages); /* * If we're getting trouble reclaiming, start doing -- cgit v1.2.3 From e82e0561dae9f3ae5a21fc2d3d3ccbe69d90be46 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:44 -0700 Subject: mm: vmscan: obey proportional scanning requirements for kswapd Simplistically, the anon and file LRU lists are scanned proportionally depending on the value of vm.swappiness although there are other factors taken into account by get_scan_count(). The patch "mm: vmscan: Limit the number of pages kswapd reclaims" limits the number of pages kswapd reclaims but it breaks this proportional scanning and may evenly shrink anon/file LRUs regardless of vm.swappiness. This patch preserves the proportional scanning and reclaim. It does mean that kswapd will reclaim more than requested but the number of pages will be related to the high watermark. [mhocko@suse.cz: Correct proportional reclaim for memcg and simplify] [kamezawa.hiroyu@jp.fujitsu.com: Recalculate scan based on target] [hannes@cmpxchg.org: Account for already scanned pages properly] Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Rik van Riel <riel@redhat.com> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index cdbc0699ea21..26ad67f1962c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1822,17 +1822,25 @@ out: static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; + bool scan_adjusted = false; get_scan_count(lruvec, sc, nr); + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); @@ -1842,17 +1850,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) lruvec, sc); } } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + /* - * On large memory systems, scan >> priority can become - * really large. This is fine for the starting priority; - * we want to put equal scanning pressure on each zone. - * However, if the VM has a harder time of freeing pages, - * with multiple processes reclaiming pages, the total - * freeing target can get unreasonably large. + * For global direct reclaim, reclaim only the number of pages + * requested. Less care is taken to scan proportionally as it + * is more important to minimise direct reclaim stall latency + * than it is to properly age the LRU lists. */ - if (nr_reclaimed >= nr_to_reclaim && - sc->priority < DEF_PRIORITY) + if (global_reclaim(sc) && !current_is_kswapd()) break; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs shrink + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; -- cgit v1.2.3 From b8e83b942a16eb73e63406592d3178207a4f07a1 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:45 -0700 Subject: mm: vmscan: flatten kswapd priority loop kswapd stops raising the scanning priority when at least SWAP_CLUSTER_MAX pages have been reclaimed or the pgdat is considered balanced. It then rechecks if it needs to restart at DEF_PRIORITY and whether high-order reclaim needs to be reset. This is not wrong per-se but it is confusing to follow and forcing kswapd to stay at DEF_PRIORITY may require several restarts before it has scanned enough pages to meet the high watermark even at 100% efficiency. This patch irons out the logic a bit by controlling when priority is raised and removing the "goto loop_again". This patch has kswapd raise the scanning priority until it is scanning enough pages that it could meet the high watermark in one shrink of the LRU lists if it is able to reclaim at 100% efficiency. It will not raise the scanning prioirty higher unless it is failing to reclaim any pages. To avoid infinite looping for high-order allocation requests kswapd will not reclaim for high-order allocations when it has reclaimed at least twice the number of pages as the allocation request. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Rik van Riel <riel@redhat.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 86 +++++++++++++++++++++++++++++-------------------------------- 1 file changed, 41 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 26ad67f1962c..1c10ee512215 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2654,8 +2654,12 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, /* * kswapd shrinks the zone by the number of pages required to reach * the high watermark. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim. This is used to determine if the scanning priority needs to be + * raised. */ -static void kswapd_shrink_zone(struct zone *zone, +static bool kswapd_shrink_zone(struct zone *zone, struct scan_control *sc, unsigned long lru_pages) { @@ -2675,6 +2679,8 @@ static void kswapd_shrink_zone(struct zone *zone, if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; + + return sc->nr_scanned >= sc->nr_to_reclaim; } /* @@ -2701,26 +2707,26 @@ static void kswapd_shrink_zone(struct zone *zone, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - bool pgdat_is_balanced = false; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, .may_unmap = 1, .may_swap = 1, + .may_writepage = !laptop_mode, .order = order, .target_mem_cgroup = NULL, }; -loop_again: - sc.priority = DEF_PRIORITY; - sc.nr_reclaimed = 0; - sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); do { unsigned long lru_pages = 0; + bool raise_priority = true; + + sc.nr_reclaimed = 0; /* * Scan in the highmem->dma direction for the highest @@ -2762,10 +2768,8 @@ loop_again: } } - if (i < 0) { - pgdat_is_balanced = true; + if (i < 0) goto out; - } for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; @@ -2832,8 +2836,16 @@ loop_again: if ((buffer_heads_over_limit && is_highmem_idx(i)) || !zone_balanced(zone, testorder, - balance_gap, end_zone)) - kswapd_shrink_zone(zone, &sc, lru_pages); + balance_gap, end_zone)) { + /* + * There should be no need to raise the + * scanning priority if enough pages are + * already being scanned that high + * watermark would be met at 100% efficiency. + */ + if (kswapd_shrink_zone(zone, &sc, lru_pages)) + raise_priority = false; + } /* * If we're getting trouble reclaiming, start doing @@ -2868,46 +2880,29 @@ loop_again: pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (pgdat_balanced(pgdat, order, *classzone_idx)) { - pgdat_is_balanced = true; - break; /* kswapd: all done */ - } - /* - * We do this so kswapd doesn't build up large priorities for - * example when it is freeing in parallel with allocators. It - * matches the direct reclaim path behaviour in terms of impact - * on zone->*_priority. + * Fragmentation may mean that the system cannot be rebalanced + * for high-order allocations in all zones. If twice the + * allocation size has been reclaimed and the zones are still + * not balanced then recheck the watermarks at order-0 to + * prevent kswapd reclaiming excessively. Assume that a + * process requested a high-order can direct reclaim/compact. */ - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) - break; - } while (--sc.priority >= 0); - -out: - if (!pgdat_is_balanced) { - cond_resched(); + if (order && sc.nr_reclaimed >= 2UL << order) + order = sc.order = 0; - try_to_freeze(); + /* Check if kswapd should be suspending */ + if (try_to_freeze() || kthread_should_stop()) + break; /* - * Fragmentation may mean that the system cannot be - * rebalanced for high-order allocations in all zones. - * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, - * it means the zones have been fully scanned and are still - * not balanced. For high-order allocations, there is - * little point trying all over again as kswapd may - * infinite loop. - * - * Instead, recheck all watermarks at order-0 as they - * are the most important. If watermarks are ok, kswapd will go - * back to sleep. High-order users can still perform direct - * reclaim if they wish. + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages */ - if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) - order = sc.order = 0; - - goto loop_again; - } + if (raise_priority || !sc.nr_reclaimed) + sc.priority--; + } while (sc.priority >= 0 && + !pgdat_balanced(pgdat, order, *classzone_idx)); /* * If kswapd was reclaiming at a higher order, it has the option of @@ -2936,6 +2931,7 @@ out: compact_pgdat(pgdat, order); } +out: /* * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. However, -- cgit v1.2.3 From 2ab44f434586b8ccb11f781b4c2730492e6628f5 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:47 -0700 Subject: mm: vmscan: decide whether to compact the pgdat based on reclaim progress In the past, kswapd makes a decision on whether to compact memory after the pgdat was considered balanced. This more or less worked but it is late to make such a decision and does not fit well now that kswapd makes a decision whether to exit the zone scanning loop depending on reclaim progress. This patch will compact a pgdat if at least the requested number of pages were reclaimed from unbalanced zones for a given priority. If any zone is currently balanced, kswapd will not call compaction as it is expected the necessary pages are already available. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Rik van Riel <riel@redhat.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 59 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 30 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1c10ee512215..cd0980393bac 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2661,7 +2661,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, */ static bool kswapd_shrink_zone(struct zone *zone, struct scan_control *sc, - unsigned long lru_pages) + unsigned long lru_pages, + unsigned long *nr_attempted) { unsigned long nr_slab; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2677,6 +2678,9 @@ static bool kswapd_shrink_zone(struct zone *zone, nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); sc->nr_reclaimed += reclaim_state->reclaimed_slab; + /* Account for the number of pages attempted to reclaim */ + *nr_attempted += sc->nr_to_reclaim; + if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; @@ -2724,7 +2728,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, do { unsigned long lru_pages = 0; + unsigned long nr_attempted = 0; bool raise_priority = true; + bool pgdat_needs_compaction = (order > 0); sc.nr_reclaimed = 0; @@ -2774,7 +2780,21 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + lru_pages += zone_reclaimable_pages(zone); + + /* + * If any zone is currently balanced then kswapd will + * not call compaction as it is expected that the + * necessary pages are already available. + */ + if (pgdat_needs_compaction && + zone_watermark_ok(zone, order, + low_wmark_pages(zone), + *classzone_idx, 0)) + pgdat_needs_compaction = false; } /* @@ -2843,7 +2863,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * already being scanned that high * watermark would be met at 100% efficiency. */ - if (kswapd_shrink_zone(zone, &sc, lru_pages)) + if (kswapd_shrink_zone(zone, &sc, lru_pages, + &nr_attempted)) raise_priority = false; } @@ -2895,6 +2916,13 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (try_to_freeze() || kthread_should_stop()) break; + /* + * Compact if necessary and kswapd is reclaiming at least the + * high watermark number of pages as requsted + */ + if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) + compact_pgdat(pgdat, order); + /* * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages @@ -2904,33 +2932,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, } while (sc.priority >= 0 && !pgdat_balanced(pgdat, order, *classzone_idx)); - /* - * If kswapd was reclaiming at a higher order, it has the option of - * sleeping without all zones being balanced. Before it does, it must - * ensure that the watermarks for order-0 on *all* zones are met and - * that the congestion flags are cleared. The congestion flag must - * be cleared as kswapd is the only mechanism that clears the flag - * and it is potentially going to sleep here. - */ - if (order) { - int zones_need_compaction = 1; - - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* Check if the memory needs to be defragmented. */ - if (zone_watermark_ok(zone, order, - low_wmark_pages(zone), *classzone_idx, 0)) - zones_need_compaction = 0; - } - - if (zones_need_compaction) - compact_pgdat(pgdat, order); - } - out: /* * Return the order we were reclaiming at so prepare_kswapd_sleep() -- cgit v1.2.3 From 9aa41348a8d11427feec350b21dcdd4330fd20c4 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:48 -0700 Subject: mm: vmscan: do not allow kswapd to scan at maximum priority Page reclaim at priority 0 will scan the entire LRU as priority 0 is considered to be a near OOM condition. Kswapd can reach priority 0 quite easily if it is encountering a large number of pages it cannot reclaim such as pages under writeback. When this happens, kswapd reclaims very aggressively even though there may be no real risk of allocation failure or OOM. This patch prevents kswapd reaching priority 0 and trying to reclaim the world. Direct reclaimers will still reach priority 0 in the event of an OOM situation. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Rik van Riel <riel@redhat.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index cd0980393bac..1505c573719d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2929,7 +2929,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, */ if (raise_priority || !sc.nr_reclaimed) sc.priority--; - } while (sc.priority >= 0 && + } while (sc.priority >= 1 && !pgdat_balanced(pgdat, order, *classzone_idx)); out: -- cgit v1.2.3 From d43006d503ac921c7df4f94d13c17db6f13c9d26 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:50 -0700 Subject: mm: vmscan: have kswapd writeback pages based on dirty pages encountered, not priority Currently kswapd queues dirty pages for writeback if scanning at an elevated priority but the priority kswapd scans at is not related to the number of unqueued dirty encountered. Since commit "mm: vmscan: Flatten kswapd priority loop", the priority is related to the size of the LRU and the zone watermark which is no indication as to whether kswapd should write pages or not. This patch tracks if an excessive number of unqueued dirty pages are being encountered at the end of the LRU. If so, it indicates that dirty pages are being recycled before flusher threads can clean them and flags the zone so that kswapd will start writing pages until the zone is balanced. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Rik van Riel <riel@redhat.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/mmzone.h | 9 +++++++++ mm/vmscan.c | 31 +++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c76737d836b..2aaf72f7e345 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -495,6 +495,10 @@ typedef enum { ZONE_CONGESTED, /* zone has many dirty pages backed by * a congested BDI */ + ZONE_TAIL_LRU_DIRTY, /* reclaim scanning has recently found + * many dirty file pages at the tail + * of the LRU. + */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -517,6 +521,11 @@ static inline int zone_is_reclaim_congested(const struct zone *zone) return test_bit(ZONE_CONGESTED, &zone->flags); } +static inline int zone_is_reclaim_dirty(const struct zone *zone) +{ + return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags); +} + static inline int zone_is_reclaim_locked(const struct zone *zone) { return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1505c573719d..d6c916d808ba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -676,13 +676,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, enum ttu_flags ttu_flags, - unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, unsigned long *ret_nr_writeback, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; @@ -808,14 +809,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { nr_dirty++; + if (!PageWriteback(page)) + nr_unqueued_dirty++; + /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but do not writeback - * unless under significant pressure. + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. */ if (page_is_file_cache(page) && (!current_is_kswapd() || - sc->priority >= DEF_PRIORITY - 2)) { + !zone_is_reclaim_dirty(zone))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -960,7 +964,7 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); mem_cgroup_uncharge_end(); - *ret_nr_dirty += nr_dirty; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; return nr_reclaimed; } @@ -1373,6 +1377,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, (nr_taken >> (DEF_PRIORITY - sc->priority))) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + /* + * Similarly, if many dirty pages are encountered that are not + * currently being written then flag that kswapd should start + * writing back pages. + */ + if (global_reclaim(sc) && nr_dirty && + nr_dirty >= (nr_taken >> (DEF_PRIORITY - sc->priority))) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, @@ -2769,8 +2782,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, end_zone = i; break; } else { - /* If balanced, clear the congested flag */ + /* + * If balanced, clear the dirty and congested + * flags + */ zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); } } @@ -2888,8 +2905,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * possible there are dirty pages backed by * congested BDIs but as pressure is relieved, * speculatively avoid congestion waits + * or writing pages from kswapd context. */ zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); } /* -- cgit v1.2.3 From 283aba9f9e0e4882bf09bd37a2983379a6fae805 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:51 -0700 Subject: mm: vmscan: block kswapd if it is encountering pages under writeback Historically, kswapd used to congestion_wait() at higher priorities if it was not making forward progress. This made no sense as the failure to make progress could be completely independent of IO. It was later replaced by wait_iff_congested() and removed entirely by commit 258401a6 (mm: don't wait on congested zones in balance_pgdat()) as it was duplicating logic in shrink_inactive_list(). This is problematic. If kswapd encounters many pages under writeback and it continues to scan until it reaches the high watermark then it will quickly skip over the pages under writeback and reclaim clean young pages or push applications out to swap. The use of wait_iff_congested() is not suited to kswapd as it will only stall if the underlying BDI is really congested or a direct reclaimer was unable to write to the underlying BDI. kswapd bypasses the BDI congestion as it sets PF_SWAPWRITE but even if this was taken into account then it would cause direct reclaimers to stall on writeback which is not desirable. This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is encountering too many pages under writeback. If this flag is set and kswapd encounters a PageReclaim page under writeback then it'll assume that the LRU lists are being recycled too quickly before IO can complete and block waiting for some IO to complete. Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Michal Hocko <mhocko@suse.cz> Acked-by: Rik van Riel <riel@redhat.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/mmzone.h | 8 +++++ mm/vmscan.c | 82 ++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 68 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2aaf72f7e345..fce64afba042 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -499,6 +499,9 @@ typedef enum { * many dirty file pages at the tail * of the LRU. */ + ZONE_WRITEBACK, /* reclaim scanning has recently found + * many pages under writeback + */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone) return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags); } +static inline int zone_is_reclaim_writeback(const struct zone *zone) +{ + return test_bit(ZONE_WRITEBACK, &zone->flags); +} + static inline int zone_is_reclaim_locked(const struct zone *zone) { return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); diff --git a/mm/vmscan.c b/mm/vmscan.c index d6c916d808ba..1109de0c35bf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * block for HZ/10 or until some IO completes then clear the + * ZONE_WRITEBACK flag to recheck if the condition exists. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - /* - * memcg doesn't have any dirty pages throttling so we - * could easily OOM just because too many pages are in - * writeback and there is nothing else to reclaim. - * - * Check __GFP_IO, certainly because a loop driver - * thread might enter reclaim, and deadlock if it waits - * on a page for which it is needed to do the write - * (loop masks off __GFP_IO|__GFP_FS for this reason); - * but more thought would probably show more reasons. - * - * Don't require __GFP_FS, since we're not going into - * the FS, just waiting on its writeback completion. - * Worryingly, ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so - * testing may_enter_fs here is liable to OOM on them. - */ - if (global_reclaim(sc) || + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + zone_is_reclaim_writeback(zone)) { + unlock_page(page); + congestion_wait(BLK_RW_ASYNC, HZ/10); + zone_clear_flag(zone, ZONE_WRITEBACK); + goto keep; + + /* Case 2 above */ + } else if (global_reclaim(sc) || !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { /* * This is slightly racy - end_page_writeback() @@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; + goto keep_locked; + + /* Case 3 above */ + } else { + wait_on_page_writeback(page); } - wait_on_page_writeback(page); } if (!force_reclaim) @@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * isolated page is PageWriteback */ if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) + (nr_taken >> (DEF_PRIORITY - sc->priority))) { wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + zone_set_flag(zone, ZONE_WRITEBACK); + } /* * Similarly, if many dirty pages are encountered that are not @@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, * the high watermark. * * Returns true if kswapd scanned at least the requested number of pages to - * reclaim. This is used to determine if the scanning priority needs to be - * raised. + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_zone(struct zone *zone, struct scan_control *sc, @@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone, if (nr_slab == 0 && !zone_reclaimable(zone)) zone->all_unreclaimable = 1; + zone_clear_flag(zone, ZONE_WRITEBACK); + return sc->nr_scanned >= sc->nr_to_reclaim; } -- cgit v1.2.3 From b7ea3c417b6c2e74ca1cb051568f60377908928d Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:53 -0700 Subject: mm: vmscan: check if kswapd should writepage once per pgdat scan Currently kswapd checks if it should start writepage as it shrinks each zone without taking into consideration if the zone is balanced or not. This is not wrong as such but it does not make much sense either. This patch checks once per pgdat scan if kswapd should be writing pages. Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Michal Hocko <mhocko@suse.cz> Acked-by: Rik van Riel <riel@redhat.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 1109de0c35bf..a2d0c6842616 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2852,6 +2852,13 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, pgdat_needs_compaction = false; } + /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. @@ -2923,13 +2930,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, raise_priority = false; } - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - if (zone->all_unreclaimable) { if (end_zone && end_zone == i) end_zone--; -- cgit v1.2.3 From 7c954f6de6b630de30f265a079aad359f159ebe9 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:54 -0700 Subject: mm: vmscan: move logic from balance_pgdat() to kswapd_shrink_zone() balance_pgdat() is very long and some of the logic can and should be internal to kswapd_shrink_zone(). Move it so the flow of balance_pgdat() is marginally easier to follow. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Acked-by: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Tested-by: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 110 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 54 insertions(+), 56 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index a2d0c6842616..4a43c289b23a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2709,18 +2709,53 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, * This is used to determine if the scanning priority needs to be raised. */ static bool kswapd_shrink_zone(struct zone *zone, + int classzone_idx, struct scan_control *sc, unsigned long lru_pages, unsigned long *nr_attempted) { unsigned long nr_slab; + int testorder = sc->order; + unsigned long balance_gap; struct reclaim_state *reclaim_state = current->reclaim_state; struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; + bool lowmem_pressure; /* Reclaim above the high watermark. */ sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + + /* + * Kswapd reclaims only single pages with compaction enabled. Trying + * too hard to reclaim until contiguous free pages have become + * available can hurt performance by evicting too much useful data + * from memory. Do not reclaim more than needed for compaction. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + compaction_suitable(zone, sc->order) != + COMPACT_SKIPPED) + testorder = 0; + + /* + * We put equal pressure on every zone, unless one zone has way too + * many pages free already. The "too many pages" is defined as the + * high wmark plus a "gap" where the gap is either the low + * watermark or 1% of the zone, whichever is smaller. + */ + balance_gap = min(low_wmark_pages(zone), + (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + + /* + * If there is no low memory pressure or the zone is balanced then no + * reclaim is necessary + */ + lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); + if (!lowmem_pressure && zone_balanced(zone, testorder, + balance_gap, classzone_idx)) + return true; + shrink_zone(zone, sc); reclaim_state->reclaimed_slab = 0; @@ -2735,6 +2770,18 @@ static bool kswapd_shrink_zone(struct zone *zone, zone_clear_flag(zone, ZONE_WRITEBACK); + /* + * If a zone reaches its high watermark, consider it to be no longer + * congested. It's possible there are dirty pages backed by congested + * BDIs but as pressure is relieved, speculatively avoid congestion + * waits. + */ + if (!zone->all_unreclaimable && + zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } + return sc->nr_scanned >= sc->nr_to_reclaim; } @@ -2870,8 +2917,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int testorder; - unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2892,61 +2937,14 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, sc.nr_reclaimed += nr_soft_reclaimed; /* - * We put equal pressure on every zone, unless - * one zone has way too many pages free - * already. The "too many pages" is defined - * as the high wmark plus a "gap" where the - * gap is either the low watermark or 1% - * of the zone, whichever is smaller. - */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + - KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - /* - * Kswapd reclaims only single pages with compaction - * enabled. Trying too hard to reclaim until contiguous - * free pages have become available can hurt performance - * by evicting too much useful data from memory. - * Do not reclaim more than needed for compaction. + * There should be no need to raise the scanning + * priority if enough pages are already being scanned + * that that high watermark would be met at 100% + * efficiency. */ - testorder = order; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) != - COMPACT_SKIPPED) - testorder = 0; - - if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_balanced(zone, testorder, - balance_gap, end_zone)) { - /* - * There should be no need to raise the - * scanning priority if enough pages are - * already being scanned that high - * watermark would be met at 100% efficiency. - */ - if (kswapd_shrink_zone(zone, &sc, lru_pages, - &nr_attempted)) - raise_priority = false; - } - - if (zone->all_unreclaimable) { - if (end_zone && end_zone == i) - end_zone--; - continue; - } - - if (zone_balanced(zone, testorder, 0, end_zone)) - /* - * If a zone reaches its high watermark, - * consider it to be no longer congested. It's - * possible there are dirty pages backed by - * congested BDIs but as pressure is relieved, - * speculatively avoid congestion waits - * or writing pages from kswapd context. - */ - zone_clear_flag(zone, ZONE_CONGESTED); - zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + if (kswapd_shrink_zone(zone, end_zone, &sc, + lru_pages, &nr_attempted)) + raise_priority = false; } /* -- cgit v1.2.3 From e2be15f6c3eecedfbe1550cca8d72c5057abbbd2 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:57 -0700 Subject: mm: vmscan: stall page reclaim and writeback pages based on dirty/writepage pages encountered Further testing of the "Reduce system disruption due to kswapd" discovered a few problems. First and foremost, it's possible for pages under writeback to be freed which will lead to badness. Second, as pages were not being swapped the file LRU was being scanned faster and clean file pages were being reclaimed. In some cases this results in increased read IO to re-read data from disk. Third, more pages were being written from kswapd context which can adversly affect IO performance. Lastly, it was observed that PageDirty pages are not necessarily dirty on all filesystems (buffers can be clean while PageDirty is set and ->writepage generates no IO) and not all filesystems set PageWriteback when the page is being written (e.g. ext3). This disconnect confuses the reclaim stalling logic. This follow-up series is aimed at these problems. The tests were based on three kernels vanilla: kernel 3.9 as that is what the current mmotm uses as a baseline mmotm-20130522 is mmotm as of 22nd May with "Reduce system disruption due to kswapd" applied on top as per what should be in Andrew's tree right now lessdisrupt-v7r10 is this follow-up series on top of the mmotm kernel The first test used memcached+memcachetest while some background IO was in progress as implemented by the parallel IO tests implement in MM Tests. memcachetest benchmarks how many operations/second memcached can service. It starts with no background IO on a freshly created ext4 filesystem and then re-runs the test with larger amounts of IO in the background to roughly simulate a large copy in progress. The expectation is that the IO should have little or no impact on memcachetest which is running entirely in memory. parallelio 3.9.0 3.9.0 3.9.0 vanilla mm1-mmotm-20130522 mm1-lessdisrupt-v7r10 Ops memcachetest-0M 23117.00 ( 0.00%) 22780.00 ( -1.46%) 22763.00 ( -1.53%) Ops memcachetest-715M 23774.00 ( 0.00%) 23299.00 ( -2.00%) 22934.00 ( -3.53%) Ops memcachetest-2385M 4208.00 ( 0.00%) 24154.00 (474.00%) 23765.00 (464.76%) Ops memcachetest-4055M 4104.00 ( 0.00%) 25130.00 (512.33%) 24614.00 (499.76%) Ops io-duration-0M 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops io-duration-715M 12.00 ( 0.00%) 7.00 ( 41.67%) 6.00 ( 50.00%) Ops io-duration-2385M 116.00 ( 0.00%) 21.00 ( 81.90%) 21.00 ( 81.90%) Ops io-duration-4055M 160.00 ( 0.00%) 36.00 ( 77.50%) 35.00 ( 78.12%) Ops swaptotal-0M 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-715M 140138.00 ( 0.00%) 18.00 ( 99.99%) 18.00 ( 99.99%) Ops swaptotal-2385M 385682.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-4055M 418029.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-0M 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-715M 144.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-2385M 134227.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-4055M 125618.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops minorfaults-0M 1536429.00 ( 0.00%) 1531632.00 ( 0.31%) 1533541.00 ( 0.19%) Ops minorfaults-715M 1786996.00 ( 0.00%) 1612148.00 ( 9.78%) 1608832.00 ( 9.97%) Ops minorfaults-2385M 1757952.00 ( 0.00%) 1614874.00 ( 8.14%) 1613541.00 ( 8.21%) Ops minorfaults-4055M 1774460.00 ( 0.00%) 1633400.00 ( 7.95%) 1630881.00 ( 8.09%) Ops majorfaults-0M 1.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops majorfaults-715M 184.00 ( 0.00%) 167.00 ( 9.24%) 166.00 ( 9.78%) Ops majorfaults-2385M 24444.00 ( 0.00%) 155.00 ( 99.37%) 93.00 ( 99.62%) Ops majorfaults-4055M 21357.00 ( 0.00%) 147.00 ( 99.31%) 134.00 ( 99.37%) memcachetest is the transactions/second reported by memcachetest. In the vanilla kernel note that performance drops from around 23K/sec to just over 4K/second when there is 2385M of IO going on in the background. With current mmotm, there is no collapse in performance and with this follow-up series there is little change. swaptotal is the total amount of swap traffic. With mmotm and the follow-up series, the total amount of swapping is much reduced. 3.9.0 3.9.0 3.9.0 vanillamm1-mmotm-20130522mm1-lessdisrupt-v7r10 Minor Faults 11160152 10706748 10622316 Major Faults 46305 755 678 Swap Ins 260249 0 0 Swap Outs 683860 18 18 Direct pages scanned 0 678 2520 Kswapd pages scanned 6046108 8814900 1639279 Kswapd pages reclaimed 1081954 1172267 1094635 Direct pages reclaimed 0 566 2304 Kswapd efficiency 17% 13% 66% Kswapd velocity 5217.560 7618.953 1414.879 Direct efficiency 100% 83% 91% Direct velocity 0.000 0.586 2.175 Percentage direct scans 0% 0% 0% Zone normal velocity 5105.086 6824.681 671.158 Zone dma32 velocity 112.473 794.858 745.896 Zone dma velocity 0.000 0.000 0.000 Page writes by reclaim 1929612.000 6861768.000 32821.000 Page writes file 1245752 6861750 32803 Page writes anon 683860 18 18 Page reclaim immediate 7484 40 239 Sector Reads 1130320 93996 86900 Sector Writes 13508052 10823500 11804436 Page rescued immediate 0 0 0 Slabs scanned 33536 27136 18560 Direct inode steals 0 0 0 Kswapd inode steals 8641 1035 0 Kswapd skipped wait 0 0 0 THP fault alloc 8 37 33 THP collapse alloc 508 552 515 THP splits 24 1 1 THP fault fallback 0 0 0 THP collapse fail 0 0 0 There are a number of observations to make here 1. Swap outs are almost eliminated. Swap ins are 0 indicating that the pages swapped were really unused anonymous pages. Related to that, major faults are much reduced. 2. kswapd efficiency was impacted by the initial series but with these follow-up patches, the efficiency is now at 66% indicating that far fewer pages were skipped during scanning due to dirty or writeback pages. 3. kswapd velocity is reduced indicating that fewer pages are being scanned with the follow-up series as kswapd now stalls when the tail of the LRU queue is full of unqueued dirty pages. The stall gives flushers a chance to catch-up so kswapd can reclaim clean pages when it wakes 4. In light of Zlatko's recent reports about zone scanning imbalances, mmtests now reports scanning velocity on a per-zone basis. With mainline, you can see that the scanning activity is dominated by the Normal zone with over 45 times more scanning in Normal than the DMA32 zone. With the series currently in mmotm, the ratio is slightly better but it is still the case that the bulk of scanning is in the highest zone. With this follow-up series, the ratio of scanning between the Normal and DMA32 zone is roughly equal. 5. As Dave Chinner observed, the current patches in mmotm increased the number of pages written from kswapd context which is expected to adversly impact IO performance. With the follow-up patches, far fewer pages are written from kswapd context than the mainline kernel 6. With the series in mmotm, fewer inodes were reclaimed by kswapd. With the follow-up series, there is less slab shrinking activity and no inodes were reclaimed. 7. Note that "Sectors Read" is drastically reduced implying that the source data being used for the IO is not being aggressively discarded due to page reclaim skipping over dirty pages and reclaiming clean pages. Note that the reducion in reads could also be due to inode data not being re-read from disk after a slab shrink. 3.9.0 3.9.0 3.9.0 vanillamm1-mmotm-20130522mm1-lessdisrupt-v7r10 Mean sda-avgqz 166.99 32.09 33.44 Mean sda-await 853.64 192.76 185.43 Mean sda-r_await 6.31 9.24 5.97 Mean sda-w_await 2992.81 202.65 192.43 Max sda-avgqz 1409.91 718.75 698.98 Max sda-await 6665.74 3538.00 3124.23 Max sda-r_await 58.96 111.95 58.00 Max sda-w_await 28458.94 3977.29 3148.61 In light of the changes in writes from reclaim context, the number of reads and Dave Chinner's concerns about IO performance I took a closer look at the IO stats for the test disk. Few observations 1. The average queue size is reduced by the initial series and roughly the same with this follow up. 2. Average wait times for writes are reduced and as the IO is completing faster it at least implies that the gain is because flushers are writing the files efficiently instead of page reclaim getting in the way. 3. The reduction in maximum write latency is staggering. 28 seconds down to 3 seconds. Jan Kara asked how NFS is affected by all of this. Unstable pages can be taken into account as one of the patches in the series shows but it is still the case that filesystems with unusual handling of dirty or writeback could still be treated better. Tests like postmark, fsmark and largedd showed up nothing useful. On my test setup, pages are simply not being written back from reclaim context with or without the patches and there are no changes in performance. My test setup probably is just not strong enough network-wise to be really interesting. I ran a longer-lived memcached test with IO going to NFS instead of a local disk parallelio 3.9.0 3.9.0 3.9.0 vanilla mm1-mmotm-20130522 mm1-lessdisrupt-v7r10 Ops memcachetest-0M 23323.00 ( 0.00%) 23241.00 ( -0.35%) 23321.00 ( -0.01%) Ops memcachetest-715M 25526.00 ( 0.00%) 24763.00 ( -2.99%) 23242.00 ( -8.95%) Ops memcachetest-2385M 8814.00 ( 0.00%) 26924.00 (205.47%) 23521.00 (166.86%) Ops memcachetest-4055M 5835.00 ( 0.00%) 26827.00 (359.76%) 25560.00 (338.05%) Ops io-duration-0M 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops io-duration-715M 65.00 ( 0.00%) 71.00 ( -9.23%) 11.00 ( 83.08%) Ops io-duration-2385M 129.00 ( 0.00%) 94.00 ( 27.13%) 53.00 ( 58.91%) Ops io-duration-4055M 301.00 ( 0.00%) 100.00 ( 66.78%) 108.00 ( 64.12%) Ops swaptotal-0M 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-715M 14394.00 ( 0.00%) 949.00 ( 93.41%) 63.00 ( 99.56%) Ops swaptotal-2385M 401483.00 ( 0.00%) 24437.00 ( 93.91%) 30118.00 ( 92.50%) Ops swaptotal-4055M 554123.00 ( 0.00%) 35688.00 ( 93.56%) 63082.00 ( 88.62%) Ops swapin-0M 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-715M 4522.00 ( 0.00%) 560.00 ( 87.62%) 63.00 ( 98.61%) Ops swapin-2385M 169861.00 ( 0.00%) 5026.00 ( 97.04%) 13917.00 ( 91.81%) Ops swapin-4055M 192374.00 ( 0.00%) 10056.00 ( 94.77%) 25729.00 ( 86.63%) Ops minorfaults-0M 1445969.00 ( 0.00%) 1520878.00 ( -5.18%) 1454024.00 ( -0.56%) Ops minorfaults-715M 1557288.00 ( 0.00%) 1528482.00 ( 1.85%) 1535776.00 ( 1.38%) Ops minorfaults-2385M 1692896.00 ( 0.00%) 1570523.00 ( 7.23%) 1559622.00 ( 7.87%) Ops minorfaults-4055M 1654985.00 ( 0.00%) 1581456.00 ( 4.44%) 1596713.00 ( 3.52%) Ops majorfaults-0M 0.00 ( 0.00%) 1.00 (-99.00%) 0.00 ( 0.00%) Ops majorfaults-715M 763.00 ( 0.00%) 265.00 ( 65.27%) 75.00 ( 90.17%) Ops majorfaults-2385M 23861.00 ( 0.00%) 894.00 ( 96.25%) 2189.00 ( 90.83%) Ops majorfaults-4055M 27210.00 ( 0.00%) 1569.00 ( 94.23%) 4088.00 ( 84.98%) 1. Performance does not collapse due to IO which is good. IO is also completing faster. Note with mmotm, IO completes in a third of the time and faster again with this series applied 2. Swapping is reduced, although not eliminated. The figures for the follow-up look bad but it does vary a bit as the stalling is not perfect for nfs or filesystems like ext3 with unusual handling of dirty and writeback pages 3. There are swapins, particularly with larger amounts of IO indicating that active pages are being reclaimed. However, the number of much reduced. 3.9.0 3.9.0 3.9.0 vanillamm1-mmotm-20130522mm1-lessdisrupt-v7r10 Minor Faults 36339175 35025445 35219699 Major Faults 310964 27108 51887 Swap Ins 2176399 173069 333316 Swap Outs 3344050 357228 504824 Direct pages scanned 8972 77283 43242 Kswapd pages scanned 20899983 8939566 14772851 Kswapd pages reclaimed 6193156 5172605 5231026 Direct pages reclaimed 8450 73802 39514 Kswapd efficiency 29% 57% 35% Kswapd velocity 3929.743 1847.499 3058.840 Direct efficiency 94% 95% 91% Direct velocity 1.687 15.972 8.954 Percentage direct scans 0% 0% 0% Zone normal velocity 3721.907 939.103 2185.142 Zone dma32 velocity 209.522 924.368 882.651 Zone dma velocity 0.000 0.000 0.000 Page writes by reclaim 4082185.000 526319.000 537114.000 Page writes file 738135 169091 32290 Page writes anon 3344050 357228 504824 Page reclaim immediate 9524 170 5595843 Sector Reads 8909900 861192 1483680 Sector Writes 13428980 1488744 2076800 Page rescued immediate 0 0 0 Slabs scanned 38016 31744 28672 Direct inode steals 0 0 0 Kswapd inode steals 424 0 0 Kswapd skipped wait 0 0 0 THP fault alloc 14 15 119 THP collapse alloc 1767 1569 1618 THP splits 30 29 25 THP fault fallback 0 0 0 THP collapse fail 8 5 0 Compaction stalls 17 41 100 Compaction success 7 31 95 Compaction failures 10 10 5 Page migrate success 7083 22157 62217 Page migrate failure 0 0 0 Compaction pages isolated 14847 48758 135830 Compaction migrate scanned 18328 48398 138929 Compaction free scanned 2000255 355827 1720269 Compaction cost 7 24 68 I guess the main takeaway again is the much reduced page writes from reclaim context and reduced reads. 3.9.0 3.9.0 3.9.0 vanillamm1-mmotm-20130522mm1-lessdisrupt-v7r10 Mean sda-avgqz 23.58 0.35 0.44 Mean sda-await 133.47 15.72 15.46 Mean sda-r_await 4.72 4.69 3.95 Mean sda-w_await 507.69 28.40 33.68 Max sda-avgqz 680.60 12.25 23.14 Max sda-await 3958.89 221.83 286.22 Max sda-r_await 63.86 61.23 67.29 Max sda-w_await 11710.38 883.57 1767.28 And as before, write wait times are much reduced. This patch: The patch "mm: vmscan: Have kswapd writeback pages based on dirty pages encountered, not priority" decides whether to writeback pages from reclaim context based on the number of dirty pages encountered. This situation is flagged too easily and flushers are not given the chance to catch up resulting in more pages being written from reclaim context and potentially impacting IO performance. The check for PageWriteback is also misplaced as it happens within a PageDirty check which is nonsense as the dirty may have been cleared for IO. The accounting is updated very late and pages that are already under writeback, were reactivated, could not unmapped or could not be released are all missed. Similarly, a page is considered congested for reasons other than being congested and pages that cannot be written out in the correct context are skipped. Finally, it considers stalling and writing back filesystem pages due to encountering dirty anonymous pages at the tail of the LRU which is dumb. This patch causes kswapd to begin writing filesystem pages from reclaim context only if page reclaim found that all filesystem pages at the tail of the LRU were unqueued dirty pages. Before it starts writing filesystem pages, it will stall to give flushers a chance to catch up. The decision on whether wait_iff_congested is also now determined by dirty filesystem pages only. Congested pages are based on whether the underlying BDI is congested regardless of the context of the reclaiming process. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Cc: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4a43c289b23a..999ef0b9399a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -669,6 +669,25 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + /* + * Anonymous pages are not handled by flushers and must be written + * from reclaim context. Do not stall reclaim based on them + */ + if (!page_is_file_cache(page)) { + *dirty = false; + *writeback = false; + return; + } + + /* By default assume that the page flags are accurate */ + *dirty = PageDirty(page); + *writeback = PageWriteback(page); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -697,6 +716,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct page *page; int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; + bool dirty, writeback; cond_resched(); @@ -724,6 +744,24 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + nr_dirty++; + + if (dirty && !writeback) + nr_unqueued_dirty++; + + /* Treat this page as congested if underlying BDI is */ + mapping = page_mapping(page); + if (mapping && bdi_write_congested(mapping->backing_dev_info)) + nr_congested++; + /* * If a page at the tail of the LRU is under writeback, there * are three cases to consider. @@ -819,9 +857,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!add_to_swap(page, page_list)) goto activate_locked; may_enter_fs = 1; - } - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } /* * The page is mapped into the page tables of one or more @@ -841,11 +880,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - nr_dirty++; - - if (!PageWriteback(page)) - nr_unqueued_dirty++; - /* * Only kswapd can writeback filesystem pages to * avoid risk of stack overflow but only writeback @@ -876,7 +910,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Page is dirty, try to write it out here */ switch (pageout(page, mapping, sc)) { case PAGE_KEEP: - nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -1318,7 +1351,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; - unsigned long nr_dirty = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_writeback = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); @@ -1361,7 +1394,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, - &nr_dirty, &nr_writeback, false); + &nr_unqueued_dirty, &nr_writeback, false); spin_lock_irq(&zone->lru_lock); @@ -1416,11 +1449,13 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, /* * Similarly, if many dirty pages are encountered that are not * currently being written then flag that kswapd should start - * writing back pages. + * writing back pages and stall to give a chance for flushers + * to catch up. */ - if (global_reclaim(sc) && nr_dirty && - nr_dirty >= (nr_taken >> (DEF_PRIORITY - sc->priority))) + if (global_reclaim(sc) && nr_unqueued_dirty == nr_taken) { + congestion_wait(BLK_RW_ASYNC, HZ/10); zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + } trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), -- cgit v1.2.3 From b1a6f21e3b2315d46ae8af88a8f4eb8ea2763107 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:01:58 -0700 Subject: mm: vmscan: stall page reclaim after a list of pages have been processed Commit "mm: vmscan: Block kswapd if it is encountering pages under writeback" blocks page reclaim if it encounters pages under writeback marked for immediate reclaim. It blocks while pages are still isolated from the LRU which is unnecessary. This patch defers the blocking until after the isolated pages have been processed and tidies up some of the comments. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Cc: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 999ef0b9399a..5b1a79c8f0cb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -697,6 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, enum ttu_flags ttu_flags, unsigned long *ret_nr_unqueued_dirty, unsigned long *ret_nr_writeback, + unsigned long *ret_nr_immediate, bool force_reclaim) { LIST_HEAD(ret_pages); @@ -707,6 +708,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; cond_resched(); @@ -773,8 +775,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * IO can complete. Waiting on the page itself risks an * indefinite stall if it is impossible to writeback the * page due to IO error or disconnected storage so instead - * block for HZ/10 or until some IO completes then clear the - * ZONE_WRITEBACK flag to recheck if the condition exists. + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. * * 2) Global reclaim encounters a page, memcg encounters a * page that is not marked for immediate reclaim or @@ -804,10 +806,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (current_is_kswapd() && PageReclaim(page) && zone_is_reclaim_writeback(zone)) { - unlock_page(page); - congestion_wait(BLK_RW_ASYNC, HZ/10); - zone_clear_flag(zone, ZONE_WRITEBACK); - goto keep; + nr_immediate++; + goto keep_locked; /* Case 2 above */ } else if (global_reclaim(sc) || @@ -1033,6 +1033,7 @@ keep: mem_cgroup_uncharge_end(); *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; + *ret_nr_immediate += nr_immediate; return nr_reclaimed; } @@ -1044,7 +1045,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2; + unsigned long ret, dummy1, dummy2, dummy3; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1057,7 +1058,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, ret = shrink_page_list(&clean_pages, zone, &sc, TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, true); + &dummy1, &dummy2, &dummy3, true); list_splice(&clean_pages, page_list); __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; @@ -1353,6 +1354,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_taken; unsigned long nr_unqueued_dirty = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct zone *zone = lruvec_zone(lruvec); @@ -1394,7 +1396,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, - &nr_unqueued_dirty, &nr_writeback, false); + &nr_unqueued_dirty, &nr_writeback, &nr_immediate, + false); spin_lock_irq(&zone->lru_lock); @@ -1447,14 +1450,28 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, } /* - * Similarly, if many dirty pages are encountered that are not - * currently being written then flag that kswapd should start - * writing back pages and stall to give a chance for flushers - * to catch up. + * memcg will stall in page writeback so only consider forcibly + * stalling for global reclaim */ - if (global_reclaim(sc) && nr_unqueued_dirty == nr_taken) { - congestion_wait(BLK_RW_ASYNC, HZ/10); - zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + if (global_reclaim(sc)) { + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not keeping up. In this case, flag + * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing + * pages from reclaim context. It will forcibly stall in the + * next check. + */ + if (nr_unqueued_dirty == nr_taken) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + + /* + * In addition, if kswapd scans pages marked marked for + * immediate reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU faster than + * they are written so also forcibly stall. + */ + if (nr_unqueued_dirty == nr_taken || nr_immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); } trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, -- cgit v1.2.3 From f7ab8db791a8692f5ed4201dbae25722c1732a8d Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:00 -0700 Subject: mm: vmscan: set zone flags before blocking In shrink_page_list a decision may be made to stall and flag a zone as ZONE_WRITEBACK so that if a large number of unqueued dirty pages are encountered later then the reclaimer will stall. Set ZONE_WRITEBACK before potentially going to sleep so it is noticed sooner. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Cc: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5b1a79c8f0cb..5f80d018bffa 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1445,8 +1445,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, */ if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY - sc->priority))) { - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); zone_set_flag(zone, ZONE_WRITEBACK); + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); } /* -- cgit v1.2.3 From 8e950282804558e4605401b9c79c1d34f0d73507 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:02 -0700 Subject: mm: vmscan: move direct reclaim wait_iff_congested into shrink_list shrink_inactive_list makes decisions on whether to stall based on the number of dirty pages encountered. The wait_iff_congested() call in shrink_page_list does no such thing and it's arbitrary. This patch moves the decision on whether to set ZONE_CONGESTED and the wait_iff_congested call into shrink_page_list. This keeps all the decisions on whether to stall or not in the one place. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Cc: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 62 ++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5f80d018bffa..4898daf074cf 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -695,7 +695,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, struct scan_control *sc, enum ttu_flags ttu_flags, + unsigned long *ret_nr_dirty, unsigned long *ret_nr_unqueued_dirty, + unsigned long *ret_nr_congested, unsigned long *ret_nr_writeback, unsigned long *ret_nr_immediate, bool force_reclaim) @@ -1017,20 +1019,13 @@ keep: VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } - /* - * Tag a zone as congested if all the dirty pages encountered were - * backed by a congested BDI. In this case, reclaimers should just - * back off and wait for congestion to clear because further reclaim - * will encounter the same problem - */ - if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) - zone_set_flag(zone, ZONE_CONGESTED); - free_hot_cold_page_list(&free_pages, 1); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); mem_cgroup_uncharge_end(); + *ret_nr_dirty += nr_dirty; + *ret_nr_congested += nr_congested; *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; *ret_nr_immediate += nr_immediate; @@ -1045,7 +1040,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2, dummy3; + unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; struct page *page, *next; LIST_HEAD(clean_pages); @@ -1057,8 +1052,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, &dummy3, true); + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; @@ -1352,6 +1347,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_scanned; unsigned long nr_reclaimed = 0; unsigned long nr_taken; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_unqueued_dirty = 0; unsigned long nr_writeback = 0; unsigned long nr_immediate = 0; @@ -1396,8 +1393,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, - &nr_unqueued_dirty, &nr_writeback, &nr_immediate, - false); + &nr_dirty, &nr_unqueued_dirty, &nr_congested, + &nr_writeback, &nr_immediate, + false); spin_lock_irq(&zone->lru_lock); @@ -1431,7 +1429,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * same way balance_dirty_pages() manages. * * This scales the number of dirty pages that must be under writeback - * before throttling depending on priority. It is a simple backoff + * before a zone gets flagged ZONE_WRITEBACK. It is a simple backoff * function that has the most effect in the range DEF_PRIORITY to * DEF_PRIORITY-2 which is the priority reclaim is considered to be * in trouble and reclaim is considered to be in trouble. @@ -1442,18 +1440,27 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * ... * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any * isolated page is PageWriteback + * + * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * of pages under pages flagged for immediate reclaim and stall if any + * are encountered in the nr_immediate check below. */ if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) { + (nr_taken >> (DEF_PRIORITY - sc->priority))) zone_set_flag(zone, ZONE_WRITEBACK); - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); - } /* * memcg will stall in page writeback so only consider forcibly * stalling for global reclaim */ if (global_reclaim(sc)) { + /* + * Tag a zone as congested if all the dirty pages scanned were + * backed by a congested BDI and wait_iff_congested will stall. + */ + if (nr_dirty && nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not keeping up. In this case, flag @@ -1474,6 +1481,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, congestion_wait(BLK_RW_ASYNC, HZ/10); } + /* + * Stall direct reclaim for IO completions if underlying BDIs or zone + * is congested. Allow kswapd to continue until it starts encountering + * unqueued dirty pages or cycling through the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd()) + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, @@ -2374,17 +2389,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - - /* Take a nap, wait for some writeback to complete */ - if (!sc->hibernation_mode && sc->nr_scanned && - sc->priority < DEF_PRIORITY - 2) { - struct zone *preferred_zone; - - first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - &cpuset_current_mems_allowed, - &preferred_zone); - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); - } } while (--sc->priority >= 0); out: -- cgit v1.2.3 From d04e8acd03e5c3421ef18e3da7bc88d56179ca42 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:03 -0700 Subject: mm: vmscan: treat pages marked for immediate reclaim as zone congestion Currently a zone will only be marked congested if the underlying BDI is congested but if dirty pages are spread across zones it is possible that an individual zone is full of dirty pages without being congested. The impact is that zone gets scanned very quickly potentially reclaiming really clean pages. This patch treats pages marked for immediate reclaim as congested for the purposes of marking a zone ZONE_CONGESTED and stalling in wait_iff_congested. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Cc: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmscan.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4898daf074cf..bf4778479e3a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -761,9 +761,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (dirty && !writeback) nr_unqueued_dirty++; - /* Treat this page as congested if underlying BDI is */ + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ mapping = page_mapping(page); - if (mapping && bdi_write_congested(mapping->backing_dev_info)) + if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || + (writeback && PageReclaim(page))) nr_congested++; /* -- cgit v1.2.3 From b45972265f823ed01eae0867a176320071665787 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:05 -0700 Subject: mm: vmscan: take page buffers dirty and locked state into account Page reclaim keeps track of dirty and under writeback pages and uses it to determine if wait_iff_congested() should stall or if kswapd should begin writing back pages. This fails to account for buffer pages that can be under writeback but not PageWriteback which is the case for filesystems like ext3 ordered mode. Furthermore, PageDirty buffer pages can have all the buffers clean and writepage does no IO so it should not be accounted as congested. This patch adds an address_space operation that filesystems may optionally use to check if a page is really dirty or really under writeback. An implementation is provided for for buffer_heads is added and used for block operations and ext3 in ordered mode. By default the page flags are obeyed. Credit goes to Jan Kara for identifying that the page flags alone are not sufficient for ext3 and sanity checking a number of ideas on how the problem could be addressed. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.cz> Cc: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Jiri Slaby <jslaby@suse.cz> Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Cc: Zlatko Calusic <zcalusic@bitsync.net> Cc: dormando <dormando@rydia.net> Cc: Trond Myklebust <trond.myklebust@fys.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/block_dev.c | 1 + fs/buffer.c | 34 ++++++++++++++++++++++++++++++++++ fs/ext3/inode.c | 1 + include/linux/buffer_head.h | 3 +++ include/linux/fs.h | 1 + mm/vmscan.c | 10 ++++++++++ 6 files changed, 50 insertions(+) (limited to 'mm') diff --git a/fs/block_dev.c b/fs/block_dev.c index 431b6a04ebfd..bb43ce081d6e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1562,6 +1562,7 @@ static const struct address_space_operations def_blk_aops = { .writepages = generic_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, + .is_dirty_writeback = buffer_check_dirty_writeback, }; const struct file_operations def_blk_fops = { diff --git a/fs/buffer.c b/fs/buffer.c index f93392e2df12..4d7433534f5c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -82,6 +82,40 @@ void unlock_buffer(struct buffer_head *bh) } EXPORT_SYMBOL(unlock_buffer); +/* + * Returns if the page has dirty or writeback buffers. If all the buffers + * are unlocked and clean then the PageDirty information is stale. If + * any of the pages are locked, it is assumed they are locked for IO. + */ +void buffer_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct buffer_head *head, *bh; + *dirty = false; + *writeback = false; + + BUG_ON(!PageLocked(page)); + + if (!page_has_buffers(page)) + return; + + if (PageWriteback(page)) + *writeback = true; + + head = page_buffers(page); + bh = head; + do { + if (buffer_locked(bh)) + *writeback = true; + + if (buffer_dirty(bh)) + *dirty = true; + + bh = bh->b_this_page; + } while (bh != head); +} +EXPORT_SYMBOL(buffer_check_dirty_writeback); + /* * Block until a buffer comes unlocked. This doesn't stop it * from becoming locked again - you have to lock it yourself diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index f67668f724ba..2bd85486b879 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1985,6 +1985,7 @@ static const struct address_space_operations ext3_ordered_aops = { .direct_IO = ext3_direct_IO, .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, + .is_dirty_writeback = buffer_check_dirty_writeback, .error_remove_page = generic_error_remove_page, }; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f5a3b838ddb0..91fa9a94ae92 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -139,6 +139,9 @@ BUFFER_FNS(Prio, prio) }) #define page_has_buffers(page) PagePrivate(page) +void buffer_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback); + /* * Declarations */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 2b82c8041490..99be011e00de 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -380,6 +380,7 @@ struct address_space_operations { int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); + void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page)(struct address_space *, struct page *); /* swapfile support */ diff --git a/mm/vmscan.c b/mm/vmscan.c index bf4778479e3a..c85794399848 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -673,6 +673,8 @@ static enum page_references page_check_references(struct page *page, static void page_check_dirty_writeback(struct page *page, bool *dirty, bool *writeback) { + struct address_space *mapping; + /* * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them @@ -686,6 +688,14 @@ static void page_check_dirty_writeback(struct page *page, /* By default assume that the page flags are accurate */ *dirty = PageDirty(page); *writeback = PageWriteback(page); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!page_has_private(page)) + return; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } /* -- cgit v1.2.3 From aa47228a18e6d49369df877463095b899aff495f Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:02:10 -0700 Subject: memory_hotplug: use pgdat_resize_lock() in online_pages() mmzone.h documents node_size_lock (which pgdat_resize_lock() locks) as follows: * Must be held any time you expect node_start_pfn, node_present_pages * or node_spanned_pages stay constant. [...] So actually hold it when we update node_present_pages in online_pages(). Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memory_hotplug.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1ad92b46753e..527c51084bb8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -918,6 +918,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { + unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; @@ -996,7 +997,11 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone->managed_pages += onlined_pages; zone->present_pages += onlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages += onlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + if (onlined_pages) { node_states_set_node(zone_to_nid(zone), &arg); if (need_zonelists_rebuild) -- cgit v1.2.3 From d702909f0aa14fe678d74d7f974aa66bfb211d0b Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:02:11 -0700 Subject: memory_hotplug: use pgdat_resize_lock() in __offline_pages() mmzone.h documents node_size_lock (which pgdat_resize_lock() locks) as follows: * Must be held any time you expect node_start_pfn, node_present_pages * or node_spanned_pages stay constant. [...] So actually hold it when we update node_present_pages in __offline_pages(). [akpm@linux-foundation.org: fix build] Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memory_hotplug.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 527c51084bb8..a66d0023d219 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1492,6 +1492,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long pfn, nr_pages, expire; long offlined_pages; int ret, drain, retry_max, node; + unsigned long flags; struct zone *zone; struct memory_notify arg; @@ -1585,7 +1586,11 @@ repeat: /* removal success */ zone->managed_pages -= offlined_pages; zone->present_pages -= offlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + totalram_pages -= offlined_pages; init_per_zone_wmark_min(); -- cgit v1.2.3 From cef2ac3f6c8ab532e49cf69d05f540931ad8ee64 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com> Date: Wed, 3 Jul 2013 15:02:17 -0700 Subject: vmalloc: make find_vm_area check in range Currently, __find_vmap_area searches for the kernel VM area starting at a given address. This patch changes this behavior so that it searches for the kernel VM area to which the address belongs. This change is needed by remap_vmalloc_range_partial to be introduced in later patch that receives any position of kernel VM area as target address. This patch changes the condition (addr > va->va_start) to the equivalent (addr >= va->va_end) by taking advantage of the fact that each kernel VM area is non-overlapping. Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> Cc: Lisa Mitchell <lisa.mitchell@hp.com> Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d365724feb05..3875fa2f0f60 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; - else if (addr > va->va_start) + else if (addr >= va->va_end) n = n->rb_right; else return va; -- cgit v1.2.3 From e69e9d4aee712a22665f008ae0550bb3d7c7f7c1 Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com> Date: Wed, 3 Jul 2013 15:02:18 -0700 Subject: vmalloc: introduce remap_vmalloc_range_partial We want to allocate ELF note segment buffer on the 2nd kernel in vmalloc space and remap it to user-space in order to reduce the risk that memory allocation fails on system with huge number of CPUs and so with huge ELF note segment that exceeds 11-order block size. Although there's already remap_vmalloc_range for the purpose of remapping vmalloc memory to user-space, we need to specify user-space range via vma. Mmap on /proc/vmcore needs to remap range across multiple objects, so the interface that requires vma to cover full range is problematic. This patch introduces remap_vmalloc_range_partial that receives user-space range as a pair of base address and size and can be used for mmap on /proc/vmcore case. remap_vmalloc_range is rewritten using remap_vmalloc_range_partial. [akpm@linux-foundation.org: use PAGE_ALIGNED()] Signed-off-by: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> Cc: Lisa Mitchell <lisa.mitchell@hp.com> Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/vmalloc.h | 4 +++ mm/vmalloc.c | 67 +++++++++++++++++++++++++++++++++---------------- 2 files changed, 49 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 7d5773a99f20..dd0a2c810529 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -82,6 +82,10 @@ extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); extern void vunmap(const void *addr); +extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, + unsigned long size); + extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); void vmalloc_sync_all(void); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3875fa2f0f60..b7259906a806 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1476,10 +1476,9 @@ static void __vunmap(const void *addr, int deallocate_pages) if (!addr) return; - if ((PAGE_SIZE-1) & (unsigned long)addr) { - WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)); return; - } area = remove_vm_area(addr); if (unlikely(!area)) { @@ -2148,42 +2147,43 @@ finished: } /** - * remap_vmalloc_range - map vmalloc pages to userspace - * @vma: vma to cover (map full range of vma) - * @addr: vmalloc memory - * @pgoff: number of pages into addr before first page to map + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @size: size of map area * * Returns: 0 for success, -Exxx on failure * - * This function checks that addr is a valid vmalloc'ed area, and - * that it is big enough to cover the vma. Will return failure if - * that criteria isn't met. + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. * * Similar to remap_pfn_range() (see mm/memory.c) */ -int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, - unsigned long pgoff) +int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, + void *kaddr, unsigned long size) { struct vm_struct *area; - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; - if ((PAGE_SIZE-1) & (unsigned long)addr) + size = PAGE_ALIGN(size); + + if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; - area = find_vm_area(addr); + area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) + if (kaddr + size > area->addr + area->size) return -EINVAL; - addr += pgoff << PAGE_SHIFT; do { - struct page *page = vmalloc_to_page(addr); + struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); @@ -2191,14 +2191,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, return ret; uaddr += PAGE_SIZE; - addr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); + kaddr += PAGE_SIZE; + size -= PAGE_SIZE; + } while (size > 0); vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } +EXPORT_SYMBOL(remap_vmalloc_range_partial); + +/** + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_range_partial(vma, vma->vm_start, + addr + (pgoff << PAGE_SHIFT), + vma->vm_end - vma->vm_start); +} EXPORT_SYMBOL(remap_vmalloc_range); /* -- cgit v1.2.3 From c6286c983900c77410a951874f1589f4a41fbbae Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:26 -0700 Subject: mm: add tracepoints for LRU activation and insertions Andrew Perepechko reported a problem whereby pages are being prematurely evicted as the mark_page_accessed() hint is ignored for pages that are currently on a pagevec -- http://www.spinics.net/lists/linux-ext4/msg37340.html . Alexey Lyahkov and Robin Dong have also reported problems recently that could be due to hot pages reaching the end of the inactive list too quickly and be reclaimed. Rather than addressing this on a per-filesystem basis, this series aims to fix the mark_page_accessed() interface by deferring what LRU a page is added to pagevec drain time and allowing mark_page_accessed() to call SetPageActive on a pagevec page. Patch 1 adds two tracepoints for LRU page activation and insertion. Using these processes it's possible to build a model of pages in the LRU that can be processed offline. Patch 2 defers making the decision on what LRU to add a page to until when the pagevec is drained. Patch 3 searches the local pagevec for pages to mark PageActive on mark_page_accessed. The changelog explains why only the local pagevec is examined. Patches 4 and 5 tidy up the API. postmark, a dd-based test and fs-mark both single and threaded mode were run but none of them showed any performance degradation or gain as a result of the patch. Using patch 1, I built a *very* basic model of the LRU to examine offline what the average age of different page types on the LRU were in milliseconds. Of course, capturing the trace distorts the test as it's written to local disk but it does not matter for the purposes of this test. The average age of pages in milliseconds were vanilla deferdrain Average age mapped anon: 1454 1250 Average age mapped file: 127841 155552 Average age unmapped anon: 85 235 Average age unmapped file: 73633 38884 Average age unmapped buffers: 74054 116155 The LRU activity was mostly files which you'd expect for a dd-based workload. Note that the average age of buffer pages is increased by the series and it is expected this is due to the fact that the buffer pages are now getting added to the active list when drained from the pagevecs. Note that the average age of the unmapped file data is decreased as they are still added to the inactive list and are reclaimed before the buffers. There is no guarantee this is a universal win for all workloads and it would be nice if the filesystem people gave some thought as to whether this decision is generally a win or a loss. This patch: Using these tracepoints it is possible to model LRU activity and the average residency of pages of different types. This can be used to debug problems related to premature reclaim of pages of particular types. Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: Jan Kara <jack@suse.cz> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Alexey Lyahkov <alexey.lyashkov@gmail.com> Cc: Andrew Perepechko <anserper@ya.ru> Cc: Robin Dong <sanbai@taobao.com> Cc: Theodore Tso <tytso@mit.edu> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Bernd Schubert <bernd.schubert@fastmail.fm> Cc: David Howells <dhowells@redhat.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/trace/events/pagemap.h | 89 ++++++++++++++++++++++++++++++++++++++++++ mm/swap.c | 5 +++ 2 files changed, 94 insertions(+) create mode 100644 include/trace/events/pagemap.h (limited to 'mm') diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h new file mode 100644 index 000000000000..1c9fabde69e4 --- /dev/null +++ b/include/trace/events/pagemap.h @@ -0,0 +1,89 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM pagemap + +#if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PAGEMAP_H + +#include <linux/tracepoint.h> +#include <linux/mm.h> + +#define PAGEMAP_MAPPED 0x0001u +#define PAGEMAP_ANONYMOUS 0x0002u +#define PAGEMAP_FILE 0x0004u +#define PAGEMAP_SWAPCACHE 0x0008u +#define PAGEMAP_SWAPBACKED 0x0010u +#define PAGEMAP_MAPPEDDISK 0x0020u +#define PAGEMAP_BUFFERS 0x0040u + +#define trace_pagemap_flags(page) ( \ + (PageAnon(page) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ + (page_mapped(page) ? PAGEMAP_MAPPED : 0) | \ + (PageSwapCache(page) ? PAGEMAP_SWAPCACHE : 0) | \ + (PageSwapBacked(page) ? PAGEMAP_SWAPBACKED : 0) | \ + (PageMappedToDisk(page) ? PAGEMAP_MAPPEDDISK : 0) | \ + (page_has_private(page) ? PAGEMAP_BUFFERS : 0) \ + ) + +TRACE_EVENT(mm_lru_insertion, + + TP_PROTO( + struct page *page, + unsigned long pfn, + int lru, + unsigned long flags + ), + + TP_ARGS(page, pfn, lru, flags), + + TP_STRUCT__entry( + __field(struct page *, page ) + __field(unsigned long, pfn ) + __field(int, lru ) + __field(unsigned long, flags ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->pfn = pfn; + __entry->lru = lru; + __entry->flags = flags; + ), + + /* Flag format is based on page-types.c formatting for pagemap */ + TP_printk("page=%p pfn=%lu lru=%d flags=%s%s%s%s%s%s", + __entry->page, + __entry->pfn, + __entry->lru, + __entry->flags & PAGEMAP_MAPPED ? "M" : " ", + __entry->flags & PAGEMAP_ANONYMOUS ? "a" : "f", + __entry->flags & PAGEMAP_SWAPCACHE ? "s" : " ", + __entry->flags & PAGEMAP_SWAPBACKED ? "b" : " ", + __entry->flags & PAGEMAP_MAPPEDDISK ? "d" : " ", + __entry->flags & PAGEMAP_BUFFERS ? "B" : " ") +); + +TRACE_EVENT(mm_lru_activate, + + TP_PROTO(struct page *page, unsigned long pfn), + + TP_ARGS(page, pfn), + + TP_STRUCT__entry( + __field(struct page *, page ) + __field(unsigned long, pfn ) + ), + + TP_fast_assign( + __entry->page = page; + __entry->pfn = pfn; + ), + + /* Flag format is based on page-types.c formatting for pagemap */ + TP_printk("page=%p pfn=%lu", __entry->page, __entry->pfn) + +); + +#endif /* _TRACE_PAGEMAP_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/mm/swap.c b/mm/swap.c index dfd7d71d6841..53c9ceb7b816 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -34,6 +34,9 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/pagemap.h> + /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); + trace_mm_lru_activate(page, page_to_pfn(page)); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -808,6 +812,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageActive(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); + trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); } /* -- cgit v1.2.3 From 13f7f78981e49f288d871bb918545ef5c952e00b Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:28 -0700 Subject: mm: pagevec: defer deciding which LRU to add a page to until pagevec drain time mark_page_accessed() cannot activate an inactive page that is located on an inactive LRU pagevec. Hints from filesystems may be ignored as a result. In preparation for fixing that problem, this patch removes the per-LRU pagevecs and leaves just one pagevec. The final LRU the page is added to is deferred until the pagevec is drained. This means that fewer pagevecs are available and potentially there is greater contention on the LRU lock. However, this only applies in the case where there is an almost perfect mix of file, anon, active and inactive pages being added to the LRU. In practice I expect that we are adding stream of pages of a particular time and that the changes in contention will barely be measurable. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Rik van Riel <riel@redhat.com> Cc: Jan Kara <jack@suse.cz> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Alexey Lyahkov <alexey.lyashkov@gmail.com> Cc: Andrew Perepechko <anserper@ya.ru> Cc: Robin Dong <sanbai@taobao.com> Cc: Theodore Tso <tytso@mit.edu> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Bernd Schubert <bernd.schubert@fastmail.fm> Cc: David Howells <dhowells@redhat.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/swap.c | 47 +++++++++++++++++++++-------------------------- 1 file changed, 21 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 53c9ceb7b816..868b493431c2 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -40,7 +40,7 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); @@ -452,22 +452,25 @@ void mark_page_accessed(struct page *page) EXPORT_SYMBOL(mark_page_accessed); /* - * Order of operations is important: flush the pagevec when it's already - * full, not when adding the last page, to make sure that last page is - * not added to the LRU directly when passed to this function. Because - * mark_page_accessed() (called after this when writing) only activates - * pages that are on the LRU, linear writes in subpage chunks would see - * every PAGEVEC_SIZE page activated, which is unexpected. + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of __lru_cache_add() + * have the page added to the active list using mark_page_accessed(). */ void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + + if (is_active_lru(lru)) + SetPageActive(page); + else + ClearPageActive(page); page_cache_get(page); if (!pagevec_space(pvec)) __pagevec_lru_add(pvec, lru); pagevec_add(pvec, page); - put_cpu_var(lru_add_pvecs); + put_cpu_var(lru_add_pvec); } EXPORT_SYMBOL(__lru_cache_add); @@ -480,13 +483,11 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru) { if (PageActive(page)) { VM_BUG_ON(PageUnevictable(page)); - ClearPageActive(page); } else if (PageUnevictable(page)) { VM_BUG_ON(PageActive(page)); - ClearPageUnevictable(page); } - VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); __lru_cache_add(page, lru); } @@ -587,15 +588,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); - struct pagevec *pvec; - int lru; + struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); - for_each_lru(lru) { - pvec = &pvecs[lru - LRU_BASE]; - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec, lru); - } + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec, NR_LRU_LISTS); pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -799,17 +795,16 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, void *arg) { - enum lru_list lru = (enum lru_list)arg; - int file = is_file_lru(lru); - int active = is_active_lru(lru); + enum lru_list requested_lru = (enum lru_list)arg; + int file = page_is_file_cache(page); + int active = PageActive(page); + enum lru_list lru = page_lru(page); - VM_BUG_ON(PageActive(page)); + WARN_ON_ONCE(requested_lru < NR_LRU_LISTS && requested_lru != lru); VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - if (active) - SetPageActive(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); -- cgit v1.2.3 From 059285a25f30c13ed4f5d91cecd6094b9b20bb7b Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:30 -0700 Subject: mm: activate !PageLRU pages on mark_page_accessed if page is on local pagevec If a page is on a pagevec then it is !PageLRU and mark_page_accessed() may fail to move a page to the active list as expected. Now that the LRU is selected at LRU drain time, mark pages PageActive if they are on the local pagevec so it gets moved to the correct list at LRU drain time. Using a debugging patch it was found that for a simple git checkout based workload that pages were never added to the active file list in practice but with this patch applied they are. before after LRU Add Active File 0 750583 LRU Add Active Anon 2640587 2702818 LRU Add Inactive File 8833662 8068353 LRU Add Inactive Anon 207 200 Note that only pages on the local pagevec are considered on purpose. A !PageLRU page could be in the process of being released, reclaimed, migrated or on a remote pagevec that is currently being drained. Marking it PageActive is vunerable to races where PageLRU and Active bits are checked at the wrong time. Page reclaim will trigger VM_BUG_ONs but depending on when the race hits, it could also free a PageActive page to the page allocator and trigger a bad_page warning. Similarly a potential race exists between a per-cpu drain on a pagevec list and an activation on a remote CPU. lru_add_drain_cpu __pagevec_lru_add lru = page_lru(page); mark_page_accessed if (PageLRU(page)) activate_page else SetPageActive SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); In this case a PageActive page is added to the inactivate list and later the inactive/active stats will get skewed. While the PageActive checks in vmscan could be removed and potentially dealt with, a skew in the statistics would be very difficult to detect. Hence this patch deals just with the common case where a page being marked accessed has just been added to the local pagevec. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Jan Kara <jack@suse.cz> Cc: Rik van Riel <riel@redhat.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Alexey Lyahkov <alexey.lyashkov@gmail.com> Cc: Andrew Perepechko <anserper@ya.ru> Cc: Robin Dong <sanbai@taobao.com> Cc: Theodore Tso <tytso@mit.edu> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Bernd Schubert <bernd.schubert@fastmail.fm> Cc: David Howells <dhowells@redhat.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/swap.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 868b493431c2..c53d161fc76d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -432,6 +432,33 @@ void activate_page(struct page *page) } #endif +static void __lru_cache_activate_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + int i; + + /* + * Search backwards on the optimistic assumption that the page being + * activated has just been added to this pagevec. Note that only + * the local pagevec is examined as a !PageLRU page could be in the + * process of being released, reclaimed, migrated or on a remote + * pagevec that is currently being drained. Furthermore, marking + * a remote pagevec's page PageActive potentially hits a race where + * a page is marked PageActive just after it is added to the inactive + * list causing accounting errors and BUG_ON checks to trigger. + */ + for (i = pagevec_count(pvec) - 1; i >= 0; i--) { + struct page *pagevec_page = pvec->pages[i]; + + if (pagevec_page == page) { + SetPageActive(page); + break; + } + } + + put_cpu_var(lru_add_pvec); +} + /* * Mark a page as having seen activity. * @@ -442,8 +469,18 @@ void activate_page(struct page *page) void mark_page_accessed(struct page *page) { if (!PageActive(page) && !PageUnevictable(page) && - PageReferenced(page) && PageLRU(page)) { - activate_page(page); + PageReferenced(page)) { + + /* + * If the page is on the LRU, queue it for activation via + * activate_page_pvecs. Otherwise, assume the page is on a + * pagevec, mark it active and it'll be moved to the active + * LRU on the next drain. + */ + if (PageLRU(page)) + activate_page(page); + else + __lru_cache_activate_page(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); -- cgit v1.2.3 From a0b8cab3b9b2efadabdcff264c450ca515e2619c Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:32 -0700 Subject: mm: remove lru parameter from __pagevec_lru_add and remove parts of pagevec API Now that the LRU to add a page to is decided at LRU-add time, remove the misleading lru parameter from __pagevec_lru_add. A consequence of this is that the pagevec_lru_add_file, pagevec_lru_add_anon and similar helpers are misleading as the caller no longer has direct control over what LRU the page is added to. Unused helpers are removed by this patch and existing users of pagevec_lru_add_file() are converted to use lru_cache_add_file() directly and use the per-cpu pagevecs instead of creating their own pagevec. Signed-off-by: Mel Gorman <mgorman@suse.de> Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Rik van Riel <riel@redhat.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Alexey Lyahkov <alexey.lyashkov@gmail.com> Cc: Andrew Perepechko <anserper@ya.ru> Cc: Robin Dong <sanbai@taobao.com> Cc: Theodore Tso <tytso@mit.edu> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Bernd Schubert <bernd.schubert@fastmail.fm> Cc: David Howells <dhowells@redhat.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/cachefiles/rdwr.c | 30 +++++++----------------------- fs/nfs/dir.c | 7 ++----- include/linux/pagevec.h | 34 +--------------------------------- mm/swap.c | 12 ++++-------- 4 files changed, 14 insertions(+), 69 deletions(-) (limited to 'mm') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 317f9ee9c991..ebaff368120d 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -12,6 +12,7 @@ #include <linux/mount.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/swap.h> #include "internal.h" /* @@ -227,8 +228,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) */ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, struct fscache_retrieval *op, - struct page *netpage, - struct pagevec *pagevec) + struct page *netpage) { struct cachefiles_one_read *monitor; struct address_space *bmapping; @@ -237,8 +237,6 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, _enter(""); - pagevec_reinit(pagevec); - _debug("read back %p{%lu,%d}", netpage, netpage->index, page_count(netpage)); @@ -283,9 +281,7 @@ installed_new_backing_page: backpage = newpage; newpage = NULL; - page_cache_get(backpage); - pagevec_add(pagevec, backpage); - __pagevec_lru_add_file(pagevec); + lru_cache_add_file(backpage); read_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); @@ -452,8 +448,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, if (block) { /* submit the apparently valid page to the backing fs to be * read from disk */ - ret = cachefiles_read_backing_file_one(object, op, page, - &pagevec); + ret = cachefiles_read_backing_file_one(object, op, page); } else if (cachefiles_has_space(cache, 0, 1) == 0) { /* there's space in the cache we can use */ fscache_mark_page_cached(op, page); @@ -482,14 +477,11 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, { struct cachefiles_one_read *monitor = NULL; struct address_space *bmapping = object->backer->d_inode->i_mapping; - struct pagevec lru_pvec; struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; int ret = 0; _enter(""); - pagevec_init(&lru_pvec, 0); - list_for_each_entry_safe(netpage, _n, list, lru) { list_del(&netpage->lru); @@ -534,9 +526,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, backpage = newpage; newpage = NULL; - page_cache_get(backpage); - if (!pagevec_add(&lru_pvec, backpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(backpage); reread_backing_page: ret = bmapping->a_ops->readpage(NULL, backpage); @@ -559,9 +549,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto nomem; } - page_cache_get(netpage); - if (!pagevec_add(&lru_pvec, netpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(netpage); /* install a monitor */ page_cache_get(netpage); @@ -643,9 +631,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, fscache_mark_page_cached(op, netpage); - page_cache_get(netpage); - if (!pagevec_add(&lru_pvec, netpage)) - __pagevec_lru_add_file(&lru_pvec); + lru_cache_add_file(netpage); /* the netpage is unlocked and marked up to date here */ fscache_end_io(op, netpage, 0); @@ -661,8 +647,6 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, out: /* tidy up */ - pagevec_lru_add_file(&lru_pvec); - if (newpage) page_cache_release(newpage); if (netpage) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5d051419527b..d7ed697133f0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -33,6 +33,7 @@ #include <linux/pagevec.h> #include <linux/namei.h> #include <linux/mount.h> +#include <linux/swap.h> #include <linux/sched.h> #include <linux/kmemleak.h> #include <linux/xattr.h> @@ -1758,7 +1759,6 @@ EXPORT_SYMBOL_GPL(nfs_unlink); */ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct pagevec lru_pvec; struct page *page; char *kaddr; struct iattr attr; @@ -1798,11 +1798,8 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - pagevec_init(&lru_pvec, 0); - if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, + if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { - pagevec_add(&lru_pvec, page); - pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 2aa12b8499c0..e4dbfab37729 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -21,7 +21,7 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); +void __pagevec_lru_add(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned nr_pages); unsigned pagevec_lookup_tag(struct pagevec *pvec, @@ -64,36 +64,4 @@ static inline void pagevec_release(struct pagevec *pvec) __pagevec_release(pvec); } -static inline void __pagevec_lru_add_anon(struct pagevec *pvec) -{ - __pagevec_lru_add(pvec, LRU_INACTIVE_ANON); -} - -static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) -{ - __pagevec_lru_add(pvec, LRU_ACTIVE_ANON); -} - -static inline void __pagevec_lru_add_file(struct pagevec *pvec) -{ - __pagevec_lru_add(pvec, LRU_INACTIVE_FILE); -} - -static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) -{ - __pagevec_lru_add(pvec, LRU_ACTIVE_FILE); -} - -static inline void pagevec_lru_add_file(struct pagevec *pvec) -{ - if (pagevec_count(pvec)) - __pagevec_lru_add_file(pvec); -} - -static inline void pagevec_lru_add_anon(struct pagevec *pvec) -{ - if (pagevec_count(pvec)) - __pagevec_lru_add_anon(pvec); -} - #endif /* _LINUX_PAGEVEC_H */ diff --git a/mm/swap.c b/mm/swap.c index c53d161fc76d..6a9d0c43924a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -505,7 +505,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) page_cache_get(page); if (!pagevec_space(pvec)) - __pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec); pagevec_add(pvec, page); put_cpu_var(lru_add_pvec); } @@ -628,7 +628,7 @@ void lru_add_drain_cpu(int cpu) struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); if (pagevec_count(pvec)) - __pagevec_lru_add(pvec, NR_LRU_LISTS); + __pagevec_lru_add(pvec); pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -832,12 +832,10 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, void *arg) { - enum lru_list requested_lru = (enum lru_list)arg; int file = page_is_file_cache(page); int active = PageActive(page); enum lru_list lru = page_lru(page); - WARN_ON_ONCE(requested_lru < NR_LRU_LISTS && requested_lru != lru); VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); @@ -851,11 +849,9 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void __pagevec_lru_add(struct pagevec *pvec) { - VM_BUG_ON(is_unevictable_lru(lru)); - - pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); } EXPORT_SYMBOL(__pagevec_lru_add); -- cgit v1.2.3 From c53954a092d07c5684d31ea1fc813d262cff08a5 Mon Sep 17 00:00:00 2001 From: Mel Gorman <mgorman@suse.de> Date: Wed, 3 Jul 2013 15:02:34 -0700 Subject: mm: remove lru parameter from __lru_cache_add and lru_cache_add_lru Similar to __pagevec_lru_add, this patch removes the LRU parameter from __lru_cache_add and lru_cache_add_lru as the caller does not control the exact LRU the page gets added to. lru_cache_add_lru gets renamed to lru_cache_add the name is silly without the lru parameter. With the parameter removed, it is required that the caller indicate if they want the page added to the active or inactive list by setting or clearing PageActive respectively. [akpm@linux-foundation.org: Suggested the patch] [gang.chen@asianux.com: fix used-unintialized warning] Signed-off-by: Mel Gorman <mgorman@suse.de> Signed-off-by: Chen Gang <gang.chen@asianux.com> Cc: Jan Kara <jack@suse.cz> Cc: Rik van Riel <riel@redhat.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Alexey Lyahkov <alexey.lyashkov@gmail.com> Cc: Andrew Perepechko <anserper@ya.ru> Cc: Robin Dong <sanbai@taobao.com> Cc: Theodore Tso <tytso@mit.edu> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Bernd Schubert <bernd.schubert@fastmail.fm> Cc: David Howells <dhowells@redhat.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/swap.h | 11 +++++++---- mm/rmap.c | 7 ++++--- mm/swap.c | 17 +++++++---------- mm/vmscan.c | 5 ++--- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1701ce4be746..85d74373002c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -10,6 +10,7 @@ #include <linux/node.h> #include <linux/fs.h> #include <linux/atomic.h> +#include <linux/page-flags.h> #include <asm/page.h> struct notifier_block; @@ -233,8 +234,8 @@ extern unsigned long nr_free_pagecache_pages(void); /* linux/mm/swap.c */ -extern void __lru_cache_add(struct page *, enum lru_list lru); -extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void __lru_cache_add(struct page *); +extern void lru_cache_add(struct page *); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); @@ -254,12 +255,14 @@ extern void add_page_to_unevictable_list(struct page *page); */ static inline void lru_cache_add_anon(struct page *page) { - __lru_cache_add(page, LRU_INACTIVE_ANON); + ClearPageActive(page); + __lru_cache_add(page); } static inline void lru_cache_add_file(struct page *page) { - __lru_cache_add(page, LRU_INACTIVE_FILE); + ClearPageActive(page); + __lru_cache_add(page); } /* linux/mm/vmscan.c */ diff --git a/mm/rmap.c b/mm/rmap.c index 6280da86b5d6..e22ceeb6e5ec 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page, else __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); - if (!mlocked_vma_newpage(vma, page)) - lru_cache_add_lru(page, LRU_ACTIVE_ANON); - else + if (!mlocked_vma_newpage(vma, page)) { + SetPageActive(page); + lru_cache_add(page); + } else add_page_to_unevictable_list(page); } diff --git a/mm/swap.c b/mm/swap.c index 6a9d0c43924a..4a1d0d2c52fa 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -494,15 +494,10 @@ EXPORT_SYMBOL(mark_page_accessed); * pagevec is drained. This gives a chance for the caller of __lru_cache_add() * have the page added to the active list using mark_page_accessed(). */ -void __lru_cache_add(struct page *page, enum lru_list lru) +void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); - if (is_active_lru(lru)) - SetPageActive(page); - else - ClearPageActive(page); - page_cache_get(page); if (!pagevec_space(pvec)) __pagevec_lru_add(pvec); @@ -512,11 +507,10 @@ void __lru_cache_add(struct page *page, enum lru_list lru) EXPORT_SYMBOL(__lru_cache_add); /** - * lru_cache_add_lru - add a page to a page list + * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. - * @lru: the LRU list to which the page is added. */ -void lru_cache_add_lru(struct page *page, enum lru_list lru) +void lru_cache_add(struct page *page) { if (PageActive(page)) { VM_BUG_ON(PageUnevictable(page)); @@ -525,7 +519,7 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru) } VM_BUG_ON(PageLRU(page)); - __lru_cache_add(page, lru); + __lru_cache_add(page); } /** @@ -745,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold) del_page_from_lru_list(page, lruvec, page_off_lru(page)); } + /* Clear Active bit in case of parallel mark_page_accessed */ + ClearPageActive(page); + list_add(&page->lru, &pages_to_free); } if (zone) diff --git a/mm/vmscan.c b/mm/vmscan.c index c85794399848..99b3ac7771ad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) void putback_lru_page(struct page *page) { int lru; - int active = !!TestClearPageActive(page); int was_unevictable = PageUnevictable(page); VM_BUG_ON(PageLRU(page)); @@ -561,8 +560,8 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_lru_base_type(page); - lru_cache_add_lru(page, lru); + lru = page_lru_base_type(page); + lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable -- cgit v1.2.3 From dacbde0963d62a4962d5e8a5cc38dfd1f016124b Mon Sep 17 00:00:00 2001 From: Chen Gang <gang.chen@asianux.com> Date: Wed, 3 Jul 2013 15:02:35 -0700 Subject: mm/page_alloc.c: add additional checking and return value for the 'table->data' - check the length of the procfs data before copying it into a fixed size array. - when __parse_numa_zonelist_order() fails, save the error code for return. - 'char*' --> 'char *' coding style fix Signed-off-by: Chen Gang <gang.chen@asianux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fab9506273be..a662c74a0f5d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3256,18 +3256,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write, static DEFINE_MUTEX(zl_order_mutex); mutex_lock(&zl_order_mutex); - if (write) - strcpy(saved_string, (char*)table->data); + if (write) { + if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { + ret = -EINVAL; + goto out; + } + strcpy(saved_string, (char *)table->data); + } ret = proc_dostring(table, write, buffer, length, ppos); if (ret) goto out; if (write) { int oldval = user_zonelist_order; - if (__parse_numa_zonelist_order((char*)table->data)) { + + ret = __parse_numa_zonelist_order((char *)table->data); + if (ret) { /* * bogus value. restore saved string */ - strncpy((char*)table->data, saved_string, + strncpy((char *)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { -- cgit v1.2.3 From 9bde916bc73255dcee3d8aded990443675daa707 Mon Sep 17 00:00:00 2001 From: Chen Gang <gang.chen@asianux.com> Date: Wed, 3 Jul 2013 15:02:36 -0700 Subject: mm/nommu.c: add additional check for vread() just like vwrite() has done vwrite() checks for overflow. vread() should do the same thing. Since vwrite() checks the source buffer address, vread() should check the destination buffer address. Signed-off-by: Chen Gang <gang.chen@asianux.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/nommu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 298884dcd6e7..1898b2fe9da5 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -282,6 +282,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn); long vread(char *buf, char *addr, unsigned long count) { + /* Don't allow overflow */ + if ((unsigned long) buf + count < count) + count = -(unsigned long) buf; + memcpy(buf, addr, count); return count; } -- cgit v1.2.3 From f15bdfa802bfa5eb6b4b5a241b97ec9fa1204a35 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Date: Wed, 3 Jul 2013 15:02:37 -0700 Subject: mm/memory-failure.c: fix memory leak in successful soft offlining After a successful page migration by soft offlining, the source page is not properly freed and it's never reusable even if we unpoison it afterward. This is caused by the race between freeing page and setting PG_hwpoison. In successful soft offlining, the source page is put (and the refcount becomes 0) by putback_lru_page() in unmap_and_move(), where it's linked to pagevec and actual freeing back to buddy is delayed. So if PG_hwpoison is set for the page before freeing, the freeing does not functions as expected (in such case freeing aborts in free_pages_prepare() check.) This patch tries to make sure to free the source page before setting PG_hwpoison on it. To avoid reallocating, the page keeps MIGRATE_ISOLATE until after setting PG_hwpoison. This patch also removes obsolete comments about "keeping elevated refcount" because what they say is not true. Unlike memory_failure(), soft_offline_page() uses no special page isolation code, and the soft-offlined pages have no elevated. Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memory-failure.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ceb0c7f1932f..2c13aa7a0164 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) /* * Isolate the page, so that it doesn't get reallocated if it - * was free. + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. */ set_migratetype_isolate(p, true); /* @@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) /* Not a free page */ ret = 1; } - unset_migratetype_isolate(p, MIGRATE_MOVABLE); unlock_memory_hotplug(); return ret; } @@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags) atomic_long_add(1 << compound_trans_order(hpage), &num_poisoned_pages); } - /* keep elevated page count for bad page */ return ret; } @@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } - /* keep elevated page count for bad page */ + unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } @@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { + /* + * After page migration succeeds, the source page can + * be trapped in pagevec and actual freeing is delayed. + * Freeing code works differently based on PG_hwpoison, + * so there's a race. We need to make sure that the + * source page should be freed back to buddy before + * setting PG_hwpoison. + */ + if (!is_free_buddy_page(page)) + lru_add_drain_all(); + if (!is_free_buddy_page(page)) + drain_all_pages(); SetPageHWPoison(page); + if (!is_free_buddy_page(page)) + pr_info("soft offline: %#lx: page leaked\n", + pfn); atomic_long_inc(&num_poisoned_pages); } } else { -- cgit v1.2.3 From 4996eed867a7215958267252fafddc41d5f26140 Mon Sep 17 00:00:00 2001 From: Toshi Kani <toshi.kani@hp.com> Date: Wed, 3 Jul 2013 15:02:39 -0700 Subject: mm/memory_hotplug.c: change normal message to use pr_debug During early boot-up, iomem_resource is set up from the boot descriptor table, such as EFI Memory Table and e820. Later, acpi_memory_device_add() calls add_memory() for each ACPI memory device object as it enumerates ACPI namespace. This add_memory() call is expected to fail in register_memory_resource() at boot since iomem_resource has been set up from EFI/e820. As a result, add_memory() returns -EEXIST, which acpi_memory_device_add() handles as the normal case. This scheme works fine, but the following error message is logged for every ACPI memory device object during boot-up. "System RAM resource %pR cannot be added\n" This patch changes register_memory_resource() to use pr_debug() for the message as it shows up under the normal case. Signed-off-by: Toshi Kani <toshi.kani@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a66d0023d219..e3097f299f67 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->end = start + size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { - printk("System RAM resource %pR cannot be added\n", res); + pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); res = NULL; } -- cgit v1.2.3 From cea27eb2a202959783f81254c48c250ddd80e129 Mon Sep 17 00:00:00 2001 From: Wanpeng Li <liwanp@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:02:40 -0700 Subject: mm/memory-hotplug: fix lowmem count overflow when offline pages The logic for the memory-remove code fails to correctly account the Total High Memory when a memory block which contains High Memory is offlined as shown in the example below. The following patch fixes it. Before logic memory remove: MemTotal: 7603740 kB MemFree: 6329612 kB Buffers: 94352 kB Cached: 872008 kB SwapCached: 0 kB Active: 626932 kB Inactive: 519216 kB Active(anon): 180776 kB Inactive(anon): 222944 kB Active(file): 446156 kB Inactive(file): 296272 kB Unevictable: 0 kB Mlocked: 0 kB HighTotal: 7294672 kB HighFree: 5704696 kB LowTotal: 309068 kB LowFree: 624916 kB After logic memory remove: MemTotal: 7079452 kB MemFree: 5805976 kB Buffers: 94372 kB Cached: 872000 kB SwapCached: 0 kB Active: 626936 kB Inactive: 519236 kB Active(anon): 180780 kB Inactive(anon): 222944 kB Active(file): 446156 kB Inactive(file): 296292 kB Unevictable: 0 kB Mlocked: 0 kB HighTotal: 7294672 kB HighFree: 5181024 kB LowTotal: 4294752076 kB LowFree: 624952 kB [mhocko@suse.cz: fix CONFIG_HIGHMEM=n build] Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com> Reviewed-by: Michal Hocko <mhocko@suse.cz> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: <stable@vger.kernel.org> [2.6.24+] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a662c74a0f5d..d711dcdda362 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6185,6 +6185,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages -= 1 << order; +#endif for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); -- cgit v1.2.3 From 2415cf12e04d415b16d9c2f2a705bcd6cd9a0474 Mon Sep 17 00:00:00 2001 From: Wanpeng Li <liwanp@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:02:43 -0700 Subject: mm/hugetlb: use already existing interface huge_page_shift Use the already existing interface huge_page_shift instead of h->order + PAGE_SHIFT. Signed-off-by: Wanpeng Li <liwanp@linux.vnet.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Reviewed-by: Michal Hocko <mhocko@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/powerpc/mm/hugetlbpage.c | 2 +- mm/hugetlb.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 77fdd2cef33b..4210549ac95e 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -357,7 +357,7 @@ void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) int alloc_bootmem_huge_page(struct hstate *hstate) { struct huge_bootmem_page *m; - int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT); + int idx = shift_to_mmu_psize(huge_page_shift(hstate)); int nr_gpages = gpage_freearray[idx].nr_gpages; if (nr_gpages == 0) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aed085ad11a8..fe095158859e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) hstate = hstate_vma(vma); - return 1UL << (hstate->order + PAGE_SHIFT); + return 1UL << huge_page_shift(hstate); } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); -- cgit v1.2.3 From 917d9290af749fac9c4d90bacf18699c9d8ba28d Mon Sep 17 00:00:00 2001 From: Tim Chen <tim.c.chen@linux.intel.com> Date: Wed, 3 Jul 2013 15:02:44 -0700 Subject: mm: tune vm_committed_as percpu_counter batching size Currently the per cpu counter's batch size for memory accounting is configured as twice the number of cpus in the system. However, for system with very large memory, it is more appropriate to make it proportional to the memory size per cpu in the system. For example, for a x86_64 system with 64 cpus and 128 GB of memory, the batch size is only 2*64 pages (0.5 MB). So any memory accounting changes of more than 0.5MB will overflow the per cpu counter into the global counter. Instead, for the new scheme, the batch size is configured to be 0.4% of the memory/cpu = 8MB (128 GB/64 /256), which is more inline with the memory size. I've done a repeated brk test of 800KB (from will-it-scale test suite) with 80 concurrent processes on a 4 socket Westmere machine with a total of 40 cores. Without the patch, about 80% of cpu is spent on spin-lock contention within the vm_committed_as counter. With the patch, there's a 73x speedup on the benchmark and the lock contention drops off almost entirely. [akpm@linux-foundation.org: fix section mismatch] Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com> Cc: Tejun Heo <tj@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Dave Hansen <dave.hansen@intel.com> Cc: Andi Kleen <ak@linux.intel.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/mman.h | 8 +++++++- mm/mm_init.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/mman.h b/include/linux/mman.h index 9aa863da287f..92dc257251e4 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -11,11 +11,17 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern struct percpu_counter vm_committed_as; +#ifdef CONFIG_SMP +extern s32 vm_committed_as_batch; +#else +#define vm_committed_as_batch 0 +#endif + unsigned long vm_memory_committed(void); static inline void vm_acct_memory(long pages) { - percpu_counter_add(&vm_committed_as, pages); + __percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch); } static inline void vm_unacct_memory(long pages) diff --git a/mm/mm_init.c b/mm/mm_init.c index c280a02ea11e..633c08863fd8 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -9,6 +9,8 @@ #include <linux/init.h> #include <linux/kobject.h> #include <linux/export.h> +#include <linux/memory.h> +#include <linux/notifier.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel); struct kobject *mm_kobj; EXPORT_SYMBOL_GPL(mm_kobj); +#ifdef CONFIG_SMP +s32 vm_committed_as_batch = 32; + +static void __meminit mm_compute_batch(void) +{ + u64 memsized_batch; + s32 nr = num_present_cpus(); + s32 batch = max_t(s32, nr*2, 32); + + /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ + memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + + vm_committed_as_batch = max_t(s32, memsized_batch, batch); +} + +static int __meminit mm_compute_batch_notifier(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + mm_compute_batch(); + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block compute_batch_nb __meminitdata = { + .notifier_call = mm_compute_batch_notifier, + .priority = IPC_CALLBACK_PRI, /* use lowest priority */ +}; + +static int __init mm_compute_batch_init(void) +{ + mm_compute_batch(); + register_hotmemory_notifier(&compute_batch_nb); + + return 0; +} + +__initcall(mm_compute_batch_init); + +#endif + static int __init mm_sysfs_init(void) { mm_kobj = kobject_create_and_add("mm", kernel_kobj); -- cgit v1.2.3 From dcf6b7ddd7df8965727746f89c59229b23180e5a Mon Sep 17 00:00:00 2001 From: Rafael Aquini <aquini@redhat.com> Date: Wed, 3 Jul 2013 15:02:46 -0700 Subject: swap: discard while swapping only if SWAP_FLAG_DISCARD_PAGES Considering the use cases where the swap device supports discard: a) and can do it quickly; b) but it's slow to do in small granularities (or concurrent with other I/O); c) but the implementation is so horrendous that you don't even want to send one down; And assuming that the sysadmin considers it useful to send the discards down at all, we would (probably) want the following solutions: i. do the fine-grained discards for freed swap pages, if device is capable of doing so optimally; ii. do single-time (batched) swap area discards, either at swapon or via something like fstrim (not implemented yet); iii. allow doing both single-time and fine-grained discards; or iv. turn it off completely (default behavior) As implemented today, one can only enable/disable discards for swap, but one cannot select, for instance, solution (ii) on a swap device like (b) even though the single-time discard is regarded to be interesting, or necessary to the workload because it would imply (1), and the device is not capable of performing it optimally. This patch addresses the scenario depicted above by introducing a way to ensure the (probably) wanted solutions (i, ii, iii and iv) can be flexibly flagged through swapon(8) to allow a sysadmin to select the best suitable swap discard policy accordingly to system constraints. This patch introduces SWAP_FLAG_DISCARD_PAGES and SWAP_FLAG_DISCARD_ONCE new flags to allow more flexibe swap discard policies being flagged through swapon(8). The default behavior is to keep both single-time, or batched, area discards (SWAP_FLAG_DISCARD_ONCE) and fine-grained discards for page-clusters (SWAP_FLAG_DISCARD_PAGES) enabled, in order to keep consistentcy with older kernel behavior, as well as maintain compatibility with older swapon(8). However, through the new introduced flags the best suitable discard policy can be selected accordingly to any given swap device constraint. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Rafael Aquini <aquini@redhat.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Hugh Dickins <hughd@google.com> Cc: Shaohua Li <shli@kernel.org> Cc: Karel Zak <kzak@redhat.com> Cc: Jeff Moyer <jmoyer@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Larry Woodman <lwoodman@redhat.com> Cc: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/swap.h | 13 +++++++++---- mm/swapfile.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 59 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 85d74373002c..d95cde5e257d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -20,10 +20,13 @@ struct bio; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff #define SWAP_FLAG_PRIO_SHIFT 0 -#define SWAP_FLAG_DISCARD 0x10000 /* discard swap cluster after use */ +#define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ +#define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ +#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ #define SWAP_FLAGS_VALID (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \ - SWAP_FLAG_DISCARD) + SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \ + SWAP_FLAG_DISCARD_PAGES) static inline int current_is_kswapd(void) { @@ -147,14 +150,16 @@ struct swap_extent { enum { SWP_USED = (1 << 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 << 1), /* ok to write to this swap? */ - SWP_DISCARDABLE = (1 << 2), /* swapon+blkdev support discard */ + SWP_DISCARDABLE = (1 << 2), /* blkdev support discard */ SWP_DISCARDING = (1 << 3), /* now discarding a free cluster */ SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ SWP_FILE = (1 << 7), /* set after swap_activate success */ + SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ + SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ /* add others here before... */ - SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/swapfile.c b/mm/swapfile.c index 746af55b8455..36af6eeaa67e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_DISCARDABLE) { + if (si->flags & SWP_PAGE_DISCARD) { /* * Start range check on racing allocations, in case * they overlap the cluster we eventually decide on @@ -322,7 +322,7 @@ checks: if (si->lowest_alloc) { /* - * Only set when SWP_DISCARDABLE, and there's a scan + * Only set when SWP_PAGE_DISCARD, and there's a scan * for a free cluster in progress or just completed. */ if (found_free_cluster) { @@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; } +/* + * Helper to sys_swapon determining if a given swap + * backing device queue supports DISCARD operations. + */ +static bool swap_discardable(struct swap_info_struct *si) +{ + struct request_queue *q = bdev_get_queue(si->bdev); + + if (!q || !blk_queue_discard(q)) + return false; + + return true; +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SOLIDSTATE; p->cluster_next = 1 + (prandom_u32() % p->highest_bit); } - if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; + + if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); + + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + printk(KERN_ERR + "swapon: discard_swap(%p): %d\n", + p, err); + } + } } mutex_lock(&swapon_mutex); @@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); -- cgit v1.2.3 From 11199692d83dd3fe1511203024fb9853d176ec4c Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:02:48 -0700 Subject: mm: change signature of free_reserved_area() to fix building warnings Change signature of free_reserved_area() according to Russell King's suggestion to fix following build warnings: arch/arm/mm/init.c: In function 'mem_init': arch/arm/mm/init.c:603:2: warning: passing argument 1 of 'free_reserved_area' makes integer from pointer without a cast [enabled by default] free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL); ^ In file included from include/linux/mman.h:4:0, from arch/arm/mm/init.c:15: include/linux/mm.h:1301:22: note: expected 'long unsigned int' but argument is of type 'void *' extern unsigned long free_reserved_area(unsigned long start, unsigned long end, mm/page_alloc.c: In function 'free_reserved_area': >> mm/page_alloc.c:5134:3: warning: passing argument 1 of 'virt_to_phys' makes pointer from integer without a cast [enabled by default] In file included from arch/mips/include/asm/page.h:49:0, from include/linux/mmzone.h:20, from include/linux/gfp.h:4, from include/linux/mm.h:8, from mm/page_alloc.c:18: arch/mips/include/asm/io.h:119:29: note: expected 'const volatile void *' but argument is of type 'long unsigned int' mm/page_alloc.c: In function 'free_area_init_nodes': mm/page_alloc.c:5030:34: warning: array subscript is below array bounds [-Warray-bounds] Also address some minor code review comments. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Reported-by: Arnd Bergmann <arnd@arndb.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: <sworddragon2@aol.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: David Howells <dhowells@redhat.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Michel Lespinasse <walken@google.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Rik van Riel <riel@redhat.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/alpha/kernel/sys_nautilus.c | 4 ++-- arch/alpha/mm/init.c | 2 +- arch/arc/mm/init.c | 2 +- arch/arm/mm/init.c | 2 +- arch/arm64/mm/init.c | 2 +- arch/avr32/mm/init.c | 2 +- arch/blackfin/mm/init.c | 2 +- arch/c6x/mm/init.c | 2 +- arch/frv/mm/init.c | 2 +- arch/h8300/mm/init.c | 2 +- arch/ia64/mm/init.c | 3 +-- arch/m32r/mm/init.c | 2 +- arch/m68k/mm/init.c | 2 +- arch/metag/mm/init.c | 3 ++- arch/microblaze/mm/init.c | 2 +- arch/mips/mm/init.c | 3 ++- arch/mn10300/mm/init.c | 3 ++- arch/openrisc/mm/init.c | 2 +- arch/parisc/mm/init.c | 3 ++- arch/powerpc/kernel/kvm.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 3 ++- arch/score/mm/init.c | 3 ++- arch/sh/mm/init.c | 2 +- arch/sparc/mm/init_32.c | 4 ++-- arch/sparc/mm/init_64.c | 4 ++-- arch/um/kernel/mem.c | 2 +- arch/unicore32/mm/init.c | 2 +- arch/xtensa/mm/init.c | 2 +- include/linux/mm.h | 5 ++--- mm/page_alloc.c | 19 ++++++++++--------- 31 files changed, 50 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 1d4aabfcf9a1..891bd274ccb5 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -238,8 +238,8 @@ nautilus_init_pci(void) if (pci_mem < memtop) memtop = pci_mem; if (memtop > alpha_mv.min_mem_address) { - free_reserved_area((unsigned long)__va(alpha_mv.min_mem_address), - (unsigned long)__va(memtop), 0, NULL); + free_reserved_area(__va(alpha_mv.min_mem_address), + __va(memtop), 0, NULL); printk("nautilus_init_pci: %ldk freed\n", (memtop - alpha_mv.min_mem_address) >> 10); } diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 0ba85ee4a466..d54848d4e464 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -326,6 +326,6 @@ free_initmem(void) void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 4a177365b2c4..dce02e4716a1 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -152,7 +152,7 @@ void __init_refok free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 2ffee02d1d5c..7fae391caf86 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -745,7 +745,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) { poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index f497ca77925a..6041e4008a83 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -398,7 +398,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) { poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } } diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c index e66e8406f992..5a79fa08cb3c 100644 --- a/arch/avr32/mm/init.c +++ b/arch/avr32/mm/init.c @@ -154,6 +154,6 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c index 82d01a71207f..8e9eab272811 100644 --- a/arch/blackfin/mm/init.c +++ b/arch/blackfin/mm/init.c @@ -133,7 +133,7 @@ void __init mem_init(void) void __init free_initrd_mem(unsigned long start, unsigned long end) { #ifndef CONFIG_MPU - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); #endif } #endif diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index b74ccb5a7690..07bfcc98a3b7 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -78,7 +78,7 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index dee354fa6b64..a67f3a5897b8 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -173,6 +173,6 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } /* end free_initrd_mem() */ #endif diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index ff349d70a29b..57e03c59861d 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -161,7 +161,7 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index d1fe4b402601..da568c2e839f 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -154,8 +154,7 @@ ia64_init_addr_space (void) void free_initmem (void) { - free_reserved_area((unsigned long)ia64_imva(__init_begin), - (unsigned long)ia64_imva(__init_end), + free_reserved_area(ia64_imva(__init_begin), ia64_imva(__init_end), 0, "unused kernel"); } diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index ab4cbce91a9b..d80412d0c14e 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -191,6 +191,6 @@ void free_initmem(void) *======================================================================*/ void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 1af2ca3411f6..95de725534e9 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -202,6 +202,6 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index d05b8455c44c..5e2238dd72e0 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -414,7 +414,8 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index b38ae3acfeb4..d7b8ada9345f 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -235,7 +235,7 @@ void __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 9b973e0af9cb..268f2a94031b 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -440,7 +440,8 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c index 5a8ace63a6b4..e19049d1f2b9 100644 --- a/arch/mn10300/mm/init.c +++ b/arch/mn10300/mm/init.c @@ -152,6 +152,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index b3cbc6703837..ab113325dc4c 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -261,7 +261,7 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 505b56c6b9b9..3223d5e4a372 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -1101,6 +1101,7 @@ void flush_tlb_all(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - num_physpages += free_reserved_area(start, end, 0, "initrd"); + num_physpages += free_reserved_area((void *)start, (void *)end, 0, + "initrd"); } #endif diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 6782221d49bd..5e4830a33c08 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -756,7 +756,7 @@ static __init void kvm_free_tmp(void) end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK; /* Free the tmp space we don't need */ - free_reserved_area(start, end, 0, NULL); + free_reserved_area((void *)start, (void *)end, 0, NULL); } static int __init kvm_guest_init(void) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 0988a26e0413..347c5b1bbd62 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -407,7 +407,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 89ebae4008f2..0878c89fe7d2 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -172,7 +172,8 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c index 0940682ab38b..f5dd61eb4544 100644 --- a/arch/score/mm/init.c +++ b/arch/score/mm/init.c @@ -108,7 +108,8 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, POISON_FREE_INITMEM, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #endif diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 20f9ead650d3..b892a9b7d7e3 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -505,7 +505,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index af472cf7c69a..d5f9c023826f 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -372,8 +372,8 @@ void free_initmem (void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM, - "initrd"); + num_physpages += free_reserved_area((void *)start, (void *)end, + POISON_FREE_INITMEM, "initrd"); } #endif diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 04fd55a6e461..8269deb84eda 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2131,8 +2131,8 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - num_physpages += free_reserved_area(start, end, POISON_FREE_INITMEM, - "initrd"); + num_physpages += free_reserved_area((void *)start, (void *)end, + POISON_FREE_INITMEM, "initrd"); } #endif diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 9df292b270a8..2aa7a2448d58 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -244,7 +244,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 63df12d71ce3..220755cc9700 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -486,7 +486,7 @@ static int keep_initrd; void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } static int __init keepinitrd_setup(char *__unused) diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index bba125b4bb06..4d658efc3289 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -214,7 +214,7 @@ extern int initrd_is_mapped; void free_initrd_mem(unsigned long start, unsigned long end) { if (initrd_is_mapped) - free_reserved_area(start, end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, 0, "initrd"); } #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 949bd7035895..be1b96ce0650 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1311,7 +1311,7 @@ extern void free_initmem(void); * "poison" if it's non-zero. * Return pages freed into the buddy system. */ -extern unsigned long free_reserved_area(unsigned long start, unsigned long end, +extern unsigned long free_reserved_area(void *start, void *end, int poison, char *s); #ifdef CONFIG_HIGHMEM /* @@ -1355,8 +1355,7 @@ static inline unsigned long free_initmem_default(int poison) { extern char __init_begin[], __init_end[]; - return free_reserved_area(PAGE_ALIGN((unsigned long)&__init_begin) , - ((unsigned long)&__init_end) & PAGE_MASK, + return free_reserved_area(&__init_begin, &__init_end, poison, "unused kernel"); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d711dcdda362..be18ccd017bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5206,25 +5206,26 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -unsigned long free_reserved_area(unsigned long start, unsigned long end, - int poison, char *s) +unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { - unsigned long pages, pos; + void *pos; + unsigned long pages = 0; - pos = start = PAGE_ALIGN(start); - end &= PAGE_MASK; - for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { + start = (void *)PAGE_ALIGN((unsigned long)start); + end = (void *)((unsigned long)end & PAGE_MASK); + for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { if (poison) - memset((void *)pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page((void *)pos)); + memset(pos, poison, PAGE_SIZE); + free_reserved_page(virt_to_page(pos)); } if (pages && s) - pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", + pr_info("Freeing %s memory: %ldK (%p - %p)\n", s, pages << (PAGE_SHIFT - 10), start, end); return pages; } +EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) -- cgit v1.2.3 From dbe67df4ba78c79db547c7864e1120981c144c97 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:02:51 -0700 Subject: mm: enhance free_reserved_area() to support poisoning memory with zero Address more review comments from last round of code review. 1) Enhance free_reserved_area() to support poisoning freed memory with pattern '0'. This could be used to get rid of poison_init_mem() on ARM64. 2) A previous patch has disabled memory poison for initmem on s390 by mistake, so restore to the original behavior. 3) Remove redundant PAGE_ALIGN() when calling free_reserved_area(). Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: <sworddragon2@aol.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: David Howells <dhowells@redhat.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Michel Lespinasse <walken@google.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Rik van Riel <riel@redhat.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/alpha/kernel/sys_nautilus.c | 2 +- arch/alpha/mm/init.c | 4 ++-- arch/arc/mm/init.c | 4 ++-- arch/arm/mm/init.c | 8 ++++---- arch/arm64/mm/init.c | 4 ++-- arch/avr32/mm/init.c | 4 ++-- arch/blackfin/mm/init.c | 4 ++-- arch/c6x/mm/init.c | 4 ++-- arch/cris/mm/init.c | 2 +- arch/frv/mm/init.c | 4 ++-- arch/h8300/mm/init.c | 4 ++-- arch/ia64/mm/init.c | 2 +- arch/m32r/mm/init.c | 4 ++-- arch/m68k/mm/init.c | 4 ++-- arch/microblaze/mm/init.c | 4 ++-- arch/openrisc/mm/init.c | 4 ++-- arch/parisc/mm/init.c | 4 ++-- arch/powerpc/kernel/kvm.c | 9 ++------- arch/powerpc/mm/mem.c | 2 +- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 4 ++-- arch/um/kernel/mem.c | 2 +- arch/unicore32/mm/init.c | 4 ++-- arch/xtensa/mm/init.c | 4 ++-- include/linux/mm.h | 7 ++++--- mm/page_alloc.c | 2 +- 26 files changed, 49 insertions(+), 53 deletions(-) (limited to 'mm') diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c index 891bd274ccb5..837c0fa58317 100644 --- a/arch/alpha/kernel/sys_nautilus.c +++ b/arch/alpha/kernel/sys_nautilus.c @@ -239,7 +239,7 @@ nautilus_init_pci(void) memtop = pci_mem; if (memtop > alpha_mv.min_mem_address) { free_reserved_area(__va(alpha_mv.min_mem_address), - __va(memtop), 0, NULL); + __va(memtop), -1, NULL); printk("nautilus_init_pci: %ldk freed\n", (memtop - alpha_mv.min_mem_address) >> 10); } diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index d54848d4e464..218c29c14bb3 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -319,13 +319,13 @@ mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index dce02e4716a1..f9c707712096 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -146,13 +146,13 @@ void __init mem_init(void) */ void __init_refok free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 7fae391caf86..2070651c1bb4 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -601,7 +601,7 @@ void __init mem_init(void) #ifdef CONFIG_SA1111 /* now that our DMA memory is actually so designated, we can free it */ - free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, 0, NULL); + free_reserved_area(__va(PHYS_PFN_OFFSET), swapper_pg_dir, -1, NULL); #endif free_highpages(); @@ -729,12 +729,12 @@ void free_initmem(void) extern char __tcm_start, __tcm_end; poison_init_mem(&__tcm_start, &__tcm_end - &__tcm_start); - free_reserved_area(&__tcm_start, &__tcm_end, 0, "TCM link"); + free_reserved_area(&__tcm_start, &__tcm_end, -1, "TCM link"); #endif poison_init_mem(__init_begin, __init_end - __init_begin); if (!machine_is_integrator() && !machine_is_cintegrator()) - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD @@ -745,7 +745,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) { poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6041e4008a83..997c6345cdd6 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -387,7 +387,7 @@ void __init mem_init(void) void free_initmem(void) { poison_init_mem(__init_begin, __init_end - __init_begin); - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD @@ -398,7 +398,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) { poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } } diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c index 5a79fa08cb3c..b079e04f6954 100644 --- a/arch/avr32/mm/init.c +++ b/arch/avr32/mm/init.c @@ -148,12 +148,12 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c index 8e9eab272811..fa241f5a7dcf 100644 --- a/arch/blackfin/mm/init.c +++ b/arch/blackfin/mm/init.c @@ -133,7 +133,7 @@ void __init mem_init(void) void __init free_initrd_mem(unsigned long start, unsigned long end) { #ifndef CONFIG_MPU - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); #endif } #endif @@ -141,7 +141,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) void __init_refok free_initmem(void) { #if defined CONFIG_RAMKERNEL && !defined CONFIG_MPU - free_initmem_default(0); + free_initmem_default(-1); if (memory_start == (unsigned long)(&__init_end)) memory_start = (unsigned long)(&__init_begin); #endif diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 07bfcc98a3b7..3987a20fdee6 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -78,11 +78,11 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif void __init free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 9ac80946dada..8fec26392ae7 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -65,5 +65,5 @@ mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index a67f3a5897b8..8ba9d22d0d91 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -162,7 +162,7 @@ void __init mem_init(void) void free_initmem(void) { #if defined(CONFIG_RAMKERNEL) && !defined(CONFIG_PROTECT_KERNEL) - free_initmem_default(0); + free_initmem_default(-1); #endif } /* end free_initmem() */ @@ -173,6 +173,6 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } /* end free_initrd_mem() */ #endif diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 57e03c59861d..c831f1dba132 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -161,7 +161,7 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif @@ -169,7 +169,7 @@ void free_initmem(void) { #ifdef CONFIG_RAMKERNEL - free_initmem_default(0); + free_initmem_default(-1); #endif } diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index da568c2e839f..f8a4f38b0ad5 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -155,7 +155,7 @@ void free_initmem (void) { free_reserved_area(ia64_imva(__init_begin), ia64_imva(__init_end), - 0, "unused kernel"); + -1, "unused kernel"); } void __init diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index d80412d0c14e..cca87d918436 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -181,7 +181,7 @@ void __init mem_init(void) *======================================================================*/ void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD @@ -191,6 +191,6 @@ void free_initmem(void) *======================================================================*/ void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 95de725534e9..ab0b54ca5d85 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -110,7 +110,7 @@ void __init paging_init(void) void free_initmem(void) { #ifndef CONFIG_MMU_SUN3 - free_initmem_default(0); + free_initmem_default(-1); #endif /* CONFIG_MMU_SUN3 */ } @@ -202,6 +202,6 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index d7b8ada9345f..d149e0ebb767 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -235,13 +235,13 @@ void __init setup_memory(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } void __init mem_init(void) diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index ab113325dc4c..c371e4a0fcac 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -261,11 +261,11 @@ void __init mem_init(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3223d5e4a372..ebac7bd76b56 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -532,7 +532,7 @@ void free_initmem(void) * pages are no-longer executable */ flush_icache_range(init_begin, init_end); - num_physpages += free_initmem_default(0); + num_physpages += free_initmem_default(-1); /* set up a new led state on systems shipped LED State panel */ pdc_chassis_send_status(PDC_CHASSIS_DIRECT_BCOMPLETE); @@ -1101,7 +1101,7 @@ void flush_tlb_all(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - num_physpages += free_reserved_area((void *)start, (void *)end, 0, + num_physpages += free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 5e4830a33c08..db28032e320e 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -750,13 +750,8 @@ EXPORT_SYMBOL_GPL(kvm_hypercall); static __init void kvm_free_tmp(void) { - unsigned long start, end; - - start = (ulong)&kvm_tmp[kvm_tmp_index + (PAGE_SIZE - 1)] & PAGE_MASK; - end = (ulong)&kvm_tmp[ARRAY_SIZE(kvm_tmp)] & PAGE_MASK; - - /* Free the tmp space we don't need */ - free_reserved_area((void *)start, (void *)end, 0, NULL); + free_reserved_area(&kvm_tmp[kvm_tmp_index], + &kvm_tmp[ARRAY_SIZE(kvm_tmp)], -1, NULL); } static int __init kvm_guest_init(void) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 347c5b1bbd62..7f47a05f55af 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -407,7 +407,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void __init free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 0878c89fe7d2..bf01d18422ec 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -166,7 +166,7 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(POISON_FREE_INITMEM); } #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index b892a9b7d7e3..d3af56b7a098 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -499,13 +499,13 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 2aa7a2448d58..8ff0b7ae8ec0 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -244,7 +244,7 @@ void free_initmem(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 220755cc9700..df9b8abcb6a5 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -476,7 +476,7 @@ void __init mem_init(void) void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } #ifdef CONFIG_BLK_DEV_INITRD @@ -486,7 +486,7 @@ static int keep_initrd; void free_initrd_mem(unsigned long start, unsigned long end) { if (!keep_initrd) - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } static int __init keepinitrd_setup(char *__unused) diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 4d658efc3289..026d29bee30b 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -214,11 +214,11 @@ extern int initrd_is_mapped; void free_initrd_mem(unsigned long start, unsigned long end) { if (initrd_is_mapped) - free_reserved_area((void *)start, (void *)end, 0, "initrd"); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } #endif void free_initmem(void) { - free_initmem_default(0); + free_initmem_default(-1); } diff --git a/include/linux/mm.h b/include/linux/mm.h index be1b96ce0650..083cc0ba2384 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1308,7 +1308,7 @@ extern void free_initmem(void); /* * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK) * into the buddy system. The freed pages will be poisoned with pattern - * "poison" if it's non-zero. + * "poison" if it's within range [0, UCHAR_MAX]. * Return pages freed into the buddy system. */ extern unsigned long free_reserved_area(void *start, void *end, @@ -1348,8 +1348,9 @@ static inline void mark_page_reserved(struct page *page) /* * Default method to free all the __init memory into the buddy system. - * The freed pages will be poisoned with pattern "poison" if it is - * non-zero. Return pages freed into the buddy system. + * The freed pages will be poisoned with pattern "poison" if it's within + * range [0, UCHAR_MAX]. + * Return pages freed into the buddy system. */ static inline unsigned long free_initmem_default(int poison) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index be18ccd017bb..6780b2e18aa1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5214,7 +5214,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, char *s) start = (void *)PAGE_ALIGN((unsigned long)start); end = (void *)((unsigned long)end & PAGE_MASK); for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { - if (poison) + if ((unsigned int)poison <= 0xFF) memset(pos, poison, PAGE_SIZE); free_reserved_page(virt_to_page(pos)); } -- cgit v1.2.3 From 834405c3b6aebf6853663796401cdfe11aac6275 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:04 -0700 Subject: mm: fix some trivial typos in comments Fix some trivial typos in comments. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Minchan Kim <minchan@kernel.org> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: <sworddragon2@aol.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: David Howells <dhowells@redhat.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will.deacon@arm.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e3097f299f67..6096cb918735 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -309,7 +309,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, /* can't move pfns which are higher than @z2 */ if (end_pfn > zone_end_pfn(z2)) goto out_fail; - /* the move out part mast at the left most of @z2 */ + /* the move out part must be at the left most of @z2 */ if (start_pfn > z2->zone_start_pfn) goto out_fail; /* must included/overlap */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6780b2e18aa1..657daea88aa8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2845,7 +2845,7 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages() counts the number of counts pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: - * present_pages - high_pages + * managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { -- cgit v1.2.3 From 4f9f47745e948eca18bb97c82dbb4d53f2380086 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:07 -0700 Subject: mm: use managed_pages to calculate default zonelist order Use zone->managed_pages instead of zone->present_pages to calculate default zonelist order because managed_pages means allocatable pages. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Minchan Kim <minchan@kernel.org> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: <sworddragon2@aol.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: David Howells <dhowells@redhat.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 657daea88aa8..f22542f6dc12 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3438,8 +3438,8 @@ static int default_zonelist_order(void) z = &NODE_DATA(nid)->node_zones[zone_type]; if (populated_zone(z)) { if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; + low_kmem_size += z->managed_pages; + total_size += z->managed_pages; } else if (zone_type == ZONE_NORMAL) { /* * If any node has only lowmem, then node order -- cgit v1.2.3 From 7b4b2a0d6c8500350784beb83a6a55e60ea3bea3 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:11 -0700 Subject: mm: accurately calculate zone->managed_pages for highmem zones Commit "mm: introduce new field 'managed_pages' to struct zone" assumes that all highmem pages will be freed into the buddy system by function mem_init(). But that's not always true, some architectures may reserve some highmem pages during boot. For example PPC may allocate highmem pages for giagant HugeTLB pages, and several architectures have code to check PageReserved flag to exclude highmem pages allocated during boot when freeing highmem pages into the buddy system. So treat highmem pages in the same way as normal pages, that is to: 1) reset zone->managed_pages to zero in mem_init(). 2) recalculate managed_pages when freeing pages into the buddy system. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Tejun Heo <tj@kernel.org> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Minchan Kim <minchan@kernel.org> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: <sworddragon2@aol.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: David Howells <dhowells@redhat.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/metag/mm/init.c | 6 ++++++ arch/x86/mm/highmem_32.c | 6 ++++++ include/linux/bootmem.h | 1 + mm/bootmem.c | 32 ++++++++++++++++++-------------- mm/nobootmem.c | 30 ++++++++++++++++-------------- mm/page_alloc.c | 1 + 6 files changed, 48 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index 5e2238dd72e0..d7595f58fad5 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -380,6 +380,12 @@ void __init mem_init(void) #ifdef CONFIG_HIGHMEM unsigned long tmp; + + /* + * Explicitly reset zone->managed_pages because highmem pages are + * freed before calling free_all_bootmem_node(); + */ + reset_all_zones_managed_pages(); for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) free_highmem_page(pfn_to_page(tmp)); num_physpages += totalhigh_pages; diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 252b8f5489ba..4500142bc4aa 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -1,6 +1,7 @@ #include <linux/highmem.h> #include <linux/module.h> #include <linux/swap.h> /* for totalram_pages */ +#include <linux/bootmem.h> void *kmap(struct page *page) { @@ -121,6 +122,11 @@ void __init set_highmem_pages_init(void) struct zone *zone; int nid; + /* + * Explicitly reset zone->managed_pages because set_highmem_pages_init() + * is invoked before free_all_bootmem() + */ + reset_all_zones_managed_pages(); for_each_zone(zone) { unsigned long zone_start_pfn, zone_end_pfn; diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 5f0b0e1f7c08..0e48c3221d82 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -46,6 +46,7 @@ extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern unsigned long free_all_bootmem(void); +extern void reset_all_zones_managed_pages(void); extern void free_bootmem_node(pg_data_t *pgdat, unsigned long addr, diff --git a/mm/bootmem.c b/mm/bootmem.c index 2b0bcb019ec2..eb792323187b 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -241,20 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - /* - * In free_area_init_core(), highmem zone's managed_pages is set to - * present_pages, and bootmem allocator doesn't allocate from highmem - * zones. So there's no need to recalculate managed_pages because all - * highmem pages will be managed by the buddy system. Here highmem - * zone also includes highmem movable zone. - */ + if (reset_managed_pages_done) + return; + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - if (!is_highmem(z)) - z->managed_pages = 0; + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -266,7 +272,7 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); - reset_node_lowmem_managed_pages(pgdat); + reset_node_managed_pages(pgdat); return free_all_bootmem_core(pgdat->bdata); } @@ -279,10 +285,8 @@ unsigned long __init free_all_bootmem(void) { unsigned long total_pages = 0; bootmem_data_t *bdata; - struct pglist_data *pgdat; - for_each_online_pgdat(pgdat) - reset_node_lowmem_managed_pages(pgdat); + reset_all_zones_managed_pages(); list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bdd3fa2fc73b..0ae8d91365af 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void) return count; } -static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - /* - * In free_area_init_core(), highmem zone's managed_pages is set to - * present_pages, and bootmem allocator doesn't allocate from highmem - * zones. So there's no need to recalculate managed_pages because all - * highmem pages will be managed by the buddy system. Here highmem - * zone also includes highmem movable zone. - */ + if (reset_managed_pages_done) + return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - if (!is_highmem(z)) - z->managed_pages = 0; + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -160,10 +165,7 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { - struct pglist_data *pgdat; - - for_each_online_pgdat(pgdat) - reset_node_lowmem_managed_pages(pgdat); + reset_all_zones_managed_pages(); /* * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f22542f6dc12..22438eba00b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5232,6 +5232,7 @@ void free_highmem_page(struct page *page) { __free_reserved_page(page); totalram_pages++; + page_zone(page)->managed_pages++; totalhigh_pages++; } #endif -- cgit v1.2.3 From c3d5f5f0c2bc4eabeaf49f1a21e1aeb965246cd2 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:14 -0700 Subject: mm: use a dedicated lock to protect totalram_pages and zone->managed_pages Currently lock_memory_hotplug()/unlock_memory_hotplug() are used to protect totalram_pages and zone->managed_pages. Other than the memory hotplug driver, totalram_pages and zone->managed_pages may also be modified at runtime by other drivers, such as Xen balloon, virtio_balloon etc. For those cases, memory hotplug lock is a little too heavy, so introduce a dedicated lock to protect totalram_pages and zone->managed_pages. Now we have a simplified locking rules totalram_pages and zone->managed_pages as: 1) no locking for read accesses because they are unsigned long. 2) no locking for write accesses at boot time in single-threaded context. 3) serialize write accesses at runtime by acquiring the dedicated managed_page_count_lock. Also adjust zone->managed_pages when freeing reserved pages into the buddy system, to keep totalram_pages and zone->managed_pages in consistence. [akpm@linux-foundation.org: don't export adjust_managed_page_count to modules (for now)] Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Minchan Kim <minchan@kernel.org> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: <sworddragon2@aol.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: David Howells <dhowells@redhat.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/mm.h | 6 ++---- include/linux/mmzone.h | 14 ++++++++++---- mm/page_alloc.c | 11 +++++++++++ 3 files changed, 23 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 083cc0ba2384..4310f80ce956 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1313,6 +1313,7 @@ extern void free_initmem(void); */ extern unsigned long free_reserved_area(void *start, void *end, int poison, char *s); + #ifdef CONFIG_HIGHMEM /* * Free a highmem page into the buddy system, adjusting totalhigh_pages @@ -1321,10 +1322,7 @@ extern unsigned long free_reserved_area(void *start, void *end, extern void free_highmem_page(struct page *page); #endif -static inline void adjust_managed_page_count(struct page *page, long count) -{ - totalram_pages += count; -} +extern void adjust_managed_page_count(struct page *page, long count); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e511f9429f1e..09d381b71fd8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -474,10 +474,16 @@ struct zone { * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * - * Write access to present_pages and managed_pages at runtime should - * be protected by lock_memory_hotplug()/unlock_memory_hotplug(). - * Any reader who can't tolerant drift of present_pages and - * managed_pages should hold memory hotplug lock to get a stable value. + * Write access to present_pages at runtime should be protected by + * lock_memory_hotplug()/unlock_memory_hotplug(). Any reader who can't + * tolerant drift of present_pages should hold memory hotplug lock to + * get a stable value. + * + * Read access to managed_pages should be safe because it's unsigned + * long. Write access to zone->managed_pages and totalram_pages are + * protected by managed_page_count_lock at runtime. Idealy only + * adjust_managed_page_count() should be used instead of directly + * touching zone->managed_pages and totalram_pages. */ unsigned long spanned_pages; unsigned long present_pages; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22438eba00b6..93f292a60cb0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -103,6 +103,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); +/* Protect totalram_pages and zone->managed_pages */ +static DEFINE_SPINLOCK(managed_page_count_lock); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; /* @@ -5206,6 +5209,14 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +void adjust_managed_page_count(struct page *page, long count) +{ + spin_lock(&managed_page_count_lock); + page_zone(page)->managed_pages += count; + totalram_pages += count; + spin_unlock(&managed_page_count_lock); +} + unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { void *pos; -- cgit v1.2.3 From 170a5a7eb2bf10161197e5490fbc29ca4561aedb Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:17 -0700 Subject: mm: make __free_pages_bootmem() only available at boot time In order to simpilify management of totalram_pages and zone->managed_pages, make __free_pages_bootmem() only available at boot time. With this change applied, __free_pages_bootmem() will only be used by bootmem.c and nobootmem.c at boot time, so mark it as __init. Other callers of __free_pages_bootmem() have been converted to use free_reserved_page(), which handles totalram_pages and zone->managed_pages in a safer way. This patch also fix a bug in free_pagetable() for x86_64, which should increase zone->managed_pages instead of zone->present_pages when freeing reserved pages. And now we have managed_pages_count_lock to protect totalram_pages and zone->managed_pages, so remove the redundant ppb_lock lock in put_page_bootmem(). This greatly simplifies the locking rules. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@redhat.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Minchan Kim <minchan@kernel.org> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: <sworddragon2@aol.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: David Howells <dhowells@redhat.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tejun Heo <tj@kernel.org> Cc: Will Deacon <will.deacon@arm.com> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/x86/mm/init_64.c | 18 ++---------------- mm/memory_hotplug.c | 16 ++-------------- mm/page_alloc.c | 9 +-------- 3 files changed, 5 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b7bdf7bebf3b..ec312a92b137 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -712,36 +712,22 @@ EXPORT_SYMBOL_GPL(arch_add_memory); static void __meminit free_pagetable(struct page *page, int order) { - struct zone *zone; - bool bootmem = false; unsigned long magic; unsigned int nr_pages = 1 << order; /* bootmem page has reserved flag */ if (PageReserved(page)) { __ClearPageReserved(page); - bootmem = true; magic = (unsigned long)page->lru.next; if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); } else - __free_pages_bootmem(page, order); + while (nr_pages--) + free_reserved_page(page++); } else free_pages((unsigned long)page_address(page), order); - - /* - * SECTION_INFO pages and MIX_SECTION_INFO pages - * are all allocated by bootmem. - */ - if (bootmem) { - zone = page_zone(page); - zone_span_writelock(zone); - zone->present_pages += nr_pages; - zone_span_writeunlock(zone); - totalram_pages += nr_pages; - } } static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6096cb918735..814ecb2d262f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page, atomic_inc(&page->_count); } -/* reference to __meminit __free_pages_bootmem is valid - * so use __ref to tell modpost not to generate a warning */ -void __ref put_page_bootmem(struct page *page) +void put_page_bootmem(struct page *page) { unsigned long type; - static DEFINE_MUTEX(ppb_lock); type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); - - /* - * Please refer to comment for __free_pages_bootmem() - * for why we serialize here. - */ - mutex_lock(&ppb_lock); - __free_pages_bootmem(page, 0); - mutex_unlock(&ppb_lock); - totalram_pages++; + free_reserved_page(page); } - } #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 93f292a60cb0..2437a7e17aba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -745,14 +745,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -/* - * Read access to zone->managed_pages is safe because it's unsigned long, - * but we still need to serialize writers. Currently all callers of - * __free_pages_bootmem() except put_page_bootmem() should only be used - * at boot time. So for shorter boot time, we shift the burden to - * put_page_bootmem() to serialize writers. - */ -void __meminit __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; unsigned int loop; -- cgit v1.2.3 From 3dcc0571cd64816309765b7c7e4691a4cadf2ee7 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:21 -0700 Subject: mm: correctly update zone->managed_pages Enhance adjust_managed_page_count() to adjust totalhigh_pages for highmem pages. And change code which directly adjusts totalram_pages to use adjust_managed_page_count() because it adjusts totalram_pages, totalhigh_pages and zone->managed_pages altogether in a safe way. Remove inc_totalhigh_pages() and dec_totalhigh_pages() from xen/balloon driver bacause adjust_managed_page_count() has already adjusted totalhigh_pages. This patch also fixes two bugs: 1) enhances virtio_balloon driver to adjust totalhigh_pages when reserve/unreserve pages. 2) enhance memory_hotplug.c to adjust totalhigh_pages when hot-removing memory. We still need to deal with modifications of totalram_pages in file arch/powerpc/platforms/pseries/cmm.c, but need help from PPC experts. [akpm@linux-foundation.org: remove ifdef, per Wanpeng Li, virtio_balloon.c cleanup, per Sergei] [akpm@linux-foundation.org: export adjust_managed_page_count() to modules, for drivers/virtio/virtio_balloon.c] Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Minchan Kim <minchan@kernel.org> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: <sworddragon2@aol.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: David Howells <dhowells@redhat.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Will Deacon <will.deacon@arm.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- drivers/virtio/virtio_balloon.c | 7 ++++--- drivers/xen/balloon.c | 23 +++++------------------ mm/hugetlb.c | 2 +- mm/memory_hotplug.c | 16 +++------------- mm/page_alloc.c | 11 ++++++----- 5 files changed, 19 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index bd3ae324a1a2..0098810df69d 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -148,7 +148,7 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num) } set_page_pfns(vb->pfns + vb->num_pfns, page); vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE; - totalram_pages--; + adjust_managed_page_count(page, -1); } /* Did we get any? */ @@ -163,8 +163,9 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num) /* Find pfns pointing at start of each page, get pages and free them. */ for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { - balloon_page_free(balloon_pfn_to_page(pfns[i])); - totalram_pages++; + struct page *page = balloon_pfn_to_page(pfns[i]); + balloon_page_free(page); + adjust_managed_page_count(page, 1); } } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 930fb6817901..c8aab4e97833 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -89,14 +89,6 @@ EXPORT_SYMBOL_GPL(balloon_stats); /* We increase/decrease in batches which fit in a page */ static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; -#ifdef CONFIG_HIGHMEM -#define inc_totalhigh_pages() (totalhigh_pages++) -#define dec_totalhigh_pages() (totalhigh_pages--) -#else -#define inc_totalhigh_pages() do {} while (0) -#define dec_totalhigh_pages() do {} while (0) -#endif - /* List of ballooned pages, threaded through the mem_map array. */ static LIST_HEAD(ballooned_pages); @@ -132,9 +124,7 @@ static void __balloon_append(struct page *page) static void balloon_append(struct page *page) { __balloon_append(page); - if (PageHighMem(page)) - dec_totalhigh_pages(); - totalram_pages--; + adjust_managed_page_count(page, -1); } /* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ @@ -151,13 +141,12 @@ static struct page *balloon_retrieve(bool prefer_highmem) page = list_entry(ballooned_pages.next, struct page, lru); list_del(&page->lru); - if (PageHighMem(page)) { + if (PageHighMem(page)) balloon_stats.balloon_high--; - inc_totalhigh_pages(); - } else + else balloon_stats.balloon_low--; - totalram_pages++; + adjust_managed_page_count(page, 1); return page; } @@ -372,9 +361,7 @@ static enum bp_state increase_reservation(unsigned long nr_pages) #endif /* Relinquish the page back to the allocator. */ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } balloon_stats.current_pages += rc; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fe095158859e..83aff0a4d093 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void) * side-effects, like CommitLimit going negative. */ if (h->order > (MAX_ORDER - 1)) - totalram_pages += 1 << h->order; + adjust_managed_page_count(page, 1 << h->order); } } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 814ecb2d262f..5e34922124a3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -772,20 +772,13 @@ EXPORT_SYMBOL_GPL(__online_page_set_limits); void __online_page_increment_counters(struct page *page) { - totalram_pages++; - -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages++; -#endif + adjust_managed_page_count(page, 1); } EXPORT_SYMBOL_GPL(__online_page_increment_counters); void __online_page_free(struct page *page) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } EXPORT_SYMBOL_GPL(__online_page_free); @@ -983,7 +976,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ return ret; } - zone->managed_pages += onlined_pages; zone->present_pages += onlined_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); @@ -1572,15 +1564,13 @@ repeat: /* reset pagetype flags and makes migrate type to be MOVABLE */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ - zone->managed_pages -= offlined_pages; + adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); - totalram_pages -= offlined_pages; - init_per_zone_wmark_min(); if (!populated_zone(zone)) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2437a7e17aba..1481439ee2e4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -780,11 +780,7 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); __free_pages(page, pageblock_order); - totalram_pages += pageblock_nr_pages; -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages += pageblock_nr_pages; -#endif + adjust_managed_page_count(page, pageblock_nr_pages); } #endif @@ -5207,8 +5203,13 @@ void adjust_managed_page_count(struct page *page, long count) spin_lock(&managed_page_count_lock); page_zone(page)->managed_pages += count; totalram_pages += count; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += count; +#endif spin_unlock(&managed_page_count_lock); } +EXPORT_SYMBOL(adjust_managed_page_count); unsigned long free_reserved_area(void *start, void *end, int poison, char *s) { -- cgit v1.2.3 From 0c988534737a358fdff42fcce78f0ff1a12dbfc5 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:24 -0700 Subject: mm: concentrate modification of totalram_pages into the mm core Concentrate code to modify totalram_pages into the mm core, so the arch memory initialized code doesn't need to take care of it. With these changes applied, only following functions from mm core modify global variable totalram_pages: free_bootmem_late(), free_all_bootmem(), free_all_bootmem_node(), adjust_managed_page_count(). With this patch applied, it will be much more easier for us to keep totalram_pages and zone->managed_pages in consistence. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Acked-by: David Howells <dhowells@redhat.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: <sworddragon2@aol.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Michel Lespinasse <walken@google.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Rik van Riel <riel@redhat.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/alpha/mm/init.c | 2 +- arch/alpha/mm/numa.c | 2 +- arch/arc/mm/init.c | 2 +- arch/arm/mm/init.c | 3 +-- arch/arm64/mm/init.c | 2 +- arch/avr32/mm/init.c | 2 -- arch/blackfin/mm/init.c | 2 +- arch/c6x/mm/init.c | 2 +- arch/cris/mm/init.c | 2 +- arch/frv/mm/init.c | 2 +- arch/h8300/mm/init.c | 2 +- arch/hexagon/mm/init.c | 2 +- arch/ia64/mm/init.c | 2 +- arch/m32r/mm/init.c | 2 +- arch/m68k/mm/init.c | 4 ++-- arch/metag/mm/init.c | 5 +---- arch/microblaze/mm/init.c | 2 +- arch/mips/mm/init.c | 2 +- arch/mips/sgi-ip27/ip27-memory.c | 2 +- arch/mn10300/mm/init.c | 2 +- arch/openrisc/mm/init.c | 2 +- arch/parisc/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 5 ++--- arch/s390/mm/init.c | 2 +- arch/score/mm/init.c | 2 +- arch/sh/mm/init.c | 2 +- arch/sparc/mm/init_32.c | 3 +-- arch/sparc/mm/init_64.c | 2 +- arch/tile/mm/init.c | 2 +- arch/um/kernel/mem.c | 2 +- arch/unicore32/mm/init.c | 2 +- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/xtensa/mm/init.c | 2 +- mm/bootmem.c | 9 ++++++++- mm/nobootmem.c | 7 ++++++- 36 files changed, 50 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 218c29c14bb3..eee47a453d7d 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -309,7 +309,7 @@ void __init mem_init(void) { max_mapnr = num_physpages = max_low_pfn; - totalram_pages += free_all_bootmem(); + free_all_bootmem(); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); printk_memory_info(); diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c index 33885048fa36..857452c13c4d 100644 --- a/arch/alpha/mm/numa.c +++ b/arch/alpha/mm/numa.c @@ -334,7 +334,7 @@ void __init mem_init(void) /* * This will free up the bootmem, ie, slot 0 memory */ - totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); + free_all_bootmem_node(NODE_DATA(nid)); pfn = NODE_DATA(nid)->node_start_pfn; for (i = 0; i < node_spanned_pages(nid); i++, pfn++) diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index f9c707712096..c668a600f652 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -111,7 +111,7 @@ void __init mem_init(void) high_memory = (void *)(CONFIG_LINUX_LINK_BASE + arc_mem_sz); - totalram_pages = free_all_bootmem(); + free_all_bootmem(); /* count all reserved pages [kernel code/data/mem_map..] */ reserved_pages = 0; diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 2070651c1bb4..06e9ce17d1d2 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -596,8 +596,7 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ free_unused_memmap(&meminfo); - - totalram_pages += free_all_bootmem(); + free_all_bootmem(); #ifdef CONFIG_SA1111 /* now that our DMA memory is actually so designated, we can free it */ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index a398eb9018bb..93de98afedd7 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -284,7 +284,7 @@ void __init mem_init(void) free_unused_memmap(); #endif - totalram_pages += free_all_bootmem(); + free_all_bootmem(); reserved_pages = free_pages = 0; diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c index b079e04f6954..af6890fd7319 100644 --- a/arch/avr32/mm/init.c +++ b/arch/avr32/mm/init.c @@ -117,8 +117,6 @@ void __init mem_init(void) if (pgdat->node_spanned_pages != 0) node_pages = free_all_bootmem_node(pgdat); - totalram_pages += node_pages; - for (i = 0; i < node_pages; i++) if (PageReserved(pgdat->node_mem_map + i)) reservedpages++; diff --git a/arch/blackfin/mm/init.c b/arch/blackfin/mm/init.c index fa241f5a7dcf..c73d80ef564f 100644 --- a/arch/blackfin/mm/init.c +++ b/arch/blackfin/mm/init.c @@ -104,7 +104,7 @@ void __init mem_init(void) printk(KERN_DEBUG "Kernel managed physical pages: %lu\n", num_physpages); /* This will put all low memory onto the freelists. */ - totalram_pages = free_all_bootmem(); + free_all_bootmem(); reservedpages = 0; for (tmp = ARCH_PFN_OFFSET; tmp < max_mapnr; tmp++) diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 3987a20fdee6..c9ae8ce731d5 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -65,7 +65,7 @@ void __init mem_init(void) high_memory = (void *)(memory_end & PAGE_MASK); /* this will put all memory onto the freelists */ - totalram_pages = free_all_bootmem(); + free_all_bootmem(); codek = (_etext - _stext) >> 10; datak = (_end - _sdata) >> 10; diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 8fec26392ae7..52b8b56ae305 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -33,7 +33,7 @@ mem_init(void) max_mapnr = num_physpages = max_low_pfn - min_low_pfn; /* this will put all memory onto the freelists */ - totalram_pages = free_all_bootmem(); + free_all_bootmem(); reservedpages = 0; for (tmp = 0; tmp < max_mapnr; tmp++) { diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 8ba9d22d0d91..3dcc88803a4f 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -123,7 +123,7 @@ void __init mem_init(void) int codek = 0, datak = 0; /* this will put all low memory onto the freelists */ - totalram_pages = free_all_bootmem(); + free_all_bootmem(); #ifdef CONFIG_MMU for (loop = 0 ; loop < npages ; loop++) diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index c831f1dba132..a506dd4724e0 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -140,7 +140,7 @@ void __init mem_init(void) max_mapnr = num_physpages = MAP_NR(high_memory); /* this will put all low memory onto the freelists */ - totalram_pages = free_all_bootmem(); + free_all_bootmem(); codek = (_etext - _stext) >> 10; datak = (__bss_stop - _sdata) >> 10; diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 2561d259a296..0ab5b4350e93 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -70,7 +70,7 @@ unsigned long long kmap_generation; void __init mem_init(void) { /* No idea where this is actually declared. Seems to evade LXR. */ - totalram_pages += free_all_bootmem(); + free_all_bootmem(); num_physpages = bootmem_lastpg-ARCH_PFN_OFFSET; printk(KERN_INFO "totalram_pages = %ld\n", totalram_pages); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index f8a4f38b0ad5..d141f7ea0be5 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -622,7 +622,7 @@ mem_init (void) for_each_online_pgdat(pgdat) if (pgdat->bdata->node_bootmem_map) - totalram_pages += free_all_bootmem_node(pgdat); + free_all_bootmem_node(pgdat); reserved_pages = 0; efi_memmap_walk(count_reserved_pages, &reserved_pages); diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index cca87d918436..a501838233ab 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -158,7 +158,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ for_each_online_node(nid) - totalram_pages += free_all_bootmem_node(NODE_DATA(nid)); + free_all_bootmem_node(NODE_DATA(nid)); reservedpages = reservedpages_count() - hole_pages; codesize = (unsigned long) &_etext - (unsigned long)&_text; diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index ab0b54ca5d85..614c60a04459 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -155,11 +155,11 @@ void __init mem_init(void) int i; /* this will put all memory onto the freelists */ - totalram_pages = num_physpages = 0; + num_physpages = 0; for_each_online_pgdat(pgdat) { num_physpages += pgdat->node_present_pages; - totalram_pages += free_all_bootmem_node(pgdat); + free_all_bootmem_node(pgdat); for (i = 0; i < pgdat->node_spanned_pages; i++) { struct page *page = pgdat->node_mem_map + i; char *addr = page_to_virt(page); diff --git a/arch/metag/mm/init.c b/arch/metag/mm/init.c index d7595f58fad5..ce81d7c43983 100644 --- a/arch/metag/mm/init.c +++ b/arch/metag/mm/init.c @@ -393,14 +393,11 @@ void __init mem_init(void) for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); - unsigned long node_pages = 0; num_physpages += pgdat->node_present_pages; if (pgdat->node_spanned_pages) - node_pages = free_all_bootmem_node(pgdat); - - totalram_pages += node_pages; + free_all_bootmem_node(pgdat); } pr_info("Memory: %luk/%luk available\n", diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index d149e0ebb767..b384cbc2c8f2 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -252,7 +252,7 @@ void __init mem_init(void) high_memory = (void *)__va(memory_start + lowmem_size - 1); /* this will put all memory onto the freelists */ - totalram_pages += free_all_bootmem(); + free_all_bootmem(); for_each_online_pgdat(pgdat) { unsigned long i; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 268f2a94031b..e7333f15b1b7 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -374,7 +374,7 @@ void __init mem_init(void) #endif high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); - totalram_pages += free_all_bootmem(); + free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ reservedpages = ram = 0; diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 1230f56429d7..aecac4a08360 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -489,7 +489,7 @@ void __init mem_init(void) /* * This will free up the bootmem, ie, slot 0 memory. */ - totalram_pages += free_all_bootmem_node(NODE_DATA(node)); + free_all_bootmem_node(NODE_DATA(node)); } setup_zero_pages(); /* This comes from node 0 */ diff --git a/arch/mn10300/mm/init.c b/arch/mn10300/mm/init.c index e19049d1f2b9..7590d91627f2 100644 --- a/arch/mn10300/mm/init.c +++ b/arch/mn10300/mm/init.c @@ -114,7 +114,7 @@ void __init mem_init(void) memset(empty_zero_page, 0, PAGE_SIZE); /* this will put all low memory onto the freelists */ - totalram_pages += free_all_bootmem(); + free_all_bootmem(); reservedpages = 0; for (tmp = 0; tmp < num_physpages; tmp++) diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index c371e4a0fcac..16c1e135cf34 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -207,7 +207,7 @@ static int __init free_pages_init(void) int reservedpages, pfn; /* this will put all low memory onto the freelists */ - totalram_pages = free_all_bootmem(); + free_all_bootmem(); reservedpages = 0; for (pfn = 0; pfn < max_low_pfn; pfn++) { diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index ebac7bd76b56..d8aaaf06ede2 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -593,13 +593,13 @@ void __init mem_init(void) #ifndef CONFIG_DISCONTIGMEM max_mapnr = page_to_pfn(virt_to_page(high_memory - 1)) + 1; - totalram_pages += free_all_bootmem(); + free_all_bootmem(); #else { int i; for (i = 0; i < npmem_ranges; i++) - totalram_pages += free_all_bootmem_node(NODE_DATA(i)); + free_all_bootmem_node(NODE_DATA(i)); } #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 7f47a05f55af..3bcfc0d0d322 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -318,13 +318,12 @@ void __init mem_init(void) for_each_online_node(nid) { if (NODE_DATA(nid)->node_spanned_pages != 0) { printk("freeing bootmem node %d\n", nid); - totalram_pages += - free_all_bootmem_node(NODE_DATA(nid)); + free_all_bootmem_node(NODE_DATA(nid)); } } #else max_mapnr = max_pfn; - totalram_pages += free_all_bootmem(); + free_all_bootmem(); #endif for_each_online_pgdat(pgdat) { for (i = 0; i < pgdat->node_spanned_pages; i++) { diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index bf01d18422ec..a2aafe1b2300 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -144,7 +144,7 @@ void __init mem_init(void) cmma_init(); /* this will put all low memory onto the freelists */ - totalram_pages += free_all_bootmem(); + free_all_bootmem(); setup_zero_pages(); /* Setup zeroed pages. */ reservedpages = 0; diff --git a/arch/score/mm/init.c b/arch/score/mm/init.c index f5dd61eb4544..a8b917742dec 100644 --- a/arch/score/mm/init.c +++ b/arch/score/mm/init.c @@ -79,7 +79,7 @@ void __init mem_init(void) unsigned long tmp, ram = 0; high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); - totalram_pages += free_all_bootmem(); + free_all_bootmem(); setup_zero_page(); /* Setup zeroed pages. */ reservedpages = 0; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index d3af56b7a098..fc0c8e1c32a7 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -422,7 +422,7 @@ void __init mem_init(void) num_physpages += pgdat->node_present_pages; if (pgdat->node_spanned_pages) - totalram_pages += free_all_bootmem_node(pgdat); + free_all_bootmem_node(pgdat); node_high_memory = (void *)__va((pgdat->node_start_pfn + diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index d5f9c023826f..a438abb5495e 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -323,8 +323,7 @@ void __init mem_init(void) max_mapnr = last_valid_pfn - pfn_base; high_memory = __va(max_low_pfn << PAGE_SHIFT); - - totalram_pages = free_all_bootmem(); + free_all_bootmem(); for (i = 0; sp_banks[i].num_bytes != 0; i++) { unsigned long start_pfn = sp_banks[i].base_addr >> PAGE_SHIFT; diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 8269deb84eda..752d73837f9e 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2061,7 +2061,7 @@ void __init mem_init(void) high_memory = __va(last_valid_pfn << PAGE_SHIFT); register_page_bootmem_info(); - totalram_pages = free_all_bootmem(); + free_all_bootmem(); /* We subtract one to account for the mem_map_zero page * allocated below. diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index ccfeb3f2e769..45ce26d4e474 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -846,7 +846,7 @@ void __init mem_init(void) set_max_mapnr_init(); /* this will put all bootmem onto the freelists */ - totalram_pages += free_all_bootmem(); + free_all_bootmem(); #ifndef CONFIG_64BIT /* count all remaining LOWMEM and give all HIGHMEM to page allocator */ diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 8ff0b7ae8ec0..b0c763094ffb 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -65,7 +65,7 @@ void __init mem_init(void) uml_reserved = brk_end; /* this will put all low memory onto the freelists */ - totalram_pages = free_all_bootmem(); + free_all_bootmem(); max_low_pfn = totalram_pages; #ifdef CONFIG_HIGHMEM setup_highmem(end_iomem, highmem); diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index df9b8abcb6a5..7d1356c466b9 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -392,7 +392,7 @@ void __init mem_init(void) free_unused_memmap(&meminfo); /* this will put all unused low memory onto the freelists */ - totalram_pages += free_all_bootmem(); + free_all_bootmem(); reserved_pages = free_pages = 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 3ac7e319918d..9fa46baada27 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -759,7 +759,7 @@ void __init mem_init(void) set_highmem_pages_init(); /* this will put all low memory onto the freelists */ - totalram_pages += free_all_bootmem(); + free_all_bootmem(); reservedpages = 0; for (tmp = 0; tmp < max_low_pfn; tmp++) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec312a92b137..9577638f3ead 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1054,7 +1054,7 @@ void __init mem_init(void) register_page_bootmem_info(); /* this will put all memory onto the freelists */ - totalram_pages = free_all_bootmem(); + free_all_bootmem(); absent_pages = absent_pages_in_range(0, max_pfn); reservedpages = max_pfn - totalram_pages - absent_pages; diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 026d29bee30b..663c1619562c 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -184,7 +184,7 @@ void __init mem_init(void) #error HIGHGMEM not implemented in init.c #endif - totalram_pages += free_all_bootmem(); + free_all_bootmem(); reservedpages = ram = 0; for (tmp = 0; tmp < max_mapnr; tmp++) { diff --git a/mm/bootmem.c b/mm/bootmem.c index eb792323187b..58609bbf584e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -271,9 +271,14 @@ void __init reset_all_zones_managed_pages(void) */ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { + unsigned long pages; + register_page_bootmem_info_node(pgdat); reset_node_managed_pages(pgdat); - return free_all_bootmem_core(pgdat->bdata); + pages = free_all_bootmem_core(pgdat->bdata); + totalram_pages += pages; + + return pages; } /** @@ -291,6 +296,8 @@ unsigned long __init free_all_bootmem(void) list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); + totalram_pages += total_pages; + return total_pages; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 0ae8d91365af..61107cf55bb3 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -165,6 +165,8 @@ void __init reset_all_zones_managed_pages(void) */ unsigned long __init free_all_bootmem(void) { + unsigned long pages; + reset_all_zones_managed_pages(); /* @@ -172,7 +174,10 @@ unsigned long __init free_all_bootmem(void) * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ - return free_low_memory_core_early(); + pages = free_low_memory_core_early(); + totalram_pages += pages; + + return pages; } /** -- cgit v1.2.3 From cdd91a77043ba81585236ef61f65c18222b212e6 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:27 -0700 Subject: mm: report available pages as "MemTotal" for each NUMA node As reported by https://bugzilla.kernel.org/show_bug.cgi?id=53501, "MemTotal" from /proc/meminfo means memory pages managed by the buddy system (managed_pages), but "MemTotal" from /sys/.../node/nodex/meminfo means physical pages present (present_pages) within the NUMA node. There's a difference between managed_pages and present_pages due to bootmem allocator and reserved pages. And Documentation/filesystems/proc.txt says MemTotal: Total usable ram (i.e. physical ram minus a few reserved bits and the kernel binary code) So change /sys/.../node/nodex/meminfo to report available pages within the node as "MemTotal". Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Reported-by: <sworddragon2@aol.com> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Minchan Kim <minchan@kernel.org> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: David Howells <dhowells@redhat.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jeremy Fitzhardinge <jeremy@goop.org> Cc: Jianguo Wu <wujianguo@huawei.com> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Tejun Heo <tj@kernel.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Will Deacon <will.deacon@arm.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: Yinghai Lu <yinghai@kernel.org> Cc: Russell King <rmk@arm.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/page_alloc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1481439ee2e4..d9445c4f5fd7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2904,9 +2904,13 @@ EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_present_pages; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += pgdat->node_zones[zone_type].managed_pages; + val->totalram = managed_pages; val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; -- cgit v1.2.3 From e6c495a96ce02574e765d5140039a64c8d4e8c9e Mon Sep 17 00:00:00 2001 From: Vineet Gupta <Vineet.Gupta1@synopsys.com> Date: Wed, 3 Jul 2013 15:03:31 -0700 Subject: mm: fix the TLB range flushed when __tlb_remove_page() runs out of slots zap_pte_range loops from @addr to @end. In the middle, if it runs out of batching slots, TLB entries needs to be flushed for @start to @interim, NOT @interim to @end. Since ARC port doesn't use page free batching I can't test it myself but this seems like the right thing to do. Observed this when working on a fix for the issue at thread: http://www.spinics.net/lists/linux-arch/msg21736.html Signed-off-by: Vineet Gupta <vgupta@synopsys.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: David Rientjes <rientjes@google.com> Cc: Peter Zijlstra <peterz@infradead.org> Acked-by: Catalin Marinas <catalin.marinas@arm.com> Cc: Max Filippov <jcmvbkbc@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memory.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index a101bbcacfd7..407533219673 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1101,6 +1101,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + unsigned long range_start = addr; again: init_rss_vec(rss); @@ -1206,12 +1207,14 @@ again: force_flush = 0; #ifdef HAVE_GENERIC_MMU_GATHER - tlb->start = addr; - tlb->end = end; + tlb->start = range_start; + tlb->end = addr; #endif tlb_flush_mmu(tlb); - if (addr != end) + if (addr != end) { + range_start = addr; goto again; + } } return addr; -- cgit v1.2.3 From 7ee3d4e8cd560500192d80ca84d7f15d6dee0807 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:41 -0700 Subject: mm: introduce helper function mem_init_print_info() to simplify mem_init() Introduce helper function mem_init_print_info() to simplify mem_init() across different architectures, which also unifies the format and information printed. Function mem_init_print_info() calculates memory statistics information without walking each page, so it should be a little faster on some architectures. Also introduce another helper get_num_physpages() to kill the global variable num_physpages. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Minchan Kim <minchan@kernel.org> Cc: Marek Szyprowski <m.szyprowski@samsung.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/mm.h | 12 ++++++++++++ mm/page_alloc.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4310f80ce956..09c235301dbb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1323,6 +1323,7 @@ extern void free_highmem_page(struct page *page); #endif extern void adjust_managed_page_count(struct page *page, long count); +extern void mem_init_print_info(const char *str); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) @@ -1358,6 +1359,17 @@ static inline unsigned long free_initmem_default(int poison) poison, "unused kernel"); } +static inline unsigned long get_num_physpages(void) +{ + int nid; + unsigned long phys_pages = 0; + + for_each_online_node(nid) + phys_pages += node_present_pages(nid); + + return phys_pages; +} + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9445c4f5fd7..327516b7aee9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -61,6 +61,7 @@ #include <linux/hugetlb.h> #include <linux/sched/rt.h> +#include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" @@ -5246,6 +5247,57 @@ void free_highmem_page(struct page *page) } #endif + +void __init mem_init_print_info(const char *str) +{ + unsigned long physpages, codesize, datasize, rosize, bss_size; + unsigned long init_code_size, init_data_size; + + physpages = get_num_physpages(); + codesize = _etext - _stext; + datasize = _edata - _sdata; + rosize = __end_rodata - __start_rodata; + bss_size = __bss_stop - __bss_start; + init_data_size = __init_end - __init_begin; + init_code_size = _einittext - _sinittext; + + /* + * Detect special cases and adjust section sizes accordingly: + * 1) .init.* may be embedded into .data sections + * 2) .init.text.* may be out of [__init_begin, __init_end], + * please refer to arch/tile/kernel/vmlinux.lds.S. + * 3) .rodata.* may be embedded into .text or .data sections. + */ +#define adj_init_size(start, end, size, pos, adj) \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; + + adj_init_size(__init_begin, __init_end, init_data_size, + _sinittext, init_code_size); + adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); + adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); + adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); + adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); + +#undef adj_init_size + + printk("Memory: %luK/%luK available " + "(%luK kernel code, %luK rwdata, %luK rodata, " + "%luK init, %luK bss, %luK reserved" +#ifdef CONFIG_HIGHMEM + ", %luK highmem" +#endif + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages) << (PAGE_SHIFT-10), +#ifdef CONFIG_HIGHMEM + totalhigh_pages << (PAGE_SHIFT-10), +#endif + str ? ", " : "", str ? str : ""); +} + /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved -- cgit v1.2.3 From e461d627d5c0957457eb354843f3c29b50646d63 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:03:44 -0700 Subject: mm/hotplug: prepare for removing num_physpages Prepare for removing num_physpages. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memory_hotplug.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5e34922124a3..106602e5a70e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -763,10 +763,6 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback); void __online_page_set_limits(struct page *page) { - unsigned long pfn = page_to_pfn(page); - - if (pfn >= num_physpages) - num_physpages = pfn + 1; } EXPORT_SYMBOL_GPL(__online_page_set_limits); -- cgit v1.2.3 From 1895418189e08c1d1eec4fbdb5fb41d793f57ba5 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:04:21 -0700 Subject: mm: kill global variable num_physpages Now all references to num_physpages have been removed, so kill it. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Michel Lespinasse <walken@google.com> Cc: Rik van Riel <riel@redhat.com> Cc: Jiang Liu <jiang.liu@huawei.com> Cc: Hugh Dickins <hughd@google.com> Cc: David Rientjes <rientjes@google.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Konstantin Khlebnikov <khlebnikov@openvz.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/mm.h | 1 - mm/memory.c | 2 -- mm/nommu.c | 2 -- 3 files changed, 5 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 09c235301dbb..e550a1773a9e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -29,7 +29,6 @@ struct writeback_control; extern unsigned long max_mapnr; #endif -extern unsigned long num_physpages; extern unsigned long totalram_pages; extern void * high_memory; extern int page_cluster; diff --git a/mm/memory.c b/mm/memory.c index 407533219673..b68812d682b6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr); EXPORT_SYMBOL(mem_map); #endif -unsigned long num_physpages; /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end @@ -92,7 +91,6 @@ unsigned long num_physpages; */ void * high_memory; -EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); /* diff --git a/mm/nommu.c b/mm/nommu.c index 1898b2fe9da5..e44e6e0a125c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -56,7 +56,6 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; -unsigned long num_physpages; unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ @@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void) EXPORT_SYMBOL_GPL(vm_memory_committed); EXPORT_SYMBOL(mem_map); -EXPORT_SYMBOL(num_physpages); /* list of mapped, potentially shareable regions */ static struct kmem_cache *vm_region_jar; -- cgit v1.2.3 From e1280be0d8614be94e5bef48b6c830dfa03e82a7 Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Wed, 3 Jul 2013 15:04:34 -0700 Subject: mm: kill free_all_bootmem_node() Now nobody makes use of free_all_bootmem_node(), kill it. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Yinghai Lu <yinghai@kernel.org> Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/bootmem.h | 1 - mm/bootmem.c | 18 ------------------ 2 files changed, 19 deletions(-) (limited to 'mm') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 0e48c3221d82..f1f07d31a3af 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -44,7 +44,6 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat, unsigned long endpfn); extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); -extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern unsigned long free_all_bootmem(void); extern void reset_all_zones_managed_pages(void); diff --git a/mm/bootmem.c b/mm/bootmem.c index 58609bbf584e..6ab7744e692e 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -263,24 +263,6 @@ void __init reset_all_zones_managed_pages(void) reset_managed_pages_done = 1; } -/** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. - */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) -{ - unsigned long pages; - - register_page_bootmem_info_node(pgdat); - reset_node_managed_pages(pgdat); - pages = free_all_bootmem_core(pgdat->bdata); - totalram_pages += pages; - - return pages; -} - /** * free_all_bootmem - release free pages to the buddy allocator * -- cgit v1.2.3 From 55878e88c59221c3187e1c24ec3b15eb79c374c0 Mon Sep 17 00:00:00 2001 From: Cody P Schafer <cody@linux.vnet.ibm.com> Date: Wed, 3 Jul 2013 15:04:44 -0700 Subject: sparsemem: add BUILD_BUG_ON when sizeof mem_section is non-power-of-2 Instead of leaving a hidden trap for the next person who comes along and wants to add something to mem_section, add a big fat warning about it needing to be a power-of-2, and insert a BUILD_BUG_ON() in sparse_init() to catch mistakes. Right now non-power-of-2 mem_sections cause a number of WARNs at boot (which don't clearly point to the size of mem_section as an issue), but the system limps on (temporarily, at least). This is based upon Dave Hansen's earlier RFC where he ran into the same issue: "sparsemem: fix boot when SECTIONS_PER_ROOT is not power-of-2" http://lkml.indiana.edu/hypermail/linux/kernel/1205.2/03077.html Signed-off-by: Cody P Schafer <cody@linux.vnet.ibm.com> Acked-by: Dave Hansen <dave.hansen@linux.intel.com> Cc: Jiang Liu <liuj97@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/mmzone.h | 4 ++++ mm/sparse.c | 3 +++ 2 files changed, 7 insertions(+) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 09d381b71fd8..ae19af5ec02c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1137,6 +1137,10 @@ struct mem_section { struct page_cgroup *page_cgroup; unsigned long pad; #endif + /* + * WARNING: mem_section must be a power-of-2 in size for the + * calculation and use of SECTION_ROOT_MASK to make sense. + */ }; #ifdef CONFIG_SPARSEMEM_EXTREME diff --git a/mm/sparse.c b/mm/sparse.c index 1c91f0d3f6ab..3194ec414728 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -481,6 +481,9 @@ void __init sparse_init(void) struct page **map_map; #endif + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ set_pageblock_order(); -- cgit v1.2.3 From d82b1d85760a8344d06272da67f0684243235fac Mon Sep 17 00:00:00 2001 From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Date: Wed, 3 Jul 2013 15:04:47 -0700 Subject: mm, vmalloc: only call setup_vmalloc_vm() only in __get_vm_area_node() Now for insert_vmalloc_vm, it only calls the two functions: - setup_vmalloc_vm: fill vm_struct and vmap_area instances - clear_vm_unlist: clear VM_UNLIST bit in vm_struct->flags So in __get_vm_area_node(), if VM_UNLIST bit unset in flags, that is the else branch here, we don't need to clear VM_UNLIST bit for vm->flags since this bit is obviously not set. That is to say, we could only call setup_vmalloc_vm instead of insert_vmalloc_vm here. And then we could even remove the if test here. Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmalloc.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b7259906a806..d23e70ec45ad 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1367,16 +1367,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - /* - * When this function is called from __vmalloc_node_range, - * we add VM_UNLIST flag to avoid accessing uninitialized - * members of vm_struct such as pages and nr_pages fields. - * They will be set later. - */ - if (flags & VM_UNLIST) - setup_vmalloc_vm(area, va, flags, caller); - else - insert_vmalloc_vm(area, va, flags, caller); + setup_vmalloc_vm(area, va, flags, caller); return area; } -- cgit v1.2.3 From 3645cb4a4eb2002dad17b314559badf8a20e55a7 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Date: Wed, 3 Jul 2013 15:04:48 -0700 Subject: mm, vmalloc: call setup_vmalloc_vm() instead of insert_vmalloc_vm() Here we pass flags with only VM_ALLOC bit set, it is unnecessary to call clear_vm_unlist to clear VM_UNLIST bit. So use setup_vmalloc_vm instead of insert_vmalloc_vm. Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d23e70ec45ad..db48d513598f 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2526,8 +2526,8 @@ found: /* insert all vm's */ for (area = 0; area < nr_vms; area++) - insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, - pcpu_get_vm_areas); + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); kfree(vas); return vms; -- cgit v1.2.3 From f6d480059bedaf4feb06466c770f5fcace9eca31 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Date: Wed, 3 Jul 2013 15:04:49 -0700 Subject: mm, vmalloc: remove insert_vmalloc_vm() Now this function is nowhere used, we can remove it directly. Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Acked-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmalloc.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index db48d513598f..bd60bffd9aef 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1322,13 +1322,6 @@ static void clear_vm_unlist(struct vm_struct *vm) vm->flags &= ~VM_UNLIST; } -static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, const void *caller) -{ - setup_vmalloc_vm(vm, va, flags, caller); - clear_vm_unlist(vm); -} - static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) -- cgit v1.2.3 From 0f2d4a8e27108ad3b2555396b06392be590fe287 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Date: Wed, 3 Jul 2013 15:04:50 -0700 Subject: mm, vmalloc: use clamp() to simplify code Signed-off-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/vmalloc.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bd60bffd9aef..91a10472a39a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1330,16 +1330,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *area; BUG_ON(in_interrupt()); - if (flags & VM_IOREMAP) { - int bit = fls(size); - - if (bit > IOREMAP_MAX_ORDER) - bit = IOREMAP_MAX_ORDER; - else if (bit < PAGE_SHIFT) - bit = PAGE_SHIFT; - - align = 1ul << bit; - } + if (flags & VM_IOREMAP) + align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); size = PAGE_ALIGN(size); if (unlikely(!size)) -- cgit v1.2.3 From 519ebea3bf6df45439e79c54bda1d9e29fe13a64 Mon Sep 17 00:00:00 2001 From: Johannes Weiner <hannes@cmpxchg.org> Date: Wed, 3 Jul 2013 15:04:51 -0700 Subject: mm: memcontrol: factor out reclaim iterator loading and updating mem_cgroup_iter() is too hard to follow. Factor out the lockless reclaim iterator loading and updating so it's easier to follow the big picture. Also document the iterator invalidation mechanism a bit more extensively. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reported-by: Tejun Heo <tj@kernel.org> Reviewed-by: Tejun Heo <tj@kernel.org> Acked-by: Michal Hocko <mhocko@suse.cz> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Glauber Costa <glommer@parallels.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- mm/memcontrol.c | 86 ++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4748966b1511..2e851f453814 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1148,6 +1148,58 @@ skip_node: return NULL; } +static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) +{ + /* + * When a group in the hierarchy below root is destroyed, the + * hierarchy iterator can no longer be trusted since it might + * have pointed to the destroyed group. Invalidate it. + */ + atomic_inc(&root->dead_count); +} + +static struct mem_cgroup * +mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *root, + int *sequence) +{ + struct mem_cgroup *position = NULL; + /* + * A cgroup destruction happens in two stages: offlining and + * release. They are separated by a RCU grace period. + * + * If the iterator is valid, we may still race with an + * offlining. The RCU lock ensures the object won't be + * released, tryget will fail if we lost the race. + */ + *sequence = atomic_read(&root->dead_count); + if (iter->last_dead_count == *sequence) { + smp_rmb(); + position = iter->last_visited; + if (position && !css_tryget(&position->css)) + position = NULL; + } + return position; +} + +static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *last_visited, + struct mem_cgroup *new_position, + int sequence) +{ + if (last_visited) + css_put(&last_visited->css); + /* + * We store the sequence count from the time @last_visited was + * loaded successfully instead of rereading it here so that we + * don't lose destruction events in between. We could have + * raced with the destruction of @new_position after all. + */ + iter->last_visited = new_position; + smp_wmb(); + iter->last_dead_count = sequence; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1171,7 +1223,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - unsigned long uninitialized_var(dead_count); if (mem_cgroup_disabled()) return NULL; @@ -1191,6 +1242,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + int uninitialized_var(seq); if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -1204,37 +1256,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, goto out_unlock; } - /* - * If the dead_count mismatches, a destruction - * has happened or is happening concurrently. - * If the dead_count matches, a destruction - * might still happen concurrently, but since - * we checked under RCU, that destruction - * won't free the object until we release the - * RCU reader lock. Thus, the dead_count - * check verifies the pointer is still valid, - * css_tryget() verifies the cgroup pointed to - * is alive. - */ - dead_count = atomic_read(&root->dead_count); - if (dead_count == iter->last_dead_count) { - smp_rmb(); - last_visited = iter->last_visited; - if (last_visited && - !css_tryget(&last_visited->css)) - last_visited = NULL; - } + last_visited = mem_cgroup_iter_load(iter, root, &seq); } memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { - if (last_visited) - css_put(&last_visited->css); - - iter->last_visited = memcg; - smp_wmb(); - iter->last_dead_count = dead_count; + mem_cgroup_iter_update(iter, last_visited, memcg, seq); if (!memcg) iter->generation++; @@ -6318,14 +6346,14 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) struct mem_cgroup *parent = memcg; while ((parent = parent_mem_cgroup(parent))) - atomic_inc(&parent->dead_count); + mem_cgroup_iter_invalidate(parent); /* * if the root memcg is not hierarchical we have to check it * explicitely. */ if (!root_mem_cgroup->use_hierarchy) - atomic_inc(&root_mem_cgroup->dead_count); + mem_cgroup_iter_invalidate(root_mem_cgroup); } static void mem_cgroup_css_offline(struct cgroup *cont) -- cgit v1.2.3 From 02aa2a37636c8fa4fb9322d91be46ff8225b7de0 Mon Sep 17 00:00:00 2001 From: Kees Cook <keescook@chromium.org> Date: Wed, 3 Jul 2013 15:04:56 -0700 Subject: drivers: avoid format string in dev_set_name Calling dev_set_name with a single paramter causes it to be handled as a format string. Many callers are passing potentially dynamic string content, so use "%s" in those cases to avoid any potential accidents, including wrappers like device_create*() and bdi_register(). Signed-off-by: Kees Cook <keescook@chromium.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- drivers/base/attribute_container.c | 2 +- drivers/devfreq/devfreq.c | 2 +- drivers/extcon/extcon-class.c | 2 +- drivers/hsi/hsi.c | 2 +- drivers/ide/ide-cd.c | 2 +- drivers/ide/ide-gd.c | 2 +- drivers/ide/ide-probe.c | 4 ++-- drivers/ide/ide-tape.c | 2 +- drivers/infiniband/core/sysfs.c | 2 +- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- drivers/isdn/mISDN/dsp_pipeline.c | 2 +- drivers/mtd/mtdcore.c | 2 +- drivers/platform/x86/wmi.c | 2 +- drivers/scsi/sd.c | 2 +- drivers/staging/android/timed_output.c | 2 +- drivers/staging/dgrp/dgrp_sysfs.c | 2 +- drivers/uwb/lc-dev.c | 2 +- drivers/video/backlight/backlight.c | 2 +- drivers/video/backlight/lcd.c | 2 +- drivers/video/output.c | 2 +- drivers/xen/xenbus/xenbus_probe.c | 2 +- mm/backing-dev.c | 5 ++--- sound/sound_core.c | 2 +- 23 files changed, 25 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/drivers/base/attribute_container.c b/drivers/base/attribute_container.c index d78b204e65c1..ecc1929d7f6a 100644 --- a/drivers/base/attribute_container.c +++ b/drivers/base/attribute_container.c @@ -167,7 +167,7 @@ attribute_container_add_device(struct device *dev, ic->classdev.parent = get_device(dev); ic->classdev.class = cont->class; cont->class->dev_release = attribute_container_release; - dev_set_name(&ic->classdev, dev_name(dev)); + dev_set_name(&ic->classdev, "%s", dev_name(dev)); if (fn) fn(cont, dev, &ic->classdev); else diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c index 3b367973a802..af8372feb587 100644 --- a/drivers/devfreq/devfreq.c +++ b/drivers/devfreq/devfreq.c @@ -477,7 +477,7 @@ struct devfreq *devfreq_add_device(struct device *dev, GFP_KERNEL); devfreq->last_stat_updated = jiffies; - dev_set_name(&devfreq->dev, dev_name(dev)); + dev_set_name(&devfreq->dev, "%s", dev_name(dev)); err = device_register(&devfreq->dev); if (err) { put_device(&devfreq->dev); diff --git a/drivers/extcon/extcon-class.c b/drivers/extcon/extcon-class.c index 8c69803558fe..18ccadef43fd 100644 --- a/drivers/extcon/extcon-class.c +++ b/drivers/extcon/extcon-class.c @@ -602,7 +602,7 @@ int extcon_dev_register(struct extcon_dev *edev, struct device *dev) edev->dev->class = extcon_class; edev->dev->release = extcon_dev_release; - dev_set_name(edev->dev, edev->name ? edev->name : dev_name(dev)); + dev_set_name(edev->dev, "%s", edev->name ? edev->name : dev_name(dev)); if (edev->max_supported) { char buf[10]; diff --git a/drivers/hsi/hsi.c b/drivers/hsi/hsi.c index 833dd1afbf46..66d44581e1b1 100644 --- a/drivers/hsi/hsi.c +++ b/drivers/hsi/hsi.c @@ -75,7 +75,7 @@ static void hsi_new_client(struct hsi_port *port, struct hsi_board_info *info) cl->device.bus = &hsi_bus_type; cl->device.parent = &port->device; cl->device.release = hsi_client_release; - dev_set_name(&cl->device, info->name); + dev_set_name(&cl->device, "%s", info->name); cl->device.platform_data = info->platform_data; if (info->archdata) cl->device.archdata = *info->archdata; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 2ff620444930..0b510bafd90e 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1756,7 +1756,7 @@ static int ide_cd_probe(ide_drive_t *drive) info->dev.parent = &drive->gendev; info->dev.release = ide_cd_release; - dev_set_name(&info->dev, dev_name(&drive->gendev)); + dev_set_name(&info->dev, "%s", dev_name(&drive->gendev)); if (device_register(&info->dev)) goto out_free_disk; diff --git a/drivers/ide/ide-gd.c b/drivers/ide/ide-gd.c index de86631e767d..838996a0039e 100644 --- a/drivers/ide/ide-gd.c +++ b/drivers/ide/ide-gd.c @@ -392,7 +392,7 @@ static int ide_gd_probe(ide_drive_t *drive) idkp->dev.parent = &drive->gendev; idkp->dev.release = ide_disk_release; - dev_set_name(&idkp->dev, dev_name(&drive->gendev)); + dev_set_name(&idkp->dev, "%s", dev_name(&drive->gendev)); if (device_register(&idkp->dev)) goto out_free_disk; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 068cef0a987a..2a744a91370e 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -545,7 +545,7 @@ static int ide_register_port(ide_hwif_t *hwif) int ret; /* register with global device tree */ - dev_set_name(&hwif->gendev, hwif->name); + dev_set_name(&hwif->gendev, "%s", hwif->name); dev_set_drvdata(&hwif->gendev, hwif); if (hwif->gendev.parent == NULL) hwif->gendev.parent = hwif->dev; @@ -559,7 +559,7 @@ static int ide_register_port(ide_hwif_t *hwif) } hwif->portdev = device_create(ide_port_class, &hwif->gendev, - MKDEV(0, 0), hwif, hwif->name); + MKDEV(0, 0), hwif, "%s", hwif->name); if (IS_ERR(hwif->portdev)) { ret = PTR_ERR(hwif->portdev); device_unregister(&hwif->gendev); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index c6c574bd5f59..1793aea4a7d2 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -1985,7 +1985,7 @@ static int ide_tape_probe(ide_drive_t *drive) tape->dev.parent = &drive->gendev; tape->dev.release = ide_tape_release; - dev_set_name(&tape->dev, dev_name(&drive->gendev)); + dev_set_name(&tape->dev, "%s", dev_name(&drive->gendev)); if (device_register(&tape->dev)) goto out_free_disk; diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 246fdc151652..99904f7d59e3 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -813,7 +813,7 @@ int ib_device_register_sysfs(struct ib_device *device, class_dev->class = &ib_class; class_dev->parent = device->dma_device; - dev_set_name(class_dev, device->name); + dev_set_name(class_dev, "%s", device->name); dev_set_drvdata(class_dev, device); INIT_LIST_HEAD(&device->port_list); diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index b56c9428f3c5..9dd0bc89c3aa 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -2208,7 +2208,7 @@ int qib_cdev_init(int minor, const char *name, goto err_cdev; } - device = device_create(qib_class, NULL, dev, NULL, name); + device = device_create(qib_class, NULL, dev, NULL, "%s", name); if (!IS_ERR(device)) goto done; ret = PTR_ERR(device); diff --git a/drivers/isdn/mISDN/dsp_pipeline.c b/drivers/isdn/mISDN/dsp_pipeline.c index 88305c9cbff5..8b1a66c6ca8a 100644 --- a/drivers/isdn/mISDN/dsp_pipeline.c +++ b/drivers/isdn/mISDN/dsp_pipeline.c @@ -102,7 +102,7 @@ int mISDN_dsp_element_register(struct mISDN_dsp_element *elem) entry->dev.class = elements_class; entry->dev.release = mISDN_dsp_dev_release; dev_set_drvdata(&entry->dev, elem); - dev_set_name(&entry->dev, elem->name); + dev_set_name(&entry->dev, "%s", elem->name); ret = device_register(&entry->dev); if (ret) { printk(KERN_ERR "%s: failed to register %s\n", diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index c400c57c394a..048c823f5c51 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -1151,7 +1151,7 @@ static int __init mtd_bdi_init(struct backing_dev_info *bdi, const char *name) ret = bdi_init(bdi); if (!ret) - ret = bdi_register(bdi, NULL, name); + ret = bdi_register(bdi, NULL, "%s", name); if (ret) bdi_destroy(bdi); diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index e4ac38aca580..b13344c59808 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -743,7 +743,7 @@ static int wmi_create_device(const struct guid_block *gblock, wblock->dev.class = &wmi_class; wmi_gtoa(gblock->guid, guid_string); - dev_set_name(&wblock->dev, guid_string); + dev_set_name(&wblock->dev, "%s", guid_string); dev_set_drvdata(&wblock->dev, wblock); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index c1c555242d0d..8fa3d0b73ad9 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2931,7 +2931,7 @@ static int sd_probe(struct device *dev) device_initialize(&sdkp->dev); sdkp->dev.parent = dev; sdkp->dev.class = &sd_disk_class; - dev_set_name(&sdkp->dev, dev_name(dev)); + dev_set_name(&sdkp->dev, "%s", dev_name(dev)); if (device_add(&sdkp->dev)) goto out_free_index; diff --git a/drivers/staging/android/timed_output.c b/drivers/staging/android/timed_output.c index ec9e2ae2de0d..ee3a57f22832 100644 --- a/drivers/staging/android/timed_output.c +++ b/drivers/staging/android/timed_output.c @@ -78,7 +78,7 @@ int timed_output_dev_register(struct timed_output_dev *tdev) tdev->index = atomic_inc_return(&device_count); tdev->dev = device_create(timed_output_class, NULL, - MKDEV(0, tdev->index), NULL, tdev->name); + MKDEV(0, tdev->index), NULL, "%s", tdev->name); if (IS_ERR(tdev->dev)) return PTR_ERR(tdev->dev); diff --git a/drivers/staging/dgrp/dgrp_sysfs.c b/drivers/staging/dgrp/dgrp_sysfs.c index 7d1b36d1e75f..8cee9c8bc38b 100644 --- a/drivers/staging/dgrp/dgrp_sysfs.c +++ b/drivers/staging/dgrp/dgrp_sysfs.c @@ -273,7 +273,7 @@ void dgrp_create_node_class_sysfs_files(struct nd_struct *nd) sprintf(name, "node%ld", nd->nd_major); nd->nd_class_dev = device_create(dgrp_class, dgrp_class_nodes_dev, - MKDEV(0, nd->nd_major), NULL, name); + MKDEV(0, nd->nd_major), NULL, "%s", name); ret = sysfs_create_group(&nd->nd_class_dev->kobj, &dgrp_node_attribute_group); diff --git a/drivers/uwb/lc-dev.c b/drivers/uwb/lc-dev.c index 5241f1d0ef7a..9209eafc75b1 100644 --- a/drivers/uwb/lc-dev.c +++ b/drivers/uwb/lc-dev.c @@ -440,7 +440,7 @@ void uwbd_dev_onair(struct uwb_rc *rc, struct uwb_beca_e *bce) uwb_dev_init(uwb_dev); /* This sets refcnt to one, we own it */ uwb_dev->mac_addr = *bce->mac_addr; uwb_dev->dev_addr = bce->dev_addr; - dev_set_name(&uwb_dev->dev, macbuf); + dev_set_name(&uwb_dev->dev, "%s", macbuf); result = uwb_dev_add(uwb_dev, &rc->uwb_dev.dev, rc); if (result < 0) { dev_err(dev, "new device %s: cannot instantiate device\n", diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index c74e7aa46731..e3c279083253 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -304,7 +304,7 @@ struct backlight_device *backlight_device_register(const char *name, new_bd->dev.class = backlight_class; new_bd->dev.parent = parent; new_bd->dev.release = bl_device_release; - dev_set_name(&new_bd->dev, name); + dev_set_name(&new_bd->dev, "%s", name); dev_set_drvdata(&new_bd->dev, devdata); /* Set default properties */ diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c index 34fb6bd798c8..3649fd9ddb3a 100644 --- a/drivers/video/backlight/lcd.c +++ b/drivers/video/backlight/lcd.c @@ -219,7 +219,7 @@ struct lcd_device *lcd_device_register(const char *name, struct device *parent, new_ld->dev.class = lcd_class; new_ld->dev.parent = parent; new_ld->dev.release = lcd_device_release; - dev_set_name(&new_ld->dev, name); + dev_set_name(&new_ld->dev, "%s", name); dev_set_drvdata(&new_ld->dev, devdata); rc = device_register(&new_ld->dev); diff --git a/drivers/video/output.c b/drivers/video/output.c index 0d6f2cda9369..6285b9718451 100644 --- a/drivers/video/output.c +++ b/drivers/video/output.c @@ -97,7 +97,7 @@ struct output_device *video_output_register(const char *name, new_dev->props = op; new_dev->dev.class = &video_output_class; new_dev->dev.parent = dev; - dev_set_name(&new_dev->dev, name); + dev_set_name(&new_dev->dev, "%s", name); dev_set_drvdata(&new_dev->dev, devdata); ret_code = device_register(&new_dev->dev); if (ret_code) { diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 56cfaaa9d006..8c9db16500e3 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -447,7 +447,7 @@ int xenbus_probe_node(struct xen_bus_type *bus, if (err) goto fail; - dev_set_name(&xendev->dev, devname); + dev_set_name(&xendev->dev, "%s", devname); /* Register with generic device framework. */ err = device_register(&xendev->dev); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 502517492258..d014ee5fcbbd 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy); int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, unsigned int cap) { - char tmp[32]; int err; bdi->name = name; @@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, if (err) return err; - sprintf(tmp, "%.28s%s", name, "-%d"); - err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); + err = bdi_register(bdi, NULL, "%.28s-%ld", name, + atomic_long_inc_return(&bdi_seq)); if (err) { bdi_destroy(bdi); return err; diff --git a/sound/sound_core.c b/sound/sound_core.c index 359753fc24e1..45759f4cca75 100644 --- a/sound/sound_core.c +++ b/sound/sound_core.c @@ -292,7 +292,7 @@ retry: } device_create(sound_class, dev, MKDEV(SOUND_MAJOR, s->unit_minor), - NULL, s->name+6); + NULL, "%s", s->name+6); return s->unit_minor; fail: -- cgit v1.2.3