From 2aa15890f3c191326678f1bd68af61ec6b8753ec Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 23 Feb 2011 13:49:47 +0100 Subject: mm: prevent concurrent unmap_mapping_range() on the same inode Michael Leun reported that running parallel opens on a fuse filesystem can trigger a "kernel BUG at mm/truncate.c:475" Gurudas Pai reported the same bug on NFS. The reason is, unmap_mapping_range() is not prepared for more than one concurrent invocation per inode. For example: thread1: going through a big range, stops in the middle of a vma and stores the restart address in vm_truncate_count. thread2: comes in with a small (e.g. single page) unmap request on the same vma, somewhere before restart_address, finds that the vma was already unmapped up to the restart address and happily returns without doing anything. Another scenario would be two big unmap requests, both having to restart the unmapping and each one setting vm_truncate_count to its own value. This could go on forever without any of them being able to finish. Truncate and hole punching already serialize with i_mutex. Other callers of unmap_mapping_range() do not, and it's difficult to get i_mutex protection for all callers. In particular ->d_revalidate(), which calls invalidate_inode_pages2_range() in fuse, may be called with or without i_mutex. This patch adds a new mutex to 'struct address_space' to prevent running multiple concurrent unmap_mapping_range() on the same mapping. [ We'll hopefully get rid of all this with the upcoming mm preemptibility series by Peter Zijlstra, the "mm: Remove i_mmap_mutex lockbreak" patch in particular. But that is for 2.6.39 ] Signed-off-by: Miklos Szeredi Reported-by: Michael Leun Reported-by: Gurudas Pai Tested-by: Gurudas Pai Acked-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 8e8c18324863..5823698c2b71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping, details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; + mutex_lock(&mapping->unmap_mutex); spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ @@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping, if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->unmap_mutex); } EXPORT_SYMBOL(unmap_mapping_range); -- cgit v1.2.3 From a3e8cc643d22d2c8ed36b9be7d9c9ca21efcf7f7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Feb 2011 21:39:49 -0800 Subject: mm: fix possible cause of a page_mapped BUG Robert Swiecki reported a BUG_ON(page_mapped) from a fuzzer, punching a hole with madvise(,, MADV_REMOVE). That path is under mutex, and cannot be explained by lack of serialization in unmap_mapping_range(). Reviewing the code, I found one place where vm_truncate_count handling should have been updated, when I switched at the last minute from one way of managing the restart_addr to another: mremap move changes the virtual addresses, so it ought to adjust the restart_addr. But rather than exporting the notion of restart_addr from memory.c, or converting to restart_pgoff throughout, simply reset vm_truncate_count to 0 to force a rescan if mremap move races with preempted truncation. We have no confirmation that this fixes Robert's BUG, but it is a fix that's worth making anyway. Signed-off-by: Hugh Dickins Signed-off-by: Linus Torvalds --- mm/mremap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 9925b6391b80..1de98d492ddc 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -94,9 +94,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, */ mapping = vma->vm_file->f_mapping; spin_lock(&mapping->i_mmap_lock); - if (new_vma->vm_truncate_count && - new_vma->vm_truncate_count != vma->vm_truncate_count) - new_vma->vm_truncate_count = 0; + new_vma->vm_truncate_count = 0; } /* -- cgit v1.2.3 From 8074b26f67165bf045d92e778c9c10dc5e207fc6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 24 Feb 2011 15:49:53 +0100 Subject: mm: fix refcounting in swapon Grab a reference to bdev before calling blkdev_get(), which expects the refcount to be already incremented and either returns success or decrements the refcount and returns an error. The bug was introduced by e525fd89 (block: make blkdev_get/put() handle exclusive access), which didn't take into account this behavior of blkdev_get(). Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 07a458d72fa8..0341c5700e34 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1940,7 +1940,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EINVAL; if (S_ISBLK(inode->i_mode)) { - bdev = I_BDEV(inode); + bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL, sys_swapon); if (error < 0) { -- cgit v1.2.3 From a879bf582dfb3a79d30d76ca3af2ae8a0f39010c Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 25 Feb 2011 14:44:13 -0800 Subject: mm: grab rcu read lock in move_pages() The move_pages() usage of find_task_by_vpid() requires rcu_read_lock() to prevent free_pid() from reclaiming the pid. Without this patch, RCU warnings are printed in v2.6.38-rc4 move_pages() with: CONFIG_LOCKUP_DETECTOR=y CONFIG_PREEMPT=y CONFIG_LOCKDEP=y CONFIG_PROVE_LOCKING=y CONFIG_PROVE_RCU=y Previously, migrate_pages() went through a similar transformation replacing usage of tasklist_lock with rcu read lock: commit 55cfaa3cbdd29c4919ecb5fb8965c310f357e48c Author: Zeng Zhaoming Date: Thu Dec 2 14:31:13 2010 -0800 mm/mempolicy.c: add rcu read lock to protect pid structure commit 1e50df39f6e2c3a4a3394df62baa8a213df16c54 Author: KOSAKI Motohiro Date: Thu Jan 13 15:46:14 2011 -0800 mempolicy: remove tasklist_lock from migrate_pages Signed-off-by: Greg Thelen Cc: Mel Gorman Cc: Minchan Kim Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: "Paul E. McKenney" Cc: Tetsuo Handa Cc: Sergey Senozhatsky Cc: Oleg Nesterov Cc: Zeng Zhaoming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 766115253807..352de555626c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1287,14 +1287,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, return -EPERM; /* Find the mm_struct */ - read_lock(&tasklist_lock); + rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); + rcu_read_unlock(); return -ESRCH; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!mm) return -EINVAL; -- cgit v1.2.3 From 2876592f231d436c295b67726313f6f3cfb6e243 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 25 Feb 2011 14:44:20 -0800 Subject: mm: vmscan: stop reclaim/compaction earlier due to insufficient progress if !__GFP_REPEAT should_continue_reclaim() for reclaim/compaction allows scanning to continue even if pages are not being reclaimed until the full list is scanned. In terms of allocation success, this makes sense but potentially it introduces unwanted latency for high-order allocations such as transparent hugepages and network jumbo frames that would prefer to fail the allocation attempt and fallback to order-0 pages. Worse, there is a potential that the full LRU scan will clear all the young bits, distort page aging information and potentially push pages into swap that would have otherwise remained resident. This patch will stop reclaim/compaction if no pages were reclaimed in the last SWAP_CLUSTER_MAX pages that were considered. For allocations such as hugetlbfs that use __GFP_REPEAT and have fewer fallback options, the full LRU list may still be scanned. Order-0 allocation should not be affected because RECLAIM_MODE_COMPACTION is not set so the following avoids the gfp_mask being examined: if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; A tool was developed based on ftrace that tracked the latency of high-order allocations while transparent hugepage support was enabled and three benchmarks were run. The "fix-infinite" figures are 2.6.38-rc4 with Johannes's patch "vmscan: fix zone shrinking exit when scan work is done" applied. STREAM Highorder Allocation Latency Statistics fix-infinite break-early 1 :: Count 10298 10229 1 :: Min 0.4560 0.4640 1 :: Mean 1.0589 1.0183 1 :: Max 14.5990 11.7510 1 :: Stddev 0.5208 0.4719 2 :: Count 2 1 2 :: Min 1.8610 3.7240 2 :: Mean 3.4325 3.7240 2 :: Max 5.0040 3.7240 2 :: Stddev 1.5715 0.0000 9 :: Count 111696 111694 9 :: Min 0.5230 0.4110 9 :: Mean 10.5831 10.5718 9 :: Max 38.4480 43.2900 9 :: Stddev 1.1147 1.1325 Mean time for order-1 allocations is reduced. order-2 looks increased but with so few allocations, it's not particularly significant. THP mean allocation latency is also reduced. That said, allocation time varies so significantly that the reductions are within noise. Max allocation time is reduced by a significant amount for low-order allocations but reduced for THP allocations which presumably are now breaking before reclaim has done enough work. SysBench Highorder Allocation Latency Statistics fix-infinite break-early 1 :: Count 15745 15677 1 :: Min 0.4250 0.4550 1 :: Mean 1.1023 1.0810 1 :: Max 14.4590 10.8220 1 :: Stddev 0.5117 0.5100 2 :: Count 1 1 2 :: Min 3.0040 2.1530 2 :: Mean 3.0040 2.1530 2 :: Max 3.0040 2.1530 2 :: Stddev 0.0000 0.0000 9 :: Count 2017 1931 9 :: Min 0.4980 0.7480 9 :: Mean 10.4717 10.3840 9 :: Max 24.9460 26.2500 9 :: Stddev 1.1726 1.1966 Again, mean time for order-1 allocations is reduced while order-2 allocations are too few to draw conclusions from. The mean time for THP allocations is also slightly reduced albeit the reductions are within varianes. Once again, our maximum allocation time is significantly reduced for low-order allocations and slightly increased for THP allocations. Anon stream mmap reference Highorder Allocation Latency Statistics 1 :: Count 1376 1790 1 :: Min 0.4940 0.5010 1 :: Mean 1.0289 0.9732 1 :: Max 6.2670 4.2540 1 :: Stddev 0.4142 0.2785 2 :: Count 1 - 2 :: Min 1.9060 - 2 :: Mean 1.9060 - 2 :: Max 1.9060 - 2 :: Stddev 0.0000 - 9 :: Count 11266 11257 9 :: Min 0.4990 0.4940 9 :: Mean 27250.4669 24256.1919 9 :: Max 11439211.0000 6008885.0000 9 :: Stddev 226427.4624 186298.1430 This benchmark creates one thread per CPU which references an amount of anonymous memory 1.5 times the size of physical RAM. This pounds swap quite heavily and is intended to exercise THP a bit. Mean allocation time for order-1 is reduced as before. It's also reduced for THP allocations but the variations here are pretty massive due to swap. As before, maximum allocation times are significantly reduced. Overall, the patch reduces the mean and maximum allocation latencies for the smaller high-order allocations. This was with Slab configured so it would be expected to be more significant with Slub which uses these size allocations more aggressively. The mean allocation times for THP allocations are also slightly reduced. The maximum latency was slightly increased as predicted by the comments due to reclaim/compaction breaking early. However, workloads care more about the latency of lower-order allocations than THP so it's an acceptable trade-off. Signed-off-by: Mel Gorman Acked-by: Andrea Arcangeli Acked-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Andrea Arcangeli Acked-by: Rik van Riel Cc: Michal Hocko Cc: Kent Overstreet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 17497d0cd8b9..6771ea70bfe7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1841,16 +1841,28 @@ static inline bool should_continue_reclaim(struct zone *zone, if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; - /* - * If we failed to reclaim and have scanned the full list, stop. - * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far - * faster but obviously would be less likely to succeed - * allocation. If this is desirable, use GFP_REPEAT to decide - * if both reclaimed and scanned should be checked or just - * reclaimed - */ - if (!nr_reclaimed && !nr_scanned) - return false; + /* Consider stopping depending on scan and reclaim activity */ + if (sc->gfp_mask & __GFP_REPEAT) { + /* + * For __GFP_REPEAT allocations, stop reclaiming if the + * full LRU list has been scanned and we are still failing + * to reclaim pages. This full LRU scan is potentially + * expensive but a __GFP_REPEAT caller really wants to succeed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + } else { + /* + * For non-__GFP_REPEAT allocations which can presumably + * fail without consequence, stop if we failed to reclaim + * any pages from the last SWAP_CLUSTER_MAX number of + * pages that were scanned. This will return to the + * caller faster at the risk reclaim/compaction and + * the resulting allocation attempt fails + */ + if (!nr_reclaimed) + return false; + } /* * If we have not reclaimed enough pages for compaction and the -- cgit v1.2.3 From 29723fccc837d20039078f7a571e8d457eb0d6c6 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 25 Feb 2011 14:44:25 -0800 Subject: mm: fix dubious code in __count_immobile_pages() When pfn_valid_within() failed 'iter' was incremented twice. Signed-off-by: Namhyung Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a873e61e312e..cdef1d4b4e47 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5376,10 +5376,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; - if (!pfn_valid_within(check)) { - iter++; + if (!pfn_valid_within(check)) continue; - } + page = pfn_to_page(check); if (!page_count(page)) { if (PageBuddy(page)) -- cgit v1.2.3 From 8eac563c1c3a2047083022357ae63722b19e4e08 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 25 Feb 2011 14:44:28 -0800 Subject: thp: fix interleaving for transparent hugepages The THP code didn't pass the correct interleaving shift to the memory policy code. Fix this here by adjusting for the order. Signed-off-by: Andi Kleen Reviewed-by: Christoph Lameter Acked-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 368fc9d23610..49355a970be2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1830,7 +1830,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); -- cgit v1.2.3 From e5598f8bf5449bc09e4005600ead32e6f2a3e79b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 25 Feb 2011 14:44:29 -0800 Subject: memcg: more mem_cgroup_uncharge() batching It seems odd that truncate_inode_pages_range(), called not only when truncating but also when evicting inodes, has mem_cgroup_uncharge_start and _end() batching in its second loop to clear up a few leftovers, but not in its first loop that does almost all the work: add them there too. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Acked-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 49feb46e77b8..d64296be00d3 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -225,6 +225,7 @@ void truncate_inode_pages_range(struct address_space *mapping, next = start; while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index = page->index; @@ -247,6 +248,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } -- cgit v1.2.3 From 2f5f9486f8c12e3aa40fe3775a18cb14efc5cea2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:29 -0800 Subject: mm: change alloc_pages_vma to pass down the policy node for local policy Currently alloc_pages_vma() always uses the local node as policy node for the LOCAL policy. Pass this node down as an argument instead. No behaviour change from this patch, but will be needed for followons. Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 9 +++++---- mm/huge_memory.c | 2 +- mm/mempolicy.c | 11 +++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 0b84c61607e8..37b8af5db091 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -332,16 +332,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - struct vm_area_struct *vma, unsigned long addr); + struct vm_area_struct *vma, unsigned long addr, + int node); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_pages_vma(gfp_mask, order, vma, addr) \ +#define alloc_pages_vma(gfp_mask, order, vma, addr, node) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -#define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id()) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3e29781ee762..c7c2cd925599 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -653,7 +653,7 @@ static inline struct page *alloc_hugepage_vma(int defrag, unsigned long haddr) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr); + HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); } #ifndef CONFIG_NUMA diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 49355a970be2..25a5a9146619 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1524,10 +1524,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) } /* Return a zonelist indicated by gfp for node representing a mempolicy */ -static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) +static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, + int nd) { - int nd = numa_node_id(); - switch (policy->mode) { case MPOL_PREFERRED: if (!(policy->flags & MPOL_F_LOCAL)) @@ -1679,7 +1678,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, zl = node_zonelist(interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))), gfp_flags); } else { - zl = policy_zonelist(gfp_flags, *mpol); + zl = policy_zonelist(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } @@ -1820,7 +1819,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, int node) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1836,7 +1835,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, put_mems_allowed(); return page; } - zl = policy_zonelist(gfp, pol); + zl = policy_zonelist(gfp, pol, node); if (unlikely(mpol_needs_cond_ref(pol))) { /* * slow path: ref counted shared policy -- cgit v1.2.3 From 19ee151e140daa5183c4984981801e542e0544fb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:31 -0800 Subject: mm: preserve original node for transparent huge page copies This makes a difference for LOCAL policy, where the node cannot be determined from the policy itself, but has to be gotten from the original page. Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c7c2cd925599..1802db819e28 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -799,8 +799,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, + vma, address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_newpage_charge(pages[i], mm, GFP_KERNEL))) { -- cgit v1.2.3 From 5c4b4be3b6b937256103a5ae49177e0c3a17cb8f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 4 Mar 2011 17:36:32 -0800 Subject: mm: use correct numa policy node for transparent hugepages Pass down the correct node for a transparent hugepage allocation. Most callers continue to use the current node, however the hugepaged daemon now uses the previous node of the first to be collapsed page instead. This ensures that khugepaged does not mess up local memory for an existing process which uses local policy. The choice of node is somewhat primitive currently: it just uses the node of the first page in the pmd range. An alternative would be to look at multiple pages and use the most popular node. I used the simplest variant for now which should work well enough for the case of all pages being on the same node. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Andrea Arcangeli Signed-off-by: Andi Kleen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 24 +++++++++++++++++------- mm/mempolicy.c | 3 ++- 2 files changed, 19 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1802db819e28..dbe99a5f2073 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -650,10 +650,10 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag) static inline struct page *alloc_hugepage_vma(int defrag, struct vm_area_struct *vma, - unsigned long haddr) + unsigned long haddr, int nd) { return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), - HPAGE_PMD_ORDER, vma, haddr, numa_node_id()); + HPAGE_PMD_ORDER, vma, haddr, nd); } #ifndef CONFIG_NUMA @@ -678,7 +678,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -902,7 +902,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), - vma, haddr); + vma, haddr, numa_node_id()); else new_page = NULL; @@ -1745,7 +1745,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int node) { pgd_t *pgd; pud_t *pud; @@ -1773,7 +1774,8 @@ static void collapse_huge_page(struct mm_struct *mm, * mmap_sem in read mode is good idea also to allow greater * scalability. */ - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node); if (unlikely(!new_page)) { up_read(&mm->mmap_sem); *hpage = ERR_PTR(-ENOMEM); @@ -1919,6 +1921,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, struct page *page; unsigned long _address; spinlock_t *ptl; + int node = -1; VM_BUG_ON(address & ~HPAGE_PMD_MASK); @@ -1949,6 +1952,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, page = vm_normal_page(vma, _address, pteval); if (unlikely(!page)) goto out_unmap; + /* + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. + */ + if (node == -1) + node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -1965,7 +1975,7 @@ out_unmap: pte_unmap_unlock(pte, ptl); if (ret) /* collapse_huge_page will return with the mmap_sem released */ - collapse_huge_page(mm, address, hpage, vma); + collapse_huge_page(mm, address, hpage, vma, node); out: return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 25a5a9146619..b53ec99f1428 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1891,7 +1891,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, - policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + policy_zonelist(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit v1.2.3