summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2014-01-12 14:12:44 +0100
committerIngo Molnar <mingo@kernel.org>2014-01-12 14:12:44 +0100
commitdba861461f88c12249ac78fb877866c04f99deb3 (patch)
tree5812b143581bcc66c7c542f01ba0cb22e489b8e5 /mm
parentMerge branch 'timers/posix-timers-for-tip-v2' of git://git.kernel.org/pub/scm... (diff)
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (diff)
downloadlinux-dba861461f88c12249ac78fb877866c04f99deb3.tar.xz
linux-dba861461f88c12249ac78fb877866c04f99deb3.zip
Merge branch 'linus' into timers/core
Pick up the latest fixes and refresh the branch. Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig2
-rw-r--r--mm/compaction.c4
-rw-r--r--mm/fremap.c8
-rw-r--r--mm/huge_memory.c54
-rw-r--r--mm/memcontrol.c43
-rw-r--r--mm/memory-failure.c24
-rw-r--r--mm/memory.c2
-rw-r--r--mm/mempolicy.c16
-rw-r--r--mm/migrate.c82
-rw-r--r--mm/mlock.c44
-rw-r--r--mm/mprotect.c13
-rw-r--r--mm/page_alloc.c19
-rw-r--r--mm/pgtable-generic.c8
-rw-r--r--mm/rmap.c4
-rw-r--r--mm/shmem.c36
15 files changed, 281 insertions, 78 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index eb69f352401d..723bbe04a0b0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -543,7 +543,7 @@ config ZSWAP
config MEM_SOFT_DIRTY
bool "Track memory changes"
- depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+ depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
select PROC_PAGE_MONITOR
help
This option enables memory changes tracking by introducing a
diff --git a/mm/compaction.c b/mm/compaction.c
index 805165bcd3dd..f58bcd016f43 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc,
bool migrate_scanner)
{
struct zone *zone = cc->zone;
+
+ if (cc->ignore_skip_hint)
+ return;
+
if (!page)
return;
diff --git a/mm/fremap.c b/mm/fremap.c
index 5bff08147768..bbc4d660221a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -208,9 +208,10 @@ get_write_lock:
if (mapping_cap_account_dirty(mapping)) {
unsigned long addr;
struct file *file = get_file(vma->vm_file);
+ /* mmap_region may free vma; grab the info now */
+ vm_flags = vma->vm_flags;
- addr = mmap_region(file, start, size,
- vma->vm_flags, pgoff);
+ addr = mmap_region(file, start, size, vm_flags, pgoff);
fput(file);
if (IS_ERR_VALUE(addr)) {
err = addr;
@@ -218,7 +219,7 @@ get_write_lock:
BUG_ON(addr != start);
err = 0;
}
- goto out;
+ goto out_freed;
}
mutex_lock(&mapping->i_mmap_mutex);
flush_dcache_mmap_lock(mapping);
@@ -253,6 +254,7 @@ get_write_lock:
out:
if (vma)
vm_flags = vma->vm_flags;
+out_freed:
if (likely(!has_write_lock))
up_read(&mm->mmap_sem);
else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bccd5a628ea6..9c0b17295ba0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -882,6 +882,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
ret = 0;
goto out_unlock;
}
+
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
spin_unlock(src_ptl);
@@ -1243,6 +1244,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
return ERR_PTR(-EFAULT);
+ /* Full NUMA hinting faults to serialise migration in fault paths */
+ if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ goto out;
+
page = pmd_page(*pmd);
VM_BUG_ON(!PageHead(page));
if (flags & FOLL_TOUCH) {
@@ -1295,6 +1300,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
+ /*
+ * If there are potential migrations, wait for completion and retry
+ * without disrupting NUMA hinting information. Do not relock and
+ * check_same as the page may no longer be mapped.
+ */
+ if (unlikely(pmd_trans_migrating(*pmdp))) {
+ spin_unlock(ptl);
+ wait_migrate_huge_page(vma->anon_vma, pmdp);
+ goto out;
+ }
+
page = pmd_page(pmd);
BUG_ON(is_huge_zero_page(page));
page_nid = page_to_nid(page);
@@ -1323,23 +1339,22 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* If the page was locked, there are no parallel migrations */
if (page_locked)
goto clear_pmdnuma;
+ }
- /*
- * Otherwise wait for potential migrations and retry. We do
- * relock and check_same as the page may no longer be mapped.
- * As the fault is being retried, do not account for it.
- */
+ /* Migration could have started since the pmd_trans_migrating check */
+ if (!page_locked) {
spin_unlock(ptl);
wait_on_page_locked(page);
page_nid = -1;
goto out;
}
- /* Page is misplaced, serialise migrations and parallel THP splits */
+ /*
+ * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
+ * to serialises splits
+ */
get_page(page);
spin_unlock(ptl);
- if (!page_locked)
- lock_page(page);
anon_vma = page_lock_anon_vma_read(page);
/* Confirm the PMD did not change while page_table_lock was released */
@@ -1351,6 +1366,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
}
+ /* Bail if we fail to protect against THP splits for any reason */
+ if (unlikely(!anon_vma)) {
+ put_page(page);
+ page_nid = -1;
+ goto clear_pmdnuma;
+ }
+
/*
* Migrate the THP to the requested node, returns with page unlocked
* and pmd_numa cleared.
@@ -1481,8 +1503,18 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- if (new_ptl != old_ptl)
+ if (new_ptl != old_ptl) {
+ pgtable_t pgtable;
+
+ /*
+ * Move preallocated PTE page table if new_pmd is on
+ * different PMD page table.
+ */
+ pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+ pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
+
spin_unlock(new_ptl);
+ }
spin_unlock(old_ptl);
}
out:
@@ -1507,6 +1539,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
ret = 1;
if (!prot_numa) {
entry = pmdp_get_and_clear(mm, addr, pmd);
+ if (pmd_numa(entry))
+ entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
BUG_ON(pmd_write(entry));
@@ -1521,7 +1555,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (!is_huge_zero_page(page) &&
!pmd_numa(*pmd)) {
- entry = pmdp_get_and_clear(mm, addr, pmd);
+ entry = *pmd;
entry = pmd_mknuma(entry);
ret = HPAGE_PMD_NR;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f1a0ae6e11b8..7f1a356153c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -338,7 +338,7 @@ struct mem_cgroup {
static size_t memcg_size(void)
{
return sizeof(struct mem_cgroup) +
- nr_node_ids * sizeof(struct mem_cgroup_per_node);
+ nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}
/* internal only representation about the status of kmem accounting. */
@@ -2694,7 +2694,10 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
goto bypass;
if (unlikely(task_in_memcg_oom(current)))
- goto bypass;
+ goto nomem;
+
+ if (gfp_mask & __GFP_NOFAIL)
+ oom = false;
/*
* We always charge the cgroup the mm_struct belongs to.
@@ -6352,6 +6355,42 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ /*
+ * XXX: css_offline() would be where we should reparent all
+ * memory to prepare the cgroup for destruction. However,
+ * memcg does not do css_tryget() and res_counter charging
+ * under the same RCU lock region, which means that charging
+ * could race with offlining. Offlining only happens to
+ * cgroups with no tasks in them but charges can show up
+ * without any tasks from the swapin path when the target
+ * memcg is looked up from the swapout record and not from the
+ * current task as it usually is. A race like this can leak
+ * charges and put pages with stale cgroup pointers into
+ * circulation:
+ *
+ * #0 #1
+ * lookup_swap_cgroup_id()
+ * rcu_read_lock()
+ * mem_cgroup_lookup()
+ * css_tryget()
+ * rcu_read_unlock()
+ * disable css_tryget()
+ * call_rcu()
+ * offline_css()
+ * reparent_charges()
+ * res_counter_charge()
+ * css_put()
+ * css_free()
+ * pc->mem_cgroup = dead memcg
+ * add page to lru
+ *
+ * The bulk of the charges are still moved in offline_css() to
+ * avoid pinning a lot of pages in case a long-term reference
+ * like a swapout record is deferring the css_free() to long
+ * after offlining. But this makes sure we catch any charges
+ * made after offlining:
+ */
+ mem_cgroup_reparent_charges(memcg);
memcg_destroy_kmem(memcg);
__mem_cgroup_free(memcg);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b7c171602ba1..fabe55046c1d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -938,6 +938,16 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
BUG_ON(!PageHWPoison(p));
return SWAP_FAIL;
}
+ /*
+ * We pinned the head page for hwpoison handling,
+ * now we split the thp and we are interested in
+ * the hwpoisoned raw page, so move the refcount
+ * to it.
+ */
+ if (hpage != p) {
+ put_page(hpage);
+ get_page(p);
+ }
/* THP is split, so ppage should be the real poisoned page. */
ppage = p;
}
@@ -1505,10 +1515,16 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (ret > 0)
ret = -EIO;
} else {
- set_page_hwpoison_huge_page(hpage);
- dequeue_hwpoisoned_huge_page(hpage);
- atomic_long_add(1 << compound_order(hpage),
- &num_poisoned_pages);
+ /* overcommit hugetlb page will be freed to buddy */
+ if (PageHuge(page)) {
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ atomic_long_add(1 << compound_order(hpage),
+ &num_poisoned_pages);
+ } else {
+ SetPageHWPoison(page);
+ atomic_long_inc(&num_poisoned_pages);
+ }
}
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 5d9025f3b3e1..6768ce9e57d2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4271,7 +4271,7 @@ void copy_user_huge_page(struct page *dst, struct page *src,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
+#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
bool ptlock_alloc(struct page *page)
{
spinlock_t *ptl;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eca4a3129129..0cd2c4d4e270 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1197,14 +1197,16 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
break;
vma = vma->vm_next;
}
+
+ if (PageHuge(page)) {
+ if (vma)
+ return alloc_huge_page_noerr(vma, address, 1);
+ else
+ return NULL;
+ }
/*
- * queue_pages_range() confirms that @page belongs to some vma,
- * so vma shouldn't be NULL.
+ * if !vma, alloc_page_vma() will use task or system default policy
*/
- BUG_ON(!vma);
-
- if (PageHuge(page))
- return alloc_huge_page_noerr(vma, address, 1);
return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
}
#else
@@ -1318,7 +1320,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
} else
- putback_lru_pages(&pagelist);
+ putback_movable_pages(&pagelist);
up_write(&mm->mmap_sem);
mpol_out:
diff --git a/mm/migrate.c b/mm/migrate.c
index bb940045fe85..9194375b2307 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,7 @@
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/balloon_compaction.h>
+#include <linux/mmu_notifier.h>
#include <asm/tlbflush.h>
@@ -316,14 +317,15 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
*/
int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page,
- struct buffer_head *head, enum migrate_mode mode)
+ struct buffer_head *head, enum migrate_mode mode,
+ int extra_count)
{
- int expected_count = 0;
+ int expected_count = 1 + extra_count;
void **pslot;
if (!mapping) {
/* Anonymous page without mapping */
- if (page_count(page) != 1)
+ if (page_count(page) != expected_count)
return -EAGAIN;
return MIGRATEPAGE_SUCCESS;
}
@@ -333,7 +335,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
pslot = radix_tree_lookup_slot(&mapping->page_tree,
page_index(page));
- expected_count = 2 + page_has_private(page);
+ expected_count += 1 + page_has_private(page);
if (page_count(page) != expected_count ||
radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
spin_unlock_irq(&mapping->tree_lock);
@@ -583,7 +585,7 @@ int migrate_page(struct address_space *mapping,
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode);
+ rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -610,7 +612,7 @@ int buffer_migrate_page(struct address_space *mapping,
head = page_buffers(page);
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode);
+ rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -1654,6 +1656,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
return 1;
}
+bool pmd_trans_migrating(pmd_t pmd)
+{
+ struct page *page = pmd_page(pmd);
+ return PageLocked(page);
+}
+
+void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+ struct page *page = pmd_page(*pmd);
+ wait_on_page_locked(page);
+}
+
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -1716,12 +1730,14 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
struct page *page, int node)
{
spinlock_t *ptl;
- unsigned long haddr = address & HPAGE_PMD_MASK;
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
struct mem_cgroup *memcg = NULL;
int page_lru = page_is_file_cache(page);
+ unsigned long mmun_start = address & HPAGE_PMD_MASK;
+ unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+ pmd_t orig_entry;
/*
* Rate-limit the amount of data that is being migrated to a node.
@@ -1744,6 +1760,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
goto out_fail;
}
+ if (mm_tlb_flush_pending(mm))
+ flush_tlb_range(vma, mmun_start, mmun_end);
+
/* Prepare a page as a migration target */
__set_page_locked(new_page);
SetPageSwapBacked(new_page);
@@ -1755,9 +1774,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
+ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
- if (unlikely(!pmd_same(*pmd, entry))) {
+ if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -1774,7 +1796,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
putback_lru_page(page);
mod_zone_page_state(page_zone(page),
NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
- goto out_fail;
+
+ goto out_unlock;
}
/*
@@ -1786,16 +1809,35 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
*/
mem_cgroup_prepare_migration(page, new_page, &memcg);
+ orig_entry = *pmd;
entry = mk_pmd(new_page, vma->vm_page_prot);
- entry = pmd_mknonnuma(entry);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
entry = pmd_mkhuge(entry);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_clear_flush(vma, haddr, pmd);
- set_pmd_at(mm, haddr, pmd, entry);
- page_add_new_anon_rmap(new_page, vma, haddr);
+ /*
+ * Clear the old entry under pagetable lock and establish the new PTE.
+ * Any parallel GUP will either observe the old page blocking on the
+ * page lock, block on the page table lock or observe the new page.
+ * The SetPageUptodate on the new page and page_add_new_anon_rmap
+ * guarantee the copy is visible before the pagetable update.
+ */
+ flush_cache_range(vma, mmun_start, mmun_end);
+ page_add_new_anon_rmap(new_page, vma, mmun_start);
+ pmdp_clear_flush(vma, mmun_start, pmd);
+ set_pmd_at(mm, mmun_start, pmd, entry);
+ flush_tlb_range(vma, mmun_start, mmun_end);
update_mmu_cache_pmd(vma, address, &entry);
+
+ if (page_count(page) != 2) {
+ set_pmd_at(mm, mmun_start, pmd, orig_entry);
+ flush_tlb_range(vma, mmun_start, mmun_end);
+ update_mmu_cache_pmd(vma, address, &entry);
+ page_remove_rmap(new_page);
+ goto fail_putback;
+ }
+
page_remove_rmap(page);
+
/*
* Finish the charge transaction under the page table lock to
* prevent split_huge_page() from dividing up the charge
@@ -1803,6 +1845,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
spin_unlock(ptl);
+ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(new_page);
unlock_page(page);
@@ -1820,10 +1863,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
out_fail:
count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
out_dropref:
- entry = pmd_mknonnuma(entry);
- set_pmd_at(mm, haddr, pmd, entry);
- update_mmu_cache_pmd(vma, address, &entry);
+ ptl = pmd_lock(mm, pmd);
+ if (pmd_same(*pmd, entry)) {
+ entry = pmd_mknonnuma(entry);
+ set_pmd_at(mm, mmun_start, pmd, entry);
+ update_mmu_cache_pmd(vma, address, &entry);
+ }
+ spin_unlock(ptl);
+out_unlock:
unlock_page(page);
put_page(page);
return 0;
diff --git a/mm/mlock.c b/mm/mlock.c
index d480cd6fc475..192e6eebe4f2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -133,7 +133,10 @@ static void __munlock_isolation_failed(struct page *page)
/**
* munlock_vma_page - munlock a vma page
- * @page - page to be unlocked
+ * @page - page to be unlocked, either a normal page or THP page head
+ *
+ * returns the size of the page as a page mask (0 for normal page,
+ * HPAGE_PMD_NR - 1 for THP head page)
*
* called from munlock()/munmap() path with page supposedly on the LRU.
* When we munlock a page, because the vma where we found the page is being
@@ -148,21 +151,30 @@ static void __munlock_isolation_failed(struct page *page)
*/
unsigned int munlock_vma_page(struct page *page)
{
- unsigned int page_mask = 0;
+ unsigned int nr_pages;
BUG_ON(!PageLocked(page));
if (TestClearPageMlocked(page)) {
- unsigned int nr_pages = hpage_nr_pages(page);
+ nr_pages = hpage_nr_pages(page);
mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- page_mask = nr_pages - 1;
if (!isolate_lru_page(page))
__munlock_isolated_page(page);
else
__munlock_isolation_failed(page);
+ } else {
+ nr_pages = hpage_nr_pages(page);
}
- return page_mask;
+ /*
+ * Regardless of the original PageMlocked flag, we determine nr_pages
+ * after touching the flag. This leaves a possible race with a THP page
+ * split, such that a whole THP page was munlocked, but nr_pages == 1.
+ * Returning a smaller mask due to that is OK, the worst that can
+ * happen is subsequent useless scanning of the former tail pages.
+ * The NR_MLOCK accounting can however become broken.
+ */
+ return nr_pages - 1;
}
/**
@@ -286,10 +298,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
{
int i;
int nr = pagevec_count(pvec);
- int delta_munlocked = -nr;
+ int delta_munlocked;
struct pagevec pvec_putback;
int pgrescued = 0;
+ pagevec_init(&pvec_putback, 0);
+
/* Phase 1: page isolation */
spin_lock_irq(&zone->lru_lock);
for (i = 0; i < nr; i++) {
@@ -318,18 +332,21 @@ skip_munlock:
/*
* We won't be munlocking this page in the next phase
* but we still need to release the follow_page_mask()
- * pin.
+ * pin. We cannot do it under lru_lock however. If it's
+ * the last pin, __page_cache_release would deadlock.
*/
+ pagevec_add(&pvec_putback, pvec->pages[i]);
pvec->pages[i] = NULL;
- put_page(page);
- delta_munlocked++;
}
}
+ delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
spin_unlock_irq(&zone->lru_lock);
+ /* Now we can release pins of pages that we are not munlocking */
+ pagevec_release(&pvec_putback);
+
/* Phase 2: page munlock */
- pagevec_init(&pvec_putback, 0);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
@@ -440,7 +457,8 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
while (start < end) {
struct page *page = NULL;
- unsigned int page_mask, page_increm;
+ unsigned int page_mask;
+ unsigned long page_increm;
struct pagevec pvec;
struct zone *zone;
int zoneid;
@@ -490,7 +508,9 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
goto next;
}
}
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ /* It's a bug to munlock in the middle of a THP page */
+ VM_BUG_ON((start >> PAGE_SHIFT) & page_mask);
+ page_increm = 1 + page_mask;
start += page_increm * PAGE_SIZE;
next:
cond_resched();
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 26667971c824..bb53a6591aea 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -52,17 +52,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t ptent;
bool updated = false;
- ptent = ptep_modify_prot_start(mm, addr, pte);
if (!prot_numa) {
+ ptent = ptep_modify_prot_start(mm, addr, pte);
+ if (pte_numa(ptent))
+ ptent = pte_mknonnuma(ptent);
ptent = pte_modify(ptent, newprot);
updated = true;
} else {
struct page *page;
+ ptent = *pte;
page = vm_normal_page(vma, addr, oldpte);
if (page) {
if (!pte_numa(oldpte)) {
ptent = pte_mknuma(ptent);
+ set_pte_at(mm, addr, pte, ptent);
updated = true;
}
}
@@ -79,7 +83,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (updated)
pages++;
- ptep_modify_prot_commit(mm, addr, pte, ptent);
+
+ /* Only !prot_numa always clears the pte */
+ if (!prot_numa)
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -181,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
BUG_ON(addr >= end);
pgd = pgd_offset(mm, addr);
flush_cache_range(vma, addr, end);
+ set_tlb_flush_pending(mm);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
@@ -192,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
/* Only flush the TLB if we actually modified any entries: */
if (pages)
flush_tlb_range(vma, start, end);
+ clear_tlb_flush_pending(mm);
return pages;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 580a5f075ed0..5248fe070aa4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1816,7 +1816,7 @@ static void zlc_clear_zones_full(struct zonelist *zonelist)
static bool zone_local(struct zone *local_zone, struct zone *zone)
{
- return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE;
+ return local_zone->node == zone->node;
}
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@ -1913,18 +1913,17 @@ zonelist_scan:
* page was allocated in should have no effect on the
* time the page has in memory before being reclaimed.
*
- * When zone_reclaim_mode is enabled, try to stay in
- * local zones in the fastpath. If that fails, the
- * slowpath is entered, which will do another pass
- * starting with the local zones, but ultimately fall
- * back to remote zones that do not partake in the
- * fairness round-robin cycle of this zonelist.
+ * Try to stay in local zones in the fastpath. If
+ * that fails, the slowpath is entered, which will do
+ * another pass starting with the local zones, but
+ * ultimately fall back to remote zones that do not
+ * partake in the fairness round-robin cycle of this
+ * zonelist.
*/
if (alloc_flags & ALLOC_WMARK_LOW) {
if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
continue;
- if (zone_reclaim_mode &&
- !zone_local(preferred_zone, zone))
+ if (!zone_local(preferred_zone, zone))
continue;
}
/*
@@ -2390,7 +2389,7 @@ static void prepare_slowpath(gfp_t gfp_mask, unsigned int order,
* thrash fairness information for zones that are not
* actually part of this zonelist's round-robin cycle.
*/
- if (zone_reclaim_mode && !zone_local(preferred_zone, zone))
+ if (!zone_local(preferred_zone, zone))
continue;
mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) -
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cbb38545d9d6..a8b919925934 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
pte_t *ptep)
{
+ struct mm_struct *mm = (vma)->vm_mm;
pte_t pte;
- pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
- if (pte_accessible(pte))
+ pte = ptep_get_and_clear(mm, address, ptep);
+ if (pte_accessible(mm, pte))
flush_tlb_page(vma, address);
return pte;
}
@@ -191,6 +192,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
+ pmd_t entry = *pmdp;
+ if (pmd_numa(entry))
+ entry = pmd_mknonnuma(entry);
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/mm/rmap.c b/mm/rmap.c
index 55c8b8dc9ffb..068522d8502a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -600,7 +600,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
spinlock_t *ptl;
if (unlikely(PageHuge(page))) {
+ /* when pud is not present, pte will be NULL */
pte = huge_pte_offset(mm, address);
+ if (!pte)
+ return NULL;
+
ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
goto check;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 8297623fcaed..902a14842b74 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2918,13 +2918,8 @@ static struct dentry_operations anon_ops = {
.d_dname = simple_dname
};
-/**
- * shmem_file_setup - get an unlinked file living in tmpfs
- * @name: name for dentry (to be seen in /proc/<pid>/maps
- * @size: size to be set for the file
- * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
- */
-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+static struct file *__shmem_file_setup(const char *name, loff_t size,
+ unsigned long flags, unsigned int i_flags)
{
struct file *res;
struct inode *inode;
@@ -2957,6 +2952,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
if (!inode)
goto put_dentry;
+ inode->i_flags |= i_flags;
d_instantiate(path.dentry, inode);
inode->i_size = size;
clear_nlink(inode); /* It is unlinked */
@@ -2977,6 +2973,32 @@ put_memory:
shmem_unacct_size(flags, size);
return res;
}
+
+/**
+ * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
+ * kernel internal. There will be NO LSM permission checks against the
+ * underlying inode. So users of this interface must do LSM checks at a
+ * higher layer. The one user is the big_key implementation. LSM checks
+ * are provided at the key level rather than the inode level.
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(name, size, flags, S_PRIVATE);
+}
+
+/**
+ * shmem_file_setup - get an unlinked file living in tmpfs
+ * @name: name for dentry (to be seen in /proc/<pid>/maps
+ * @size: size to be set for the file
+ * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
+ */
+struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
+{
+ return __shmem_file_setup(name, size, flags, 0);
+}
EXPORT_SYMBOL_GPL(shmem_file_setup);
/**