summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2016-01-15 20:41:44 +0100
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-01-15 20:41:44 +0100
commit 875fc4f5ddf35605581f9a5900c14afef48611f2 (patch)
tree e237a28a71a5d1e72eaf0ecda737eb5c8614c72c /mm
parent Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/jik... (diff)
parent zsmalloc: reorganize struct size_class to pack 4 bytes hole (diff)
download linux-875fc4f5ddf35605581f9a5900c14afef48611f2.tar.xz
linux-875fc4f5ddf35605581f9a5900c14afef48611f2.zip
Merge branch 'akpm' (patches from Andrew)
Merge first patch-bomb from Andrew Morton: - A few hotfixes which missed 4.4 because I was asleep. cc'ed to -stable - A few misc fixes - OCFS2 updates - Part of MM. Including pretty large changes to page-flags handling and to thp management which have been buffered up for 2-3 cycles now. I have a lot of MM material this time. [ It turns out the THP part wasn't quite ready, so that got dropped from this series - Linus ] * emailed patches from Andrew Morton <akpm@linux-foundation.org>: (117 commits) zsmalloc: reorganize struct size_class to pack 4 bytes hole mm/zbud.c: use list_last_entry() instead of list_tail_entry() zram/zcomp: do not zero out zcomp private pages zram: pass gfp from zcomp frontend to backend zram: try vmalloc() after kmalloc() zram/zcomp: use GFP_NOIO to allocate streams mm: add tracepoint for scanning pages drivers/base/memory.c: fix kernel warning during memory hotplug on ppc64 mm/page_isolation: use macro to judge the alignment mm: fix noisy sparse warning in LIBCFS_ALLOC_PRE() mm: rework virtual memory accounting include/linux/memblock.h: fix ordering of 'flags' argument in comments mm: move lru_to_page to mm_inline.h Documentation/filesystems: describe the shared memory usage/accounting memory-hotplug: don't BUG() in register_memory_resource() hugetlb: make mm and fs code explicitly non-modular mm/swapfile.c: use list_for_each_entry_safe in free_swap_count_continuations mm: /proc/pid/clear_refs: no need to clear VM_SOFTDIRTY in clear_soft_dirty_pmd() mm: make sure isolate_lru_page() is never called for tail page vmstat: make vmstat_updater deferrable again and shut down on idle ...
Diffstat (limited to 'mm')
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/compaction.c18
-rw-r--r--mm/debug.c4
-rw-r--r--mm/filemap.c9
-rw-r--r--mm/huge_memory.c166
-rw-r--r--mm/hugetlb.c39
-rw-r--r--mm/kmemleak.c3
-rw-r--r--mm/ksm.c20
-rw-r--r--mm/memblock.c45
-rw-r--r--mm/memcontrol.c310
-rw-r--r--mm/memory.c47
-rw-r--r--mm/memory_hotplug.c9
-rw-r--r--mm/mempolicy.c30
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mmap.c81
-rw-r--r--mm/mmzone.c8
-rw-r--r--mm/mprotect.c8
-rw-r--r--mm/mremap.c7
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c5
-rw-r--r--mm/page-writeback.c14
-rw-r--r--mm/page_alloc.c158
-rw-r--r--mm/page_isolation.c22
-rw-r--r--mm/pgtable-generic.c9
-rw-r--r--mm/readahead.c9
-rw-r--r--mm/rmap.c18
-rw-r--r--mm/shmem.c83
-rw-r--r--mm/slab.c48
-rw-r--r--mm/slab.h5
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/slub.c2
-rw-r--r--mm/swapfile.c23
-rw-r--r--mm/vmalloc.c23
-rw-r--r--mm/vmpressure.c78
-rw-r--r--mm/vmscan.c40
-rw-r--r--mm/vmstat.c69
-rw-r--r--mm/zbud.c5
-rw-r--r--mm/zsmalloc.c4
38 files changed, 871 insertions, 557 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7340353f8aea..cc5d29d2da9b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -672,7 +672,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
- bdi->wb.memcg_css = mem_cgroup_root_css;
+ bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
}
return ret;
diff --git a/mm/compaction.c b/mm/compaction.c
index de3e1e71cd9f..585de54dbe8c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1658,14 +1658,15 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
!compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
- if (cc->order > 0) {
- if (zone_watermark_ok(zone, cc->order,
- low_wmark_pages(zone), 0, 0))
- compaction_defer_reset(zone, cc->order, false);
- }
-
VM_BUG_ON(!list_empty(&cc->freepages));
VM_BUG_ON(!list_empty(&cc->migratepages));
+
+ if (is_via_compact_memory(cc->order))
+ continue;
+
+ if (zone_watermark_ok(zone, cc->order,
+ low_wmark_pages(zone), 0, 0))
+ compaction_defer_reset(zone, cc->order, false);
}
}
@@ -1708,7 +1709,10 @@ static void compact_nodes(void)
/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;
-/* This is the entry point for compacting all nodes via /proc/sys/vm */
+/*
+ * This is the entry point for compacting all nodes via
+ * /proc/sys/vm/compact_memory
+ */
int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
diff --git a/mm/debug.c b/mm/debug.c
index 668aa35191ca..5d2072ed8d5e 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -175,7 +175,7 @@ void dump_mm(const struct mm_struct *mm)
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
- "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
+ "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -209,7 +209,7 @@ void dump_mm(const struct mm_struct *mm)
mm_nr_pmds((struct mm_struct *)mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
- mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
+ mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
diff --git a/mm/filemap.c b/mm/filemap.c
index 1bb007624b53..ff42d31c891a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1812,19 +1812,18 @@ EXPORT_SYMBOL(generic_file_read_iter);
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
*/
-static int page_cache_read(struct file *file, pgoff_t offset)
+static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
int ret;
do {
- page = page_cache_alloc_cold(mapping);
+ page = __page_cache_alloc(gfp_mask|__GFP_COLD);
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
+ ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
@@ -2005,7 +2004,7 @@ no_cached_page:
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
- error = page_cache_read(file, offset);
+ error = page_cache_read(file, offset, vmf->gfp_mask);
/*
* The page we want has now been added to the page cache.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 62fe06bb7d04..f952f055fdcf 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -31,6 +31,33 @@
#include <asm/pgalloc.h>
#include "internal.h"
+enum scan_result {
+ SCAN_FAIL,
+ SCAN_SUCCEED,
+ SCAN_PMD_NULL,
+ SCAN_EXCEED_NONE_PTE,
+ SCAN_PTE_NON_PRESENT,
+ SCAN_PAGE_RO,
+ SCAN_NO_REFERENCED_PAGE,
+ SCAN_PAGE_NULL,
+ SCAN_SCAN_ABORT,
+ SCAN_PAGE_COUNT,
+ SCAN_PAGE_LRU,
+ SCAN_PAGE_LOCK,
+ SCAN_PAGE_ANON,
+ SCAN_ANY_PROCESS,
+ SCAN_VMA_NULL,
+ SCAN_VMA_CHECK,
+ SCAN_ADDRESS_RANGE,
+ SCAN_SWAP_CACHE_PAGE,
+ SCAN_DEL_PAGE_LRU,
+ SCAN_ALLOC_HUGE_PAGE_FAIL,
+ SCAN_CGROUP_CHARGE_FAIL
+};
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/huge_memory.h>
+
/*
* By default transparent hugepage support is disabled in order that avoid
* to risk increase the memory footprint of applications without a guaranteed
@@ -2198,26 +2225,33 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
pte_t *pte)
{
- struct page *page;
+ struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0;
+ int none_or_zero = 0, result = 0;
bool referenced = false, writable = false;
+
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
is_zero_pfn(pte_pfn(pteval)))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out;
+ }
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
goto out;
+ }
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
@@ -2229,8 +2263,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
- if (!trylock_page(page))
+ if (!trylock_page(page)) {
+ result = SCAN_PAGE_LOCK;
goto out;
+ }
/*
* cannot use mapcount: can't collapse if there's a gup pin.
@@ -2239,6 +2275,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (page_count(page) != 1 + !!PageSwapCache(page)) {
unlock_page(page);
+ result = SCAN_PAGE_COUNT;
goto out;
}
if (pte_write(pteval)) {
@@ -2246,6 +2283,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
} else {
if (PageSwapCache(page) && !reuse_swap_page(page)) {
unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
goto out;
}
/*
@@ -2260,6 +2298,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (isolate_lru_page(page)) {
unlock_page(page);
+ result = SCAN_DEL_PAGE_LRU;
goto out;
}
/* 0 stands for page_is_file_cache(page) == false */
@@ -2273,10 +2312,21 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (likely(referenced && writable))
- return 1;
+ if (likely(writable)) {
+ if (likely(referenced)) {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+ referenced, writable, result);
+ return 1;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
+
out:
release_pte_pages(pte, _pte);
+ trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero,
+ referenced, writable, result);
return 0;
}
@@ -2513,7 +2563,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
- int isolated;
+ int isolated, result = 0;
unsigned long hstart, hend;
struct mem_cgroup *memcg;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2528,12 +2578,15 @@ static void collapse_huge_page(struct mm_struct *mm,
/* release the mmap_sem read lock. */
new_page = khugepaged_alloc_page(hpage, gfp, mm, address, node);
- if (!new_page)
- return;
+ if (!new_page) {
+ result = SCAN_ALLOC_HUGE_PAGE_FAIL;
+ goto out_nolock;
+ }
- if (unlikely(mem_cgroup_try_charge(new_page, mm,
- gfp, &memcg)))
- return;
+ if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) {
+ result = SCAN_CGROUP_CHARGE_FAIL;
+ goto out_nolock;
+ }
/*
* Prevent all access to pagetables with the exception of
@@ -2541,21 +2594,31 @@ static void collapse_huge_page(struct mm_struct *mm,
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
- if (unlikely(khugepaged_test_exit(mm)))
+ if (unlikely(khugepaged_test_exit(mm))) {
+ result = SCAN_ANY_PROCESS;
goto out;
+ }
vma = find_vma(mm, address);
- if (!vma)
+ if (!vma) {
+ result = SCAN_VMA_NULL;
goto out;
+ }
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (address < hstart || address + HPAGE_PMD_SIZE > hend) {
+ result = SCAN_ADDRESS_RANGE;
goto out;
- if (!hugepage_vma_check(vma))
+ }
+ if (!hugepage_vma_check(vma)) {
+ result = SCAN_VMA_CHECK;
goto out;
+ }
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
anon_vma_lock_write(vma->anon_vma);
@@ -2592,6 +2655,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
+ result = SCAN_FAIL;
goto out;
}
@@ -2629,10 +2693,15 @@ static void collapse_huge_page(struct mm_struct *mm,
*hpage = NULL;
khugepaged_pages_collapsed++;
+ result = SCAN_SUCCEED;
out_up_write:
up_write(&mm->mmap_sem);
+ trace_mm_collapse_huge_page(mm, isolated, result);
return;
+out_nolock:
+ trace_mm_collapse_huge_page(mm, isolated, result);
+ return;
out:
mem_cgroup_cancel_charge(new_page, memcg);
goto out_up_write;
@@ -2645,8 +2714,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0;
- struct page *page;
+ int ret = 0, none_or_zero = 0, result = 0;
+ struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE;
@@ -2655,8 +2724,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
pmd = mm_find_pmd(mm, address);
- if (!pmd)
+ if (!pmd) {
+ result = SCAN_PMD_NULL;
goto out;
+ }
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
@@ -2665,19 +2736,25 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
pte_t pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
if (!userfaultfd_armed(vma) &&
- ++none_or_zero <= khugepaged_max_ptes_none)
+ ++none_or_zero <= khugepaged_max_ptes_none) {
continue;
- else
+ } else {
+ result = SCAN_EXCEED_NONE_PTE;
goto out_unmap;
+ }
}
- if (!pte_present(pteval))
+ if (!pte_present(pteval)) {
+ result = SCAN_PTE_NON_PRESENT;
goto out_unmap;
+ }
if (pte_write(pteval))
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page))
+ if (unlikely(!page)) {
+ result = SCAN_PAGE_NULL;
goto out_unmap;
+ }
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -2685,26 +2762,49 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* hit record.
*/
node = page_to_nid(page);
- if (khugepaged_scan_abort(node))
+ if (khugepaged_scan_abort(node)) {
+ result = SCAN_SCAN_ABORT;
goto out_unmap;
+ }
khugepaged_node_load[node]++;
VM_BUG_ON_PAGE(PageCompound(page), page);
- if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+ if (!PageLRU(page)) {
+ result = SCAN_SCAN_ABORT;
+ goto out_unmap;
+ }
+ if (PageLocked(page)) {
+ result = SCAN_PAGE_LOCK;
+ goto out_unmap;
+ }
+ if (!PageAnon(page)) {
+ result = SCAN_PAGE_ANON;
goto out_unmap;
+ }
+
/*
* cannot use mapcount: can't collapse if there's a gup pin.
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page))
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ result = SCAN_PAGE_COUNT;
goto out_unmap;
+ }
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced = true;
}
- if (referenced && writable)
- ret = 1;
+ if (writable) {
+ if (referenced) {
+ result = SCAN_SUCCEED;
+ ret = 1;
+ } else {
+ result = SCAN_NO_REFERENCED_PAGE;
+ }
+ } else {
+ result = SCAN_PAGE_RO;
+ }
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
@@ -2713,6 +2813,8 @@ out_unmap:
collapse_huge_page(mm, address, hpage, vma, node);
}
out:
+ trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced,
+ none_or_zero, result);
return ret;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ef6963b577fd..be934df69b85 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4,7 +4,6 @@
*/
#include <linux/list.h>
#include <linux/init.h>
-#include <linux/module.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
@@ -2549,25 +2548,6 @@ static void hugetlb_unregister_node(struct node *node)
nhs->hugepages_kobj = NULL;
}
-/*
- * hugetlb module exit: unregister hstate attributes from node devices
- * that have them.
- */
-static void hugetlb_unregister_all_nodes(void)
-{
- int nid;
-
- /*
- * disable node device registrations.
- */
- register_hugetlbfs_with_node(NULL, NULL);
-
- /*
- * remove hstate attributes from any nodes that have them.
- */
- for (nid = 0; nid < nr_node_ids; nid++)
- hugetlb_unregister_node(node_devices[nid]);
-}
/*
* Register hstate attributes for a single node device.
@@ -2632,27 +2612,10 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
return NULL;
}
-static void hugetlb_unregister_all_nodes(void) { }
-
static void hugetlb_register_all_nodes(void) { }
#endif
-static void __exit hugetlb_exit(void)
-{
- struct hstate *h;
-
- hugetlb_unregister_all_nodes();
-
- for_each_hstate(h) {
- kobject_put(hstate_kobjs[hstate_index(h)]);
- }
-
- kobject_put(hugepages_kobj);
- kfree(hugetlb_fault_mutex_table);
-}
-module_exit(hugetlb_exit);
-
static int __init hugetlb_init(void)
{
int i;
@@ -2690,7 +2653,7 @@ static int __init hugetlb_init(void)
mutex_init(&hugetlb_fault_mutex_table[i]);
return 0;
}
-module_init(hugetlb_init);
+subsys_initcall(hugetlb_init);
/* Should be called on processing a hugepagesz=... option */
void __init hugetlb_add_hstate(unsigned int order)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 19423a45d7d7..25c0ad36fe38 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -122,8 +122,7 @@
#define BYTES_PER_POINTER sizeof(void *)
/* GFP bitmask for kmemleak internal allocations */
-#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC | \
- __GFP_NOACCOUNT)) | \
+#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
__GFP_NOWARN)
diff --git a/mm/ksm.c b/mm/ksm.c
index b5cd647daa52..2d162c5625f6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -740,8 +740,7 @@ static int remove_stable_node(struct stable_node *stable_node)
static int remove_all_stable_nodes(void)
{
- struct stable_node *stable_node;
- struct list_head *this, *next;
+ struct stable_node *stable_node, *next;
int nid;
int err = 0;
@@ -756,8 +755,7 @@ static int remove_all_stable_nodes(void)
cond_resched();
}
}
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this, struct stable_node, list);
+ list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
if (remove_stable_node(stable_node))
err = -EBUSY;
cond_resched();
@@ -1583,13 +1581,11 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
* so prune them once before each full scan.
*/
if (!ksm_merge_across_nodes) {
- struct stable_node *stable_node;
- struct list_head *this, *next;
+ struct stable_node *stable_node, *next;
struct page *page;
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this,
- struct stable_node, list);
+ list_for_each_entry_safe(stable_node, next,
+ &migrate_nodes, list) {
page = get_ksm_page(stable_node, false);
if (page)
put_page(page);
@@ -2012,8 +2008,7 @@ static void wait_while_offlining(void)
static void ksm_check_stable_tree(unsigned long start_pfn,
unsigned long end_pfn)
{
- struct stable_node *stable_node;
- struct list_head *this, *next;
+ struct stable_node *stable_node, *next;
struct rb_node *node;
int nid;
@@ -2034,8 +2029,7 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
cond_resched();
}
}
- list_for_each_safe(this, next, &migrate_nodes) {
- stable_node = list_entry(this, struct stable_node, list);
+ list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
if (stable_node->kpfn >= start_pfn &&
stable_node->kpfn < end_pfn)
remove_node_from_stable_tree(stable_node);
diff --git a/mm/memblock.c b/mm/memblock.c
index 07ff069fef25..d2ed81e59a94 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -96,13 +96,10 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
{
unsigned long i;
- for (i = 0; i < type->cnt; i++) {
- phys_addr_t rgnbase = type->regions[i].base;
- phys_addr_t rgnsize = type->regions[i].size;
- if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ for (i = 0; i < type->cnt; i++)
+ if (memblock_addrs_overlap(base, size, type->regions[i].base,
+ type->regions[i].size))
break;
- }
-
return i < type->cnt;
}
@@ -528,7 +525,8 @@ int __init_memblock memblock_add_range(struct memblock_type *type,
bool insert = false;
phys_addr_t obase = base;
phys_addr_t end = base + memblock_cap_size(base, &size);
- int i, nr_new;
+ int idx, nr_new;
+ struct memblock_region *rgn;
if (!size)
return 0;
@@ -552,8 +550,7 @@ repeat:
base = obase;
nr_new = 0;
- for (i = 0; i < type->cnt; i++) {
- struct memblock_region *rgn = &type->regions[i];
+ for_each_memblock_type(type, rgn) {
phys_addr_t rbase = rgn->base;
phys_addr_t rend = rbase + rgn->size;
@@ -572,7 +569,7 @@ repeat:
WARN_ON(flags != rgn->flags);
nr_new++;
if (insert)
- memblock_insert_region(type, i++, base,
+ memblock_insert_region(type, idx++, base,
rbase - base, nid,
flags);
}
@@ -584,7 +581,7 @@ repeat:
if (base < end) {
nr_new++;
if (insert)
- memblock_insert_region(type, i, base, end - base,
+ memblock_insert_region(type, idx, base, end - base,
nid, flags);
}
@@ -651,7 +648,8 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
int *start_rgn, int *end_rgn)
{
phys_addr_t end = base + memblock_cap_size(base, &size);
- int i;
+ int idx;
+ struct memblock_region *rgn;
*start_rgn = *end_rgn = 0;
@@ -663,8 +661,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
if (memblock_double_array(type, base, size) < 0)
return -ENOMEM;
- for (i = 0; i < type->cnt; i++) {
- struct memblock_region *rgn = &type->regions[i];
+ for_each_memblock_type(type, rgn) {
phys_addr_t rbase = rgn->base;
phys_addr_t rend = rbase + rgn->size;
@@ -681,7 +678,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->base = base;
rgn->size -= base - rbase;
type->total_size -= base - rbase;
- memblock_insert_region(type, i, rbase, base - rbase,
+ memblock_insert_region(type, idx, rbase, base - rbase,
memblock_get_region_node(rgn),
rgn->flags);
} else if (rend > end) {
@@ -692,14 +689,14 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
rgn->base = end;
rgn->size -= end - rbase;
type->total_size -= end - rbase;
- memblock_insert_region(type, i--, rbase, end - rbase,
+ memblock_insert_region(type, idx--, rbase, end - rbase,
memblock_get_region_node(rgn),
rgn->flags);
} else {
/* @rgn is fully contained, record it */
if (!*end_rgn)
- *start_rgn = i;
- *end_rgn = i + 1;
+ *start_rgn = idx;
+ *end_rgn = idx + 1;
}
}
@@ -1528,12 +1525,12 @@ static int __init_memblock memblock_search(struct memblock_type *type, phys_addr
return -1;
}
-int __init memblock_is_reserved(phys_addr_t addr)
+bool __init memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
}
-int __init_memblock memblock_is_memory(phys_addr_t addr)
+bool __init_memblock memblock_is_memory(phys_addr_t addr)
{
return memblock_search(&memblock.memory, addr) != -1;
}
@@ -1641,12 +1638,12 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
{
unsigned long long base, size;
unsigned long flags;
- int i;
+ int idx;
+ struct memblock_region *rgn;
pr_info(" %s.cnt = 0x%lx\n", name, type->cnt);
- for (i = 0; i < type->cnt; i++) {
- struct memblock_region *rgn = &type->regions[i];
+ for_each_memblock_type(type, rgn) {
char nid_buf[32] = "";
base = rgn->base;
@@ -1658,7 +1655,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type, char *name
memblock_get_region_node(rgn));
#endif
pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s flags: %#lx\n",
- name, i, base, base + size - 1, size, nid_buf, flags);
+ name, idx, base, base + size - 1, size, nid_buf, flags);
}
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 14cb1db4c52b..54eae4f19d80 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -76,9 +76,12 @@
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);
+struct mem_cgroup *root_mem_cgroup __read_mostly;
+
#define MEM_CGROUP_RECLAIM_RETRIES 5
-static struct mem_cgroup *root_mem_cgroup __read_mostly;
-struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
+
+/* Socket memory accounting disabled? */
+static bool cgroup_memory_nosocket;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
@@ -87,6 +90,12 @@ int do_swap_account __read_mostly;
#define do_swap_account 0
#endif
+/* Whether legacy memory+swap accounting is active */
+static bool do_memsw_account(void)
+{
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+}
+
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
@@ -288,64 +297,6 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return mem_cgroup_from_css(css);
}
-/* Writing them here to avoid exposing memcg's inner layout */
-#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
-
-void sock_update_memcg(struct sock *sk)
-{
- if (mem_cgroup_sockets_enabled) {
- struct mem_cgroup *memcg;
- struct cg_proto *cg_proto;
-
- BUG_ON(!sk->sk_prot->proto_cgroup);
-
- /* Socket cloning can throw us here with sk_cgrp already
- * filled. It won't however, necessarily happen from
- * process context. So the test for root memcg given
- * the current task's memcg won't help us in this case.
- *
- * Respecting the original socket's memcg is a better
- * decision in this case.
- */
- if (sk->sk_cgrp) {
- BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
- css_get(&sk->sk_cgrp->memcg->css);
- return;
- }
-
- rcu_read_lock();
- memcg = mem_cgroup_from_task(current);
- cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
- css_tryget_online(&memcg->css)) {
- sk->sk_cgrp = cg_proto;
- }
- rcu_read_unlock();
- }
-}
-EXPORT_SYMBOL(sock_update_memcg);
-
-void sock_release_memcg(struct sock *sk)
-{
- if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
- struct mem_cgroup *memcg;
- WARN_ON(!sk->sk_cgrp->memcg);
- memcg = sk->sk_cgrp->memcg;
- css_put(&sk->sk_cgrp->memcg->css);
- }
-}
-
-struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
-{
- if (!memcg || mem_cgroup_is_root(memcg))
- return NULL;
-
- return &memcg->tcp_mem;
-}
-EXPORT_SYMBOL(tcp_proto_cgroup);
-
-#endif
-
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
@@ -395,7 +346,7 @@ void memcg_put_cache_ids(void)
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
-struct static_key memcg_kmem_enabled_key;
+DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
#endif /* CONFIG_MEMCG_KMEM */
@@ -1162,9 +1113,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
return ret;
}
-#define mem_cgroup_from_counter(counter, member) \
- container_of(counter, struct mem_cgroup, member)
-
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
@@ -1183,7 +1131,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
if (count < limit)
margin = limit - count;
- if (do_swap_account) {
+ if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
limit = READ_ONCE(memcg->memsw.limit);
if (count <= limit)
@@ -1286,7 +1234,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont(":");
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
continue;
pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
K(mem_cgroup_read_stat(iter, i)));
@@ -1909,7 +1857,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&old->memsw, stock->nr_pages);
css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
@@ -1997,6 +1945,26 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+static void reclaim_high(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ gfp_t gfp_mask)
+{
+ do {
+ if (page_counter_read(&memcg->memory) <= memcg->high)
+ continue;
+ mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ } while ((memcg = parent_mem_cgroup(memcg)));
+}
+
+static void high_work_func(struct work_struct *work)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = container_of(work, struct mem_cgroup, high_work);
+ reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
+}
+
/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
@@ -2004,20 +1972,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
void mem_cgroup_handle_over_high(void)
{
unsigned int nr_pages = current->memcg_nr_pages_over_high;
- struct mem_cgroup *memcg, *pos;
+ struct mem_cgroup *memcg;
if (likely(!nr_pages))
return;
- pos = memcg = get_mem_cgroup_from_mm(current->mm);
-
- do {
- if (page_counter_read(&pos->memory) <= pos->high)
- continue;
- mem_cgroup_events(pos, MEMCG_HIGH, 1);
- try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
- } while ((pos = parent_mem_cgroup(pos)));
-
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ reclaim_high(memcg, nr_pages, GFP_KERNEL);
css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
}
@@ -2039,11 +2000,11 @@ retry:
if (consume_stock(memcg, nr_pages))
return 0;
- if (!do_swap_account ||
+ if (!do_memsw_account() ||
page_counter_try_charge(&memcg->memsw, batch, &counter)) {
if (page_counter_try_charge(&memcg->memory, batch, &counter))
goto done_restock;
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, batch);
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
@@ -2130,7 +2091,7 @@ force:
* temporarily by force charging it.
*/
page_counter_charge(&memcg->memory, nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
@@ -2152,6 +2113,11 @@ done_restock:
*/
do {
if (page_counter_read(&memcg->memory) > memcg->high) {
+ /* Don't bother a random interrupted task */
+ if (in_interrupt()) {
+ schedule_work(&memcg->high_work);
+ break;
+ }
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
@@ -2167,7 +2133,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
return;
page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
css_put_many(&memcg->css, nr_pages);
@@ -2356,7 +2322,7 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
* Can't be called in interrupt context or from kernel threads.
* This function needs to be called with rcu_read_lock() held.
*/
-struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
+struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
@@ -2364,6 +2330,12 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
VM_BUG_ON(!is_root_cache(cachep));
+ if (cachep->flags & SLAB_ACCOUNT)
+ gfp |= __GFP_ACCOUNT;
+
+ if (!(gfp & __GFP_ACCOUNT))
+ return cachep;
+
if (current->memcg_kmem_skip_account)
return cachep;
@@ -2447,7 +2419,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
page_counter_uncharge(&memcg->kmem, nr_pages);
page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
page->mem_cgroup = NULL;
@@ -2935,7 +2907,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
err = page_counter_limit(&memcg->kmem, nr_pages);
VM_BUG_ON(err);
- static_key_slow_inc(&memcg_kmem_enabled_key);
+ static_branch_inc(&memcg_kmem_enabled_key);
/*
* A memory cgroup is considered kmem-active as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
@@ -3162,7 +3134,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
continue;
seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -3184,14 +3156,14 @@ static int memcg_stat_show(struct seq_file *m, void *v)
}
seq_printf(m, "hierarchical_memory_limit %llu\n",
(u64)memory * PAGE_SIZE);
- if (do_swap_account)
+ if (do_memsw_account())
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
unsigned long long val = 0;
- if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
continue;
for_each_mem_cgroup_tree(mi, memcg)
val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -3322,7 +3294,7 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
while (memcg) {
__mem_cgroup_threshold(memcg, false);
- if (do_swap_account)
+ if (do_memsw_account())
__mem_cgroup_threshold(memcg, true);
memcg = parent_mem_cgroup(memcg);
@@ -3621,7 +3593,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
if (ret)
return ret;
- return mem_cgroup_sockets_init(memcg, ss);
+ return tcp_init_cgroup(memcg, ss);
}
static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
@@ -3674,10 +3646,10 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
if (memcg->kmem_acct_activated) {
memcg_destroy_kmem_caches(memcg);
- static_key_slow_dec(&memcg_kmem_enabled_key);
+ static_branch_dec(&memcg_kmem_enabled_key);
WARN_ON(page_counter_read(&memcg->kmem));
}
- mem_cgroup_sockets_destroy(memcg);
+ tcp_destroy_cgroup(memcg);
}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -4196,6 +4168,8 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
{
int node;
+ cancel_work_sync(&memcg->high_work);
+
mem_cgroup_remove_from_trees(memcg);
for_each_node(node)
@@ -4206,17 +4180,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
kfree(memcg);
}
-/*
- * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
- */
-struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
-{
- if (!memcg->memory.parent)
- return NULL;
- return mem_cgroup_from_counter(memcg->memory.parent, memory);
-}
-EXPORT_SYMBOL(parent_mem_cgroup);
-
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -4235,7 +4198,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* root ? */
if (parent_css == NULL) {
root_mem_cgroup = memcg;
- mem_cgroup_root_css = &memcg->css;
page_counter_init(&memcg->memory, NULL);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4243,6 +4205,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->kmem, NULL);
}
+ INIT_WORK(&memcg->high_work, high_work_func);
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
memcg->move_charge_at_immigrate = 0;
@@ -4257,6 +4220,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
#endif
+#ifdef CONFIG_INET
+ memcg->socket_pressure = jiffies;
+#endif
return &memcg->css;
free_out:
@@ -4314,6 +4280,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (ret)
return ret;
+#ifdef CONFIG_INET
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ static_branch_inc(&memcg_sockets_enabled_key);
+#endif
+
/*
* Make sure the memcg is initialized: mem_cgroup_iter()
* orders reading memcg->initialized against its callers
@@ -4360,6 +4331,10 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
memcg_destroy_kmem(memcg);
+#ifdef CONFIG_INET
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
+ static_branch_dec(&memcg_sockets_enabled_key);
+#endif
__mem_cgroup_free(memcg);
}
@@ -4476,7 +4451,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), ent.val);
- if (do_swap_account)
+ if (do_memsw_account())
entry->val = ent.val;
return page;
@@ -4511,7 +4486,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
page = find_get_entry(mapping, pgoff);
if (radix_tree_exceptional_entry(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- if (do_swap_account)
+ if (do_memsw_account())
*entry = swp;
page = find_get_page(swap_address_space(swp), swp.val);
}
@@ -5304,7 +5279,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
if (page->mem_cgroup)
goto out;
- if (do_swap_account) {
+ if (do_memsw_account()) {
swp_entry_t ent = { .val = page_private(page), };
unsigned short id = lookup_swap_cgroup_id(ent);
@@ -5378,7 +5353,7 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
memcg_check_events(memcg, page);
local_irq_enable();
- if (do_swap_account && PageSwapCache(page)) {
+ if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
@@ -5427,7 +5402,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
if (!mem_cgroup_is_root(memcg)) {
page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_swap_account)
+ if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
memcg_oom_recover(memcg);
}
@@ -5580,6 +5555,121 @@ void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
commit_charge(newpage, memcg, true);
}
+#ifdef CONFIG_INET
+
+DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
+EXPORT_SYMBOL(memcg_sockets_enabled_key);
+
+/*
+ * sock_update_memcg - associate @sk with a memory cgroup
+ *
+ * If @sk already has sk_memcg set, only an extra css reference is taken on
+ * it. Otherwise the memcg of the current task is looked up under
+ * rcu_read_lock() and stored in sk->sk_memcg, unless it is the root memcg,
+ * or (CONFIG_MEMCG_KMEM, memory controller not on the default hierarchy)
+ * tcp_mem.active is unset, or css_tryget_online() fails. In those cases
+ * sk->sk_memcg is left untouched.
+ */
+void sock_update_memcg(struct sock *sk)
+{
+ struct mem_cgroup *memcg;
+
+ /* Socket cloning can throw us here with sk_memcg already
+ * filled. It won't however, necessarily happen from
+ * process context. So the test for root memcg given
+ * the current task's memcg won't help us in this case.
+ *
+ * Respecting the original socket's memcg is a better
+ * decision in this case.
+ */
+ if (sk->sk_memcg) {
+ BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
+ css_get(&sk->sk_memcg->css);
+ return;
+ }
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(current);
+ if (memcg == root_mem_cgroup)
+ goto out;
+#ifdef CONFIG_MEMCG_KMEM
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcp_mem.active)
+ goto out;
+#endif
+ /* Only record the memcg if a reference on its css could be taken. */
+ if (css_tryget_online(&memcg->css))
+ sk->sk_memcg = memcg;
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(sock_update_memcg);
+
+/*
+ * sock_release_memcg - drop one css reference on @sk's memcg
+ *
+ * Warns if sk->sk_memcg is NULL.
+ * NOTE(review): nothing in this function returns early after the warning,
+ * so css_put() is still reached in that case - presumably callers only
+ * invoke this with sk_memcg set; confirm.
+ */
+void sock_release_memcg(struct sock *sk)
+{
+ WARN_ON(!sk->sk_memcg);
+ css_put(&sk->sk_memcg->css);
+}
+
+/**
+ * mem_cgroup_charge_skmem - charge socket memory
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ *
+ * Charges @nr_pages to @memcg. Returns %true if the charge fit within
+ * @memcg's configured limit, %false if the charge had to be forced.
+ * In both cases the pages end up charged.
+ */
+bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ gfp_t gfp_mask = GFP_KERNEL;
+
+#ifdef CONFIG_MEMCG_KMEM
+ /* Not on the default hierarchy: use the separate tcp_mem counter. */
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ struct page_counter *counter;
+
+ if (page_counter_try_charge(&memcg->tcp_mem.memory_allocated,
+ nr_pages, &counter)) {
+ memcg->tcp_mem.memory_pressure = 0;
+ return true;
+ }
+ /* Did not fit: charge unconditionally and flag pressure. */
+ page_counter_charge(&memcg->tcp_mem.memory_allocated, nr_pages);
+ memcg->tcp_mem.memory_pressure = 1;
+ return false;
+ }
+#endif
+ /* Don't block in the packet receive path */
+ if (in_softirq())
+ gfp_mask = GFP_NOWAIT;
+
+ if (try_charge(memcg, gfp_mask, nr_pages) == 0)
+ return true;
+
+ /* Retry with __GFP_NOFAIL; the result of this call is not checked. */
+ try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
+ return false;
+}
+
+/**
+ * mem_cgroup_uncharge_skmem - uncharge socket memory
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ *
+ * With CONFIG_MEMCG_KMEM and the memory controller not on the default
+ * hierarchy, the pages are returned to the tcp_mem counter only.
+ * Otherwise they are returned to the memory counter and @nr_pages css
+ * references are dropped.
+ */
+void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+#ifdef CONFIG_MEMCG_KMEM
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+ page_counter_uncharge(&memcg->tcp_mem.memory_allocated,
+ nr_pages);
+ return;
+ }
+#endif
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ /* NOTE(review): presumably pairs with the css references taken when charging via try_charge(); confirm. */
+ css_put_many(&memcg->css, nr_pages);
+}
+
+#endif /* CONFIG_INET */
+
+/*
+ * Parse the "cgroup.memory=" boot parameter, a comma-separated option list.
+ * Empty tokens are skipped, "nosocket" sets cgroup_memory_nosocket, and any
+ * other token is silently ignored.
+ *
+ * Returns 1 so the __setup() machinery treats the parameter as handled.
+ * Returning 0 would mark it as unknown and have it passed on to init.
+ */
+static int __init cgroup_memory(char *s)
+{
+ char *token;
+
+ while ((token = strsep(&s, ",")) != NULL) {
+ if (!*token)
+ continue;
+ if (!strcmp(token, "nosocket"))
+ cgroup_memory_nosocket = true;
+ }
+ return 1;
+}
+__setup("cgroup.memory=", cgroup_memory);
+
/*
* subsys_initcall() for memory controller.
*
@@ -5635,7 +5725,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
- if (!do_swap_account)
+ if (!do_memsw_account())
return;
memcg = page->mem_cgroup;
@@ -5675,7 +5765,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
struct mem_cgroup *memcg;
unsigned short id;
- if (!do_swap_account)
+ if (!do_memsw_account())
return;
id = swap_cgroup_record(entry, 0);
diff --git a/mm/memory.c b/mm/memory.c
index c387430f06c3..d4e4d37c1989 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -832,10 +832,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
} else if (is_migration_entry(entry)) {
page = migration_entry_to_page(entry);
- if (PageAnon(page))
- rss[MM_ANONPAGES]++;
- else
- rss[MM_FILEPAGES]++;
+ rss[mm_counter(page)]++;
if (is_write_migration_entry(entry) &&
is_cow_mapping(vm_flags)) {
@@ -874,10 +871,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (page) {
get_page(page);
page_dup_rmap(page);
- if (PageAnon(page))
- rss[MM_ANONPAGES]++;
- else
- rss[MM_FILEPAGES]++;
+ rss[mm_counter(page)]++;
}
out_set_pte:
@@ -1113,9 +1107,8 @@ again:
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
- if (PageAnon(page))
- rss[MM_ANONPAGES]--;
- else {
+
+ if (!PageAnon(page)) {
if (pte_dirty(ptent)) {
force_flush = 1;
set_page_dirty(page);
@@ -1123,8 +1116,8 @@ again:
if (pte_young(ptent) &&
likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
- rss[MM_FILEPAGES]--;
}
+ rss[mm_counter(page)]--;
page_remove_rmap(page);
if (unlikely(page_mapcount(page) < 0))
print_bad_pte(vma, addr, ptent, page);
@@ -1146,11 +1139,7 @@ again:
struct page *page;
page = migration_entry_to_page(entry);
-
- if (PageAnon(page))
- rss[MM_ANONPAGES]--;
- else
- rss[MM_FILEPAGES]--;
+ rss[mm_counter(page)]--;
}
if (unlikely(!free_swap_and_cache(entry)))
print_bad_pte(vma, addr, ptent, NULL);
@@ -1460,7 +1449,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
/* Ok, finally just insert the thing.. */
get_page(page);
- inc_mm_counter_fast(mm, MM_FILEPAGES);
+ inc_mm_counter_fast(mm, mm_counter_file(page));
page_add_file_rmap(page);
set_pte_at(mm, addr, pte, mk_pte(page, prot));
@@ -1949,6 +1938,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
copy_user_highpage(dst, src, va, vma);
}
+/*
+ * Compute the gfp mask handed to fault handlers through vm_fault.gfp_mask.
+ * For a VMA with a backing file this is the file mapping's gfp mask with
+ * __GFP_FS and __GFP_IO added; for a VMA without a file it is GFP_KERNEL.
+ */
+static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
+{
+ struct file *vm_file = vma->vm_file;
+
+ if (vm_file)
+ return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
+
+ /*
+ * Special mappings (e.g. VDSO) do not have any file so fake
+ * a default GFP_KERNEL for them.
+ */
+ return GFP_KERNEL;
+}
+
/*
* Notify the address space that the page is about to become writable so that
* it can prohibit this or wait for the page to get into an appropriate state.
@@ -1964,6 +1967,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
vmf.virtual_address = (void __user *)(address & PAGE_MASK);
vmf.pgoff = page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+ vmf.gfp_mask = __get_fault_gfp_mask(vma);
vmf.page = page;
vmf.cow_page = NULL;
@@ -2097,7 +2101,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
- dec_mm_counter_fast(mm, MM_FILEPAGES);
+ dec_mm_counter_fast(mm,
+ mm_counter_file(old_page));
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else {
@@ -2767,6 +2772,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
+ vmf.gfp_mask = __get_fault_gfp_mask(vma);
vmf.cow_page = cow_page;
ret = vma->vm_ops->fault(vma, &vmf);
@@ -2820,7 +2826,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
} else {
- inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
+ inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page);
}
set_pte_at(vma->vm_mm, address, pte, entry);
@@ -2933,6 +2939,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
vmf.pgoff = pgoff;
vmf.max_pgoff = max_pgoff;
vmf.flags = flags;
+ vmf.gfp_mask = __get_fault_gfp_mask(vma);
vma->vm_ops->map_pages(vma, &vmf);
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a042a9d537bb..92f95952692b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -131,7 +131,8 @@ static struct resource *register_memory_resource(u64 start, u64 size)
{
struct resource *res;
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
- BUG_ON(!res);
+ if (!res)
+ return ERR_PTR(-ENOMEM);
res->name = "System RAM";
res->start = start;
@@ -140,7 +141,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
if (request_resource(&iomem_resource, res) < 0) {
pr_debug("System RAM resource %pR cannot be added\n", res);
kfree(res);
- res = NULL;
+ return ERR_PTR(-EEXIST);
}
return res;
}
@@ -1312,8 +1313,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
int ret;
res = register_memory_resource(start, size);
- if (!res)
- return -EEXIST;
+ if (IS_ERR(res))
+ return PTR_ERR(res);
ret = add_memory_resource(nid, res);
if (ret < 0)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 87a177917cb2..d8caff071a30 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2142,12 +2142,14 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*
* Remember policies even when nobody has shared memory mapped.
* The policies are kept in Red-Black tree linked from the inode.
- * They are protected by the sp->lock spinlock, which should be held
+ * They are protected by the sp->lock rwlock, which should be held
* for any accesses to the tree.
*/
-/* lookup first element intersecting start-end */
-/* Caller holds sp->lock */
+/*
+ * lookup first element intersecting start-end. Caller holds sp->lock for
+ * reading or for writing
+ */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2178,8 +2180,10 @@ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
return rb_entry(n, struct sp_node, nd);
}
-/* Insert a new shared policy into the list. */
-/* Caller holds sp->lock */
+/*
+ * Insert a new shared policy into the list. Caller holds sp->lock for
+ * writing.
+ */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
struct rb_node **p = &sp->root.rb_node;
@@ -2211,13 +2215,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- spin_lock(&sp->lock);
+ read_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- spin_unlock(&sp->lock);
+ read_unlock(&sp->lock);
return pol;
}
@@ -2360,7 +2364,7 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
int ret = 0;
restart:
- spin_lock(&sp->lock);
+ write_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2393,7 +2397,7 @@ restart:
}
if (new)
sp_insert(sp, new);
- spin_unlock(&sp->lock);
+ write_unlock(&sp->lock);
ret = 0;
err_out:
@@ -2405,7 +2409,7 @@ err_out:
return ret;
alloc_new:
- spin_unlock(&sp->lock);
+ write_unlock(&sp->lock);
ret = -ENOMEM;
n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
if (!n_new)
@@ -2431,7 +2435,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- spin_lock_init(&sp->lock);
+ rwlock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
@@ -2497,14 +2501,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- spin_lock(&p->lock);
+ write_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
- spin_unlock(&p->lock);
+ write_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
diff --git a/mm/mlock.c b/mm/mlock.c
index 339d9e0949b6..9cb87cbc4071 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -425,7 +425,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
while (start < end) {
- struct page *page = NULL;
+ struct page *page;
unsigned int page_mask;
unsigned long page_increm;
struct pagevec pvec;
diff --git a/mm/mmap.c b/mm/mmap.c
index 2ce04a649f6b..b3f00b616b81 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -58,6 +58,18 @@
#define arch_rebalance_pgtables(addr, len) (addr)
#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
+const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
+int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
+const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
+int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
+#endif
+
+
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
@@ -1208,24 +1220,6 @@ none:
return NULL;
}
-#ifdef CONFIG_PROC_FS
-void vm_stat_account(struct mm_struct *mm, unsigned long flags,
- struct file *file, long pages)
-{
- const unsigned long stack_flags
- = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-
- mm->total_vm += pages;
-
- if (file) {
- mm->shared_vm += pages;
- if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
- mm->exec_vm += pages;
- } else if (flags & stack_flags)
- mm->stack_vm += pages;
-}
-#endif /* CONFIG_PROC_FS */
-
/*
* If a hint addr is less than mmap_min_addr change hint to be as
* low as possible but still greater than mmap_min_addr
@@ -1544,19 +1538,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long charged = 0;
/* Check against address space limit. */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
+ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
unsigned long nr_pages;
/*
* MAP_FIXED may remove pages of mappings that intersects with
* requested mapping. Account for the pages it would unmap.
*/
- if (!(vm_flags & MAP_FIXED))
- return -ENOMEM;
-
nr_pages = count_vma_pages_range(mm, addr, addr + len);
- if (!may_expand_vm(mm, (len >> PAGE_SHIFT) - nr_pages))
+ if (!may_expand_vm(mm, vm_flags,
+ (len >> PAGE_SHIFT) - nr_pages))
return -ENOMEM;
}
@@ -1655,7 +1647,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
out:
perf_event_mmap(vma);
- vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+ vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm)))
@@ -2102,7 +2094,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
unsigned long new_start, actual_size;
/* address space limit tests */
- if (!may_expand_vm(mm, grow))
+ if (!may_expand_vm(mm, vma->vm_flags, grow))
return -ENOMEM;
/* Stack limit test */
@@ -2199,8 +2191,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags,
- vma->vm_file, grow);
+ vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
anon_vma_interval_tree_post_update_vma(vma);
@@ -2275,8 +2266,7 @@ int expand_downwards(struct vm_area_struct *vma,
spin_lock(&mm->page_table_lock);
if (vma->vm_flags & VM_LOCKED)
mm->locked_vm += grow;
- vm_stat_account(mm, vma->vm_flags,
- vma->vm_file, grow);
+ vm_stat_account(mm, vma->vm_flags, grow);
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_start = address;
vma->vm_pgoff -= grow;
@@ -2390,7 +2380,7 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += nrpages;
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+ vm_stat_account(mm, vma->vm_flags, -nrpages);
vma = remove_vma(vma);
} while (vma);
vm_unacct_memory(nr_accounted);
@@ -2760,7 +2750,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
}
/* Check against address space limits *after* clearing old maps... */
- if (!may_expand_vm(mm, len >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
if (mm->map_count > sysctl_max_map_count)
@@ -2795,6 +2785,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
+ mm->data_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED)
mm->locked_vm += (len >> PAGE_SHIFT);
vma->vm_flags |= VM_SOFTDIRTY;
@@ -2986,16 +2977,28 @@ out:
* Return true if the calling process may expand its vm space by the passed
* number of pages
*/
-int may_expand_vm(struct mm_struct *mm, unsigned long npages)
+bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
- unsigned long cur = mm->total_vm; /* pages */
- unsigned long lim;
+ /* total_vm and npages are page counts; rlimits are byte counts. */
+ if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
+ return false;
- lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT;
+ /*
+ * Writable, non-shared, non-stack mappings are additionally checked
+ * against RLIMIT_DATA. data_vm is kept in pages, so the byte limit
+ * must be converted to pages before comparing, exactly as for
+ * RLIMIT_AS above.
+ */
+ if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
+ (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
+ return mm->data_vm + npages <= rlimit(RLIMIT_DATA) >> PAGE_SHIFT;
- if (cur + npages > lim)
- return 0;
- return 1;
+ return true;
+}
+
+/*
+ * Adjust @mm's virtual memory counters by @npages (callers pass a negative
+ * value to un-account) for a mapping with @flags. total_vm is always
+ * adjusted; in addition at most one of the following is adjusted, tested
+ * in this order:
+ *  - exec_vm:  VM_EXEC set and VM_WRITE clear
+ *  - stack_vm: the growth-direction bit selected by VM_STACK_FLAGS is set
+ *  - data_vm:  VM_WRITE set and VM_SHARED clear
+ */
+void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
+{
+ mm->total_vm += npages;
+
+ if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+ mm->exec_vm += npages;
+ else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+ mm->stack_vm += npages;
+ else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ mm->data_vm += npages;
}
static int special_mapping_fault(struct vm_area_struct *vma,
@@ -3077,7 +3080,7 @@ static struct vm_area_struct *__install_special_mapping(
if (ret)
goto out;
- mm->total_vm += len >> PAGE_SHIFT;
+ vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
perf_event_mmap(vma);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7d87ebb0d632..52687fb4de6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -72,16 +72,16 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
}
#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
+/*
+ * Returns true only if @page converts back to @pfn and belongs to @zone;
+ * false if either check fails.
+ */
-int memmap_valid_within(unsigned long pfn,
+bool memmap_valid_within(unsigned long pfn,
struct page *page, struct zone *zone)
{
if (page_to_pfn(page) != pfn)
- return 0;
+ return false;
if (page_zone(page) != zone)
- return 0;
+ return false;
- return 1;
+ return true;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ef5be8eaab00..c764402c464f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -278,6 +278,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
* even if read-only so there is no need to account for them here
*/
if (newflags & VM_WRITE) {
+ /* Check space limits when area turns into data. */
+ if (!may_expand_vm(mm, newflags, nrpages) &&
+ may_expand_vm(mm, oldflags, nrpages))
+ return -ENOMEM;
if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
@@ -334,8 +338,8 @@ success:
populate_vma_page_range(vma, start, end, NULL);
}
- vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
- vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+ vm_stat_account(mm, oldflags, -nrpages);
+ vm_stat_account(mm, newflags, nrpages);
perf_event_mmap(vma);
return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
index de824e72c3e8..e55b157865d5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -317,7 +317,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
* If this were a serious issue, we'd add a flag to do_munmap().
*/
hiwater_vm = mm->hiwater_vm;
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+ vm_stat_account(mm, vma->vm_flags, new_len >> PAGE_SHIFT);
/* Tell pfnmap has moved from this vma */
if (unlikely(vma->vm_flags & VM_PFNMAP))
@@ -383,7 +383,8 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EAGAIN);
}
- if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+ if (!may_expand_vm(mm, vma->vm_flags,
+ (new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
if (vma->vm_flags & VM_ACCOUNT) {
@@ -545,7 +546,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
- vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
+ vm_stat_account(mm, vma->vm_flags, pages);
if (vma->vm_flags & VM_LOCKED) {
mm->locked_vm += pages;
locked = true;
diff --git a/mm/nommu.c b/mm/nommu.c
index 92be862c859b..fbf6f0f1d6c9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -560,7 +560,7 @@ void __init mmap_init(void)
ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
- vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
+ vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
}
/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c12680993ff3..dc490c06941b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -585,10 +585,11 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
*/
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
mark_oom_victim(victim);
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+ pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)));
+ K(get_mm_counter(victim->mm, MM_FILEPAGES)),
+ K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
task_unlock(victim);
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d15d88c8efa1..6fe7d15bd1f7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -278,7 +278,12 @@ static unsigned long zone_dirtyable_memory(struct zone *zone)
unsigned long nr_pages;
nr_pages = zone_page_state(zone, NR_FREE_PAGES);
- nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+ /*
+ * Pages reserved for the kernel should not be considered
+ * dirtyable, to prevent a situation where reclaim has to
+ * clean pages in order to balance the zones.
+ */
+ nr_pages -= min(nr_pages, zone->totalreserve_pages);
nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
@@ -332,7 +337,12 @@ static unsigned long global_dirtyable_memory(void)
unsigned long x;
x = global_page_state(NR_FREE_PAGES);
- x -= min(x, dirty_balance_reserve);
+ /*
+ * Pages reserved for the kernel should not be considered
+ * dirtyable, to prevent a situation where reclaim has to
+ * clean pages in order to balance the zones.
+ */
+ x -= min(x, totalreserve_pages);
x += global_page_state(NR_INACTIVE_FILE);
x += global_page_state(NR_ACTIVE_FILE);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d666df5ef95..ce63d603820f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,13 +114,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
-/*
- * When calculating the number of globally allowed dirty pages, there
- * is a certain number of per-zone reserves that should not be
- * considered dirtyable memory. This is the sum of those reserves
- * over all existing zones that contribute dirtyable memory.
- */
-unsigned long dirty_balance_reserve __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -812,7 +805,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
do {
int mt; /* migratetype of the to-be-freed page */
- page = list_entry(list->prev, struct page, lru);
+ page = list_last_entry(list, struct page, lru);
/* must delete as __free_one_page list manipulates */
list_del(&page->lru);
@@ -1417,11 +1410,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
- if (list_empty(&area->free_list[migratetype]))
- continue;
-
- page = list_entry(area->free_list[migratetype].next,
+ page = list_first_entry_or_null(&area->free_list[migratetype],
struct page, lru);
+ if (!page)
+ continue;
list_del(&page->lru);
rmv_page_order(page);
area->nr_free--;
@@ -1700,12 +1692,12 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
- if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+ page = list_first_entry_or_null(
+ &area->free_list[MIGRATE_HIGHATOMIC],
+ struct page, lru);
+ if (!page)
continue;
- page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
- struct page, lru);
-
/*
* It should never happen but changes to locking could
* inadvertently allow a per-cpu drain to add pages
@@ -1753,7 +1745,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
if (fallback_mt == -1)
continue;
- page = list_entry(area->free_list[fallback_mt].next,
+ page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
if (can_steal)
steal_suitable_fallback(zone, page, start_migratetype);
@@ -1788,7 +1780,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
* Call me with the zone->lock already held.
*/
static struct page *__rmqueue(struct zone *zone, unsigned int order,
- int migratetype, gfp_t gfp_flags)
+ int migratetype)
{
struct page *page;
@@ -1818,7 +1810,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order, migratetype, 0);
+ struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))
break;
@@ -1988,7 +1980,7 @@ void mark_free_pages(struct zone *zone)
unsigned long pfn, max_zone_pfn;
unsigned long flags;
unsigned int order, t;
- struct list_head *curr;
+ struct page *page;
if (zone_is_empty(zone))
return;
@@ -1998,17 +1990,17 @@ void mark_free_pages(struct zone *zone)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-
+ page = pfn_to_page(pfn);
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
}
for_each_migratetype_order(order, t) {
- list_for_each(curr, &zone->free_area[order].free_list[t]) {
+ list_for_each_entry(page,
+ &zone->free_area[order].free_list[t], lru) {
unsigned long i;
- pfn = page_to_pfn(list_entry(curr, struct page, lru));
+ pfn = page_to_pfn(page);
for (i = 0; i < (1UL << order); i++)
swsusp_set_page_free(pfn_to_page(pfn + i));
}
@@ -2212,9 +2204,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
}
if (cold)
- page = list_entry(list->prev, struct page, lru);
+ page = list_last_entry(list, struct page, lru);
else
- page = list_entry(list->next, struct page, lru);
+ page = list_first_entry(list, struct page, lru);
list_del(&page->lru);
pcp->count--;
@@ -2241,7 +2233,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
if (!page)
- page = __rmqueue(zone, order, migratetype, gfp_flags);
+ page = __rmqueue(zone, order, migratetype);
spin_unlock(&zone->lock);
if (!page)
goto failed;
@@ -2740,8 +2732,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
*did_some_progress = 1;
+
+ if (gfp_mask & __GFP_NOFAIL) {
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
+ /*
+ * fallback to ignore cpuset restriction if our nodes
+ * are depleted
+ */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
+ }
+ }
out:
mutex_unlock(&oom_lock);
return page;
@@ -2876,28 +2881,6 @@ retry:
return page;
}
-/*
- * This is called in the allocator slow-path if the allocation request is of
- * sufficient urgency to ignore watermarks and take other desperate measures
- */
-static inline struct page *
-__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
- const struct alloc_context *ac)
-{
- struct page *page;
-
- do {
- page = get_page_from_freelist(gfp_mask, order,
- ALLOC_NO_WATERMARKS, ac);
-
- if (!page && gfp_mask & __GFP_NOFAIL)
- wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
- HZ/50);
- } while (!page && (gfp_mask & __GFP_NOFAIL));
-
- return page;
-}
-
static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
struct zoneref *z;
@@ -3042,28 +3025,36 @@ retry:
* allocations are system rather than user orientated
*/
ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
-
- page = __alloc_pages_high_priority(gfp_mask, order, ac);
-
- if (page) {
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
+ if (page)
goto got_pg;
- }
}
/* Caller is not willing to reclaim, we can't balance anything */
if (!can_direct_reclaim) {
/*
- * All existing users of the deprecated __GFP_NOFAIL are
- * blockable, so warn of any new users that actually allow this
- * type of allocation to fail.
+ * All existing users of __GFP_NOFAIL are blockable, so warn
+ * of any new users that actually allow this type of allocation
+ * to fail.
*/
WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
goto nopage;
}
/* Avoid recursion of direct reclaim */
- if (current->flags & PF_MEMALLOC)
+ if (current->flags & PF_MEMALLOC) {
+ /*
+ * A __GFP_NOFAIL request from this context is rather bizarre
+ * because we cannot reclaim anything and can only loop waiting
+ * for somebody else to do the work for us.
+ */
+ if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+ cond_resched();
+ goto retry;
+ }
goto nopage;
+ }
/* Avoid allocations with no watermarks from looping endlessly */
if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
@@ -3402,7 +3393,8 @@ EXPORT_SYMBOL(__free_page_frag);
/*
* alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup.
+ * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
+ * equivalent to alloc_pages.
*
* It should be used when the caller would like to use kmalloc, but since the
* allocation is large, it has to fall back to the page allocator.
@@ -4147,8 +4139,7 @@ static void set_zonelist_order(void)
static void build_zonelists(pg_data_t *pgdat)
{
- int j, node, load;
- enum zone_type i;
+ int i, node, load;
nodemask_t used_mask;
int local_node, prev_node;
struct zonelist *zonelist;
@@ -4168,7 +4159,7 @@ static void build_zonelists(pg_data_t *pgdat)
nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
- j = 0;
+ i = 0;
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
/*
@@ -4185,12 +4176,12 @@ static void build_zonelists(pg_data_t *pgdat)
if (order == ZONELIST_ORDER_NODE)
build_zonelists_in_node_order(pgdat, node);
else
- node_order[j++] = node; /* remember order */
+ node_order[i++] = node; /* remember order */
}
if (order == ZONELIST_ORDER_ZONE) {
/* calculate node order -- i.e., DMA last! */
- build_zonelists_in_zone_order(pgdat, j);
+ build_zonelists_in_zone_order(pgdat, i);
}
build_thisnode_zonelists(pgdat);
@@ -5956,20 +5947,12 @@ static void calculate_totalreserve_pages(void)
if (max > zone->managed_pages)
max = zone->managed_pages;
+
+ zone->totalreserve_pages = max;
+
reserve_pages += max;
- /*
- * Lowmem reserves are not available to
- * GFP_HIGHUSER page cache allocations and
- * kswapd tries to balance zones to their high
- * watermark. As a result, neither should be
- * regarded as dirtyable memory, to prevent a
- * situation where reclaim has to clean pages
- * in order to balance the zones.
- */
- zone->dirty_balance_reserve = max;
}
}
- dirty_balance_reserve = reserve_pages;
totalreserve_pages = reserve_pages;
}
@@ -6724,8 +6707,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
if (ret)
return ret;
+ /*
+ * In case of -EBUSY, we'd like to know which page causes the problem.
+ * So just fall through; it will be checked in test_pages_isolated().
+ */
ret = __alloc_contig_migrate_range(&cc, start, end);
- if (ret)
+ if (ret && ret != -EBUSY)
goto done;
/*
@@ -6752,12 +6739,25 @@ int alloc_contig_range(unsigned long start, unsigned long end,
outer_start = start;
while (!PageBuddy(pfn_to_page(outer_start))) {
if (++order >= MAX_ORDER) {
- ret = -EBUSY;
- goto done;
+ outer_start = start;
+ break;
}
outer_start &= ~0UL << order;
}
+ if (outer_start != start) {
+ order = page_order(pfn_to_page(outer_start));
+
+ /*
+ * The outer_start page could be a small-order buddy page that
+ * doesn't include the start page. Adjust outer_start in this
+ * case so the failed page is reported properly on the
+ * tracepoint in test_pages_isolated().
+ */
+ if (outer_start + (1UL << order) <= start)
+ outer_start = start;
+ }
+
/* Make sure the range is really isolated. */
if (test_pages_isolated(outer_start, end, false)) {
pr_info("%s: [%lx, %lx) PFNs busy\n",
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 4568fd58f70a..5e139fec6c6c 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -9,6 +9,9 @@
#include <linux/hugetlb.h>
#include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/page_isolation.h>
+
static int set_migratetype_isolate(struct page *page,
bool skip_hwpoisoned_pages)
{
@@ -162,8 +165,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long undo_pfn;
struct page *page;
- BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
- BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
+ BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
+ BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
for (pfn = start_pfn;
pfn < end_pfn;
@@ -212,7 +215,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
*
* Returns 1 if all pages in the range are isolated.
*/
-static int
+static unsigned long
__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
bool skip_hwpoisoned_pages)
{
@@ -237,9 +240,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
else
break;
}
- if (pfn < end_pfn)
- return 0;
- return 1;
+
+ return pfn;
}
int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
@@ -248,7 +250,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
unsigned long pfn, flags;
struct page *page;
struct zone *zone;
- int ret;
/*
* Note: pageblock_nr_pages != MAX_ORDER. Then, chunks of free pages
@@ -266,10 +267,13 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
/* Check all pages are free or marked as ISOLATED */
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
+ pfn = __test_page_isolated_in_pageblock(start_pfn, end_pfn,
skip_hwpoisoned_pages);
spin_unlock_irqrestore(&zone->lock, flags);
- return ret ? 0 : -EBUSY;
+
+ trace_test_pages_isolated(start_pfn, end_pfn, pfn);
+
+ return pfn < end_pfn ? -EBUSY : 0;
}
struct page *alloc_migrate_target(struct page *page, unsigned long private,
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 7d3db0247983..4c681baff363 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -176,13 +176,10 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
/* FIFO */
pgtable = pmd_huge_pte(mm, pmdp);
- if (list_empty(&pgtable->lru))
- pmd_huge_pte(mm, pmdp) = NULL;
- else {
- pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
- struct page, lru);
+ pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
+ struct page, lru);
+ if (pmd_huge_pte(mm, pmdp))
list_del(&pgtable->lru);
- }
return pgtable;
}
#endif
diff --git a/mm/readahead.c b/mm/readahead.c
index ba22d7fe0afb..20e58e820e44 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,7 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/file.h>
+#include <linux/mm_inline.h>
#include "internal.h"
@@ -32,8 +33,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
-#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
-
/*
* see if a page needs releasing upon read_cache_pages() failure
* - the caller of read_cache_pages() may have set PG_private or PG_fscache
@@ -64,7 +63,7 @@ static void read_cache_pages_invalidate_pages(struct address_space *mapping,
struct page *victim;
while (!list_empty(pages)) {
- victim = list_to_page(pages);
+ victim = lru_to_page(pages);
list_del(&victim->lru);
read_cache_pages_invalidate_page(mapping, victim);
}
@@ -87,7 +86,7 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int ret = 0;
while (!list_empty(pages)) {
- page = list_to_page(pages);
+ page = lru_to_page(pages);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
mapping_gfp_constraint(mapping, GFP_KERNEL))) {
@@ -125,7 +124,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
}
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = list_to_page(pages);
+ struct page *page = lru_to_page(pages);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping, page->index,
mapping_gfp_constraint(mapping, GFP_KERNEL))) {
diff --git a/mm/rmap.c b/mm/rmap.c
index b577fbb98d4b..622756c16ac8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -428,8 +428,10 @@ static void anon_vma_ctor(void *data)
void __init anon_vma_init(void)
{
anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
- 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
- anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
+ 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
+ anon_vma_ctor);
+ anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
+ SLAB_PANIC|SLAB_ACCOUNT);
}
/*
@@ -1362,10 +1364,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (PageHuge(page)) {
hugetlb_count_sub(1 << compound_order(page), mm);
} else {
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter(page));
}
set_pte_at(mm, address, pte,
swp_entry_to_pte(make_hwpoison_entry(page)));
@@ -1375,10 +1374,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* interest anymore. Simply discard the pte, vmscan
* will take care of the rest.
*/
- if (PageAnon(page))
- dec_mm_counter(mm, MM_ANONPAGES);
- else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter(page));
} else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
swp_entry_t entry;
pte_t swp_pte;
@@ -1418,7 +1414,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);
} else
- dec_mm_counter(mm, MM_FILEPAGES);
+ dec_mm_counter(mm, mm_counter_file(page));
page_remove_rmap(page);
page_cache_release(page);
diff --git a/mm/shmem.c b/mm/shmem.c
index 642471b0ddea..970ff5b80853 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -360,6 +360,87 @@ static int shmem_free_swap(struct address_space *mapping,
}
/*
+ * Determine (in bytes) how many of the shmem object's pages whose page-cache
+ * index lies in [@start, @end) are swapped out.
+ *
+ * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * as long as the inode doesn't go away and racy results are not a problem.
+ */
+unsigned long shmem_partial_swap_usage(struct address_space *mapping,
+ pgoff_t start, pgoff_t end)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ struct page *page;
+ unsigned long swapped = 0;
+
+ rcu_read_lock();
+
+restart:
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+ if (iter.index >= end) /* @end is exclusive */
+ break;
+
+ page = radix_tree_deref_slot(slot);
+
+ /*
+ * This should only be possible to happen at index 0, so we
+ * don't need to reset the counter, nor do we risk infinite
+ * restarts.
+ */
+ if (radix_tree_deref_retry(page))
+ goto restart;
+
+ if (radix_tree_exceptional_entry(page)) /* counted as a swapped-out page */
+ swapped++;
+
+ if (need_resched()) {
+ cond_resched_rcu();
+ start = iter.index + 1; /* resume after the slot just examined */
+ goto restart;
+ }
+ }
+
+ rcu_read_unlock();
+
+ return swapped << PAGE_SHIFT; /* pages -> bytes */
+}
+
+/*
+ * Report, in bytes, how much of the shmem object backing @vma is currently
+ * swapped out within the range that @vma maps.
+ *
+ * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU,
+ * as long as the inode doesn't go away and racy results are not a problem.
+ */
+unsigned long shmem_swap_usage(struct vm_area_struct *vma)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ pgoff_t first, last;
+ unsigned long nr_swapped;
+
+ /* info->lock is not held: take a single snapshot of the counter */
+ nr_swapped = READ_ONCE(info->swapped);
+ if (!nr_swapped)
+ return 0;
+
+ /*
+ * A vma that starts at offset 0 and is at least i_size long maps the
+ * whole object, so the per-inode counter is already the answer.
+ */
+ if (vma->vm_pgoff == 0 &&
+ vma->vm_end - vma->vm_start >= inode->i_size)
+ return nr_swapped << PAGE_SHIFT;
+
+ /* Partial mapping: count swapped pages in the mapped index range */
+ first = linear_page_index(vma, vma->vm_start);
+ last = linear_page_index(vma, vma->vm_end);
+
+ return shmem_partial_swap_usage(inode->i_mapping, first, last);
+}
+
+/*
* SysV IPC SHM_UNLOCK restore Unevictable pages to their evictable lists.
*/
void shmem_unlock_mapping(struct address_space *mapping)
@@ -3064,7 +3145,7 @@ static int shmem_init_inodecache(void)
{
shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
sizeof(struct shmem_inode_info),
- 0, SLAB_PANIC, shmem_init_inode);
+ 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
return 0;
}
diff --git a/mm/slab.c b/mm/slab.c
index 4765c97ce690..6ecc697a8bc4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2756,6 +2756,21 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#define cache_free_debugcheck(x,objp,z) (objp)
#endif
+/* Return the first slab on @n's partial list, else its free list, else NULL. */
+static struct page *get_first_slab(struct kmem_cache_node *n)
+{
+ struct page *slab;
+
+ slab = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
+ if (slab)
+ return slab;
+
+ /* Partial list is empty: note that the free list is being used */
+ n->free_touched = 1;
+
+ return list_first_entry_or_null(&n->slabs_free, struct page, lru);
+}
+
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
bool force_refill)
{
@@ -2791,18 +2806,12 @@ retry:
}
while (batchcount > 0) {
- struct list_head *entry;
struct page *page;
/* Get slab alloc is to come from. */
- entry = n->slabs_partial.next;
- if (entry == &n->slabs_partial) {
- n->free_touched = 1;
- entry = n->slabs_free.next;
- if (entry == &n->slabs_free)
- goto must_grow;
- }
+ page = get_first_slab(n);
+ if (!page)
+ goto must_grow;
- page = list_entry(entry, struct page, lru);
check_spinlock_acquired(cachep);
/*
@@ -3085,7 +3094,6 @@ retry:
static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
- struct list_head *entry;
struct page *page;
struct kmem_cache_node *n;
void *obj;
@@ -3098,15 +3106,10 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
retry:
check_irq_off();
spin_lock(&n->list_lock);
- entry = n->slabs_partial.next;
- if (entry == &n->slabs_partial) {
- n->free_touched = 1;
- entry = n->slabs_free.next;
- if (entry == &n->slabs_free)
- goto must_grow;
- }
+ page = get_first_slab(n);
+ if (!page)
+ goto must_grow;
- page = list_entry(entry, struct page, lru);
check_spinlock_acquired_node(cachep, nodeid);
STATS_INC_NODEALLOCS(cachep);
@@ -3338,17 +3341,12 @@ free_done:
#if STATS
{
int i = 0;
- struct list_head *p;
-
- p = n->slabs_free.next;
- while (p != &(n->slabs_free)) {
- struct page *page;
+ struct page *page;
- page = list_entry(p, struct page, lru);
+ list_for_each_entry(page, &n->slabs_free, lru) {
BUG_ON(page->active);
i++;
- p = p->next;
}
STATS_SET_FREEABLE(cachep, i);
}
diff --git a/mm/slab.h b/mm/slab.h
index 7b6087197997..c63b8699cfa3 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -128,10 +128,11 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#if defined(CONFIG_SLAB)
#define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
- SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
+ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
+ SLAB_NOTRACK | SLAB_ACCOUNT)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_NOTRACK)
+ SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
#else
#define SLAB_CACHE_FLAGS (0)
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3c6a86b4ec25..e016178063e1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,7 +37,8 @@ struct kmem_cache *kmem_cache;
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB)
-#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK)
+#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
+ SLAB_NOTRACK | SLAB_ACCOUNT)
/*
* Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/slub.c b/mm/slub.c
index 46997517406e..2d0e610d195a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -5362,6 +5362,8 @@ static char *create_unique_id(struct kmem_cache *s)
*p++ = 'F';
if (!(s->flags & SLAB_NOTRACK))
*p++ = 't';
+ if (s->flags & SLAB_ACCOUNT)
+ *p++ = 'A';
if (p != name + 1)
*p++ = '-';
p += sprintf(p, "%07d", s->size);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 58877312cf6b..e6b8591a3ed2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -165,8 +165,6 @@ static void discard_swap_cluster(struct swap_info_struct *si,
int found_extent = 0;
while (nr_pages) {
- struct list_head *lh;
-
if (se->start_page <= start_page &&
start_page < se->start_page + se->nr_pages) {
pgoff_t offset = start_page - se->start_page;
@@ -188,8 +186,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
break;
}
- lh = se->list.next;
- se = list_entry(lh, struct swap_extent, list);
+ se = list_next_entry(se, list);
}
}
@@ -903,7 +900,7 @@ int swp_swapcount(swp_entry_t entry)
VM_BUG_ON(page_private(page) != SWP_CONTINUED);
do {
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
map = kmap_atomic(page);
tmp_count = map[offset];
kunmap_atomic(map);
@@ -1633,14 +1630,11 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
se = start_se;
for ( ; ; ) {
- struct list_head *lh;
-
if (se->start_page <= offset &&
offset < (se->start_page + se->nr_pages)) {
return se->start_block + (offset - se->start_page);
}
- lh = se->list.next;
- se = list_entry(lh, struct swap_extent, list);
+ se = list_next_entry(se, list);
sis->curr_swap_extent = se;
BUG_ON(se == start_se); /* It *must* be present */
}
@@ -1664,7 +1658,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
while (!list_empty(&sis->first_swap_extent.list)) {
struct swap_extent *se;
- se = list_entry(sis->first_swap_extent.list.next,
+ se = list_first_entry(&sis->first_swap_extent.list,
struct swap_extent, list);
list_del(&se->list);
kfree(se);
@@ -2959,11 +2953,10 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
struct page *head;
head = vmalloc_to_page(si->swap_map + offset);
if (page_private(head)) {
- struct list_head *this, *next;
- list_for_each_safe(this, next, &head->lru) {
- struct page *page;
- page = list_entry(this, struct page, lru);
- list_del(this);
+ struct page *page, *next;
+
+ list_for_each_entry_safe(page, next, &head->lru, lru) {
+ list_del(&page->lru);
__free_page(page);
}
}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 8e3c9c5a3042..58ceeb107960 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -441,8 +441,7 @@ nocache:
if (list_is_last(&first->list, &vmap_area_list))
goto found;
- first = list_entry(first->list.next,
- struct vmap_area, list);
+ first = list_next_entry(first, list);
}
found:
@@ -1477,13 +1476,10 @@ static void __vunmap(const void *addr, int deallocate_pages)
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_page(page);
+ __free_kmem_pages(page, 0);
}
- if (area->flags & VM_VPAGES)
- vfree(area->pages);
- else
- kfree(area->pages);
+ kvfree(area->pages);
}
kfree(area);
@@ -1593,7 +1589,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
PAGE_KERNEL, node, area->caller);
- area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -1608,9 +1603,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
if (node == NUMA_NO_NODE)
- page = alloc_page(alloc_mask);
+ page = alloc_kmem_pages(alloc_mask, order);
else
- page = alloc_pages_node(node, alloc_mask, order);
+ page = alloc_kmem_pages_node(node, alloc_mask, order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
@@ -2559,10 +2554,10 @@ static void *s_start(struct seq_file *m, loff_t *pos)
struct vmap_area *va;
spin_lock(&vmap_area_lock);
- va = list_entry((&vmap_area_list)->next, typeof(*va), list);
+ va = list_first_entry(&vmap_area_list, typeof(*va), list);
while (n > 0 && &va->list != &vmap_area_list) {
n--;
- va = list_entry(va->list.next, typeof(*va), list);
+ va = list_next_entry(va, list);
}
if (!n && &va->list != &vmap_area_list)
return va;
@@ -2576,7 +2571,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
struct vmap_area *va = p, *next;
++*pos;
- next = list_entry(va->list.next, typeof(*va), list);
+ next = list_next_entry(va, list);
if (&next->list != &vmap_area_list)
return next;
@@ -2651,7 +2646,7 @@ static int s_show(struct seq_file *m, void *p)
if (v->flags & VM_USERMAP)
seq_puts(m, " user");
- if (v->flags & VM_VPAGES)
+ if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
show_numa_info(m, v);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c5afd573d7da..9a6c0704211c 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -137,14 +137,11 @@ struct vmpressure_event {
};
static bool vmpressure_event(struct vmpressure *vmpr,
- unsigned long scanned, unsigned long reclaimed)
+ enum vmpressure_levels level)
{
struct vmpressure_event *ev;
- enum vmpressure_levels level;
bool signalled = false;
- level = vmpressure_calc_level(scanned, reclaimed);
-
mutex_lock(&vmpr->events_lock);
list_for_each_entry(ev, &vmpr->events, node) {
@@ -164,6 +161,7 @@ static void vmpressure_work_fn(struct work_struct *work)
struct vmpressure *vmpr = work_to_vmpressure(work);
unsigned long scanned;
unsigned long reclaimed;
+ enum vmpressure_levels level;
spin_lock(&vmpr->sr_lock);
/*
@@ -174,19 +172,21 @@ static void vmpressure_work_fn(struct work_struct *work)
* here. No need for any locks here since we don't care if
* vmpr->reclaimed is in sync.
*/
- scanned = vmpr->scanned;
+ scanned = vmpr->tree_scanned;
if (!scanned) {
spin_unlock(&vmpr->sr_lock);
return;
}
- reclaimed = vmpr->reclaimed;
- vmpr->scanned = 0;
- vmpr->reclaimed = 0;
+ reclaimed = vmpr->tree_reclaimed;
+ vmpr->tree_scanned = 0;
+ vmpr->tree_reclaimed = 0;
spin_unlock(&vmpr->sr_lock);
+ level = vmpressure_calc_level(scanned, reclaimed);
+
do {
- if (vmpressure_event(vmpr, scanned, reclaimed))
+ if (vmpressure_event(vmpr, level))
break;
/*
* If not handled, propagate the event upward into the
@@ -199,6 +199,7 @@ static void vmpressure_work_fn(struct work_struct *work)
* vmpressure() - Account memory pressure through scanned/reclaimed ratio
* @gfp: reclaimer's gfp mask
* @memcg: cgroup memory controller handle
+ * @tree: legacy subtree mode
* @scanned: number of pages scanned
* @reclaimed: number of pages reclaimed
*
@@ -206,9 +207,16 @@ static void vmpressure_work_fn(struct work_struct *work)
* "instantaneous" memory pressure (scanned/reclaimed ratio). The raw
* pressure index is then further refined and averaged over time.
*
+ * If @tree is set, vmpressure is in traditional userspace reporting
+ * mode: @memcg is considered the pressure root and userspace is
+ * notified of the entire subtree's reclaim efficiency.
+ *
+ * If @tree is not set, reclaim efficiency is recorded for @memcg, and
+ * only in-kernel users are notified.
+ *
* This function does not return any value.
*/
-void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
+void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
unsigned long scanned, unsigned long reclaimed)
{
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
@@ -238,15 +246,47 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
if (!scanned)
return;
- spin_lock(&vmpr->sr_lock);
- vmpr->scanned += scanned;
- vmpr->reclaimed += reclaimed;
- scanned = vmpr->scanned;
- spin_unlock(&vmpr->sr_lock);
+ if (tree) {
+ spin_lock(&vmpr->sr_lock);
+ vmpr->tree_scanned += scanned;
+ vmpr->tree_reclaimed += reclaimed;
+ scanned = vmpr->tree_scanned; /* subtree total, not vmpr->scanned */
+ spin_unlock(&vmpr->sr_lock);
- if (scanned < vmpressure_win)
- return;
- schedule_work(&vmpr->work);
+ if (scanned < vmpressure_win) /* window not yet filled */
+ return;
+ schedule_work(&vmpr->work);
+ } else {
+ enum vmpressure_levels level;
+
+ /* For now, no users for root-level efficiency */
+ if (!memcg || memcg == root_mem_cgroup)
+ return;
+
+ spin_lock(&vmpr->sr_lock);
+ scanned = vmpr->scanned += scanned;
+ reclaimed = vmpr->reclaimed += reclaimed;
+ if (scanned < vmpressure_win) {
+ spin_unlock(&vmpr->sr_lock);
+ return;
+ }
+ vmpr->scanned = vmpr->reclaimed = 0;
+ spin_unlock(&vmpr->sr_lock);
+
+ level = vmpressure_calc_level(scanned, reclaimed);
+
+ if (level > VMPRESSURE_LOW) {
+ /*
+ * Let the socket buffer allocator know that
+ * we are having trouble reclaiming LRU pages.
+ *
+ * For hysteresis keep the pressure state
+ * asserted for a second in which subsequent
+ * pressure events can occur.
+ */
+ memcg->socket_pressure = jiffies + HZ;
+ }
+ }
}
/**
@@ -276,7 +316,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
* to the vmpressure() basically means that we signal 'critical'
* level.
*/
- vmpressure(gfp, memcg, vmpressure_win, 0);
+ vmpressure(gfp, memcg, true, vmpressure_win, 0);
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2aec4241b42a..108bd119f2f6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -106,8 +106,6 @@ struct scan_control {
unsigned long nr_reclaimed;
};
-#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
-
#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field) \
do { \
@@ -197,11 +195,13 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
unsigned long nr;
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
- zone_page_state(zone, NR_INACTIVE_FILE);
+ zone_page_state(zone, NR_INACTIVE_FILE) +
+ zone_page_state(zone, NR_ISOLATED_FILE);
if (get_nr_swap_pages() > 0)
nr += zone_page_state(zone, NR_ACTIVE_ANON) +
- zone_page_state(zone, NR_INACTIVE_ANON);
+ zone_page_state(zone, NR_INACTIVE_ANON) +
+ zone_page_state(zone, NR_ISOLATED_ANON);
return nr;
}
@@ -594,7 +594,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
- trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
+ trace_mm_vmscan_writepage(page);
inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
@@ -1426,6 +1426,7 @@ int isolate_lru_page(struct page *page)
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
+ VM_BUG_ON_PAGE(PageTail(page), page);
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
@@ -1691,11 +1692,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
current_may_throttle())
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
- trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
- zone_idx(zone),
- nr_scanned, nr_reclaimed,
- sc->priority,
- trace_shrink_flags(file));
+ trace_mm_vmscan_lru_shrink_inactive(zone, nr_scanned, nr_reclaimed,
+ sc->priority, file);
return nr_reclaimed;
}
@@ -2046,10 +2044,16 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
}
/*
- * There is enough inactive page cache, do not reclaim
- * anything from the anonymous working set right now.
+ * If there is enough inactive page cache, i.e. if the size of the
+ * inactive list is greater than that of the active list *and* the
+ * inactive list actually has some pages to scan on this priority, we
+ * do not reclaim anything from the anonymous working set right now.
+ * Without the second condition we could end up never scanning an
+ * lruvec even if it has plenty of old anonymous pages unless the
+ * system is under heavy pressure.
*/
- if (!inactive_file_is_low(lruvec)) {
+ if (!inactive_file_is_low(lruvec) &&
+ get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
}
@@ -2393,6 +2397,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
+ unsigned long reclaimed;
unsigned long scanned;
struct lruvec *lruvec;
int swappiness;
@@ -2405,6 +2410,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
+ reclaimed = sc->nr_reclaimed;
scanned = sc->nr_scanned;
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
@@ -2415,6 +2421,11 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg, sc->nr_scanned - scanned,
lru_pages);
+ /* Record the group's reclaim efficiency */
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
@@ -2446,7 +2457,8 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
reclaim_state->reclaimed_slab = 0;
}
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
+ /* Record the subtree's reclaim efficiency */
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
sc->nr_scanned - nr_scanned,
sc->nr_reclaimed - nr_reclaimed);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c54fd2924f25..83a003bc3cae 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -460,7 +460,7 @@ static int fold_diff(int *diff)
*
* The function returns the number of global counters updated.
*/
-static int refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(bool do_pagesets)
{
struct zone *zone;
int i;
@@ -484,33 +484,35 @@ static int refresh_cpu_vm_stats(void)
#endif
}
}
- cond_resched();
#ifdef CONFIG_NUMA
- /*
- * Deal with draining the remote pageset of this
- * processor
- *
- * Check if there are pages remaining in this pageset
- * if not then there is nothing to expire.
- */
- if (!__this_cpu_read(p->expire) ||
+ if (do_pagesets) {
+ cond_resched();
+ /*
+ * Deal with draining the remote pageset of this
+ * processor
+ *
+ * Check if there are pages remaining in this pageset
+ * if not then there is nothing to expire.
+ */
+ if (!__this_cpu_read(p->expire) ||
!__this_cpu_read(p->pcp.count))
- continue;
+ continue;
- /*
- * We never drain zones local to this processor.
- */
- if (zone_to_nid(zone) == numa_node_id()) {
- __this_cpu_write(p->expire, 0);
- continue;
- }
+ /*
+ * We never drain zones local to this processor.
+ */
+ if (zone_to_nid(zone) == numa_node_id()) {
+ __this_cpu_write(p->expire, 0);
+ continue;
+ }
- if (__this_cpu_dec_return(p->expire))
- continue;
+ if (__this_cpu_dec_return(p->expire))
+ continue;
- if (__this_cpu_read(p->pcp.count)) {
- drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
- changes++;
+ if (__this_cpu_read(p->pcp.count)) {
+ drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ changes++;
+ }
}
#endif
}
@@ -1386,7 +1388,7 @@ static cpumask_var_t cpu_stat_off;
static void vmstat_update(struct work_struct *w)
{
- if (refresh_cpu_vm_stats()) {
+ if (refresh_cpu_vm_stats(true)) {
/*
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
@@ -1418,6 +1420,23 @@ static void vmstat_update(struct work_struct *w)
}
/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
+void quiet_vmstat(void)
+{
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ do {
+ if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ cancel_delayed_work(this_cpu_ptr(&vmstat_work));
+
+ } while (refresh_cpu_vm_stats(false));
+}
+
+/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
*/
@@ -1449,7 +1468,7 @@ static bool need_update(int cpu)
*/
static void vmstat_shepherd(struct work_struct *w);
-static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
+static DECLARE_DEFERRABLE_WORK(shepherd, vmstat_shepherd);
static void vmstat_shepherd(struct work_struct *w)
{
diff --git a/mm/zbud.c b/mm/zbud.c
index d8a181fd779b..b42322e50f63 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -463,9 +463,6 @@ void zbud_free(struct zbud_pool *pool, unsigned long handle)
spin_unlock(&pool->lock);
}
-#define list_tail_entry(ptr, type, member) \
- list_entry((ptr)->prev, type, member)
-
/**
* zbud_reclaim_page() - evicts allocations from a pool page and frees it
* @pool: pool from which a page will attempt to be evicted
@@ -514,7 +511,7 @@ int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
return -EINVAL;
}
for (i = 0; i < retries; i++) {
- zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
+ zhdr = list_last_entry(&pool->lru, struct zbud_header, lru);
list_del(&zhdr->lru);
list_del(&zhdr->buddy);
/* Protect zbud page against free */
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9f15bdd9163c..e7414cec220b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -213,10 +213,10 @@ struct size_class {
int size;
unsigned int index;
- /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
- int pages_per_zspage;
struct zs_size_stat stats;
+ /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
+ int pages_per_zspage;
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
bool huge;
};