summary | refs | log | tree | commit | diff | stats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig | 1
-rw-r--r-- mm/Kconfig.debug | 1
-rw-r--r-- mm/backing-dev.c | 186
-rw-r--r-- mm/cma.c | 31
-rw-r--r-- mm/cma.h | 1
-rw-r--r-- mm/cma_debug.c | 2
-rw-r--r-- mm/compaction.c | 6
-rw-r--r-- mm/filemap.c | 75
-rw-r--r-- mm/gup.c | 150
-rw-r--r-- mm/huge_memory.c | 99
-rw-r--r-- mm/hwpoison-inject.c | 3
-rw-r--r-- mm/internal.h | 17
-rw-r--r-- mm/kasan/kasan.c | 3
-rw-r--r-- mm/kasan/kasan.h | 2
-rw-r--r-- mm/kasan/report.c | 187
-rw-r--r-- mm/khugepaged.c | 12
-rw-r--r-- mm/ksm.c | 16
-rw-r--r-- mm/madvise.c | 56
-rw-r--r-- mm/memblock.c | 56
-rw-r--r-- mm/memcontrol.c | 248
-rw-r--r-- mm/memory-failure.c | 79
-rw-r--r-- mm/memory.c | 2
-rw-r--r-- mm/memory_hotplug.c | 6
-rw-r--r-- mm/migrate.c | 12
-rw-r--r-- mm/mlock.c | 6
-rw-r--r-- mm/mmap.c | 2
-rw-r--r-- mm/oom_kill.c | 2
-rw-r--r-- mm/page-writeback.c | 15
-rw-r--r-- mm/page_alloc.c | 120
-rw-r--r-- mm/page_ext.c | 13
-rw-r--r-- mm/page_idle.c | 4
-rw-r--r-- mm/page_isolation.c | 6
-rw-r--r-- mm/page_poison.c | 77
-rw-r--r-- mm/percpu.c | 40
-rw-r--r-- mm/rmap.c | 148
-rw-r--r-- mm/rodata_test.c | 17
-rw-r--r-- mm/slab.c | 7
-rw-r--r-- mm/sparse.c | 5
-rw-r--r-- mm/swap.c | 59
-rw-r--r-- mm/swap_slots.c | 4
-rw-r--r-- mm/swap_state.c | 12
-rw-r--r-- mm/swapfile.c | 35
-rw-r--r-- mm/truncate.c | 13
-rw-r--r-- mm/usercopy.c | 19
-rw-r--r-- mm/vmalloc.c | 2
-rw-r--r-- mm/vmscan.c | 508
-rw-r--r-- mm/vmstat.c | 75
-rw-r--r-- mm/workingset.c | 6
-rw-r--r-- mm/z3fold.c | 9
-rw-r--r-- mm/zsmalloc.c | 2
50 files changed, 1338 insertions, 1119 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 9b8fccb969dc..beb7a455915d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -312,7 +312,6 @@ config NEED_BOUNCE_POOL
config NR_QUICK
int
depends on QUICKLIST
- default "2" if AVR32
default "1"
config VIRT_TO_BUS
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 79d0fd13b5b3..5b0adf1435de 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
config PAGE_POISONING
bool "Poison pages after freeing"
- select PAGE_EXTENSION
select PAGE_POISONING_NO_SANITY if HIBERNATION
---help---
Fill the pages with poison patterns after free_pages() and verify
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c6f2a37028c2..f028a9a472fd 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -12,8 +12,6 @@
#include <linux/device.h>
#include <trace/events/writeback.h>
-static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
@@ -242,6 +240,8 @@ static __init int bdi_class_init(void)
}
postcore_initcall(bdi_class_init);
+static int bdi_init(struct backing_dev_info *bdi);
+
static int __init default_bdi_init(void)
{
int err;
@@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
+ if (wb != &bdi->wb)
+ bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
wb->dirty_sleep = jiffies;
wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested)
- return -ENOMEM;
+ if (!wb->congested) {
+ err = -ENOMEM;
+ goto out_put_bdi;
+ }
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
@@ -335,9 +339,14 @@ out_destroy_stat:
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
wb_congested_put(wb->congested);
+out_put_bdi:
+ if (wb != &bdi->wb)
+ bdi_put(bdi);
return err;
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);
+
/*
* Remove bdi from the global list and shutdown any threads we have running
*/
@@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb)
spin_lock_bh(&wb->work_lock);
if (!test_and_clear_bit(WB_registered, &wb->state)) {
spin_unlock_bh(&wb->work_lock);
+ /*
+ * Wait for wb shutdown to finish if someone else is just
+ * running wb_shutdown(). Otherwise we could proceed to wb /
+ * bdi destruction before wb_shutdown() is finished.
+ */
+ wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE);
return;
}
+ set_bit(WB_shutting_down, &wb->state);
spin_unlock_bh(&wb->work_lock);
+ cgwb_remove_from_bdi_list(wb);
/*
* Drain work list and shutdown the delayed_work. !WB_registered
* tells wb_workfn() that @wb is dying and its work_list needs to
@@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb)
mod_delayed_work(bdi_wq, &wb->dwork, 0);
flush_delayed_work(&wb->dwork);
WARN_ON(!list_empty(&wb->work_list));
+ /*
+ * Make sure bit gets cleared after shutdown is finished. Matches with
+ * the barrier provided by test_and_clear_bit() above.
+ */
+ smp_wmb();
+ clear_bit(WB_shutting_down, &wb->state);
}
static void wb_exit(struct bdi_writeback *wb)
@@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb)
fprop_local_destroy_percpu(&wb->completions);
wb_congested_put(wb->congested);
+ if (wb != &wb->bdi->wb)
+ bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb)
/*
* cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
* blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected. cgwb_release_wait is used to wait for the completion of cgwb
- * releases from bdi destruction path.
+ * protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
-static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
/**
* wb_congested_get_create - get or create a wb_congested
@@ -438,7 +461,7 @@ retry:
return NULL;
atomic_set(&new_congested->refcnt, 0);
- new_congested->bdi = bdi;
+ new_congested->__bdi = bdi;
new_congested->blkcg_id = blkcg_id;
goto retry;
@@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested)
}
/* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->bdi) {
+ if (congested->__bdi) {
rb_erase(&congested->rb_node,
- &congested->bdi->cgwb_congested_tree);
- congested->bdi = NULL;
+ &congested->__bdi->cgwb_congested_tree);
+ congested->__bdi = NULL;
}
spin_unlock_irqrestore(&cgwb_lock, flags);
@@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct backing_dev_info *bdi = wb->bdi;
-
- spin_lock_irq(&cgwb_lock);
- list_del_rcu(&wb->bdi_node);
- spin_unlock_irq(&cgwb_lock);
wb_shutdown(wb);
@@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work)
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
kfree_rcu(wb, rcu);
-
- if (atomic_dec_and_test(&bdi->usage_cnt))
- wake_up_all(&cgwb_release_wait);
}
static void cgwb_release(struct percpu_ref *refcnt)
@@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb)
percpu_ref_kill(&wb->refcnt);
}
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_del_rcu(&wb->bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+}
+
static int cgwb_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
@@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
/* we might have raced another instance of this function */
ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
if (!ret) {
- atomic_inc(&bdi->usage_cnt);
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
@@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
- atomic_set(&bdi->usage_cnt, 1);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
@@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return ret;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
struct radix_tree_iter iter;
void **slot;
+ struct bdi_writeback *wb;
WARN_ON(test_bit(WB_registered, &bdi->wb.state));
spin_lock_irq(&cgwb_lock);
radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
cgwb_kill(*slot);
- spin_unlock_irq(&cgwb_lock);
- /*
- * All cgwb's must be shutdown and released before returning. Drain
- * the usage counter to wait for all cgwb's ever created on @bdi.
- */
- atomic_dec(&bdi->usage_cnt);
- wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
- /*
- * Grab back our reference so that we hold it when @bdi gets
- * re-registered.
- */
- atomic_inc(&bdi->usage_cnt);
+ while (!list_empty(&bdi->wb_list)) {
+ wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
+ bdi_node);
+ spin_unlock_irq(&cgwb_lock);
+ wb_shutdown(wb);
+ spin_lock_irq(&cgwb_lock);
+ }
+ spin_unlock_irq(&cgwb_lock);
}
/**
@@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi)
rb_entry(rbn, struct bdi_writeback_congested, rb_node);
rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->bdi = NULL; /* mark @congested unlinked */
+ congested->__bdi = NULL; /* mark @congested unlinked */
}
spin_unlock_irq(&cgwb_lock);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ spin_lock_irq(&cgwb_lock);
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+ spin_unlock_irq(&cgwb_lock);
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
@@ -777,16 +801,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
return 0;
}
-static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
static void cgwb_bdi_exit(struct backing_dev_info *bdi)
{
wb_congested_put(bdi->wb_congested);
}
+static void cgwb_bdi_register(struct backing_dev_info *bdi)
+{
+ list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+}
+
+static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
+{
+ list_del_rcu(&wb->bdi_node);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
-int bdi_init(struct backing_dev_info *bdi)
+static int bdi_init(struct backing_dev_info *bdi)
{
int ret;
@@ -802,11 +836,8 @@ int bdi_init(struct backing_dev_info *bdi)
ret = cgwb_bdi_init(bdi);
- list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
-
return ret;
}
-EXPORT_SYMBOL(bdi_init);
struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
{
@@ -823,22 +854,20 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
return bdi;
}
+EXPORT_SYMBOL(bdi_alloc_node);
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
- const char *fmt, ...)
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
- va_list args;
struct device *dev;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
- va_start(args, fmt);
- dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
- va_end(args);
+ dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
if (IS_ERR(dev))
return PTR_ERR(dev);
+ cgwb_bdi_register(bdi);
bdi->dev = dev;
bdi_debug_register(bdi, dev_name(dev));
@@ -851,20 +880,25 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
trace_writeback_bdi_register(bdi);
return 0;
}
-EXPORT_SYMBOL(bdi_register);
+EXPORT_SYMBOL(bdi_register_va);
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
- return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ va_list args;
+ int ret;
+
+ va_start(args, fmt);
+ ret = bdi_register_va(bdi, fmt, args);
+ va_end(args);
+ return ret;
}
-EXPORT_SYMBOL(bdi_register_dev);
+EXPORT_SYMBOL(bdi_register);
int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
{
int rc;
- rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt),
- MINOR(owner->devt));
+ rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
if (rc)
return rc;
/* Leaking owner reference... */
@@ -892,7 +926,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
wb_shutdown(&bdi->wb);
- cgwb_bdi_destroy(bdi);
+ cgwb_bdi_unregister(bdi);
if (bdi->dev) {
bdi_debug_unregister(bdi);
@@ -906,19 +940,16 @@ void bdi_unregister(struct backing_dev_info *bdi)
}
}
-static void bdi_exit(struct backing_dev_info *bdi)
-{
- WARN_ON_ONCE(bdi->dev);
- wb_exit(&bdi->wb);
- cgwb_bdi_exit(bdi);
-}
-
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
- bdi_exit(bdi);
+ if (test_bit(WB_registered, &bdi->wb.state))
+ bdi_unregister(bdi);
+ WARN_ON_ONCE(bdi->dev);
+ wb_exit(&bdi->wb);
+ cgwb_bdi_exit(bdi);
kfree(bdi);
}
@@ -926,38 +957,7 @@ void bdi_put(struct backing_dev_info *bdi)
{
kref_put(&bdi->refcnt, release_bdi);
}
-
-void bdi_destroy(struct backing_dev_info *bdi)
-{
- bdi_unregister(bdi);
- bdi_exit(bdi);
-}
-EXPORT_SYMBOL(bdi_destroy);
-
-/*
- * For use from filesystems to quickly init and register a bdi associated
- * with dirty writeback
- */
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
-{
- int err;
-
- bdi->name = name;
- bdi->capabilities = 0;
- err = bdi_init(bdi);
- if (err)
- return err;
-
- err = bdi_register(bdi, NULL, "%.28s-%ld", name,
- atomic_long_inc_return(&bdi_seq));
- if (err) {
- bdi_destroy(bdi);
- return err;
- }
-
- return 0;
-}
-EXPORT_SYMBOL(bdi_setup_and_register);
+EXPORT_SYMBOL(bdi_put);
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
diff --git a/mm/cma.c b/mm/cma.c
index a6033e344430..978b4a1441ef 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -53,6 +53,11 @@ unsigned long cma_get_size(const struct cma *cma)
return cma->count << PAGE_SHIFT;
}
+const char *cma_get_name(const struct cma *cma)
+{
+ return cma->name ? cma->name : "(undefined)";
+}
+
static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
int align_order)
{
@@ -168,6 +173,7 @@ core_initcall(cma_init_reserved_areas);
*/
int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
unsigned int order_per_bit,
+ const char *name,
struct cma **res_cma)
{
struct cma *cma;
@@ -198,6 +204,13 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
* subsystems (like slab allocator) are available.
*/
cma = &cma_areas[cma_area_count];
+ if (name) {
+ cma->name = name;
+ } else {
+ cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count);
+ if (!cma->name)
+ return -ENOMEM;
+ }
cma->base_pfn = PFN_DOWN(base);
cma->count = size >> PAGE_SHIFT;
cma->order_per_bit = order_per_bit;
@@ -229,7 +242,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
int __init cma_declare_contiguous(phys_addr_t base,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
- bool fixed, struct cma **res_cma)
+ bool fixed, const char *name, struct cma **res_cma)
{
phys_addr_t memblock_end = memblock_end_of_DRAM();
phys_addr_t highmem_start;
@@ -335,7 +348,7 @@ int __init cma_declare_contiguous(phys_addr_t base,
base = addr;
}
- ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma);
+ ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
if (ret)
goto err;
@@ -491,3 +504,17 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
return true;
}
+
+int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
+{
+ int i;
+
+ for (i = 0; i < cma_area_count; i++) {
+ int ret = it(&cma_areas[i], data);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
diff --git a/mm/cma.h b/mm/cma.h
index 17c75a4246c8..49861286279d 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -11,6 +11,7 @@ struct cma {
struct hlist_head mem_head;
spinlock_t mem_head_lock;
#endif
+ const char *name;
};
extern struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index ffc0c3d0ae64..595b757bef72 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
char name[16];
int u32s;
- sprintf(name, "cma-%d", idx);
+ sprintf(name, "cma-%s", cma->name);
tmp = debugfs_create_dir(name, cma_debugfs_root);
diff --git a/mm/compaction.c b/mm/compaction.c
index 81e1eaa2a2cf..09c5282ebdd2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -992,9 +992,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
static bool suitable_migration_target(struct compact_control *cc,
struct page *page)
{
- if (cc->ignore_block_suitable)
- return true;
-
/* If the page is a large free page, then disallow migration */
if (PageBuddy(page)) {
/*
@@ -1006,6 +1003,9 @@ static bool suitable_migration_target(struct compact_control *cc,
return false;
}
+ if (cc->ignore_block_suitable)
+ return true;
+
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
if (migrate_async_suitable(get_pageblock_migratetype(page)))
return true;
diff --git a/mm/filemap.c b/mm/filemap.c
index 1694623a6289..681da61080bc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -519,7 +519,7 @@ EXPORT_SYMBOL(filemap_write_and_wait);
*
* Write out and wait upon file offsets lstart->lend, inclusive.
*
- * Note that `lend' is inclusive (describes the last byte to be written) so
+ * Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file (end = -1).
*/
int filemap_write_and_wait_range(struct address_space *mapping,
@@ -1277,12 +1277,14 @@ EXPORT_SYMBOL(find_lock_entry);
*
* PCG flags modify how the page is returned.
*
- * FGP_ACCESSED: the page will be marked accessed
- * FGP_LOCK: Page is return locked
- * FGP_CREAT: If page is not present then a new page is allocated using
- * @gfp_mask and added to the page cache and the VM's LRU
- * list. The page is returned locked and with an increased
- * refcount. Otherwise, %NULL is returned.
+ * @fgp_flags can be:
+ *
+ * - FGP_ACCESSED: the page will be marked accessed
+ * - FGP_LOCK: Page is returned locked
+ * - FGP_CREAT: If page is not present then a new page is allocated using
+ * @gfp_mask and added to the page cache and the VM's LRU
+ * list. The page is returned locked and with an increased
+ * refcount. Otherwise, NULL is returned.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
@@ -2033,7 +2035,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- struct iov_iter data = *iter;
loff_t size;
size = i_size_read(inode);
@@ -2044,11 +2045,12 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
file_accessed(file);
- retval = mapping->a_ops->direct_IO(iocb, &data);
+ retval = mapping->a_ops->direct_IO(iocb, iter);
if (retval >= 0) {
iocb->ki_pos += retval;
- iov_iter_advance(iter, retval);
+ count -= retval;
}
+ iov_iter_revert(iter, iov_iter_count(iter) - count);
/*
* Btrfs can have a short DIO read if we encounter
@@ -2059,7 +2061,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
- if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size ||
+ if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
goto out;
}
@@ -2202,12 +2204,12 @@ int filemap_fault(struct vm_fault *vmf)
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
+ pgoff_t max_off;
struct page *page;
- loff_t size;
int ret = 0;
- size = round_up(i_size_read(inode), PAGE_SIZE);
- if (offset >= size >> PAGE_SHIFT)
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off))
return VM_FAULT_SIGBUS;
/*
@@ -2256,8 +2258,8 @@ retry_find:
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
- size = round_up(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= size >> PAGE_SHIFT)) {
+ max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
+ if (unlikely(offset >= max_off)) {
unlock_page(page);
put_page(page);
return VM_FAULT_SIGBUS;
@@ -2323,7 +2325,7 @@ void filemap_map_pages(struct vm_fault *vmf,
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
- loff_t size;
+ unsigned long max_idx;
struct page *head, *page;
rcu_read_lock();
@@ -2369,8 +2371,8 @@ repeat:
if (page->mapping != mapping || !PageUptodate(page))
goto unlock;
- size = round_up(i_size_read(mapping->host), PAGE_SIZE);
- if (page->index >= size >> PAGE_SHIFT)
+ max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+ if (page->index >= max_idx)
goto unlock;
if (file->f_ra.mmap_miss > 0)
@@ -2704,7 +2706,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
ssize_t written;
size_t write_len;
pgoff_t end;
- struct iov_iter data;
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
@@ -2719,22 +2720,19 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
* about to write. We do this *before* the write so that we can return
* without clobbering -EIOCBQUEUED from ->direct_IO().
*/
- if (mapping->nrpages) {
- written = invalidate_inode_pages2_range(mapping,
+ written = invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT, end);
- /*
- * If a page can not be invalidated, return 0 to fall back
- * to buffered write.
- */
- if (written) {
- if (written == -EBUSY)
- return 0;
- goto out;
- }
+ /*
+ * If a page can not be invalidated, return 0 to fall back
+ * to buffered write.
+ */
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ goto out;
}
- data = *from;
- written = mapping->a_ops->direct_IO(iocb, &data);
+ written = mapping->a_ops->direct_IO(iocb, from);
/*
* Finally, try again to invalidate clean pages which might have been
@@ -2744,20 +2742,19 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
* so we don't support it 100%. If this invalidation
* fails, tough, the write still worked...
*/
- if (mapping->nrpages) {
- invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- }
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
if (written > 0) {
pos += written;
- iov_iter_advance(from, written);
+ write_len -= written;
if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
i_size_write(inode, pos);
mark_inode_dirty(inode);
}
iocb->ki_pos = pos;
}
+ iov_iter_revert(from, write_len - iov_iter_count(from));
out:
return written;
}
@@ -3001,7 +2998,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
* @gfp_mask: memory allocation flags (and I/O mode)
*
* The address_space is to try to release any data against the page
- * (presumably at page->private). If the release was successful, return `1'.
+ * (presumably at page->private). If the release was successful, return '1'.
* Otherwise return zero.
*
* This may also be called if PG_fscache is set on a page, indicating that the
diff --git a/mm/gup.c b/mm/gup.c
index 04aa405350dc..d9e6fddcc51f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1189,34 +1189,57 @@ struct page *get_dump_page(unsigned long addr)
*/
#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+#ifndef gup_get_pte
+/*
+ * We assume that the PTE can be read atomically. If this is not the case for
+ * your architecture, please provide the helper.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ return READ_ONCE(*ptep);
+}
+#endif
+
+static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+{
+ while ((*nr) - nr_start) {
+ struct page *page = pages[--(*nr)];
+
+ ClearPageReferenced(page);
+ put_page(page);
+ }
+}
+
#ifdef __HAVE_ARCH_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
int write, struct page **pages, int *nr)
{
+ struct dev_pagemap *pgmap = NULL;
+ int nr_start = *nr, ret = 0;
pte_t *ptep, *ptem;
- int ret = 0;
ptem = ptep = pte_offset_map(&pmd, addr);
do {
- /*
- * In the line below we are assuming that the pte can be read
- * atomically. If this is not the case for your architecture,
- * please wrap this in a helper function!
- *
- * for an example see gup_get_pte in arch/x86/mm/gup.c
- */
- pte_t pte = READ_ONCE(*ptep);
+ pte_t pte = gup_get_pte(ptep);
struct page *head, *page;
/*
* Similar to the PMD case below, NUMA hinting must take slow
* path using the pte_protnone check.
*/
- if (!pte_present(pte) || pte_special(pte) ||
- pte_protnone(pte) || (write && !pte_write(pte)))
+ if (pte_protnone(pte))
goto pte_unmap;
- if (!arch_pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, write))
+ goto pte_unmap;
+
+ if (pte_devmap(pte)) {
+ pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ goto pte_unmap;
+ }
+ } else if (pte_special(pte))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
VM_BUG_ON_PAGE(compound_head(page) != head, page);
+
+ put_dev_pagemap(pgmap);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
}
#endif /* __HAVE_ARCH_PTE_SPECIAL */
+#ifdef __HAVE_ARCH_PTE_DEVMAP
+static int __gup_device_huge(unsigned long pfn, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ int nr_start = *nr;
+ struct dev_pagemap *pgmap = NULL;
+
+ do {
+ struct page *page = pfn_to_page(pfn);
+
+ pgmap = get_dev_pagemap(pfn, pgmap);
+ if (unlikely(!pgmap)) {
+ undo_dev_pagemap(nr, nr_start, pages);
+ return 0;
+ }
+ SetPageReferenced(page);
+ pages[*nr] = page;
+ get_page(page);
+ put_dev_pagemap(pgmap);
+ (*nr)++;
+ pfn++;
+ } while (addr += PAGE_SIZE, addr != end);
+ return 1;
+}
+
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+
+ fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ unsigned long fault_pfn;
+
+ fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ return __gup_device_huge(fault_pfn, addr, end, pages, nr);
+}
+#else
+static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+
+static int __gup_device_huge_pud(pud_t pud, unsigned long addr,
+ unsigned long end, struct page **pages, int *nr)
+{
+ BUILD_BUG();
+ return 0;
+}
+#endif
+
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (write && !pmd_write(orig))
+ if (!pmd_access_permitted(orig, write))
return 0;
+ if (pmd_devmap(orig))
+ return __gup_device_huge_pmd(orig, addr, end, pages, nr);
+
refs = 0;
head = pmd_page(orig);
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
struct page *head, *page;
int refs;
- if (write && !pud_write(orig))
+ if (!pud_access_permitted(orig, write))
return 0;
+ if (pud_devmap(orig))
+ return __gup_device_huge_pud(orig, addr, end, pages, nr);
+
refs = 0;
head = pud_page(orig);
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
int refs;
struct page *head, *page;
- if (write && !pgd_write(orig))
+ if (!pgd_access_permitted(orig, write))
return 0;
+ BUILD_BUG_ON(pgd_devmap(orig));
refs = 0;
head = pgd_page(orig);
page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT);
@@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}
+ SetPageReferenced(head);
return 1;
}
@@ -1481,7 +1575,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
end = start + len;
if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- start, len)))
+ (void __user *)start, len)))
return 0;
/*
@@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
+#ifndef gup_fast_permitted
+/*
+ * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * we need to fall back to the slow version:
+ */
+bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+{
+ unsigned long len, end;
+
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+ return end >= start;
+}
+#endif
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
@@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- int nr, ret;
+ int nr = 0, ret = 0;
start &= PAGE_MASK;
- nr = __get_user_pages_fast(start, nr_pages, write, pages);
- ret = nr;
+
+ if (gup_fast_permitted(start, nr_pages, write)) {
+ nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ ret = nr;
+ }
if (nr < nr_pages) {
/* Try to get the remaining pages with get_user_pages */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fef4cf210cc7..b787c4cfda0e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1564,18 +1564,16 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
ClearPageDirty(page);
unlock_page(page);
- if (PageActive(page))
- deactivate_page(page);
-
if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ pmdp_invalidate(vma, addr, pmd);
orig_pmd = pmd_mkold(orig_pmd);
orig_pmd = pmd_mkclean(orig_pmd);
set_pmd_at(mm, addr, pmd, orig_pmd);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
}
+
+ mark_page_lazyfree(page);
ret = true;
out:
spin_unlock(ptl);
@@ -1724,37 +1722,69 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
- int ret = 0;
+ pmd_t entry;
+ bool preserve_write;
+ int ret;
ptl = __pmd_trans_huge_lock(pmd, vma);
- if (ptl) {
- pmd_t entry;
- bool preserve_write = prot_numa && pmd_write(*pmd);
- ret = 1;
+ if (!ptl)
+ return 0;
- /*
- * Avoid trapping faults against the zero page. The read-only
- * data is likely to be read-cached on the local CPU and
- * local/remote hits to the zero page are not interesting.
- */
- if (prot_numa && is_huge_zero_pmd(*pmd)) {
- spin_unlock(ptl);
- return ret;
- }
+ preserve_write = prot_numa && pmd_write(*pmd);
+ ret = 1;
- if (!prot_numa || !pmd_protnone(*pmd)) {
- entry = pmdp_huge_get_and_clear_notify(mm, addr, pmd);
- entry = pmd_modify(entry, newprot);
- if (preserve_write)
- entry = pmd_mk_savedwrite(entry);
- ret = HPAGE_PMD_NR;
- set_pmd_at(mm, addr, pmd, entry);
- BUG_ON(vma_is_anonymous(vma) && !preserve_write &&
- pmd_write(entry));
- }
- spin_unlock(ptl);
- }
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd))
+ goto unlock;
+
+ if (prot_numa && pmd_protnone(*pmd))
+ goto unlock;
+
+ /*
+ * In case prot_numa, we are under down_read(mmap_sem). It's critical
+ * to not clear pmd intermittently to avoid race with MADV_DONTNEED
+ * which is also under down_read(mmap_sem):
+ *
+ * CPU0: CPU1:
+ * change_huge_pmd(prot_numa=1)
+ * pmdp_huge_get_and_clear_notify()
+ * madvise_dontneed()
+ * zap_pmd_range()
+ * pmd_trans_huge(*pmd) == 0 (without ptl)
+ * // skip the pmd
+ * set_pmd_at();
+ * // pmd is re-established
+ *
+ * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
+ * which may break userspace.
+ *
+ * pmdp_invalidate() is required to make sure we don't miss
+ * dirty/young flags set by hardware.
+ */
+ entry = *pmd;
+ pmdp_invalidate(vma, addr, pmd);
+
+ /*
+ * Recover dirty/young flags. It relies on pmdp_invalidate to not
+ * corrupt them.
+ */
+ if (pmd_dirty(*pmd))
+ entry = pmd_mkdirty(entry);
+ if (pmd_young(*pmd))
+ entry = pmd_mkyoung(entry);
+ entry = pmd_modify(entry, newprot);
+ if (preserve_write)
+ entry = pmd_mk_savedwrite(entry);
+ ret = HPAGE_PMD_NR;
+ set_pmd_at(mm, addr, pmd, entry);
+ BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
+unlock:
+ spin_unlock(ptl);
return ret;
}
@@ -2114,15 +2144,15 @@ static void freeze_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- int ret;
+ bool unmap_success;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_MIGRATION;
- ret = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(ret, page);
+ unmap_success = try_to_unmap(page, ttu_flags);
+ VM_BUG_ON_PAGE(!unmap_success, page);
}
static void unfreeze_page(struct page *page)
@@ -2368,7 +2398,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (PageAnon(head)) {
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 9d26fd9fefe4..356df057a2a8 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -34,8 +34,7 @@ static int hwpoison_inject(void *data, u64 val)
if (!hwpoison_filter_enable)
goto inject;
- if (!PageLRU(hpage) && !PageHuge(p))
- shake_page(hpage, 0);
+ shake_page(hpage, 0);
/*
* This implies unable to support non-LRU pages.
*/
diff --git a/mm/internal.h b/mm/internal.h
index 266efaeaa370..04d08ef91224 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -81,11 +81,16 @@ static inline void set_page_refcounted(struct page *page)
extern unsigned long highest_memmap_pfn;
/*
+ * Maximum number of reclaim retries without progress before the OOM
+ * killer is considered the only way forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
* in mm/vmscan.c:
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
-extern bool pgdat_reclaimable(struct pglist_data *pgdat);
/*
* in mm/rmap.c:
@@ -505,4 +510,14 @@ extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
+/* Report whether @migratetype is MIGRATE_HIGHATOMIC. */
+static inline bool is_migrate_highatomic(enum migratetype migratetype)
+{
+	return MIGRATE_HIGHATOMIC == migratetype;
+}
+
+/*
+ * Report whether the pageblock migratetype looked up for @page via
+ * get_pageblock_migratetype() is MIGRATE_HIGHATOMIC.
+ */
+static inline bool is_migrate_highatomic_page(struct page *page)
+{
+	return is_migrate_highatomic(get_pageblock_migratetype(page));
+}
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 98b27195e38b..9348d27088c1 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -577,7 +577,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object)
shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
- kasan_report_double_free(cache, object, shadow_byte);
+ kasan_report_double_free(cache, object,
+ __builtin_return_address(1));
return true;
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index dd2dea8eb077..1229298cce64 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_double_free(struct kmem_cache *cache, void *object,
- s8 shadow);
+ void *ip);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index ab42a0803f16..beee0e980e2d 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
return first_bad_addr;
}
-static void print_error_description(struct kasan_access_info *info)
+/*
+ * Report whether the accessed address in @info is at or above the memory
+ * address that KASAN_SHADOW_START translates to via kasan_shadow_to_mem().
+ */
+static bool addr_has_shadow(struct kasan_access_info *info)
+{
+	const void *shadow_floor =
+		kasan_shadow_to_mem((void *)KASAN_SHADOW_START);
+
+	return info->access_addr >= shadow_floor;
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
{
const char *bug_type = "unknown-crash";
u8 *shadow_addr;
@@ -98,12 +104,39 @@ static void print_error_description(struct kasan_access_info *info)
break;
}
- pr_err("BUG: KASAN: %s in %pS at addr %p\n",
- bug_type, (void *)info->ip,
- info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm, task_pid_nr(current));
+ return bug_type;
+}
+
+/*
+ * Name the bug type for an access purely from the numeric value of the
+ * accessed address:
+ *   below PAGE_SIZE  -> "null-ptr-deref"
+ *   below TASK_SIZE  -> "user-memory-access"
+ *   anything else    -> "wild-memory-access"
+ *
+ * Called only from get_bug_type() in this patch and not declared in any
+ * header the patch touches, so it is given internal linkage.
+ */
+static const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+	unsigned long addr = (unsigned long)info->access_addr;
+
+	if (addr < PAGE_SIZE)
+		return "null-ptr-deref";
+	if (addr < TASK_SIZE)
+		return "user-memory-access";
+
+	return "wild-memory-access";
+}
+
+/*
+ * Choose the bug-type string for @info: the shadow-based classification
+ * when addr_has_shadow() holds, the address-range one otherwise.
+ */
+static const char *get_bug_type(struct kasan_access_info *info)
+{
+	return addr_has_shadow(info) ? get_shadow_bug_type(info)
+				     : get_wild_bug_type(info);
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+ const char *bug_type = get_bug_type(info);
+
+ pr_err("BUG: KASAN: %s in %pS\n",
+ bug_type, (void *)info->ip);
+ pr_err("%s of size %zu at addr %p by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
}
static inline bool kernel_or_module_addr(const void *addr)
@@ -144,9 +177,9 @@ static void kasan_end_report(unsigned long *flags)
kasan_enable_current();
}
-static void print_track(struct kasan_track *track)
+static void print_track(struct kasan_track *track, const char *prefix)
{
- pr_err("PID = %u\n", track->pid);
+ pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
struct stack_trace trace;
@@ -157,59 +190,84 @@ static void print_track(struct kasan_track *track)
}
}
-static void kasan_object_err(struct kmem_cache *cache, void *object)
+static struct page *addr_to_page(const void *addr)
{
- struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ if ((addr >= (void *)PAGE_OFFSET) &&
+ (addr < high_memory))
+ return virt_to_head_page(addr);
+ return NULL;
+}
- dump_stack();
- pr_err("Object at %p, in cache %s size: %d\n", object, cache->name,
- cache->object_size);
+static void describe_object_addr(struct kmem_cache *cache, void *object,
+ const void *addr)
+{
+ unsigned long access_addr = (unsigned long)addr;
+ unsigned long object_addr = (unsigned long)object;
+ const char *rel_type;
+ int rel_bytes;
- if (!(cache->flags & SLAB_KASAN))
+ pr_err("The buggy address belongs to the object at %p\n"
+ " which belongs to the cache %s of size %d\n",
+ object, cache->name, cache->object_size);
+
+ if (!addr)
return;
- pr_err("Allocated:\n");
- print_track(&alloc_info->alloc_track);
- pr_err("Freed:\n");
- print_track(&alloc_info->free_track);
+ if (access_addr < object_addr) {
+ rel_type = "to the left";
+ rel_bytes = object_addr - access_addr;
+ } else if (access_addr >= object_addr + cache->object_size) {
+ rel_type = "to the right";
+ rel_bytes = access_addr - (object_addr + cache->object_size);
+ } else {
+ rel_type = "inside";
+ rel_bytes = access_addr - object_addr;
+ }
+
+ pr_err("The buggy address is located %d bytes %s of\n"
+ " %d-byte region [%p, %p)\n",
+ rel_bytes, rel_type, cache->object_size, (void *)object_addr,
+ (void *)(object_addr + cache->object_size));
}
-void kasan_report_double_free(struct kmem_cache *cache, void *object,
- s8 shadow)
+static void describe_object(struct kmem_cache *cache, void *object,
+ const void *addr)
{
- unsigned long flags;
+ struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
- kasan_start_report(&flags);
- pr_err("BUG: Double free or freeing an invalid pointer\n");
- pr_err("Unexpected shadow byte: 0x%hhX\n", shadow);
- kasan_object_err(cache, object);
- kasan_end_report(&flags);
+ if (cache->flags & SLAB_KASAN) {
+ print_track(&alloc_info->alloc_track, "Allocated");
+ pr_err("\n");
+ print_track(&alloc_info->free_track, "Freed");
+ pr_err("\n");
+ }
+
+ describe_object_addr(cache, object, addr);
}
-static void print_address_description(struct kasan_access_info *info)
+static void print_address_description(void *addr)
{
- const void *addr = info->access_addr;
+ struct page *page = addr_to_page(addr);
- if ((addr >= (void *)PAGE_OFFSET) &&
- (addr < high_memory)) {
- struct page *page = virt_to_head_page(addr);
-
- if (PageSlab(page)) {
- void *object;
- struct kmem_cache *cache = page->slab_cache;
- object = nearest_obj(cache, page,
- (void *)info->access_addr);
- kasan_object_err(cache, object);
- return;
- }
- dump_page(page, "kasan: bad access detected");
+ dump_stack();
+ pr_err("\n");
+
+ if (page && PageSlab(page)) {
+ struct kmem_cache *cache = page->slab_cache;
+ void *object = nearest_obj(cache, page, addr);
+
+ describe_object(cache, object, addr);
}
- if (kernel_or_module_addr(addr)) {
- if (!init_task_stack_addr(addr))
- pr_err("Address belongs to variable %pS\n", addr);
+ if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+ pr_err("The buggy address belongs to the variable:\n");
+ pr_err(" %pS\n", addr);
+ }
+
+ if (page) {
+ pr_err("The buggy address belongs to the page:\n");
+ dump_page(page, "kasan: bad access detected");
}
- dump_stack();
}
static bool row_is_guilty(const void *row, const void *guilty)
@@ -264,31 +322,34 @@ static void print_shadow_for_address(const void *addr)
}
}
+/*
+ * Print a KASAN report for a double free or a free of an invalid pointer.
+ * @object is described and its shadow is dumped; @ip is printed with %pS
+ * in the headline. @cache is not used by the body.
+ */
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+				void *ip)
+{
+	unsigned long report_flags;
+
+	kasan_start_report(&report_flags);
+
+	/* headline followed by an empty separator line */
+	pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip);
+	pr_err("\n");
+
+	/* address description, separator, then the shadow dump */
+	print_address_description(object);
+	pr_err("\n");
+	print_shadow_for_address(object);
+
+	kasan_end_report(&report_flags);
+}
+
static void kasan_report_error(struct kasan_access_info *info)
{
unsigned long flags;
- const char *bug_type;
kasan_start_report(&flags);
- if (info->access_addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
- pr_err("BUG: KASAN: %s on address %p\n",
- bug_type, info->access_addr);
- pr_err("%s of size %zu by task %s/%d\n",
- info->is_write ? "Write" : "Read",
- info->access_size, current->comm,
- task_pid_nr(current));
+ print_error_description(info);
+ pr_err("\n");
+
+ if (!addr_has_shadow(info)) {
dump_stack();
} else {
- print_error_description(info);
- print_address_description(info);
+ print_address_description((void *)info->access_addr);
+ pr_err("\n");
print_shadow_for_address(info->first_bad_addr);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ba40b7f673f4..7cb9c88bb4a3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm)
static void release_pte_page(struct page *page)
{
- /* 0 stands for page_is_file_cache(page) == false */
- dec_node_page_state(page, NR_ISOLATED_ANON + 0);
+ dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page));
unlock_page(page);
putback_lru_page(page);
}
@@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!PageAnon(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
/*
* We can do it before isolate_lru_page because the
@@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ if (page_count(page) != 1 + PageSwapCache(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
@@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- /* 0 stands for page_is_file_cache(page) == false */
- inc_node_page_state(page, NR_ISOLATED_ANON + 0);
+ inc_node_page_state(page,
+ NR_ISOLATED_ANON + page_is_file_cache(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -1183,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* The page must only be referenced by the scanned process
* and page swap cache.
*/
- if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ if (page_count(page) != 1 + PageSwapCache(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 19b4f2dea7a5..d9fc0e456128 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
return new_page;
}
-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
{
struct stable_node *stable_node;
struct rmap_item *rmap_item;
- int ret = SWAP_AGAIN;
int search_new_forks = 0;
VM_BUG_ON_PAGE(!PageKsm(page), page);
@@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
stable_node = page_stable_node(page);
if (!stable_node)
- return ret;
+ return;
again:
hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1978,23 +1977,20 @@ again:
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = rwc->rmap_one(page, vma,
- rmap_item->address, rwc->arg);
- if (ret != SWAP_AGAIN) {
+ if (!rwc->rmap_one(page, vma,
+ rmap_item->address, rwc->arg)) {
anon_vma_unlock_read(anon_vma);
- goto out;
+ return;
}
if (rwc->done && rwc->done(page)) {
anon_vma_unlock_read(anon_vma);
- goto out;
+ return;
}
}
anon_vma_unlock_read(anon_vma);
}
if (!search_new_forks++)
goto again;
-out:
- return ret;
}
#ifdef CONFIG_MIGRATION
diff --git a/mm/madvise.c b/mm/madvise.c
index 7a2abf0127ae..25b78ee4fc2c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
ptent = pte_mkold(ptent);
ptent = pte_mkclean(ptent);
set_pte_at(mm, addr, pte, ptent);
- if (PageActive(page))
- deactivate_page(page);
tlb_remove_tlb_entry(tlb, pte, addr);
}
+ mark_page_lazyfree(page);
}
out:
if (nr_swap) {
@@ -606,34 +605,40 @@ static long madvise_remove(struct vm_area_struct *vma,
/*
* Error injection support for memory error handling.
*/
-static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
+static int madvise_inject_error(int behavior,
+ unsigned long start, unsigned long end)
{
- struct page *p;
+ struct page *page;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
+
for (; start < end; start += PAGE_SIZE <<
- compound_order(compound_head(p))) {
+ compound_order(compound_head(page))) {
int ret;
- ret = get_user_pages_fast(start, 1, 0, &p);
+ ret = get_user_pages_fast(start, 1, 0, &page);
if (ret != 1)
return ret;
- if (PageHWPoison(p)) {
- put_page(p);
+ if (PageHWPoison(page)) {
+ put_page(page);
continue;
}
- if (bhv == MADV_SOFT_OFFLINE) {
- pr_info("Soft offlining page %#lx at %#lx\n",
- page_to_pfn(p), start);
- ret = soft_offline_page(p, MF_COUNT_INCREASED);
+
+ if (behavior == MADV_SOFT_OFFLINE) {
+ pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
+ page_to_pfn(page), start);
+
+ ret = soft_offline_page(page, MF_COUNT_INCREASED);
if (ret)
return ret;
continue;
}
- pr_info("Injecting memory failure for page %#lx at %#lx\n",
- page_to_pfn(p), start);
- ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+ pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+ page_to_pfn(page), start);
+
+ ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
if (ret)
return ret;
}
@@ -651,13 +656,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_FREE:
- /*
- * XXX: In this implementation, MADV_FREE works like
- * MADV_DONTNEED on swapless system or full swap.
- */
- if (get_nr_swap_pages() > 0)
- return madvise_free(vma, prev, start, end);
- /* passthrough */
+ return madvise_free(vma, prev, start, end);
case MADV_DONTNEED:
return madvise_dontneed(vma, prev, start, end);
default:
@@ -688,6 +687,10 @@ madvise_behavior_valid(int behavior)
#endif
case MADV_DONTDUMP:
case MADV_DODUMP:
+#ifdef CONFIG_MEMORY_FAILURE
+ case MADV_SOFT_OFFLINE:
+ case MADV_HWPOISON:
+#endif
return true;
default:
@@ -761,10 +764,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
size_t len;
struct blk_plug plug;
-#ifdef CONFIG_MEMORY_FAILURE
- if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
- return madvise_hwpoison(behavior, start, start+len_in);
-#endif
if (!madvise_behavior_valid(behavior))
return error;
@@ -784,6 +783,11 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
if (end == start)
return error;
+#ifdef CONFIG_MEMORY_FAILURE
+ if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+ return madvise_inject_error(behavior, start, start + len_in);
+#endif
+
write = madvise_need_mmap_write(behavior);
if (write) {
if (down_write_killable(&current->mm->mmap_sem))
diff --git a/mm/memblock.c b/mm/memblock.c
index 696f06d17c4e..b049c9b2dba8 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -805,6 +805,18 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
}
/**
+ * memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
+{
+	/*
+	 * The third argument is 0 here; presumably that selects "clear"
+	 * rather than "set" in memblock_setclr_flag() -- its definition is
+	 * not visible in this hunk, confirm against it.
+	 */
+	int rc = memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP);
+
+	return rc;
+}
+
+/**
* __next_reserved_mem_region - next function for for_each_reserved_region()
* @idx: pointer to u64 loop variable
* @out_start: ptr to phys_addr_t for start address of the region, can be %NULL
@@ -1531,11 +1543,37 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
(phys_addr_t)ULLONG_MAX);
}
+/*
+ * Restrict memblock to the window described by @base and @size.
+ *
+ * The window is first isolated in memblock.memory; every region whose index
+ * lies outside the isolated index range is then removed unless it is marked
+ * NOMAP. Finally the reserved list is cut below @base and from
+ * @base + @size upwards.
+ *
+ * Returns early when @size is zero or memblock_isolate_range() fails.
+ */
+void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
+{
+	struct memblock_type *mem = &memblock.memory;
+	int first_rgn, stop_rgn;
+	int idx;
+
+	if (!size)
+		return;
+
+	if (memblock_isolate_range(mem, base, size, &first_rgn, &stop_rgn))
+		return;
+
+	/* remove all the MAP regions */
+	/* indices are walked downwards in both loops, highest first */
+	for (idx = mem->cnt - 1; idx >= stop_rgn; idx--) {
+		if (memblock_is_nomap(&mem->regions[idx]))
+			continue;
+		memblock_remove_region(mem, idx);
+	}
+
+	for (idx = first_rgn - 1; idx >= 0; idx--) {
+		if (memblock_is_nomap(&mem->regions[idx]))
+			continue;
+		memblock_remove_region(mem, idx);
+	}
+
+	/* truncate the reserved regions */
+	memblock_remove_range(&memblock.reserved, 0, base);
+	memblock_remove_range(&memblock.reserved,
+			base + size, (phys_addr_t)ULLONG_MAX);
+}
+
void __init memblock_mem_limit_remove_map(phys_addr_t limit)
{
- struct memblock_type *type = &memblock.memory;
phys_addr_t max_addr;
- int i, ret, start_rgn, end_rgn;
if (!limit)
return;
@@ -1546,19 +1584,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
if (max_addr == (phys_addr_t)ULLONG_MAX)
return;
- ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX,
- &start_rgn, &end_rgn);
- if (ret)
- return;
-
- /* remove all the MAP regions above the limit */
- for (i = end_rgn - 1; i >= start_rgn; i--) {
- if (!memblock_is_nomap(&type->regions[i]))
- memblock_remove_region(type, i);
- }
- /* truncate the reserved regions */
- memblock_remove_range(&memblock.reserved, max_addr,
- (phys_addr_t)ULLONG_MAX);
+ memblock_cap_memory_range(0, max_addr);
}
static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd7541d7c11..ff73899af61a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -100,24 +100,7 @@ static bool do_memsw_account(void)
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
}
-static const char * const mem_cgroup_stat_names[] = {
- "cache",
- "rss",
- "rss_huge",
- "mapped_file",
- "dirty",
- "writeback",
- "swap",
-};
-
-static const char * const mem_cgroup_events_names[] = {
- "pgpgin",
- "pgpgout",
- "pgfault",
- "pgmajfault",
-};
-
-static const char * const mem_cgroup_lru_names[] = {
+static const char *const mem_cgroup_lru_names[] = {
"inactive_anon",
"active_anon",
"inactive_file",
@@ -568,32 +551,15 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
* common workload, threshold and synchronization as vmstat[] should be
* implemented.
*/
-static unsigned long
-mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
-{
- long val = 0;
- int cpu;
-
- /* Per-cpu values can be negative, use a signed accumulator */
- for_each_possible_cpu(cpu)
- val += per_cpu(memcg->stat->count[idx], cpu);
- /*
- * Summing races with updates, so val may be negative. Avoid exposing
- * transient negative values.
- */
- if (val < 0)
- val = 0;
- return val;
-}
-static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
- enum mem_cgroup_events_index idx)
+static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
+ enum memcg_event_item event)
{
unsigned long val = 0;
int cpu;
for_each_possible_cpu(cpu)
- val += per_cpu(memcg->stat->events[idx], cpu);
+ val += per_cpu(memcg->stat->events[event], cpu);
return val;
}
@@ -606,23 +572,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
* counted as CACHE even if it's on ANON LRU.
*/
if (PageAnon(page))
- __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
- nr_pages);
- else
- __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
- nr_pages);
+ __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
+ else {
+ __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
+ if (PageSwapBacked(page))
+ __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
+ }
if (compound) {
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
- nr_pages);
+ __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
}
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
- __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+ __this_cpu_inc(memcg->stat->events[PGPGIN]);
else {
- __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+ __this_cpu_inc(memcg->stat->events[PGPGOUT]);
nr_pages = -nr_pages; /* for event */
}
@@ -1144,6 +1110,28 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
return false;
}
+/*
+ * Counters printed by mem_cgroup_print_oom_info() and memcg_stat_show().
+ * Entries pair up by index with memcg1_stat_names[], so both tables must
+ * be kept in the same order and of the same length.
+ *
+ * The table is only ever read, and this patch references it only from
+ * this file, so it is static const rather than a writable global.
+ */
+static const unsigned int memcg1_stats[] = {
+	MEMCG_CACHE,
+	MEMCG_RSS,
+	MEMCG_RSS_HUGE,
+	NR_SHMEM,
+	NR_FILE_MAPPED,
+	NR_FILE_DIRTY,
+	NR_WRITEBACK,
+	MEMCG_SWAP,
+};
+
+/* Printed name of each memcg1_stats[] entry, at the same index. */
+static const char *const memcg1_stat_names[] = {
+	"cache",
+	"rss",
+	"rss_huge",
+	"shmem",
+	"mapped_file",
+	"dirty",
+	"writeback",
+	"swap",
+};
+
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
* mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
@@ -1188,11 +1176,11 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_cont_cgroup_path(iter->css.cgroup);
pr_cont(":");
- for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
continue;
- pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
- K(mem_cgroup_read_stat(iter, i)));
+ pr_cont(" %s:%luKB", memcg1_stat_names[i],
+ K(memcg_page_state(iter, memcg1_stats[i])));
}
for (i = 0; i < NR_LRU_LISTS; i++)
@@ -1837,7 +1825,7 @@ static void reclaim_high(struct mem_cgroup *memcg,
do {
if (page_counter_read(&memcg->memory) <= memcg->high)
continue;
- mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+ mem_cgroup_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
} while ((memcg = parent_mem_cgroup(memcg)));
}
@@ -1928,7 +1916,7 @@ retry:
if (!gfpflags_allow_blocking(gfp_mask))
goto nomem;
- mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
+ mem_cgroup_event(mem_over_limit, MEMCG_MAX);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
@@ -1971,7 +1959,7 @@ retry:
if (fatal_signal_pending(current))
goto force;
- mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
+ mem_cgroup_event(mem_over_limit, MEMCG_OOM);
mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
@@ -2381,7 +2369,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++)
head[i].mem_cgroup = head->mem_cgroup;
- __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
+ __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE],
HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -2391,7 +2379,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
bool charge)
{
int val = (charge) ? 1 : -1;
- this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+ this_cpu_add(memcg->stat->count[MEMCG_SWAP], val);
}
/**
@@ -2725,7 +2713,7 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
for_each_mem_cgroup_tree(iter, memcg) {
for (i = 0; i < MEMCG_NR_STAT; i++)
- stat[i] += mem_cgroup_read_stat(iter, i);
+ stat[i] += memcg_page_state(iter, i);
}
}
@@ -2738,7 +2726,7 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
for_each_mem_cgroup_tree(iter, memcg) {
for (i = 0; i < MEMCG_NR_EVENTS; i++)
- events[i] += mem_cgroup_read_events(iter, i);
+ events[i] += memcg_sum_events(iter, i);
}
}
@@ -2750,13 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg) {
- val += mem_cgroup_read_stat(iter,
- MEM_CGROUP_STAT_CACHE);
- val += mem_cgroup_read_stat(iter,
- MEM_CGROUP_STAT_RSS);
+ val += memcg_page_state(iter, MEMCG_CACHE);
+ val += memcg_page_state(iter, MEMCG_RSS);
if (swap)
- val += mem_cgroup_read_stat(iter,
- MEM_CGROUP_STAT_SWAP);
+ val += memcg_page_state(iter, MEMCG_SWAP);
}
} else {
if (!swap)
@@ -3131,6 +3116,21 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
}
#endif /* CONFIG_NUMA */
+/*
+ * Universal VM events cgroup1 shows, original sort order.
+ *
+ * Entries pair up by index with memcg1_event_names[], so both tables must
+ * be kept in the same order and of the same length. The table is only
+ * ever read, and this patch references it only from this file, so it is
+ * static const rather than a writable global.
+ */
+static const unsigned int memcg1_events[] = {
+	PGPGIN,
+	PGPGOUT,
+	PGFAULT,
+	PGMAJFAULT,
+};
+
+/* Printed name of each memcg1_events[] entry, at the same index. */
+static const char *const memcg1_event_names[] = {
+	"pgpgin",
+	"pgpgout",
+	"pgfault",
+	"pgmajfault",
+};
+
static int memcg_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3138,22 +3138,20 @@ static int memcg_stat_show(struct seq_file *m, void *v)
struct mem_cgroup *mi;
unsigned int i;
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
- MEM_CGROUP_STAT_NSTATS);
- BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
- MEM_CGROUP_EVENTS_NSTATS);
+ BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
- for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
- if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
- mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+ memcg_page_state(memcg, memcg1_stats[i]) *
+ PAGE_SIZE);
}
- for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
- seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
- mem_cgroup_read_events(memcg, i));
+ for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+ seq_printf(m, "%s %lu\n", memcg1_event_names[i],
+ memcg_sum_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
@@ -3171,23 +3169,23 @@ static int memcg_stat_show(struct seq_file *m, void *v)
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
- for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long long val = 0;
- if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+ if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
for_each_mem_cgroup_tree(mi, memcg)
- val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
- seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
+ val += memcg_page_state(mi, memcg1_stats[i]) *
+ PAGE_SIZE;
+ seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
}
- for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+ for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
unsigned long long val = 0;
for_each_mem_cgroup_tree(mi, memcg)
- val += mem_cgroup_read_events(mi, i);
- seq_printf(m, "total_%s %llu\n",
- mem_cgroup_events_names[i], val);
+ val += memcg_sum_events(mi, memcg1_events[i]);
+ seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
}
for (i = 0; i < NR_LRU_LISTS; i++) {
@@ -3652,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+ *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
/* this should eventually include NR_UNSTABLE_NFS */
- *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+ *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
(1 << LRU_ACTIVE_FILE));
*pheadroom = PAGE_COUNTER_MAX;
@@ -4511,33 +4509,29 @@ static int mem_cgroup_move_account(struct page *page,
spin_lock_irqsave(&from->move_lock, flags);
if (!anon && page_mapped(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
- nr_pages);
+ __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages);
+ __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages);
}
/*
* move_lock grabbed above and caller set from->moving_account, so
- * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+ * mod_memcg_page_state will serialize updates to PageDirty.
* So mapping should be stable for dirty pages.
*/
if (!anon && PageDirty(page)) {
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+ __this_cpu_sub(from->stat->count[NR_FILE_DIRTY],
nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+ __this_cpu_add(to->stat->count[NR_FILE_DIRTY],
nr_pages);
}
}
if (PageWriteback(page)) {
- __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
- __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
- nr_pages);
+ __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages);
+ __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages);
}
/*
@@ -5154,7 +5148,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
continue;
}
- mem_cgroup_events(memcg, MEMCG_OOM, 1);
+ mem_cgroup_event(memcg, MEMCG_OOM);
if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
break;
}
@@ -5167,10 +5161,10 @@ static int memory_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
- seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
- seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
- seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
+ seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW));
+ seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
+ seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
+ seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
return 0;
}
@@ -5197,9 +5191,9 @@ static int memory_stat_show(struct seq_file *m, void *v)
tree_events(memcg, events);
seq_printf(m, "anon %llu\n",
- (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
+ (u64)stat[MEMCG_RSS] * PAGE_SIZE);
seq_printf(m, "file %llu\n",
- (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+ (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
seq_printf(m, "kernel_stack %llu\n",
(u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
seq_printf(m, "slab %llu\n",
@@ -5208,12 +5202,14 @@ static int memory_stat_show(struct seq_file *m, void *v)
seq_printf(m, "sock %llu\n",
(u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+ seq_printf(m, "shmem %llu\n",
+ (u64)stat[NR_SHMEM] * PAGE_SIZE);
seq_printf(m, "file_mapped %llu\n",
- (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
+ (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
seq_printf(m, "file_dirty %llu\n",
- (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
+ (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
seq_printf(m, "file_writeback %llu\n",
- (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
+ (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
for (i = 0; i < NR_LRU_LISTS; i++) {
struct mem_cgroup *mi;
@@ -5232,10 +5228,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
/* Accumulated memory events */
- seq_printf(m, "pgfault %lu\n",
- events[MEM_CGROUP_EVENTS_PGFAULT]);
- seq_printf(m, "pgmajfault %lu\n",
- events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+ seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
+ seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+
+ seq_printf(m, "workingset_refault %lu\n",
+ stat[WORKINGSET_REFAULT]);
+ seq_printf(m, "workingset_activate %lu\n",
+ stat[WORKINGSET_ACTIVATE]);
+ seq_printf(m, "workingset_nodereclaim %lu\n",
+ stat[WORKINGSET_NODERECLAIM]);
return 0;
}
@@ -5476,8 +5477,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
unsigned long nr_anon, unsigned long nr_file,
- unsigned long nr_huge, unsigned long nr_kmem,
- struct page *dummy_page)
+ unsigned long nr_kmem, unsigned long nr_huge,
+ unsigned long nr_shmem, struct page *dummy_page)
{
unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
unsigned long flags;
@@ -5492,10 +5493,11 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
}
local_irq_save(flags);
- __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
- __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
- __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
- __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+ __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
+ __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
+ __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
+ __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
+ __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
memcg_check_events(memcg, dummy_page);
local_irq_restore(flags);
@@ -5507,6 +5509,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
static void uncharge_list(struct list_head *page_list)
{
struct mem_cgroup *memcg = NULL;
+ unsigned long nr_shmem = 0;
unsigned long nr_anon = 0;
unsigned long nr_file = 0;
unsigned long nr_huge = 0;
@@ -5539,9 +5542,9 @@ static void uncharge_list(struct list_head *page_list)
if (memcg != page->mem_cgroup) {
if (memcg) {
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, nr_kmem, page);
- pgpgout = nr_anon = nr_file =
- nr_huge = nr_kmem = 0;
+ nr_kmem, nr_huge, nr_shmem, page);
+ pgpgout = nr_anon = nr_file = nr_kmem = 0;
+ nr_huge = nr_shmem = 0;
}
memcg = page->mem_cgroup;
}
@@ -5555,8 +5558,11 @@ static void uncharge_list(struct list_head *page_list)
}
if (PageAnon(page))
nr_anon += nr_pages;
- else
+ else {
nr_file += nr_pages;
+ if (PageSwapBacked(page))
+ nr_shmem += nr_pages;
+ }
pgpgout++;
} else {
nr_kmem += 1 << compound_order(page);
@@ -5568,7 +5574,7 @@ static void uncharge_list(struct list_head *page_list)
if (memcg)
uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
- nr_huge, nr_kmem, page);
+ nr_kmem, nr_huge, nr_shmem, page);
}
/**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 27f7210e7fab..73066b80d14a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -220,6 +220,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
*/
void shake_page(struct page *p, int access)
{
+ if (PageHuge(p))
+ return;
+
if (!PageSlab(p)) {
lru_add_drain_all();
if (PageLRU(p))
@@ -322,7 +325,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* wrong earlier.
*/
static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
- int fail, struct page *page, unsigned long pfn,
+ bool fail, struct page *page, unsigned long pfn,
int flags)
{
struct to_kill *tk, *next;
@@ -904,35 +907,36 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page);
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
*/
-static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int trapno, int flags, struct page **hpagep)
{
- enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
- int ret;
+ bool unmap_success;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
+ bool mlocked = PageMlocked(hpage);
/*
* Here we are interested only in user-mapped pages, so skip any
* other types of pages.
*/
if (PageReserved(p) || PageSlab(p))
- return SWAP_SUCCESS;
+ return true;
if (!(PageLRU(hpage) || PageHuge(p)))
- return SWAP_SUCCESS;
+ return true;
/*
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
if (!page_mapped(hpage))
- return SWAP_SUCCESS;
+ return true;
if (PageKsm(p)) {
pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
- return SWAP_FAIL;
+ return false;
}
if (PageSwapCache(p)) {
@@ -971,12 +975,19 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- ret = try_to_unmap(hpage, ttu);
- if (ret != SWAP_SUCCESS)
+ unmap_success = try_to_unmap(hpage, ttu);
+ if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
/*
+ * try_to_unmap() might put mlocked page in lru cache, so call
+ * shake_page() again to ensure that it's flushed.
+ */
+ if (mlocked)
+ shake_page(hpage, 0);
+
+ /*
* Now that the dirty bit has been propagated to the
* struct page and all unmaps done we can decide if
* killing is needed or not. Only kill when the page
@@ -987,10 +998,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* any accesses to the poisoned memory.
*/
forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
- kill_procs(&tokill, forcekill, trapno,
- ret != SWAP_SUCCESS, p, pfn, flags);
+ kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);
- return ret;
+ return unmap_success;
}
static void set_page_hwpoison_huge_page(struct page *hpage)
@@ -1138,22 +1148,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageHuge(p)) {
- if (!PageLRU(p))
- shake_page(p, 0);
- if (!PageLRU(p)) {
- /*
- * shake_page could have turned it free.
- */
- if (is_free_buddy_page(p)) {
- if (flags & MF_COUNT_INCREASED)
- action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
- else
- action_result(pfn, MF_MSG_BUDDY_2ND,
- MF_DELAYED);
- return 0;
- }
- }
+ shake_page(p, 0);
+ /* shake_page could have turned it free. */
+ if (!PageLRU(p) && is_free_buddy_page(p)) {
+ if (flags & MF_COUNT_INCREASED)
+ action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
+ else
+ action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
+ return 0;
}
lock_page(hpage);
@@ -1230,8 +1232,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
* When the raw error page is thp tail page, hpage points to the raw
* page after thp split.
*/
- if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
- != SWAP_SUCCESS) {
+ if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
res = -EBUSY;
goto out;
@@ -1543,8 +1544,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
if (ret == 1 && !PageLRU(page)) {
/* Drop page reference which is from __get_any_page() */
put_hwpoison_page(page);
- pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
- pfn, page->flags);
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
+ pfn, page->flags, &page->flags);
return -EIO;
}
}
@@ -1585,8 +1586,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
- pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
- pfn, ret, page->flags);
+ pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+ pfn, ret, page->flags, &page->flags);
/*
* We know that soft_offline_huge_page() tries to migrate
* only one hugepage pointed to by hpage, so we need not
@@ -1677,14 +1678,14 @@ static int __soft_offline_page(struct page *page, int flags)
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
- pfn, ret, page->flags);
+ pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+ pfn, ret, page->flags, &page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
- pfn, ret, page_count(page), page->flags);
+ pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
+ pfn, ret, page_count(page), page->flags, &page->flags);
}
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 235ba51b2fbf..6ff5d729ded0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4298,7 +4298,7 @@ void __might_fault(const char *file, int line)
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
- if (segment_eq(get_fs(), KERNEL_DS))
+ if (uaccess_kernel())
return;
if (pagefault_disabled())
return;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6fa7208bcd56..b63d7d1239df 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1208,7 +1208,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
arch_refresh_nodedata(nid, pgdat);
} else {
- /* Reset the nr_zones, order and classzone_idx before reuse */
+ /*
+ * Reset the nr_zones, order and classzone_idx before reuse.
+ * Note that kswapd will init kswapd_classzone_idx properly
+ * when it starts in the near future.
+ */
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index ed97c2c14fa8..89a0a1707f4c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -184,9 +184,9 @@ void putback_movable_pages(struct list_head *l)
unlock_page(page);
put_page(page);
} else {
- putback_lru_page(page);
dec_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
+ putback_lru_page(page);
}
}
}
@@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l)
/*
* Restore a potential migration pte to a working pte entry
*/
-static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
+static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
unsigned long addr, void *old)
{
struct page_vma_mapped_walk pvmw = {
@@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
update_mmu_cache(vma, pvmw.address, pvmw.pte);
}
- return SWAP_AGAIN;
+ return true;
}
/*
@@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
{
int z;
- if (!pgdat_reclaimable(pgdat))
- return false;
-
for (z = pgdat->nr_zones - 1; z >= 0; z--) {
struct zone *zone = pgdat->node_zones + z;
@@ -1947,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
/* Prepare a page as a migration target */
__SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
+ if (PageSwapBacked(page))
+ __SetPageSwapBacked(new_page);
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
diff --git a/mm/mlock.c b/mm/mlock.c
index 0dd9ca18e19e..c483c5c20b4b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
*/
static void __munlock_isolated_page(struct page *page)
{
- int ret = SWAP_AGAIN;
-
/*
* Optimization: if the page was mapped just once, that's our mapping
* and we don't need to check all the other vmas.
*/
if (page_mapcount(page) > 1)
- ret = try_to_munlock(page);
+ try_to_munlock(page);
/* Did try_to_unlock() succeed or punt? */
- if (ret != SWAP_MLOCK)
+ if (!PageMlocked(page))
count_vm_event(UNEVICTABLE_PGMUNLOCKED);
putback_lru_page(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index bfbe8856d134..f82741e199c0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
struct user_struct *user = NULL;
struct hstate *hs;
- hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
+ hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (!hs)
return -EINVAL;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d083714a2bb9..04c9143a8625 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -685,6 +685,7 @@ void exit_oom_victim(void)
void oom_killer_enable(void)
{
oom_killer_disabled = false;
+ pr_info("OOM killer enabled.\n");
}
/**
@@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout)
oom_killer_enable();
return false;
}
+ pr_info("OOM killer disabled.\n");
return true;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 03a70d8a6030..143c1c25d680 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
spin_lock_init(&dom->lock);
- init_timer_deferrable(&dom->period_timer);
- dom->period_timer.function = writeout_period;
- dom->period_timer.data = (unsigned long)dom;
+ setup_deferrable_timer(&dom->period_timer, writeout_period,
+ (unsigned long)dom);
dom->dirty_limit_tstamp = jiffies;
@@ -2434,7 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
inode_attach_wb(inode, page);
wb = inode_to_wb(inode);
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+ inc_memcg_page_state(page, NR_FILE_DIRTY);
__inc_node_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
__inc_node_page_state(page, NR_DIRTIED);
@@ -2456,7 +2455,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
struct bdi_writeback *wb)
{
if (mapping_cap_account_dirty(mapping)) {
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+ dec_memcg_page_state(page, NR_FILE_DIRTY);
dec_node_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2713,7 +2712,7 @@ int clear_page_dirty_for_io(struct page *page)
*/
wb = unlocked_inode_to_wb_begin(inode, &locked);
if (TestClearPageDirty(page)) {
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+ dec_memcg_page_state(page, NR_FILE_DIRTY);
dec_node_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2760,7 +2759,7 @@ int test_clear_page_writeback(struct page *page)
ret = TestClearPageWriteback(page);
}
if (ret) {
- mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ dec_memcg_page_state(page, NR_WRITEBACK);
dec_node_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
@@ -2815,7 +2814,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
ret = TestSetPageWriteback(page);
}
if (!ret) {
- mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+ inc_memcg_page_state(page, NR_WRITEBACK);
inc_node_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f3d603cef2c0..2c25de46c58f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -65,6 +65,7 @@
#include <linux/page_owner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
+#include <linux/ftrace.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -1090,14 +1091,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- unsigned long nr_scanned, flags;
bool isolated_pageblocks;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
- nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
- if (nr_scanned)
- __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
while (count) {
struct page *page;
@@ -1142,7 +1139,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
trace_mm_page_pcpu_drain(page, 0, mt);
} while (--count && --batch_free && !list_empty(list));
}
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
}
static void free_one_page(struct zone *zone,
@@ -1150,19 +1147,13 @@ static void free_one_page(struct zone *zone,
unsigned int order,
int migratetype)
{
- unsigned long nr_scanned, flags;
- spin_lock_irqsave(&zone->lock, flags);
- __count_vm_events(PGFREE, 1 << order);
- nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
- if (nr_scanned)
- __mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
+ spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype);
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1240,6 +1231,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
static void __free_pages_ok(struct page *page, unsigned int order)
{
+ unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
@@ -1247,7 +1239,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
return;
migratetype = get_pfnblock_migratetype(page, pfn);
+ local_irq_save(flags);
+ __count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype);
+ local_irq_restore(flags);
}
static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -1695,10 +1690,10 @@ static inline int check_new_page(struct page *page)
return 1;
}
-static inline bool free_pages_prezeroed(bool poisoned)
+static inline bool free_pages_prezeroed(void)
{
return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
- page_poisoning_enabled() && poisoned;
+ page_poisoning_enabled();
}
#ifdef CONFIG_DEBUG_VM
@@ -1752,17 +1747,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
unsigned int alloc_flags)
{
int i;
- bool poisoned = true;
-
- for (i = 0; i < (1 << order); i++) {
- struct page *p = page + i;
- if (poisoned)
- poisoned &= page_is_poisoned(p);
- }
post_alloc_hook(page, order, gfp_flags);
- if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
+ if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
for (i = 0; i < (1 << order); i++)
clear_highpage(page + i);
@@ -2042,8 +2030,8 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
/* Yoink! */
mt = get_pageblock_migratetype(page);
- if (mt != MIGRATE_HIGHATOMIC &&
- !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+ if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+ && !is_migrate_cma(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
@@ -2100,8 +2088,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
- if (get_pageblock_migratetype(page) ==
- MIGRATE_HIGHATOMIC) {
+ if (is_migrate_highatomic_page(page)) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
@@ -2158,8 +2145,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- if (can_steal &&
- get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC)
+ if (can_steal && !is_migrate_highatomic_page(page))
steal_suitable_fallback(zone, page, start_migratetype);
/* Remove the page from the freelists */
@@ -2219,9 +2205,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
int migratetype, bool cold)
{
int i, alloced = 0;
- unsigned long flags;
- spin_lock_irqsave(&zone->lock, flags);
+ spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype);
if (unlikely(page == NULL))
@@ -2257,7 +2242,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
- spin_unlock_irqrestore(&zone->lock, flags);
+ spin_unlock(&zone->lock);
return alloced;
}
@@ -2485,25 +2470,22 @@ void free_hot_cold_page(struct page *page, bool cold)
{
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
+ unsigned long flags;
unsigned long pfn = page_to_pfn(page);
int migratetype;
- if (in_interrupt()) {
- __free_pages_ok(page, 0);
- return;
- }
-
if (!free_pcp_prepare(page))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
set_pcppage_migratetype(page, migratetype);
- preempt_disable();
+ local_irq_save(flags);
+ __count_vm_event(PGFREE);
/*
* We only track unmovable, reclaimable and movable on pcp lists.
* Free ISOLATE pages back to the allocator because they are being
- * offlined but treat RESERVE as movable pages so we can get those
+ * offlined but treat HIGHATOMIC as movable pages so we can get those
* areas back if necessary. Otherwise, we may have to free
* excessively into the page allocator
*/
@@ -2515,7 +2497,6 @@ void free_hot_cold_page(struct page *page, bool cold)
migratetype = MIGRATE_MOVABLE;
}
- __count_vm_event(PGFREE);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
if (!cold)
list_add(&page->lru, &pcp->lists[migratetype]);
@@ -2529,7 +2510,7 @@ void free_hot_cold_page(struct page *page, bool cold)
}
out:
- preempt_enable();
+ local_irq_restore(flags);
}
/*
@@ -2614,7 +2595,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
for (; page < endpage; page += pageblock_nr_pages) {
int mt = get_pageblock_migratetype(page);
if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
- && mt != MIGRATE_HIGHATOMIC)
+ && !is_migrate_highatomic(mt))
set_pageblock_migratetype(page,
MIGRATE_MOVABLE);
}
@@ -2654,8 +2635,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
{
struct page *page;
- VM_BUG_ON(in_interrupt());
-
do {
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
@@ -2686,8 +2665,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct list_head *list;
bool cold = ((gfp_flags & __GFP_COLD) != 0);
struct page *page;
+ unsigned long flags;
- preempt_disable();
+ local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list);
@@ -2695,7 +2675,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
}
- preempt_enable();
+ local_irq_restore(flags);
return page;
}
@@ -2711,7 +2691,7 @@ struct page *rmqueue(struct zone *preferred_zone,
unsigned long flags;
struct page *page;
- if (likely(order == 0) && !in_interrupt()) {
+ if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype);
goto out;
@@ -3113,8 +3093,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
- if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) ||
- debug_guardpage_minorder() > 0)
+ if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
pr_warn("%s: ", current->comm);
@@ -3525,19 +3504,12 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
}
/*
- * Maximum number of reclaim retries without any progress before OOM killer
- * is consider as the only way to move forward.
- */
-#define MAX_RECLAIM_RETRIES 16
-
-/*
* Checks whether it makes sense to retry the reclaim to make a forward progress
* for the given allocation request.
- * The reclaim feedback represented by did_some_progress (any progress during
- * the last reclaim round) and no_progress_loops (number of reclaim rounds without
- * any progress in a row) is considered as well as the reclaimable pages on the
- * applicable zone list (with a backoff mechanism which is a function of
- * no_progress_loops).
+ *
+ * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
+ * without success, or when we couldn't even meet the watermark if we
+ * reclaimed all remaining pages on the LRU lists.
*
* Returns true if a retry is viable or false to enter the oom path.
*/
@@ -3582,13 +3554,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
bool wmark;
available = reclaimable = zone_reclaimable_pages(zone);
- available -= DIV_ROUND_UP((*no_progress_loops) * available,
- MAX_RECLAIM_RETRIES);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
/*
- * Would the allocation succeed if we reclaimed the whole
- * available?
+ * Would the allocation succeed if we reclaimed all
+ * reclaimable pages?
*/
wmark = __zone_watermark_ok(zone, order, min_wmark,
ac_classzone_idx(ac), alloc_flags, available);
@@ -3774,7 +3744,7 @@ retry:
/* Make sure we know about allocations which stall for too long */
if (time_after(jiffies, alloc_start + stall_timeout)) {
- warn_alloc(gfp_mask, ac->nodemask,
+ warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
"page allocation stalls for %ums, order:%u",
jiffies_to_msecs(jiffies-alloc_start), order);
stall_timeout += 10 * HZ;
@@ -3974,10 +3944,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
goto out;
/*
- * Runtime PM, block IO and its error handling path can deadlock
- * because I/O on the device might not complete.
+ * Apply scoped allocation constraints. This is mainly about GFP_NOFS
+ * resp. GFP_NOIO which has to be inherited for all allocation requests
+ * from a particular context which has been marked by
+ * memalloc_no{fs,io}_{save,restore}.
*/
- alloc_mask = memalloc_noio_flags(gfp_mask);
+ alloc_mask = current_gfp_context(gfp_mask);
ac.spread_dirty_pages = false;
/*
@@ -4250,7 +4222,8 @@ EXPORT_SYMBOL(free_pages_exact);
* nr_free_zone_pages() counts the number of counts pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
- * managed_pages - high_pages
+ *
+ * nr_free_zone_pages = managed_pages - high_pages
*/
static unsigned long nr_free_zone_pages(int offset)
{
@@ -4512,7 +4485,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
#endif
" writeback_tmp:%lukB"
" unstable:%lukB"
- " pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -4535,8 +4507,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
- node_page_state(pgdat, NR_PAGES_SCANNED),
- !pgdat_reclaimable(pgdat) ? "yes" : "no");
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
+ "yes" : "no");
}
for_each_populated_zone(zone) {
@@ -7431,7 +7403,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
.zone = page_zone(pfn_to_page(start)),
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
- .gfp_mask = memalloc_noio_flags(gfp_mask),
+ .gfp_mask = current_gfp_context(gfp_mask),
};
INIT_LIST_HEAD(&cc.migratepages);
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 121dcffc4ec1..88ccc044b09a 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -59,9 +59,6 @@
static struct page_ext_operations *page_ext_ops[] = {
&debug_guardpage_ops,
-#ifdef CONFIG_PAGE_POISONING
- &page_poisoning_ops,
-#endif
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
@@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page)
struct page_ext *base;
base = NODE_DATA(page_to_nid(page))->node_page_ext;
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
+#if defined(CONFIG_DEBUG_VM)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
- *
- * This check is also necessary for ensuring page poisoning
- * works as expected when enabled
*/
if (unlikely(!base))
return NULL;
@@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
struct mem_section *section = __pfn_to_section(pfn);
-#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING)
+#if defined(CONFIG_DEBUG_VM)
/*
* The sanity checks the page allocator does upon freeing a
* page can reach here before the page_ext arrays are
* allocated when feeding a range of pages to the allocator
* for the first time during bootup or memory hotplug.
- *
- * This check is also necessary for ensuring page poisoning
- * works as expected when enabled
*/
if (!section->page_ext)
return NULL;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index b0ee56c56b58..1b0f48c62316 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn)
return page;
}
-static int page_idle_clear_pte_refs_one(struct page *page,
+static bool page_idle_clear_pte_refs_one(struct page *page,
struct vm_area_struct *vma,
unsigned long addr, void *arg)
{
@@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page,
*/
set_page_young(page);
}
- return SWAP_AGAIN;
+ return true;
}
static void page_idle_clear_pte_refs(struct page *page)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index f4e17a57926a..7927bbb54a4e 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -88,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
zone = page_zone(page);
spin_lock_irqsave(&zone->lock, flags);
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (!is_migrate_isolate_page(page))
goto out;
/*
@@ -205,7 +205,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (!page || !is_migrate_isolate_page(page))
continue;
unset_migratetype_isolate(page, migratetype);
}
@@ -262,7 +262,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
*/
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
+ if (page && !is_migrate_isolate_page(page))
break;
}
page = __first_valid_page(start_pfn, end_pfn - start_pfn);
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 2e647c65916b..be19e989ccff 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,7 +6,6 @@
#include <linux/poison.h>
#include <linux/ratelimit.h>
-static bool __page_poisoning_enabled __read_mostly;
static bool want_page_poisoning __read_mostly;
static int early_page_poison_param(char *buf)
@@ -19,74 +18,21 @@ early_param("page_poison", early_page_poison_param);
bool page_poisoning_enabled(void)
{
- return __page_poisoning_enabled;
-}
-
-static bool need_page_poisoning(void)
-{
- return want_page_poisoning;
-}
-
-static void init_page_poisoning(void)
-{
/*
- * page poisoning is debug page alloc for some arches. If either
- * of those options are enabled, enable poisoning
+ * Assumes that debug_pagealloc_enabled is set before
+ * free_all_bootmem.
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
*/
- if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) {
- if (!want_page_poisoning && !debug_pagealloc_enabled())
- return;
- } else {
- if (!want_page_poisoning)
- return;
- }
-
- __page_poisoning_enabled = true;
-}
-
-struct page_ext_operations page_poisoning_ops = {
- .need = need_page_poisoning,
- .init = init_page_poisoning,
-};
-
-static inline void set_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return;
-
- __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-static inline void clear_page_poison(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return;
-
- __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
-}
-
-bool page_is_poisoned(struct page *page)
-{
- struct page_ext *page_ext;
-
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return false;
-
- return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags);
+ return (want_page_poisoning ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled()));
}
static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
- set_page_poison(page);
memset(addr, PAGE_POISON, PAGE_SIZE);
kunmap_atomic(addr);
}
@@ -140,12 +86,13 @@ static void unpoison_page(struct page *page)
{
void *addr;
- if (!page_is_poisoned(page))
- return;
-
addr = kmap_atomic(page);
+ /*
+ * Page poisoning when enabled poisons each and every page
+ * that is freed to buddy. Thus no extra check is done to
+ * see if a page was poisoned.
+ */
check_poison_mem(addr, PAGE_SIZE);
- clear_page_poison(page);
kunmap_atomic(addr);
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 60a6488e9e6d..e0aa8ae7bde7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1284,18 +1284,7 @@ void free_percpu(void __percpu *ptr)
}
EXPORT_SYMBOL_GPL(free_percpu);
-/**
- * is_kernel_percpu_address - test whether address is from static percpu area
- * @addr: address to test
- *
- * Test whether @addr belongs to in-kernel static percpu area. Module
- * static percpu areas are not considered. For those, use
- * is_module_percpu_address().
- *
- * RETURNS:
- * %true if @addr is from in-kernel static percpu area, %false otherwise.
- */
-bool is_kernel_percpu_address(unsigned long addr)
+bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
const size_t static_size = __per_cpu_end - __per_cpu_start;
@@ -1304,16 +1293,39 @@ bool is_kernel_percpu_address(unsigned long addr)
for_each_possible_cpu(cpu) {
void *start = per_cpu_ptr(base, cpu);
+ void *va = (void *)addr;
- if ((void *)addr >= start && (void *)addr < start + static_size)
+ if (va >= start && va < start + static_size) {
+ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(base, get_boot_cpu_id());
+ }
return true;
- }
+ }
+ }
#endif
/* on UP, can't distinguish from other static vars, always false */
return false;
}
/**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area. Module
+ * static percpu areas are not considered. For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+ return __is_kernel_percpu_address(addr, NULL);
+}
+
+/**
* per_cpu_ptr_to_phys - convert translated percpu address to physical address
* @addr: the address to be converted to physical address
*
diff --git a/mm/rmap.c b/mm/rmap.c
index f6838015810f..3ff241f714eb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -724,7 +724,7 @@ struct page_referenced_arg {
/*
* arg: page_referenced_arg will be passed
*/
-static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
+static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct page_referenced_arg *pra = arg;
@@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
if (vma->vm_flags & VM_LOCKED) {
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
- return SWAP_FAIL; /* To break the loop */
+ return false; /* To break the loop */
}
if (pvmw.pte) {
@@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
}
if (!pra->mapcount)
- return SWAP_SUCCESS; /* To break the loop */
+ return false; /* To break the loop */
- return SWAP_AGAIN;
+ return true;
}
static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
@@ -812,7 +812,6 @@ int page_referenced(struct page *page,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
- int ret;
int we_locked = 0;
struct page_referenced_arg pra = {
.mapcount = total_mapcount(page),
@@ -846,7 +845,7 @@ int page_referenced(struct page *page,
rwc.invalid_vma = invalid_page_referenced_vma;
}
- ret = rmap_walk(page, &rwc);
+ rmap_walk(page, &rwc);
*vm_flags = pra.vm_flags;
if (we_locked)
@@ -855,7 +854,7 @@ int page_referenced(struct page *page,
return pra.referenced;
}
-static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct page_vma_mapped_walk pvmw = {
@@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
}
}
- return SWAP_AGAIN;
+ return true;
}
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
@@ -1159,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound)
goto out;
}
__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr);
- mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr);
+ mod_memcg_page_state(page, NR_FILE_MAPPED, nr);
out:
unlock_page_memcg(page);
}
@@ -1199,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound)
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
__mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr);
- mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr);
+ mod_memcg_page_state(page, NR_FILE_MAPPED, -nr);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound)
*/
}
-struct rmap_private {
- enum ttu_flags flags;
- int lazyfreed;
-};
-
/*
* @arg: enum ttu_flags will be passed to this argument
*/
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct mm_struct *mm = vma->vm_mm;
@@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
};
pte_t pteval;
struct page *subpage;
- int ret = SWAP_AGAIN;
- struct rmap_private *rp = arg;
- enum ttu_flags flags = rp->flags;
+ bool ret = true;
+ enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
- return SWAP_AGAIN;
+ return true;
if (flags & TTU_SPLIT_HUGE_PMD) {
split_huge_pmd_address(vma, address,
@@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
mlock_vma_page(page);
}
- ret = SWAP_MLOCK;
+ ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (!(flags & TTU_IGNORE_ACCESS)) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
- ret = SWAP_FAIL;
+ ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Store the swap location in the pte.
* See handle_pte_fault() ...
*/
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+ if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
+ WARN_ON_ONCE(1);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+
+ /* MADV_FREE page check */
+ if (!PageSwapBacked(page)) {
+ if (!PageDirty(page)) {
+ dec_mm_counter(mm, MM_ANONPAGES);
+ goto discard;
+ }
- if (!PageDirty(page) && (flags & TTU_LZFREE)) {
- /* It's a freeable page by MADV_FREE */
- dec_mm_counter(mm, MM_ANONPAGES);
- rp->lazyfreed++;
- goto discard;
+ /*
+ * If the page was redirtied, it cannot be
+ * discarded. Remap the page to page table.
+ */
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ SetPageSwapBacked(page);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
if (swap_duplicate(entry) < 0) {
set_pte_at(mm, address, pvmw.pte, pteval);
- ret = SWAP_FAIL;
+ ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
@@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page)
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock.
- * Return values are:
*
- * SWAP_SUCCESS - we succeeded in removing all mappings
- * SWAP_AGAIN - we missed a mapping, try again later
- * SWAP_FAIL - the page is unswappable
- * SWAP_MLOCK - page is mlocked.
+ * If unmap is successful, return true. Otherwise, false.
*/
-int try_to_unmap(struct page *page, enum ttu_flags flags)
+bool try_to_unmap(struct page *page, enum ttu_flags flags)
{
- int ret;
- struct rmap_private rp = {
- .flags = flags,
- .lazyfreed = 0,
- };
-
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = &rp,
+ .arg = (void *)flags,
.done = page_mapcount_is_zero,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
rwc.invalid_vma = invalid_migration_vma;
if (flags & TTU_RMAP_LOCKED)
- ret = rmap_walk_locked(page, &rwc);
+ rmap_walk_locked(page, &rwc);
else
- ret = rmap_walk(page, &rwc);
+ rmap_walk(page, &rwc);
- if (ret != SWAP_MLOCK && !page_mapcount(page)) {
- ret = SWAP_SUCCESS;
- if (rp.lazyfreed && !PageDirty(page))
- ret = SWAP_LZFREE;
- }
- return ret;
+ return !page_mapcount(page) ? true : false;
}
static int page_not_mapped(struct page *page)
@@ -1550,34 +1544,22 @@ static int page_not_mapped(struct page *page)
* Called from munlock code. Checks all of the VMAs mapping the page
* to make sure nobody else has this page mlocked. The page will be
* returned with PG_mlocked cleared if no other vmas have it mlocked.
- *
- * Return values are:
- *
- * SWAP_AGAIN - no vma is holding page mlocked, or,
- * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
- * SWAP_FAIL - page cannot be located at present
- * SWAP_MLOCK - page is now mlocked.
*/
-int try_to_munlock(struct page *page)
-{
- int ret;
- struct rmap_private rp = {
- .flags = TTU_MUNLOCK,
- .lazyfreed = 0,
- };
+void try_to_munlock(struct page *page)
+{
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
- .arg = &rp,
+ .arg = (void *)TTU_MUNLOCK,
.done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
+ VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
- ret = rmap_walk(page, &rwc);
- return ret;
+ rmap_walk(page, &rwc);
}
void __put_anon_vma(struct anon_vma *anon_vma)
@@ -1625,13 +1607,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
+static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff_start, pgoff_end;
struct anon_vma_chain *avc;
- int ret = SWAP_AGAIN;
if (locked) {
anon_vma = page_anon_vma(page);
@@ -1641,7 +1622,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
anon_vma = rmap_walk_anon_lock(page, rwc);
}
if (!anon_vma)
- return ret;
+ return;
pgoff_start = page_to_pgoff(page);
pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
@@ -1655,8 +1636,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = rwc->rmap_one(page, vma, address, rwc->arg);
- if (ret != SWAP_AGAIN)
+ if (!rwc->rmap_one(page, vma, address, rwc->arg))
break;
if (rwc->done && rwc->done(page))
break;
@@ -1664,7 +1644,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
if (!locked)
anon_vma_unlock_read(anon_vma);
- return ret;
}
/*
@@ -1680,13 +1659,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
*/
-static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
+static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
bool locked)
{
struct address_space *mapping = page_mapping(page);
pgoff_t pgoff_start, pgoff_end;
struct vm_area_struct *vma;
- int ret = SWAP_AGAIN;
/*
* The page lock not only makes sure that page->mapping cannot
@@ -1697,7 +1675,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!mapping)
- return ret;
+ return;
pgoff_start = page_to_pgoff(page);
pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
@@ -1712,8 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
continue;
- ret = rwc->rmap_one(page, vma, address, rwc->arg);
- if (ret != SWAP_AGAIN)
+ if (!rwc->rmap_one(page, vma, address, rwc->arg))
goto done;
if (rwc->done && rwc->done(page))
goto done;
@@ -1722,28 +1699,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
done:
if (!locked)
i_mmap_unlock_read(mapping);
- return ret;
}
-int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
{
if (unlikely(PageKsm(page)))
- return rmap_walk_ksm(page, rwc);
+ rmap_walk_ksm(page, rwc);
else if (PageAnon(page))
- return rmap_walk_anon(page, rwc, false);
+ rmap_walk_anon(page, rwc, false);
else
- return rmap_walk_file(page, rwc, false);
+ rmap_walk_file(page, rwc, false);
}
/* Like rmap_walk, but caller holds relevant rmap lock */
-int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
{
/* no ksm support for now */
VM_BUG_ON_PAGE(PageKsm(page), page);
if (PageAnon(page))
- return rmap_walk_anon(page, rwc, true);
+ rmap_walk_anon(page, rwc, true);
else
- return rmap_walk_file(page, rwc, true);
+ rmap_walk_file(page, rwc, true);
}
#ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 0fd21670b513..6bb4deb12e78 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -9,11 +9,12 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
+#define pr_fmt(fmt) "rodata_test: " fmt
+
#include <linux/uaccess.h>
#include <asm/sections.h>
const int rodata_test_data = 0xC3;
-EXPORT_SYMBOL_GPL(rodata_test_data);
void rodata_test(void)
{
@@ -23,20 +24,20 @@ void rodata_test(void)
/* test 1: read the value */
/* If this test fails, some previous testrun has clobbered the state */
if (!rodata_test_data) {
- pr_err("rodata_test: test 1 fails (start data)\n");
+ pr_err("test 1 fails (start data)\n");
return;
}
/* test 2: write to the variable; this should fault */
if (!probe_kernel_write((void *)&rodata_test_data,
- (void *)&zero, sizeof(zero))) {
- pr_err("rodata_test: test data was not read only\n");
+ (void *)&zero, sizeof(zero))) {
+ pr_err("test data was not read only\n");
return;
}
/* test 3: check the value hasn't changed */
if (rodata_test_data == zero) {
- pr_err("rodata_test: test data was changed\n");
+ pr_err("test data was changed\n");
return;
}
@@ -44,13 +45,13 @@ void rodata_test(void)
start = (unsigned long)__start_rodata;
end = (unsigned long)__end_rodata;
if (start & (PAGE_SIZE - 1)) {
- pr_err("rodata_test: start of .rodata is not page size aligned\n");
+ pr_err("start of .rodata is not page size aligned\n");
return;
}
if (end & (PAGE_SIZE - 1)) {
- pr_err("rodata_test: end of .rodata is not page size aligned\n");
+ pr_err("end of .rodata is not page size aligned\n");
return;
}
- pr_info("rodata_test: all tests were successful\n");
+ pr_info("all tests were successful\n");
}
diff --git a/mm/slab.c b/mm/slab.c
index 807d86c76908..1880d482a0cb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3879,7 +3879,12 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
prev = cachep->cpu_cache;
cachep->cpu_cache = cpu_cache;
- kick_all_cpus_sync();
+ /*
+ * Without a previous cpu_cache there's no need to synchronize remote
+ * cpus, so skip the IPIs.
+ */
+ if (prev)
+ kick_all_cpus_sync();
check_irq_on();
cachep->batchcount = batchcount;
diff --git a/mm/sparse.c b/mm/sparse.c
index db6bf3c97ea2..6903c8fc3085 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long usemap_size(void)
{
- unsigned long size_bytes;
- size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
- size_bytes = roundup(size_bytes, sizeof(unsigned long));
- return size_bytes;
+ return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}
#ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/swap.c b/mm/swap.c
index 5dabf444d724..98d08b4579fa 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -46,7 +46,7 @@ int page_cluster;
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif
@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
void __put_page(struct page *page)
{
+ if (is_zone_device_page(page)) {
+ put_dev_pagemap(page->pgmap);
+
+ /*
+ * The page belongs to the device that created pgmap. Do
+ * not return it to page allocator.
+ */
+ return;
+ }
+
if (unlikely(PageCompound(page)))
__put_compound_page(page);
else
@@ -561,20 +571,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
}
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_cache(page);
- int lru = page_lru_base_type(page);
+ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ !PageUnevictable(page)) {
+ bool active = PageActive(page);
- del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ del_page_from_lru_list(page, lruvec,
+ LRU_INACTIVE_ANON + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
+ /*
+ * lazyfree pages are clean anonymous pages. They have
+ * SwapBacked flag cleared to distinguish them from normal
+ * anonymous pages
+ */
+ ClearPageSwapBacked(page);
+ add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
- __count_vm_event(PGDEACTIVATE);
- update_page_reclaim_stat(lruvec, file, 0);
+ __count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
+ update_page_reclaim_stat(lruvec, 1, 0);
}
}
@@ -604,9 +621,9 @@ void lru_add_drain_cpu(int cpu)
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
activate_page_drain(cpu);
}
@@ -638,22 +655,22 @@ void deactivate_file_page(struct page *page)
}
/**
- * deactivate_page - deactivate a page
+ * mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
*
- * deactivate_page() moves @page to the inactive list if @page was on the active
- * list and was not an unevictable page. This is done to accelerate the reclaim
- * of @page.
+ * mark_page_lazyfree() moves @page to the inactive file list.
+ * This is done to accelerate the reclaim of @page.
*/
-void deactivate_page(struct page *page)
+void mark_page_lazyfree(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+ if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- put_cpu_var(lru_deactivate_pvecs);
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+ put_cpu_var(lru_lazyfree_pvecs);
}
}
@@ -693,7 +710,7 @@ void lru_add_drain_all(void)
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index b1ccb58ad397..aa1c415f4abd 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -241,8 +241,10 @@ int enable_swap_slots_cache(void)
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
alloc_swap_slot_cache, free_slot_cache);
- if (ret < 0)
+ if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+ "without swap slots cache.\n", __func__))
goto out_unlock;
+
swap_slot_cache_initialized = true;
__reenable_swap_slots_cache();
out_unlock:
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 473b71e052a8..7bfb9bd1ca21 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* We might race against get_swap_page() and stumble
* across a SWAP_HAS_CACHE swap_map entry whose page
- * has not been brought into the swapcache yet, while
- * the other end is scheduled away waiting on discard
- * I/O completion at scan_swap_map().
- *
- * In order to avoid turning this transitory state
- * into a permanent loop around this -EEXIST case
- * if !CONFIG_PREEMPT and the I/O completion happens
- * to be waiting on the CPU waitqueue where we are now
- * busy looping, we just conditionally invoke the
- * scheduler here, if there are some more important
- * tasks to run.
+ * has not been brought into the swapcache yet.
*/
cond_resched();
continue;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 178130880b90..b86b2aca3fb9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list,
ci_tail = ci + tail;
spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
cluster_set_next(ci_tail, idx);
- unlock_cluster(ci_tail);
+ spin_unlock(&ci_tail->lock);
cluster_set_next_flag(&list->tail, idx, 0);
}
}
@@ -672,6 +672,9 @@ checks:
else
goto done;
}
+ si->swap_map[offset] = usage;
+ inc_cluster_info_page(si, si->cluster_info, offset);
+ unlock_cluster(ci);
if (offset == si->lowest_bit)
si->lowest_bit++;
@@ -685,9 +688,6 @@ checks:
plist_del(&si->avail_list, &swap_avail_head);
spin_unlock(&swap_avail_lock);
}
- si->swap_map[offset] = usage;
- inc_cluster_info_page(si, si->cluster_info, offset);
- unlock_cluster(ci);
si->cluster_next = offset + 1;
slots[n_ret++] = swp_entry(si->type, offset);
@@ -1079,8 +1079,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
p = swap_info_get_cont(entries[i], prev);
if (p)
swap_entry_free(p, entries[i]);
- else
- break;
prev = p;
}
if (p)
@@ -1111,6 +1109,18 @@ int page_swapcount(struct page *page)
return count;
}
+static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
+{
+ int count = 0;
+ pgoff_t offset = swp_offset(entry);
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ count = swap_count(si->swap_map[offset]);
+ unlock_cluster_or_swap_info(si, ci);
+ return count;
+}
+
/*
* How many references to @entry are currently swapped out?
* This does not give an exact answer when swap count is continued,
@@ -1119,17 +1129,11 @@ int page_swapcount(struct page *page)
int __swp_swapcount(swp_entry_t entry)
{
int count = 0;
- pgoff_t offset;
struct swap_info_struct *si;
- struct swap_cluster_info *ci;
si = __swap_info_get(entry);
- if (si) {
- offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(si, offset);
- count = swap_count(si->swap_map[offset]);
- unlock_cluster_or_swap_info(si, ci);
- }
+ if (si)
+ count = swap_swapcount(si, entry);
return count;
}
@@ -1291,7 +1295,8 @@ int free_swap_and_cache(swp_entry_t entry)
* Also recheck PageSwapCache now page is locked (above).
*/
if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || mem_cgroup_swap_full(page))) {
+ (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
+ !swap_swapcount(p, entry)) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 6263affdef88..83a059e8cd1d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -266,9 +266,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
pgoff_t index;
int i;
- cleancache_invalidate_inode(mapping);
if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
- return;
+ goto out;
/* Offsets within partial pages */
partial_start = lstart & (PAGE_SIZE - 1);
@@ -363,7 +362,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
* will be released, just zeroed, so we can bail out now.
*/
if (start >= end)
- return;
+ goto out;
index = start;
for ( ; ; ) {
@@ -410,6 +409,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_release(&pvec);
index++;
}
+
+out:
cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -623,7 +624,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int ret2 = 0;
int did_range_unmap = 0;
- cleancache_invalidate_inode(mapping);
+ if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ goto out;
+
pagevec_init(&pvec, 0);
index = start;
while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
@@ -686,6 +689,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
cond_resched();
index++;
}
+
+out:
cleancache_invalidate_inode(mapping);
return ret;
}
diff --git a/mm/usercopy.c b/mm/usercopy.c
index d155e12563b1..a9852b24715d 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -19,15 +19,9 @@
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
+#include <linux/thread_info.h>
#include <asm/sections.h>
-enum {
- BAD_STACK = -1,
- NOT_STACK = 0,
- GOOD_FRAME,
- GOOD_STACK,
-};
-
/*
* Checks if a given pointer and length is contained by the current
* stack frame (if possible).
@@ -206,17 +200,6 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n,
{
struct page *page;
- /*
- * Some architectures (arm64) return true for virt_addr_valid() on
- * vmalloced addresses. Work around this by checking for vmalloc
- * first.
- *
- * We also need to check for module addresses explicitly since we
- * may copy static data from modules to userspace
- */
- if (is_vmalloc_or_module_addr(ptr))
- return NULL;
-
if (!virt_addr_valid(ptr))
return NULL;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0b057628a7ba..b52aeed3f58e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1579,7 +1579,7 @@ void vfree_atomic(const void *addr)
* have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
* conventions for vfree() arch-depenedent would be a really bad idea)
*
- * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
+ * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc8031ef994d..4e7ed65842af 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -97,8 +97,13 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
- /* Can cgroups be reclaimed below their normal consumption range? */
- unsigned int may_thrash:1;
+ /*
+ * Cgroups are not reclaimed below their configured memory.low,
+ * unless we threaten to OOM. If any cgroups are skipped due to
+ * memory.low and nothing was reclaimed, go back for memory.low.
+ */
+ unsigned int memcg_low_reclaim:1;
+ unsigned int memcg_low_skipped:1;
unsigned int hibernation_mode:1;
@@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
return nr;
}
-bool pgdat_reclaimable(struct pglist_data *pgdat)
-{
- return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
- pgdat_reclaimable_pages(pgdat) * 6;
-}
-
/**
* lruvec_lru_size - Returns the number of pages on the given LRU list.
* @lruvec: lru vector
@@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page,
* Anonymous pages are not handled by flushers and must be written
* from reclaim context. Do not stall reclaim based on them
*/
- if (!page_is_file_cache(page)) {
+ if (!page_is_file_cache(page) ||
+ (PageAnon(page) && !PageSwapBacked(page))) {
*dirty = false;
*writeback = false;
return;
@@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
int may_enter_fs;
enum page_references references = PAGEREF_RECLAIM_CLEAN;
bool dirty, writeback;
- bool lazyfree = false;
- int ret = SWAP_SUCCESS;
cond_resched();
@@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
sc->nr_scanned++;
if (unlikely(!page_evictable(page)))
- goto cull_mlocked;
+ goto activate_locked;
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
/* Double the slab pressure for mapped and swapcache pages */
- if (page_mapped(page) || PageSwapCache(page))
+ if ((page_mapped(page) || PageSwapCache(page)) &&
+ !(PageAnon(page) && !PageSwapBacked(page)))
sc->nr_scanned++;
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
@@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
+ * Lazyfree page could be freed directly
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
+ if (PageAnon(page) && PageSwapBacked(page) &&
+ !PageSwapCache(page)) {
if (!(sc->gfp_mask & __GFP_IO))
goto keep_locked;
if (!add_to_swap(page, page_list))
goto activate_locked;
- lazyfree = true;
may_enter_fs = 1;
/* Adding to swap updated mapping */
@@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
- if (page_mapped(page) && mapping) {
- switch (ret = try_to_unmap(page, lazyfree ?
- (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
- (ttu_flags | TTU_BATCH_FLUSH))) {
- case SWAP_FAIL:
+ if (page_mapped(page)) {
+ if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
nr_unmap_fail++;
goto activate_locked;
- case SWAP_AGAIN:
- goto keep_locked;
- case SWAP_MLOCK:
- goto cull_mlocked;
- case SWAP_LZFREE:
- goto lazyfree;
- case SWAP_SUCCESS:
- ; /* try to free the page below */
}
}
@@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
}
}
-lazyfree:
- if (!mapping || !__remove_mapping(mapping, page, true))
- goto keep_locked;
+ if (PageAnon(page) && !PageSwapBacked(page)) {
+ /* follow __remove_mapping for reference */
+ if (!page_ref_freeze(page, 1))
+ goto keep_locked;
+ if (PageDirty(page)) {
+ page_ref_unfreeze(page, 1);
+ goto keep_locked;
+ }
+ count_vm_event(PGLAZYFREED);
+ } else if (!mapping || !__remove_mapping(mapping, page, true))
+ goto keep_locked;
/*
* At this point, we have no other references and there is
* no way to pick any more up (removed from LRU, removed
@@ -1280,9 +1277,6 @@ lazyfree:
*/
__ClearPageLocked(page);
free_it:
- if (ret == SWAP_LZFREE)
- count_vm_event(PGLAZYFREED);
-
nr_reclaimed++;
/*
@@ -1292,20 +1286,16 @@ free_it:
list_add(&page->lru, &free_pages);
continue;
-cull_mlocked:
- if (PageSwapCache(page))
- try_to_free_swap(page);
- unlock_page(page);
- list_add(&page->lru, &ret_pages);
- continue;
-
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
- if (PageSwapCache(page) && mem_cgroup_swap_full(page))
+ if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
+ PageMlocked(page)))
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
- SetPageActive(page);
- pgactivate++;
+ if (!PageMlocked(page)) {
+ SetPageActive(page);
+ pgactivate++;
+ }
keep_locked:
unlock_page(page);
keep:
@@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
+ TTU_IGNORE_ACCESS, NULL, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1478,12 +1468,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_taken = 0;
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
- unsigned long skipped = 0, total_skipped = 0;
+ unsigned long skipped = 0;
unsigned long scan, nr_pages;
LIST_HEAD(pages_skipped);
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
- !list_empty(src);) {
+ !list_empty(src); scan++) {
struct page *page;
page = lru_to_page(src);
@@ -1497,12 +1487,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
continue;
}
- /*
- * Account for scanned and skipped separetly to avoid the pgdat
- * being prematurely marked unreclaimable by pgdat_reclaimable.
- */
- scan++;
-
switch (__isolate_lru_page(page, mode)) {
case 0:
nr_pages = hpage_nr_pages(page);
@@ -1531,6 +1515,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
if (!list_empty(&pages_skipped)) {
int zid;
+ list_splice(&pages_skipped, src);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
@@ -1538,17 +1523,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
__count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
skipped += nr_skipped[zid];
}
-
- /*
- * Account skipped pages as a partial scan as the pgdat may be
- * close to unreclaimable. If the LRU list is empty, account
- * skipped pages as a full scan.
- */
- total_skipped = list_empty(src) ? skipped : skipped >> 2;
-
- list_splice(&pages_skipped, src);
}
- *nr_scanned = scan + total_skipped;
+ *nr_scanned = scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
scan, skipped, nr_taken, mode, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
@@ -1750,7 +1726,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
- __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
if (current_is_kswapd())
__count_vm_events(PGSCAN_KSWAPD, nr_scanned);
else
@@ -1761,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
+ nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
&stat, false);
spin_lock_irq(&pgdat->lru_lock);
@@ -1953,8 +1928,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (global_reclaim(sc))
- __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned);
__count_vm_events(PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2033,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Both inactive lists should also be large enough that each inactive
* page has a chance to be referenced again before it is reclaimed.
*
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
* The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
* on this LRU, maintained by the pageout code. A zone->inactive_ratio
* of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -2049,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct scan_control *sc, bool trace)
+ struct mem_cgroup *memcg,
+ struct scan_control *sc, bool actual_reclaim)
{
- unsigned long inactive_ratio;
- unsigned long inactive, active;
- enum lru_list inactive_lru = file * LRU_FILE;
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ enum lru_list inactive_lru = file * LRU_FILE;
+ unsigned long inactive, active;
+ unsigned long inactive_ratio;
+ unsigned long refaults;
unsigned long gb;
/*
@@ -2067,27 +2045,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
- gb = (inactive + active) >> (30 - PAGE_SHIFT);
- if (gb)
- inactive_ratio = int_sqrt(10 * gb);
+ if (memcg)
+ refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
else
- inactive_ratio = 1;
+ refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+ /*
+ * When refaults are being observed, it means a new workingset
+ * is being established. Disable active list protection to get
+ * rid of the stale workingset quickly.
+ */
+ if (file && actual_reclaim && lruvec->refaults != refaults) {
+ inactive_ratio = 0;
+ } else {
+ gb = (inactive + active) >> (30 - PAGE_SHIFT);
+ if (gb)
+ inactive_ratio = int_sqrt(10 * gb);
+ else
+ inactive_ratio = 1;
+ }
- if (trace)
- trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
- sc->reclaim_idx,
- lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
- lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
- inactive_ratio, file);
+ if (actual_reclaim)
+ trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+ lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+ lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+ inactive_ratio, file);
return inactive * inactive_ratio < active;
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct scan_control *sc)
+ struct lruvec *lruvec, struct mem_cgroup *memcg,
+ struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru),
+ memcg, sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2123,30 +2116,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
unsigned long anon, file;
- bool force_scan = false;
unsigned long ap, fp;
enum lru_list lru;
- bool some_scanned;
- int pass;
-
- /*
- * If the zone or memcg is small, nr[l] can be 0. This
- * results in no scanning on this priority and a potential
- * priority drop. Global direct reclaim can go to the next
- * zone and tends to have no problems. Global kswapd is for
- * zone balancing and it needs to scan a minimum amount. When
- * reclaiming for a memcg, a priority drop can cause high
- * latencies, so it's better to scan a minimum amount there as
- * well.
- */
- if (current_is_kswapd()) {
- if (!pgdat_reclaimable(pgdat))
- force_scan = true;
- if (!mem_cgroup_online(memcg))
- force_scan = true;
- }
- if (!global_reclaim(sc))
- force_scan = true;
/* If we have no swap space, do not bother scanning anon pages. */
if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
@@ -2218,7 +2189,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, sc, false) &&
+ if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2277,55 +2248,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
fraction[1] = fp;
denominator = ap + fp + 1;
out:
- some_scanned = false;
- /* Only use force_scan on second pass. */
- for (pass = 0; !some_scanned && pass < 2; pass++) {
- *lru_pages = 0;
- for_each_evictable_lru(lru) {
- int file = is_file_lru(lru);
- unsigned long size;
- unsigned long scan;
-
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
-
- if (!scan && pass && force_scan)
- scan = min(size, SWAP_CLUSTER_MAX);
-
- switch (scan_balance) {
- case SCAN_EQUAL:
- /* Scan lists relative to size */
- break;
- case SCAN_FRACT:
- /*
- * Scan types proportional to swappiness and
- * their relative recent reclaim efficiency.
- */
- scan = div64_u64(scan * fraction[file],
- denominator);
- break;
- case SCAN_FILE:
- case SCAN_ANON:
- /* Scan one type exclusively */
- if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
- scan = 0;
- }
- break;
- default:
- /* Look ma, no brain */
- BUG();
- }
+ *lru_pages = 0;
+ for_each_evictable_lru(lru) {
+ int file = is_file_lru(lru);
+ unsigned long size;
+ unsigned long scan;
- *lru_pages += size;
- nr[lru] = scan;
+ size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ scan = size >> sc->priority;
+ /*
+ * If the cgroup's already been deleted, make sure to
+ * scrape out the remaining cache.
+ */
+ if (!scan && !mem_cgroup_online(memcg))
+ scan = min(size, SWAP_CLUSTER_MAX);
+ switch (scan_balance) {
+ case SCAN_EQUAL:
+ /* Scan lists relative to size */
+ break;
+ case SCAN_FRACT:
/*
- * Skip the second pass and don't force_scan,
- * if we found something to scan.
+ * Scan types proportional to swappiness and
+ * their relative recent reclaim efficiency.
*/
- some_scanned |= !!scan;
+ scan = div64_u64(scan * fraction[file],
+ denominator);
+ break;
+ case SCAN_FILE:
+ case SCAN_ANON:
+ /* Scan one type exclusively */
+ if ((scan_balance == SCAN_FILE) != file) {
+ size = 0;
+ scan = 0;
+ }
+ break;
+ default:
+ /* Look ma, no brain */
+ BUG();
}
+
+ *lru_pages += size;
+ nr[lru] = scan;
}
}
@@ -2376,7 +2340,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, sc);
+ lruvec, memcg, sc);
}
}
@@ -2443,7 +2407,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, sc, true))
+ if (inactive_list_is_low(lruvec, false, memcg, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2557,9 +2521,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
unsigned long scanned;
if (mem_cgroup_low(root, memcg)) {
- if (!sc->may_thrash)
+ if (!sc->memcg_low_reclaim) {
+ sc->memcg_low_skipped = 1;
continue;
- mem_cgroup_events(memcg, MEMCG_LOW, 1);
+ }
+ mem_cgroup_event(memcg, MEMCG_LOW);
}
reclaimed = sc->nr_reclaimed;
@@ -2620,6 +2586,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc->nr_scanned - nr_scanned, sc));
+ /*
+ * Kswapd gives up on balancing particular nodes after too
+ * many failures to reclaim anything from them and goes to
+ * sleep. On reclaim progress, reset the failure counter. A
+ * successful direct reclaim run will revive a dormant kswapd.
+ */
+ if (reclaimable)
+ pgdat->kswapd_failures = 0;
+
return reclaimable;
}
@@ -2694,10 +2669,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
GFP_KERNEL | __GFP_HARDWALL))
continue;
- if (sc->priority != DEF_PRIORITY &&
- !pgdat_reclaimable(zone->zone_pgdat))
- continue; /* Let kswapd poll it */
-
/*
* If we already have plenty of memory free for
* compaction in this zone, don't free any more.
@@ -2752,6 +2723,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
sc->gfp_mask = orig_mask;
}
+static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+{
+ struct mem_cgroup *memcg;
+
+ memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
+ do {
+ unsigned long refaults;
+ struct lruvec *lruvec;
+
+ if (memcg)
+ refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
+ else
+ refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+ lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ lruvec->refaults = refaults;
+ } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+}
+
/*
* This is the main entry point to direct page reclaim.
*
@@ -2772,6 +2762,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int initial_priority = sc->priority;
+ pg_data_t *last_pgdat;
+ struct zoneref *z;
+ struct zone *zone;
retry:
delayacct_freepages_start();
@@ -2798,6 +2791,15 @@ retry:
sc->may_writepage = 1;
} while (--sc->priority >= 0);
+ last_pgdat = NULL;
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+ sc->nodemask) {
+ if (zone->zone_pgdat == last_pgdat)
+ continue;
+ last_pgdat = zone->zone_pgdat;
+ snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+ }
+
delayacct_freepages_end();
if (sc->nr_reclaimed)
@@ -2808,16 +2810,17 @@ retry:
return 1;
/* Untapped cgroup reserves? Don't OOM, retry. */
- if (!sc->may_thrash) {
+ if (sc->memcg_low_skipped) {
sc->priority = initial_priority;
- sc->may_thrash = 1;
+ sc->memcg_low_reclaim = 1;
+ sc->memcg_low_skipped = 0;
goto retry;
}
return 0;
}
-static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+static bool allow_direct_reclaim(pg_data_t *pgdat)
{
struct zone *zone;
unsigned long pfmemalloc_reserve = 0;
@@ -2825,10 +2828,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
int i;
bool wmark_ok;
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return true;
+
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
- if (!managed_zone(zone) ||
- pgdat_reclaimable_pages(pgdat) == 0)
+ if (!managed_zone(zone))
+ continue;
+
+ if (!zone_reclaimable_pages(zone))
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
/* Throttle based on the first usable node */
pgdat = zone->zone_pgdat;
- if (pfmemalloc_watermark_ok(pgdat))
+ if (allow_direct_reclaim(pgdat))
goto out;
break;
}
@@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
*/
if (!(gfp_mask & __GFP_FS)) {
wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
- pfmemalloc_watermark_ok(pgdat), HZ);
+ allow_direct_reclaim(pgdat), HZ);
goto check_pending;
}
/* Throttle until kswapd wakes the process */
wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
- pfmemalloc_watermark_ok(pgdat));
+ allow_direct_reclaim(pgdat));
check_pending:
if (fatal_signal_pending(current))
@@ -2950,7 +2958,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long nr_reclaimed;
struct scan_control sc = {
.nr_to_reclaim = SWAP_CLUSTER_MAX,
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
.reclaim_idx = gfp_zone(gfp_mask),
.order = order,
.nodemask = nodemask,
@@ -3030,7 +3038,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
int nid;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+ .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.reclaim_idx = MAX_NR_ZONES - 1,
.target_mem_cgroup = memcg,
@@ -3076,7 +3084,7 @@ static void age_active_anon(struct pglist_data *pgdat,
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, sc, true))
+ if (inactive_list_is_low(lruvec, false, memcg, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3084,22 +3092,44 @@ static void age_active_anon(struct pglist_data *pgdat,
} while (memcg);
}
-static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
+/*
+ * Returns true if there is an eligible zone balanced for the request order
+ * and classzone_idx
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
{
- unsigned long mark = high_wmark_pages(zone);
+ int i;
+ unsigned long mark = -1;
+ struct zone *zone;
- if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
- return false;
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ mark = high_wmark_pages(zone);
+ if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+ return true;
+ }
/*
- * If any eligible zone is balanced then the node is not considered
- * to be congested or dirty
+ * If a node has no populated zone within classzone_idx, it does not
+ * need balancing by definition. This can happen if a zone-restricted
+ * allocation tries to wake a remote kswapd.
*/
- clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
- clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
- clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
+ if (mark == -1)
+ return true;
- return true;
+ return false;
+}
+
+/* Clear pgdat state for congested, dirty or under writeback. */
+static void clear_pgdat_congested(pg_data_t *pgdat)
+{
+ clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+ clear_bit(PGDAT_DIRTY, &pgdat->flags);
+ clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
}
/*
@@ -3110,11 +3140,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
*/
static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
{
- int i;
-
/*
* The throttled processes are normally woken up in balance_pgdat() as
- * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+ * soon as allow_direct_reclaim() is true. But there is a potential
* race between when kswapd checks the watermarks and a process gets
* throttled. There is also a potential race if processes get
* throttled, kswapd wakes, a large process exits thereby balancing the
@@ -3128,17 +3156,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
if (waitqueue_active(&pgdat->pfmemalloc_wait))
wake_up_all(&pgdat->pfmemalloc_wait);
- for (i = 0; i <= classzone_idx; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
+ /* Hopeless node, leave it to direct reclaim */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return true;
- if (!zone_balanced(zone, order, classzone_idx))
- return false;
+ if (pgdat_balanced(pgdat, order, classzone_idx)) {
+ clear_pgdat_congested(pgdat);
+ return true;
}
- return true;
+ return false;
}
/*
@@ -3214,9 +3241,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
count_vm_event(PAGEOUTRUN);
do {
+ unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
- sc.nr_reclaimed = 0;
sc.reclaim_idx = classzone_idx;
/*
@@ -3241,23 +3268,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
}
/*
- * Only reclaim if there are no eligible zones. Check from
- * high to low zone as allocations prefer higher zones.
- * Scanning from low to high zone would allow congestion to be
- * cleared during a very small window when a small low
- * zone was balanced even under extreme pressure when the
- * overall node may be congested. Note that sc.reclaim_idx
- * is not used as buffer_heads_over_limit may have adjusted
- * it.
+ * Only reclaim if there are no eligible zones. Note that
+ * sc.reclaim_idx is not used as buffer_heads_over_limit may
+ * have adjusted it.
*/
- for (i = classzone_idx; i >= 0; i--) {
- zone = pgdat->node_zones + i;
- if (!managed_zone(zone))
- continue;
-
- if (zone_balanced(zone, sc.order, classzone_idx))
- goto out;
- }
+ if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+ goto out;
/*
* Do some background aging of the anon list, to give
@@ -3271,7 +3287,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* If we're getting trouble reclaiming, start doing writepage
* even in laptop mode.
*/
- if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat))
+ if (sc.priority < DEF_PRIORITY - 2)
sc.may_writepage = 1;
/* Call soft limit reclaim before calling shrink_node. */
@@ -3295,7 +3311,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* able to safely make forward progress. Wake them
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
- pfmemalloc_watermark_ok(pgdat))
+ allow_direct_reclaim(pgdat))
wake_up_all(&pgdat->pfmemalloc_wait);
/* Check if kswapd should be suspending */
@@ -3306,11 +3322,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* Raise priority if scanning rate is too low or there was no
* progress in reclaiming pages
*/
- if (raise_priority || !sc.nr_reclaimed)
+ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+ if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
+ if (!sc.nr_reclaimed)
+ pgdat->kswapd_failures++;
+
out:
+ snapshot_refaults(NULL, pgdat);
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
@@ -3320,6 +3341,22 @@ out:
return sc.order;
}
+/*
+ * pgdat->kswapd_classzone_idx is the highest zone index that a recent
+ * allocation request woke kswapd for. When kswapd has not woken recently,
+ * the value is MAX_NR_ZONES which is not a valid index. This compares a
+ * given classzone and returns it or the highest classzone index kswapd
+ * was recently woke for.
+ */
+static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
+ enum zone_type classzone_idx)
+{
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ return classzone_idx;
+
+ return max(pgdat->kswapd_classzone_idx, classzone_idx);
+}
+
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
unsigned int classzone_idx)
{
@@ -3331,7 +3368,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
- /* Try to sleep for a short interval */
+ /*
+ * Try to sleep for a short interval. Note that kcompactd will only be
+ * woken if it is possible to sleep for a short interval. This is
+ * deliberate on the assumption that if reclaim cannot keep an
+ * eligible zone balanced that it's also unlikely that compaction will
+ * succeed.
+ */
if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
/*
* Compaction records what page blocks it recently failed to
@@ -3355,7 +3398,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* the previous request that slept prematurely.
*/
if (remaining) {
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
}
@@ -3409,7 +3452,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
*/
static int kswapd(void *p)
{
- unsigned int alloc_order, reclaim_order, classzone_idx;
+ unsigned int alloc_order, reclaim_order;
+ unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -3439,20 +3483,23 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- pgdat->kswapd_order = alloc_order = reclaim_order = 0;
- pgdat->kswapd_classzone_idx = classzone_idx = 0;
+ pgdat->kswapd_order = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
for ( ; ; ) {
bool ret;
+ alloc_order = reclaim_order = pgdat->kswapd_order;
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+
kswapd_try_sleep:
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
classzone_idx);
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = pgdat->kswapd_classzone_idx;
+ classzone_idx = kswapd_classzone_idx(pgdat, 0);
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3478,9 +3525,6 @@ kswapd_try_sleep:
reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
-
- alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = pgdat->kswapd_classzone_idx;
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3496,7 +3540,6 @@ kswapd_try_sleep:
void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
{
pg_data_t *pgdat;
- int z;
if (!managed_zone(zone))
return;
@@ -3504,22 +3547,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+ pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
- /* Only wake kswapd if all zones are unbalanced */
- for (z = 0; z <= classzone_idx; z++) {
- zone = pgdat->node_zones + z;
- if (!managed_zone(zone))
- continue;
+ /* Hopeless node, leave it to direct reclaim */
+ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
+ return;
- if (zone_balanced(zone, order, classzone_idx))
- return;
- }
+ if (pgdat_balanced(pgdat, order, classzone_idx))
+ return;
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3725,7 +3766,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
int classzone_idx = gfp_zone(gfp_mask);
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
- .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+ .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
.order = order,
.priority = NODE_RECLAIM_PRIORITY,
.may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
@@ -3779,9 +3820,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
- if (!pgdat_reclaimable(pgdat))
- return NODE_RECLAIM_FULL;
-
/*
* Do not scan if the allocation should not be delayed.
*/
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 809025ed97ea..f5fa1bd1eb16 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -954,7 +954,6 @@ const char * const vmstat_text[] = {
"nr_unevictable",
"nr_isolated_anon",
"nr_isolated_file",
- "nr_pages_scanned",
"workingset_refault",
"workingset_activate",
"workingset_nodereclaim",
@@ -992,6 +991,7 @@ const char * const vmstat_text[] = {
"pgfree",
"pgactivate",
"pgdeactivate",
+ "pglazyfree",
"pgfault",
"pgmajfault",
@@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg)
{
}
-/* Walk all the zones in a node and print using a callback */
+/*
+ * Walk zones in a node and print using a callback.
+ * If @assert_populated is true, only use callback for zones that are populated.
+ */
static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ bool assert_populated,
void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
{
struct zone *zone;
@@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
unsigned long flags;
for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
+ if (assert_populated && !populated_zone(zone))
continue;
spin_lock_irqsave(&zone->lock, flags);
@@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
static int frag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, frag_show_print);
+ walk_zones_in_node(m, pgdat, true, frag_show_print);
return 0;
}
@@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
seq_printf(m, "%6d ", order);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print);
return 0;
}
@@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print);
return 0;
}
@@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat)
seq_printf(m, "%12s ", migratetype_names[mtype]);
seq_putc(m, '\n');
- walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print);
+ walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print);
#endif /* CONFIG_PAGE_OWNER */
}
@@ -1378,7 +1382,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n min %lu"
"\n low %lu"
"\n high %lu"
- "\n node_scanned %lu"
"\n spanned %lu"
"\n present %lu"
"\n managed %lu",
@@ -1386,23 +1389,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
- node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED),
zone->spanned_pages,
zone->present_pages,
zone->managed_pages);
- for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- seq_printf(m, "\n %-12s %lu", vmstat_text[i],
- zone_page_state(zone, i));
-
seq_printf(m,
"\n protection: (%ld",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
- seq_printf(m,
- ")"
- "\n pagesets");
+ seq_putc(m, ')');
+
+ /* If unpopulated, no other information is useful */
+ if (!populated_zone(zone)) {
+ seq_putc(m, '\n');
+ return;
+ }
+
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+ seq_printf(m, "\n %-12s %lu", vmstat_text[i],
+ zone_page_state(zone, i));
+
+ seq_printf(m, "\n pagesets");
for_each_online_cpu(i) {
struct per_cpu_pageset *pageset;
@@ -1425,19 +1433,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n node_unreclaimable: %u"
"\n start_pfn: %lu"
"\n node_inactive_ratio: %u",
- !pgdat_reclaimable(zone->zone_pgdat),
+ pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES,
zone->zone_start_pfn,
zone->zone_pgdat->inactive_ratio);
seq_putc(m, '\n');
}
/*
- * Output information about zones in @pgdat.
+ * Output information about zones in @pgdat. All zones are printed regardless
+ * of whether they are populated or not: lowmem_reserve_ratio operates on the
+ * set of all zones and userspace would not be aware of such zones if they are
+ * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio).
*/
static int zoneinfo_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, zoneinfo_show_print);
+ walk_zones_in_node(m, pgdat, false, zoneinfo_show_print);
return 0;
}
@@ -1586,22 +1597,9 @@ int vmstat_refresh(struct ctl_table *table, int write,
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
val = atomic_long_read(&vm_zone_stat[i]);
if (val < 0) {
- switch (i) {
- case NR_PAGES_SCANNED:
- /*
- * This is often seen to go negative in
- * recent kernels, but not to go permanently
- * negative. Whilst it would be nicer not to
- * have exceptions, rooting them out would be
- * another task, of rather low priority.
- */
- break;
- default:
- pr_warn("%s: %s %ld\n",
- __func__, vmstat_text[i], val);
- err = -EINVAL;
- break;
- }
+ pr_warn("%s: %s %ld\n",
+ __func__, vmstat_text[i], val);
+ err = -EINVAL;
}
}
if (err)
@@ -1768,8 +1766,7 @@ void __init init_mm_internals(void)
{
int ret __maybe_unused;
- mm_percpu_wq = alloc_workqueue("mm_percpu_wq",
- WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0);
#ifdef CONFIG_SMP
ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead",
@@ -1857,7 +1854,7 @@ static int unusable_show(struct seq_file *m, void *arg)
if (!node_state(pgdat->node_id, N_MEMORY))
return 0;
- walk_zones_in_node(m, pgdat, unusable_show_print);
+ walk_zones_in_node(m, pgdat, true, unusable_show_print);
return 0;
}
@@ -1909,7 +1906,7 @@ static int extfrag_show(struct seq_file *m, void *arg)
{
pg_data_t *pgdat = (pg_data_t *)arg;
- walk_zones_in_node(m, pgdat, extfrag_show_print);
+ walk_zones_in_node(m, pgdat, true, extfrag_show_print);
return 0;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index eda05c71fa49..b8c9ab678479 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -269,7 +269,6 @@ bool workingset_refault(void *shadow)
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
- rcu_read_unlock();
/*
* The unsigned subtraction here gives an accurate distance
@@ -290,11 +289,15 @@ bool workingset_refault(void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
inc_node_state(pgdat, WORKINGSET_REFAULT);
+ inc_memcg_state(memcg, WORKINGSET_REFAULT);
if (refault_distance <= active_file) {
inc_node_state(pgdat, WORKINGSET_ACTIVATE);
+ inc_memcg_state(memcg, WORKINGSET_ACTIVATE);
+ rcu_read_unlock();
return true;
}
+ rcu_read_unlock();
return false;
}
@@ -472,6 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
if (WARN_ON_ONCE(node->exceptional))
goto out_invalid;
inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+ inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
__radix_tree_delete_node(&mapping->page_tree, node,
workingset_update_node, mapping);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index f9492bccfd79..54f63c4a809a 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -185,6 +185,12 @@ static inline void z3fold_page_lock(struct z3fold_header *zhdr)
spin_lock(&zhdr->page_lock);
}
+/* Try to lock a z3fold page */
+static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
+{
+ return spin_trylock(&zhdr->page_lock);
+}
+
/* Unlock a z3fold page */
static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
{
@@ -385,7 +391,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
spin_lock(&pool->lock);
zhdr = list_first_entry_or_null(&pool->unbuddied[i],
struct z3fold_header, buddy);
- if (!zhdr) {
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
continue;
}
@@ -394,7 +400,6 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
spin_unlock(&pool->lock);
page = virt_to_page(zhdr);
- z3fold_page_lock(zhdr);
if (zhdr->first_chunks == 0) {
if (zhdr->middle_chunks != 0 &&
chunks >= zhdr->start_middle)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b7ee9c34dbd6..d41edd28298b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -276,7 +276,7 @@ struct zs_pool {
struct zspage {
struct {
unsigned int fullness:FULLNESS_BITS;
- unsigned int class:CLASS_BITS;
+ unsigned int class:CLASS_BITS + 1;
unsigned int isolated:ISOLATED_BITS;
unsigned int magic:MAGIC_VAL_BITS;
};