summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/Makefile13
-rw-r--r--mm/backing-dev.c46
-rw-r--r--mm/balloon_compaction.c123
-rw-r--r--mm/bootmem.c4
-rw-r--r--mm/cma.c87
-rw-r--r--mm/compaction.c674
-rw-r--r--mm/debug.c237
-rw-r--r--mm/dmapool.c58
-rw-r--r--mm/filemap.c27
-rw-r--r--mm/gup.c358
-rw-r--r--mm/huge_memory.c35
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/hugetlb_cgroup.c2
-rw-r--r--mm/internal.h26
-rw-r--r--mm/interval_tree.c2
-rw-r--r--mm/iov_iter.c254
-rw-r--r--mm/kmemcheck.c1
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/memblock.c7
-rw-r--r--mm/memcontrol.c421
-rw-r--r--mm/memory-failure.c2
-rw-r--r--mm/memory.c14
-rw-r--r--mm/memory_hotplug.c2
-rw-r--r--mm/mempolicy.c134
-rw-r--r--mm/migrate.c21
-rw-r--r--mm/mlock.c8
-rw-r--r--mm/mmap.c129
-rw-r--r--mm/mmu_notifier.c5
-rw-r--r--mm/mprotect.c20
-rw-r--r--mm/mremap.c5
-rw-r--r--mm/nobootmem.c2
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/oom_kill.c6
-rw-r--r--mm/page-writeback.c10
-rw-r--r--mm/page_alloc.c357
-rw-r--r--mm/pagewalk.c2
-rw-r--r--mm/percpu-km.c16
-rw-r--r--mm/percpu-vm.c184
-rw-r--r--mm/percpu.c524
-rw-r--r--mm/pgtable-generic.c2
-rw-r--r--mm/rmap.c14
-rw-r--r--mm/shmem.c8
-rw-r--r--mm/slab.c364
-rw-r--r--mm/slab.h57
-rw-r--r--mm/slab_common.c178
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c126
-rw-r--r--mm/swap.c30
-rw-r--r--mm/swap_state.c16
-rw-r--r--mm/util.c23
-rw-r--r--mm/vmalloc.c20
-rw-r--r--mm/vmscan.c112
-rw-r--r--mm/vmstat.c153
-rw-r--r--mm/zbud.c14
-rw-r--r--mm/zpool.c2
-rw-r--r--mm/zsmalloc.c47
57 files changed, 3133 insertions, 1881 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 886db2158538..1d1ae6b078fd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP
config HAVE_MEMBLOCK_PHYS_MAP
boolean
+config HAVE_GENERIC_RCU_GUP
+ boolean
+
config ARCH_DISCARD_MEMBLOCK
boolean
@@ -228,11 +231,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
boolean
#
+# support for memory balloon
+config MEMORY_BALLOON
+ boolean
+
+#
# support for memory balloon compaction
config BALLOON_COMPACTION
bool "Allow for balloon memory compaction/migration"
def_bool y
- depends on COMPACTION && VIRTIO_BALLOON
+ depends on COMPACTION && MEMORY_BALLOON
help
Memory fragmentation introduced by ballooning might reduce
significantly the number of 2MB contiguous memory blocks that can be
diff --git a/mm/Makefile b/mm/Makefile
index 632ae77e6070..8405eb0023a9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@
#
mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
+mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o
@@ -11,14 +11,14 @@ ifdef CONFIG_CROSS_MEMORY_ATTACH
mmu-$(CONFIG_MMU) += process_vm_access.o
endif
-obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
+obj-y := filemap.o mempool.o oom_kill.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
- compaction.o balloon_compaction.o vmacache.o \
+ compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- iov_iter.o $(mmu-y)
+ iov_iter.o debug.o $(mmu-y)
obj-y += init-mm.o
@@ -28,6 +28,10 @@ else
obj-y += bootmem.o
endif
+obj-$(CONFIG_ADVISE_SYSCALLS) += fadvise.o
+ifdef CONFIG_MMU
+ obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
+endif
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
@@ -64,3 +68,4 @@ obj-$(CONFIG_ZBUD) += zbud.o
obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
+obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1706cbbdf5f0..0ae0df55000b 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -40,7 +40,7 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
-void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
{
if (wb1 < wb2) {
spin_lock(&wb1->list_lock);
@@ -376,13 +376,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
flush_delayed_work(&bdi->wb.dwork);
WARN_ON(!list_empty(&bdi->work_list));
-
- /*
- * This shouldn't be necessary unless @bdi for some reason has
- * unflushed dirty IO after work_list is drained. Do it anyway
- * just in case.
- */
- cancel_delayed_work_sync(&bdi->wb.dwork);
+ WARN_ON(delayed_work_pending(&bdi->wb.dwork));
}
/*
@@ -402,21 +396,15 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
- struct device *dev = bdi->dev;
-
- if (dev) {
+ if (bdi->dev) {
bdi_set_min_ratio(bdi, 0);
trace_writeback_bdi_unregister(bdi);
bdi_prune_sb(bdi);
bdi_wb_shutdown(bdi);
bdi_debug_unregister(bdi);
-
- spin_lock_bh(&bdi->wb_lock);
+ device_unregister(bdi->dev);
bdi->dev = NULL;
- spin_unlock_bh(&bdi->wb_lock);
-
- device_unregister(dev);
}
}
EXPORT_SYMBOL(bdi_unregister);
@@ -455,7 +443,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi_wb_init(&bdi->wb, bdi);
for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
- err = percpu_counter_init(&bdi->bdi_stat[i], 0);
+ err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
if (err)
goto err;
}
@@ -470,7 +458,7 @@ int bdi_init(struct backing_dev_info *bdi)
bdi->write_bandwidth = INIT_BW;
bdi->avg_write_bandwidth = INIT_BW;
- err = fprop_local_init_percpu(&bdi->completions);
+ err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
if (err) {
err:
@@ -487,8 +475,17 @@ void bdi_destroy(struct backing_dev_info *bdi)
int i;
/*
- * Splice our entries to the default_backing_dev_info, if this
- * bdi disappears
+ * Splice our entries to the default_backing_dev_info. This
+ * condition shouldn't happen. @wb must be empty at this point and
+ * dirty inodes on it might cause other issues. This workaround is
+ * added by ce5f8e779519 ("writeback: splice dirty inode entries to
+ * default bdi on bdi_destroy()") without root-causing the issue.
+ *
+ * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
+ * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
+ *
+ * We should probably add WARN_ON() to find out whether it still
+ * happens and track it down if so.
*/
if (bdi_has_dirty_io(bdi)) {
struct bdi_writeback *dst = &default_backing_dev_info.wb;
@@ -503,12 +500,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
bdi_unregister(bdi);
- /*
- * If bdi_unregister() had already been called earlier, the dwork
- * could still be pending because bdi_prune_sb() can race with the
- * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
- */
- cancel_delayed_work_sync(&bdi->wb.dwork);
+ WARN_ON(delayed_work_pending(&bdi->wb.dwork));
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
@@ -631,7 +623,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
* of sleeping on the congestion queue
*/
if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
- !zone_is_reclaim_congested(zone)) {
+ !test_bit(ZONE_CONGESTED, &zone->flags)) {
cond_resched();
/* In case we scheduled, work out time remaining */
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 6e45a5074bf0..b3cbe19f71b5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -11,32 +11,6 @@
#include <linux/balloon_compaction.h>
/*
- * balloon_devinfo_alloc - allocates a balloon device information descriptor.
- * @balloon_dev_descriptor: pointer to reference the balloon device which
- * this struct balloon_dev_info will be servicing.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct balloon_dev_info which will be used to reference a balloon device
- * as well as to keep track of the balloon device page list.
- */
-struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor)
-{
- struct balloon_dev_info *b_dev_info;
- b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL);
- if (!b_dev_info)
- return ERR_PTR(-ENOMEM);
-
- b_dev_info->balloon_device = balloon_dev_descriptor;
- b_dev_info->mapping = NULL;
- b_dev_info->isolated_pages = 0;
- spin_lock_init(&b_dev_info->pages_lock);
- INIT_LIST_HEAD(&b_dev_info->pages);
-
- return b_dev_info;
-}
-EXPORT_SYMBOL_GPL(balloon_devinfo_alloc);
-
-/*
* balloon_page_enqueue - allocates a new page and inserts it into the balloon
* page list.
* @b_dev_info: balloon device decriptor where we will insert a new page to
@@ -61,7 +35,8 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
*/
BUG_ON(!trylock_page(page));
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages);
+ balloon_page_insert(b_dev_info, page);
+ __count_vm_event(BALLOON_INFLATE);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
return page;
@@ -93,18 +68,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
* to be released by the balloon driver.
*/
if (trylock_page(page)) {
+ if (!PagePrivate(page)) {
+ /* raced with isolation */
+ unlock_page(page);
+ continue;
+ }
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- /*
- * Raise the page refcount here to prevent any wrong
- * attempt to isolate this page, in case of coliding
- * with balloon_page_isolate() just after we release
- * the page lock.
- *
- * balloon_page_free() will take care of dropping
- * this extra refcount later.
- */
- get_page(page);
balloon_page_delete(page);
+ __count_vm_event(BALLOON_DEFLATE);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
unlock_page(page);
dequeued_page = true;
@@ -132,62 +103,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
EXPORT_SYMBOL_GPL(balloon_page_dequeue);
#ifdef CONFIG_BALLOON_COMPACTION
-/*
- * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages.
- * @b_dev_info: holds the balloon device information descriptor.
- * @a_ops: balloon_mapping address_space_operations descriptor.
- *
- * Driver must call it to properly allocate and initialize an instance of
- * struct address_space which will be used as the special page->mapping for
- * balloon device enlisted page instances.
- */
-struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info,
- const struct address_space_operations *a_ops)
-{
- struct address_space *mapping;
-
- mapping = kmalloc(sizeof(*mapping), GFP_KERNEL);
- if (!mapping)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Give a clean 'zeroed' status to all elements of this special
- * balloon page->mapping struct address_space instance.
- */
- address_space_init_once(mapping);
-
- /*
- * Set mapping->flags appropriately, to allow balloon pages
- * ->mapping identification.
- */
- mapping_set_balloon(mapping);
- mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask());
-
- /* balloon's page->mapping->a_ops callback descriptor */
- mapping->a_ops = a_ops;
-
- /*
- * Establish a pointer reference back to the balloon device descriptor
- * this particular page->mapping will be servicing.
- * This is used by compaction / migration procedures to identify and
- * access the balloon device pageset while isolating / migrating pages.
- *
- * As some balloon drivers can register multiple balloon devices
- * for a single guest, this also helps compaction / migration to
- * properly deal with multiple balloon pagesets, when required.
- */
- mapping->private_data = b_dev_info;
- b_dev_info->mapping = mapping;
-
- return mapping;
-}
-EXPORT_SYMBOL_GPL(balloon_mapping_alloc);
static inline void __isolate_balloon_page(struct page *page)
{
- struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
+
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ ClearPagePrivate(page);
list_del(&page->lru);
b_dev_info->isolated_pages++;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
@@ -195,20 +118,16 @@ static inline void __isolate_balloon_page(struct page *page)
static inline void __putback_balloon_page(struct page *page)
{
- struct balloon_dev_info *b_dev_info = page->mapping->private_data;
+ struct balloon_dev_info *b_dev_info = balloon_page_device(page);
unsigned long flags;
+
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ SetPagePrivate(page);
list_add(&page->lru, &b_dev_info->pages);
b_dev_info->isolated_pages--;
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
-static inline int __migrate_balloon_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
-{
- return page->mapping->a_ops->migratepage(mapping, newpage, page, mode);
-}
-
/* __isolate_lru_page() counterpart for a ballooned page */
bool balloon_page_isolate(struct page *page)
{
@@ -235,12 +154,11 @@ bool balloon_page_isolate(struct page *page)
*/
if (likely(trylock_page(page))) {
/*
- * A ballooned page, by default, has just one refcount.
+ * A ballooned page, by default, has PagePrivate set.
* Prevent concurrent compaction threads from isolating
- * an already isolated balloon page by refcount check.
+ * an already isolated balloon page by clearing it.
*/
- if (__is_movable_balloon_page(page) &&
- page_count(page) == 2) {
+ if (balloon_page_movable(page)) {
__isolate_balloon_page(page);
unlock_page(page);
return true;
@@ -276,7 +194,7 @@ void balloon_page_putback(struct page *page)
int balloon_page_migrate(struct page *newpage,
struct page *page, enum migrate_mode mode)
{
- struct address_space *mapping;
+ struct balloon_dev_info *balloon = balloon_page_device(page);
int rc = -EAGAIN;
/*
@@ -292,9 +210,8 @@ int balloon_page_migrate(struct page *newpage,
return rc;
}
- mapping = page->mapping;
- if (mapping)
- rc = __migrate_balloon_page(mapping, newpage, page, mode);
+ if (balloon && balloon->migratepage)
+ rc = balloon->migratepage(balloon, newpage, page, mode);
unlock_page(newpage);
return rc;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 90bd3507b413..8a000cebb0d7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -16,9 +16,9 @@
#include <linux/kmemleak.h>
#include <linux/range.h>
#include <linux/memblock.h>
+#include <linux/bug.h>
+#include <linux/io.h>
-#include <asm/bug.h>
-#include <asm/io.h>
#include <asm/processor.h>
#include "internal.h"
diff --git a/mm/cma.c b/mm/cma.c
index c17751c0dcaf..963bc4add9af 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -32,6 +32,7 @@
#include <linux/slab.h>
#include <linux/log2.h>
#include <linux/cma.h>
+#include <linux/highmem.h>
struct cma {
unsigned long base_pfn;
@@ -57,7 +58,9 @@ unsigned long cma_get_size(struct cma *cma)
static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order)
{
- return (1UL << (align_order >> cma->order_per_bit)) - 1;
+ if (align_order <= cma->order_per_bit)
+ return 0;
+ return (1UL << (align_order - cma->order_per_bit)) - 1;
}
static unsigned long cma_bitmap_maxno(struct cma *cma)
@@ -140,6 +143,54 @@ static int __init cma_init_reserved_areas(void)
core_initcall(cma_init_reserved_areas);
/**
+ * cma_init_reserved_mem() - create custom contiguous area from reserved memory
+ * @base: Base address of the reserved area
+ * @size: Size of the reserved area (in bytes),
+ * @order_per_bit: Order of pages represented by one bit on bitmap.
+ * @res_cma: Pointer to store the created cma region.
+ *
+ * This function creates custom contiguous area from already reserved memory.
+ */
+int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
+ int order_per_bit, struct cma **res_cma)
+{
+ struct cma *cma;
+ phys_addr_t alignment;
+
+ /* Sanity checks */
+ if (cma_area_count == ARRAY_SIZE(cma_areas)) {
+ pr_err("Not enough slots for CMA reserved regions!\n");
+ return -ENOSPC;
+ }
+
+ if (!size || !memblock_is_region_reserved(base, size))
+ return -EINVAL;
+
+ /* ensure minimal alignment requied by mm core */
+ alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+
+ /* alignment should be aligned with order_per_bit */
+ if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
+ return -EINVAL;
+
+ if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size)
+ return -EINVAL;
+
+ /*
+ * Each reserved area must be initialised later, when more kernel
+ * subsystems (like slab allocator) are available.
+ */
+ cma = &cma_areas[cma_area_count];
+ cma->base_pfn = PFN_DOWN(base);
+ cma->count = size >> PAGE_SHIFT;
+ cma->order_per_bit = order_per_bit;
+ *res_cma = cma;
+ cma_area_count++;
+
+ return 0;
+}
+
+/**
* cma_declare_contiguous() - reserve custom contiguous area
* @base: Base address of the reserved area optional, use 0 for any
* @size: Size of the reserved area (in bytes),
@@ -162,7 +213,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
phys_addr_t alignment, unsigned int order_per_bit,
bool fixed, struct cma **res_cma)
{
- struct cma *cma;
+ phys_addr_t memblock_end = memblock_end_of_DRAM();
+ phys_addr_t highmem_start = __pa(high_memory);
int ret = 0;
pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n",
@@ -196,6 +248,24 @@ int __init cma_declare_contiguous(phys_addr_t base,
if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit))
return -EINVAL;
+ /*
+ * adjust limit to avoid crossing low/high memory boundary for
+ * automatically allocated regions
+ */
+ if (((limit == 0 || limit > memblock_end) &&
+ (memblock_end - size < highmem_start &&
+ memblock_end > highmem_start)) ||
+ (!fixed && limit > highmem_start && limit - size < highmem_start)) {
+ limit = highmem_start;
+ }
+
+ if (fixed && base < highmem_start && base+size > highmem_start) {
+ ret = -EINVAL;
+ pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n",
+ (unsigned long)base, (unsigned long)highmem_start);
+ goto err;
+ }
+
/* Reserve memory */
if (base && fixed) {
if (memblock_is_region_reserved(base, size) ||
@@ -214,16 +284,9 @@ int __init cma_declare_contiguous(phys_addr_t base,
}
}
- /*
- * Each reserved area must be initialised later, when more kernel
- * subsystems (like slab allocator) are available.
- */
- cma = &cma_areas[cma_area_count];
- cma->base_pfn = PFN_DOWN(base);
- cma->count = size >> PAGE_SHIFT;
- cma->order_per_bit = order_per_bit;
- *res_cma = cma;
- cma_area_count++;
+ ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma);
+ if (ret)
+ goto err;
pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M,
(unsigned long)base);
diff --git a/mm/compaction.c b/mm/compaction.c
index 21bf292b642a..edba18aed173 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype)
return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
}
+/*
+ * Check that the whole (or subset of) a pageblock given by the interval of
+ * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
+ * with the migration of free compaction scanner. The scanners then need to
+ * use only pfn_valid_within() check for arches that allow holes within
+ * pageblocks.
+ *
+ * Return struct page pointer of start_pfn, or NULL if checks were not passed.
+ *
+ * It's possible on some configurations to have a setup like node0 node1 node0
+ * i.e. it's possible that all pages within a zones range of pages do not
+ * belong to a single zone. We assume that a border between node0 and node1
+ * can occur within a single pageblock, but not a node0 node1 node0
+ * interleaving within a single pageblock. It is therefore sufficient to check
+ * the first and last page of a pageblock and avoid checking each individual
+ * page in a pageblock.
+ */
+static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
+ unsigned long end_pfn, struct zone *zone)
+{
+ struct page *start_page;
+ struct page *end_page;
+
+ /* end_pfn is one past the range we are checking */
+ end_pfn--;
+
+ if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
+ return NULL;
+
+ start_page = pfn_to_page(start_pfn);
+
+ if (page_zone(start_page) != zone)
+ return NULL;
+
+ end_page = pfn_to_page(end_pfn);
+
+ /* This gives a shorter code than deriving page_zone(end_page) */
+ if (page_zone_id(start_page) != page_zone_id(end_page))
+ return NULL;
+
+ return start_page;
+}
+
#ifdef CONFIG_COMPACTION
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
@@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat)
*/
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool set_unsuitable, bool migrate_scanner)
+ bool migrate_scanner)
{
struct zone *zone = cc->zone;
unsigned long pfn;
@@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc,
if (nr_isolated)
return;
- /*
- * Only skip pageblocks when all forms of compaction will be known to
- * fail in the near future.
- */
- if (set_unsuitable)
- set_pageblock_skip(page);
+ set_pageblock_skip(page);
pfn = page_to_pfn(page);
@@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc,
static void update_pageblock_skip(struct compact_control *cc,
struct page *page, unsigned long nr_isolated,
- bool set_unsuitable, bool migrate_scanner)
+ bool migrate_scanner)
{
}
#endif /* CONFIG_COMPACTION */
-static inline bool should_release_lock(spinlock_t *lock)
+/*
+ * Compaction requires the taking of some coarse locks that are potentially
+ * very heavily contended. For async compaction, back out if the lock cannot
+ * be taken immediately. For sync compaction, spin on the lock if needed.
+ *
+ * Returns true if the lock is held
+ * Returns false if the lock is not held and compaction should abort
+ */
+static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+ struct compact_control *cc)
{
- return need_resched() || spin_is_contended(lock);
+ if (cc->mode == MIGRATE_ASYNC) {
+ if (!spin_trylock_irqsave(lock, *flags)) {
+ cc->contended = COMPACT_CONTENDED_LOCK;
+ return false;
+ }
+ } else {
+ spin_lock_irqsave(lock, *flags);
+ }
+
+ return true;
}
/*
* Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. Check if the process needs to be scheduled or
- * if the lock is contended. For async compaction, back out in the event
- * if contention is severe. For sync compaction, schedule.
+ * very heavily contended. The lock should be periodically unlocked to avoid
+ * having disabled IRQs for a long time, even when there is nobody waiting on
+ * the lock. It might also be that allowing the IRQs will result in
+ * need_resched() becoming true. If scheduling is needed, async compaction
+ * aborts. Sync compaction schedules.
+ * Either compaction type will also abort if a fatal signal is pending.
+ * In either case if the lock was locked, it is dropped and not regained.
*
- * Returns true if the lock is held.
- * Returns false if the lock is released and compaction should abort
+ * Returns true if compaction should abort due to fatal signal pending, or
+ * async compaction due to need_resched()
+ * Returns false when compaction can continue (sync compaction might have
+ * scheduled)
*/
-static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
- bool locked, struct compact_control *cc)
+static bool compact_unlock_should_abort(spinlock_t *lock,
+ unsigned long flags, bool *locked, struct compact_control *cc)
{
- if (should_release_lock(lock)) {
- if (locked) {
- spin_unlock_irqrestore(lock, *flags);
- locked = false;
- }
+ if (*locked) {
+ spin_unlock_irqrestore(lock, flags);
+ *locked = false;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = COMPACT_CONTENDED_SCHED;
+ return true;
+ }
- /* async aborts if taking too long or contended */
+ if (need_resched()) {
if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return false;
+ cc->contended = COMPACT_CONTENDED_SCHED;
+ return true;
}
-
cond_resched();
}
- if (!locked)
- spin_lock_irqsave(lock, *flags);
- return true;
+ return false;
}
/*
* Aside from avoiding lock contention, compaction also periodically checks
* need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_checklock_irqsave() does, but
+ * compaction. This is similar to what compact_unlock_should_abort() does, but
* is used where no lock is concerned.
*
* Returns false when no scheduling was needed, or sync compaction scheduled.
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc)
/* async compaction aborts if contended */
if (need_resched()) {
if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
+ cc->contended = COMPACT_CONTENDED_SCHED;
return true;
}
@@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc)
static bool suitable_migration_target(struct page *page)
{
/* If the page is a large free page, then disallow migration */
- if (PageBuddy(page) && page_order(page) >= pageblock_order)
- return false;
+ if (PageBuddy(page)) {
+ /*
+ * We are checking page_order without zone->lock taken. But
+ * the only small danger is that we skip a potentially suitable
+ * pageblock, so it's not worth to check order for valid range.
+ */
+ if (page_order_unsafe(page) >= pageblock_order)
+ return false;
+ }
/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
if (migrate_async_suitable(get_pageblock_migratetype(page)))
@@ -267,16 +337,16 @@ static bool suitable_migration_target(struct page *page)
* (even though it may still end up isolating some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
- unsigned long blockpfn,
+ unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
struct page *cursor, *valid_page = NULL;
- unsigned long flags;
+ unsigned long flags = 0;
bool locked = false;
- bool checked_pageblock = false;
+ unsigned long blockpfn = *start_pfn;
cursor = pfn_to_page(blockpfn);
@@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
int isolated, i;
struct page *page = cursor;
+ /*
+ * Periodically drop the lock (if held) regardless of its
+ * contention, to give chance to IRQs. Abort if fatal signal
+ * pending or async compaction detects need_resched()
+ */
+ if (!(blockpfn % SWAP_CLUSTER_MAX)
+ && compact_unlock_should_abort(&cc->zone->lock, flags,
+ &locked, cc))
+ break;
+
nr_scanned++;
if (!pfn_valid_within(blockpfn))
goto isolate_fail;
@@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
goto isolate_fail;
/*
- * The zone lock must be held to isolate freepages.
- * Unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock and we acquire the lock as late as
- * possible.
+ * If we already hold the lock, we can skip some rechecking.
+ * Note that if we hold the lock now, checked_pageblock was
+ * already set in some previous iteration (or strict is true),
+ * so it is correct to skip the suitable migration target
+ * recheck as well.
*/
- locked = compact_checklock_irqsave(&cc->zone->lock, &flags,
- locked, cc);
- if (!locked)
- break;
-
- /* Recheck this is a suitable migration target under lock */
- if (!strict && !checked_pageblock) {
+ if (!locked) {
/*
- * We need to check suitability of pageblock only once
- * and this isolate_freepages_block() is called with
- * pageblock range, so just check once is sufficient.
+ * The zone lock must be held to isolate freepages.
+ * Unfortunately this is a very coarse lock and can be
+ * heavily contended if there are parallel allocations
+ * or parallel compactions. For async compaction do not
+ * spin on the lock and we acquire the lock as late as
+ * possible.
*/
- checked_pageblock = true;
- if (!suitable_migration_target(page))
+ locked = compact_trylock_irqsave(&cc->zone->lock,
+ &flags, cc);
+ if (!locked)
break;
- }
- /* Recheck this is a buddy page under lock */
- if (!PageBuddy(page))
- goto isolate_fail;
+ /* Recheck this is a buddy page under lock */
+ if (!PageBuddy(page))
+ goto isolate_fail;
+ }
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
@@ -346,6 +423,9 @@ isolate_fail:
}
+ /* Record how far we have got within the block */
+ *start_pfn = blockpfn;
+
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
/*
@@ -361,8 +441,7 @@ isolate_fail:
/* Update the pageblock-skip if the whole pageblock was scanned */
if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, true,
- false);
+ update_pageblock_skip(cc, valid_page, total_isolated, false);
count_compact_events(COMPACTFREE_SCANNED, nr_scanned);
if (total_isolated)
@@ -390,19 +469,21 @@ isolate_freepages_range(struct compact_control *cc,
unsigned long isolated, pfn, block_end_pfn;
LIST_HEAD(freelist);
- for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
- if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn)))
- break;
+ pfn = start_pfn;
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+
+ for (; pfn < end_pfn; pfn += isolated,
+ block_end_pfn += pageblock_nr_pages) {
+ /* Protect pfn from changing by isolate_freepages_block */
+ unsigned long isolate_start_pfn = pfn;
- /*
- * On subsequent iterations ALIGN() is actually not needed,
- * but we keep it that we not to complicate the code.
- */
- block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- isolated = isolate_freepages_block(cc, pfn, block_end_pfn,
- &freelist, true);
+ if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ break;
+
+ isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, &freelist, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -433,22 +514,19 @@ isolate_freepages_range(struct compact_control *cc,
}
/* Update the number of anon and file isolated pages in the zone */
-static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
+static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
struct page *page;
unsigned int count[2] = { 0, };
+ if (list_empty(&cc->migratepages))
+ return;
+
list_for_each_entry(page, &cc->migratepages, lru)
count[!!page_is_file_cache(page)]++;
- /* If locked we can use the interrupt unsafe versions */
- if (locked) {
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
- } else {
- mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
- mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
- }
+ mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}
/* Similar to reclaim, but different enough that they don't share logic */
@@ -467,40 +545,34 @@ static bool too_many_isolated(struct zone *zone)
}
/**
- * isolate_migratepages_range() - isolate all migrate-able pages in range.
- * @zone: Zone pages are in.
+ * isolate_migratepages_block() - isolate all migrate-able pages within
+ * a single pageblock
* @cc: Compaction control structure.
- * @low_pfn: The first PFN of the range.
- * @end_pfn: The one-past-the-last PFN of the range.
- * @unevictable: true if it allows to isolate unevictable pages
+ * @low_pfn: The first PFN to isolate
+ * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock
+ * @isolate_mode: Isolation mode to be used.
*
* Isolate all pages that can be migrated from the range specified by
- * [low_pfn, end_pfn). Returns zero if there is a fatal signal
- * pending), otherwise PFN of the first page that was not scanned
- * (which may be both less, equal to or more then end_pfn).
+ * [low_pfn, end_pfn). The range is expected to be within same pageblock.
+ * Returns zero if there is a fatal signal pending, otherwise PFN of the
+ * first page that was not scanned (which may be both less, equal to or more
+ * than end_pfn).
*
- * Assumes that cc->migratepages is empty and cc->nr_migratepages is
- * zero.
- *
- * Apart from cc->migratepages and cc->nr_migratetypes this function
- * does not modify any cc's fields, in particular it does not modify
- * (or read for that matter) cc->migrate_pfn.
+ * The pages are isolated on cc->migratepages list (not required to be empty),
+ * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
+ * is neither read nor updated.
*/
-unsigned long
-isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
- unsigned long low_pfn, unsigned long end_pfn, bool unevictable)
+static unsigned long
+isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
+ unsigned long end_pfn, isolate_mode_t isolate_mode)
{
- unsigned long last_pageblock_nr = 0, pageblock_nr;
+ struct zone *zone = cc->zone;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
struct lruvec *lruvec;
- unsigned long flags;
+ unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
- bool set_unsuitable = true;
- const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ?
- ISOLATE_ASYNC_MIGRATE : 0) |
- (unevictable ? ISOLATE_UNEVICTABLE : 0);
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -523,72 +595,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
/* Time to isolate some pages for migration */
for (; low_pfn < end_pfn; low_pfn++) {
- /* give a chance to irqs before checking need_resched() */
- if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) {
- if (should_release_lock(&zone->lru_lock)) {
- spin_unlock_irqrestore(&zone->lru_lock, flags);
- locked = false;
- }
- }
-
/*
- * migrate_pfn does not necessarily start aligned to a
- * pageblock. Ensure that pfn_valid is called when moving
- * into a new MAX_ORDER_NR_PAGES range in case of large
- * memory holes within the zone
+ * Periodically drop the lock (if held) regardless of its
+ * contention, to give chance to IRQs. Abort async compaction
+ * if contended.
*/
- if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
- if (!pfn_valid(low_pfn)) {
- low_pfn += MAX_ORDER_NR_PAGES - 1;
- continue;
- }
- }
+ if (!(low_pfn % SWAP_CLUSTER_MAX)
+ && compact_unlock_should_abort(&zone->lru_lock, flags,
+ &locked, cc))
+ break;
if (!pfn_valid_within(low_pfn))
continue;
nr_scanned++;
- /*
- * Get the page and ensure the page is within the same zone.
- * See the comment in isolate_freepages about overlapping
- * nodes. It is deliberate that the new zone lock is not taken
- * as memory compaction should not move pages between nodes.
- */
page = pfn_to_page(low_pfn);
- if (page_zone(page) != zone)
- continue;
if (!valid_page)
valid_page = page;
- /* If isolation recently failed, do not retry */
- pageblock_nr = low_pfn >> pageblock_order;
- if (last_pageblock_nr != pageblock_nr) {
- int mt;
-
- last_pageblock_nr = pageblock_nr;
- if (!isolation_suitable(cc, page))
- goto next_pageblock;
+ /*
+ * Skip if free. We read page order here without zone lock
+ * which is generally unsafe, but the race window is small and
+ * the worst thing that can happen is that we skip some
+ * potential isolation targets.
+ */
+ if (PageBuddy(page)) {
+ unsigned long freepage_order = page_order_unsafe(page);
/*
- * For async migration, also only scan in MOVABLE
- * blocks. Async migration is optimistic to see if
- * the minimum amount of work satisfies the allocation
+ * Without lock, we cannot be sure that what we got is
+ * a valid page order. Consider only values in the
+ * valid order range to prevent low_pfn overflow.
*/
- mt = get_pageblock_migratetype(page);
- if (cc->mode == MIGRATE_ASYNC &&
- !migrate_async_suitable(mt)) {
- set_unsuitable = false;
- goto next_pageblock;
- }
- }
-
- /*
- * Skip if free. page_order cannot be used without zone->lock
- * as nothing prevents parallel allocations or buddy merging.
- */
- if (PageBuddy(page))
+ if (freepage_order > 0 && freepage_order < MAX_ORDER)
+ low_pfn += (1UL << freepage_order) - 1;
continue;
+ }
/*
* Check may be lockless but that's ok as we recheck later.
@@ -597,7 +640,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
*/
if (!PageLRU(page)) {
if (unlikely(balloon_page_movable(page))) {
- if (locked && balloon_page_isolate(page)) {
+ if (balloon_page_isolate(page)) {
/* Successfully isolated */
goto isolate_success;
}
@@ -617,8 +660,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
*/
if (PageTransHuge(page)) {
if (!locked)
- goto next_pageblock;
- low_pfn += (1 << compound_order(page)) - 1;
+ low_pfn = ALIGN(low_pfn + 1,
+ pageblock_nr_pages) - 1;
+ else
+ low_pfn += (1 << compound_order(page)) - 1;
+
continue;
}
@@ -631,24 +677,26 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
page_count(page) > page_mapcount(page))
continue;
- /* Check if it is ok to still hold the lock */
- locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
- locked, cc);
- if (!locked || fatal_signal_pending(current))
- break;
+ /* If we already hold the lock, we can skip some rechecking */
+ if (!locked) {
+ locked = compact_trylock_irqsave(&zone->lru_lock,
+ &flags, cc);
+ if (!locked)
+ break;
- /* Recheck PageLRU and PageTransHuge under lock */
- if (!PageLRU(page))
- continue;
- if (PageTransHuge(page)) {
- low_pfn += (1 << compound_order(page)) - 1;
- continue;
+ /* Recheck PageLRU and PageTransHuge under lock */
+ if (!PageLRU(page))
+ continue;
+ if (PageTransHuge(page)) {
+ low_pfn += (1 << compound_order(page)) - 1;
+ continue;
+ }
}
lruvec = mem_cgroup_page_lruvec(page, zone);
/* Try isolate the page */
- if (__isolate_lru_page(page, mode) != 0)
+ if (__isolate_lru_page(page, isolate_mode) != 0)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -667,14 +715,14 @@ isolate_success:
++low_pfn;
break;
}
-
- continue;
-
-next_pageblock:
- low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1;
}
- acct_isolated(zone, locked, cc);
+ /*
+ * The PageBuddy() check could have potentially brought us outside
+ * the range to be scanned.
+ */
+ if (unlikely(low_pfn > end_pfn))
+ low_pfn = end_pfn;
if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -684,8 +732,7 @@ next_pageblock:
* if the whole pageblock was scanned without isolating any page.
*/
if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated,
- set_unsuitable, true);
+ update_pageblock_skip(cc, valid_page, nr_isolated, true);
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
@@ -696,17 +743,65 @@ next_pageblock:
return low_pfn;
}
+/**
+ * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
+ * @cc: Compaction control structure.
+ * @start_pfn: The first PFN to start isolating.
+ * @end_pfn: The one-past-last PFN.
+ *
+ * Returns zero if isolation fails fatally due to e.g. pending signal.
+ * Otherwise, function returns one-past-the-last PFN of isolated page
+ * (which may be greater than end_pfn if end fell in a middle of a THP page).
+ */
+unsigned long
+isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn, block_end_pfn;
+
+ /* Scan block by block. First and last block may be incomplete */
+ pfn = start_pfn;
+ block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+
+ for (; pfn < end_pfn; pfn = block_end_pfn,
+ block_end_pfn += pageblock_nr_pages) {
+
+ block_end_pfn = min(block_end_pfn, end_pfn);
+
+ if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone))
+ continue;
+
+ pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
+ ISOLATE_UNEVICTABLE);
+
+ /*
+ * In case of fatal failure, release everything that might
+ * have been isolated in the previous iteration, and signal
+ * the failure back to caller.
+ */
+ if (!pfn) {
+ putback_movable_pages(&cc->migratepages);
+ cc->nr_migratepages = 0;
+ break;
+ }
+ }
+ acct_isolated(cc->zone, cc);
+
+ return pfn;
+}
+
#endif /* CONFIG_COMPACTION || CONFIG_CMA */
#ifdef CONFIG_COMPACTION
/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
-static void isolate_freepages(struct zone *zone,
- struct compact_control *cc)
+static void isolate_freepages(struct compact_control *cc)
{
+ struct zone *zone = cc->zone;
struct page *page;
unsigned long block_start_pfn; /* start of current pageblock */
+ unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
int nr_freepages = cc->nr_freepages;
@@ -715,14 +810,15 @@ static void isolate_freepages(struct zone *zone,
/*
* Initialise the free scanner. The starting point is where we last
* successfully isolated from, zone-cached value, or the end of the
- * zone when isolating for the first time. We need this aligned to
- * the pageblock boundary, because we do
+ * zone when isolating for the first time. For looping we also need
+ * this pfn aligned down to the pageblock boundary, because we do
* block_start_pfn -= pageblock_nr_pages in the for loop.
* For ending point, take care when isolating in last pageblock of a
* a zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
+ isolate_start_pfn = cc->free_pfn;
block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
@@ -735,7 +831,8 @@ static void isolate_freepages(struct zone *zone,
*/
for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
block_end_pfn = block_start_pfn,
- block_start_pfn -= pageblock_nr_pages) {
+ block_start_pfn -= pageblock_nr_pages,
+ isolate_start_pfn = block_start_pfn) {
unsigned long isolated;
/*
@@ -747,18 +844,9 @@ static void isolate_freepages(struct zone *zone,
&& compact_should_abort(cc))
break;
- if (!pfn_valid(block_start_pfn))
- continue;
-
- /*
- * Check for overlapping nodes/zones. It's possible on some
- * configurations to have a setup like
- * node0 node1 node0
- * i.e. it's possible that all pages within a zones range of
- * pages do not belong to a single zone.
- */
- page = pfn_to_page(block_start_pfn);
- if (page_zone(page) != zone)
+ page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
+ zone);
+ if (!page)
continue;
/* Check the block is suitable for migration */
@@ -769,13 +857,25 @@ static void isolate_freepages(struct zone *zone,
if (!isolation_suitable(cc, page))
continue;
- /* Found a block suitable for isolating free pages from */
- cc->free_pfn = block_start_pfn;
- isolated = isolate_freepages_block(cc, block_start_pfn,
+ /* Found a block suitable for isolating free pages from. */
+ isolated = isolate_freepages_block(cc, &isolate_start_pfn,
block_end_pfn, freelist, false);
nr_freepages += isolated;
/*
+ * Remember where the free scanner should restart next time,
+ * which is where isolate_freepages_block() left off.
+ * But if it scanned the whole pageblock, isolate_start_pfn
+ * now points at block_end_pfn, which is the start of the next
+ * pageblock.
+ * In that case we will however want to restart at the start
+ * of the previous pageblock.
+ */
+ cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
+ isolate_start_pfn :
+ block_start_pfn - pageblock_nr_pages;
+
+ /*
* Set a flag that we successfully isolated in this pageblock.
* In the next loop iteration, zone->compact_cached_free_pfn
* will not be updated and thus it will effectively contain the
@@ -822,7 +922,7 @@ static struct page *compaction_alloc(struct page *migratepage,
*/
if (list_empty(&cc->freepages)) {
if (!cc->contended)
- isolate_freepages(cc->zone, cc);
+ isolate_freepages(cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -856,38 +956,84 @@ typedef enum {
} isolate_migrate_t;
/*
- * Isolate all pages that can be migrated from the block pointed to by
- * the migrate scanner within compact_control.
+ * Isolate all pages that can be migrated from the first suitable block,
+ * starting at the block pointed to by the migrate scanner pfn within
+ * compact_control.
*/
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
+ struct page *page;
+ const isolate_mode_t isolate_mode =
+ (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0);
- /* Do not scan outside zone boundaries */
- low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
+ /*
+ * Start at where we last stopped, or beginning of the zone as
+ * initialized by compact_zone()
+ */
+ low_pfn = cc->migrate_pfn;
/* Only scan within a pageblock boundary */
end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
- /* Do not cross the free scanner or scan within a memory hole */
- if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
- cc->migrate_pfn = end_pfn;
- return ISOLATE_NONE;
- }
+ /*
+ * Iterate over whole pageblocks until we find the first suitable.
+ * Do not cross the free scanner.
+ */
+ for (; end_pfn <= cc->free_pfn;
+ low_pfn = end_pfn, end_pfn += pageblock_nr_pages) {
- /* Perform the isolation */
- low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false);
- if (!low_pfn || cc->contended)
- return ISOLATE_ABORT;
+ /*
+ * This can potentially iterate a massively long zone with
+ * many pageblocks unsuitable, so periodically check if we
+ * need to schedule, or even abort async compaction.
+ */
+ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
+ && compact_should_abort(cc))
+ break;
+
+ page = pageblock_pfn_to_page(low_pfn, end_pfn, zone);
+ if (!page)
+ continue;
+
+ /* If isolation recently failed, do not retry */
+ if (!isolation_suitable(cc, page))
+ continue;
+
+ /*
+ * For async compaction, also only scan in MOVABLE blocks.
+ * Async compaction is optimistic to see if the minimum amount
+ * of work satisfies the allocation.
+ */
+ if (cc->mode == MIGRATE_ASYNC &&
+ !migrate_async_suitable(get_pageblock_migratetype(page)))
+ continue;
+
+ /* Perform the isolation */
+ low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
+ isolate_mode);
+ if (!low_pfn || cc->contended)
+ return ISOLATE_ABORT;
+
+ /*
+ * Either we isolated something and proceed with migration. Or
+ * we failed and compact_zone should decide if we should
+ * continue or not.
+ */
+ break;
+ }
+
+ acct_isolated(zone, cc);
+ /* Record where migration scanner will be restarted */
cc->migrate_pfn = low_pfn;
- return ISOLATE_SUCCESS;
+ return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
-static int compact_finished(struct zone *zone,
- struct compact_control *cc)
+static int compact_finished(struct zone *zone, struct compact_control *cc,
+ const int migratetype)
{
unsigned int order;
unsigned long watermark;
@@ -933,7 +1079,7 @@ static int compact_finished(struct zone *zone,
struct free_area *area = &zone->free_area[order];
/* Job done if page is free of the right migratetype */
- if (!list_empty(&area->free_list[cc->migratetype]))
+ if (!list_empty(&area->free_list[migratetype]))
return COMPACT_PARTIAL;
/* Job done if allocation would set block type */
@@ -999,6 +1145,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
int ret;
unsigned long start_pfn = zone->zone_start_pfn;
unsigned long end_pfn = zone_end_pfn(zone);
+ const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
const bool sync = cc->mode != MIGRATE_ASYNC;
ret = compaction_suitable(zone, cc->order);
@@ -1041,7 +1188,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
migrate_prep_local();
- while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ while ((ret = compact_finished(zone, cc, migratetype)) ==
+ COMPACT_CONTINUE) {
int err;
switch (isolate_migratepages(zone, cc)) {
@@ -1056,9 +1204,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
;
}
- if (!cc->nr_migratepages)
- continue;
-
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION);
@@ -1092,14 +1237,14 @@ out:
}
static unsigned long compact_zone_order(struct zone *zone, int order,
- gfp_t gfp_mask, enum migrate_mode mode, bool *contended)
+ gfp_t gfp_mask, enum migrate_mode mode, int *contended)
{
unsigned long ret;
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
.order = order,
- .migratetype = allocflags_to_migratetype(gfp_mask),
+ .gfp_mask = gfp_mask,
.zone = zone,
.mode = mode,
};
@@ -1124,48 +1269,117 @@ int sysctl_extfrag_threshold = 500;
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
* @mode: The migration mode for async, sync light, or sync migration
- * @contended: Return value that is true if compaction was aborted due to lock contention
- * @page: Optionally capture a free page of the requested order during compaction
+ * @contended: Return value that determines if compaction was aborted due to
+ * need_resched() or lock contention
+ * @candidate_zone: Return the zone where we think allocation should succeed
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
- enum migrate_mode mode, bool *contended)
+ enum migrate_mode mode, int *contended,
+ struct zone **candidate_zone)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
- int rc = COMPACT_SKIPPED;
+ int rc = COMPACT_DEFERRED;
int alloc_flags = 0;
+ int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
+
+ *contended = COMPACT_CONTENDED_NONE;
/* Check if the GFP flags allow compaction */
if (!order || !may_enter_fs || !may_perform_io)
- return rc;
-
- count_compact_event(COMPACTSTALL);
+ return COMPACT_SKIPPED;
#ifdef CONFIG_CMA
- if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
int status;
+ int zone_contended;
+
+ if (compaction_deferred(zone, order))
+ continue;
status = compact_zone_order(zone, order, gfp_mask, mode,
- contended);
+ &zone_contended);
rc = max(status, rc);
+ /*
+ * It takes at least one zone that wasn't lock contended
+ * to clear all_zones_contended.
+ */
+ all_zones_contended &= zone_contended;
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0,
- alloc_flags))
- break;
+ alloc_flags)) {
+ *candidate_zone = zone;
+ /*
+ * We think the allocation will succeed in this zone,
+ * but it is not certain, hence the false. The caller
+ * will repeat this with true if allocation indeed
+ * succeeds in this zone.
+ */
+ compaction_defer_reset(zone, order, false);
+ /*
+ * It is possible that async compaction aborted due to
+ * need_resched() and the watermarks were ok thanks to
+ * somebody else freeing memory. The allocation can
+ * however still fail so we better signal the
+ * need_resched() contention anyway (this will not
+ * prevent the allocation attempt).
+ */
+ if (zone_contended == COMPACT_CONTENDED_SCHED)
+ *contended = COMPACT_CONTENDED_SCHED;
+
+ goto break_loop;
+ }
+
+ if (mode != MIGRATE_ASYNC) {
+ /*
+ * We think that allocation won't succeed in this zone
+ * so we defer compaction there. If it ends up
+ * succeeding after all, it will be reset.
+ */
+ defer_compaction(zone, order);
+ }
+
+ /*
+ * We might have stopped compacting due to need_resched() in
+ * async compaction, or due to a fatal signal detected. In that
+ * case do not try further zones and signal need_resched()
+ * contention.
+ */
+ if ((zone_contended == COMPACT_CONTENDED_SCHED)
+ || fatal_signal_pending(current)) {
+ *contended = COMPACT_CONTENDED_SCHED;
+ goto break_loop;
+ }
+
+ continue;
+break_loop:
+ /*
+ * We might not have tried all the zones, so be conservative
+ * and assume they are not all lock contended.
+ */
+ all_zones_contended = 0;
+ break;
}
+ /*
+ * If at least one zone wasn't deferred or skipped, we report if all
+ * zones that were tried were lock contended.
+ */
+ if (rc > COMPACT_SKIPPED && all_zones_contended)
+ *contended = COMPACT_CONTENDED_LOCK;
+
return rc;
}
diff --git a/mm/debug.c b/mm/debug.c
new file mode 100644
index 000000000000..5ce45c9a29b5
--- /dev/null
+++ b/mm/debug.c
@@ -0,0 +1,237 @@
+/*
+ * mm/debug.c
+ *
+ * mm/ specific debug routines.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/ftrace_event.h>
+#include <linux/memcontrol.h>
+
+static const struct trace_print_flags pageflag_names[] = {
+ {1UL << PG_locked, "locked" },
+ {1UL << PG_error, "error" },
+ {1UL << PG_referenced, "referenced" },
+ {1UL << PG_uptodate, "uptodate" },
+ {1UL << PG_dirty, "dirty" },
+ {1UL << PG_lru, "lru" },
+ {1UL << PG_active, "active" },
+ {1UL << PG_slab, "slab" },
+ {1UL << PG_owner_priv_1, "owner_priv_1" },
+ {1UL << PG_arch_1, "arch_1" },
+ {1UL << PG_reserved, "reserved" },
+ {1UL << PG_private, "private" },
+ {1UL << PG_private_2, "private_2" },
+ {1UL << PG_writeback, "writeback" },
+#ifdef CONFIG_PAGEFLAGS_EXTENDED
+ {1UL << PG_head, "head" },
+ {1UL << PG_tail, "tail" },
+#else
+ {1UL << PG_compound, "compound" },
+#endif
+ {1UL << PG_swapcache, "swapcache" },
+ {1UL << PG_mappedtodisk, "mappedtodisk" },
+ {1UL << PG_reclaim, "reclaim" },
+ {1UL << PG_swapbacked, "swapbacked" },
+ {1UL << PG_unevictable, "unevictable" },
+#ifdef CONFIG_MMU
+ {1UL << PG_mlocked, "mlocked" },
+#endif
+#ifdef CONFIG_ARCH_USES_PG_UNCACHED
+ {1UL << PG_uncached, "uncached" },
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ {1UL << PG_hwpoison, "hwpoison" },
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ {1UL << PG_compound_lock, "compound_lock" },
+#endif
+};
+
+static void dump_flags(unsigned long flags,
+ const struct trace_print_flags *names, int count)
+{
+ const char *delim = "";
+ unsigned long mask;
+ int i;
+
+ pr_emerg("flags: %#lx(", flags);
+
+ /* remove zone id */
+ flags &= (1UL << NR_PAGEFLAGS) - 1;
+
+ for (i = 0; i < count && flags; i++) {
+
+ mask = names[i].mask;
+ if ((flags & mask) != mask)
+ continue;
+
+ flags &= ~mask;
+ pr_cont("%s%s", delim, names[i].name);
+ delim = "|";
+ }
+
+ /* check for left over flags */
+ if (flags)
+ pr_cont("%s%#lx", delim, flags);
+
+ pr_cont(")\n");
+}
+
+void dump_page_badflags(struct page *page, const char *reason,
+ unsigned long badflags)
+{
+ pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
+ page, atomic_read(&page->_count), page_mapcount(page),
+ page->mapping, page->index);
+ BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
+ dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names));
+ if (reason)
+ pr_alert("page dumped because: %s\n", reason);
+ if (page->flags & badflags) {
+ pr_alert("bad because of flags:\n");
+ dump_flags(page->flags & badflags,
+ pageflag_names, ARRAY_SIZE(pageflag_names));
+ }
+ mem_cgroup_print_bad_page(page);
+}
+
+void dump_page(struct page *page, const char *reason)
+{
+ dump_page_badflags(page, reason, 0);
+}
+EXPORT_SYMBOL(dump_page);
+
+#ifdef CONFIG_DEBUG_VM
+
+static const struct trace_print_flags vmaflags_names[] = {
+ {VM_READ, "read" },
+ {VM_WRITE, "write" },
+ {VM_EXEC, "exec" },
+ {VM_SHARED, "shared" },
+ {VM_MAYREAD, "mayread" },
+ {VM_MAYWRITE, "maywrite" },
+ {VM_MAYEXEC, "mayexec" },
+ {VM_MAYSHARE, "mayshare" },
+ {VM_GROWSDOWN, "growsdown" },
+ {VM_PFNMAP, "pfnmap" },
+ {VM_DENYWRITE, "denywrite" },
+ {VM_LOCKED, "locked" },
+ {VM_IO, "io" },
+ {VM_SEQ_READ, "seqread" },
+ {VM_RAND_READ, "randread" },
+ {VM_DONTCOPY, "dontcopy" },
+ {VM_DONTEXPAND, "dontexpand" },
+ {VM_ACCOUNT, "account" },
+ {VM_NORESERVE, "noreserve" },
+ {VM_HUGETLB, "hugetlb" },
+ {VM_NONLINEAR, "nonlinear" },
+#if defined(CONFIG_X86)
+ {VM_PAT, "pat" },
+#elif defined(CONFIG_PPC)
+ {VM_SAO, "sao" },
+#elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64)
+ {VM_GROWSUP, "growsup" },
+#elif !defined(CONFIG_MMU)
+ {VM_MAPPED_COPY, "mappedcopy" },
+#else
+ {VM_ARCH_1, "arch_1" },
+#endif
+ {VM_DONTDUMP, "dontdump" },
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ {VM_SOFTDIRTY, "softdirty" },
+#endif
+ {VM_MIXEDMAP, "mixedmap" },
+ {VM_HUGEPAGE, "hugepage" },
+ {VM_NOHUGEPAGE, "nohugepage" },
+ {VM_MERGEABLE, "mergeable" },
+};
+
+void dump_vma(const struct vm_area_struct *vma)
+{
+ pr_emerg("vma %p start %p end %p\n"
+ "next %p prev %p mm %p\n"
+ "prot %lx anon_vma %p vm_ops %p\n"
+ "pgoff %lx file %p private_data %p\n",
+ vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next,
+ vma->vm_prev, vma->vm_mm,
+ (unsigned long)pgprot_val(vma->vm_page_prot),
+ vma->anon_vma, vma->vm_ops, vma->vm_pgoff,
+ vma->vm_file, vma->vm_private_data);
+ dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names));
+}
+EXPORT_SYMBOL(dump_vma);
+
+void dump_mm(const struct mm_struct *mm)
+{
+ pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n"
+#ifdef CONFIG_MMU
+ "get_unmapped_area %p\n"
+#endif
+ "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
+ "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
+ "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
+ "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
+ "start_code %lx end_code %lx start_data %lx end_data %lx\n"
+ "start_brk %lx brk %lx start_stack %lx\n"
+ "arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
+ "binfmt %p flags %lx core_state %p\n"
+#ifdef CONFIG_AIO
+ "ioctx_table %p\n"
+#endif
+#ifdef CONFIG_MEMCG
+ "owner %p "
+#endif
+ "exe_file %p\n"
+#ifdef CONFIG_MMU_NOTIFIER
+ "mmu_notifier_mm %p\n"
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n"
+#endif
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+ "tlb_flush_pending %d\n"
+#endif
+ "%s", /* This is here to hold the comma */
+
+ mm, mm->mmap, mm->vmacache_seqnum, mm->task_size,
+#ifdef CONFIG_MMU
+ mm->get_unmapped_area,
+#endif
+ mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end,
+ mm->pgd, atomic_read(&mm->mm_users),
+ atomic_read(&mm->mm_count),
+ atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+ mm->map_count,
+ mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
+ mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
+ mm->start_code, mm->end_code, mm->start_data, mm->end_data,
+ mm->start_brk, mm->brk, mm->start_stack,
+ mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
+ mm->binfmt, mm->flags, mm->core_state,
+#ifdef CONFIG_AIO
+ mm->ioctx_table,
+#endif
+#ifdef CONFIG_MEMCG
+ mm->owner,
+#endif
+ mm->exe_file,
+#ifdef CONFIG_MMU_NOTIFIER
+ mm->mmu_notifier_mm,
+#endif
+#ifdef CONFIG_NUMA_BALANCING
+ mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq,
+#endif
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+ mm->tlb_flush_pending,
+#endif
+ "" /* This is here to not have a comma! */
+ );
+
+ dump_flags(mm->def_flags, vmaflags_names,
+ ARRAY_SIZE(vmaflags_names));
+}
+
+#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 306baa594f95..fd5fe4342e93 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -62,6 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */
};
static DEFINE_MUTEX(pools_lock);
+static DEFINE_MUTEX(pools_reg_lock);
static ssize_t
show_pools(struct device *dev, struct device_attribute *attr, char *buf)
@@ -132,29 +133,27 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
{
struct dma_pool *retval;
size_t allocation;
+ bool empty = false;
- if (align == 0) {
+ if (align == 0)
align = 1;
- } else if (align & (align - 1)) {
+ else if (align & (align - 1))
return NULL;
- }
- if (size == 0) {
+ if (size == 0)
return NULL;
- } else if (size < 4) {
+ else if (size < 4)
size = 4;
- }
if ((size % align) != 0)
size = ALIGN(size, align);
allocation = max_t(size_t, size, PAGE_SIZE);
- if (!boundary) {
+ if (!boundary)
boundary = allocation;
- } else if ((boundary < size) || (boundary & (boundary - 1))) {
+ else if ((boundary < size) || (boundary & (boundary - 1)))
return NULL;
- }
retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
if (!retval)
@@ -172,15 +171,34 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
INIT_LIST_HEAD(&retval->pools);
+ /*
+ * pools_lock ensures that the ->dma_pools list does not get corrupted.
+ * pools_reg_lock ensures that there is not a race between
+ * dma_pool_create() and dma_pool_destroy() or within dma_pool_create()
+ * when the first invocation of dma_pool_create() failed on
+ * device_create_file() and the second assumes that it has been done (I
+ * know it is a short window).
+ */
+ mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
- if (list_empty(&dev->dma_pools) &&
- device_create_file(dev, &dev_attr_pools)) {
- kfree(retval);
- return NULL;
- } else
- list_add(&retval->pools, &dev->dma_pools);
+ if (list_empty(&dev->dma_pools))
+ empty = true;
+ list_add(&retval->pools, &dev->dma_pools);
mutex_unlock(&pools_lock);
-
+ if (empty) {
+ int err;
+
+ err = device_create_file(dev, &dev_attr_pools);
+ if (err) {
+ mutex_lock(&pools_lock);
+ list_del(&retval->pools);
+ mutex_unlock(&pools_lock);
+ mutex_unlock(&pools_reg_lock);
+ kfree(retval);
+ return NULL;
+ }
+ }
+ mutex_unlock(&pools_reg_lock);
return retval;
}
EXPORT_SYMBOL(dma_pool_create);
@@ -251,11 +269,17 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
*/
void dma_pool_destroy(struct dma_pool *pool)
{
+ bool empty = false;
+
+ mutex_lock(&pools_reg_lock);
mutex_lock(&pools_lock);
list_del(&pool->pools);
if (pool->dev && list_empty(&pool->dev->dma_pools))
- device_remove_file(pool->dev, &dev_attr_pools);
+ empty = true;
mutex_unlock(&pools_lock);
+ if (empty)
+ device_remove_file(pool->dev, &dev_attr_pools);
+ mutex_unlock(&pools_reg_lock);
while (!list_empty(&pool->page_list)) {
struct dma_page *page;
diff --git a/mm/filemap.c b/mm/filemap.c
index 90effcdf948d..14b4642279f1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,17 +670,13 @@ EXPORT_SYMBOL(__page_cache_alloc);
* at a cost of "thundering herd" phenomena during rare hash
* collisions.
*/
-static wait_queue_head_t *page_waitqueue(struct page *page)
+wait_queue_head_t *page_waitqueue(struct page *page)
{
const struct zone *zone = page_zone(page);
return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
}
-
-static inline void wake_up_page(struct page *page, int bit)
-{
- __wake_up_bit(page_waitqueue(page), &page->flags, bit);
-}
+EXPORT_SYMBOL(page_waitqueue);
void wait_on_page_bit(struct page *page, int bit_nr)
{
@@ -703,6 +699,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
bit_wait_io, TASK_KILLABLE);
}
+int wait_on_page_bit_killable_timeout(struct page *page,
+ int bit_nr, unsigned long timeout)
+{
+ DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
+
+ wait.key.timeout = jiffies + timeout;
+ if (!test_bit(bit_nr, &page->flags))
+ return 0;
+ return __wait_on_bit(page_waitqueue(page), &wait,
+ bit_wait_io_timeout, TASK_KILLABLE);
+}
+EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout);
+
/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
* @page: Page defining the wait queue of interest
@@ -727,7 +736,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
*
* Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
* Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechananism between PageLocked pages and PageWriteback pages is shared.
+ * mechanism between PageLocked pages and PageWriteback pages is shared.
* But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
*
* The mb is necessary to enforce ordering between the clear_bit and the read
@@ -1744,7 +1753,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
static int page_cache_read(struct file *file, pgoff_t offset)
{
struct address_space *mapping = file->f_mapping;
- struct page *page;
+ struct page *page;
int ret;
do {
@@ -1761,7 +1770,7 @@ static int page_cache_read(struct file *file, pgoff_t offset)
page_cache_release(page);
} while (ret == AOP_TRUNCATED_PAGE);
-
+
return ret;
}
diff --git a/mm/gup.c b/mm/gup.c
index 91d044b1600d..cd62c8c90d4a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,10 @@
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <asm/pgtable.h>
+
#include "internal.h"
static struct page *no_page_table(struct vm_area_struct *vma,
@@ -281,6 +285,10 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
if (*flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
+ if (*flags & FOLL_TRIED) {
+ VM_WARN_ON_ONCE(fault_flags & FAULT_FLAG_ALLOW_RETRY);
+ fault_flags |= FAULT_FLAG_TRIED;
+ }
ret = handle_mm_fault(mm, vma, address, fault_flags);
if (ret & VM_FAULT_ERROR) {
@@ -672,3 +680,353 @@ struct page *get_dump_page(unsigned long addr)
return page;
}
#endif /* CONFIG_ELF_CORE */
+
+/*
+ * Generic RCU Fast GUP
+ *
+ * get_user_pages_fast attempts to pin user pages by walking the page
+ * tables directly and avoids taking locks. Thus the walker needs to be
+ * protected from page table pages being freed from under it, and should
+ * block any THP splits.
+ *
+ * One way to achieve this is to have the walker disable interrupts, and
+ * rely on IPIs from the TLB flushing code blocking before the page table
+ * pages are freed. This is unsuitable for architectures that do not need
+ * to broadcast an IPI when invalidating TLBs.
+ *
+ * Another way to achieve this is to batch up page table containing pages
+ * belonging to more than one mm_user, then rcu_sched a callback to free those
+ * pages. Disabling interrupts will allow the fast_gup walker to both block
+ * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
+ * (which is a relatively rare event). The code below adopts this strategy.
+ *
+ * Before activating this code, please be aware that the following assumptions
+ * are currently made:
+ *
+ * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free
+ * pages containing page tables.
+ *
+ * *) THP splits will broadcast an IPI, this can be achieved by overriding
+ * pmdp_splitting_flush.
+ *
+ * *) ptes can be read atomically by the architecture.
+ *
+ * *) access_ok is sufficient to validate userspace address ranges.
+ *
+ * The last two assumptions can be relaxed by the addition of helper functions.
+ *
+ * This code is based heavily on the PowerPC implementation by Nick Piggin.
+ */
+#ifdef CONFIG_HAVE_GENERIC_RCU_GUP
+
+#ifdef __HAVE_ARCH_PTE_SPECIAL
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ pte_t *ptep, *ptem;
+ int ret = 0;
+
+ ptem = ptep = pte_offset_map(&pmd, addr);
+ do {
+ /*
+ * In the line below we are assuming that the pte can be read
+ * atomically. If this is not the case for your architecture,
+ * please wrap this in a helper function!
+ *
+ * for an example see gup_get_pte in arch/x86/mm/gup.c
+ */
+ pte_t pte = ACCESS_ONCE(*ptep);
+ struct page *page;
+
+ /*
+ * Similar to the PMD case below, NUMA hinting must take slow
+ * path
+ */
+ if (!pte_present(pte) || pte_special(pte) ||
+ pte_numa(pte) || (write && !pte_write(pte)))
+ goto pte_unmap;
+
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+ page = pte_page(pte);
+
+ if (!page_cache_get_speculative(page))
+ goto pte_unmap;
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ put_page(page);
+ goto pte_unmap;
+ }
+
+ pages[*nr] = page;
+ (*nr)++;
+
+ } while (ptep++, addr += PAGE_SIZE, addr != end);
+
+ ret = 1;
+
+pte_unmap:
+ pte_unmap(ptem);
+ return ret;
+}
+#else
+
+/*
+ * If we can't determine whether or not a pte is special, then fail immediately
+ * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
+ * to be special.
+ *
+ * For a futex to be placed on a THP tail page, get_futex_key requires a
+ * __get_user_pages_fast implementation that can pin pages. Thus it's still
+ * useful to have gup_huge_pmd even if we can't operate on ptes.
+ */
+static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ return 0;
+}
+#endif /* __HAVE_ARCH_PTE_SPECIAL */
+
+static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ struct page *head, *page, *tail;
+ int refs;
+
+ if (write && !pmd_write(orig))
+ return 0;
+
+ refs = 0;
+ head = pmd_page(orig);
+ page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ tail = page;
+ do {
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ /*
+ * Any tail pages need their mapcount reference taken before we
+ * return. (This allows the THP code to bump their ref count when
+ * they are split into base pages).
+ */
+ while (refs--) {
+ if (PageTail(tail))
+ get_huge_page_tail(tail);
+ tail++;
+ }
+
+ return 1;
+}
+
+static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ struct page *head, *page, *tail;
+ int refs;
+
+ if (write && !pud_write(orig))
+ return 0;
+
+ refs = 0;
+ head = pud_page(orig);
+ page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ tail = page;
+ do {
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pud_val(orig) != pud_val(*pudp))) {
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ while (refs--) {
+ if (PageTail(tail))
+ get_huge_page_tail(tail);
+ tail++;
+ }
+
+ return 1;
+}
+
+static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pmd_t *pmdp;
+
+ pmdp = pmd_offset(&pud, addr);
+ do {
+ pmd_t pmd = ACCESS_ONCE(*pmdp);
+
+ next = pmd_addr_end(addr, end);
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+ return 0;
+
+ if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+ /*
+ * NUMA hinting faults need to be handled in the GUP
+ * slowpath for accounting purposes and so that they
+ * can be serialised against THP migration.
+ */
+ if (pmd_numa(pmd))
+ return 0;
+
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+ pages, nr))
+ return 0;
+
+ } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ return 0;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 1;
+}
+
+static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end,
+ int write, struct page **pages, int *nr)
+{
+ unsigned long next;
+ pud_t *pudp;
+
+ pudp = pud_offset(pgdp, addr);
+ do {
+ pud_t pud = ACCESS_ONCE(*pudp);
+
+ next = pud_addr_end(addr, end);
+ if (pud_none(pud))
+ return 0;
+ if (pud_huge(pud)) {
+ if (!gup_huge_pud(pud, pudp, addr, next, write,
+ pages, nr))
+ return 0;
+ } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ return 0;
+ } while (pudp++, addr = next, addr != end);
+
+ return 1;
+}
+
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP. It will only return non-negative values.
+ */
+int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr, len, end;
+ unsigned long next, flags;
+ pgd_t *pgdp;
+ int nr = 0;
+
+ start &= PAGE_MASK;
+ addr = start;
+ len = (unsigned long) nr_pages << PAGE_SHIFT;
+ end = start + len;
+
+ if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
+ start, len)))
+ return 0;
+
+ /*
+ * Disable interrupts. We use the nested form as we can already have
+ * interrupts disabled by get_futex_key.
+ *
+ * With interrupts disabled, we block page table pages from being
+ * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
+ * for more details.
+ *
+ * We do not adopt an rcu_read_lock(.) here as we also want to
+ * block IPIs that come from THPs splitting.
+ */
+
+ local_irq_save(flags);
+ pgdp = pgd_offset(mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none(*pgdp))
+ break;
+ else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr))
+ break;
+ } while (pgdp++, addr = next, addr != end);
+ local_irq_restore(flags);
+
+ return nr;
+}
+
+/**
+ * get_user_pages_fast() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @write: whether pages will be written to
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+int get_user_pages_fast(unsigned long start, int nr_pages, int write,
+ struct page **pages)
+{
+ struct mm_struct *mm = current->mm;
+ int nr, ret;
+
+ start &= PAGE_MASK;
+ nr = __get_user_pages_fast(start, nr_pages, write, pages);
+ ret = nr;
+
+ if (nr < nr_pages) {
+ /* Try to get the remaining pages with get_user_pages */
+ start += nr << PAGE_SHIFT;
+ pages += nr;
+
+ down_read(&mm->mmap_sem);
+ ret = get_user_pages(current, mm, start,
+ nr_pages - nr, write, 0, pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ /* Have to be a bit careful with return values */
+ if (nr > 0) {
+ if (ret < 0)
+ ret = nr;
+ else
+ ret += nr;
+ }
+ }
+
+ return ret;
+}
+
+#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d9a21d06b862..74c78aa8bc2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1096,7 +1096,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long mmun_end; /* For mmu_notifiers */
ptl = pmd_lockptr(mm, pmd);
- VM_BUG_ON(!vma->anon_vma);
+ VM_BUG_ON_VMA(!vma->anon_vma, vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
@@ -1795,14 +1795,17 @@ static int __split_huge_page_map(struct page *page,
for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
+ /*
+ * Note that pmd_numa is not transferred deliberately
+ * to avoid any possibility that pte_numa leaks to
+ * a PROT_NONE VMA by accident.
+ */
entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (!pmd_write(*pmd))
entry = pte_wrprotect(entry);
if (!pmd_young(*pmd))
entry = pte_mkold(entry);
- if (pmd_numa(*pmd))
- entry = pte_mknuma(entry);
pte = pte_offset_map(&_pmd, haddr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, haddr, pte, entry);
@@ -2045,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm)
return -ENOMEM;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON(khugepaged_test_exit(mm));
+ VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
@@ -2080,7 +2083,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
if (vma->vm_ops)
/* khugepaged not yet working on file or special mappings */
return 0;
- VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
hend = vma->vm_end & HPAGE_PMD_MASK;
if (hstart < hend)
@@ -2319,23 +2322,17 @@ static struct page
int node)
{
VM_BUG_ON_PAGE(*hpage, *hpage);
+
/*
- * Allocate the page while the vma is still valid and under
- * the mmap_sem read mode so there is no memory allocation
- * later when we take the mmap_sem in write mode. This is more
- * friendly behavior (OTOH it may actually hide bugs) to
- * filesystems in userland with daemons allocating memory in
- * the userland I/O paths. Allocating memory with the
- * mmap_sem in read mode is good idea also to allow greater
- * scalability.
+ * Before allocating the hugepage, release the mmap_sem read lock.
+ * The allocation can take potentially a long time if it involves
+ * sync compaction, and we do not need to hold the mmap_sem during
+ * that. We will recheck the vma after taking it again in write mode.
*/
+ up_read(&mm->mmap_sem);
+
*hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
- /*
- * After allocating the hugepage, release the mmap_sem read lock in
- * preparation for taking it in write mode.
- */
- up_read(&mm->mmap_sem);
if (unlikely(!*hpage)) {
count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
*hpage = ERR_PTR(-ENOMEM);
@@ -2409,7 +2406,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma)
return false;
if (is_vma_temporary_stack(vma))
return false;
- VM_BUG_ON(vma->vm_flags & VM_NO_THP);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma);
return true;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eeceeeb09019..9fd722769927 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -434,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode)
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (vma->vm_flags & VM_MAYSHARE) {
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host;
@@ -449,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
- VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
set_vma_private_data(vma, (get_vma_private_data(vma) &
HPAGE_RESV_MASK) | (unsigned long)map);
@@ -458,15 +458,15 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
- VM_BUG_ON(vma->vm_flags & VM_MAYSHARE);
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+ VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
return (get_vma_private_data(vma) & flag) != 0;
}
@@ -474,7 +474,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
- VM_BUG_ON(!is_vm_hugetlb_page(vma));
+ VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
if (!(vma->vm_flags & VM_MAYSHARE))
vma->vm_private_data = (void *)0;
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 9eebfadeeee1..a67c26e0f360 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -217,7 +217,7 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
if (hugetlb_cgroup_disabled())
return;
- VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
+ lockdep_assert_held(&hugetlb_lock);
h_cg = hugetlb_cgroup_from_page(page);
if (unlikely(!h_cg))
return;
diff --git a/mm/internal.h b/mm/internal.h
index a1b651b11c5f..829304090b90 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -142,10 +142,10 @@ struct compact_control {
bool finished_update_migrate;
int order; /* order a direct compactor needs */
- int migratetype; /* MOVABLE, RECLAIMABLE etc */
+ const gfp_t gfp_mask; /* gfp mask of a direct compactor */
struct zone *zone;
- bool contended; /* True if a lock was contended, or
- * need_resched() true during async
+ int contended; /* Signal need_sched() or lock
+ * contention detected during
* compaction
*/
};
@@ -154,8 +154,8 @@ unsigned long
isolate_freepages_range(struct compact_control *cc,
unsigned long start_pfn, unsigned long end_pfn);
unsigned long
-isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
- unsigned long low_pfn, unsigned long end_pfn, bool unevictable);
+isolate_migratepages_range(struct compact_control *cc,
+ unsigned long low_pfn, unsigned long end_pfn);
#endif
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
* general, page_zone(page)->lock must be held by the caller to prevent the
* page from being allocated in parallel and returning garbage as the order.
* If a caller does not hold page_zone(page)->lock, it must guarantee that the
- * page cannot be allocated or merged in parallel.
+ * page cannot be allocated or merged in parallel. Alternatively, it must
+ * handle invalid values gracefully, and use page_order_unsafe() below.
*/
static inline unsigned long page_order(struct page *page)
{
@@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page)
return page_private(page);
}
+/*
+ * Like page_order(), but for callers who cannot afford to hold the zone lock.
+ * PageBuddy() should be checked first by the caller to minimize race window,
+ * and invalid values must be handled gracefully.
+ *
+ * ACCESS_ONCE is used so that if the caller assigns the result into a local
+ * variable and e.g. tests it for valid range before using, the compiler cannot
+ * decide to remove the variable and inline the page_private(page) multiple
+ * times, potentially observing different values in the tests and the actual
+ * use of the result.
+ */
+#define page_order_unsafe(page) ACCESS_ONCE(page_private(page))
+
static inline bool is_cow_mapping(vm_flags_t flags)
{
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 4a5822a586e6..8da581fa9060 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
struct vm_area_struct *parent;
unsigned long last = vma_last_pgoff(node);
- VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
+ VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
if (!prev->shared.linear.rb.rb_right) {
parent = prev;
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index ab88dc0ea1d3..eafcf60f6b83 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -4,6 +4,96 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
+static size_t copy_to_iter_iovec(void *from, size_t bytes, struct iov_iter *i)
+{
+ size_t skip, copy, left, wanted;
+ const struct iovec *iov;
+ char __user *buf;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ iov = i->iov;
+ skip = i->iov_offset;
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+ left = __copy_to_user(buf, from, copy);
+ copy -= left;
+ skip += copy;
+ from += copy;
+ bytes -= copy;
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_to_user(buf, from, copy);
+ copy -= left;
+ skip = copy;
+ from += copy;
+ bytes -= copy;
+ }
+
+ if (skip == iov->iov_len) {
+ iov++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes;
+ i->nr_segs -= iov - i->iov;
+ i->iov = iov;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
+static size_t copy_from_iter_iovec(void *to, size_t bytes, struct iov_iter *i)
+{
+ size_t skip, copy, left, wanted;
+ const struct iovec *iov;
+ char __user *buf;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ iov = i->iov;
+ skip = i->iov_offset;
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+ left = __copy_from_user(to, buf, copy);
+ copy -= left;
+ skip += copy;
+ to += copy;
+ bytes -= copy;
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __copy_from_user(to, buf, copy);
+ copy -= left;
+ skip = copy;
+ to += copy;
+ bytes -= copy;
+ }
+
+ if (skip == iov->iov_len) {
+ iov++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes;
+ i->nr_segs -= iov - i->iov;
+ i->iov = iov;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
struct iov_iter *i)
{
@@ -166,6 +256,50 @@ done:
return wanted - bytes;
}
+static size_t zero_iovec(size_t bytes, struct iov_iter *i)
+{
+ size_t skip, copy, left, wanted;
+ const struct iovec *iov;
+ char __user *buf;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ iov = i->iov;
+ skip = i->iov_offset;
+ buf = iov->iov_base + skip;
+ copy = min(bytes, iov->iov_len - skip);
+
+ left = __clear_user(buf, copy);
+ copy -= left;
+ skip += copy;
+ bytes -= copy;
+
+ while (unlikely(!left && bytes)) {
+ iov++;
+ buf = iov->iov_base;
+ copy = min(bytes, iov->iov_len);
+ left = __clear_user(buf, copy);
+ copy -= left;
+ skip = copy;
+ bytes -= copy;
+ }
+
+ if (skip == iov->iov_len) {
+ iov++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes;
+ i->nr_segs -= iov - i->iov;
+ i->iov = iov;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
static size_t __iovec_copy_from_user_inatomic(char *vaddr,
const struct iovec *iov, size_t base, size_t bytes)
{
@@ -310,7 +444,7 @@ void iov_iter_init(struct iov_iter *i, int direction,
EXPORT_SYMBOL(iov_iter_init);
static ssize_t get_pages_iovec(struct iov_iter *i,
- struct page **pages, unsigned maxpages,
+ struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
{
size_t offset = i->iov_offset;
@@ -323,6 +457,8 @@ static ssize_t get_pages_iovec(struct iov_iter *i,
len = iov->iov_len - offset;
if (len > i->count)
len = i->count;
+ if (len > maxsize)
+ len = maxsize;
addr = (unsigned long)iov->iov_base + offset;
len += *start = addr & (PAGE_SIZE - 1);
if (len > maxpages * PAGE_SIZE)
@@ -412,12 +548,17 @@ static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t
kunmap_atomic(to);
}
-static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
+static void memzero_page(struct page *page, size_t offset, size_t len)
+{
+ char *addr = kmap_atomic(page);
+ memset(addr + offset, 0, len);
+ kunmap_atomic(addr);
+}
+
+static size_t copy_to_iter_bvec(void *from, size_t bytes, struct iov_iter *i)
{
size_t skip, copy, wanted;
const struct bio_vec *bvec;
- void *kaddr, *from;
if (unlikely(bytes > i->count))
bytes = i->count;
@@ -430,8 +571,6 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by
skip = i->iov_offset;
copy = min_t(size_t, bytes, bvec->bv_len - skip);
- kaddr = kmap_atomic(page);
- from = kaddr + offset;
memcpy_to_page(bvec->bv_page, skip + bvec->bv_offset, from, copy);
skip += copy;
from += copy;
@@ -444,7 +583,6 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by
from += copy;
bytes -= copy;
}
- kunmap_atomic(kaddr);
if (skip == bvec->bv_len) {
bvec++;
skip = 0;
@@ -456,12 +594,10 @@ static size_t copy_page_to_iter_bvec(struct page *page, size_t offset, size_t by
return wanted - bytes;
}
-static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t bytes,
- struct iov_iter *i)
+static size_t copy_from_iter_bvec(void *to, size_t bytes, struct iov_iter *i)
{
size_t skip, copy, wanted;
const struct bio_vec *bvec;
- void *kaddr, *to;
if (unlikely(bytes > i->count))
bytes = i->count;
@@ -473,10 +609,6 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t
bvec = i->bvec;
skip = i->iov_offset;
- kaddr = kmap_atomic(page);
-
- to = kaddr + offset;
-
copy = min(bytes, bvec->bv_len - skip);
memcpy_from_page(to, bvec->bv_page, bvec->bv_offset + skip, copy);
@@ -493,7 +625,6 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t
to += copy;
bytes -= copy;
}
- kunmap_atomic(kaddr);
if (skip == bvec->bv_len) {
bvec++;
skip = 0;
@@ -505,6 +636,61 @@ static size_t copy_page_from_iter_bvec(struct page *page, size_t offset, size_t
return wanted;
}
+static size_t copy_page_to_iter_bvec(struct page *page, size_t offset,
+ size_t bytes, struct iov_iter *i)
+{
+ void *kaddr = kmap_atomic(page);
+ size_t wanted = copy_to_iter_bvec(kaddr + offset, bytes, i);
+ kunmap_atomic(kaddr);
+ return wanted;
+}
+
+static size_t copy_page_from_iter_bvec(struct page *page, size_t offset,
+ size_t bytes, struct iov_iter *i)
+{
+ void *kaddr = kmap_atomic(page);
+ size_t wanted = copy_from_iter_bvec(kaddr + offset, bytes, i);
+ kunmap_atomic(kaddr);
+ return wanted;
+}
+
+static size_t zero_bvec(size_t bytes, struct iov_iter *i)
+{
+ size_t skip, copy, wanted;
+ const struct bio_vec *bvec;
+
+ if (unlikely(bytes > i->count))
+ bytes = i->count;
+
+ if (unlikely(!bytes))
+ return 0;
+
+ wanted = bytes;
+ bvec = i->bvec;
+ skip = i->iov_offset;
+ copy = min_t(size_t, bytes, bvec->bv_len - skip);
+
+ memzero_page(bvec->bv_page, skip + bvec->bv_offset, copy);
+ skip += copy;
+ bytes -= copy;
+ while (bytes) {
+ bvec++;
+ copy = min(bytes, (size_t)bvec->bv_len);
+ memzero_page(bvec->bv_page, bvec->bv_offset, copy);
+ skip = copy;
+ bytes -= copy;
+ }
+ if (skip == bvec->bv_len) {
+ bvec++;
+ skip = 0;
+ }
+ i->count -= wanted - bytes;
+ i->nr_segs -= bvec - i->bvec;
+ i->bvec = bvec;
+ i->iov_offset = skip;
+ return wanted - bytes;
+}
+
static size_t copy_from_user_bvec(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
{
@@ -588,13 +774,15 @@ static unsigned long alignment_bvec(const struct iov_iter *i)
}
static ssize_t get_pages_bvec(struct iov_iter *i,
- struct page **pages, unsigned maxpages,
+ struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
{
const struct bio_vec *bvec = i->bvec;
size_t len = bvec->bv_len - i->iov_offset;
if (len > i->count)
len = i->count;
+ if (len > maxsize)
+ len = maxsize;
/* can't be more than PAGE_SIZE */
*start = bvec->bv_offset + i->iov_offset;
@@ -668,6 +856,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
}
EXPORT_SYMBOL(copy_page_from_iter);
+size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i)
+{
+ if (i->type & ITER_BVEC)
+ return copy_to_iter_bvec(addr, bytes, i);
+ else
+ return copy_to_iter_iovec(addr, bytes, i);
+}
+EXPORT_SYMBOL(copy_to_iter);
+
+size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
+{
+ if (i->type & ITER_BVEC)
+ return copy_from_iter_bvec(addr, bytes, i);
+ else
+ return copy_from_iter_iovec(addr, bytes, i);
+}
+EXPORT_SYMBOL(copy_from_iter);
+
+size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
+{
+ if (i->type & ITER_BVEC) {
+ return zero_bvec(bytes, i);
+ } else {
+ return zero_iovec(bytes, i);
+ }
+}
+EXPORT_SYMBOL(iov_iter_zero);
+
size_t iov_iter_copy_from_user_atomic(struct page *page,
struct iov_iter *i, unsigned long offset, size_t bytes)
{
@@ -711,13 +927,13 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
EXPORT_SYMBOL(iov_iter_alignment);
ssize_t iov_iter_get_pages(struct iov_iter *i,
- struct page **pages, unsigned maxpages,
+ struct page **pages, size_t maxsize, unsigned maxpages,
size_t *start)
{
if (i->type & ITER_BVEC)
- return get_pages_bvec(i, pages, maxpages, start);
+ return get_pages_bvec(i, pages, maxsize, maxpages, start);
else
- return get_pages_iovec(i, pages, maxpages, start);
+ return get_pages_iovec(i, pages, maxsize, maxpages, start);
}
EXPORT_SYMBOL(iov_iter_get_pages);
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index fd814fd61319..cab58bb592d8 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -2,6 +2,7 @@
#include <linux/mm_types.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include "slab.h"
#include <linux/kmemcheck.h>
void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
diff --git a/mm/ksm.c b/mm/ksm.c
index fb7590222706..6b2e337bc03c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2310,7 +2310,7 @@ static int __init ksm_init(void)
ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
if (IS_ERR(ksm_thread)) {
- printk(KERN_ERR "ksm: creating kthread failed\n");
+ pr_err("ksm: creating kthread failed\n");
err = PTR_ERR(ksm_thread);
goto out_free;
}
@@ -2318,7 +2318,7 @@ static int __init ksm_init(void)
#ifdef CONFIG_SYSFS
err = sysfs_create_group(mm_kobj, &ksm_attr_group);
if (err) {
- printk(KERN_ERR "ksm: register sysfs failed\n");
+ pr_err("ksm: register sysfs failed\n");
kthread_stop(ksm_thread);
goto out_free;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 6d2f219a48b0..6ecb0d937fb5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -192,8 +192,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid)
{
- int ret;
- phys_addr_t kernel_end;
+ phys_addr_t kernel_end, ret;
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -817,6 +816,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid,
if (nid != NUMA_NO_NODE && nid != m_nid)
continue;
+ /* skip hotpluggable memory regions if needed */
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+ continue;
+
if (!type_b) {
if (out_start)
*out_start = m_start;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ec4dcf1b9562..23976fd885fd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -292,6 +292,9 @@ struct mem_cgroup {
/* vmpressure notifications */
struct vmpressure vmpressure;
+ /* css_online() has been completed */
+ int initialized;
+
/*
* the counter to account for mem+swap usage.
*/
@@ -315,9 +318,6 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
- /* set when res.limit == memsw.limit */
- bool memsw_is_minimum;
-
/* protect arrays of thresholds */
struct mutex thresholds_lock;
@@ -481,14 +481,6 @@ enum res_type {
#define OOM_CONTROL (0)
/*
- * Reclaim flags for mem_cgroup_hierarchical_reclaim
- */
-#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
-#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
-#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
-#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
-
-/*
* The memcg_create_mutex will be held whenever a new cgroup is created.
* As a consequence, any change that needs to protect against new child cgroups
* appearing has to hold it as well.
@@ -646,11 +638,13 @@ int memcg_limited_groups_array_size;
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
+static void memcg_free_cache_id(int id);
+
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
if (memcg_kmem_is_active(memcg)) {
static_key_slow_dec(&memcg_kmem_enabled_key);
- ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
+ memcg_free_cache_id(memcg->kmemcg_id);
}
/*
* This check can't live in kmem destruction function,
@@ -1099,10 +1093,21 @@ skip_node:
* skipping css reference should be safe.
*/
if (next_css) {
- if ((next_css == &root->css) ||
- ((next_css->flags & CSS_ONLINE) &&
- css_tryget_online(next_css)))
- return mem_cgroup_from_css(next_css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(next_css);
+
+ if (next_css == &root->css)
+ return memcg;
+
+ if (css_tryget_online(next_css)) {
+ /*
+ * Make sure the memcg is initialized:
+ * mem_cgroup_css_online() orders the the
+ * initialization against setting the flag.
+ */
+ if (smp_load_acquire(&memcg->initialized))
+ return memcg;
+ css_put(next_css);
+ }
prev_css = next_css;
goto skip_node;
@@ -1792,42 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
NULL, "Memory cgroup out of memory");
}
-static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
- gfp_t gfp_mask,
- unsigned long flags)
-{
- unsigned long total = 0;
- bool noswap = false;
- int loop;
-
- if (flags & MEM_CGROUP_RECLAIM_NOSWAP)
- noswap = true;
- if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum)
- noswap = true;
-
- for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
- if (loop)
- drain_all_stock_async(memcg);
- total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
- /*
- * Allow limit shrinkers, which are triggered directly
- * by userspace, to catch signals and stop reclaim
- * after minimal progress, regardless of the margin.
- */
- if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK))
- break;
- if (mem_cgroup_margin(memcg))
- break;
- /*
- * If nothing was reclaimed after two attempts, there
- * may be no reclaimable pages in this hierarchy.
- */
- if (loop && !total)
- break;
- }
- return total;
-}
-
/**
* test_mem_cgroup_node_reclaimable
* @memcg: the target memcg
@@ -2530,25 +2499,29 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct mem_cgroup *mem_over_limit;
struct res_counter *fail_res;
unsigned long nr_reclaimed;
- unsigned long flags = 0;
unsigned long long size;
+ bool may_swap = true;
+ bool drained = false;
int ret = 0;
+ if (mem_cgroup_is_root(memcg))
+ goto done;
retry:
if (consume_stock(memcg, nr_pages))
goto done;
size = batch * PAGE_SIZE;
- if (!res_counter_charge(&memcg->res, size, &fail_res)) {
- if (!do_swap_account)
- goto done_restock;
- if (!res_counter_charge(&memcg->memsw, size, &fail_res))
+ if (!do_swap_account ||
+ !res_counter_charge(&memcg->memsw, size, &fail_res)) {
+ if (!res_counter_charge(&memcg->res, size, &fail_res))
goto done_restock;
- res_counter_uncharge(&memcg->res, size);
- mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
- } else
+ if (do_swap_account)
+ res_counter_uncharge(&memcg->memsw, size);
mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+ } else {
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+ may_swap = false;
+ }
if (batch > nr_pages) {
batch = nr_pages;
@@ -2572,11 +2545,18 @@ retry:
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
- nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
+ nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
+ gfp_mask, may_swap);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
+ if (!drained) {
+ drain_all_stock_async(mem_over_limit);
+ drained = true;
+ goto retry;
+ }
+
if (gfp_mask & __GFP_NORETRY)
goto nomem;
/*
@@ -2611,9 +2591,7 @@ nomem:
if (!(gfp_mask & __GFP_NOFAIL))
return -ENOMEM;
bypass:
- memcg = root_mem_cgroup;
- ret = -EINTR;
- goto retry;
+ return -EINTR;
done_restock:
if (batch > nr_pages)
@@ -2626,6 +2604,9 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long bytes = nr_pages * PAGE_SIZE;
+ if (mem_cgroup_is_root(memcg))
+ return;
+
res_counter_uncharge(&memcg->res, bytes);
if (do_swap_account)
res_counter_uncharge(&memcg->memsw, bytes);
@@ -2640,6 +2621,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
{
unsigned long bytes = nr_pages * PAGE_SIZE;
+ if (mem_cgroup_is_root(memcg))
+ return;
+
res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
if (do_swap_account)
res_counter_uncharge_until(&memcg->memsw,
@@ -2778,12 +2762,6 @@ static DEFINE_MUTEX(memcg_slab_mutex);
static DEFINE_MUTEX(activate_kmem_mutex);
-static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
-{
- return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
- memcg_kmem_is_active(memcg);
-}
-
/*
* This is a bit cumbersome, but it is rarely used and avoids a backpointer
* in the memcg_cache_params struct.
@@ -2803,7 +2781,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
struct memcg_cache_params *params;
- if (!memcg_can_account_kmem(memcg))
+ if (!memcg_kmem_is_active(memcg))
return -EIO;
print_slabinfo_header(m);
@@ -2886,19 +2864,44 @@ int memcg_cache_id(struct mem_cgroup *memcg)
return memcg ? memcg->kmemcg_id : -1;
}
-static size_t memcg_caches_array_size(int num_groups)
+static int memcg_alloc_cache_id(void)
{
- ssize_t size;
- if (num_groups <= 0)
- return 0;
+ int id, size;
+ int err;
+
+ id = ida_simple_get(&kmem_limited_groups,
+ 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ if (id < 0)
+ return id;
- size = 2 * num_groups;
+ if (id < memcg_limited_groups_array_size)
+ return id;
+
+ /*
+ * There's no space for the new id in memcg_caches arrays,
+ * so we have to grow them.
+ */
+
+ size = 2 * (id + 1);
if (size < MEMCG_CACHES_MIN_SIZE)
size = MEMCG_CACHES_MIN_SIZE;
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- return size;
+ mutex_lock(&memcg_slab_mutex);
+ err = memcg_update_all_caches(size);
+ mutex_unlock(&memcg_slab_mutex);
+
+ if (err) {
+ ida_simple_remove(&kmem_limited_groups, id);
+ return err;
+ }
+ return id;
+}
+
+static void memcg_free_cache_id(int id)
+{
+ ida_simple_remove(&kmem_limited_groups, id);
}
/*
@@ -2908,97 +2911,7 @@ static size_t memcg_caches_array_size(int num_groups)
*/
void memcg_update_array_size(int num)
{
- if (num > memcg_limited_groups_array_size)
- memcg_limited_groups_array_size = memcg_caches_array_size(num);
-}
-
-int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
-{
- struct memcg_cache_params *cur_params = s->memcg_params;
-
- VM_BUG_ON(!is_root_cache(s));
-
- if (num_groups > memcg_limited_groups_array_size) {
- int i;
- struct memcg_cache_params *new_params;
- ssize_t size = memcg_caches_array_size(num_groups);
-
- size *= sizeof(void *);
- size += offsetof(struct memcg_cache_params, memcg_caches);
-
- new_params = kzalloc(size, GFP_KERNEL);
- if (!new_params)
- return -ENOMEM;
-
- new_params->is_root_cache = true;
-
- /*
- * There is the chance it will be bigger than
- * memcg_limited_groups_array_size, if we failed an allocation
- * in a cache, in which case all caches updated before it, will
- * have a bigger array.
- *
- * But if that is the case, the data after
- * memcg_limited_groups_array_size is certainly unused
- */
- for (i = 0; i < memcg_limited_groups_array_size; i++) {
- if (!cur_params->memcg_caches[i])
- continue;
- new_params->memcg_caches[i] =
- cur_params->memcg_caches[i];
- }
-
- /*
- * Ideally, we would wait until all caches succeed, and only
- * then free the old one. But this is not worth the extra
- * pointer per-cache we'd have to have for this.
- *
- * It is not a big deal if some caches are left with a size
- * bigger than the others. And all updates will reset this
- * anyway.
- */
- rcu_assign_pointer(s->memcg_params, new_params);
- if (cur_params)
- kfree_rcu(cur_params, rcu_head);
- }
- return 0;
-}
-
-int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
- struct kmem_cache *root_cache)
-{
- size_t size;
-
- if (!memcg_kmem_enabled())
- return 0;
-
- if (!memcg) {
- size = offsetof(struct memcg_cache_params, memcg_caches);
- size += memcg_limited_groups_array_size * sizeof(void *);
- } else
- size = sizeof(struct memcg_cache_params);
-
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params)
- return -ENOMEM;
-
- if (memcg) {
- s->memcg_params->memcg = memcg;
- s->memcg_params->root_cache = root_cache;
- css_get(&memcg->css);
- } else
- s->memcg_params->is_root_cache = true;
-
- return 0;
-}
-
-void memcg_free_cache_params(struct kmem_cache *s)
-{
- if (!s->memcg_params)
- return;
- if (!s->memcg_params->is_root_cache)
- css_put(&s->memcg_params->memcg->css);
- kfree(s->memcg_params);
+ memcg_limited_groups_array_size = num;
}
static void memcg_register_cache(struct mem_cgroup *memcg,
@@ -3031,6 +2944,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg,
if (!cachep)
return;
+ css_get(&memcg->css);
list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
/*
@@ -3064,6 +2978,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep)
list_del(&cachep->memcg_params->list);
kmem_cache_destroy(cachep);
+
+ /* drop the reference taken in memcg_register_cache */
+ css_put(&memcg->css);
}
/*
@@ -3241,7 +3158,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
rcu_read_lock();
memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
- if (!memcg_can_account_kmem(memcg))
+ if (!memcg_kmem_is_active(memcg))
goto out;
memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
@@ -3326,7 +3243,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
memcg = get_mem_cgroup_from_mm(current->mm);
- if (!memcg_can_account_kmem(memcg)) {
+ if (!memcg_kmem_is_active(memcg)) {
css_put(&memcg->css);
return true;
}
@@ -3668,7 +3585,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
int retry_count;
- u64 memswlimit, memlimit;
int ret = 0;
int children = mem_cgroup_count_children(memcg);
u64 curusage, oldusage;
@@ -3695,31 +3611,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
* We have to guarantee memcg->res.limit <= memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
- memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
- if (memswlimit < val) {
+ if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
break;
}
- memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
- if (memlimit < val)
+ if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->res, val);
- if (!ret) {
- if (memswlimit == val)
- memcg->memsw_is_minimum = true;
- else
- memcg->memsw_is_minimum = false;
- }
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
- mem_cgroup_reclaim(memcg, GFP_KERNEL,
- MEM_CGROUP_RECLAIM_SHRINK);
+ try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
+
curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -3737,7 +3645,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
unsigned long long val)
{
int retry_count;
- u64 memlimit, memswlimit, oldusage, curusage;
+ u64 oldusage, curusage;
int children = mem_cgroup_count_children(memcg);
int ret = -EBUSY;
int enlarge = 0;
@@ -3756,30 +3664,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
* We have to guarantee memcg->res.limit <= memcg->memsw.limit.
*/
mutex_lock(&set_limit_mutex);
- memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
- if (memlimit > val) {
+ if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) {
ret = -EINVAL;
mutex_unlock(&set_limit_mutex);
break;
}
- memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
- if (memswlimit < val)
+ if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val)
enlarge = 1;
ret = res_counter_set_limit(&memcg->memsw, val);
- if (!ret) {
- if (memlimit == val)
- memcg->memsw_is_minimum = true;
- else
- memcg->memsw_is_minimum = false;
- }
mutex_unlock(&set_limit_mutex);
if (!ret)
break;
- mem_cgroup_reclaim(memcg, GFP_KERNEL,
- MEM_CGROUP_RECLAIM_NOSWAP |
- MEM_CGROUP_RECLAIM_SHRINK);
+ try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
+
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
/* Usage is reduced ? */
if (curusage >= oldusage)
@@ -4028,8 +3927,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
if (signal_pending(current))
return -EINTR;
- progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
- false);
+ progress = try_to_free_mem_cgroup_pages(memcg, 1,
+ GFP_KERNEL, true);
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
@@ -4093,6 +3992,46 @@ out:
return retval;
}
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+ enum mem_cgroup_stat_index idx)
+{
+ struct mem_cgroup *iter;
+ long val = 0;
+
+ /* Per-cpu values can be negative, use a signed accumulator */
+ for_each_mem_cgroup_tree(iter, memcg)
+ val += mem_cgroup_read_stat(iter, idx);
+
+ if (val < 0) /* race ? */
+ val = 0;
+ return val;
+}
+
+static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+{
+ u64 val;
+
+ if (!mem_cgroup_is_root(memcg)) {
+ if (!swap)
+ return res_counter_read_u64(&memcg->res, RES_USAGE);
+ else
+ return res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ }
+
+ /*
+ * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+ * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+ */
+ val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+ if (swap)
+ val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+ return val << PAGE_SHIFT;
+}
+
+
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -4102,8 +4041,12 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
switch (type) {
case _MEM:
+ if (name == RES_USAGE)
+ return mem_cgroup_usage(memcg, false);
return res_counter_read_u64(&memcg->res, name);
case _MEMSWAP:
+ if (name == RES_USAGE)
+ return mem_cgroup_usage(memcg, true);
return res_counter_read_u64(&memcg->memsw, name);
case _KMEM:
return res_counter_read_u64(&memcg->kmem, name);
@@ -4150,23 +4093,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
if (err)
goto out;
- memcg_id = ida_simple_get(&kmem_limited_groups,
- 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+ memcg_id = memcg_alloc_cache_id();
if (memcg_id < 0) {
err = memcg_id;
goto out;
}
- /*
- * Make sure we have enough space for this cgroup in each root cache's
- * memcg_params.
- */
- mutex_lock(&memcg_slab_mutex);
- err = memcg_update_all_caches(memcg_id + 1);
- mutex_unlock(&memcg_slab_mutex);
- if (err)
- goto out_rmid;
-
memcg->kmemcg_id = memcg_id;
INIT_LIST_HEAD(&memcg->memcg_slab_caches);
@@ -4187,10 +4119,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
out:
memcg_resume_kmem_account();
return err;
-
-out_rmid:
- ida_simple_remove(&kmem_limited_groups, memcg_id);
- goto out;
}
static int memcg_activate_kmem(struct mem_cgroup *memcg,
@@ -4572,10 +4500,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
if (!t)
goto unlock;
- if (!swap)
- usage = res_counter_read_u64(&memcg->res, RES_USAGE);
- else
- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, swap);
/*
* current_threshold points to threshold just below or equal to usage.
@@ -4673,10 +4598,10 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
if (type == _MEM) {
thresholds = &memcg->thresholds;
- usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, false);
} else if (type == _MEMSWAP) {
thresholds = &memcg->memsw_thresholds;
- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, true);
} else
BUG();
@@ -4762,10 +4687,10 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
if (type == _MEM) {
thresholds = &memcg->thresholds;
- usage = res_counter_read_u64(&memcg->res, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, false);
} else if (type == _MEMSWAP) {
thresholds = &memcg->memsw_thresholds;
- usage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
+ usage = mem_cgroup_usage(memcg, true);
} else
BUG();
@@ -5502,6 +5427,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
+ int ret;
if (css->id > MEM_CGROUP_ID_MAX)
return -ENOSPC;
@@ -5525,9 +5451,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
* core guarantees its existence.
*/
} else {
- res_counter_init(&memcg->res, &root_mem_cgroup->res);
- res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw);
- res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
+ res_counter_init(&memcg->res, NULL);
+ res_counter_init(&memcg->memsw, NULL);
+ res_counter_init(&memcg->kmem, NULL);
/*
* Deeper hierachy with use_hierarchy == false doesn't make
* much sense so let cgroup subsystem know about this
@@ -5538,7 +5464,18 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
}
mutex_unlock(&memcg_create_mutex);
- return memcg_init_kmem(memcg, &memory_cgrp_subsys);
+ ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
+ if (ret)
+ return ret;
+
+ /*
+ * Make sure the memcg is initialized: mem_cgroup_iter()
+ * orders reading memcg->initialized against its callers
+ * reading the memcg members.
+ */
+ smp_store_release(&memcg->initialized, 1);
+
+ return 0;
}
/*
@@ -5969,8 +5906,9 @@ static void __mem_cgroup_clear_mc(void)
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
/* uncharge swap account from the old cgroup */
- res_counter_uncharge(&mc.from->memsw,
- PAGE_SIZE * mc.moved_swap);
+ if (!mem_cgroup_is_root(mc.from))
+ res_counter_uncharge(&mc.from->memsw,
+ PAGE_SIZE * mc.moved_swap);
for (i = 0; i < mc.moved_swap; i++)
css_put(&mc.from->css);
@@ -5979,8 +5917,9 @@ static void __mem_cgroup_clear_mc(void)
* we charged both to->res and to->memsw, so we should
* uncharge to->res.
*/
- res_counter_uncharge(&mc.to->res,
- PAGE_SIZE * mc.moved_swap);
+ if (!mem_cgroup_is_root(mc.to))
+ res_counter_uncharge(&mc.to->res,
+ PAGE_SIZE * mc.moved_swap);
/* we've already done css_get(mc.to) */
mc.moved_swap = 0;
}
@@ -6345,7 +6284,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry)
rcu_read_lock();
memcg = mem_cgroup_lookup(id);
if (memcg) {
- res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+ if (!mem_cgroup_is_root(memcg))
+ res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
css_put(&memcg->css);
}
@@ -6509,12 +6449,15 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
{
unsigned long flags;
- if (nr_mem)
- res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE);
- if (nr_memsw)
- res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE);
-
- memcg_oom_recover(memcg);
+ if (!mem_cgroup_is_root(memcg)) {
+ if (nr_mem)
+ res_counter_uncharge(&memcg->res,
+ nr_mem * PAGE_SIZE);
+ if (nr_memsw)
+ res_counter_uncharge(&memcg->memsw,
+ nr_memsw * PAGE_SIZE);
+ memcg_oom_recover(memcg);
+ }
local_irq_save(flags);
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 44c6bd201d3a..8639f6b28746 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -148,7 +148,7 @@ static int hwpoison_filter_task(struct page *p)
ino = cgroup_ino(css->cgroup);
css_put(css);
- if (!ino || ino != hwpoison_filter_memcg)
+ if (ino != hwpoison_filter_memcg)
return -EINVAL;
return 0;
diff --git a/mm/memory.c b/mm/memory.c
index ab3537bcfed2..1cc6bfbd872e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,8 @@ __setup("norandmaps", disable_randmaps);
unsigned long zero_pfn __read_mostly;
unsigned long highest_memmap_pfn __read_mostly;
+EXPORT_SYMBOL(zero_pfn);
+
/*
* CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
*/
@@ -751,7 +753,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pte_pfn(pte);
if (HAVE_PTE_SPECIAL) {
- if (likely(!pte_special(pte) || pte_numa(pte)))
+ if (likely(!pte_special(pte)))
goto check_pfn;
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
@@ -777,15 +779,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
}
}
+ if (is_zero_pfn(pfn))
+ return NULL;
check_pfn:
if (unlikely(pfn > highest_memmap_pfn)) {
print_bad_pte(vma, addr, pte, NULL);
return NULL;
}
- if (is_zero_pfn(pfn))
- return NULL;
-
/*
* NOTE! We still have PageReserved() pages in the page tables.
* eg. VDSO mappings can cause them to exist.
@@ -1126,7 +1127,7 @@ again:
addr) != page->index) {
pte_t ptfile = pgoff_to_pte(page->index);
if (pte_soft_dirty(ptent))
- pte_file_mksoft_dirty(ptfile);
+ ptfile = pte_file_mksoft_dirty(ptfile);
set_pte_at(mm, addr, pte, ptfile);
}
if (PageAnon(page))
@@ -2052,7 +2053,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
/*
- * VM_MIXEDMAP !pfn_valid() case
+ * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
+ * VM_PFNMAP VMA.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable as we can't do any dirty
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2ff8c2325e96..29d8693d0c61 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
/*
* Confirm all pages in a range [start, end) is belongs to the same zone.
*/
-static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
+int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct zone *zone = NULL;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8f5330d74f47..e58725aff7e9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -123,25 +123,23 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
-static struct mempolicy *get_task_policy(struct task_struct *p)
+struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
+ int node;
- if (!pol) {
- int node = numa_node_id();
+ if (pol)
+ return pol;
- if (node != NUMA_NO_NODE) {
- pol = &preferred_node_policy[node];
- /*
- * preferred_node_policy is not initialised early in
- * boot
- */
- if (!pol->mode)
- pol = NULL;
- }
+ node = numa_node_id();
+ if (node != NUMA_NO_NODE) {
+ pol = &preferred_node_policy[node];
+ /* preferred_node_policy is not initialised early in boot */
+ if (pol->mode)
+ return pol;
}
- return pol;
+ return &default_policy;
}
static const struct mempolicy_operations {
@@ -683,7 +681,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
}
if (flags & MPOL_MF_LAZY) {
- change_prot_numa(vma, start, endvma);
+ /* Similar to task_numa_work, skip inaccessible VMAs */
+ if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ change_prot_numa(vma, start, endvma);
goto next;
}
@@ -804,7 +804,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new, *old;
- struct mm_struct *mm = current->mm;
NODEMASK_SCRATCH(scratch);
int ret;
@@ -816,20 +815,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
ret = PTR_ERR(new);
goto out;
}
- /*
- * prevent changing our mempolicy while show_numa_maps()
- * is using it.
- * Note: do_set_mempolicy() can be called at init time
- * with no 'mm'.
- */
- if (mm)
- down_write(&mm->mmap_sem);
+
task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
task_unlock(current);
- if (mm)
- up_write(&mm->mmap_sem);
mpol_put(new);
goto out;
}
@@ -839,9 +829,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
task_unlock(current);
- if (mm)
- up_write(&mm->mmap_sem);
-
mpol_put(old);
ret = 0;
out:
@@ -1605,32 +1592,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
#endif
-/*
- * get_vma_policy(@task, @vma, @addr)
- * @task: task for fallback if vma policy == default
- * @vma: virtual memory area whose policy is sought
- * @addr: address in @vma for shared policy lookup
- *
- * Returns effective policy for a VMA at specified address.
- * Falls back to @task or system default policy, as necessary.
- * Current or other task's task mempolicy and non-shared vma policies must be
- * protected by task_lock(task) by the caller.
- * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
- * count--added by the get_policy() vm_op, as appropriate--to protect against
- * freeing by another task. It is the caller's responsibility to free the
- * extra reference for shared policies.
- */
-struct mempolicy *get_vma_policy(struct task_struct *task,
- struct vm_area_struct *vma, unsigned long addr)
+struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr)
{
- struct mempolicy *pol = get_task_policy(task);
+ struct mempolicy *pol = NULL;
if (vma) {
if (vma->vm_ops && vma->vm_ops->get_policy) {
- struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
- addr);
- if (vpol)
- pol = vpol;
+ pol = vma->vm_ops->get_policy(vma, addr);
} else if (vma->vm_policy) {
pol = vma->vm_policy;
@@ -1644,31 +1613,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
mpol_get(pol);
}
}
+
+ return pol;
+}
+
+/*
+ * get_vma_policy(@vma, @addr)
+ * @vma: virtual memory area whose policy is sought
+ * @addr: address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to current->mempolicy or system default policy, as necessary.
+ * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
+ * count--added by the get_policy() vm_op, as appropriate--to protect against
+ * freeing by another task. It is the caller's responsibility to free the
+ * extra reference for shared policies.
+ */
+static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct mempolicy *pol = __get_vma_policy(vma, addr);
+
if (!pol)
- pol = &default_policy;
+ pol = get_task_policy(current);
+
return pol;
}
-bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+bool vma_policy_mof(struct vm_area_struct *vma)
{
- struct mempolicy *pol = get_task_policy(task);
- if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy) {
- bool ret = false;
+ struct mempolicy *pol;
- pol = vma->vm_ops->get_policy(vma, vma->vm_start);
- if (pol && (pol->flags & MPOL_F_MOF))
- ret = true;
- mpol_cond_put(pol);
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
- return ret;
- } else if (vma->vm_policy) {
- pol = vma->vm_policy;
- }
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
}
+ pol = vma->vm_policy;
if (!pol)
- return default_policy.flags & MPOL_F_MOF;
+ pol = get_task_policy(current);
return pol->flags & MPOL_F_MOF;
}
@@ -1874,7 +1863,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
{
struct zonelist *zl;
- *mpol = get_vma_policy(current, vma, addr);
+ *mpol = get_vma_policy(vma, addr);
*nodemask = NULL; /* assume !MPOL_BIND */
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
@@ -2029,7 +2018,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned int cpuset_mems_cookie;
retry_cpuset:
- pol = get_vma_policy(current, vma, addr);
+ pol = get_vma_policy(vma, addr);
cpuset_mems_cookie = read_mems_allowed_begin();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
@@ -2046,8 +2035,7 @@ retry_cpuset:
page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol, node),
policy_nodemask(gfp, pol));
- if (unlikely(mpol_needs_cond_ref(pol)))
- __mpol_put(pol);
+ mpol_cond_put(pol);
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2074,12 +2062,12 @@ retry_cpuset:
*/
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
- struct mempolicy *pol = get_task_policy(current);
+ struct mempolicy *pol = &default_policy;
struct page *page;
unsigned int cpuset_mems_cookie;
- if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
- pol = &default_policy;
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2296,7 +2284,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
BUG_ON(!vma);
- pol = get_vma_policy(current, vma, addr);
+ pol = get_vma_policy(vma, addr);
if (!(pol->flags & MPOL_F_MOF))
goto out;
diff --git a/mm/migrate.c b/mm/migrate.c
index f78ec9bd454d..01439953abf5 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -146,8 +146,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (pte_swp_soft_dirty(*ptep))
pte = pte_mksoft_dirty(pte);
+
+ /* Recheck VMA as permissions can change since migration started */
if (is_write_migration_entry(entry))
- pte = pte_mkwrite(pte);
+ pte = maybe_mkwrite(pte, vma);
+
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
@@ -873,7 +876,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
}
}
- if (unlikely(balloon_page_movable(page))) {
+ if (unlikely(isolated_balloon_page(page))) {
/*
* A ballooned page does not need any special attention from
* physical to virtual reverse mapping procedures.
@@ -952,17 +955,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page,
rc = __unmap_and_move(page, newpage, force, mode);
- if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) {
- /*
- * A ballooned page has been migrated already.
- * Now, it's the time to wrap-up counters,
- * handle the page back to Buddy and return.
- */
- dec_zone_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
- balloon_page_free(page);
- return MIGRATEPAGE_SUCCESS;
- }
out:
if (rc != -EAGAIN) {
/*
@@ -985,6 +977,9 @@ out:
if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
ClearPageSwapBacked(newpage);
put_new_page(newpage, private);
+ } else if (unlikely(__is_movable_balloon_page(newpage))) {
+ /* drop our reference, page already in the balloon */
+ put_page(newpage);
} else
putback_lru_page(newpage);
diff --git a/mm/mlock.c b/mm/mlock.c
index ce84cb0b83ef..73cf0987088c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -233,9 +233,9 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma,
VM_BUG_ON(start & ~PAGE_MASK);
VM_BUG_ON(end & ~PAGE_MASK);
- VM_BUG_ON(start < vma->vm_start);
- VM_BUG_ON(end > vma->vm_end);
- VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
+ VM_BUG_ON_VMA(start < vma->vm_start, vma);
+ VM_BUG_ON_VMA(end > vma->vm_end, vma);
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
gup_flags = FOLL_TOUCH | FOLL_MLOCK;
/*
@@ -789,7 +789,7 @@ static int do_mlockall(int flags)
/* Ignore errors */
mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags);
- cond_resched();
+ cond_resched_rcu_qs();
}
out:
return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index c1f2ea4a0b99..7f855206e7fb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -70,7 +70,7 @@ static void unmap_region(struct mm_struct *mm,
* MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (yes) yes w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
+ *
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
@@ -89,6 +89,25 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
}
EXPORT_SYMBOL(vm_get_page_prot);
+static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
+{
+ return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
+}
+
+/* Update vma->vm_page_prot to reflect vma->vm_flags. */
+void vma_set_page_prot(struct vm_area_struct *vma)
+{
+ unsigned long vm_flags = vma->vm_flags;
+
+ vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);
+ if (vma_wants_writenotify(vma)) {
+ vm_flags &= ~VM_SHARED;
+ vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot,
+ vm_flags);
+ }
+}
+
+
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
unsigned long sysctl_overcommit_kbytes __read_mostly;
@@ -268,7 +287,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
- unsigned long rlim, retval;
+ unsigned long retval;
unsigned long newbrk, oldbrk;
struct mm_struct *mm = current->mm;
unsigned long min_brk;
@@ -298,9 +317,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
* segment grow beyond its set limit the in case where the limit is
* not page aligned -Ram Gupta
*/
- rlim = rlimit(RLIMIT_DATA);
- if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
newbrk = PAGE_ALIGN(brk);
@@ -369,20 +387,22 @@ static int browse_rb(struct rb_root *root)
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
if (vma->vm_start < prev) {
- pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev);
+ pr_emerg("vm_start %lx < prev %lx\n",
+ vma->vm_start, prev);
bug = 1;
}
if (vma->vm_start < pend) {
- pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend);
+ pr_emerg("vm_start %lx < pend %lx\n",
+ vma->vm_start, pend);
bug = 1;
}
if (vma->vm_start > vma->vm_end) {
- pr_info("vm_end %lx < vm_start %lx\n",
- vma->vm_end, vma->vm_start);
+ pr_emerg("vm_start %lx > vm_end %lx\n",
+ vma->vm_start, vma->vm_end);
bug = 1;
}
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
- pr_info("free gap %lx, correct %lx\n",
+ pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
@@ -396,7 +416,7 @@ static int browse_rb(struct rb_root *root)
for (nd = pn; nd; nd = rb_prev(nd))
j++;
if (i != j) {
- pr_info("backwards %d, forwards %d\n", j, i);
+ pr_emerg("backwards %d, forwards %d\n", j, i);
bug = 1;
}
return bug ? -1 : i;
@@ -409,8 +429,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore)
for (nd = rb_first(root); nd; nd = rb_next(nd)) {
struct vm_area_struct *vma;
vma = rb_entry(nd, struct vm_area_struct, vm_rb);
- BUG_ON(vma != ignore &&
- vma->rb_subtree_gap != vma_compute_subtree_gap(vma));
+ VM_BUG_ON_VMA(vma != ignore &&
+ vma->rb_subtree_gap != vma_compute_subtree_gap(vma),
+ vma);
}
}
@@ -420,8 +441,10 @@ static void validate_mm(struct mm_struct *mm)
int i = 0;
unsigned long highest_address = 0;
struct vm_area_struct *vma = mm->mmap;
+
while (vma) {
struct anon_vma_chain *avc;
+
vma_lock_anon_vma(vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
@@ -431,20 +454,21 @@ static void validate_mm(struct mm_struct *mm)
i++;
}
if (i != mm->map_count) {
- pr_info("map_count %d vm_next %d\n", mm->map_count, i);
+ pr_emerg("map_count %d vm_next %d\n", mm->map_count, i);
bug = 1;
}
if (highest_address != mm->highest_vm_end) {
- pr_info("mm->highest_vm_end %lx, found %lx\n",
- mm->highest_vm_end, highest_address);
+ pr_emerg("mm->highest_vm_end %lx, found %lx\n",
+ mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
if (i != mm->map_count) {
- pr_info("map_count %d rb %d\n", mm->map_count, i);
+ if (i != -1)
+ pr_emerg("map_count %d rb %d\n", mm->map_count, i);
bug = 1;
}
- BUG_ON(bug);
+ VM_BUG_ON_MM(bug, mm);
}
#else
#define validate_mm_rb(root, ignore) do { } while (0)
@@ -741,7 +765,7 @@ again: remove_next = 1 + (end > next->vm_end);
* split_vma inserting another: so it must be
* mprotect case 4 shifting the boundary down.
*/
- adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
+ adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
exporter = vma;
importer = next;
}
@@ -787,8 +811,8 @@ again: remove_next = 1 + (end > next->vm_end);
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
if (anon_vma) {
- VM_BUG_ON(adjust_next && next->anon_vma &&
- anon_vma != next->anon_vma);
+ VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma, next);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
@@ -1010,7 +1034,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
unsigned long end, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
+ struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
@@ -1036,7 +1060,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can it merge with the predecessor?
*/
if (prev && prev->vm_end == addr &&
- mpol_equal(vma_policy(prev), policy) &&
+ mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff)) {
/*
@@ -1064,7 +1088,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
* Can this new request be merged in front of next?
*/
if (next && end == next->vm_start &&
- mpol_equal(policy, vma_policy(next)) &&
+ mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen)) {
if (prev && addr < prev->vm_end) /* case 4 */
@@ -1235,7 +1259,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long flags, unsigned long pgoff,
unsigned long *populate)
{
- struct mm_struct * mm = current->mm;
+ struct mm_struct *mm = current->mm;
vm_flags_t vm_flags;
*populate = 0;
@@ -1263,7 +1287,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
/* offset overflow? */
if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
- return -EOVERFLOW;
+ return -EOVERFLOW;
/* Too many mappings? */
if (mm->map_count > sysctl_max_map_count)
@@ -1470,11 +1494,16 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
if (vma->vm_ops && vma->vm_ops->page_mkwrite)
return 1;
- /* The open routine did something to the protections already? */
+ /* The open routine did something to the protections that pgprot_modify
+ * won't preserve? */
if (pgprot_val(vma->vm_page_prot) !=
- pgprot_val(vm_get_page_prot(vm_flags)))
+ pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags)))
return 0;
+ /* Do we need to track softdirty? */
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
+ return 1;
+
/* Specialty mapping? */
if (vm_flags & VM_PFNMAP)
return 0;
@@ -1610,21 +1639,6 @@ munmap_back:
goto free_vma;
}
- if (vma_wants_writenotify(vma)) {
- pgprot_t pprot = vma->vm_page_prot;
-
- /* Can vma->vm_page_prot have changed??
- *
- * Answer: Yes, drivers may have changed it in their
- * f_op->mmap method.
- *
- * Ensures that vmas marked as uncached stay that way.
- */
- vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
- if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
- vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- }
-
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
@@ -1658,6 +1672,8 @@ out:
*/
vma->vm_flags |= VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
+
return addr;
unmap_and_free_vma:
@@ -1921,7 +1937,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.align_mask = 0;
return vm_unmapped_area(&info);
}
-#endif
+#endif
/*
* This mmap-allocator allocates new areas top-down from below the
@@ -2321,13 +2337,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
}
struct vm_area_struct *
-find_extend_vma(struct mm_struct * mm, unsigned long addr)
+find_extend_vma(struct mm_struct *mm, unsigned long addr)
{
- struct vm_area_struct * vma;
+ struct vm_area_struct *vma;
unsigned long start;
addr &= PAGE_MASK;
- vma = find_vma(mm,addr);
+ vma = find_vma(mm, addr);
if (!vma)
return NULL;
if (vma->vm_start <= addr)
@@ -2376,7 +2392,7 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end)
{
- struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
+ struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap;
struct mmu_gather tlb;
lru_add_drain();
@@ -2423,7 +2439,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
* __split_vma() bypasses sysctl_max_map_count checking. We use this on the
* munmap path where it doesn't make sense to fail.
*/
-static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
@@ -2512,7 +2528,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
return -EINVAL;
- if ((len = PAGE_ALIGN(len)) == 0)
+ len = PAGE_ALIGN(len);
+ if (len == 0)
return -EINVAL;
/* Find the first overlapping VMA */
@@ -2558,7 +2575,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
if (error)
return error;
}
- vma = prev? prev->vm_next: mm->mmap;
+ vma = prev ? prev->vm_next : mm->mmap;
/*
* unlock any mlock()ed ranges before detaching vmas
@@ -2621,10 +2638,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
*/
static unsigned long do_brk(unsigned long addr, unsigned long len)
{
- struct mm_struct * mm = current->mm;
- struct vm_area_struct * vma, * prev;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma, *prev;
unsigned long flags;
- struct rb_node ** rb_link, * rb_parent;
+ struct rb_node **rb_link, *rb_parent;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
@@ -2848,7 +2865,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
* safe. It is only safe to keep the vm_pgoff
* linear if there are no pages mapped yet.
*/
- VM_BUG_ON(faulted_in_anon_vma);
+ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
*vmap = vma = new_vma;
}
*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
@@ -3196,7 +3213,7 @@ void __init mmap_init(void)
{
int ret;
- ret = percpu_counter_init(&vm_committed_as, 0);
+ ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 950813b1eb36..2c8da9825fe3 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -107,7 +107,8 @@ void __mmu_notifier_release(struct mm_struct *mm)
* existed or not.
*/
int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
- unsigned long address)
+ unsigned long start,
+ unsigned long end)
{
struct mmu_notifier *mn;
int young = 0, id;
@@ -115,7 +116,7 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->clear_flush_young)
- young |= mn->ops->clear_flush_young(mn, mm, address);
+ young |= mn->ops->clear_flush_young(mn, mm, start, end);
}
srcu_read_unlock(&srcu, id);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c43d557941f8..ace93454ce8e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -29,13 +29,6 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-#ifndef pgprot_modify
-static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
-{
- return newprot;
-}
-#endif
-
/*
* For a prot_numa update we only hold mmap_sem for read so there is a
* potential race with faulting where a pmd was temporarily none. This
@@ -93,7 +86,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* Avoid taking write faults for pages we
* know to be dirty.
*/
- if (dirty_accountable && pte_dirty(ptent))
+ if (dirty_accountable && pte_dirty(ptent) &&
+ (pte_soft_dirty(ptent) ||
+ !(vma->vm_flags & VM_SOFTDIRTY)))
ptent = pte_mkwrite(ptent);
ptep_modify_prot_commit(mm, addr, pte, ptent);
updated = true;
@@ -320,13 +315,8 @@ success:
* held in write mode.
*/
vma->vm_flags = newflags;
- vma->vm_page_prot = pgprot_modify(vma->vm_page_prot,
- vm_get_page_prot(newflags));
-
- if (vma_wants_writenotify(vma)) {
- vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED);
- dirty_accountable = 1;
- }
+ dirty_accountable = vma_wants_writenotify(vma);
+ vma_set_page_prot(vma);
change_protection(vma, start, end, vma->vm_page_prot,
dirty_accountable, 0);
diff --git a/mm/mremap.c b/mm/mremap.c
index 05f1180e9f21..b147f66f4c40 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -21,8 +21,8 @@
#include <linux/syscalls.h>
#include <linux/mmu_notifier.h>
#include <linux/sched/sysctl.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pmd_trans_huge(*old_pmd)) {
int err = 0;
if (extent == HPAGE_PMD_SIZE) {
- VM_BUG_ON(vma->vm_file || !vma->anon_vma);
+ VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
+ vma);
/* See comment in move_ptes() */
if (need_rmap_locks)
anon_vma_lock_write(vma->anon_vma);
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 7ed58602e71b..7c7ab32ee503 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void)
phys_addr_t start, end;
u64 i;
+ memblock_clear_hotplug(0, -1);
+
for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL)
count += __free_memory_core(start, end);
diff --git a/mm/nommu.c b/mm/nommu.c
index a881d9673c6b..bd1808e194a7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -539,7 +539,7 @@ void __init mmap_init(void)
{
int ret;
- ret = percpu_counter_init(&vm_committed_as, 0);
+ ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e11df8fa7ec..bbf405a3a18f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
spin_lock(&zone_scan_lock);
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- if (zone_is_oom_locked(zone)) {
+ if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) {
ret = false;
goto out;
}
@@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask)
* call to oom_zonelist_trylock() doesn't succeed when it shouldn't.
*/
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- zone_set_flag(zone, ZONE_OOM_LOCKED);
+ set_bit(ZONE_OOM_LOCKED, &zone->flags);
out:
spin_unlock(&zone_scan_lock);
@@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
spin_lock(&zone_scan_lock);
for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask))
- zone_clear_flag(zone, ZONE_OOM_LOCKED);
+ clear_bit(ZONE_OOM_LOCKED, &zone->flags);
spin_unlock(&zone_scan_lock);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 91d73ef1744d..ff24c9d83112 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1075,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
}
if (dirty < setpoint) {
- x = min(bdi->balanced_dirty_ratelimit,
- min(balanced_dirty_ratelimit, task_ratelimit));
+ x = min3(bdi->balanced_dirty_ratelimit,
+ balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit < x)
step = x - dirty_ratelimit;
} else {
- x = max(bdi->balanced_dirty_ratelimit,
- max(balanced_dirty_ratelimit, task_ratelimit));
+ x = max3(bdi->balanced_dirty_ratelimit,
+ balanced_dirty_ratelimit, task_ratelimit);
if (dirty_ratelimit > x)
step = dirty_ratelimit - x;
}
@@ -1777,7 +1777,7 @@ void __init page_writeback_init(void)
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
- fprop_global_init(&writeout_completions);
+ fprop_global_init(&writeout_completions, GFP_KERNEL);
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 18cee0d4c8a2..736d8e1b6381 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,8 +53,6 @@
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
-#include <linux/ftrace_event.h>
-#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node);
*/
DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
+int _node_numa_mem_[MAX_NUMNODES];
#endif
/*
@@ -1014,7 +1013,7 @@ int move_freepages(struct zone *zone,
* Remove at a later date when no bug reports exist related to
* grouping pages by mobility
*/
- BUG_ON(page_zone(start_page) != page_zone(end_page));
+ VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif
for (page = start_page; page <= end_page;) {
@@ -1612,9 +1611,9 @@ again:
}
__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
- if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 &&
- !zone_is_fair_depleted(zone))
- zone_set_flag(zone, ZONE_FAIR_DEPLETED);
+ if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
+ !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
+ set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
__count_zone_vm_events(PGALLOC, zone, 1 << order);
zone_statistics(preferred_zone, zone, gfp_flags);
@@ -1934,7 +1933,7 @@ static void reset_alloc_batches(struct zone *preferred_zone)
mod_zone_page_state(zone, NR_ALLOC_BATCH,
high_wmark_pages(zone) - low_wmark_pages(zone) -
atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
- zone_clear_flag(zone, ZONE_FAIR_DEPLETED);
+ clear_bit(ZONE_FAIR_DEPLETED, &zone->flags);
} while (zone++ != preferred_zone);
}
@@ -1985,7 +1984,7 @@ zonelist_scan:
if (alloc_flags & ALLOC_FAIR) {
if (!zone_local(preferred_zone, zone))
break;
- if (zone_is_fair_depleted(zone)) {
+ if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
nr_fair_skipped++;
continue;
}
@@ -2296,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
int classzone_idx, int migratetype, enum migrate_mode mode,
- bool *contended_compaction, bool *deferred_compaction,
- unsigned long *did_some_progress)
+ int *contended_compaction, bool *deferred_compaction)
{
- if (!order)
- return NULL;
+ struct zone *last_compact_zone = NULL;
+ unsigned long compact_result;
+ struct page *page;
- if (compaction_deferred(preferred_zone, order)) {
- *deferred_compaction = true;
+ if (!order)
return NULL;
- }
current->flags |= PF_MEMALLOC;
- *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+ compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
nodemask, mode,
- contended_compaction);
+ contended_compaction,
+ &last_compact_zone);
current->flags &= ~PF_MEMALLOC;
- if (*did_some_progress != COMPACT_SKIPPED) {
- struct page *page;
+ switch (compact_result) {
+ case COMPACT_DEFERRED:
+ *deferred_compaction = true;
+ /* fall-through */
+ case COMPACT_SKIPPED:
+ return NULL;
+ default:
+ break;
+ }
- /* Page migration frees to the PCP lists but we want merging */
- drain_pages(get_cpu());
- put_cpu();
+ /*
+ * At least in one zone compaction wasn't deferred or skipped, so let's
+ * count a compaction stall
+ */
+ count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, nodemask,
- order, zonelist, high_zoneidx,
- alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
- if (page) {
- preferred_zone->compact_blockskip_flush = false;
- compaction_defer_reset(preferred_zone, order, true);
- count_vm_event(COMPACTSUCCESS);
- return page;
- }
+ /* Page migration frees to the PCP lists but we want merging */
+ drain_pages(get_cpu());
+ put_cpu();
- /*
- * It's bad if compaction run occurs and fails.
- * The most likely reason is that pages exist,
- * but not enough to satisfy watermarks.
- */
- count_vm_event(COMPACTFAIL);
+ page = get_page_from_freelist(gfp_mask, nodemask,
+ order, zonelist, high_zoneidx,
+ alloc_flags & ~ALLOC_NO_WATERMARKS,
+ preferred_zone, classzone_idx, migratetype);
- /*
- * As async compaction considers a subset of pageblocks, only
- * defer if the failure was a sync compaction failure.
- */
- if (mode != MIGRATE_ASYNC)
- defer_compaction(preferred_zone, order);
+ if (page) {
+ struct zone *zone = page_zone(page);
- cond_resched();
+ zone->compact_blockskip_flush = false;
+ compaction_defer_reset(zone, order, true);
+ count_vm_event(COMPACTSUCCESS);
+ return page;
}
+ /*
+ * last_compact_zone is where try_to_compact_pages thought allocation
+ * should succeed, so it did not defer compaction. But here we know
+ * that it didn't succeed, so we do the defer.
+ */
+ if (last_compact_zone && mode != MIGRATE_ASYNC)
+ defer_compaction(last_compact_zone, order);
+
+ /*
+ * It's bad if compaction run occurs and fails. The most likely reason
+ * is that pages exist, but not enough to satisfy watermarks.
+ */
+ count_vm_event(COMPACTFAIL);
+
+ cond_resched();
+
return NULL;
}
#else
@@ -2355,9 +2368,8 @@ static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype,
- enum migrate_mode mode, bool *contended_compaction,
- bool *deferred_compaction, unsigned long *did_some_progress)
+ int classzone_idx, int migratetype, enum migrate_mode mode,
+ int *contended_compaction, bool *deferred_compaction)
{
return NULL;
}
@@ -2457,12 +2469,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
static void wake_all_kswapds(unsigned int order,
struct zonelist *zonelist,
enum zone_type high_zoneidx,
- struct zone *preferred_zone)
+ struct zone *preferred_zone,
+ nodemask_t *nodemask)
{
struct zoneref *z;
struct zone *zone;
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ high_zoneidx, nodemask)
wakeup_kswapd(zone, order, zone_idx(preferred_zone));
}
@@ -2509,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_NO_WATERMARKS;
}
#ifdef CONFIG_CMA
- if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
@@ -2533,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned long did_some_progress;
enum migrate_mode migration_mode = MIGRATE_ASYNC;
bool deferred_compaction = false;
- bool contended_compaction = false;
+ int contended_compaction = COMPACT_CONTENDED_NONE;
/*
* In the slowpath, we sanity check order to avoid ever trying to
@@ -2560,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
restart:
if (!(gfp_mask & __GFP_NO_KSWAPD))
- wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone);
+ wake_all_kswapds(order, zonelist, high_zoneidx,
+ preferred_zone, nodemask);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2633,20 +2648,40 @@ rebalance:
preferred_zone,
classzone_idx, migratetype,
migration_mode, &contended_compaction,
- &deferred_compaction,
- &did_some_progress);
+ &deferred_compaction);
if (page)
goto got_pg;
- /*
- * If compaction is deferred for high-order allocations, it is because
- * sync compaction recently failed. In this is the case and the caller
- * requested a movable allocation that does not heavily disrupt the
- * system then fail the allocation instead of entering direct reclaim.
- */
- if ((deferred_compaction || contended_compaction) &&
- (gfp_mask & __GFP_NO_KSWAPD))
- goto nopage;
+ /* Checks for THP-specific high-order allocations */
+ if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+ /*
+ * If compaction is deferred for high-order allocations, it is
+ * because sync compaction recently failed. If this is the case
+ * and the caller requested a THP allocation, we do not want
+ * to heavily disrupt the system, so we fail the allocation
+ * instead of entering direct reclaim.
+ */
+ if (deferred_compaction)
+ goto nopage;
+
+ /*
+ * In all zones where compaction was attempted (and not
+ * deferred or skipped), lock contention has been detected.
+ * For THP allocation we do not want to disrupt the others
+ * so we fallback to base pages instead.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_LOCK)
+ goto nopage;
+
+ /*
+ * If compaction was aborted due to need_resched(), we do not
+ * want to further increase allocation latency, unless it is
+ * khugepaged trying to collapse.
+ */
+ if (contended_compaction == COMPACT_CONTENDED_SCHED
+ && !(current->flags & PF_KTHREAD))
+ goto nopage;
+ }
/*
* It can become very expensive to allocate transparent hugepages at
@@ -2726,8 +2761,7 @@ rebalance:
preferred_zone,
classzone_idx, migratetype,
migration_mode, &contended_compaction,
- &deferred_compaction,
- &did_some_progress);
+ &deferred_compaction);
if (page)
goto got_pg;
}
@@ -2753,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zone *preferred_zone;
struct zoneref *preferred_zoneref;
struct page *page = NULL;
- int migratetype = allocflags_to_migratetype(gfp_mask);
+ int migratetype = gfpflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
int classzone_idx;
@@ -2775,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
+ if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
@@ -2786,10 +2823,6 @@ retry_cpuset:
goto out;
classzone_idx = zonelist_zone_idx(preferred_zoneref);
-#ifdef CONFIG_CMA
- if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
-#endif
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,
@@ -3579,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
zonelist->_zonerefs[pos].zone_idx = 0;
}
+#if defined(CONFIG_64BIT)
+/*
+ * Devices that require DMA32/DMA are relatively rare and do not justify a
+ * penalty to every machine in case the specialised case applies. Default
+ * to Node-ordering on 64-bit NUMA machines
+ */
+static int default_zonelist_order(void)
+{
+ return ZONELIST_ORDER_NODE;
+}
+#else
+/*
+ * On 32-bit, the Normal zone needs to be preserved for allocations accessible
+ * by the kernel. If processes running on node 0 deplete the low memory zone
+ * then reclaim will occur more frequency increasing stalls and potentially
+ * be easier to OOM if a large percentage of the zone is under writeback or
+ * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
+ * Hence, default to zone ordering on 32-bit.
+ */
static int default_zonelist_order(void)
{
- int nid, zone_type;
- unsigned long low_kmem_size, total_size;
- struct zone *z;
- int average_size;
- /*
- * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
- * If they are really small and used heavily, the system can fall
- * into OOM very easily.
- * This function detect ZONE_DMA/DMA32 size and configures zone order.
- */
- /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
- low_kmem_size = 0;
- total_size = 0;
- for_each_online_node(nid) {
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
- z = &NODE_DATA(nid)->node_zones[zone_type];
- if (populated_zone(z)) {
- if (zone_type < ZONE_NORMAL)
- low_kmem_size += z->managed_pages;
- total_size += z->managed_pages;
- } else if (zone_type == ZONE_NORMAL) {
- /*
- * If any node has only lowmem, then node order
- * is preferred to allow kernel allocations
- * locally; otherwise, they can easily infringe
- * on other nodes when there is an abundance of
- * lowmem available to allocate from.
- */
- return ZONELIST_ORDER_NODE;
- }
- }
- }
- if (!low_kmem_size || /* there are no DMA area. */
- low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
- return ZONELIST_ORDER_NODE;
- /*
- * look into each node's config.
- * If there is a node whose DMA/DMA32 memory is very big area on
- * local memory, NODE_ORDER may be suitable.
- */
- average_size = total_size /
- (nodes_weight(node_states[N_MEMORY]) + 1);
- for_each_online_node(nid) {
- low_kmem_size = 0;
- total_size = 0;
- for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
- z = &NODE_DATA(nid)->node_zones[zone_type];
- if (populated_zone(z)) {
- if (zone_type < ZONE_NORMAL)
- low_kmem_size += z->present_pages;
- total_size += z->present_pages;
- }
- }
- if (low_kmem_size &&
- total_size > average_size && /* ignore small node */
- low_kmem_size > total_size * 70/100)
- return ZONELIST_ORDER_NODE;
- }
return ZONELIST_ORDER_ZONE;
}
+#endif /* CONFIG_64BIT */
static void set_zonelist_order(void)
{
@@ -4976,6 +4971,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
+ (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5701,9 +5698,8 @@ static void __setup_per_zone_wmarks(void)
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
__mod_zone_page_state(zone, NR_ALLOC_BATCH,
- high_wmark_pages(zone) -
- low_wmark_pages(zone) -
- zone_page_state(zone, NR_ALLOC_BATCH));
+ high_wmark_pages(zone) - low_wmark_pages(zone) -
+ atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -6278,8 +6274,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
if (list_empty(&cc->migratepages)) {
cc->nr_migratepages = 0;
- pfn = isolate_migratepages_range(cc->zone, cc,
- pfn, end, true);
+ pfn = isolate_migratepages_range(cc, pfn, end);
if (!pfn) {
ret = -EINTR;
break;
@@ -6555,97 +6550,3 @@ bool is_free_buddy_page(struct page *page)
return order < MAX_ORDER;
}
#endif
-
-static const struct trace_print_flags pageflag_names[] = {
- {1UL << PG_locked, "locked" },
- {1UL << PG_error, "error" },
- {1UL << PG_referenced, "referenced" },
- {1UL << PG_uptodate, "uptodate" },
- {1UL << PG_dirty, "dirty" },
- {1UL << PG_lru, "lru" },
- {1UL << PG_active, "active" },
- {1UL << PG_slab, "slab" },
- {1UL << PG_owner_priv_1, "owner_priv_1" },
- {1UL << PG_arch_1, "arch_1" },
- {1UL << PG_reserved, "reserved" },
- {1UL << PG_private, "private" },
- {1UL << PG_private_2, "private_2" },
- {1UL << PG_writeback, "writeback" },
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
- {1UL << PG_head, "head" },
- {1UL << PG_tail, "tail" },
-#else
- {1UL << PG_compound, "compound" },
-#endif
- {1UL << PG_swapcache, "swapcache" },
- {1UL << PG_mappedtodisk, "mappedtodisk" },
- {1UL << PG_reclaim, "reclaim" },
- {1UL << PG_swapbacked, "swapbacked" },
- {1UL << PG_unevictable, "unevictable" },
-#ifdef CONFIG_MMU
- {1UL << PG_mlocked, "mlocked" },
-#endif
-#ifdef CONFIG_ARCH_USES_PG_UNCACHED
- {1UL << PG_uncached, "uncached" },
-#endif
-#ifdef CONFIG_MEMORY_FAILURE
- {1UL << PG_hwpoison, "hwpoison" },
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- {1UL << PG_compound_lock, "compound_lock" },
-#endif
-};
-
-static void dump_page_flags(unsigned long flags)
-{
- const char *delim = "";
- unsigned long mask;
- int i;
-
- BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
-
- printk(KERN_ALERT "page flags: %#lx(", flags);
-
- /* remove zone id */
- flags &= (1UL << NR_PAGEFLAGS) - 1;
-
- for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
-
- mask = pageflag_names[i].mask;
- if ((flags & mask) != mask)
- continue;
-
- flags &= ~mask;
- printk("%s%s", delim, pageflag_names[i].name);
- delim = "|";
- }
-
- /* check for left over flags */
- if (flags)
- printk("%s%#lx", delim, flags);
-
- printk(")\n");
-}
-
-void dump_page_badflags(struct page *page, const char *reason,
- unsigned long badflags)
-{
- printk(KERN_ALERT
- "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
- page, atomic_read(&page->_count), page_mapcount(page),
- page->mapping, page->index);
- dump_page_flags(page->flags);
- if (reason)
- pr_alert("page dumped because: %s\n", reason);
- if (page->flags & badflags) {
- pr_alert("bad because of flags:\n");
- dump_page_flags(page->flags & badflags);
- }
- mem_cgroup_print_bad_page(page);
-}
-
-void dump_page(struct page *page, const char *reason)
-{
- dump_page_badflags(page, reason, 0);
-}
-EXPORT_SYMBOL(dump_page);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2beeabf502c5..ad83195521f2 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
if (!walk->mm)
return -EINVAL;
- VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
pgd = pgd_offset(walk->mm, addr);
do {
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 89633fefc6a2..10e3d0b8a86d 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -33,17 +33,14 @@
#include <linux/log2.h>
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
{
- unsigned int cpu;
-
- for_each_possible_cpu(cpu)
- memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
-
return 0;
}
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
{
/* nada */
}
@@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
chunk->data = pages;
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
+
+ spin_lock_irq(&pcpu_lock);
+ pcpu_chunk_populated(chunk, 0, nr_pages);
+ spin_unlock_irq(&pcpu_lock);
+
return chunk;
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 3707c71ae4cd..538998a137d2 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
}
/**
- * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
+ * pcpu_get_pages - get temp pages array
* @chunk: chunk of interest
- * @bitmapp: output parameter for bitmap
- * @may_alloc: may allocate the array
*
- * Returns pointer to array of pointers to struct page and bitmap,
- * both of which can be indexed with pcpu_page_idx(). The returned
- * array is cleared to zero and *@bitmapp is copied from
- * @chunk->populated. Note that there is only one array and bitmap
- * and access exclusion is the caller's responsibility.
- *
- * CONTEXT:
- * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
- * Otherwise, don't care.
+ * Returns pointer to array of pointers to struct page which can be indexed
+ * with pcpu_page_idx(). Note that there is only one array and accesses
+ * should be serialized by pcpu_alloc_mutex.
*
* RETURNS:
- * Pointer to temp pages array on success, NULL on failure.
+ * Pointer to temp pages array on success.
*/
-static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
- unsigned long **bitmapp,
- bool may_alloc)
+static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc)
{
static struct page **pages;
- static unsigned long *bitmap;
size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
- size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
- sizeof(unsigned long);
-
- if (!pages || !bitmap) {
- if (may_alloc && !pages)
- pages = pcpu_mem_zalloc(pages_size);
- if (may_alloc && !bitmap)
- bitmap = pcpu_mem_zalloc(bitmap_size);
- if (!pages || !bitmap)
- return NULL;
- }
- bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
+ lockdep_assert_held(&pcpu_alloc_mutex);
- *bitmapp = bitmap;
+ if (!pages)
+ pages = pcpu_mem_zalloc(pages_size);
return pages;
}
@@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
* pcpu_free_pages - free pages which were allocated for @chunk
* @chunk: chunk pages were allocated for
* @pages: array of pages to be freed, indexed by pcpu_page_idx()
- * @populated: populated bitmap
* @page_start: page index of the first page to be freed
* @page_end: page index of the last page to be freed + 1
*
@@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
* The pages were allocated for @chunk.
*/
static void pcpu_free_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
+ struct page **pages, int page_start, int page_end)
{
unsigned int cpu;
int i;
@@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
* pcpu_alloc_pages - allocates pages for @chunk
* @chunk: target chunk
* @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
- * @populated: populated bitmap
* @page_start: page index of the first page to be allocated
* @page_end: page index of the last page to be allocated + 1
*
@@ -104,11 +80,10 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
* content of @pages and will pass it verbatim to pcpu_map_pages().
*/
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
+ struct page **pages, int page_start, int page_end)
{
const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
- unsigned int cpu;
+ unsigned int cpu, tcpu;
int i;
for_each_possible_cpu(cpu) {
@@ -116,14 +91,23 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
*pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
- if (!*pagep) {
- pcpu_free_pages(chunk, pages, populated,
- page_start, page_end);
- return -ENOMEM;
- }
+ if (!*pagep)
+ goto err;
}
}
return 0;
+
+err:
+ while (--i >= page_start)
+ __free_page(pages[pcpu_page_idx(cpu, i)]);
+
+ for_each_possible_cpu(tcpu) {
+ if (tcpu == cpu)
+ break;
+ for (i = page_start; i < page_end; i++)
+ __free_page(pages[pcpu_page_idx(tcpu, i)]);
+ }
+ return -ENOMEM;
}
/**
@@ -155,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
* pcpu_unmap_pages - unmap pages out of a pcpu_chunk
* @chunk: chunk of interest
* @pages: pages array which can be used to pass information to free
- * @populated: populated bitmap
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
*
@@ -166,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
* proper pre/post flush functions.
*/
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
+ struct page **pages, int page_start, int page_end)
{
unsigned int cpu;
int i;
@@ -183,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
page_end - page_start);
}
-
- bitmap_clear(populated, page_start, page_end - page_start);
}
/**
@@ -219,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
* pcpu_map_pages - map pages into a pcpu_chunk
* @chunk: chunk of interest
* @pages: pages array containing pages to be mapped
- * @populated: populated bitmap
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
@@ -227,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
* caller is responsible for calling pcpu_post_map_flush() after all
* mappings are complete.
*
- * This function is responsible for setting corresponding bits in
- * @chunk->populated bitmap and whatever is necessary for reverse
- * lookup (addr -> chunk).
+ * This function is responsible for setting up whatever is necessary for
+ * reverse lookup (addr -> chunk).
*/
static int pcpu_map_pages(struct pcpu_chunk *chunk,
- struct page **pages, unsigned long *populated,
- int page_start, int page_end)
+ struct page **pages, int page_start, int page_end)
{
unsigned int cpu, tcpu;
int i, err;
@@ -244,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk,
page_end - page_start);
if (err < 0)
goto err;
- }
- /* mapping successful, link chunk and mark populated */
- for (i = page_start; i < page_end; i++) {
- for_each_possible_cpu(cpu)
+ for (i = page_start; i < page_end; i++)
pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
chunk);
- __set_bit(i, populated);
}
-
return 0;
-
err:
for_each_possible_cpu(tcpu) {
if (tcpu == cpu)
@@ -263,6 +234,7 @@ err:
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
}
+ pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
return err;
}
@@ -289,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
/**
* pcpu_populate_chunk - populate and map an area of a pcpu_chunk
* @chunk: chunk of interest
- * @off: offset to the area to populate
- * @size: size of the area to populate in bytes
+ * @page_start: the start page
+ * @page_end: the end page
*
* For each cpu, populate and map pages [@page_start,@page_end) into
- * @chunk. The area is cleared on return.
+ * @chunk.
*
* CONTEXT:
* pcpu_alloc_mutex, does GFP_KERNEL allocation.
*/
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
- int free_end = page_start, unmap_end = page_start;
struct page **pages;
- unsigned long *populated;
- unsigned int cpu;
- int rs, re, rc;
-
- /* quick path, check whether all pages are already there */
- rs = page_start;
- pcpu_next_pop(chunk, &rs, &re, page_end);
- if (rs == page_start && re == page_end)
- goto clear;
- /* need to allocate and map pages, this chunk can't be immutable */
- WARN_ON(chunk->immutable);
-
- pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
+ pages = pcpu_get_pages(chunk);
if (!pages)
return -ENOMEM;
- /* alloc and map */
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
- rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
- if (rc)
- goto err_free;
- free_end = re;
- }
+ if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
+ return -ENOMEM;
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
- rc = pcpu_map_pages(chunk, pages, populated, rs, re);
- if (rc)
- goto err_unmap;
- unmap_end = re;
+ if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
+ pcpu_free_pages(chunk, pages, page_start, page_end);
+ return -ENOMEM;
}
pcpu_post_map_flush(chunk, page_start, page_end);
- /* commit new bitmap */
- bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
-clear:
- for_each_possible_cpu(cpu)
- memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
return 0;
-
-err_unmap:
- pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
- pcpu_unmap_pages(chunk, pages, populated, rs, re);
- pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
-err_free:
- pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
- pcpu_free_pages(chunk, pages, populated, rs, re);
- return rc;
}
/**
* pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
* @chunk: chunk to depopulate
- * @off: offset to the area to depopulate
- * @size: size of the area to depopulate in bytes
+ * @page_start: the start page
+ * @page_end: the end page
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
- * from @chunk. If @flush is true, vcache is flushed before unmapping
- * and tlb after.
+ * from @chunk.
*
* CONTEXT:
* pcpu_alloc_mutex.
*/
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
{
- int page_start = PFN_DOWN(off);
- int page_end = PFN_UP(off + size);
struct page **pages;
- unsigned long *populated;
- int rs, re;
-
- /* quick path, check whether it's empty already */
- rs = page_start;
- pcpu_next_unpop(chunk, &rs, &re, page_end);
- if (rs == page_start && re == page_end)
- return;
-
- /* immutable chunks can't be depopulated */
- WARN_ON(chunk->immutable);
/*
* If control reaches here, there must have been at least one
* successful population attempt so the temp pages array must
* be available now.
*/
- pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+ pages = pcpu_get_pages(chunk);
BUG_ON(!pages);
/* unmap and free */
pcpu_pre_unmap_flush(chunk, page_start, page_end);
- pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
- pcpu_unmap_pages(chunk, pages, populated, rs, re);
+ pcpu_unmap_pages(chunk, pages, page_start, page_end);
/* no need to flush tlb, vmalloc will handle it lazily */
- pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
- pcpu_free_pages(chunk, pages, populated, rs, re);
-
- /* commit new bitmap */
- bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+ pcpu_free_pages(chunk, pages, page_start, page_end);
}
static struct pcpu_chunk *pcpu_create_chunk(void)
diff --git a/mm/percpu.c b/mm/percpu.c
index 2139e30a4b44..014bab65e0ff 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,10 @@
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
+#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
+#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
+#define PCPU_EMPTY_POP_PAGES_LOW 2
+#define PCPU_EMPTY_POP_PAGES_HIGH 4
#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -102,12 +106,16 @@ struct pcpu_chunk {
int free_size; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */
void *base_addr; /* base address of this chunk */
+
int map_used; /* # of map entries used before the sentry */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
+ struct work_struct map_extend_work;/* async ->map[] extension */
+
void *data; /* chunk data */
int first_free; /* no free below this */
bool immutable; /* no [de]population allowed */
+ int nr_populated; /* # of populated pages */
unsigned long populated[]; /* populated bitmap */
};
@@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk;
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;
+static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
+
+static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+
/*
- * Synchronization rules.
- *
- * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
- * protects allocation/reclaim paths, chunks, populated bitmap and
- * vmalloc mapping. The latter is a spinlock and protects the index
- * data structures - chunk slots, chunks and area maps in chunks.
- *
- * During allocation, pcpu_alloc_mutex is kept locked all the time and
- * pcpu_lock is grabbed and released as necessary. All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released. In
- * general, percpu memory can't be allocated with irq off but
- * irqsave/restore are still used in alloc path so that it can be used
- * from early init path - sched_init() specifically.
- *
- * Free path accesses and alters only the index data structures, so it
- * can be safely called from atomic context. When memory needs to be
- * returned to the system, free path schedules reclaim_work which
- * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
- * reclaimed, release both locks and frees the chunks. Note that it's
- * necessary to grab both locks to remove a chunk from circulation as
- * allocation path might be referencing the chunk with only
- * pcpu_alloc_mutex locked.
+ * The number of empty populated pages, protected by pcpu_lock. The
+ * reserved chunk doesn't contribute to the count.
*/
-static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
-static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
+static int pcpu_nr_empty_pop_pages;
-static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+/*
+ * Balance work is used to populate or destroy chunks asynchronously. We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
+static void pcpu_balance_workfn(struct work_struct *work);
+static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
-/* reclaim work to release fully free chunks, scheduled from free path */
-static void pcpu_reclaim(struct work_struct *work);
-static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+static void pcpu_schedule_balance_work(void)
+{
+ if (pcpu_async_enabled)
+ schedule_work(&pcpu_balance_work);
+}
static bool pcpu_addr_in_first_chunk(void *addr)
{
@@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size)
}
/**
+ * pcpu_count_occupied_pages - count the number of pages an area occupies
+ * @chunk: chunk of interest
+ * @i: index of the area in question
+ *
+ * Count the number of pages chunk's @i'th area occupies. When the area's
+ * start and/or end address isn't aligned to page boundary, the straddled
+ * page is included in the count iff the rest of the page is free.
+ */
+static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
+{
+ int off = chunk->map[i] & ~1;
+ int end = chunk->map[i + 1] & ~1;
+
+ if (!PAGE_ALIGNED(off) && i > 0) {
+ int prev = chunk->map[i - 1];
+
+ if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
+ off = round_down(off, PAGE_SIZE);
+ }
+
+ if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
+ int next = chunk->map[i + 1];
+ int nend = chunk->map[i + 2] & ~1;
+
+ if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
+ end = round_up(end, PAGE_SIZE);
+ }
+
+ return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
+}
+
+/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
@@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
/**
* pcpu_need_to_extend - determine whether chunk area map needs to be extended
* @chunk: chunk of interest
+ * @is_atomic: the allocation context
*
- * Determine whether area map of @chunk needs to be extended to
- * accommodate a new allocation.
+ * Determine whether area map of @chunk needs to be extended. If
+ * @is_atomic, only the amount necessary for a new allocation is
+ * considered; however, async extension is scheduled if the left amount is
+ * low. If !@is_atomic, it aims for more empty space. Combined, this
+ * ensures that the map is likely to have enough available space to
+ * accomodate atomic allocations which can't extend maps directly.
*
* CONTEXT:
* pcpu_lock.
@@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
* New target map allocation length if extension is necessary, 0
* otherwise.
*/
-static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
{
- int new_alloc;
+ int margin, new_alloc;
+
+ if (is_atomic) {
+ margin = 3;
+
+ if (chunk->map_alloc <
+ chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
+ pcpu_async_enabled)
+ schedule_work(&chunk->map_extend_work);
+ } else {
+ margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
+ }
- if (chunk->map_alloc >= chunk->map_used + 3)
+ if (chunk->map_alloc >= chunk->map_used + margin)
return 0;
new_alloc = PCPU_DFL_MAP_ALLOC;
- while (new_alloc < chunk->map_used + 3)
+ while (new_alloc < chunk->map_used + margin)
new_alloc *= 2;
return new_alloc;
@@ -418,11 +469,76 @@ out_unlock:
return 0;
}
+static void pcpu_map_extend_workfn(struct work_struct *work)
+{
+ struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
+ map_extend_work);
+ int new_alloc;
+
+ spin_lock_irq(&pcpu_lock);
+ new_alloc = pcpu_need_to_extend(chunk, false);
+ spin_unlock_irq(&pcpu_lock);
+
+ if (new_alloc)
+ pcpu_extend_area_map(chunk, new_alloc);
+}
+
+/**
+ * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
+ * @chunk: chunk the candidate area belongs to
+ * @off: the offset to the start of the candidate area
+ * @this_size: the size of the candidate area
+ * @size: the size of the target allocation
+ * @align: the alignment of the target allocation
+ * @pop_only: only allocate from already populated region
+ *
+ * We're trying to allocate @size bytes aligned at @align. @chunk's area
+ * at @off sized @this_size is a candidate. This function determines
+ * whether the target allocation fits in the candidate area and returns the
+ * number of bytes to pad after @off. If the target area doesn't fit, -1
+ * is returned.
+ *
+ * If @pop_only is %true, this function only considers the already
+ * populated part of the candidate area.
+ */
+static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
+ int size, int align, bool pop_only)
+{
+ int cand_off = off;
+
+ while (true) {
+ int head = ALIGN(cand_off, align) - off;
+ int page_start, page_end, rs, re;
+
+ if (this_size < head + size)
+ return -1;
+
+ if (!pop_only)
+ return head;
+
+ /*
+ * If the first unpopulated page is beyond the end of the
+ * allocation, the whole allocation is populated;
+ * otherwise, retry from the end of the unpopulated area.
+ */
+ page_start = PFN_DOWN(head + off);
+ page_end = PFN_UP(head + off + size);
+
+ rs = page_start;
+ pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
+ if (rs >= page_end)
+ return head;
+ cand_off = re * PAGE_SIZE;
+ }
+}
+
/**
* pcpu_alloc_area - allocate area from a pcpu_chunk
* @chunk: chunk of interest
* @size: wanted size in bytes
* @align: wanted align
+ * @pop_only: allocate only from the populated area
+ * @occ_pages_p: out param for the number of pages the area occupies
*
* Try to allocate @size bytes area aligned at @align from @chunk.
* Note that this function only allocates the offset. It doesn't
@@ -437,7 +553,8 @@ out_unlock:
* Allocated offset in @chunk on success, -1 if no matching area is
* found.
*/
-static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
+ bool pop_only, int *occ_pages_p)
{
int oslot = pcpu_chunk_slot(chunk);
int max_contig = 0;
@@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
if (off & 1)
continue;
- /* extra for alignment requirement */
- head = ALIGN(off, align) - off;
-
this_size = (p[1] & ~1) - off;
- if (this_size < head + size) {
+
+ head = pcpu_fit_in_area(chunk, off, this_size, size, align,
+ pop_only);
+ if (head < 0) {
if (!seen_free) {
chunk->first_free = i;
seen_free = true;
@@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
chunk->free_size -= size;
*p |= 1;
+ *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
pcpu_chunk_relocate(chunk, oslot);
return off;
}
@@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
* pcpu_free_area - free area to a pcpu_chunk
* @chunk: chunk of interest
* @freeme: offset of area to free
+ * @occ_pages_p: out param for the number of pages the area occupies
*
* Free area starting from @freeme to @chunk. Note that this function
* only modifies the allocation map. It doesn't depopulate or unmap
@@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
* CONTEXT:
* pcpu_lock.
*/
-static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
+ int *occ_pages_p)
{
int oslot = pcpu_chunk_slot(chunk);
int off = 0;
@@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
*p = off &= ~1;
chunk->free_size += (p[1] & ~1) - off;
+ *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
+
/* merge with next? */
if (!(p[1] & 1))
to_free++;
@@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map_used = 1;
INIT_LIST_HEAD(&chunk->list);
+ INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
@@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
pcpu_mem_free(chunk, pcpu_chunk_struct_size);
}
+/**
+ * pcpu_chunk_populated - post-population bookkeeping
+ * @chunk: pcpu_chunk which got populated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been populated to @chunk. Update
+ * the bookkeeping information accordingly. Must be called after each
+ * successful population.
+ */
+static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ int nr = page_end - page_start;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ bitmap_set(chunk->populated, page_start, nr);
+ chunk->nr_populated += nr;
+ pcpu_nr_empty_pop_pages += nr;
+}
+
+/**
+ * pcpu_chunk_depopulated - post-depopulation bookkeeping
+ * @chunk: pcpu_chunk which got depopulated
+ * @page_start: the start page
+ * @page_end: the end page
+ *
+ * Pages in [@page_start,@page_end) have been depopulated from @chunk.
+ * Update the bookkeeping information accordingly. Must be called after
+ * each successful depopulation.
+ */
+static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
+ int page_start, int page_end)
+{
+ int nr = page_end - page_start;
+
+ lockdep_assert_held(&pcpu_lock);
+
+ bitmap_clear(chunk->populated, page_start, nr);
+ chunk->nr_populated -= nr;
+ pcpu_nr_empty_pop_pages -= nr;
+}
+
/*
* Chunk management implementation.
*
@@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
* @reserved: allocate from the reserved chunk if available
+ * @gfp: allocation flags
*
- * Allocate percpu area of @size bytes aligned at @align.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
+ * contain %GFP_KERNEL, the allocation is atomic.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
-static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
+static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
+ gfp_t gfp)
{
static int warn_limit = 10;
struct pcpu_chunk *chunk;
const char *err;
- int slot, off, new_alloc;
+ bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+ int occ_pages = 0;
+ int slot, off, new_alloc, cpu, ret;
unsigned long flags;
void __percpu *ptr;
@@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
return NULL;
}
- mutex_lock(&pcpu_alloc_mutex);
spin_lock_irqsave(&pcpu_lock, flags);
/* serve reserved allocations from the reserved chunk if available */
@@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
goto fail_unlock;
}
- while ((new_alloc = pcpu_need_to_extend(chunk))) {
+ while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
spin_unlock_irqrestore(&pcpu_lock, flags);
- if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+ if (is_atomic ||
+ pcpu_extend_area_map(chunk, new_alloc) < 0) {
err = "failed to extend area map of reserved chunk";
- goto fail_unlock_mutex;
+ goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
}
- off = pcpu_alloc_area(chunk, size, align);
+ off = pcpu_alloc_area(chunk, size, align, is_atomic,
+ &occ_pages);
if (off >= 0)
goto area_found;
@@ -764,13 +934,15 @@ restart:
if (size > chunk->contig_hint)
continue;
- new_alloc = pcpu_need_to_extend(chunk);
+ new_alloc = pcpu_need_to_extend(chunk, is_atomic);
if (new_alloc) {
+ if (is_atomic)
+ continue;
spin_unlock_irqrestore(&pcpu_lock, flags);
if (pcpu_extend_area_map(chunk,
new_alloc) < 0) {
err = "failed to extend area map";
- goto fail_unlock_mutex;
+ goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
/*
@@ -780,74 +952,134 @@ restart:
goto restart;
}
- off = pcpu_alloc_area(chunk, size, align);
+ off = pcpu_alloc_area(chunk, size, align, is_atomic,
+ &occ_pages);
if (off >= 0)
goto area_found;
}
}
- /* hmmm... no space left, create a new chunk */
spin_unlock_irqrestore(&pcpu_lock, flags);
- chunk = pcpu_create_chunk();
- if (!chunk) {
- err = "failed to allocate new chunk";
- goto fail_unlock_mutex;
+ /*
+ * No space left. Create a new chunk. We don't want multiple
+ * tasks to create chunks simultaneously. Serialize and create iff
+ * there's still no empty chunk after grabbing the mutex.
+ */
+ if (is_atomic)
+ goto fail;
+
+ mutex_lock(&pcpu_alloc_mutex);
+
+ if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
+ chunk = pcpu_create_chunk();
+ if (!chunk) {
+ mutex_unlock(&pcpu_alloc_mutex);
+ err = "failed to allocate new chunk";
+ goto fail;
+ }
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ pcpu_chunk_relocate(chunk, -1);
+ } else {
+ spin_lock_irqsave(&pcpu_lock, flags);
}
- spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_chunk_relocate(chunk, -1);
+ mutex_unlock(&pcpu_alloc_mutex);
goto restart;
area_found:
spin_unlock_irqrestore(&pcpu_lock, flags);
- /* populate, map and clear the area */
- if (pcpu_populate_chunk(chunk, off, size)) {
- spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_free_area(chunk, off);
- err = "failed to populate";
- goto fail_unlock;
+ /* populate if not all pages are already there */
+ if (!is_atomic) {
+ int page_start, page_end, rs, re;
+
+ mutex_lock(&pcpu_alloc_mutex);
+
+ page_start = PFN_DOWN(off);
+ page_end = PFN_UP(off + size);
+
+ pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+ WARN_ON(chunk->immutable);
+
+ ret = pcpu_populate_chunk(chunk, rs, re);
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ if (ret) {
+ mutex_unlock(&pcpu_alloc_mutex);
+ pcpu_free_area(chunk, off, &occ_pages);
+ err = "failed to populate";
+ goto fail_unlock;
+ }
+ pcpu_chunk_populated(chunk, rs, re);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
+ }
+
+ mutex_unlock(&pcpu_alloc_mutex);
}
- mutex_unlock(&pcpu_alloc_mutex);
+ if (chunk != pcpu_reserved_chunk)
+ pcpu_nr_empty_pop_pages -= occ_pages;
+
+ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+ pcpu_schedule_balance_work();
+
+ /* clear the areas and return address relative to base address */
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
- /* return address relative to base address */
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
kmemleak_alloc_percpu(ptr, size);
return ptr;
fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
-fail_unlock_mutex:
- mutex_unlock(&pcpu_alloc_mutex);
- if (warn_limit) {
- pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
- "%s\n", size, align, err);
+fail:
+ if (!is_atomic && warn_limit) {
+ pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
+ size, align, is_atomic, err);
dump_stack();
if (!--warn_limit)
pr_info("PERCPU: limit reached, disable warning\n");
}
+ if (is_atomic) {
+ /* see the flag handling in pcpu_blance_workfn() */
+ pcpu_atomic_alloc_failed = true;
+ pcpu_schedule_balance_work();
+ }
return NULL;
}
/**
- * __alloc_percpu - allocate dynamic percpu area
+ * __alloc_percpu_gfp - allocate dynamic percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
+ * @gfp: allocation flags
*
- * Allocate zero-filled percpu area of @size bytes aligned at @align.
- * Might sleep. Might trigger writeouts.
- *
- * CONTEXT:
- * Does GFP_KERNEL allocation.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align. If
+ * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
+ * be called from any context but is a lot more likely to fail.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
+void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
+{
+ return pcpu_alloc(size, align, false, gfp);
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
+ */
void __percpu *__alloc_percpu(size_t size, size_t align)
{
- return pcpu_alloc(size, align, false);
+ return pcpu_alloc(size, align, false, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
@@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
*/
void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
{
- return pcpu_alloc(size, align, true);
+ return pcpu_alloc(size, align, true, GFP_KERNEL);
}
/**
- * pcpu_reclaim - reclaim fully free chunks, workqueue function
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
* @work: unused
*
* Reclaim all fully free chunks except for the first one.
- *
- * CONTEXT:
- * workqueue context.
*/
-static void pcpu_reclaim(struct work_struct *work)
+static void pcpu_balance_workfn(struct work_struct *work)
{
- LIST_HEAD(todo);
- struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
+ LIST_HEAD(to_free);
+ struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
+ int slot, nr_to_pop, ret;
+ /*
+ * There's no reason to keep around multiple unused chunks and VM
+ * areas can be scarce. Destroy all free chunks except for one.
+ */
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
- list_for_each_entry_safe(chunk, next, head, list) {
+ list_for_each_entry_safe(chunk, next, free_head, list) {
WARN_ON(chunk->immutable);
/* spare the first one */
- if (chunk == list_first_entry(head, struct pcpu_chunk, list))
+ if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
continue;
- list_move(&chunk->list, &todo);
+ list_move(&chunk->list, &to_free);
}
spin_unlock_irq(&pcpu_lock);
- list_for_each_entry_safe(chunk, next, &todo, list) {
- pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
+ list_for_each_entry_safe(chunk, next, &to_free, list) {
+ int rs, re;
+
+ pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+ pcpu_depopulate_chunk(chunk, rs, re);
+ spin_lock_irq(&pcpu_lock);
+ pcpu_chunk_depopulated(chunk, rs, re);
+ spin_unlock_irq(&pcpu_lock);
+ }
pcpu_destroy_chunk(chunk);
}
+ /*
+ * Ensure there are certain number of free populated pages for
+ * atomic allocs. Fill up from the most packed so that atomic
+ * allocs don't increase fragmentation. If atomic allocation
+ * failed previously, always populate the maximum amount. This
+ * should prevent atomic allocs larger than PAGE_SIZE from keeping
+ * failing indefinitely; however, large atomic allocs are not
+ * something we support properly and can be highly unreliable and
+ * inefficient.
+ */
+retry_pop:
+ if (pcpu_atomic_alloc_failed) {
+ nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+ /* best effort anyway, don't worry about synchronization */
+ pcpu_atomic_alloc_failed = false;
+ } else {
+ nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+ pcpu_nr_empty_pop_pages,
+ 0, PCPU_EMPTY_POP_PAGES_HIGH);
+ }
+
+ for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+ int nr_unpop = 0, rs, re;
+
+ if (!nr_to_pop)
+ break;
+
+ spin_lock_irq(&pcpu_lock);
+ list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+ if (nr_unpop)
+ break;
+ }
+ spin_unlock_irq(&pcpu_lock);
+
+ if (!nr_unpop)
+ continue;
+
+ /* @chunk can't go away while pcpu_alloc_mutex is held */
+ pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+ int nr = min(re - rs, nr_to_pop);
+
+ ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+ if (!ret) {
+ nr_to_pop -= nr;
+ spin_lock_irq(&pcpu_lock);
+ pcpu_chunk_populated(chunk, rs, rs + nr);
+ spin_unlock_irq(&pcpu_lock);
+ } else {
+ nr_to_pop = 0;
+ }
+
+ if (!nr_to_pop)
+ break;
+ }
+ }
+
+ if (nr_to_pop) {
+ /* ran out of chunks to populate, create a new one and retry */
+ chunk = pcpu_create_chunk();
+ if (chunk) {
+ spin_lock_irq(&pcpu_lock);
+ pcpu_chunk_relocate(chunk, -1);
+ spin_unlock_irq(&pcpu_lock);
+ goto retry_pop;
+ }
+ }
+
mutex_unlock(&pcpu_alloc_mutex);
}
@@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr)
void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
- int off;
+ int off, occ_pages;
if (!ptr)
return;
@@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr)
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
- pcpu_free_area(chunk, off);
+ pcpu_free_area(chunk, off, &occ_pages);
+
+ if (chunk != pcpu_reserved_chunk)
+ pcpu_nr_empty_pop_pages += occ_pages;
/* if there are more than one fully free chunks, wake up grim reaper */
if (chunk->free_size == pcpu_unit_size) {
@@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr)
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
if (pos != chunk) {
- schedule_work(&pcpu_reclaim_work);
+ pcpu_schedule_balance_work();
break;
}
}
@@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
*/
schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
+ INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
schunk->base_addr = base_addr;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
schunk->immutable = true;
bitmap_fill(schunk->populated, pcpu_unit_pages);
+ schunk->nr_populated = pcpu_unit_pages;
if (ai->reserved_size) {
schunk->free_size = ai->reserved_size;
@@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (dyn_size) {
dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
+ INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
dchunk->base_addr = base_addr;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
dchunk->immutable = true;
bitmap_fill(dchunk->populated, pcpu_unit_pages);
+ dchunk->nr_populated = pcpu_unit_pages;
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[0] = 1;
@@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
+ pcpu_nr_empty_pop_pages +=
+ pcpu_count_occupied_pages(pcpu_first_chunk, 1);
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* we're done */
@@ -1965,3 +2283,15 @@ void __init percpu_init_late(void)
spin_unlock_irqrestore(&pcpu_lock, flags);
}
}
+
+/*
+ * Percpu allocator is initialized early during boot when neither slab or
+ * workqueue is available. Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+ pcpu_async_enabled = true;
+ return 0;
+}
+subsys_initcall(percpu_enable_async);
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a8b919925934..dfb79e028ecb 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -195,7 +195,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t entry = *pmdp;
if (pmd_numa(entry))
entry = pmd_mknonnuma(entry);
- set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
+ set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/mm/rmap.c b/mm/rmap.c
index 3e8491c504f8..116a5053415b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
unsigned long address = __vma_address(page, vma);
/* page should be within @vma mapping range */
- VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
return address;
}
@@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page,
struct anon_vma *anon_vma = vma->anon_vma;
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON(!anon_vma);
+ VM_BUG_ON_VMA(!anon_vma, vma);
VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page,
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
- VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
if (PageTransHuge(page))
@@ -1355,7 +1355,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
continue; /* don't unmap */
}
- if (ptep_clear_flush_young_notify(vma, address, pte))
+ /*
+ * No need for _notify because we're within an
+ * mmu_notifier_invalidate_range_ {start|end} scope.
+ */
+ if (ptep_clear_flush_young(vma, address, pte))
continue;
/* Nuke the page table entry. */
@@ -1666,7 +1670,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
* structure at mapping cannot be freed and reused yet,
* so we can safely take mapping->i_mmap_mutex.
*/
- VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!mapping)
return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 0e5fb225007c..cd6fc7590e54 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2367,8 +2367,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
if (new_dentry->d_inode) {
(void) shmem_unlink(new_dir, new_dentry);
- if (they_are_dirs)
+ if (they_are_dirs) {
+ drop_nlink(new_dentry->d_inode);
drop_nlink(old_dir);
+ }
} else if (they_are_dirs) {
drop_nlink(old_dir);
inc_nlink(new_dir);
@@ -2993,7 +2995,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#endif
spin_lock_init(&sbinfo->stat_lock);
- if (percpu_counter_init(&sbinfo->used_blocks, 0))
+ if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
sbinfo->free_inodes = sbinfo->max_inodes;
@@ -3075,7 +3077,9 @@ static const struct address_space_operations shmem_aops = {
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
#endif
+#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
+#endif
.error_remove_page = generic_error_remove_page,
};
diff --git a/mm/slab.c b/mm/slab.c
index a467b308c682..eb2b2ea30130 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -237,11 +237,10 @@ struct arraycache_init {
/*
* Need this for bootstrapping a per node allocator.
*/
-#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
+#define NUM_INIT_LISTS (2 * MAX_NUMNODES)
static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS];
#define CACHE_CACHE 0
-#define SIZE_AC MAX_NUMNODES
-#define SIZE_NODE (2 * MAX_NUMNODES)
+#define SIZE_NODE (MAX_NUMNODES)
static int drain_freelist(struct kmem_cache *cache,
struct kmem_cache_node *n, int tofree);
@@ -253,7 +252,6 @@ static void cache_reap(struct work_struct *unused);
static int slab_early_init = 1;
-#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init))
#define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
static void kmem_cache_node_init(struct kmem_cache_node *parent)
@@ -458,9 +456,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache,
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
-static struct arraycache_init initarray_generic =
- { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
-
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
.batchcount = 1,
@@ -476,7 +471,7 @@ static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
{
- return cachep->array[smp_processor_id()];
+ return this_cpu_ptr(cachep->cpu_cache);
}
static size_t calculate_freelist_size(int nr_objs, size_t align)
@@ -785,8 +780,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep,
return objp;
}
-static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
- void *objp)
+static noinline void *__ac_put_obj(struct kmem_cache *cachep,
+ struct array_cache *ac, void *objp)
{
if (unlikely(pfmemalloc_active)) {
/* Some pfmemalloc slabs exist, check if this is one */
@@ -984,46 +979,50 @@ static void drain_alien_cache(struct kmem_cache *cachep,
}
}
-static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
+ int node, int page_node)
{
- int nodeid = page_to_nid(virt_to_page(objp));
struct kmem_cache_node *n;
struct alien_cache *alien = NULL;
struct array_cache *ac;
- int node;
LIST_HEAD(list);
- node = numa_mem_id();
-
- /*
- * Make sure we are not freeing a object from another node to the array
- * cache on this cpu.
- */
- if (likely(nodeid == node))
- return 0;
-
n = get_node(cachep, node);
STATS_INC_NODEFREES(cachep);
- if (n->alien && n->alien[nodeid]) {
- alien = n->alien[nodeid];
+ if (n->alien && n->alien[page_node]) {
+ alien = n->alien[page_node];
ac = &alien->ac;
spin_lock(&alien->lock);
if (unlikely(ac->avail == ac->limit)) {
STATS_INC_ACOVERFLOW(cachep);
- __drain_alien_cache(cachep, ac, nodeid, &list);
+ __drain_alien_cache(cachep, ac, page_node, &list);
}
ac_put_obj(cachep, ac, objp);
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
- n = get_node(cachep, nodeid);
+ n = get_node(cachep, page_node);
spin_lock(&n->list_lock);
- free_block(cachep, &objp, 1, nodeid, &list);
+ free_block(cachep, &objp, 1, page_node, &list);
spin_unlock(&n->list_lock);
slabs_destroy(cachep, &list);
}
return 1;
}
+
+static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
+{
+ int page_node = page_to_nid(virt_to_page(objp));
+ int node = numa_mem_id();
+ /*
+ * Make sure we are not freeing a object from another node to the array
+ * cache on this cpu.
+ */
+ if (likely(node == page_node))
+ return 0;
+
+ return __cache_free_alien(cachep, objp, node, page_node);
+}
#endif
/*
@@ -1092,24 +1091,25 @@ static void cpuup_canceled(long cpu)
struct alien_cache **alien;
LIST_HEAD(list);
- /* cpu is dead; no one can alloc from it. */
- nc = cachep->array[cpu];
- cachep->array[cpu] = NULL;
n = get_node(cachep, node);
-
if (!n)
- goto free_array_cache;
+ continue;
spin_lock_irq(&n->list_lock);
/* Free limit for this kmem_cache_node */
n->free_limit -= cachep->batchcount;
- if (nc)
+
+ /* cpu is dead; no one can alloc from it. */
+ nc = per_cpu_ptr(cachep->cpu_cache, cpu);
+ if (nc) {
free_block(cachep, nc->entry, nc->avail, node, &list);
+ nc->avail = 0;
+ }
if (!cpumask_empty(mask)) {
spin_unlock_irq(&n->list_lock);
- goto free_array_cache;
+ goto free_slab;
}
shared = n->shared;
@@ -1129,9 +1129,9 @@ static void cpuup_canceled(long cpu)
drain_alien_cache(cachep, alien);
free_alien_cache(alien);
}
-free_array_cache:
+
+free_slab:
slabs_destroy(cachep, &list);
- kfree(nc);
}
/*
* In the previous loop, all the objects were freed to
@@ -1168,32 +1168,23 @@ static int cpuup_prepare(long cpu)
* array caches
*/
list_for_each_entry(cachep, &slab_caches, list) {
- struct array_cache *nc;
struct array_cache *shared = NULL;
struct alien_cache **alien = NULL;
- nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount, GFP_KERNEL);
- if (!nc)
- goto bad;
if (cachep->shared) {
shared = alloc_arraycache(node,
cachep->shared * cachep->batchcount,
0xbaadf00d, GFP_KERNEL);
- if (!shared) {
- kfree(nc);
+ if (!shared)
goto bad;
- }
}
if (use_alien_caches) {
alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
if (!alien) {
kfree(shared);
- kfree(nc);
goto bad;
}
}
- cachep->array[cpu] = nc;
n = get_node(cachep, node);
BUG_ON(!n);
@@ -1385,15 +1376,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
}
/*
- * The memory after the last cpu cache pointer is used for the
- * the node pointer.
- */
-static void setup_node_pointer(struct kmem_cache *cachep)
-{
- cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids];
-}
-
-/*
* Initialisation. Called after the page allocator have been initialised and
* before smp_init().
*/
@@ -1404,7 +1386,6 @@ void __init kmem_cache_init(void)
BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
sizeof(struct rcu_head));
kmem_cache = &kmem_cache_boot;
- setup_node_pointer(kmem_cache);
if (num_possible_nodes() == 1)
use_alien_caches = 0;
@@ -1412,8 +1393,6 @@ void __init kmem_cache_init(void)
for (i = 0; i < NUM_INIT_LISTS; i++)
kmem_cache_node_init(&init_kmem_cache_node[i]);
- set_up_node(kmem_cache, CACHE_CACHE);
-
/*
* Fragmentation resistance on low memory - only use bigger
* page orders on machines with more than 32MB of memory if
@@ -1448,49 +1427,22 @@ void __init kmem_cache_init(void)
* struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
*/
create_boot_cache(kmem_cache, "kmem_cache",
- offsetof(struct kmem_cache, array[nr_cpu_ids]) +
+ offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN);
list_add(&kmem_cache->list, &slab_caches);
-
- /* 2+3) create the kmalloc caches */
+ slab_state = PARTIAL;
/*
- * Initialize the caches that provide memory for the array cache and the
- * kmem_cache_node structures first. Without this, further allocations will
- * bug.
+ * Initialize the caches that provide memory for the kmem_cache_node
+ * structures first. Without this, further allocations will bug.
*/
-
- kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac",
- kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS);
-
- if (INDEX_AC != INDEX_NODE)
- kmalloc_caches[INDEX_NODE] =
- create_kmalloc_cache("kmalloc-node",
+ kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node",
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS);
+ slab_state = PARTIAL_NODE;
slab_early_init = 0;
- /* 4) Replace the bootstrap head arrays */
- {
- struct array_cache *ptr;
-
- ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
- memcpy(ptr, cpu_cache_get(kmem_cache),
- sizeof(struct arraycache_init));
-
- kmem_cache->array[smp_processor_id()] = ptr;
-
- ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
-
- BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC])
- != &initarray_generic.cache);
- memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]),
- sizeof(struct arraycache_init));
-
- kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr;
- }
/* 5) Replace the bootstrap kmem_cache_node */
{
int nid;
@@ -1498,13 +1450,8 @@ void __init kmem_cache_init(void)
for_each_online_node(nid) {
init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
- init_list(kmalloc_caches[INDEX_AC],
- &init_kmem_cache_node[SIZE_AC + nid], nid);
-
- if (INDEX_AC != INDEX_NODE) {
- init_list(kmalloc_caches[INDEX_NODE],
+ init_list(kmalloc_caches[INDEX_NODE],
&init_kmem_cache_node[SIZE_NODE + nid], nid);
- }
}
}
@@ -2037,56 +1984,53 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
return left_over;
}
+static struct array_cache __percpu *alloc_kmem_cache_cpus(
+ struct kmem_cache *cachep, int entries, int batchcount)
+{
+ int cpu;
+ size_t size;
+ struct array_cache __percpu *cpu_cache;
+
+ size = sizeof(void *) * entries + sizeof(struct array_cache);
+ cpu_cache = __alloc_percpu(size, sizeof(void *));
+
+ if (!cpu_cache)
+ return NULL;
+
+ for_each_possible_cpu(cpu) {
+ init_arraycache(per_cpu_ptr(cpu_cache, cpu),
+ entries, batchcount);
+ }
+
+ return cpu_cache;
+}
+
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
if (slab_state >= FULL)
return enable_cpucache(cachep, gfp);
+ cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1);
+ if (!cachep->cpu_cache)
+ return 1;
+
if (slab_state == DOWN) {
- /*
- * Note: Creation of first cache (kmem_cache).
- * The setup_node is taken care
- * of by the caller of __kmem_cache_create
- */
- cachep->array[smp_processor_id()] = &initarray_generic.cache;
- slab_state = PARTIAL;
+ /* Creation of first cache (kmem_cache). */
+ set_up_node(kmem_cache, CACHE_CACHE);
} else if (slab_state == PARTIAL) {
- /*
- * Note: the second kmem_cache_create must create the cache
- * that's used by kmalloc(24), otherwise the creation of
- * further caches will BUG().
- */
- cachep->array[smp_processor_id()] = &initarray_generic.cache;
-
- /*
- * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is
- * the second cache, then we need to set up all its node/,
- * otherwise the creation of further caches will BUG().
- */
- set_up_node(cachep, SIZE_AC);
- if (INDEX_AC == INDEX_NODE)
- slab_state = PARTIAL_NODE;
- else
- slab_state = PARTIAL_ARRAYCACHE;
+ /* For kmem_cache_node */
+ set_up_node(cachep, SIZE_NODE);
} else {
- /* Remaining boot caches */
- cachep->array[smp_processor_id()] =
- kmalloc(sizeof(struct arraycache_init), gfp);
+ int node;
- if (slab_state == PARTIAL_ARRAYCACHE) {
- set_up_node(cachep, SIZE_NODE);
- slab_state = PARTIAL_NODE;
- } else {
- int node;
- for_each_online_node(node) {
- cachep->node[node] =
- kmalloc_node(sizeof(struct kmem_cache_node),
- gfp, node);
- BUG_ON(!cachep->node[node]);
- kmem_cache_node_init(cachep->node[node]);
- }
+ for_each_online_node(node) {
+ cachep->node[node] = kmalloc_node(
+ sizeof(struct kmem_cache_node), gfp, node);
+ BUG_ON(!cachep->node[node]);
+ kmem_cache_node_init(cachep->node[node]);
}
}
+
cachep->node[numa_mem_id()]->next_reap =
jiffies + REAPTIMEOUT_NODE +
((unsigned long)cachep) % REAPTIMEOUT_NODE;
@@ -2100,6 +2044,32 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
return 0;
}
+unsigned long kmem_cache_flags(unsigned long object_size,
+ unsigned long flags, const char *name,
+ void (*ctor)(void *))
+{
+ return flags;
+}
+
+struct kmem_cache *
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+ unsigned long flags, void (*ctor)(void *))
+{
+ struct kmem_cache *cachep;
+
+ cachep = find_mergeable(size, align, flags, name, ctor);
+ if (cachep) {
+ cachep->refcount++;
+
+ /*
+ * Adjust the object sizes so that we clear
+ * the complete object on kzalloc.
+ */
+ cachep->object_size = max_t(int, cachep->object_size, size);
+ }
+ return cachep;
+}
+
/**
* __kmem_cache_create - Create a cache.
* @cachep: cache management descriptor
@@ -2124,7 +2094,8 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, freelist_size, ralign;
+ size_t left_over, freelist_size;
+ size_t ralign = BYTES_PER_WORD;
gfp_t gfp;
int err;
size_t size = cachep->size;
@@ -2157,14 +2128,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
size &= ~(BYTES_PER_WORD - 1);
}
- /*
- * Redzoning and user store require word alignment or possibly larger.
- * Note this will be overridden by architecture or caller mandated
- * alignment if either is greater than BYTES_PER_WORD.
- */
- if (flags & SLAB_STORE_USER)
- ralign = BYTES_PER_WORD;
-
if (flags & SLAB_RED_ZONE) {
ralign = REDZONE_ALIGN;
/* If redzoning, ensure that the second redzone is suitably
@@ -2190,7 +2153,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
else
gfp = GFP_NOWAIT;
- setup_node_pointer(cachep);
#if DEBUG
/*
@@ -2447,8 +2409,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
if (rc)
return rc;
- for_each_online_cpu(i)
- kfree(cachep->array[i]);
+ free_percpu(cachep->cpu_cache);
/* NUMA: free the node structures */
for_each_kmem_cache_node(cachep, i, n) {
@@ -2994,7 +2955,7 @@ out:
#ifdef CONFIG_NUMA
/*
- * Try allocating on another node if PF_SPREAD_SLAB is a mempolicy is set.
+ * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
*
* If we are in_interrupt, then process context, including cpusets and
* mempolicy, may not apply and should not be used for allocation policy.
@@ -3226,7 +3187,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
{
void *objp;
- if (current->mempolicy || unlikely(current->flags & PF_SPREAD_SLAB)) {
+ if (current->mempolicy || cpuset_do_slab_mem_spread()) {
objp = alternate_node_alloc(cache, flags);
if (objp)
goto out;
@@ -3406,7 +3367,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
if (nr_online_nodes > 1 && cache_free_alien(cachep, objp))
return;
- if (likely(ac->avail < ac->limit)) {
+ if (ac->avail < ac->limit) {
STATS_INC_FREEHIT(cachep);
} else {
STATS_INC_FREEMISS(cachep);
@@ -3503,7 +3464,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller)
return kmem_cache_alloc_node_trace(cachep, flags, node, size);
}
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
return __do_kmalloc_node(size, flags, node, _RET_IP_);
@@ -3516,13 +3476,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
return __do_kmalloc_node(size, flags, node, caller);
}
EXPORT_SYMBOL(__kmalloc_node_track_caller);
-#else
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
- return __do_kmalloc_node(size, flags, node, 0);
-}
-EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
#endif /* CONFIG_NUMA */
/**
@@ -3548,8 +3501,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
return ret;
}
-
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
void *__kmalloc(size_t size, gfp_t flags)
{
return __do_kmalloc(size, flags, _RET_IP_);
@@ -3562,14 +3513,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller)
}
EXPORT_SYMBOL(__kmalloc_track_caller);
-#else
-void *__kmalloc(size_t size, gfp_t flags)
-{
- return __do_kmalloc(size, flags, 0);
-}
-EXPORT_SYMBOL(__kmalloc);
-#endif
-
/**
* kmem_cache_free - Deallocate an object
* @cachep: The cache the allocation was from.
@@ -3714,72 +3657,45 @@ fail:
return -ENOMEM;
}
-struct ccupdate_struct {
- struct kmem_cache *cachep;
- struct array_cache *new[0];
-};
-
-static void do_ccupdate_local(void *info)
-{
- struct ccupdate_struct *new = info;
- struct array_cache *old;
-
- check_irq_off();
- old = cpu_cache_get(new->cachep);
-
- new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
- new->new[smp_processor_id()] = old;
-}
-
/* Always called with the slab_mutex held */
static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
- struct ccupdate_struct *new;
- int i;
+ struct array_cache __percpu *cpu_cache, *prev;
+ int cpu;
- new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
- gfp);
- if (!new)
+ cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount);
+ if (!cpu_cache)
return -ENOMEM;
- for_each_online_cpu(i) {
- new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
- batchcount, gfp);
- if (!new->new[i]) {
- for (i--; i >= 0; i--)
- kfree(new->new[i]);
- kfree(new);
- return -ENOMEM;
- }
- }
- new->cachep = cachep;
-
- on_each_cpu(do_ccupdate_local, (void *)new, 1);
+ prev = cachep->cpu_cache;
+ cachep->cpu_cache = cpu_cache;
+ kick_all_cpus_sync();
check_irq_on();
cachep->batchcount = batchcount;
cachep->limit = limit;
cachep->shared = shared;
- for_each_online_cpu(i) {
+ if (!prev)
+ goto alloc_node;
+
+ for_each_online_cpu(cpu) {
LIST_HEAD(list);
- struct array_cache *ccold = new->new[i];
int node;
struct kmem_cache_node *n;
+ struct array_cache *ac = per_cpu_ptr(prev, cpu);
- if (!ccold)
- continue;
-
- node = cpu_to_mem(i);
+ node = cpu_to_mem(cpu);
n = get_node(cachep, node);
spin_lock_irq(&n->list_lock);
- free_block(cachep, ccold->entry, ccold->avail, node, &list);
+ free_block(cachep, ac->entry, ac->avail, node, &list);
spin_unlock_irq(&n->list_lock);
slabs_destroy(cachep, &list);
- kfree(ccold);
}
- kfree(new);
+ free_percpu(prev);
+
+alloc_node:
return alloc_kmem_cache_node(cachep, gfp);
}
@@ -4262,19 +4178,15 @@ static const struct seq_operations slabstats_op = {
static int slabstats_open(struct inode *inode, struct file *file)
{
- unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
- int ret = -ENOMEM;
- if (n) {
- ret = seq_open(file, &slabstats_op);
- if (!ret) {
- struct seq_file *m = file->private_data;
- *n = PAGE_SIZE / (2 * sizeof(unsigned long));
- m->private = n;
- n = NULL;
- }
- kfree(n);
- }
- return ret;
+ unsigned long *n;
+
+ n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
+ if (!n)
+ return -ENOMEM;
+
+ *n = PAGE_SIZE / (2 * sizeof(unsigned long));
+
+ return 0;
}
static const struct file_operations proc_slabstats_operations = {
diff --git a/mm/slab.h b/mm/slab.h
index 0e0fdd365840..ab019e63e3c2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,41 @@
* Internal slab definitions
*/
+#ifdef CONFIG_SLOB
+/*
+ * Common fields provided in kmem_cache by all slab allocators
+ * This struct is either used directly by the allocator (SLOB)
+ * or the allocator must include definitions for all fields
+ * provided in kmem_cache_common in their definition of kmem_cache.
+ *
+ * Once we can do anonymous structs (C11 standard) we could put a
+ * anonymous struct definition in these allocators so that the
+ * separate allocations in the kmem_cache structure of SLAB and
+ * SLUB is no longer needed.
+ */
+struct kmem_cache {
+ unsigned int object_size;/* The original size of the object */
+ unsigned int size; /* The aligned/padded/added on size */
+ unsigned int align; /* Alignment as calculated */
+ unsigned long flags; /* Active flags on the slab */
+ const char *name; /* Slab name for sysfs */
+ int refcount; /* Use counter */
+ void (*ctor)(void *); /* Called on object slot creation */
+ struct list_head list; /* List of all slab caches on the system */
+};
+
+#endif /* CONFIG_SLOB */
+
+#ifdef CONFIG_SLAB
+#include <linux/slab_def.h>
+#endif
+
+#ifdef CONFIG_SLUB
+#include <linux/slub_def.h>
+#endif
+
+#include <linux/memcontrol.h>
+
/*
* State of the slab allocator.
*
@@ -15,7 +50,6 @@
enum slab_state {
DOWN, /* No slab functionality yet */
PARTIAL, /* SLUB: kmem_cache_node available */
- PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */
UP, /* Slab caches usable but not all extras yet */
FULL /* Everything is working */
@@ -53,15 +87,30 @@ extern void create_boot_cache(struct kmem_cache *, const char *name,
size_t size, unsigned long flags);
struct mem_cgroup;
-#ifdef CONFIG_SLUB
+
+int slab_unmergeable(struct kmem_cache *s);
+struct kmem_cache *find_mergeable(size_t size, size_t align,
+ unsigned long flags, const char *name, void (*ctor)(void *));
+#ifndef CONFIG_SLOB
struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *));
+
+unsigned long kmem_cache_flags(unsigned long object_size,
+ unsigned long flags, const char *name,
+ void (*ctor)(void *));
#else
static inline struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{ return NULL; }
+
+static inline unsigned long kmem_cache_flags(unsigned long object_size,
+ unsigned long flags, const char *name,
+ void (*ctor)(void *))
+{
+ return flags;
+}
#endif
@@ -303,8 +352,8 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
* a kmem_cache_node structure allocated (which is true for all online nodes)
*/
#define for_each_kmem_cache_node(__s, __node, __n) \
- for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \
- if (__n)
+ for (__node = 0; __node < nr_node_ids; __node++) \
+ if ((__n = get_node(__s, __node)))
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index d319502b2403..3a6e0cfdf03a 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -30,6 +30,43 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
+/*
+ * Set of flags that will prevent slab merging
+ */
+#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
+ SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
+ SLAB_FAILSLAB)
+
+#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
+ SLAB_CACHE_DMA | SLAB_NOTRACK)
+
+/*
+ * Merge control. If this is set then no merging of slab caches will occur.
+ * (Could be removed. This was introduced to pacify the merge skeptics.)
+ */
+static int slab_nomerge;
+
+static int __init setup_slab_nomerge(char *str)
+{
+ slab_nomerge = 1;
+ return 1;
+}
+
+#ifdef CONFIG_SLUB
+__setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0);
+#endif
+
+__setup("slab_nomerge", setup_slab_nomerge);
+
+/*
+ * Determine the size of a slab object
+ */
+unsigned int kmem_cache_size(struct kmem_cache *s)
+{
+ return s->object_size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, size_t size)
{
@@ -79,6 +116,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
#endif
#ifdef CONFIG_MEMCG_KMEM
+static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
+ struct kmem_cache *s, struct kmem_cache *root_cache)
+{
+ size_t size;
+
+ if (!memcg_kmem_enabled())
+ return 0;
+
+ if (!memcg) {
+ size = offsetof(struct memcg_cache_params, memcg_caches);
+ size += memcg_limited_groups_array_size * sizeof(void *);
+ } else
+ size = sizeof(struct memcg_cache_params);
+
+ s->memcg_params = kzalloc(size, GFP_KERNEL);
+ if (!s->memcg_params)
+ return -ENOMEM;
+
+ if (memcg) {
+ s->memcg_params->memcg = memcg;
+ s->memcg_params->root_cache = root_cache;
+ } else
+ s->memcg_params->is_root_cache = true;
+
+ return 0;
+}
+
+static void memcg_free_cache_params(struct kmem_cache *s)
+{
+ kfree(s->memcg_params);
+}
+
+static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
+{
+ int size;
+ struct memcg_cache_params *new_params, *cur_params;
+
+ BUG_ON(!is_root_cache(s));
+
+ size = offsetof(struct memcg_cache_params, memcg_caches);
+ size += num_memcgs * sizeof(void *);
+
+ new_params = kzalloc(size, GFP_KERNEL);
+ if (!new_params)
+ return -ENOMEM;
+
+ cur_params = s->memcg_params;
+ memcpy(new_params->memcg_caches, cur_params->memcg_caches,
+ memcg_limited_groups_array_size * sizeof(void *));
+
+ new_params->is_root_cache = true;
+
+ rcu_assign_pointer(s->memcg_params, new_params);
+ if (cur_params)
+ kfree_rcu(cur_params, rcu_head);
+
+ return 0;
+}
+
int memcg_update_all_caches(int num_memcgs)
{
struct kmem_cache *s;
@@ -89,9 +185,8 @@ int memcg_update_all_caches(int num_memcgs)
if (!is_root_cache(s))
continue;
- ret = memcg_update_cache_size(s, num_memcgs);
+ ret = memcg_update_cache_params(s, num_memcgs);
/*
- * See comment in memcontrol.c, memcg_update_cache_size:
* Instead of freeing the memory, we'll just leave the caches
* up to this point in an updated state.
*/
@@ -104,7 +199,80 @@ out:
mutex_unlock(&slab_mutex);
return ret;
}
-#endif
+#else
+static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
+ struct kmem_cache *s, struct kmem_cache *root_cache)
+{
+ return 0;
+}
+
+static inline void memcg_free_cache_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+/*
+ * Find a mergeable slab cache
+ */
+int slab_unmergeable(struct kmem_cache *s)
+{
+ if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
+ return 1;
+
+ if (!is_root_cache(s))
+ return 1;
+
+ if (s->ctor)
+ return 1;
+
+ /*
+ * We may have set a slab to be unmergeable during bootstrap.
+ */
+ if (s->refcount < 0)
+ return 1;
+
+ return 0;
+}
+
+struct kmem_cache *find_mergeable(size_t size, size_t align,
+ unsigned long flags, const char *name, void (*ctor)(void *))
+{
+ struct kmem_cache *s;
+
+ if (slab_nomerge || (flags & SLAB_NEVER_MERGE))
+ return NULL;
+
+ if (ctor)
+ return NULL;
+
+ size = ALIGN(size, sizeof(void *));
+ align = calculate_alignment(flags, align, size);
+ size = ALIGN(size, align);
+ flags = kmem_cache_flags(size, flags, name, NULL);
+
+ list_for_each_entry(s, &slab_caches, list) {
+ if (slab_unmergeable(s))
+ continue;
+
+ if (size > s->size)
+ continue;
+
+ if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME))
+ continue;
+ /*
+ * Check if alignment is compatible.
+ * Courtesy of Adrian Drzewiecki
+ */
+ if ((s->size & ~(align - 1)) != s->size)
+ continue;
+
+ if (s->size - size >= sizeof(void *))
+ continue;
+
+ return s;
+ }
+ return NULL;
+}
/*
* Figure out what the alignment of the objects will be given a set of
@@ -211,8 +379,10 @@ kmem_cache_create(const char *name, size_t size, size_t align,
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
- if (err)
+ if (err) {
+ s = NULL; /* suppress uninit var warning */
goto out_unlock;
+ }
/*
* Some allocators will constraint the set of valid flags to a subset
diff --git a/mm/slob.c b/mm/slob.c
index 21980e0f39a8..96a86206a26b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp)
}
EXPORT_SYMBOL(__kmalloc);
-#ifdef CONFIG_TRACING
void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
{
return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
@@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
return __do_kmalloc_node(size, gfp, node, caller);
}
#endif
-#endif
void kfree(const void *block)
{
diff --git a/mm/slub.c b/mm/slub.c
index 3e8afcc07a76..ae7b9f1ad394 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -169,16 +169,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
*/
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
-/*
- * Set of flags that will prevent slab merging
- */
-#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
- SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB)
-
-#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_CACHE_DMA | SLAB_NOTRACK)
-
#define OO_SHIFT 16
#define OO_MASK ((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
@@ -1176,7 +1166,7 @@ out:
__setup("slub_debug", setup_slub_debug);
-static unsigned long kmem_cache_flags(unsigned long object_size,
+unsigned long kmem_cache_flags(unsigned long object_size,
unsigned long flags, const char *name,
void (*ctor)(void *))
{
@@ -1208,7 +1198,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
-static inline unsigned long kmem_cache_flags(unsigned long object_size,
+unsigned long kmem_cache_flags(unsigned long object_size,
unsigned long flags, const char *name,
void (*ctor)(void *))
{
@@ -1699,7 +1689,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
struct kmem_cache_cpu *c)
{
void *object;
- int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node;
+ int searchnode = node;
+
+ if (node == NUMA_NO_NODE)
+ searchnode = numa_mem_id();
+ else if (!node_present_pages(node))
+ searchnode = node_to_mem_node(node);
object = get_partial_node(s, get_node(s, searchnode), c, flags);
if (object || node != NUMA_NO_NODE)
@@ -2280,11 +2275,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
redo:
if (unlikely(!node_match(page, node))) {
- stat(s, ALLOC_NODE_MISMATCH);
- deactivate_slab(s, page, c->freelist);
- c->page = NULL;
- c->freelist = NULL;
- goto new_slab;
+ int searchnode = node;
+
+ if (node != NUMA_NO_NODE && !node_present_pages(node))
+ searchnode = node_to_mem_node(node);
+
+ if (unlikely(!node_match(page, searchnode))) {
+ stat(s, ALLOC_NODE_MISMATCH);
+ deactivate_slab(s, page, c->freelist);
+ c->page = NULL;
+ c->freelist = NULL;
+ goto new_slab;
+ }
}
/*
@@ -2707,12 +2709,6 @@ static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
static int slub_min_objects;
/*
- * Merge control. If this is set then no merging of slab caches will occur.
- * (Could be removed. This was introduced to pacify the merge skeptics.)
- */
-static int slub_nomerge;
-
-/*
* Calculate the order of allocation given an slab object size.
*
* The order of allocation has significant impact on performance and other
@@ -3240,14 +3236,6 @@ static int __init setup_slub_min_objects(char *str)
__setup("slub_min_objects=", setup_slub_min_objects);
-static int __init setup_slub_nomerge(char *str)
-{
- slub_nomerge = 1;
- return 1;
-}
-
-__setup("slub_nomerge", setup_slub_nomerge);
-
void *__kmalloc(size_t size, gfp_t flags)
{
struct kmem_cache *s;
@@ -3625,69 +3613,6 @@ void __init kmem_cache_init_late(void)
{
}
-/*
- * Find a mergeable slab cache
- */
-static int slab_unmergeable(struct kmem_cache *s)
-{
- if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
- return 1;
-
- if (!is_root_cache(s))
- return 1;
-
- if (s->ctor)
- return 1;
-
- /*
- * We may have set a slab to be unmergeable during bootstrap.
- */
- if (s->refcount < 0)
- return 1;
-
- return 0;
-}
-
-static struct kmem_cache *find_mergeable(size_t size, size_t align,
- unsigned long flags, const char *name, void (*ctor)(void *))
-{
- struct kmem_cache *s;
-
- if (slub_nomerge || (flags & SLUB_NEVER_MERGE))
- return NULL;
-
- if (ctor)
- return NULL;
-
- size = ALIGN(size, sizeof(void *));
- align = calculate_alignment(flags, align, size);
- size = ALIGN(size, align);
- flags = kmem_cache_flags(size, flags, name, NULL);
-
- list_for_each_entry(s, &slab_caches, list) {
- if (slab_unmergeable(s))
- continue;
-
- if (size > s->size)
- continue;
-
- if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
- continue;
- /*
- * Check if alignment is compatible.
- * Courtesy of Adrian Drzewiecki
- */
- if ((s->size & ~(align - 1)) != s->size)
- continue;
-
- if (s->size - size >= sizeof(void *))
- continue;
-
- return s;
- }
- return NULL;
-}
-
struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
@@ -4604,6 +4529,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf)
static ssize_t trace_store(struct kmem_cache *s, const char *buf,
size_t length)
{
+ /*
+ * Tracing a merged cache is going to give confusing results
+ * as well as cause other issues like converting a mergeable
+ * cache into an umergeable one.
+ */
+ if (s->refcount > 1)
+ return -EINVAL;
+
s->flags &= ~SLAB_TRACE;
if (buf[0] == '1') {
s->flags &= ~__CMPXCHG_DOUBLE;
@@ -4721,6 +4654,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
size_t length)
{
+ if (s->refcount > 1)
+ return -EINVAL;
+
s->flags &= ~SLAB_FAILSLAB;
if (buf[0] == '1')
s->flags |= SLAB_FAILSLAB;
diff --git a/mm/swap.c b/mm/swap.c
index 6b2dc3897cd5..8a12b33936b4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -887,18 +887,14 @@ void lru_add_drain_all(void)
mutex_unlock(&lock);
}
-/*
- * Batched page_cache_release(). Decrement the reference count on all the
- * passed pages. If it fell to zero then remove the page from the LRU and
- * free it.
- *
- * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
- * for the remainder of the operation.
+/**
+ * release_pages - batched page_cache_release()
+ * @pages: array of pages to release
+ * @nr: number of pages
+ * @cold: whether the pages are cache cold
*
- * The locking in this function is against shrink_inactive_list(): we recheck
- * the page count inside the lock to see whether shrink_inactive_list()
- * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
- * will free it.
+ * Decrement the reference count on all the pages in @pages. If it
+ * fell to zero, remove the page from the LRU and free it.
*/
void release_pages(struct page **pages, int nr, bool cold)
{
@@ -907,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold)
struct zone *zone = NULL;
struct lruvec *lruvec;
unsigned long uninitialized_var(flags);
+ unsigned int uninitialized_var(lock_batch);
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
@@ -920,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold)
continue;
}
+ /*
+ * Make sure the IRQ-safe lock-holding time does not get
+ * excessive with a continuous string of pages from the
+ * same zone. The lock is held only if zone != NULL.
+ */
+ if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
+ spin_unlock_irqrestore(&zone->lru_lock, flags);
+ zone = NULL;
+ }
+
if (!put_page_testzero(page))
continue;
@@ -930,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold)
if (zone)
spin_unlock_irqrestore(&zone->lru_lock,
flags);
+ lock_batch = 0;
zone = pagezone;
spin_lock_irqsave(&zone->lru_lock, flags);
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3e0ec83d000c..154444918685 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -28,7 +28,9 @@
static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
.set_page_dirty = swap_set_page_dirty,
+#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
+#endif
};
static struct backing_dev_info swap_backing_dev_info = {
@@ -263,18 +265,12 @@ void free_page_and_swap_cache(struct page *page)
void free_pages_and_swap_cache(struct page **pages, int nr)
{
struct page **pagep = pages;
+ int i;
lru_add_drain();
- while (nr) {
- int todo = min(nr, PAGEVEC_SIZE);
- int i;
-
- for (i = 0; i < todo; i++)
- free_swap_cache(pagep[i]);
- release_pages(pagep, todo, false);
- pagep += todo;
- nr -= todo;
- }
+ for (i = 0; i < nr; i++)
+ free_swap_cache(pagep[i]);
+ release_pages(pagep, nr, false);
}
/*
diff --git a/mm/util.c b/mm/util.c
index 093c973f1697..fec39d4509a9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -170,32 +170,25 @@ static int vm_is_stack_for_task(struct task_struct *t,
/*
* Check if the vma is being used as a stack.
* If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the pid of the task that
- * the vma is stack for.
+ * just check in the current task. Returns the task_struct of the task
+ * that the vma is stack for. Must be called under rcu_read_lock().
*/
-pid_t vm_is_stack(struct task_struct *task,
- struct vm_area_struct *vma, int in_group)
+struct task_struct *task_of_stack(struct task_struct *task,
+ struct vm_area_struct *vma, bool in_group)
{
- pid_t ret = 0;
-
if (vm_is_stack_for_task(task, vma))
- return task->pid;
+ return task;
if (in_group) {
struct task_struct *t;
- rcu_read_lock();
for_each_thread(task, t) {
- if (vm_is_stack_for_task(t, vma)) {
- ret = t->pid;
- goto done;
- }
+ if (vm_is_stack_for_task(t, vma))
+ return t;
}
-done:
- rcu_read_unlock();
}
- return ret;
+ return NULL;
}
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2b0aa5486092..90520af7f186 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2646,21 +2646,11 @@ static const struct seq_operations vmalloc_op = {
static int vmalloc_open(struct inode *inode, struct file *file)
{
- unsigned int *ptr = NULL;
- int ret;
-
- if (IS_ENABLED(CONFIG_NUMA)) {
- ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
- if (ptr == NULL)
- return -ENOMEM;
- }
- ret = seq_open(file, &vmalloc_op);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = ptr;
- } else
- kfree(ptr);
- return ret;
+ if (IS_ENABLED(CONFIG_NUMA))
+ return seq_open_private(file, &vmalloc_op,
+ nr_node_ids * sizeof(unsigned int));
+ else
+ return seq_open(file, &vmalloc_op);
}
static const struct file_operations proc_vmalloc_operations = {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2836b5373b2e..dcb47074ae03 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Case 1 above */
if (current_is_kswapd() &&
PageReclaim(page) &&
- zone_is_reclaim_writeback(zone)) {
+ test_bit(ZONE_WRITEBACK, &zone->flags)) {
nr_immediate++;
goto keep_locked;
@@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_is_file_cache(page) &&
(!current_is_kswapd() ||
- !zone_is_reclaim_dirty(zone))) {
+ !test_bit(ZONE_DIRTY, &zone->flags))) {
/*
* Immediately reclaim when written back.
* Similar in principal to deactivate_page()
@@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* are encountered in the nr_immediate check below.
*/
if (nr_writeback && nr_writeback == nr_taken)
- zone_set_flag(zone, ZONE_WRITEBACK);
+ set_bit(ZONE_WRITEBACK, &zone->flags);
/*
* memcg will stall in page writeback so only consider forcibly
@@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* backed by a congested BDI and wait_iff_congested will stall.
*/
if (nr_dirty && nr_dirty == nr_congested)
- zone_set_flag(zone, ZONE_CONGESTED);
+ set_bit(ZONE_CONGESTED, &zone->flags);
/*
* If dirty pages are scanned that are not queued for IO, it
* implies that flushers are not keeping up. In this case, flag
- * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
- * pages from reclaim context.
+ * the zone ZONE_DIRTY and kswapd will start writing pages from
+ * reclaim context.
*/
if (nr_unqueued_dirty == nr_taken)
- zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ set_bit(ZONE_DIRTY, &zone->flags);
/*
* If kswapd scans pages marked marked for immediate
@@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc)
return reclaimable;
}
-/* Returns true if compaction should go ahead for a high-order request */
+/*
+ * Returns true if compaction should go ahead for a high-order request, or
+ * the high-order allocation would succeed without compaction.
+ */
static inline bool compaction_ready(struct zone *zone, int order)
{
unsigned long balance_gap, watermark;
@@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order)
if (compaction_deferred(zone, order))
return watermark_ok;
- /* If compaction is not ready to start, keep reclaiming */
- if (!compaction_suitable(zone, order))
+ /*
+ * If compaction is not ready to start and allocation is not likely
+ * to succeed without it, then keep reclaiming.
+ */
+ if (compaction_suitable(zone, order) == COMPACT_SKIPPED)
return false;
return watermark_ok;
@@ -2753,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
}
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+ unsigned long nr_pages,
gfp_t gfp_mask,
- bool noswap)
+ bool may_swap)
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
int nid;
struct scan_control sc = {
- .nr_to_reclaim = SWAP_CLUSTER_MAX,
+ .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
.target_mem_cgroup = memcg,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
- .may_swap = !noswap,
+ .may_swap = may_swap,
};
/*
@@ -2818,7 +2825,7 @@ static bool zone_balanced(struct zone *zone, int order,
return false;
if (IS_ENABLED(CONFIG_COMPACTION) && order &&
- !compaction_suitable(zone, order))
+ compaction_suitable(zone, order) == COMPACT_SKIPPED)
return false;
return true;
@@ -2978,7 +2985,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
/* Account for the number of pages attempted to reclaim */
*nr_attempted += sc->nr_to_reclaim;
- zone_clear_flag(zone, ZONE_WRITEBACK);
+ clear_bit(ZONE_WRITEBACK, &zone->flags);
/*
* If a zone reaches its high watermark, consider it to be no longer
@@ -2988,8 +2995,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
*/
if (zone_reclaimable(zone) &&
zone_balanced(zone, testorder, 0, classzone_idx)) {
- zone_clear_flag(zone, ZONE_CONGESTED);
- zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ clear_bit(ZONE_CONGESTED, &zone->flags);
+ clear_bit(ZONE_DIRTY, &zone->flags);
}
return sc->nr_scanned >= sc->nr_to_reclaim;
@@ -3080,8 +3087,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
* If balanced, clear the dirty and congested
* flags
*/
- zone_clear_flag(zone, ZONE_CONGESTED);
- zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+ clear_bit(ZONE_CONGESTED, &zone->flags);
+ clear_bit(ZONE_DIRTY, &zone->flags);
}
}
@@ -3708,11 +3715,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
if (node_state(node_id, N_CPU) && node_id != numa_node_id())
return ZONE_RECLAIM_NOSCAN;
- if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+ if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
return ZONE_RECLAIM_NOSCAN;
ret = __zone_reclaim(zone, gfp_mask, order);
- zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+ clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
if (!ret)
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
@@ -3791,66 +3798,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
}
}
#endif /* CONFIG_SHMEM */
-
-static void warn_scan_unevictable_pages(void)
-{
- printk_once(KERN_WARNING
- "%s: The scan_unevictable_pages sysctl/node-interface has been "
- "disabled for lack of a legitimate use case. If you have "
- "one, please send an email to linux-mm@kvack.org.\n",
- current->comm);
-}
-
-/*
- * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
- * all nodes' unevictable lists for evictable pages
- */
-unsigned long scan_unevictable_pages;
-
-int scan_unevictable_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *length, loff_t *ppos)
-{
- warn_scan_unevictable_pages();
- proc_doulongvec_minmax(table, write, buffer, length, ppos);
- scan_unevictable_pages = 0;
- return 0;
-}
-
-#ifdef CONFIG_NUMA
-/*
- * per node 'scan_unevictable_pages' attribute. On demand re-scan of
- * a specified node's per zone unevictable lists for evictable pages.
- */
-
-static ssize_t read_scan_unevictable_node(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- warn_scan_unevictable_pages();
- return sprintf(buf, "0\n"); /* always zero; should fit... */
-}
-
-static ssize_t write_scan_unevictable_node(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- warn_scan_unevictable_pages();
- return 1;
-}
-
-
-static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
- read_scan_unevictable_node,
- write_scan_unevictable_node);
-
-int scan_unevictable_register_node(struct node *node)
-{
- return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages);
-}
-
-void scan_unevictable_unregister_node(struct node *node)
-{
- device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages);
-}
-#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e9ab104b956f..1b12d390dc68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,6 +7,7 @@
* zoned VM statistics
* Copyright (C) 2006 Silicon Graphics, Inc.,
* Christoph Lameter <christoph@lameter.com>
+ * Copyright (C) 2008-2014 Christoph Lameter
*/
#include <linux/fs.h>
#include <linux/mm.h>
@@ -14,6 +15,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cpu.h>
+#include <linux/cpumask.h>
#include <linux/vmstat.h>
#include <linux/sched.h>
#include <linux/math64.h>
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
EXPORT_SYMBOL(dec_zone_page_state);
#endif
-static inline void fold_diff(int *diff)
+
+/*
+ * Fold a differential into the global counters.
+ * Returns the number of counters updated.
+ */
+static int fold_diff(int *diff)
{
int i;
+ int changes = 0;
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
- if (diff[i])
+ if (diff[i]) {
atomic_long_add(diff[i], &vm_stat[i]);
+ changes++;
+ }
+ return changes;
}
/*
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff)
* statistics in the remote zone struct as well as the global cachelines
* with the global counters. These could cause remote node cache line
* bouncing and will have to be only done when necessary.
+ *
+ * The function returns the number of global counters updated.
*/
-static void refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(void)
{
struct zone *zone;
int i;
int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+ int changes = 0;
for_each_populated_zone(zone) {
struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void)
continue;
}
-
if (__this_cpu_dec_return(p->expire))
continue;
- if (__this_cpu_read(p->pcp.count))
+ if (__this_cpu_read(p->pcp.count)) {
drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+ changes++;
+ }
#endif
}
- fold_diff(global_diff);
+ changes += fold_diff(global_diff);
+ return changes;
}
/*
@@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
TEXT_FOR_HIGHMEM(xx) xx "_movable",
const char * const vmstat_text[] = {
- /* Zoned VM counters */
+ /* enum zone_stat_item countes */
"nr_free_pages",
"nr_alloc_batch",
"nr_inactive_anon",
@@ -778,10 +794,13 @@ const char * const vmstat_text[] = {
"workingset_nodereclaim",
"nr_anon_transparent_hugepages",
"nr_free_cma",
+
+ /* enum writeback_stat_item counters */
"nr_dirty_threshold",
"nr_dirty_background_threshold",
#ifdef CONFIG_VM_EVENT_COUNTERS
+ /* enum vm_event_item counters */
"pgpgin",
"pgpgout",
"pswpin",
@@ -860,6 +879,13 @@ const char * const vmstat_text[] = {
"thp_zero_page_alloc",
"thp_zero_page_alloc_failed",
#endif
+#ifdef CONFIG_MEMORY_BALLOON
+ "balloon_inflate",
+ "balloon_deflate",
+#ifdef CONFIG_BALLOON_COMPACTION
+ "balloon_migrate",
+#endif
+#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
#ifdef CONFIG_SMP
"nr_tlb_remote_flush",
@@ -1229,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = {
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
+static cpumask_var_t cpu_stat_off;
static void vmstat_update(struct work_struct *w)
{
- refresh_cpu_vm_stats();
- schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+ if (refresh_cpu_vm_stats())
+ /*
+ * Counters were updated so we expect more updates
+ * to occur in the future. Keep on running the
+ * update worker thread.
+ */
+ schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ else {
+ /*
+ * We did not update any counters so the app may be in
+ * a mode where it does not cause counter updates.
+ * We may be uselessly running vmstat_update.
+ * Defer the checking for differentials to the
+ * shepherd thread on a different processor.
+ */
+ int r;
+ /*
+ * Shepherd work thread does not race since it never
+ * changes the bit if its zero but the cpu
+ * online / off line code may race if
+ * worker threads are still allowed during
+ * shutdown / startup.
+ */
+ r = cpumask_test_and_set_cpu(smp_processor_id(),
+ cpu_stat_off);
+ VM_BUG_ON(r);
+ }
+}
+
+/*
+ * Check if the diffs for a certain cpu indicate that
+ * an update is needed.
+ */
+static bool need_update(int cpu)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone) {
+ struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+
+ BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+ /*
+ * The fast way of checking if there are any vmstat diffs.
+ * This works because the diffs are byte sized items.
+ */
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+ return true;
+
+ }
+ return false;
+}
+
+
+/*
+ * Shepherd worker thread that checks the
+ * differentials of processors that have their worker
+ * threads for vm statistics updates disabled because of
+ * inactivity.
+ */
+static void vmstat_shepherd(struct work_struct *w);
+
+static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
+
+static void vmstat_shepherd(struct work_struct *w)
+{
+ int cpu;
+
+ get_online_cpus();
+ /* Check processors whose vmstat worker threads have been disabled */
+ for_each_cpu(cpu, cpu_stat_off)
+ if (need_update(cpu) &&
+ cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+
+ schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
+ __round_jiffies_relative(sysctl_stat_interval, cpu));
+
+ put_online_cpus();
+
+ schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
+
}
-static void start_cpu_timer(int cpu)
+static void __init start_shepherd_timer(void)
{
- struct delayed_work *work = &per_cpu(vmstat_work, cpu);
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ vmstat_update);
+
+ if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
+ BUG();
+ cpumask_copy(cpu_stat_off, cpu_online_mask);
- INIT_DEFERRABLE_WORK(work, vmstat_update);
- schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
+ schedule_delayed_work(&shepherd,
+ round_jiffies_relative(sysctl_stat_interval));
}
static void vmstat_cpu_dead(int node)
@@ -1273,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
- start_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
+ cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- per_cpu(vmstat_work, cpu).work.func = NULL;
+ cpumask_clear_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- start_cpu_timer(cpu);
+ cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -1303,15 +1417,10 @@ static struct notifier_block vmstat_notifier =
static int __init setup_vmstat(void)
{
#ifdef CONFIG_SMP
- int cpu;
-
cpu_notifier_register_begin();
__register_cpu_notifier(&vmstat_notifier);
- for_each_online_cpu(cpu) {
- start_cpu_timer(cpu);
- node_set_state(cpu_to_node(cpu), N_CPU);
- }
+ start_shepherd_timer();
cpu_notifier_register_done();
#endif
#ifdef CONFIG_PROC_FS
diff --git a/mm/zbud.c b/mm/zbud.c
index a05790b1915e..ecf1dbef6983 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -60,15 +60,17 @@
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
* adjusting internal fragmentation. It also determines the number of
* freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
- * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
- * will be 64 freelists per pool.
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
+ * in allocated page is occupied by zbud header, NCHUNKS will be calculated to
+ * 63 which shows the max number of free chunks in zbud page, also there will be
+ * 63 freelists per pool.
*/
#define NCHUNKS_ORDER 6
#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE (1 << CHUNK_SHIFT)
-#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
/**
* struct zbud_pool - stores metadata for each zbud pool
@@ -195,6 +197,7 @@ static struct zpool_driver zbud_zpool_driver = {
.total_size = zbud_zpool_total_size,
};
+MODULE_ALIAS("zpool-zbud");
#endif /* CONFIG_ZPOOL */
/*****************
@@ -267,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr)
{
/*
* Rather than branch for different situations, just use the fact that
- * free buddies have a length of zero to simplify everything. -1 at the
- * end for the zbud header.
+ * free buddies have a length of zero to simplify everything.
*/
- return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
+ return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
}
/*****************
diff --git a/mm/zpool.c b/mm/zpool.c
index e40612a1df00..739cdf0d183a 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -150,7 +150,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
driver = zpool_get_driver(type);
if (!driver) {
- request_module(type);
+ request_module("zpool-%s", type);
driver = zpool_get_driver(type);
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 4e2fc83cb394..839a48c3ca27 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -175,7 +175,7 @@ enum fullness_group {
* n <= N / f, where
* n = number of allocated objects
* N = total number of objects zspage can store
- * f = 1/fullness_threshold_frac
+ * f = fullness_threshold_frac
*
* Similarly, we assign zspage to:
* ZS_ALMOST_FULL when n > N / f
@@ -199,9 +199,6 @@ struct size_class {
spinlock_t lock;
- /* stats */
- u64 pages_allocated;
-
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
};
@@ -220,6 +217,7 @@ struct zs_pool {
struct size_class size_class[ZS_SIZE_CLASSES];
gfp_t flags; /* allocation flags used when growing pool */
+ atomic_long_t pages_allocated;
};
/*
@@ -299,7 +297,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
static u64 zs_zpool_total_size(void *pool)
{
- return zs_get_total_size_bytes(pool);
+ return zs_get_total_pages(pool) << PAGE_SHIFT;
}
static struct zpool_driver zs_zpool_driver = {
@@ -315,6 +313,7 @@ static struct zpool_driver zs_zpool_driver = {
.total_size = zs_zpool_total_size,
};
+MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */
/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
@@ -629,7 +628,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
while (page) {
struct page *next_page;
struct link_free *link;
- unsigned int i, objs_on_page;
+ unsigned int i = 1;
/*
* page->index stores offset of first object starting
@@ -642,14 +641,10 @@ static void init_zspage(struct page *first_page, struct size_class *class)
link = (struct link_free *)kmap_atomic(page) +
off / sizeof(*link);
- objs_on_page = (PAGE_SIZE - off) / class->size;
- for (i = 1; i <= objs_on_page; i++) {
- off += class->size;
- if (off < PAGE_SIZE) {
- link->next = obj_location_to_handle(page, i);
- link += class->size / sizeof(*link);
- }
+ while ((off += class->size) < PAGE_SIZE) {
+ link->next = obj_location_to_handle(page, i++);
+ link += class->size / sizeof(*link);
}
/*
@@ -661,7 +656,7 @@ static void init_zspage(struct page *first_page, struct size_class *class)
link->next = obj_location_to_handle(next_page, 0);
kunmap_atomic(link);
page = next_page;
- off = (off + class->size) % PAGE_SIZE;
+ off %= PAGE_SIZE;
}
}
@@ -1027,8 +1022,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
return 0;
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
+ atomic_long_add(class->pages_per_zspage,
+ &pool->pages_allocated);
spin_lock(&class->lock);
- class->pages_allocated += class->pages_per_zspage;
}
obj = (unsigned long)first_page->freelist;
@@ -1081,14 +1077,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
first_page->inuse--;
fullness = fix_fullness_group(pool, first_page);
-
- if (fullness == ZS_EMPTY)
- class->pages_allocated -= class->pages_per_zspage;
-
spin_unlock(&class->lock);
- if (fullness == ZS_EMPTY)
+ if (fullness == ZS_EMPTY) {
+ atomic_long_sub(class->pages_per_zspage,
+ &pool->pages_allocated);
free_zspage(first_page);
+ }
}
EXPORT_SYMBOL_GPL(zs_free);
@@ -1182,17 +1177,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
-u64 zs_get_total_size_bytes(struct zs_pool *pool)
+unsigned long zs_get_total_pages(struct zs_pool *pool)
{
- int i;
- u64 npages = 0;
-
- for (i = 0; i < ZS_SIZE_CLASSES; i++)
- npages += pool->size_class[i].pages_allocated;
-
- return npages << PAGE_SHIFT;
+ return atomic_long_read(&pool->pages_allocated);
}
-EXPORT_SYMBOL_GPL(zs_get_total_size_bytes);
+EXPORT_SYMBOL_GPL(zs_get_total_pages);
module_init(zs_init);
module_exit(zs_exit);