summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig21
-rw-r--r--mm/Kconfig.debug32
-rw-r--r--mm/Makefile1
-rw-r--r--mm/backing-dev.c34
-rw-r--r--mm/compaction.c80
-rw-r--r--mm/debug.c56
-rw-r--r--mm/debug_vm_pgtable.c381
-rw-r--r--mm/fadvise.c6
-rw-r--r--mm/filemap.c95
-rw-r--r--mm/frame_vector.c13
-rw-r--r--mm/frontswap.c6
-rw-r--r--mm/gup.c432
-rw-r--r--mm/hmm.c187
-rw-r--r--mm/huge_memory.c322
-rw-r--r--mm/hugetlb.c230
-rw-r--r--mm/init-mm.c4
-rw-r--r--mm/internal.h43
-rw-r--r--mm/kasan/Makefile26
-rw-r--r--mm/kasan/common.c19
-rw-r--r--mm/kasan/generic.c1
-rw-r--r--mm/kasan/init.c11
-rw-r--r--mm/kasan/kasan.h34
-rw-r--r--mm/kasan/report.c22
-rw-r--r--mm/kasan/tags.c1
-rw-r--r--mm/khugepaged.c335
-rw-r--r--mm/ksm.c70
-rw-r--r--mm/list_lru.c2
-rw-r--r--mm/maccess.c278
-rw-r--r--mm/madvise.c56
-rw-r--r--mm/memblock.c21
-rw-r--r--mm/memcontrol.c751
-rw-r--r--mm/memory-failure.c30
-rw-r--r--mm/memory.c155
-rw-r--r--mm/memory_hotplug.c250
-rw-r--r--mm/mempolicy.c41
-rw-r--r--mm/migrate.c46
-rw-r--r--mm/mincore.c6
-rw-r--r--mm/mlock.c22
-rw-r--r--mm/mm_init.c16
-rw-r--r--mm/mmap.c76
-rw-r--r--mm/mmu_gather.c2
-rw-r--r--mm/mmu_notifier.c22
-rw-r--r--mm/mprotect.c22
-rw-r--r--mm/mremap.c33
-rw-r--r--mm/msync.c8
-rw-r--r--mm/nommu.c72
-rw-r--r--mm/oom_kill.c18
-rw-r--r--mm/page-writeback.c80
-rw-r--r--mm/page_alloc.c688
-rw-r--r--mm/page_idle.c7
-rw-r--r--mm/page_io.c1
-rw-r--r--mm/page_isolation.c9
-rw-r--r--mm/page_owner.c7
-rw-r--r--mm/page_reporting.h2
-rw-r--r--mm/pagewalk.c12
-rw-r--r--mm/percpu.c16
-rw-r--r--mm/pgtable-generic.c8
-rw-r--r--mm/process_vm_access.c4
-rw-r--r--mm/ptdump.c21
-rw-r--r--mm/readahead.c275
-rw-r--r--mm/rmap.c65
-rw-r--r--mm/shmem.c128
-rw-r--r--mm/slab.c4
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/slob.c2
-rw-r--r--mm/slub.c136
-rw-r--r--mm/sparse-vmemmap.c1
-rw-r--r--mm/sparse.c3
-rw-r--r--mm/swap.c245
-rw-r--r--mm/swap_cgroup.c6
-rw-r--r--mm/swap_state.c114
-rw-r--r--mm/swapfile.c214
-rw-r--r--mm/userfaultfd.c31
-rw-r--r--mm/util.c46
-rw-r--r--mm/vmacache.c1
-rw-r--r--mm/vmalloc.c385
-rw-r--r--mm/vmscan.c289
-rw-r--r--mm/vmstat.c66
-rw-r--r--mm/workingset.c21
-rw-r--r--mm/z3fold.c14
-rw-r--r--mm/zbud.c2
-rw-r--r--mm/zsmalloc.c14
82 files changed, 4066 insertions, 3243 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c1acc34c1c35..f2104cc0d35c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -126,9 +126,6 @@ config SPARSEMEM_VMEMMAP
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
-config HAVE_MEMBLOCK_NODE_MAP
- bool
-
config HAVE_MEMBLOCK_PHYS_MAP
bool
@@ -136,6 +133,9 @@ config HAVE_FAST_GUP
depends on MMU
bool
+# Don't discard allocated memory used to track "memory" and "reserved" memblocks
+# after early boot, so it can still be used to test for validity of memory.
+# Also, memblocks are updated with memory hot(un)plug.
config ARCH_KEEP_MEMBLOCK
bool
@@ -158,6 +158,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on 64BIT || BROKEN
select NUMA_KEEP_MEMINFO if NUMA
config MEMORY_HOTPLUG_SPARSE
@@ -192,6 +193,9 @@ config MEMORY_HOTREMOVE
# Default to 4 for wider testing, though 8 might be more appropriate.
# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+# SPARC32 allocates multiple pte tables within a single page, and therefore
+# a per-page lock leads to problems when multiple tables need to be locked
+# at the same time (e.g. copy_page_range()).
# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
#
config SPLIT_PTLOCK_CPUS
@@ -199,6 +203,7 @@ config SPLIT_PTLOCK_CPUS
default "999999" if !MMU
default "999999" if ARM && !CPU_CACHE_VIPT
default "999999" if PARISC && !PA20
+ default "999999" if SPARC32
default "4"
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
@@ -705,9 +710,9 @@ config ZSMALLOC
returned by an alloc(). This handle must be mapped in order to
access the allocated space.
-config PGTABLE_MAPPING
+config ZSMALLOC_PGTABLE_MAPPING
bool "Use page table mapping to access object in zsmalloc"
- depends on ZSMALLOC
+ depends on ZSMALLOC=y
help
By default, zsmalloc uses a copy-based object mapping method to
access allocations that span two pages. However, if a particular
@@ -750,13 +755,13 @@ config DEFERRED_STRUCT_PAGE_INIT
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
+ select PADATA
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
- a subset of memmap at boot and then initialise the rest in parallel
- by starting one-off "pgdatinitX" kernel thread for each node X. This
- has a potential performance impact on processes running early in the
+ a subset of memmap at boot and then initialise the rest in parallel.
+ This has a potential performance impact on tasks running early in the
lifetime of the system until these kthreads finish the
initialisation.
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 0271b22e063f..2409f7fc1567 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -118,6 +118,38 @@ config DEBUG_RODATA_TEST
---help---
This option enables a testcase for the setting rodata read-only.
+config ARCH_HAS_DEBUG_WX
+ bool
+
+config DEBUG_WX
+ bool "Warn on W+X mappings at boot"
+ depends on ARCH_HAS_DEBUG_WX
+ depends on MMU
+ select PTDUMP_CORE
+ help
+ Generate a warning if any W+X mappings are found at boot.
+
+ This is useful for discovering cases where the kernel is leaving W+X
+ mappings after applying NX, as such mappings are a security risk.
+
+ Look for a message in dmesg output like this:
+
+ <arch>/mm: Checked W+X mappings: passed, no W+X pages found.
+
+ or like this, if the check failed:
+
+ <arch>/mm: Checked W+X mappings: failed, <N> W+X pages found.
+
+ Note that even if the check fails, your kernel is possibly
+ still fine, as W+X mappings are not a security hole in
+ themselves, what they do is that they make the exploitation
+ of other unfixed kernel bugs easier.
+
+ There is no runtime or memory usage effect of this option
+ once the kernel has booted up - it's a one time check.
+
+ If in doubt, say "Y".
+
config GENERIC_PTDUMP
bool
diff --git a/mm/Makefile b/mm/Makefile
index 7881b8ede627..fa91e963c2f9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -96,6 +96,7 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
+obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c81b4f3a7268..d382272bcc31 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -15,13 +15,12 @@
#include <trace/events/writeback.h>
struct backing_dev_info noop_backing_dev_info = {
- .name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
-const char *bdi_unknown_name = "(unknown)";
+static const char *bdi_unknown_name = "(unknown)";
/*
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
@@ -865,12 +864,11 @@ static int bdi_init(struct backing_dev_info *bdi)
return ret;
}
-struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
+struct backing_dev_info *bdi_alloc(int node_id)
{
struct backing_dev_info *bdi;
- bdi = kmalloc_node(sizeof(struct backing_dev_info),
- gfp_mask | __GFP_ZERO, node_id);
+ bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
if (!bdi)
return NULL;
@@ -880,7 +878,7 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
return bdi;
}
-EXPORT_SYMBOL(bdi_alloc_node);
+EXPORT_SYMBOL(bdi_alloc);
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
@@ -938,7 +936,8 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
- dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
+ vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
+ dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
if (IS_ERR(dev))
return PTR_ERR(dev);
@@ -963,7 +962,6 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
trace_writeback_bdi_register(bdi);
return 0;
}
-EXPORT_SYMBOL(bdi_register_va);
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
@@ -977,20 +975,12 @@ int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
}
EXPORT_SYMBOL(bdi_register);
-int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
+void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
- int rc;
-
- rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
- if (rc)
- return rc;
- /* Leaking owner reference... */
- WARN_ON(bdi->owner);
+ WARN_ON_ONCE(bdi->owner);
bdi->owner = owner;
get_device(owner);
- return 0;
}
-EXPORT_SYMBOL(bdi_register_owner);
/*
* Remove bdi from bdi_list, and ensure that it is no longer visible
@@ -1043,6 +1033,14 @@ void bdi_put(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_put);
+const char *bdi_dev_name(struct backing_dev_info *bdi)
+{
+ if (!bdi || !bdi->dev)
+ return bdi_unknown_name;
+ return bdi->dev_name;
+}
+EXPORT_SYMBOL_GPL(bdi_dev_name);
+
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
diff --git a/mm/compaction.c b/mm/compaction.c
index 46f0fcc93081..fd988b7e5f2b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1401,7 +1401,7 @@ fast_isolate_freepages(struct compact_control *cc)
if (scan_start) {
/*
* Use the highest PFN found above min. If one was
- * not found, be pessemistic for direct compaction
+ * not found, be pessimistic for direct compaction
* and use the min mark.
*/
if (highest) {
@@ -1409,7 +1409,9 @@ fast_isolate_freepages(struct compact_control *cc)
cc->free_pfn = highest;
} else {
if (cc->direct_compaction && pfn_valid(min_pfn)) {
- page = pfn_to_page(min_pfn);
+ page = pageblock_pfn_to_page(min_pfn,
+ pageblock_end_pfn(min_pfn),
+ cc->zone);
cc->free_pfn = min_pfn;
}
}
@@ -1966,7 +1968,7 @@ static enum compact_result compact_finished(struct compact_control *cc)
*/
static enum compact_result __compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx,
+ int highest_zoneidx,
unsigned long wmark_target)
{
unsigned long watermark;
@@ -1979,7 +1981,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
*/
- if (zone_watermark_ok(zone, order, watermark, classzone_idx,
+ if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
alloc_flags))
return COMPACT_SUCCESS;
@@ -1989,9 +1991,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* watermark and alloc_flags have to match, or be more pessimistic than
* the check in __isolate_free_page(). We don't use the direct
* compactor's alloc_flags, as they are not relevant for freepage
- * isolation. We however do use the direct compactor's classzone_idx to
- * skip over zones where lowmem reserves would prevent allocation even
- * if compaction succeeds.
+ * isolation. We however do use the direct compactor's highest_zoneidx
+ * to skip over zones where lowmem reserves would prevent allocation
+ * even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
* ALLOC_CMA is used, as pages in CMA pageblocks are considered
@@ -2000,7 +2002,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
- if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+ if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
@@ -2009,12 +2011,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx)
+ int highest_zoneidx)
{
enum compact_result ret;
int fragindex;
- ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+ ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2055,8 +2057,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
* Make sure at least one zone would pass __compaction_suitable if we continue
* retrying the reclaim.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
enum compact_result compact_result;
@@ -2069,7 +2071,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
compact_result = __compaction_suitable(zone, order, alloc_flags,
- ac_classzone_idx(ac), available);
+ ac->highest_zoneidx, available);
if (compact_result != COMPACT_SKIPPED)
return true;
}
@@ -2098,9 +2100,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
- cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
+ cc->migratetype = gfp_migratetype(cc->gfp_mask);
ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
- cc->classzone_idx);
+ cc->highest_zoneidx);
/* Compaction is likely to fail */
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
return ret;
@@ -2243,15 +2245,11 @@ check_drain:
* would succeed.
*/
if (cc->order > 0 && last_migrated_pfn) {
- int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);
if (last_migrated_pfn < current_block_start) {
- cpu = get_cpu();
- lru_add_drain_cpu(cpu);
- drain_local_pages(cc->zone);
- put_cpu();
+ lru_add_drain_cpu_zone(cc->zone);
/* No more flushing until we migrate again */
last_migrated_pfn = 0;
}
@@ -2295,7 +2293,7 @@ out:
static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum compact_priority prio,
- unsigned int alloc_flags, int classzone_idx,
+ unsigned int alloc_flags, int highest_zoneidx,
struct page **capture)
{
enum compact_result ret;
@@ -2307,7 +2305,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.mode = (prio == COMPACT_PRIO_ASYNC) ?
MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
.alloc_flags = alloc_flags,
- .classzone_idx = classzone_idx,
+ .highest_zoneidx = highest_zoneidx,
.direct_compaction = true,
.whole_zone = (prio == MIN_COMPACT_PRIORITY),
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
@@ -2363,8 +2361,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
/* Compact each zone in the list */
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
enum compact_result status;
if (prio > MIN_COMPACT_PRIORITY
@@ -2374,7 +2372,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
status = compact_zone_order(zone, order, gfp_mask, prio,
- alloc_flags, ac_classzone_idx(ac), capture);
+ alloc_flags, ac->highest_zoneidx, capture);
rc = max(status, rc);
/* The allocation should succeed, stop compacting */
@@ -2463,7 +2461,7 @@ int sysctl_compact_memory;
* /proc/sys/vm/compact_memory
*/
int sysctl_compaction_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
if (write)
compact_nodes();
@@ -2509,16 +2507,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
int zoneid;
struct zone *zone;
- enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+ enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
- for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
continue;
if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
- classzone_idx) == COMPACT_CONTINUE)
+ highest_zoneidx) == COMPACT_CONTINUE)
return true;
}
@@ -2536,16 +2534,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
.search_order = pgdat->kcompactd_max_order,
- .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .highest_zoneidx = pgdat->kcompactd_highest_zoneidx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
- cc.classzone_idx);
+ cc.highest_zoneidx);
count_compact_event(KCOMPACTD_WAKE);
- for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) {
int status;
zone = &pgdat->node_zones[zoneid];
@@ -2594,16 +2592,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
/*
* Regardless of success, we are done until woken up next. But remember
- * the requested order/classzone_idx in case it was higher/tighter than
- * our current ones
+ * the requested order/highest_zoneidx in case it was higher/tighter
+ * than our current ones
*/
if (pgdat->kcompactd_max_order <= cc.order)
pgdat->kcompactd_max_order = 0;
- if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
- pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
}
-void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
{
if (!order)
return;
@@ -2611,8 +2609,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kcompactd_max_order < order)
pgdat->kcompactd_max_order = order;
- if (pgdat->kcompactd_classzone_idx > classzone_idx)
- pgdat->kcompactd_classzone_idx = classzone_idx;
+ if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = highest_zoneidx;
/*
* Pairs with implicit barrier in wait_event_freezable()
@@ -2625,7 +2623,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
return;
trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
- classzone_idx);
+ highest_zoneidx);
wake_up_interruptible(&pgdat->kcompactd_wait);
}
@@ -2646,7 +2644,7 @@ static int kcompactd(void *p)
set_freezable();
pgdat->kcompactd_max_order = 0;
- pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
while (!kthread_should_stop()) {
unsigned long pflags;
diff --git a/mm/debug.c b/mm/debug.c
index 2189357f0987..b5b1de8c71ac 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -110,13 +110,57 @@ void __dump_page(struct page *page, const char *reason)
else if (PageAnon(page))
type = "anon ";
else if (mapping) {
- if (mapping->host && mapping->host->i_dentry.first) {
- struct dentry *dentry;
- dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
- pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
- } else
- pr_warn("%ps\n", mapping->a_ops);
+ const struct inode *host;
+ const struct address_space_operations *a_ops;
+ const struct hlist_node *dentry_first;
+ const struct dentry *dentry_ptr;
+ struct dentry dentry;
+
+ /*
+ * mapping can be invalid pointer and we don't want to crash
+ * accessing it, so probe everything depending on it carefully
+ */
+ if (probe_kernel_read(&host, &mapping->host,
+ sizeof(struct inode *)) ||
+ probe_kernel_read(&a_ops, &mapping->a_ops,
+ sizeof(struct address_space_operations *))) {
+ pr_warn("failed to read mapping->host or a_ops, mapping not a valid kernel address?\n");
+ goto out_mapping;
+ }
+
+ if (!host) {
+ pr_warn("mapping->a_ops:%ps\n", a_ops);
+ goto out_mapping;
+ }
+
+ if (probe_kernel_read(&dentry_first,
+ &host->i_dentry.first, sizeof(struct hlist_node *))) {
+ pr_warn("mapping->a_ops:%ps with invalid mapping->host inode address %px\n",
+ a_ops, host);
+ goto out_mapping;
+ }
+
+ if (!dentry_first) {
+ pr_warn("mapping->a_ops:%ps\n", a_ops);
+ goto out_mapping;
+ }
+
+ dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
+ if (probe_kernel_read(&dentry, dentry_ptr,
+ sizeof(struct dentry))) {
+ pr_warn("mapping->aops:%ps with invalid mapping->host->i_dentry.first %px\n",
+ a_ops, dentry_ptr);
+ } else {
+ /*
+ * if dentry is corrupted, the %pd handler may still
+ * crash, but it's unlikely that we reach here with a
+ * corrupted struct page
+ */
+ pr_warn("mapping->aops:%ps dentry name:\"%pd\"\n",
+ a_ops, &dentry);
+ }
}
+out_mapping:
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
new file mode 100644
index 000000000000..9ec59c38d6a2
--- /dev/null
+++ b/mm/debug_vm_pgtable.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This kernel test validates architecture page table helpers and
+ * accessors and helps in verifying their continued compliance with
+ * expected generic MM semantics.
+ *
+ * Copyright (C) 2019 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define pr_fmt(fmt) "debug_vm_pgtable: %s: " fmt, __func__
+
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kernel.h>
+#include <linux/kconfig.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mm_types.h>
+#include <linux/module.h>
+#include <linux/pfn_t.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/start_kernel.h>
+#include <linux/sched/mm.h>
+#include <asm/pgalloc.h>
+
+#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
+
+/*
+ * On s390 platform, the lower 4 bits are used to identify given page table
+ * entry type. But these bits might affect the ability to clear entries with
+ * pxx_clear() because of how dynamic page table folding works on s390. So
+ * while loading up the entries do not change the lower 4 bits. It does not
+ * have affect any other platform.
+ */
+#define S390_MASK_BITS 4
+#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define RANDOM_NZVALUE GENMASK(7, 0)
+
+static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pte = pfn_pte(pfn, prot);
+
+ WARN_ON(!pte_same(pte, pte));
+ WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
+ WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
+ WARN_ON(!pte_write(pte_mkwrite(pte_wrprotect(pte))));
+ WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
+ WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
+ WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pmd_t pmd = pfn_pmd(pfn, prot);
+
+ WARN_ON(!pmd_same(pmd, pmd));
+ WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
+ WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
+ WARN_ON(!pmd_write(pmd_mkwrite(pmd_wrprotect(pmd))));
+ WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd))));
+ WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd))));
+ WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd))));
+ /*
+ * A huge page does not point to next level page table
+ * entry. Hence this must qualify as pmd_bad().
+ */
+ WARN_ON(!pmd_bad(pmd_mkhuge(pmd)));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pud_t pud = pfn_pud(pfn, prot);
+
+ WARN_ON(!pud_same(pud, pud));
+ WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
+ WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
+ WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud))));
+ WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud))));
+
+ if (mm_pmd_folded(mm))
+ return;
+
+ /*
+ * A huge page does not point to next level page table
+ * entry. Hence this must qualify as pud_bad().
+ */
+ WARN_ON(!pud_bad(pud_mkhuge(pud)));
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ p4d_t p4d;
+
+ memset(&p4d, RANDOM_NZVALUE, sizeof(p4d_t));
+ WARN_ON(!p4d_same(p4d, p4d));
+}
+
+static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+ pgd_t pgd;
+
+ memset(&pgd, RANDOM_NZVALUE, sizeof(pgd_t));
+ WARN_ON(!pgd_same(pgd, pgd));
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+{
+ pud_t pud = READ_ONCE(*pudp);
+
+ if (mm_pmd_folded(mm))
+ return;
+
+ pud = __pud(pud_val(pud) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pudp, pud);
+ pud_clear(pudp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(!pud_none(pud));
+}
+
+static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
+ pmd_t *pmdp)
+{
+ pud_t pud;
+
+ if (mm_pmd_folded(mm))
+ return;
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pud_bad().
+ */
+ pmd_clear(pmdp);
+ pud_clear(pudp);
+ pud_populate(mm, pudp, pmdp);
+ pud = READ_ONCE(*pudp);
+ WARN_ON(pud_bad(pud));
+}
+#else /* !__PAGETABLE_PUD_FOLDED */
+static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp) { }
+static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
+ pmd_t *pmdp)
+{
+}
+#endif /* PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_P4D_FOLDED
+static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+{
+ p4d_t p4d = READ_ONCE(*p4dp);
+
+ if (mm_pud_folded(mm))
+ return;
+
+ p4d = __p4d(p4d_val(p4d) | RANDOM_ORVALUE);
+ WRITE_ONCE(*p4dp, p4d);
+ p4d_clear(p4dp);
+ p4d = READ_ONCE(*p4dp);
+ WARN_ON(!p4d_none(p4d));
+}
+
+static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
+ pud_t *pudp)
+{
+ p4d_t p4d;
+
+ if (mm_pud_folded(mm))
+ return;
+
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as p4d_bad().
+ */
+ pud_clear(pudp);
+ p4d_clear(p4dp);
+ p4d_populate(mm, p4dp, pudp);
+ p4d = READ_ONCE(*p4dp);
+ WARN_ON(p4d_bad(p4d));
+}
+
+static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+{
+ pgd_t pgd = READ_ONCE(*pgdp);
+
+ if (mm_p4d_folded(mm))
+ return;
+
+ pgd = __pgd(pgd_val(pgd) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pgdp, pgd);
+ pgd_clear(pgdp);
+ pgd = READ_ONCE(*pgdp);
+ WARN_ON(!pgd_none(pgd));
+}
+
+static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
+ p4d_t *p4dp)
+{
+ pgd_t pgd;
+
+ if (mm_p4d_folded(mm))
+ return;
+
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pgd_bad().
+ */
+ p4d_clear(p4dp);
+ pgd_clear(pgdp);
+ pgd_populate(mm, pgdp, p4dp);
+ pgd = READ_ONCE(*pgdp);
+ WARN_ON(pgd_bad(pgd));
+}
+#else /* !__PAGETABLE_P4D_FOLDED */
+static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp) { }
+static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp) { }
+static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
+ pud_t *pudp)
+{
+}
+static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
+ p4d_t *p4dp)
+{
+}
+#endif /* PAGETABLE_P4D_FOLDED */
+
+static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
+ unsigned long vaddr)
+{
+ pte_t pte = READ_ONCE(*ptep);
+
+ pte = __pte(pte_val(pte) | RANDOM_ORVALUE);
+ set_pte_at(mm, vaddr, ptep, pte);
+ barrier();
+ pte_clear(mm, vaddr, ptep);
+ pte = READ_ONCE(*ptep);
+ WARN_ON(!pte_none(pte));
+}
+
+static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
+{
+ pmd_t pmd = READ_ONCE(*pmdp);
+
+ pmd = __pmd(pmd_val(pmd) | RANDOM_ORVALUE);
+ WRITE_ONCE(*pmdp, pmd);
+ pmd_clear(pmdp);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(!pmd_none(pmd));
+}
+
+static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
+ pgtable_t pgtable)
+{
+ pmd_t pmd;
+
+ /*
+ * This entry points to next level page table page.
+ * Hence this must not qualify as pmd_bad().
+ */
+ pmd_clear(pmdp);
+ pmd_populate(mm, pmdp, pgtable);
+ pmd = READ_ONCE(*pmdp);
+ WARN_ON(pmd_bad(pmd));
+}
+
+static unsigned long __init get_random_vaddr(void)
+{
+ unsigned long random_vaddr, random_pages, total_user_pages;
+
+ total_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE;
+
+ random_pages = get_random_long() % total_user_pages;
+ random_vaddr = FIRST_USER_ADDRESS + random_pages * PAGE_SIZE;
+
+ return random_vaddr;
+}
+
+static int __init debug_vm_pgtable(void)
+{
+ struct mm_struct *mm;
+ pgd_t *pgdp;
+ p4d_t *p4dp, *saved_p4dp;
+ pud_t *pudp, *saved_pudp;
+ pmd_t *pmdp, *saved_pmdp, pmd;
+ pte_t *ptep;
+ pgtable_t saved_ptep;
+ pgprot_t prot;
+ phys_addr_t paddr;
+ unsigned long vaddr, pte_aligned, pmd_aligned;
+ unsigned long pud_aligned, p4d_aligned, pgd_aligned;
+ spinlock_t *uninitialized_var(ptl);
+
+ pr_info("Validating architecture page table helpers\n");
+ prot = vm_get_page_prot(VMFLAGS);
+ vaddr = get_random_vaddr();
+ mm = mm_alloc();
+ if (!mm) {
+ pr_err("mm_struct allocation failed\n");
+ return 1;
+ }
+
+ /*
+ * PFN for mapping at PTE level is determined from a standard kernel
+ * text symbol. But pfns for higher page table levels are derived by
+ * masking lower bits of this real pfn. These derived pfns might not
+ * exist on the platform but that does not really matter as pfn_pxx()
+ * helpers will still create appropriate entries for the test. This
+ * helps avoid large memory block allocations to be used for mapping
+ * at higher page table levels.
+ */
+ paddr = __pa_symbol(&start_kernel);
+
+ pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT;
+ pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT;
+ pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT;
+ p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT;
+ pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT;
+ WARN_ON(!pfn_valid(pte_aligned));
+
+ pgdp = pgd_offset(mm, vaddr);
+ p4dp = p4d_alloc(mm, pgdp, vaddr);
+ pudp = pud_alloc(mm, p4dp, vaddr);
+ pmdp = pmd_alloc(mm, pudp, vaddr);
+ ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+
+ /*
+ * Save all the page table page addresses as the page table
+ * entries will be used for testing with random or garbage
+ * values. These saved addresses will be used for freeing
+ * page table pages.
+ */
+ pmd = READ_ONCE(*pmdp);
+ saved_p4dp = p4d_offset(pgdp, 0UL);
+ saved_pudp = pud_offset(p4dp, 0UL);
+ saved_pmdp = pmd_offset(pudp, 0UL);
+ saved_ptep = pmd_pgtable(pmd);
+
+ pte_basic_tests(pte_aligned, prot);
+ pmd_basic_tests(pmd_aligned, prot);
+ pud_basic_tests(pud_aligned, prot);
+ p4d_basic_tests(p4d_aligned, prot);
+ pgd_basic_tests(pgd_aligned, prot);
+
+ pte_clear_tests(mm, ptep, vaddr);
+ pmd_clear_tests(mm, pmdp);
+ pud_clear_tests(mm, pudp);
+ p4d_clear_tests(mm, p4dp);
+ pgd_clear_tests(mm, pgdp);
+
+ pte_unmap_unlock(ptep, ptl);
+
+ pmd_populate_tests(mm, pmdp, saved_ptep);
+ pud_populate_tests(mm, pudp, saved_pmdp);
+ p4d_populate_tests(mm, p4dp, saved_pudp);
+ pgd_populate_tests(mm, pgdp, saved_p4dp);
+
+ p4d_free(mm, saved_p4dp);
+ pud_free(mm, saved_pudp);
+ pmd_free(mm, saved_pmdp);
+ pte_free(mm, saved_ptep);
+
+ mm_dec_nr_puds(mm);
+ mm_dec_nr_pmds(mm);
+ mm_dec_nr_ptes(mm);
+ mmdrop(mm);
+ return 0;
+}
+late_initcall(debug_vm_pgtable);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 4f17c83db575..0e66f2aaeea3 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -22,6 +22,8 @@
#include <asm/unistd.h>
+#include "internal.h"
+
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
@@ -102,10 +104,6 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
if (!nrpages)
nrpages = ~0UL;
- /*
- * Ignore return value because fadvise() shall return
- * success even if filesystem can't retrieve a hint,
- */
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
diff --git a/mm/filemap.c b/mm/filemap.c
index 23a051a7ef0f..f0ae9a6308cb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -76,16 +76,16 @@
* ->i_mutex
* ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
- * ->mmap_sem
+ * ->mmap_lock
* ->i_mmap_rwsem
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
- * ->mmap_sem
+ * ->mmap_lock
* ->lock_page (access_process_vm)
*
* ->i_mutex (generic_perform_write)
- * ->mmap_sem (fault_in_pages_readable->do_page_fault)
+ * ->mmap_lock (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
@@ -199,9 +199,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
nr = hpage_nr_pages(page);
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
+ __mod_lruvec_page_state(page, NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
} else if (PageTransHuge(page)) {
@@ -802,21 +802,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
+ mem_cgroup_migrate(old, new);
+
xas_lock_irqsave(&xas, flags);
xas_store(&xas, new);
old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
if (!PageHuge(old))
- __dec_node_page_state(new, NR_FILE_PAGES);
+ __dec_lruvec_page_state(old, NR_FILE_PAGES);
if (!PageHuge(new))
- __inc_node_page_state(new, NR_FILE_PAGES);
+ __inc_lruvec_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(old))
- __dec_node_page_state(new, NR_SHMEM);
+ __dec_lruvec_page_state(old, NR_SHMEM);
if (PageSwapBacked(new))
- __inc_node_page_state(new, NR_SHMEM);
+ __inc_lruvec_page_state(new, NR_SHMEM);
xas_unlock_irqrestore(&xas, flags);
- mem_cgroup_migrate(old, new);
if (freepage)
freepage(old);
put_page(old);
@@ -832,7 +833,6 @@ static int __add_to_page_cache_locked(struct page *page,
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
- struct mem_cgroup *memcg;
int error;
void *old;
@@ -840,17 +840,16 @@ static int __add_to_page_cache_locked(struct page *page,
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
mapping_set_update(&xas, mapping);
- if (!huge) {
- error = mem_cgroup_try_charge(page, current->mm,
- gfp_mask, &memcg, false);
- if (error)
- return error;
- }
-
get_page(page);
page->mapping = mapping;
page->index = offset;
+ if (!huge) {
+ error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ if (error)
+ goto error;
+ }
+
do {
xas_lock_irq(&xas);
old = xas_load(&xas);
@@ -869,25 +868,23 @@ static int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting */
if (!huge)
- __inc_node_page_state(page, NR_FILE_PAGES);
+ __inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
- if (xas_error(&xas))
+ if (xas_error(&xas)) {
+ error = xas_error(&xas);
goto error;
+ }
- if (!huge)
- mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
error:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- if (!huge)
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return xas_error(&xas);
+ return error;
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
@@ -1259,7 +1256,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
* instead.
*
* The read of PG_waiters has to be after (or concurrently with) PG_locked
- * being cleared, but a memory barrier should be unneccssary since it is
+ * being cleared, but a memory barrier should be unnecessary since it is
* in the same byte as PG_locked.
*/
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
@@ -1374,27 +1371,27 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
/*
* Return values:
- * 1 - page is locked; mmap_sem is still held.
+ * 1 - page is locked; mmap_lock is still held.
* 0 - page is not locked.
- * mmap_sem has been released (up_read()), unless flags had both
+ * mmap_lock has been released (mmap_read_unlock(), unless flags had both
* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
- * which case mmap_sem is still held.
+ * which case mmap_lock is still held.
*
* If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
- * with the page locked and the mmap_sem unperturbed.
+ * with the page locked and the mmap_lock unperturbed.
*/
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
if (fault_flag_allow_retry_first(flags)) {
/*
- * CAUTION! In this case, mmap_sem is not released
+ * CAUTION! In this case, mmap_lock is not released
* even though return 0.
*/
if (flags & FAULT_FLAG_RETRY_NOWAIT)
return 0;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (flags & FAULT_FLAG_KILLABLE)
wait_on_page_locked_killable(page);
else
@@ -1406,7 +1403,7 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
ret = __lock_page_killable(page);
if (ret) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return 0;
}
} else
@@ -1991,7 +1988,7 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
* * total number of bytes copied, including those the were already @written
* * negative error code if nothing was copied
*/
-static ssize_t generic_file_buffered_read(struct kiocb *iocb,
+ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
struct file *filp = iocb->ki_filp;
@@ -2243,6 +2240,7 @@ out:
file_accessed(filp);
return written ? written : error;
}
+EXPORT_SYMBOL_GPL(generic_file_buffered_read);
/**
* generic_file_read_iter - generic filesystem read routine
@@ -2315,14 +2313,14 @@ EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
/*
- * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
* @vmf - the vm_fault for this fault.
* @page - the page to lock.
* @fpin - the pointer to the file we may pin (or is already pinned).
*
- * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
+ * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
* It differs in that it actually returns the page locked if it returns 1 and 0
- * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin
+ * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin
* will point to the pinned file and needs to be fput()'ed at a later point.
*/
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
@@ -2333,7 +2331,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
/*
* NOTE! This will make us return with VM_FAULT_RETRY, but with
- * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
+ * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
* is supposed to work. We have way too many special cases..
*/
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
@@ -2343,13 +2341,13 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
if (vmf->flags & FAULT_FLAG_KILLABLE) {
if (__lock_page_killable(page)) {
/*
- * We didn't have the right flags to drop the mmap_sem,
+ * We didn't have the right flags to drop the mmap_lock,
* but all fault_handlers only check for fatal signals
* if we return VM_FAULT_RETRY, so we need to drop the
- * mmap_sem here and return 0 if we don't have a fpin.
+ * mmap_lock here and return 0 if we don't have a fpin.
*/
if (*fpin == NULL)
- up_read(&vmf->vma->vm_mm->mmap_sem);
+ mmap_read_unlock(vmf->vma->vm_mm);
return 0;
}
} else
@@ -2411,7 +2409,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
/*
* Asynchronous readahead happens when we find the page and PG_readahead,
* so we want to possibly extend the readahead further. We return the file that
- * was pinned if we have to drop the mmap_sem in order to do IO.
+ * was pinned if we have to drop the mmap_lock in order to do IO.
*/
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct page *page)
@@ -2446,12 +2444,12 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*
- * vma->vm_mm->mmap_sem must be held on entry.
+ * vma->vm_mm->mmap_lock must be held on entry.
*
- * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
+ * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
* may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
*
- * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
+ * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
* has not been released.
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
@@ -2521,7 +2519,7 @@ retry_find:
goto page_not_uptodate;
/*
- * We've made it this far and we had to drop our mmap_sem, now is the
+ * We've made it this far and we had to drop our mmap_lock, now is the
* time to return to the upper layer and have it re-find the vma and
* redo the fault.
*/
@@ -2566,13 +2564,12 @@ page_not_uptodate:
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
- /* Things didn't work out. Return zero to tell the mm layer so. */
shrink_readahead_size_eio(ra);
return VM_FAULT_SIGBUS;
out_retry:
/*
- * We dropped the mmap_sem, we need to return to the fault handler to
+ * We dropped the mmap_lock, we need to return to the fault handler to
* re-find the vma and come back and find our hopefully still populated
* page.
*/
@@ -2636,7 +2633,7 @@ void filemap_map_pages(struct vm_fault *vmf,
if (vmf->pte)
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
- if (alloc_set_pte(vmf, NULL, page))
+ if (alloc_set_pte(vmf, page))
goto unlock;
unlock_page(page);
goto next;
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index c431ca81dad5..10f82d5643b6 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -29,7 +29,7 @@
* different type underlying the specified range of virtual addresses.
* When the function isn't able to map a single page, it returns error.
*
- * This function takes care of grabbing mmap_sem as necessary.
+ * This function takes care of grabbing mmap_lock as necessary.
*/
int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
unsigned int gup_flags, struct frame_vector *vec)
@@ -48,7 +48,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
start = untagged_addr(start);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
locked = 1;
vma = find_vma_intersection(mm, start, start + 1);
if (!vma) {
@@ -72,7 +72,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
- ret = get_user_pages_locked(start, nr_frames,
+ ret = pin_user_pages_locked(start, nr_frames,
gup_flags, (struct page **)(vec->ptrs), &locked);
goto out;
}
@@ -102,7 +102,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
out:
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (!ret)
ret = -EFAULT;
if (ret > 0)
@@ -122,7 +122,6 @@ EXPORT_SYMBOL(get_vaddr_frames);
*/
void put_vaddr_frames(struct frame_vector *vec)
{
- int i;
struct page **pages;
if (!vec->got_ref)
@@ -135,8 +134,8 @@ void put_vaddr_frames(struct frame_vector *vec)
*/
if (WARN_ON(IS_ERR(pages)))
goto out;
- for (i = 0; i < vec->nr_frames; i++)
- put_page(pages[i]);
+
+ unpin_user_pages(pages, vec->nr_frames);
vec->got_ref = false;
out:
vec->nr_frames = 0;
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 60bb20e8a951..bfa3a339253e 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -87,7 +87,7 @@ static inline void inc_frontswap_invalidates(void) { }
*
* This would not guards us against the user deciding to call swapoff right as
* we are calling the backend to initialize (so swapon is in action).
- * Fortunatly for us, the swapon_mutex has been taked by the callee so we are
+ * Fortunately for us, the swapon_mutex has been taken by the callee so we are
* OK. The other scenario where calls to frontswap_store (called via
* swap_writepage) is racing with frontswap_invalidate_area (called via
* swapoff) is again guarded by the swap subsystem.
@@ -413,8 +413,8 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
}
/*
- * Used to check if it's necessory and feasible to unuse pages.
- * Return 1 when nothing to do, 0 when need to shink pages,
+ * Used to check if it's necessary and feasible to unuse pages.
+ * Return 1 when nothing to do, 0 when need to shrink pages,
* error code when there is an error.
*/
static int __frontswap_shrink(unsigned long target_pages,
diff --git a/mm/gup.c b/mm/gup.c
index 6076df8e04a4..de9e36262ccb 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -19,7 +19,6 @@
#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -382,13 +381,22 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
}
/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE or a forced COW break can write even to unwritable pte's,
+ * but only after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
- return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+ return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte));
+}
+
+/*
+ * A (separate) COW fault might break the page the other way and
+ * get_user_pages() would return the page from what is now the wrong
+ * VM. So we need to force a COW break at GUP time even for reads.
+ */
+static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags)
+{
+ return is_cow_mapping(vma->vm_flags) && (flags & (FOLL_GET | FOLL_PIN));
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -584,7 +592,7 @@ retry:
pmdval = READ_ONCE(*pmd);
/*
* MADV_DONTNEED may convert the pmd to null because
- * mmap_sem is held in read mode
+ * mmap_lock is held in read mode
*/
if (pmd_none(pmdval))
return no_page_table(vma, flags);
@@ -847,8 +855,8 @@ unmap:
}
/*
- * mmap_sem must be held on entry. If @locked != NULL and *@flags
- * does not include FOLL_NOWAIT, the mmap_sem may be released. If it
+ * mmap_lock must be held on entry. If @locked != NULL and *@flags
+ * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
* is, *@locked will be set to 0 and -EBUSY returned.
*/
static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
@@ -971,7 +979,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
- * @locked: whether we're still with the mmap_sem held
+ * @locked: whether we're still with the mmap_lock held
*
* Returns either number of pages pinned (which may be less than the
* number requested), or an error. Details about the return value:
@@ -980,12 +988,13 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
* -- If nr_pages is >0, and some pages were pinned, returns the number of
* pages pinned. Again, this may be less than nr_pages.
+ * -- 0 return value is possible when the fault would need to be retried.
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_sem is held.
+ * @vmas are valid only as long as mmap_lock is held.
*
- * Must be called with mmap_sem held. It may be released. See below.
+ * Must be called with mmap_lock held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
@@ -1006,12 +1015,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
- * If @locked != NULL, *@locked will be set to 0 when mmap_sem is
+ * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
* released by an up_read(). That can happen if @gup_flags does not
* have FOLL_NOWAIT.
*
* A caller using such a combination of @locked and @gup_flags
- * must therefore hold the mmap_sem for reading only, and recognize
+ * must therefore hold the mmap_lock for reading only, and recognize
* when it's been released. Otherwise, it must be held for either
* reading or writing and will not be released.
*
@@ -1066,13 +1075,15 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
goto out;
}
if (is_vm_hugetlb_page(vma)) {
+ if (should_force_cow_break(vma, foll_flags))
+ foll_flags |= FOLL_WRITE;
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
- gup_flags, locked);
+ foll_flags, locked);
if (locked && *locked == 0) {
/*
* We've got a VM_FAULT_RETRY
- * and we've lost mmap_sem.
+ * and we've lost mmap_lock.
* We must stop here.
*/
BUG_ON(gup_flags & FOLL_NOWAIT);
@@ -1082,13 +1093,17 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
continue;
}
}
+
+ if (should_force_cow_break(vma, foll_flags))
+ foll_flags |= FOLL_WRITE;
+
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (fatal_signal_pending(current)) {
- ret = -ERESTARTSYS;
+ ret = -EINTR;
goto out;
}
cond_resched();
@@ -1168,15 +1183,16 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
return true;
}
-/*
+/**
* fixup_user_fault() - manually resolve a user page fault
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
- * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
- * does not allow retry
+ * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
+ * does not allow retry. If NULL, the caller must guarantee
+ * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
*
* This is meant to be called in the specific scenario where for locking reasons
* we try to access user memory in atomic context (within a pagefault_disable()
@@ -1195,8 +1211,8 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
* such architectures, gup() will not be enough to make a subsequent access
* succeed.
*
- * This function will not return with an unlocked mmap_sem. So it has not the
- * same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ * This function will not return with an unlocked mmap_lock. So it has not the
+ * same semantics wrt the @mm->mmap_lock as does filemap_fault().
*/
int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
@@ -1218,6 +1234,10 @@ retry:
if (!vma_permits_fault(vma, fault_flags))
return -EFAULT;
+ if ((fault_flags & FAULT_FLAG_KILLABLE) &&
+ fatal_signal_pending(current))
+ return -EINTR;
+
ret = handle_mm_fault(vma, address, fault_flags);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
@@ -1229,12 +1249,10 @@ retry:
}
if (ret & VM_FAULT_RETRY) {
- down_read(&mm->mmap_sem);
- if (!(fault_flags & FAULT_FLAG_TRIED)) {
- *unlocked = true;
- fault_flags |= FAULT_FLAG_TRIED;
- goto retry;
- }
+ mmap_read_lock(mm);
+ *unlocked = true;
+ fault_flags |= FAULT_FLAG_TRIED;
+ goto retry;
}
if (tsk) {
@@ -1247,6 +1265,10 @@ retry:
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
+/*
+ * Please note that this function, unlike __get_user_pages will not
+ * return 0 for nr_pages > 0 without FOLL_NOWAIT
+ */
static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
struct mm_struct *mm,
unsigned long start,
@@ -1332,7 +1354,7 @@ retry:
break;
}
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
if (ret) {
BUG_ON(ret > 0);
if (!pages_done)
@@ -1367,7 +1389,7 @@ retry:
* We must let the caller know we temporarily dropped the lock
* and so the critical section protected by it was lost.
*/
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
*locked = 0;
}
return pages_done;
@@ -1378,13 +1400,13 @@ retry:
* @vma: target vma
* @start: start address
* @end: end address
- * @locked: whether the mmap_sem is still held
+ * @locked: whether the mmap_lock is still held
*
* This takes care of mlocking the pages too if VM_LOCKED is set.
*
* return 0 on success, negative error code on error.
*
- * vma->vm_mm->mmap_sem must be held.
+ * vma->vm_mm->mmap_lock must be held.
*
* If @locked is NULL, it may be held for read or write and will
* be unperturbed.
@@ -1403,7 +1425,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
VM_BUG_ON(end & ~PAGE_MASK);
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+ mmap_assert_locked(mm);
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
if (vma->vm_flags & VM_LOCKONFAULT)
@@ -1436,7 +1458,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
*
* This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
* flags. VMAs must be already marked with the desired vm_flags, and
- * mmap_sem must not be held.
+ * mmap_lock must not be held.
*/
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
@@ -1455,7 +1477,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
*/
if (!locked) {
locked = 1;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, nstart);
} else if (nstart >= vma->vm_end)
vma = vma->vm_next;
@@ -1487,7 +1509,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
ret = 0;
}
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret; /* 0 or negative error code */
}
@@ -1503,7 +1525,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
* allowing a hole to be left in the corefile to save diskspace.
*
- * Called without mmap_sem, but after all other threads have been killed.
+ * Called without mmap_lock, but after all other threads have been killed.
*/
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
@@ -1837,7 +1859,7 @@ static long __get_user_pages_remote(struct task_struct *tsk,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
-/*
+/**
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -1864,17 +1886,17 @@ static long __get_user_pages_remote(struct task_struct *tsk,
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_sem is held.
+ * @vmas are valid only as long as mmap_lock is held.
*
- * Must be called with mmap_sem held for read or write.
+ * Must be called with mmap_lock held for read or write.
*
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
+ * get_user_pages_remote walks a process's page tables and takes a reference
+ * to each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
+ * get_user_pages_remote returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
@@ -1886,17 +1908,17 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
* be called after the page is finished with, and before put_page is called.
*
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
+ * get_user_pages_remote is typically used for fewer-copy IO operations,
+ * to get a handle on the memory by some means other than accesses
+ * via the user virtual addresses. The pages may be submitted for
+ * DMA to devices or accessed via their kernel linear mapping (via the
+ * kmap APIs). Care should be taken to use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*
- * get_user_pages should be phased out in favor of
+ * get_user_pages_remote should be phased out in favor of
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
- * should use get_user_pages because it cannot pass
+ * should use get_user_pages_remote because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
@@ -1935,7 +1957,17 @@ static long __get_user_pages_remote(struct task_struct *tsk,
}
#endif /* !CONFIG_MMU */
-/*
+/**
+ * get_user_pages() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
* This is the same as get_user_pages_remote(), just with a
* less-flexible calling convention where we assume that the task
* and mm being operated on are the current task's and don't allow
@@ -1958,26 +1990,37 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
}
EXPORT_SYMBOL(get_user_pages);
-/*
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
+/**
* get_user_pages_locked() is suitable to replace the form:
*
- * down_read(&mm->mmap_sem);
+ * mmap_read_lock(mm);
* do_something()
* get_user_pages(tsk, mm, ..., pages, NULL);
- * up_read(&mm->mmap_sem);
+ * mmap_read_unlock(mm);
*
* to:
*
* int locked = 1;
- * down_read(&mm->mmap_sem);
+ * mmap_read_lock(mm);
* do_something()
* get_user_pages_locked(tsk, mm, ..., pages, &locked);
* if (locked)
- * up_read(&mm->mmap_sem);
+ * mmap_read_unlock(mm);
+ *
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
*/
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -1991,6 +2034,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
*/
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
return -EINVAL;
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
return __get_user_pages_locked(current, current->mm, start, nr_pages,
pages, NULL, locked,
@@ -2001,9 +2050,9 @@ EXPORT_SYMBOL(get_user_pages_locked);
/*
* get_user_pages_unlocked() is suitable to replace the form:
*
- * down_read(&mm->mmap_sem);
+ * mmap_read_lock(mm);
* get_user_pages(tsk, mm, ..., pages, NULL);
- * up_read(&mm->mmap_sem);
+ * mmap_read_unlock(mm);
*
* with:
*
@@ -2029,11 +2078,11 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
return -EINVAL;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret;
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -2250,7 +2299,7 @@ pte_unmap:
* to be special.
*
* For a futex to be placed on a THP tail page, get_futex_key requires a
- * __get_user_pages_fast implementation that can pin pages. Thus it's still
+ * get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
@@ -2655,7 +2704,7 @@ static inline void gup_pgd_range(unsigned long addr, unsigned long end,
#ifndef gup_fast_permitted
/*
- * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * Check if it's allowed to use get_user_pages_fast_only() for the range, or
* we need to fall back to the slow version:
*/
static bool gup_fast_permitted(unsigned long start, unsigned long end)
@@ -2664,62 +2713,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#endif
-/*
- * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
- * the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- *
- * If the architecture does not support this function, simply return with no
- * pages pinned.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- unsigned long len, end;
- unsigned long flags;
- int nr_pinned = 0;
- /*
- * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
- * because gup fast is always a "pin with a +1 page refcount" request.
- */
- unsigned int gup_flags = FOLL_GET;
-
- if (write)
- gup_flags |= FOLL_WRITE;
-
- start = untagged_addr(start) & PAGE_MASK;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
-
- if (end <= start)
- return 0;
- if (unlikely(!access_ok((void __user *)start, len)))
- return 0;
-
- /*
- * Disable interrupts. We use the nested form as we can already have
- * interrupts disabled by get_futex_key.
- *
- * With interrupts disabled, we block page table pages from being
- * freed from under us. See struct mmu_table_batch comments in
- * include/asm-generic/tlb.h for more details.
- *
- * We do not adopt an rcu_read_lock(.) here as we also want to
- * block IPIs that come from THPs splitting.
- */
-
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
- gup_fast_permitted(start, end)) {
- local_irq_save(flags);
- gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
- local_irq_restore(flags);
- }
-
- return nr_pinned;
-}
-EXPORT_SYMBOL_GPL(__get_user_pages_fast);
-
static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
@@ -2730,11 +2723,11 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
* get_user_pages_unlocked() (see comments in that function)
*/
if (gup_flags & FOLL_LONGTERM) {
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
ret = __gup_longterm_locked(current, current->mm,
start, nr_pages,
pages, NULL, gup_flags);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
} else {
ret = get_user_pages_unlocked(start, nr_pages,
pages, gup_flags);
@@ -2748,12 +2741,17 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
struct page **pages)
{
unsigned long addr, len, end;
+ unsigned long flags;
int nr_pinned = 0, ret = 0;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
- FOLL_FORCE | FOLL_PIN | FOLL_GET)))
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
+ FOLL_FAST_ONLY)))
return -EINVAL;
+ if (!(gup_flags & FOLL_FAST_ONLY))
+ might_lock_read(&current->mm->mmap_lock);
+
start = untagged_addr(start) & PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
@@ -2764,15 +2762,42 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
- gup_fast_permitted(start, end)) {
- local_irq_disable();
- gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned);
- local_irq_enable();
+ /*
+ * The FAST_GUP case requires FOLL_WRITE even for pure reads,
+ * because get_user_pages() may need to cause an early COW in
+ * order to avoid confusing the normal COW routines. So only
+ * targets that are already writable are safe to do by just
+ * looking at the page tables.
+ *
+ * NOTE! With FOLL_FAST_ONLY we allow read-only gup_fast() here,
+ * because there is no slow path to fall back on. But you'd
+ * better be careful about possible COW pages - you'll get _a_
+ * COW page, but not necessarily the one you intended to get
+ * depending on what COW event happens after this. COW may break
+ * the page copy in a random direction.
+ *
+ * Disable interrupts. The nested form is used, in order to allow
+ * full, general purpose use of this routine.
+ *
+ * With interrupts disabled, we block page table pages from being
+ * freed from under us. See struct mmu_table_batch comments in
+ * include/asm-generic/tlb.h for more details.
+ *
+ * We do not adopt an rcu_read_lock(.) here as we also want to
+ * block IPIs that come from THPs splitting.
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) {
+ unsigned long fast_flags = gup_flags;
+ if (!(gup_flags & FOLL_FAST_ONLY))
+ fast_flags |= FOLL_WRITE;
+
+ local_irq_save(flags);
+ gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned);
+ local_irq_restore(flags);
ret = nr_pinned;
}
- if (nr_pinned < nr_pages) {
+ if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) {
/* Try to get the remaining pages with get_user_pages */
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
@@ -2791,6 +2816,54 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
return ret;
}
+/**
+ * get_user_pages_fast_only() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP.
+ * Note a difference with get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
+ *
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
+ *
+ * Careful, careful! COW breaking can go either way, so a non-write
+ * access can get ambiguous page results. If you call this function without
+ * 'write' set, you'd better be sure that you're ok with that ambiguity.
+ */
+int get_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int nr_pinned;
+ /*
+ * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
+ * because gup fast is always a "pin with a +1 page refcount" request.
+ *
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
+
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+
+ /*
+ * As specified in the API description above, this routine is not
+ * allowed to return negative values. However, the common core
+ * routine internal_get_user_pages_fast() *can* return -errno.
+ * Therefore, correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
/**
* get_user_pages_fast() - pin user pages in memory
@@ -2800,7 +2873,7 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * Attempt to pin user pages in memory without taking mm->mmap_lock.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
@@ -2843,10 +2916,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
* the arguments here are identical.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for further details.
- *
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
- * is NOT intended for Case 2 (RDMA: long-term pins).
+ * see Documentation/core-api/pin_user_pages.rst for further details.
*/
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
@@ -2860,6 +2930,42 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+/*
+ * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
+ * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
+ *
+ * The API rules are the same, too: no negative values may be returned.
+ */
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int nr_pinned;
+
+ /*
+ * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
+ * rules require returning 0, rather than -errno:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return 0;
+ /*
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+ /*
+ * This routine is not allowed to return negative values. However,
+ * internal_get_user_pages_fast() *can* return -errno. Therefore,
+ * correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
+
/**
* pin_user_pages_remote() - pin pages of a remote process (task != current)
*
@@ -2883,10 +2989,7 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
* the arguments here are identical.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for details.
- *
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
- * is NOT intended for Case 2 (RDMA: long-term pins).
+ * see Documentation/core-api/pin_user_pages.rst for details.
*/
long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
@@ -2919,10 +3022,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* FOLL_PIN is set.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for details.
- *
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
- * is NOT intended for Case 2 (RDMA: long-term pins).
+ * see Documentation/core-api/pin_user_pages.rst for details.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -2937,3 +3037,49 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
+
+/*
+ * pin_user_pages_unlocked() is the FOLL_PIN variant of
+ * get_user_pages_unlocked(). Behavior is the same, except that this one sets
+ * FOLL_PIN and rejects FOLL_GET.
+ */
+long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+ struct page **pages, unsigned int gup_flags)
+{
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
+}
+EXPORT_SYMBOL(pin_user_pages_unlocked);
+
+/*
+ * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
+ * Behavior is the same, except that this one sets FOLL_PIN and rejects
+ * FOLL_GET.
+ */
+long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ int *locked)
+{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ pages, NULL, locked,
+ gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(pin_user_pages_locked);
diff --git a/mm/hmm.c b/mm/hmm.c
index 280585833adf..e9a545751108 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -37,28 +37,13 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
-/*
- * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
- * @range: range use to encode HMM pfn value
- * @pfn: pfn value for which to create the device entry
- * Return: valid device entry for the pfn
- */
-static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
- unsigned long pfn)
-{
- return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
-}
-
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
- struct hmm_range *range, enum hmm_pfn_value_e value)
+ struct hmm_range *range, unsigned long cpu_flags)
{
- uint64_t *pfns = range->pfns;
- unsigned long i;
+ unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++)
- pfns[i] = range->values[value];
-
+ range->hmm_pfns[i] = cpu_flags;
return 0;
}
@@ -96,7 +81,8 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
}
static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
- uint64_t pfns, uint64_t cpu_flags)
+ unsigned long pfn_req_flags,
+ unsigned long cpu_flags)
{
struct hmm_range *range = hmm_vma_walk->range;
@@ -110,27 +96,28 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
* waste to have the user pre-fill the pfn arrays with a default
* flags value.
*/
- pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+ pfn_req_flags &= range->pfn_flags_mask;
+ pfn_req_flags |= range->default_flags;
/* We aren't ask to do anything ... */
- if (!(pfns & range->flags[HMM_PFN_VALID]))
+ if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
return 0;
/* Need to write fault ? */
- if ((pfns & range->flags[HMM_PFN_WRITE]) &&
- !(cpu_flags & range->flags[HMM_PFN_WRITE]))
+ if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
+ !(cpu_flags & HMM_PFN_WRITE))
return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
/* If CPU page table is not valid then we need to fault */
- if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
+ if (!(cpu_flags & HMM_PFN_VALID))
return HMM_NEED_FAULT;
return 0;
}
static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
- const uint64_t *pfns, unsigned long npages,
- uint64_t cpu_flags)
+ const unsigned long hmm_pfns[], unsigned long npages,
+ unsigned long cpu_flags)
{
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault = 0;
@@ -142,12 +129,12 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
* hmm_pte_need_fault() will always return 0.
*/
if (!((range->default_flags | range->pfn_flags_mask) &
- range->flags[HMM_PFN_VALID]))
+ HMM_PFN_REQ_FAULT))
return 0;
for (i = 0; i < npages; ++i) {
- required_fault |=
- hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
+ required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
+ cpu_flags);
if (required_fault == HMM_NEED_ALL_BITS)
return required_fault;
}
@@ -161,12 +148,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
unsigned long i, npages;
- uint64_t *pfns;
+ unsigned long *hmm_pfns;
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
- pfns = &range->pfns[i];
- required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
+ hmm_pfns = &range->hmm_pfns[i];
+ required_fault =
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
if (!walk->vma) {
if (required_fault)
return -EFAULT;
@@ -174,46 +162,44 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
}
if (required_fault)
return hmm_vma_fault(addr, end, required_fault, walk);
- hmm_vma_walk->last = addr;
- return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
+ return hmm_pfns_fill(addr, end, range, 0);
}
-static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
+static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
+ pmd_t pmd)
{
if (pmd_protnone(pmd))
return 0;
- return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
- unsigned long end, uint64_t *pfns, pmd_t pmd)
+ unsigned long end, unsigned long hmm_pfns[],
+ pmd_t pmd)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
unsigned int required_fault;
- uint64_t cpu_flags;
+ unsigned long cpu_flags;
npages = (end - addr) >> PAGE_SHIFT;
cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
required_fault =
- hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
if (required_fault)
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
- hmm_vma_walk->last = end;
+ hmm_pfns[i] = pfn | cpu_flags;
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
- unsigned long end, uint64_t *pfns, pmd_t pmd);
+ unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline bool hmm_is_device_private_entry(struct hmm_range *range,
@@ -224,31 +210,31 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
range->dev_private_owner;
}
-static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
+static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
+ pte_t pte)
{
if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
return 0;
- return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long end, pmd_t *pmdp, pte_t *ptep,
- uint64_t *pfn)
+ unsigned long *hmm_pfn)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
- uint64_t cpu_flags;
+ unsigned long cpu_flags;
pte_t pte = *ptep;
- uint64_t orig_pfn = *pfn;
+ uint64_t pfn_req_flags = *hmm_pfn;
if (pte_none(pte)) {
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *pfn = range->values[HMM_PFN_NONE];
+ *hmm_pfn = 0;
return 0;
}
@@ -260,17 +246,18 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* the PFN even if not present.
*/
if (hmm_is_device_private_entry(range, entry)) {
- *pfn = hmm_device_entry_from_pfn(range,
- device_private_entry_to_pfn(entry));
- *pfn |= range->flags[HMM_PFN_VALID];
+ cpu_flags = HMM_PFN_VALID;
if (is_write_device_private_entry(entry))
- *pfn |= range->flags[HMM_PFN_WRITE];
+ cpu_flags |= HMM_PFN_WRITE;
+ *hmm_pfn = device_private_entry_to_pfn(entry) |
+ cpu_flags;
return 0;
}
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (!required_fault) {
- *pfn = range->values[HMM_PFN_NONE];
+ *hmm_pfn = 0;
return 0;
}
@@ -290,7 +277,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
}
cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault)
goto fault;
@@ -299,15 +287,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* fall through and treat it like a normal page.
*/
if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
- if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
+ if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
return -EFAULT;
}
- *pfn = range->values[HMM_PFN_SPECIAL];
+ *hmm_pfn = HMM_PFN_ERROR;
return 0;
}
- *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ *hmm_pfn = pte_pfn(pte) | cpu_flags;
return 0;
fault:
@@ -323,7 +311,8 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
+ unsigned long *hmm_pfns =
+ &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
unsigned long npages = (end - start) >> PAGE_SHIFT;
unsigned long addr = start;
pte_t *ptep;
@@ -335,16 +324,16 @@ again:
return hmm_vma_walk_hole(start, end, -1, walk);
if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(walk->mm, pmdp);
return -EBUSY;
}
- return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
+ return hmm_pfns_fill(start, end, range, 0);
}
if (!pmd_present(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
return -EFAULT;
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}
@@ -364,7 +353,7 @@ again:
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
- return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
+ return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
}
/*
@@ -374,37 +363,33 @@ again:
* recover.
*/
if (pmd_bad(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
return -EFAULT;
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}
ptep = pte_offset_map(pmdp, addr);
- for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
+ for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
int r;
- r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
if (r) {
/* hmm_vma_handle_pte() did pte_unmap() */
- hmm_vma_walk->last = addr;
return r;
}
}
pte_unmap(ptep - 1);
-
- hmm_vma_walk->last = addr;
return 0;
}
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
-static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
+ pud_t pud)
{
if (!pud_present(pud))
return 0;
- return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}
static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
@@ -432,7 +417,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
unsigned int required_fault;
- uint64_t *pfns, cpu_flags;
+ unsigned long *hmm_pfns;
+ unsigned long cpu_flags;
if (!pud_present(pud)) {
spin_unlock(ptl);
@@ -441,10 +427,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
- pfns = &range->pfns[i];
+ hmm_pfns = &range->hmm_pfns[i];
cpu_flags = pud_to_hmm_pfn_flags(range, pud);
- required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
+ required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
npages, cpu_flags);
if (required_fault) {
spin_unlock(ptl);
@@ -453,9 +439,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn)
- pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
- cpu_flags;
- hmm_vma_walk->last = end;
+ hmm_pfns[i] = pfn | cpu_flags;
goto out_unlock;
}
@@ -479,8 +463,9 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- uint64_t orig_pfn, cpu_flags;
unsigned int required_fault;
+ unsigned long pfn_req_flags;
+ unsigned long cpu_flags;
spinlock_t *ptl;
pte_t entry;
@@ -488,9 +473,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
entry = huge_ptep_get(pte);
i = (start - range->start) >> PAGE_SHIFT;
- orig_pfn = range->pfns[i];
+ pfn_req_flags = range->hmm_pfns[i];
cpu_flags = pte_to_hmm_pfn_flags(range, entry);
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) {
spin_unlock(ptl);
return hmm_vma_fault(addr, end, required_fault, walk);
@@ -498,9 +484,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
- cpu_flags;
- hmm_vma_walk->last = end;
+ range->hmm_pfns[i] = pfn | cpu_flags;
+
spin_unlock(ptl);
return 0;
}
@@ -531,13 +516,12 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
* failure.
*/
if (hmm_range_need_fault(hmm_vma_walk,
- range->pfns +
+ range->hmm_pfns +
((start - range->start) >> PAGE_SHIFT),
(end - start) >> PAGE_SHIFT, 0))
return -EFAULT;
hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
- hmm_vma_walk->last = end;
/* Skip this vma and continue processing the next vma. */
return 1;
@@ -555,9 +539,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
* hmm_range_fault - try to fault some address in a virtual address range
* @range: argument structure
*
- * Return: the number of valid pages in range->pfns[] (from range start
- * address), which may be zero. On error one of the following status codes
- * can be returned:
+ * Returns 0 on success or one of the following error codes:
*
* -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
* (e.g., device file vma).
@@ -572,7 +554,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
* This is similar to get_user_pages(), except that it can read the page tables
* without mutating them (ie causing faults).
*/
-long hmm_range_fault(struct hmm_range *range)
+int hmm_range_fault(struct hmm_range *range)
{
struct hmm_vma_walk hmm_vma_walk = {
.range = range,
@@ -581,7 +563,7 @@ long hmm_range_fault(struct hmm_range *range)
struct mm_struct *mm = range->notifier->mm;
int ret;
- lockdep_assert_held(&mm->mmap_sem);
+ mmap_assert_locked(mm);
do {
/* If range is no longer valid force retry. */
@@ -590,10 +572,13 @@ long hmm_range_fault(struct hmm_range *range)
return -EBUSY;
ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
&hmm_walk_ops, &hmm_vma_walk);
+ /*
+ * When -EBUSY is returned the loop restarts with
+ * hmm_vma_walk.last set to an address that has not been stored
+ * in pfns. All entries < last in the pfn array are set to their
+ * output, and all >= are still at their input values.
+ */
} while (ret == -EBUSY);
-
- if (ret)
- return ret;
- return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ecd1045113b..78c84bee7e29 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -522,7 +522,7 @@ void prep_transhuge_page(struct page *page)
bool is_transparent_hugepage(struct page *page)
{
if (!PageCompound(page))
- return 0;
+ return false;
page = compound_head(page);
return is_huge_zero_page(page) ||
@@ -587,19 +587,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
- struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
+ cgroup_throttle_swaprate(page, gfp);
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
@@ -630,7 +630,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
vm_fault_t ret2;
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
@@ -641,7 +640,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- mem_cgroup_commit_charge(page, memcg, false, true);
lru_cache_add_active_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
@@ -649,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
- count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -658,7 +656,6 @@ unlock_release:
release:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return ret;
@@ -1255,273 +1252,72 @@ unlock:
spin_unlock(vmf->ptl);
}
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
- pmd_t orig_pmd, struct page *page)
-{
- struct vm_area_struct *vma = vmf->vma;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mem_cgroup *memcg;
- pgtable_t pgtable;
- pmd_t _pmd;
- int i;
- vm_fault_t ret = 0;
- struct page **pages;
- struct mmu_notifier_range range;
-
- pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
- GFP_KERNEL);
- if (unlikely(!pages)) {
- ret |= VM_FAULT_OOM;
- goto out;
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address, page_to_nid(page));
- if (unlikely(!pages[i] ||
- mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
- GFP_KERNEL, &memcg, false))) {
- if (pages[i])
- put_page(pages[i]);
- while (--i >= 0) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg,
- false);
- put_page(pages[i]);
- }
- kfree(pages);
- ret |= VM_FAULT_OOM;
- goto out;
- }
- set_page_private(pages[i], (unsigned long)memcg);
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- copy_user_highpage(pages[i], page + i,
- haddr + PAGE_SIZE * i, vma);
- __SetPageUptodate(pages[i]);
- cond_resched();
- }
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
-
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_free_pages;
- VM_BUG_ON_PAGE(!PageHead(page), page);
-
- /*
- * Leave pmd empty until pte is filled note we must notify here as
- * concurrent CPU thread might write to new page before the call to
- * mmu_notifier_invalidate_range_end() happens which can lead to a
- * device seeing memory write in different order than CPU.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
- pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t entry;
- entry = mk_pte(pages[i], vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
- mem_cgroup_commit_charge(pages[i], memcg, false, false);
- lru_cache_add_active_or_unevictable(pages[i], vma);
- vmf->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*vmf->pte));
- set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
- pte_unmap(vmf->pte);
- }
- kfree(pages);
-
- smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
- page_remove_rmap(page, true);
- spin_unlock(vmf->ptl);
-
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
-
- ret |= VM_FAULT_WRITE;
- put_page(page);
-
-out:
- return ret;
-
-out_free_pages:
- spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(&range);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg, false);
- put_page(pages[i]);
- }
- kfree(pages);
- goto out;
-}
-
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *new_page;
- struct mem_cgroup *memcg;
+ struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mmu_notifier_range range;
- gfp_t huge_gfp; /* for allocation and charge */
- vm_fault_t ret = 0;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
if (is_huge_zero_pmd(orig_pmd))
- goto alloc;
+ goto fallback;
+
spin_lock(vmf->ptl);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_unlock;
+
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- /*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
- */
+
+ /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
lock_page(page);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
unlock_page(page);
put_page(page);
- goto out_unlock;
+ return 0;
}
put_page(page);
}
+
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part.
+ */
if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- ret |= VM_FAULT_WRITE;
unlock_page(page);
- goto out_unlock;
- }
- unlock_page(page);
- get_page(page);
- spin_unlock(vmf->ptl);
-alloc:
- if (__transparent_hugepage_enabled(vma) &&
- !transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_direct_gfpmask(vma);
- new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
- } else
- new_page = NULL;
-
- if (likely(new_page)) {
- prep_transhuge_page(new_page);
- } else {
- if (!page) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- } else {
- ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
- if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- }
- put_page(page);
- }
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
-
- if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
- huge_gfp, &memcg, true))) {
- put_page(new_page);
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- if (page)
- put_page(page);
- ret |= VM_FAULT_FALLBACK;
- count_vm_event(THP_FAULT_FALLBACK);
- count_vm_event(THP_FAULT_FALLBACK_CHARGE);
- goto out;
- }
-
- count_vm_event(THP_FAULT_ALLOC);
- count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
-
- if (!page)
- clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
- else
- copy_user_huge_page(new_page, page, vmf->address,
- vma, HPAGE_PMD_NR);
- __SetPageUptodate(new_page);
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
-
- spin_lock(vmf->ptl);
- if (page)
- put_page(page);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(new_page, memcg, true);
- put_page(new_page);
- goto out_mn;
- } else {
- pmd_t entry;
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
- page_add_new_anon_rmap(new_page, vma, haddr, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- if (!page) {
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- } else {
- VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page, true);
- put_page(page);
- }
- ret |= VM_FAULT_WRITE;
+ return VM_FAULT_WRITE;
}
+
+ unlock_page(page);
spin_unlock(vmf->ptl);
-out_mn:
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
-out:
- return ret;
-out_unlock:
- spin_unlock(vmf->ptl);
- return ret;
+fallback:
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ return VM_FAULT_FALLBACK;
}
/*
- * FOLL_FORCE can write to even unwritable pmd's, but only
- * after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE or a forced COW break can write even to unwritable pmd's,
+ * but only after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
- return pmd_write(pmd) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+ return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd));
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1582,7 +1378,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
goto skip_mlock;
if (!trylock_page(page))
goto skip_mlock;
- lru_add_drain();
if (page->mapping && !PageDoubleMap(page))
mlock_vma_page(page);
unlock_page(page);
@@ -1852,8 +1647,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
@@ -1951,7 +1746,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* We don't have to worry about the ordering of src and dst
- * ptlocks because exclusive mmap_sem prevents deadlock.
+ * ptlocks because exclusive mmap_lock prevents deadlock.
*/
old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
if (old_ptl) {
@@ -2038,9 +1833,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
goto unlock;
/*
- * In case prot_numa, we are under down_read(mmap_sem). It's critical
+ * In case prot_numa, we are under mmap_read_lock(mm). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
- * which is also under down_read(mmap_sem):
+ * which is also under mmap_read_lock(mm):
*
* CPU0: CPU1:
* change_huge_pmd(prot_numa=1)
@@ -2360,15 +2155,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
atomic_inc(&page[i]._mapcount);
}
+ lock_page_memcg(page);
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */
- __dec_node_page_state(page, NR_ANON_THPS);
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */
for (i = 0; i < HPAGE_PMD_NR; i++)
atomic_dec(&page[i]._mapcount);
}
}
+ unlock_page_memcg(page);
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
@@ -2386,6 +2183,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
spinlock_t *ptl;
struct mmu_notifier_range range;
+ bool was_locked = false;
+ pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@ -2398,11 +2197,32 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* pmd against. Otherwise we can end up replacing wrong page.
*/
VM_BUG_ON(freeze && !page);
- if (page && page != pmd_page(*pmd))
- goto out;
+ if (page) {
+ VM_WARN_ON_ONCE(!PageLocked(page));
+ was_locked = true;
+ if (page != pmd_page(*pmd))
+ goto out;
+ }
+repeat:
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ if (!page) {
+ page = pmd_page(*pmd);
+ if (unlikely(!trylock_page(page))) {
+ get_page(page);
+ _pmd = *pmd;
+ spin_unlock(ptl);
+ lock_page(page);
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, _pmd))) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto repeat;
+ }
+ put_page(page);
+ }
+ }
if (PageMlocked(page))
clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
@@ -2410,6 +2230,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
+ if (!was_locked && page)
+ unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2784,7 +2606,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
- bool mlocked;
unsigned long flags;
pgoff_t end;
@@ -2797,7 +2618,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (PageAnon(head)) {
/*
- * The caller does not necessarily hold an mmap_sem that would
+ * The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
* reference to it and then lock the anon_vma for write. This
* is similar to page_lock_anon_vma_read except the write lock
@@ -2843,14 +2664,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
- mlocked = PageMlocked(head);
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
- /* Make sure the page is not on per-CPU pagevec as it takes pin */
- if (mlocked)
- lru_add_drain();
-
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irqsave(&pgdata->lru_lock, flags);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cd459155d28a..57ece74e3aae 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,7 +31,6 @@
#include <linux/cma.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
#include <asm/tlb.h>
#include <linux/io.h>
@@ -59,8 +58,8 @@ __initdata LIST_HEAD(huge_boot_pages);
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
-static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;
+static bool __initdata parsed_default_hugepagesz;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -85,7 +84,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
- * remain, give up any reservations mased on minimum size and
+ * remain, give up any reservations based on minimum size and
* free the subpool */
if (free) {
if (spool->min_hpages != -1)
@@ -133,7 +132,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
* the request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
- * a subpool minimum size must be manitained.
+ * a subpool minimum size must be maintained.
*/
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
@@ -473,7 +472,7 @@ out_of_memory:
*
* Return the number of new huge pages added to the map. This number is greater
* than or equal to zero. If file_region entries needed to be allocated for
- * this operation and we were not able to allocate, it ruturns -ENOMEM.
+ * this operation and we were not able to allocate, it returns -ENOMEM.
* region_add of regions of length 1 never allocate file_regions and cannot
* fail; region_chg will always allocate at least 1 entry and a region_add for
* 1 page will only require at most 1 entry.
@@ -988,7 +987,7 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
* We know VM_NORESERVE is not set. Therefore, there SHOULD
* be a region map for all pages. The only situation where
* there is no region map is if a hole was punched via
- * fallocate. In this case, there really are no reverves to
+ * fallocate. In this case, there really are no reserves to
* use. This situation is indicated if chg != 0.
*/
if (chg)
@@ -1519,7 +1518,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
* hugepages and clear the PG_reserved bit from all tail pages
- * too. Otherwse drivers using get_user_pages() to access tail
+ * too. Otherwise drivers using get_user_pages() to access tail
* pages may get the reference counting wrong if they see
* PG_reserved set on a tail page (despite the head page not
* having PG_reserved set). Enforcing this consistency between
@@ -3060,7 +3059,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("Hugetlb: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
}
}
@@ -3164,7 +3163,7 @@ static void hugetlb_register_node(struct node *node)
nhs->hstate_kobjs,
&per_node_hstate_attr_group);
if (err) {
- pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+ pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
@@ -3212,23 +3211,41 @@ static int __init hugetlb_init(void)
{
int i;
- if (!hugepages_supported())
+ if (!hugepages_supported()) {
+ if (hugetlb_max_hstate || default_hstate_max_huge_pages)
+ pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
return 0;
+ }
- if (!size_to_hstate(default_hstate_size)) {
- if (default_hstate_size != 0) {
- pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
- default_hstate_size, HPAGE_SIZE);
+ /*
+ * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
+ * architectures depend on setup being done here.
+ */
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ if (!parsed_default_hugepagesz) {
+ /*
+ * If we did not parse a default huge page size, set
+ * default_hstate_idx to HPAGE_SIZE hstate. And, if the
+ * number of huge pages for this default size was implicitly
+ * specified, set that here as well.
+ * Note that the implicit setting will overwrite an explicit
+ * setting. A warning will be printed in this case.
+ */
+ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
+ if (default_hstate_max_huge_pages) {
+ if (default_hstate.max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(&default_hstate),
+ 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
+ default_hstate.max_huge_pages, buf);
+ pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
+ default_hstate_max_huge_pages);
+ }
+ default_hstate.max_huge_pages =
+ default_hstate_max_huge_pages;
}
-
- default_hstate_size = HPAGE_SIZE;
- if (!size_to_hstate(default_hstate_size))
- hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
- }
- default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
- if (default_hstate_max_huge_pages) {
- if (!default_hstate.max_huge_pages)
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
}
hugetlb_cma_check();
@@ -3256,10 +3273,10 @@ static int __init hugetlb_init(void)
}
subsys_initcall(hugetlb_init);
-/* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_bad_size(void)
+/* Overwritten by architectures with more huge page sizes */
+bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
- parsed_valid_hugepagesz = false;
+ return size == HPAGE_SIZE;
}
void __init hugetlb_add_hstate(unsigned int order)
@@ -3268,7 +3285,6 @@ void __init hugetlb_add_hstate(unsigned int order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warn("hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -3289,20 +3305,29 @@ void __init hugetlb_add_hstate(unsigned int order)
parsed_hstate = h;
}
-static int __init hugetlb_nrpages_setup(char *s)
+/*
+ * hugepages command line processing
+ * hugepages normally follows a valid hugepagsz or default_hugepagsz
+ * specification. If not, ignore the hugepages value. hugepages can also
+ * be the first huge page command line option in which case it implicitly
+ * specifies the number of huge pages for the default size.
+ */
+static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
if (!parsed_valid_hugepagesz) {
- pr_warn("hugepages = %s preceded by "
- "an unsupported hugepagesz, ignoring\n", s);
+ pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
+
/*
- * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
- * so this hugepages= parameter goes to the "default hstate".
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
+ * yet, so this hugepages= parameter goes to the "default hstate".
+ * Otherwise, it goes with the previously parsed hugepagesz or
+ * default_hugepagesz.
*/
else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
@@ -3310,8 +3335,8 @@ static int __init hugetlb_nrpages_setup(char *s)
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
- return 1;
+ pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
+ return 0;
}
if (sscanf(s, "%lu", mhp) <= 0)
@@ -3329,14 +3354,102 @@ static int __init hugetlb_nrpages_setup(char *s)
return 1;
}
-__setup("hugepages=", hugetlb_nrpages_setup);
+__setup("hugepages=", hugepages_setup);
+
+/*
+ * hugepagesz command line processing
+ * A specific huge page size can only be specified once with hugepagesz.
+ * hugepagesz is followed by hugepages on the command line. The global
+ * variable 'parsed_valid_hugepagesz' is used to determine if prior
+ * hugepagesz argument was valid.
+ */
+static int __init hugepagesz_setup(char *s)
+{
+ unsigned long size;
+ struct hstate *h;
+
+ parsed_valid_hugepagesz = false;
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ h = size_to_hstate(size);
+ if (h) {
+ /*
+ * hstate for this size already exists. This is normally
+ * an error, but is allowed if the existing hstate is the
+ * default hstate. More specifically, it is only allowed if
+ * the number of huge pages for the default hstate was not
+ * previously specified.
+ */
+ if (!parsed_default_hugepagesz || h != &default_hstate ||
+ default_hstate.max_huge_pages) {
+ pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
+ return 0;
+ }
+
+ /*
+ * No need to call hugetlb_add_hstate() as hstate already
+ * exists. But, do set parsed_hstate so that a following
+ * hugepages= parameter will be applied to this hstate.
+ */
+ parsed_hstate = h;
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
+ return 1;
+}
+__setup("hugepagesz=", hugepagesz_setup);
-static int __init hugetlb_default_setup(char *s)
+/*
+ * default_hugepagesz command line input
+ * Only one instance of default_hugepagesz allowed on command line.
+ */
+static int __init default_hugepagesz_setup(char *s)
{
- default_hstate_size = memparse(s, &s);
+ unsigned long size;
+
+ parsed_valid_hugepagesz = false;
+ if (parsed_default_hugepagesz) {
+ pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
+ return 0;
+ }
+
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
+ parsed_default_hugepagesz = true;
+ default_hstate_idx = hstate_index(size_to_hstate(size));
+
+ /*
+ * The number of default huge pages (for this size) could have been
+ * specified as the first hugetlb parameter: hugepages=X. If so,
+ * then default_hstate_max_huge_pages is set. If the default huge
+ * page size is gigantic (>= MAX_ORDER), then the pages must be
+ * allocated here from bootmem allocator.
+ */
+ if (default_hstate_max_huge_pages) {
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ if (hstate_is_gigantic(&default_hstate))
+ hugetlb_hstate_alloc_pages(&default_hstate);
+ default_hstate_max_huge_pages = 0;
+ }
+
return 1;
}
-__setup("default_hugepagesz=", hugetlb_default_setup);
+__setup("default_hugepagesz=", default_hugepagesz_setup);
static unsigned int cpuset_mems_nr(unsigned int *array)
{
@@ -3352,7 +3465,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
#ifdef CONFIG_SYSCTL
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp = h->max_huge_pages;
@@ -3375,7 +3488,7 @@ out:
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(false, table, write,
@@ -3384,7 +3497,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
#ifdef CONFIG_NUMA
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(true, table, write,
buffer, length, ppos);
@@ -3392,8 +3505,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
#endif /* CONFIG_NUMA */
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp;
@@ -4466,9 +4578,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* entry could be a migration/hwpoison entry at this point, so this
* check prevents the kernel from going below assuming that we have
- * a active hugepage in pagecache. This goto expects the 2nd page fault,
- * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
- * handle it.
+ * an active hugepage in pagecache. This goto expects the 2nd page
+ * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
+ * properly handle it.
*/
if (!pte_present(entry))
goto out_mutex;
@@ -4583,7 +4695,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
(const void __user *) src_addr,
pages_per_huge_page(h), false);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
*pagep = page;
@@ -5355,8 +5467,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
* huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
- * Return: Pointer to page table or swap entry (PUD or PMD) for
- * address @addr, or NULL if a p*d_none() entry is encountered and the
+ * Return: Pointer to page table entry (PUD or PMD) for
+ * address @addr, or NULL if a !p*d_present() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
@@ -5376,20 +5488,16 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return NULL;
pud = pud_offset(p4d, addr);
- if (sz != PUD_SIZE && pud_none(*pud))
- return NULL;
- /* hugepage or swap? */
- if (pud_huge(*pud) || !pud_present(*pud))
+ if (sz == PUD_SIZE)
+ /* must be pud huge, non-present or none */
return (pte_t *)pud;
-
- pmd = pmd_offset(pud, addr);
- if (sz != PMD_SIZE && pmd_none(*pmd))
+ if (!pud_present(*pud))
return NULL;
- /* hugepage or swap? */
- if (pmd_huge(*pmd) || !pmd_present(*pmd))
- return (pte_t *)pmd;
+ /* must have a valid entry and size to go further */
- return NULL;
+ pmd = pmd_offset(pud, addr);
+ /* must be pmd huge, non-present or none */
+ return (pte_t *)pmd;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 19603302a77f..3a613c85f9ed 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -6,10 +6,10 @@
#include <linux/list.h>
#include <linux/cpumask.h>
#include <linux/mman.h>
+#include <linux/pgtable.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#ifndef INIT_MM_CONTEXT
@@ -31,7 +31,7 @@ struct mm_struct init_mm = {
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
- .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
diff --git a/mm/internal.h b/mm/internal.h
index b5634e78f01d..9886db20d94f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,18 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-extern unsigned int __do_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+void force_page_cache_readahead(struct address_space *, struct file *,
+ pgoff_t index, unsigned long nr_to_read);
+void __do_page_cache_readahead(struct address_space *, struct file *,
+ pgoff_t index, unsigned long nr_to_read,
unsigned long lookahead_size);
/*
* Submit IO for the read-ahead request in file_ra_state.
*/
-static inline unsigned long ra_submit(struct file_ra_state *ra,
+static inline void ra_submit(struct file_ra_state *ra,
struct address_space *mapping, struct file *filp)
{
- return __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
+ __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
}
/**
@@ -125,12 +127,12 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
* between functions involved in allocations, including the alloc_pages*
* family of functions.
*
- * nodemask, migratetype and high_zoneidx are initialized only once in
+ * nodemask, migratetype and highest_zoneidx are initialized only once in
* __alloc_pages_nodemask() and then never change.
*
- * zonelist, preferred_zone and classzone_idx are set first in
+ * zonelist, preferred_zone and highest_zoneidx are set first in
* __alloc_pages_nodemask() for the fast path, and might be later changed
- * in __alloc_pages_slowpath(). All other functions pass the whole strucure
+ * in __alloc_pages_slowpath(). All other functions pass the whole structure
* by a const pointer.
*/
struct alloc_context {
@@ -138,12 +140,21 @@ struct alloc_context {
nodemask_t *nodemask;
struct zoneref *preferred_zoneref;
int migratetype;
- enum zone_type high_zoneidx;
+
+ /*
+ * highest_zoneidx represents highest usable zone index of
+ * the allocation request. Due to the nature of the zone,
+ * memory on lower zone than the highest_zoneidx will be
+ * protected by lowmem_reserve[highest_zoneidx].
+ *
+ * highest_zoneidx is also used by reclaim/compaction to limit
+ * the target zone since higher zone than this index cannot be
+ * usable for this allocation request.
+ */
+ enum zone_type highest_zoneidx;
bool spread_dirty_pages;
};
-#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref)
-
/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
@@ -222,7 +233,7 @@ struct compact_control {
int order; /* order a direct compactor needs */
int migratetype; /* migratetype of direct compactor */
const unsigned int alloc_flags; /* alloc flags of a direct compactor */
- const int classzone_idx; /* zone index of a direct compactor */
+ const int highest_zoneidx; /* zone index of a direct compactor */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool no_set_skip_hint; /* Don't mark blocks for skipping */
@@ -333,7 +344,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
}
/*
- * must be called with vma's mmap_sem held for read or write, and page locked.
+ * must be called with vma's mmap_lock held for read or write, and page locked.
*/
extern void mlock_vma_page(struct page *page);
extern unsigned int munlock_vma_page(struct page *page);
@@ -402,13 +413,13 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
/*
* FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
- * anything, so we only pin the file and drop the mmap_sem if only
+ * anything, so we only pin the file and drop the mmap_lock if only
* FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
*/
if (fault_flag_allow_retry_first(flags) &&
!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
fpin = get_file(vmf->vma->vm_file);
- up_read(&vmf->vma->vm_mm->mmap_sem);
+ mmap_read_unlock(vmf->vma->vm_mm);
}
return fpin;
}
@@ -527,7 +538,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
-unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 08b43de2383b..d532c2587731 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,23 +1,33 @@
# SPDX-License-Identifier: GPL-2.0
KASAN_SANITIZE := n
-UBSAN_SANITIZE_common.o := n
-UBSAN_SANITIZE_generic.o := n
-UBSAN_SANITIZE_generic_report.o := n
-UBSAN_SANITIZE_tags.o := n
+UBSAN_SANITIZE := n
KCOV_INSTRUMENT := n
+# Disable ftrace to avoid recursion.
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE)
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
+CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack)
+CC_FLAGS_KASAN_RUNTIME += $(call cc-option, -fno-stack-protector)
+# Disable branch tracing to avoid recursion.
+CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
-CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME)
obj-$(CONFIG_KASAN) := common.o init.o report.o
obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 2906358e42f0..757d4074fe28 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -33,7 +33,6 @@
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>
-#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -613,24 +612,6 @@ void kasan_free_shadow(const struct vm_struct *vm)
}
#endif
-extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip);
-extern bool report_enabled(void);
-
-bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
-{
- unsigned long flags = user_access_save();
- bool ret = false;
-
- if (likely(report_enabled())) {
- __kasan_report(addr, size, is_write, ip);
- ret = true;
- }
-
- user_access_restore(flags);
-
- return ret;
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
static bool shadow_mapped(unsigned long addr)
{
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 56ff8885fe2e..098a7dbaced6 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -15,7 +15,6 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
#include <linux/interrupt.h>
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ce45c491ebcd..fe6be0be1f76 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -250,20 +250,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
* 3,2 - level page tables where we don't have
* puds,pmds, so pgd_populate(), pud_populate()
* is noops.
- *
- * The ifndef is required to avoid build breakage.
- *
- * With 5level-fixup.h, pgd_populate() is not nop and
- * we reference kasan_early_shadow_p4d. It's not defined
- * unless 5-level paging enabled.
- *
- * The ifndef can be dropped once all KASAN-enabled
- * architectures will switch to pgtable-nop4d.h.
*/
-#ifndef __ARCH_HAS_5LEVEL_HACK
pgd_populate(&init_mm, pgd,
lm_alias(kasan_early_shadow_p4d));
-#endif
p4d = p4d_offset(pgd, addr);
p4d_populate(&init_mm, p4d,
lm_alias(kasan_early_shadow_pud));
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index e8f37199d885..cfade6413528 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -212,8 +212,6 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
asmlinkage void kasan_unpoison_task_stack_below(const void *watermark);
void __asan_register_globals(struct kasan_global *globals, size_t size);
void __asan_unregister_globals(struct kasan_global *globals, size_t size);
-void __asan_loadN(unsigned long addr, size_t size);
-void __asan_storeN(unsigned long addr, size_t size);
void __asan_handle_no_return(void);
void __asan_alloca_poison(unsigned long addr, size_t size);
void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
@@ -228,6 +226,8 @@ void __asan_load8(unsigned long addr);
void __asan_store8(unsigned long addr);
void __asan_load16(unsigned long addr);
void __asan_store16(unsigned long addr);
+void __asan_loadN(unsigned long addr, size_t size);
+void __asan_storeN(unsigned long addr, size_t size);
void __asan_load1_noabort(unsigned long addr);
void __asan_store1_noabort(unsigned long addr);
@@ -239,6 +239,21 @@ void __asan_load8_noabort(unsigned long addr);
void __asan_store8_noabort(unsigned long addr);
void __asan_load16_noabort(unsigned long addr);
void __asan_store16_noabort(unsigned long addr);
+void __asan_loadN_noabort(unsigned long addr, size_t size);
+void __asan_storeN_noabort(unsigned long addr, size_t size);
+
+void __asan_report_load1_noabort(unsigned long addr);
+void __asan_report_store1_noabort(unsigned long addr);
+void __asan_report_load2_noabort(unsigned long addr);
+void __asan_report_store2_noabort(unsigned long addr);
+void __asan_report_load4_noabort(unsigned long addr);
+void __asan_report_store4_noabort(unsigned long addr);
+void __asan_report_load8_noabort(unsigned long addr);
+void __asan_report_store8_noabort(unsigned long addr);
+void __asan_report_load16_noabort(unsigned long addr);
+void __asan_report_store16_noabort(unsigned long addr);
+void __asan_report_load_n_noabort(unsigned long addr, size_t size);
+void __asan_report_store_n_noabort(unsigned long addr, size_t size);
void __asan_set_shadow_00(const void *addr, size_t size);
void __asan_set_shadow_f1(const void *addr, size_t size);
@@ -247,4 +262,19 @@ void __asan_set_shadow_f3(const void *addr, size_t size);
void __asan_set_shadow_f5(const void *addr, size_t size);
void __asan_set_shadow_f8(const void *addr, size_t size);
+void __hwasan_load1_noabort(unsigned long addr);
+void __hwasan_store1_noabort(unsigned long addr);
+void __hwasan_load2_noabort(unsigned long addr);
+void __hwasan_store2_noabort(unsigned long addr);
+void __hwasan_load4_noabort(unsigned long addr);
+void __hwasan_store4_noabort(unsigned long addr);
+void __hwasan_load8_noabort(unsigned long addr);
+void __hwasan_store8_noabort(unsigned long addr);
+void __hwasan_load16_noabort(unsigned long addr);
+void __hwasan_store16_noabort(unsigned long addr);
+void __hwasan_loadN_noabort(unsigned long addr, size_t size);
+void __hwasan_storeN_noabort(unsigned long addr, size_t size);
+
+void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size);
+
#endif
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 80f23c9da6b0..51ec45407a0b 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -29,6 +29,7 @@
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/sched/task_stack.h>
+#include <linux/uaccess.h>
#include <asm/sections.h>
@@ -454,7 +455,7 @@ static void print_shadow_for_address(const void *addr)
}
}
-bool report_enabled(void)
+static bool report_enabled(void)
{
if (current->kasan_depth)
return false;
@@ -479,7 +480,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
end_report(&flags);
}
-void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
+static void __kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
{
struct kasan_access_info info;
void *tagged_addr;
@@ -518,6 +520,22 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon
end_report(&flags);
}
+bool kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
+{
+ unsigned long flags = user_access_save();
+ bool ret = false;
+
+ if (likely(report_enabled())) {
+ __kasan_report(addr, size, is_write, ip);
+ ret = true;
+ }
+
+ user_access_restore(flags);
+
+ return ret;
+}
+
#ifdef CONFIG_KASAN_INLINE
/*
* With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 25b7734e7013..8a959fdd30e3 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -12,7 +12,6 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
#include <linux/interrupt.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 99d77ffb79c2..b043c40a21d4 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -28,6 +28,8 @@ enum scan_result {
SCAN_SUCCEED,
SCAN_PMD_NULL,
SCAN_EXCEED_NONE_PTE,
+ SCAN_EXCEED_SWAP_PTE,
+ SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
SCAN_PAGE_RO,
@@ -47,7 +49,6 @@ enum scan_result {
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
- SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE,
};
@@ -72,6 +73,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
+static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -291,15 +293,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
khugepaged_max_ptes_swap_store);
+static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+}
+
+static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_shared;
+
+ err = kstrtoul(buf, 10, &max_ptes_shared);
+ if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_shared = max_ptes_shared;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_shared_attr =
+ __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
+ khugepaged_max_ptes_shared_store);
+
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
+ &khugepaged_max_ptes_shared_attr.attr,
&pages_to_scan_attr.attr,
&pages_collapsed_attr.attr,
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
- &khugepaged_max_ptes_swap_attr.attr,
NULL,
};
@@ -359,6 +389,7 @@ int __init khugepaged_init(void)
khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
+ khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
return 0;
}
@@ -503,36 +534,61 @@ void __khugepaged_exit(struct mm_struct *mm)
* under mmap sem read mode). Stop here (after we
* return all pagetables will be destroyed) until
* khugepaged has finished working on the pagetables
- * under the mmap_sem.
+ * under the mmap_lock.
*/
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
}
}
static void release_pte_page(struct page *page)
{
- dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ -compound_nr(page));
unlock_page(page);
putback_lru_page(page);
}
-static void release_pte_pages(pte_t *pte, pte_t *_pte)
+static void release_pte_pages(pte_t *pte, pte_t *_pte,
+ struct list_head *compound_pagelist)
{
+ struct page *page, *tmp;
+
while (--_pte >= pte) {
pte_t pteval = *_pte;
- if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
- release_pte_page(pte_page(pteval));
+
+ page = pte_page(pteval);
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
+ !PageCompound(page))
+ release_pte_page(page);
+ }
+
+ list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
+ list_del(&page->lru);
+ release_pte_page(page);
}
}
+static bool is_refcount_suitable(struct page *page)
+{
+ int expected_refcount;
+
+ expected_refcount = total_mapcount(page);
+ if (PageSwapCache(page))
+ expected_refcount += compound_nr(page);
+
+ return page_count(page) == expected_refcount;
+}
+
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
- pte_t *pte)
+ pte_t *pte,
+ struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
@@ -558,13 +614,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
- /* TODO: teach khugepaged to collapse THP mapped with pte */
- if (PageCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+ if (page_mapcount(page) > 1 &&
+ ++shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
goto out;
}
- VM_BUG_ON_PAGE(!PageAnon(page), page);
+ if (PageCompound(page)) {
+ struct page *p;
+ page = compound_head(page);
+
+ /*
+ * Check if we have dealt with the compound page
+ * already
+ */
+ list_for_each_entry(p, compound_pagelist, lru) {
+ if (page == p)
+ goto next;
+ }
+ }
/*
* We can do it before isolate_lru_page because the
@@ -578,28 +648,30 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * The page table that maps the page has been already unlinked
+ * from the page table tree and this process cannot get
+ * an additinal pin on the page.
+ *
+ * New pins can come later if the page is shared across fork,
+ * but not from this process. The other process cannot write to
+ * the page, only trigger CoW.
*/
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ if (!is_refcount_suitable(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
}
- if (pte_write(pteval)) {
- writable = true;
- } else {
- if (PageSwapCache(page) &&
- !reuse_swap_page(page, NULL)) {
- unlock_page(page);
- result = SCAN_SWAP_CACHE_PAGE;
- goto out;
- }
+ if (!pte_write(pteval) && PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
/*
- * Page is not in the swap cache. It can be collapsed
- * into a THP.
+ * Page is in the swap cache and cannot be re-used.
+ * It cannot be collapsed into a THP.
*/
+ unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
+ goto out;
}
/*
@@ -611,16 +683,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ compound_nr(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (PageCompound(page))
+ list_add_tail(&page->lru, compound_pagelist);
+next:
/* There should be enough young pte to collapse the page */
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
+
+ if (pte_write(pteval))
+ writable = true;
}
if (likely(writable)) {
if (likely(referenced)) {
@@ -634,7 +713,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
out:
- release_pte_pages(pte, _pte);
+ release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return 0;
@@ -643,13 +722,14 @@ out:
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
struct vm_area_struct *vma,
unsigned long address,
- spinlock_t *ptl)
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist)
{
+ struct page *src_page, *tmp;
pte_t *_pte;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
- struct page *src_page;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
@@ -669,8 +749,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
- release_pte_page(src_page);
+ if (!PageCompound(src_page))
+ release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
@@ -687,6 +767,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
free_page_and_swap_cache(src_page);
}
}
+
+ list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
+ list_del(&src_page->lru);
+ release_pte_page(src_page);
+ }
}
static void khugepaged_alloc_sleep(void)
@@ -848,8 +933,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
#endif
/*
- * If mmap_sem temporarily dropped, revalidate vma
- * before taking mmap_sem.
+ * If mmap_lock temporarily dropped, revalidate vma
+ * before taking mmap_lock.
* Return 0 if succeeds, otherwise return none-zero
* value (scan code).
*/
@@ -881,7 +966,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
* Only done if khugepaged_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held,
- * but with mmap_sem held to protect against vma changes.
+ * but with mmap_lock held to protect against vma changes.
*/
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
@@ -899,11 +984,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
.pgoff = linear_page_index(vma, address),
};
- /* we only decide to swapin, if there is enough young ptes */
- if (referenced < HPAGE_PMD_NR/2) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
vmf.pte = pte_offset_map(pmd, address);
for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
vmf.pte++, vmf.address += PAGE_SIZE) {
@@ -913,9 +993,9 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
swapped_in++;
ret = do_swap_page(&vmf);
- /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
+ /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
if (ret & VM_FAULT_RETRY) {
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
@@ -936,6 +1016,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
}
vmf.pte--;
pte_unmap(vmf.pte);
+
+ /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
+ if (swapped_in)
+ lru_add_drain();
+
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
@@ -943,15 +1028,15 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
- int node, int referenced)
+ int node, int referenced, int unmapped)
{
+ LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
int isolated = 0, result = 0;
- struct mem_cgroup *memcg;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
gfp_t gfp;
@@ -962,57 +1047,56 @@ static void collapse_huge_page(struct mm_struct *mm,
gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
/*
- * Before allocating the hugepage, release the mmap_sem read lock.
+ * Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
- * sync compaction, and we do not need to hold the mmap_sem during
+ * sync compaction, and we do not need to hold the mmap_lock during
* that. We will recheck the vma after taking it again in write mode.
*/
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
new_page = khugepaged_alloc_page(hpage, gfp, node);
if (!new_page) {
result = SCAN_ALLOC_HUGE_PAGE_FAIL;
goto out_nolock;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
+ count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
goto out_nolock;
}
pmd = mm_find_pmd(mm, address);
if (!pmd) {
result = SCAN_PMD_NULL;
- mem_cgroup_cancel_charge(new_page, memcg, true);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
goto out_nolock;
}
/*
- * __collapse_huge_page_swapin always returns with mmap_sem locked.
- * If it fails, we release mmap_sem and jump out_nolock.
+ * __collapse_huge_page_swapin always returns with mmap_lock locked.
+ * If it fails, we release mmap_lock and jump out_nolock.
* Continuing to collapse causes inconsistency.
*/
- if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
- up_read(&mm->mmap_sem);
+ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+ pmd, referenced)) {
+ mmap_read_unlock(mm);
goto out_nolock;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
* Prevent all access to pagetables with the exception of
* gup_fast later handled by the ptep_clear_flush and the VM
* handled by the anon_vma lock + PG_lock.
*/
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
result = SCAN_ANY_PROCESS;
if (!mmget_still_valid(mm))
goto out;
@@ -1044,7 +1128,8 @@ static void collapse_huge_page(struct mm_struct *mm,
mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
- isolated = __collapse_huge_page_isolate(vma, address, pte);
+ isolated = __collapse_huge_page_isolate(vma, address, pte,
+ &compound_pagelist);
spin_unlock(pte_ptl);
if (unlikely(!isolated)) {
@@ -1069,7 +1154,8 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+ &compound_pagelist);
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
@@ -1087,8 +1173,6 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -1100,12 +1184,13 @@ static void collapse_huge_page(struct mm_struct *mm,
khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_up_write:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
out_nolock:
+ if (!IS_ERR_OR_NULL(*hpage))
+ mem_cgroup_uncharge(*hpage);
trace_mm_collapse_huge_page(mm, isolated, result);
return;
out:
- mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_up_write;
}
@@ -1116,7 +1201,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
+ int ret = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0;
struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
@@ -1188,12 +1274,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
- /* TODO: teach khugepaged to collapse THP mapped with pte */
- if (PageCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ if (page_mapcount(page) > 1 &&
+ ++shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
goto out_unmap;
}
+ page = compound_head(page);
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -1220,11 +1308,23 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
/*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * Here the check is racy it may see totmal_mapcount > refcount
+ * in some cases.
+ * For example, one process with one forked child process.
+ * The parent has the PMD split due to MADV_DONTNEED, then
+ * the child is trying unmap the whole PMD, but khugepaged
+ * may be scanning the parent between the child has
+ * PageDoubleMap flag cleared and dec the mapcount. So
+ * khugepaged may see total_mapcount > refcount.
+ *
+ * But such case is ephemeral we could always retry collapse
+ * later. However it may report false positive if the page
+ * has excessive GUP pins (i.e. 512). Anyway the same check
+ * will be done again later the risk seems low.
*/
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ if (!is_refcount_suitable(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1233,22 +1333,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
}
- if (writable) {
- if (referenced) {
- result = SCAN_SUCCEED;
- ret = 1;
- } else {
- result = SCAN_LACK_REFERENCED_PAGE;
- }
- } else {
+ if (!writable) {
result = SCAN_PAGE_RO;
+ } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ ret = 1;
}
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
node = khugepaged_find_target_node();
- /* collapse_huge_page will return with the mmap_sem released */
- collapse_huge_page(mm, address, hpage, node, referenced);
+ /* collapse_huge_page will return with the mmap_lock released */
+ collapse_huge_page(mm, address, hpage, node,
+ referenced, unmapped);
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
@@ -1418,7 +1517,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
if (likely(mm_slot->nr_pte_mapped_thp == 0))
return 0;
- if (!down_write_trylock(&mm->mmap_sem))
+ if (!mmap_write_trylock(mm))
return -EBUSY;
if (unlikely(khugepaged_test_exit(mm)))
@@ -1429,7 +1528,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
out:
mm_slot->nr_pte_mapped_thp = 0;
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return 0;
}
@@ -1444,11 +1543,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
* got written to. These VMAs are likely not worth investing
- * down_write(mmap_sem) as PMD-mapping is likely to be split
+ * mmap_write_lock(mm) as PMD-mapping is likely to be split
* later.
*
* Not that vma->anon_vma check is racy: it can be set up after
- * the check but before we took mmap_sem by the fault path.
+ * the check but before we took mmap_lock by the fault path.
* But page lock would prevent establishing any new ptes of the
* page, so we are safe.
*
@@ -1468,18 +1567,18 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
if (!pmd)
continue;
/*
- * We need exclusive mmap_sem to retract page table.
+ * We need exclusive mmap_lock to retract page table.
*
* We use trylock due to lock inversion: we need to acquire
- * mmap_sem while holding page lock. Fault path does it in
+ * mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
+ if (mmap_write_trylock(vma->vm_mm)) {
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
spin_unlock(ptl);
- up_write(&vma->vm_mm->mmap_sem);
+ mmap_write_unlock(vma->vm_mm);
mm_dec_nr_ptes(vma->vm_mm);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
} else {
@@ -1515,7 +1614,6 @@ static void collapse_file(struct mm_struct *mm,
struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
- struct mem_cgroup *memcg;
pgoff_t index, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1534,10 +1632,11 @@ static void collapse_file(struct mm_struct *mm,
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
+ count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
/* This will be less messy when we use multi-index entries */
do {
@@ -1547,7 +1646,6 @@ static void collapse_file(struct mm_struct *mm,
break;
xas_unlock_irq(&xas);
if (!xas_nomem(&xas, GFP_KERNEL)) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
result = SCAN_FAIL;
goto out;
}
@@ -1692,6 +1790,7 @@ static void collapse_file(struct mm_struct *mm,
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
+ putback_lru_page(page);
goto out_unlock;
}
@@ -1740,12 +1839,9 @@ out_unlock:
}
if (nr_none) {
- struct zone *zone = page_zone(new_page);
-
- __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
+ __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
if (is_shmem)
- __mod_node_page_state(zone->zone_pgdat,
- NR_SHMEM, nr_none);
+ __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
xa_locked:
@@ -1783,15 +1879,9 @@ xa_unlocked:
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
-
- if (is_shmem) {
+ if (is_shmem)
set_page_dirty(new_page);
- lru_cache_add_anon(new_page);
- } else {
- lru_cache_add_file(new_page);
- }
- count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
+ lru_cache_add(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@@ -1838,13 +1928,14 @@ xa_unlocked:
VM_BUG_ON(nr_none);
xas_unlock_irq(&xas);
- mem_cgroup_cancel_charge(new_page, memcg, true);
new_page->mapping = NULL;
}
unlock_page(new_page);
out:
VM_BUG_ON(!list_empty(&pagelist));
+ if (!IS_ERR_OR_NULL(*hpage))
+ mem_cgroup_uncharge(*hpage);
/* TODO: tracepoints */
}
@@ -1966,8 +2057,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
* the next mm on the list.
*/
vma = NULL;
- if (unlikely(!down_read_trylock(&mm->mmap_sem)))
- goto breakouterloop_mmap_sem;
+ if (unlikely(!mmap_read_trylock(mm)))
+ goto breakouterloop_mmap_lock;
if (likely(!khugepaged_test_exit(mm)))
vma = find_vma(mm, khugepaged_scan.address);
@@ -2011,7 +2102,7 @@ skip:
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
ret = 1;
khugepaged_scan_file(mm, file, pgoff, hpage);
fput(file);
@@ -2024,15 +2115,15 @@ skip:
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
if (ret)
- /* we released mmap_sem so break loop */
- goto breakouterloop_mmap_sem;
+ /* we released mmap_lock so break loop */
+ goto breakouterloop_mmap_lock;
if (progress >= pages)
goto breakouterloop;
}
}
breakouterloop:
- up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
-breakouterloop_mmap_sem:
+ mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_lock:
spin_lock(&khugepaged_mm_lock);
VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
@@ -2083,6 +2174,8 @@ static void khugepaged_do_scan(void)
barrier(); /* write khugepaged_pages_to_scan to local stack */
+ lru_add_drain_all();
+
while (progress < pages) {
if (!khugepaged_prealloc_page(&hpage, &wait))
break;
diff --git a/mm/ksm.c b/mm/ksm.c
index a558da9e7177..4102034cd55a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -442,7 +442,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
- * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
+ * takes mmap_lock briefly to serialize against them. ksm_exit() does not set
* a special flag: they can just back out as soon as mm_users goes to zero.
* ksm_test_exit() is used throughout to make this test for exit: in some
* places for correctness, in some places just to avoid unnecessary work.
@@ -542,11 +542,11 @@ static void break_cow(struct rmap_item *rmap_item)
*/
put_anon_vma(rmap_item->anon_vma);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
break_ksm(vma, addr);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
@@ -556,7 +556,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
struct vm_area_struct *vma;
struct page *page;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (!vma)
goto out;
@@ -572,7 +572,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
out:
page = NULL;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return page;
}
@@ -612,7 +612,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
* Move the old stable node to the second dimension
* queued in the hlist_dup. The invariant is that all
* dup stable_nodes in the chain->hlist point to pages
- * that are wrprotected and have the exact same
+ * that are write protected and have the exact same
* content.
*/
stable_node_chain_add_dup(dup, chain);
@@ -831,7 +831,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
* Though it's very tempting to unmerge rmap_items from stable tree rather
* than check every pte of a given vma, the locking doesn't quite work for
* that - an rmap_item is assigned to the stable tree after inserting ksm
- * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
* rmap_items from parent to child at fork time (so as not to waste time
* if exit comes before the next scan reaches it).
*
@@ -976,7 +976,7 @@ static int unmerge_and_remove_all_rmap_items(void)
for (mm_slot = ksm_scan.mm_slot;
mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
mm = mm_slot->mm;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (ksm_test_exit(mm))
break;
@@ -989,7 +989,7 @@ static int unmerge_and_remove_all_rmap_items(void)
}
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -1012,7 +1012,7 @@ static int unmerge_and_remove_all_rmap_items(void)
return 0;
error:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = &ksm_mm_head;
spin_unlock(&ksm_mmlist_lock);
@@ -1148,7 +1148,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
/*
* No need to check ksm_use_zero_pages here: we can only have a
- * zero_page here if ksm_use_zero_pages was enabled alreaady.
+ * zero_page here if ksm_use_zero_pages was enabled already.
*/
if (!is_zero_pfn(page_to_pfn(kpage))) {
get_page(kpage);
@@ -1280,7 +1280,7 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
struct vm_area_struct *vma;
int err = -EFAULT;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, rmap_item->address);
if (!vma)
goto out;
@@ -1292,11 +1292,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
/* Unstable nid is in union with stable anon_vma: remove first */
remove_rmap_item_from_tree(rmap_item);
- /* Must get reference to anon_vma while still holding mmap_sem */
+ /* Must get reference to anon_vma while still holding mmap_lock */
rmap_item->anon_vma = vma->anon_vma;
get_anon_vma(vma->anon_vma);
out:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -1608,7 +1608,7 @@ again:
* continue. All KSM pages belonging to the
* stable_node dups in a stable_node chain
* have the same content and they're
- * wrprotected at all times. Any will work
+ * write protected at all times. Any will work
* fine to continue the walk.
*/
tree_page = get_ksm_page(stable_node_any,
@@ -1843,7 +1843,7 @@ again:
* continue. All KSM pages belonging to the
* stable_node dups in a stable_node chain
* have the same content and they're
- * wrprotected at all times. Any will work
+ * write protected at all times. Any will work
* fine to continue the walk.
*/
tree_page = get_ksm_page(stable_node_any,
@@ -2001,7 +2001,7 @@ static void stable_tree_append(struct rmap_item *rmap_item,
* duplicate. page_migration could break later if rmap breaks,
* so we can as well crash here. We really need to check for
* rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
- * for other negative values as an undeflow if detected here
+ * for other negative values as an underflow if detected here
* for the first time (and not when decreasing rmap_hlist_len)
* would be sign of memory corruption in the stable_node.
*/
@@ -2110,11 +2110,19 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
if (ksm_use_zero_pages && (checksum == zero_checksum)) {
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, rmap_item->address);
- err = try_to_merge_one_page(vma, page,
- ZERO_PAGE(rmap_item->address));
- up_read(&mm->mmap_sem);
+ if (vma) {
+ err = try_to_merge_one_page(vma, page,
+ ZERO_PAGE(rmap_item->address));
+ } else {
+ /*
+ * If the vma is out of date, we do not need to
+ * continue.
+ */
+ err = 0;
+ }
+ mmap_read_unlock(mm);
/*
* In case of failure, the page was not really empty, so we
* need to continue. Otherwise we're done.
@@ -2277,7 +2285,7 @@ next_mm:
}
mm = slot->mm;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
if (ksm_test_exit(mm))
vma = NULL;
else
@@ -2311,7 +2319,7 @@ next_mm:
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return rmap_item;
}
put_page(*page);
@@ -2335,13 +2343,13 @@ next_mm:
struct mm_slot, mm_list);
if (ksm_scan.address == 0) {
/*
- * We've completed a full scan of all vmas, holding mmap_sem
+ * We've completed a full scan of all vmas, holding mmap_lock
* throughout, and found no VM_MERGEABLE: so do the same as
* __ksm_exit does to remove this mm from all our lists now.
* This applies either when cleaning up after __ksm_exit
* (but beware: we can reach here even before __ksm_exit),
* or when all VM_MERGEABLE areas have been unmapped (and
- * mmap_sem then protects against race with MADV_MERGEABLE).
+ * mmap_lock then protects against race with MADV_MERGEABLE).
*/
hash_del(&slot->link);
list_del(&slot->mm_list);
@@ -2349,12 +2357,12 @@ next_mm:
free_mm_slot(slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmdrop(mm);
} else {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
- * up_read(&mm->mmap_sem) first because after
+ * mmap_read_unlock(mm) first because after
* spin_unlock(&ksm_mmlist_lock) run, the "mm" may
* already have been freed under us by __ksm_exit()
* because the "mm_slot" is still hashed and
@@ -2528,7 +2536,7 @@ void __ksm_exit(struct mm_struct *mm)
* This process is exiting: if it's straightforward (as is the
* case when ksmd was never running), free mm_slot immediately.
* But if it's at the cursor or has rmap_items linked to it, use
- * mmap_sem to synchronize with any break_cows before pagetables
+ * mmap_lock to synchronize with any break_cows before pagetables
* are freed, and leave the mm_slot on the list for ksmd to free.
* Beware: ksm may already have noticed it exiting and freed the slot.
*/
@@ -2552,8 +2560,8 @@ void __ksm_exit(struct mm_struct *mm)
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else if (mm_slot) {
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
}
}
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 4d5294c39bba..9222910ab1cb 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -213,7 +213,7 @@ restart:
/*
* decrement nr_to_walk first so that we don't livelock if we
- * get stuck on large numbesr of LRU_RETRY items
+ * get stuck on large numbers of LRU_RETRY items
*/
if (!*nr_to_walk)
break;
diff --git a/mm/maccess.c b/mm/maccess.c
index 3ca8d97e5010..88845eda5047 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,103 +1,128 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Access kernel memory without faulting.
+ * Access kernel or user memory without faulting.
*/
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
-static __always_inline long
-probe_read_common(void *dst, const void __user *src, size_t size)
+bool __weak probe_kernel_read_allowed(const void *unsafe_src, size_t size)
{
- long ret;
+ return true;
+}
+
+#ifdef HAVE_GET_KERNEL_NOFAULT
+
+#define probe_kernel_read_loop(dst, src, len, type, err_label) \
+ while (len >= sizeof(type)) { \
+ __get_kernel_nofault(dst, src, type, err_label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
+
+long probe_kernel_read(void *dst, const void *src, size_t size)
+{
+ if (!probe_kernel_read_allowed(src, size))
+ return -ERANGE;
pagefault_disable();
- ret = __copy_from_user_inatomic(dst, src, size);
+ probe_kernel_read_loop(dst, src, size, u64, Efault);
+ probe_kernel_read_loop(dst, src, size, u32, Efault);
+ probe_kernel_read_loop(dst, src, size, u16, Efault);
+ probe_kernel_read_loop(dst, src, size, u8, Efault);
pagefault_enable();
+ return 0;
+Efault:
+ pagefault_enable();
+ return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(probe_kernel_read);
+
+#define probe_kernel_write_loop(dst, src, len, type, err_label) \
+ while (len >= sizeof(type)) { \
+ __put_kernel_nofault(dst, src, type, err_label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
- return ret ? -EFAULT : 0;
+long probe_kernel_write(void *dst, const void *src, size_t size)
+{
+ pagefault_disable();
+ probe_kernel_write_loop(dst, src, size, u64, Efault);
+ probe_kernel_write_loop(dst, src, size, u32, Efault);
+ probe_kernel_write_loop(dst, src, size, u16, Efault);
+ probe_kernel_write_loop(dst, src, size, u8, Efault);
+ pagefault_enable();
+ return 0;
+Efault:
+ pagefault_enable();
+ return -EFAULT;
}
-static __always_inline long
-probe_write_common(void __user *dst, const void *src, size_t size)
+long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
- long ret;
+ const void *src = unsafe_addr;
+
+ if (unlikely(count <= 0))
+ return 0;
+ if (!probe_kernel_read_allowed(unsafe_addr, count))
+ return -ERANGE;
pagefault_disable();
- ret = __copy_to_user_inatomic(dst, src, size);
+ do {
+ __get_kernel_nofault(dst, src, u8, Efault);
+ dst++;
+ src++;
+ } while (dst[-1] && src - unsafe_addr < count);
pagefault_enable();
- return ret ? -EFAULT : 0;
+ dst[-1] = '\0';
+ return src - unsafe_addr;
+Efault:
+ pagefault_enable();
+ dst[-1] = '\0';
+ return -EFAULT;
}
-
+#else /* HAVE_GET_KERNEL_NOFAULT */
/**
- * probe_kernel_read(): safely attempt to read from a kernel-space location
+ * probe_kernel_read(): safely attempt to read from kernel-space
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
*
- * Safely read from address @src to the buffer at @dst. If a kernel fault
- * happens, handle that and return -EFAULT.
+ * Safely read from kernel address @src to the buffer at @dst. If a kernel
+ * fault happens, handle that and return -EFAULT. If @src is not a valid kernel
+ * address, return -ERANGE.
*
* We ensure that the copy_from_user is executed in atomic context so that
- * do_page_fault() doesn't attempt to take mmap_sem. This makes
+ * do_page_fault() doesn't attempt to take mmap_lock. This makes
* probe_kernel_read() suitable for use within regions where the caller
- * already holds mmap_sem, or other locks which nest inside mmap_sem.
- *
- * probe_kernel_read_strict() is the same as probe_kernel_read() except for
- * the case where architectures have non-overlapping user and kernel address
- * ranges: probe_kernel_read_strict() will additionally return -EFAULT for
- * probing memory on a user address range where probe_user_read() is supposed
- * to be used instead.
+ * already holds mmap_lock, or other locks which nest inside mmap_lock.
*/
-
-long __weak probe_kernel_read(void *dst, const void *src, size_t size)
- __attribute__((alias("__probe_kernel_read")));
-
-long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size)
- __attribute__((alias("__probe_kernel_read")));
-
-long __probe_kernel_read(void *dst, const void *src, size_t size)
+long probe_kernel_read(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
+ if (!probe_kernel_read_allowed(src, size))
+ return -ERANGE;
+
set_fs(KERNEL_DS);
- ret = probe_read_common(dst, (__force const void __user *)src, size);
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, (__force const void __user *)src,
+ size);
+ pagefault_enable();
set_fs(old_fs);
- return ret;
+ if (ret)
+ return -EFAULT;
+ return 0;
}
EXPORT_SYMBOL_GPL(probe_kernel_read);
/**
- * probe_user_read(): safely attempt to read from a user-space location
- * @dst: pointer to the buffer that shall take the data
- * @src: address to read from. This must be a user address.
- * @size: size of the data chunk
- *
- * Safely read from user address @src to the buffer at @dst. If a kernel fault
- * happens, handle that and return -EFAULT.
- */
-
-long __weak probe_user_read(void *dst, const void __user *src, size_t size)
- __attribute__((alias("__probe_user_read")));
-
-long __probe_user_read(void *dst, const void __user *src, size_t size)
-{
- long ret = -EFAULT;
- mm_segment_t old_fs = get_fs();
-
- set_fs(USER_DS);
- if (access_ok(src, size))
- ret = probe_read_common(dst, src, size);
- set_fs(old_fs);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(probe_user_read);
-
-/**
* probe_kernel_write(): safely attempt to write to a location
* @dst: address to write to
* @src: pointer to the data that shall be written
@@ -106,52 +131,25 @@ EXPORT_SYMBOL_GPL(probe_user_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-
-long __weak probe_kernel_write(void *dst, const void *src, size_t size)
- __attribute__((alias("__probe_kernel_write")));
-
-long __probe_kernel_write(void *dst, const void *src, size_t size)
+long probe_kernel_write(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = probe_write_common((__force void __user *)dst, src, size);
- set_fs(old_fs);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(probe_kernel_write);
-
-/**
- * probe_user_write(): safely attempt to write to a user-space location
- * @dst: address to write to
- * @src: pointer to the data that shall be written
- * @size: size of the data chunk
- *
- * Safely write to address @dst from the buffer at @src. If a kernel fault
- * happens, handle that and return -EFAULT.
- */
-
-long __weak probe_user_write(void __user *dst, const void *src, size_t size)
- __attribute__((alias("__probe_user_write")));
-
-long __probe_user_write(void __user *dst, const void *src, size_t size)
-{
- long ret = -EFAULT;
- mm_segment_t old_fs = get_fs();
-
- set_fs(USER_DS);
- if (access_ok(dst, size))
- ret = probe_write_common(dst, src, size);
+ pagefault_disable();
+ ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+ pagefault_enable();
set_fs(old_fs);
- return ret;
+ if (ret)
+ return -EFAULT;
+ return 0;
}
-EXPORT_SYMBOL_GPL(probe_user_write);
/**
- * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
+ * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe
+ * address.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
* @unsafe_addr: Unsafe address.
@@ -161,27 +159,14 @@ EXPORT_SYMBOL_GPL(probe_user_write);
*
* On success, returns the length of the string INCLUDING the trailing NUL.
*
- * If access fails, returns -EFAULT (some data may have been copied
- * and the trailing NUL added).
+ * If access fails, returns -EFAULT (some data may have been copied and the
+ * trailing NUL added). If @unsafe_addr is not a valid kernel address, return
+ * -ERANGE.
*
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
- *
- * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except
- * for the case where architectures have non-overlapping user and kernel address
- * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for
- * probing memory on a user address range where strncpy_from_unsafe_user() is
- * supposed to be used instead.
*/
-
-long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
- __attribute__((alias("__strncpy_from_unsafe")));
-
-long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
- long count)
- __attribute__((alias("__strncpy_from_unsafe")));
-
-long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
mm_segment_t old_fs = get_fs();
const void *src = unsafe_addr;
@@ -189,6 +174,8 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
if (unlikely(count <= 0))
return 0;
+ if (!probe_kernel_read_allowed(unsafe_addr, count))
+ return -ERANGE;
set_fs(KERNEL_DS);
pagefault_disable();
@@ -203,9 +190,66 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
return ret ? -EFAULT : src - unsafe_addr;
}
+#endif /* HAVE_GET_KERNEL_NOFAULT */
+
+/**
+ * probe_user_read(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_user_read(void *dst, const void __user *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(src, size)) {
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+ }
+ set_fs(old_fs);
+
+ if (ret)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(probe_user_read);
+
+/**
+ * probe_user_write(): safely attempt to write to a user-space location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long probe_user_write(void __user *dst, const void *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(dst, size)) {
+ pagefault_disable();
+ ret = __copy_to_user_inatomic(dst, src, size);
+ pagefault_enable();
+ }
+ set_fs(old_fs);
+
+ if (ret)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(probe_user_write);
/**
- * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user
+ * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user
* address.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
@@ -222,7 +266,7 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
*/
-long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count)
{
mm_segment_t old_fs = get_fs();
@@ -248,7 +292,7 @@ long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
}
/**
- * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL.
+ * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL.
* @unsafe_addr: The string to measure.
* @count: Maximum count (including NUL)
*
@@ -263,7 +307,7 @@ long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
* Unlike strnlen_user, this can be used from IRQ handler etc. because
* it disables pagefaults.
*/
-long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
+long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
mm_segment_t old_fs = get_fs();
int ret;
diff --git a/mm/madvise.c b/mm/madvise.c
index 4bb30ed6c8d2..dd1d43cf026d 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -27,6 +27,7 @@
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
+#include <linux/sched/mm.h>
#include <asm/tlb.h>
@@ -39,7 +40,7 @@ struct madvise_walk_private {
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_sem for writing. Others, which simply traverse vmas, need
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
* to only take it for reading.
*/
static int madvise_need_mmap_write(int behavior)
@@ -164,7 +165,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
success:
/*
- * vm_flags is protected by the mmap_sem held in write mode.
+ * vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
@@ -284,16 +285,16 @@ static long madvise_willneed(struct vm_area_struct *vma,
* Filesystem's fadvise may need to take various locks. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
- * mmap_sem.
+ * mmap_lock.
*/
- *prev = NULL; /* tell sys_madvise we drop mmap_sem */
+ *prev = NULL; /* tell sys_madvise we drop mmap_lock */
get_file(file);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
return 0;
}
@@ -767,9 +768,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
- *prev = NULL; /* mmap_sem has been dropped, prev is stale */
+ *prev = NULL; /* mmap_lock has been dropped, prev is stale */
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
vma = find_vma(current->mm, start);
if (!vma)
return -ENOMEM;
@@ -790,7 +791,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
- * vma was splitted while the mmap_sem was
+ * vma was splitted while the mmap_lock was
* released the effect of the concurrent
* operation may not cause madvise() to
* have an undefined result. There may be an
@@ -825,7 +826,7 @@ static long madvise_remove(struct vm_area_struct *vma,
int error;
struct file *f;
- *prev = NULL; /* tell sys_madvise we drop mmap_sem */
+ *prev = NULL; /* tell sys_madvise we drop mmap_lock */
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
@@ -846,18 +847,18 @@ static long madvise_remove(struct vm_area_struct *vma,
* Filesystem's fallocate may need to take i_mutex. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
- * mmap_sem.
+ * mmap_lock.
*/
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
- /* mmap_sem was not released by userfaultfd_remove() */
- up_read(&current->mm->mmap_sem);
+ /* mmap_lock was not released by userfaultfd_remove() */
+ mmap_read_unlock(current->mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
return error;
}
@@ -1088,10 +1089,27 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
+
+ /*
+ * We may have stolen the mm from another process
+ * that is undergoing core dumping.
+ *
+ * Right now that's io_ring, in the future it may
+ * be remote process management and not "current"
+ * at all.
+ *
+ * We need to fix core dumping to not do this,
+ * but for now we have the mmget_still_valid()
+ * model.
+ */
+ if (!mmget_still_valid(current->mm)) {
+ mmap_write_unlock(current->mm);
+ return -EINTR;
+ }
} else {
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
}
/*
@@ -1135,15 +1153,15 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
goto out;
if (prev)
vma = prev->vm_next;
- else /* madvise_remove dropped mmap_sem */
+ else /* madvise_remove dropped mmap_lock */
vma = find_vma(current->mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
else
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
return error;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index c79ba6f9920c..39aceafc57f6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -78,7 +78,7 @@
* * memblock_alloc*() - these functions return the **virtual** address
* of the allocated memory.
*
- * Note, that both API variants use implict assumptions about allowed
+ * Note, that both API variants use implicit assumptions about allowed
* memory ranges and the fallback methods. Consult the documentation
* of memblock_alloc_internal() and memblock_alloc_range_nid()
* functions for more elaborate description.
@@ -620,7 +620,7 @@ repeat:
* area, insert that portion.
*/
if (rbase > base) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
WARN_ON(flags != rgn->flags);
@@ -1197,7 +1197,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
*idx = ULLONG_MAX;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
* Common iterator interface used to define for_each_mem_pfn_range().
*/
@@ -1207,13 +1206,15 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
{
struct memblock_type *type = &memblock.memory;
struct memblock_region *r;
+ int r_nid;
while (++*idx < type->cnt) {
r = &type->regions[*idx];
+ r_nid = memblock_get_region_node(r);
if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
continue;
- if (nid == MAX_NUMNODES || nid == r->nid)
+ if (nid == MAX_NUMNODES || nid == r_nid)
break;
}
if (*idx >= type->cnt) {
@@ -1226,7 +1227,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
if (out_end_pfn)
*out_end_pfn = PFN_DOWN(r->base + r->size);
if (out_nid)
- *out_nid = r->nid;
+ *out_nid = r_nid;
}
/**
@@ -1245,6 +1246,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid)
{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
int start_rgn, end_rgn;
int i, ret;
@@ -1256,9 +1258,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
memblock_set_region_node(&type->regions[i], nid);
memblock_merge_regions(type);
+#endif
return 0;
}
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/**
* __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
@@ -1797,7 +1800,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr)
return !memblock_is_nomap(&memblock.memory.regions[i]);
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
unsigned long *start_pfn, unsigned long *end_pfn)
{
@@ -1810,9 +1812,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
*start_pfn = PFN_DOWN(type->regions[mid].base);
*end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
- return type->regions[mid].nid;
+ return memblock_get_region_node(&type->regions[mid]);
}
-#endif
/**
* memblock_is_region_memory - check if a region is a subset of memory
@@ -1903,7 +1904,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
size = rgn->size;
end = base + size - 1;
flags = rgn->flags;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5beea03dd58a..0b38b6ad547d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -83,9 +83,9 @@ static bool cgroup_memory_nokmem;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-int do_swap_account __read_mostly;
+bool cgroup_memory_noswap __read_mostly;
#else
-#define do_swap_account 0
+#define cgroup_memory_noswap 1
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -95,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
#define THRESHOLDS_EVENTS_TARGET 128
@@ -834,25 +834,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- bool compound, int nr_pages)
+ int nr_pages)
{
- /*
- * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
- * counted as CACHE even if it's on ANON LRU.
- */
- if (PageAnon(page))
- __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
- else {
- __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
- if (PageSwapBacked(page))
- __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
- }
-
- if (compound) {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
- }
-
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__count_memcg_events(memcg, PGPGIN, 1);
@@ -1218,9 +1201,8 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
* @page: the page
* @pgdat: pgdat of the page
*
- * This function is only safe when following the LRU page isolation
- * and putback protocol: the LRU lock must be held, and the page must
- * either be PageLRU() or the caller must have isolated/allocated it.
+ * This function relies on page->mem_cgroup being stable - see the
+ * access rules in commit_charge().
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
@@ -1314,7 +1296,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
limit = READ_ONCE(memcg->memsw.max);
- if (count <= limit)
+ if (count < limit)
margin = min(margin, limit - count);
else
margin = 0;
@@ -1389,10 +1371,10 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
*/
seq_buf_printf(&s, "anon %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS) *
+ (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
PAGE_SIZE);
seq_buf_printf(&s, "file %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+ (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
(u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
@@ -1418,15 +1400,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
(u64)memcg_page_state(memcg, NR_WRITEBACK) *
PAGE_SIZE);
- /*
- * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
- * with the NR_ANON_THP vm counter, but right now it's a pain in the
- * arse because it requires migrating the work out of rmap to a place
- * where the page->mem_cgroup is set up and stable.
- */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
seq_buf_printf(&s, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_ANON_THPS) *
+ HPAGE_PMD_SIZE);
+#endif
for (i = 0; i < NR_LRU_LISTS; i++)
seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
@@ -1451,6 +1429,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
memcg_page_state(memcg, WORKINGSET_REFAULT));
seq_buf_printf(&s, "workingset_activate %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_restore %lu\n",
+ memcg_page_state(memcg, WORKINGSET_RESTORE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
@@ -1979,6 +1959,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
*/
struct mem_cgroup *lock_page_memcg(struct page *page)
{
+ struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1998,7 +1979,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
if (mem_cgroup_disabled())
return NULL;
again:
- memcg = page->mem_cgroup;
+ memcg = head->mem_cgroup;
if (unlikely(!memcg))
return NULL;
@@ -2006,7 +1987,7 @@ again:
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != page->mem_cgroup) {
+ if (memcg != head->mem_cgroup) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@@ -2049,7 +2030,9 @@ void __unlock_page_memcg(struct mem_cgroup *memcg)
*/
void unlock_page_memcg(struct page *page)
{
- __unlock_page_memcg(page->mem_cgroup);
+ struct page *head = compound_head(page);
+
+ __unlock_page_memcg(head->mem_cgroup);
}
EXPORT_SYMBOL(unlock_page_memcg);
@@ -2250,7 +2233,8 @@ static void reclaim_high(struct mem_cgroup *memcg,
gfp_t gfp_mask)
{
do {
- if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
+ if (page_counter_read(&memcg->memory) <=
+ READ_ONCE(memcg->memory.high))
continue;
memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
@@ -2319,41 +2303,64 @@ static void high_work_func(struct work_struct *work)
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14
-/*
- * Get the number of jiffies that we should penalise a mischievous cgroup which
- * is exceeding its memory.high by checking both it and its ancestors.
- */
-static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
- unsigned int nr_pages)
+static u64 calculate_overage(unsigned long usage, unsigned long high)
{
- unsigned long penalty_jiffies;
- u64 max_overage = 0;
+ u64 overage;
- do {
- unsigned long usage, high;
- u64 overage;
+ if (usage <= high)
+ return 0;
- usage = page_counter_read(&memcg->memory);
- high = READ_ONCE(memcg->high);
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
- if (usage <= high)
- continue;
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ return div64_u64(overage, high);
+}
- /*
- * Prevent division by 0 in overage calculation by acting as if
- * it was a threshold of 1 page
- */
- high = max(high, 1UL);
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->memory),
+ READ_ONCE(memcg->memory.high));
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return max_overage;
+}
- overage = usage - high;
- overage <<= MEMCG_DELAY_PRECISION_SHIFT;
- overage = div64_u64(overage, high);
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
- if (overage > max_overage)
- max_overage = overage;
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->swap),
+ READ_ONCE(memcg->swap.high));
+ if (overage)
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
+ max_overage = max(overage, max_overage);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
+ return max_overage;
+}
+
+/*
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
+ */
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ u64 max_overage)
+{
+ unsigned long penalty_jiffies;
+
if (!max_overage)
return 0;
@@ -2377,14 +2384,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
* MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
* larger the current charge patch is than that.
*/
- penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
-
- /*
- * Clamp the max delay per usermode return so as to still keep the
- * application moving forwards and also permit diagnostics, albeit
- * extremely slowly.
- */
- return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
}
/*
@@ -2409,7 +2409,18 @@ void mem_cgroup_handle_over_high(void)
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
*/
- penalty_jiffies = calculate_high_delay(memcg, nr_pages);
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
+ mem_find_max_overage(memcg));
+
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+ swap_find_max_overage(memcg));
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
@@ -2594,12 +2605,32 @@ done_restock:
* reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
- /* Don't bother a random interrupted task */
- if (in_interrupt()) {
+ bool mem_high, swap_high;
+
+ mem_high = page_counter_read(&memcg->memory) >
+ READ_ONCE(memcg->memory.high);
+ swap_high = page_counter_read(&memcg->swap) >
+ READ_ONCE(memcg->swap.high);
+
+ /* Don't bother a random interrupted task */
+ if (in_interrupt()) {
+ if (mem_high) {
schedule_work(&memcg->high_work);
break;
}
+ continue;
+ }
+
+ if (mem_high || swap_high) {
+ /*
+ * The allocating tasks in this cgroup will need to do
+ * reclaim or be throttled to prevent further growth
+ * of the memory or swap footprints.
+ *
+ * Target some best-effort fairness between the tasks,
+ * and distribute reclaim work and delay penalties
+ * based on how much each task is actually allocating.
+ */
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
@@ -2609,6 +2640,7 @@ done_restock:
return 0;
}
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
@@ -2620,70 +2652,20 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
css_put_many(&memcg->css, nr_pages);
}
+#endif
-static void lock_page_lru(struct page *page, int *isolated)
-{
- pg_data_t *pgdat = page_pgdat(page);
-
- spin_lock_irq(&pgdat->lru_lock);
- if (PageLRU(page)) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- *isolated = 1;
- } else
- *isolated = 0;
-}
-
-static void unlock_page_lru(struct page *page, int isolated)
-{
- pg_data_t *pgdat = page_pgdat(page);
-
- if (isolated) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
- add_page_to_lru_list(page, lruvec, page_lru(page));
- }
- spin_unlock_irq(&pgdat->lru_lock);
-}
-
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
- int isolated;
-
VM_BUG_ON_PAGE(page->mem_cgroup, page);
-
/*
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
- * may already be on some other mem_cgroup's LRU. Take care of it.
- */
- if (lrucare)
- lock_page_lru(page, &isolated);
-
- /*
- * Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point:
- *
- * - the page is uncharged
+ * Any of the following ensures page->mem_cgroup stability:
*
- * - the page is off-LRU
- *
- * - an anonymous fault has exclusive page access, except for
- * a locked page table
- *
- * - a page cache insertion, a swapin fault, or a migration
- * have the page locked
+ * - the page lock
+ * - LRU isolation
+ * - lock_page_memcg()
+ * - exclusive reference
*/
page->mem_cgroup = memcg;
-
- if (lrucare)
- unlock_page_lru(page, isolated);
}
#ifdef CONFIG_MEMCG_KMEM
@@ -2802,7 +2784,12 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
static inline bool memcg_kmem_bypass(void)
{
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ if (in_interrupt())
+ return true;
+
+ /* Allow remote memcg charging in kthread contexts. */
+ if ((!current->mm || (current->flags & PF_KTHREAD)) &&
+ !current->active_memcg)
return true;
return false;
}
@@ -3015,8 +3002,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++)
head[i].mem_cgroup = head->mem_cgroup;
-
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3201,7 +3186,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
* Test whether @memcg has children, dead or alive. Note that this
* function doesn't care whether @memcg has use_hierarchy enabled and
* returns %true if there are child csses according to the cgroup
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
*/
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
@@ -3299,8 +3284,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- val = memcg_page_state(memcg, MEMCG_CACHE) +
- memcg_page_state(memcg, MEMCG_RSS);
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
+ memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
val += memcg_page_state(memcg, MEMCG_SWAP);
} else {
@@ -3688,7 +3673,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
+ int nid, unsigned int lru_mask, bool tree)
{
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
unsigned long nr = 0;
@@ -3699,13 +3684,17 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ if (tree)
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ else
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
}
return nr;
}
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
+ unsigned int lru_mask,
+ bool tree)
{
unsigned long nr = 0;
enum lru_list lru;
@@ -3713,7 +3702,10 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ if (tree)
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ else
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
}
return nr;
}
@@ -3733,34 +3725,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
};
const struct numa_stat *stat;
int nid;
- unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
- seq_printf(m, "%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+ seq_printf(m, "%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ false));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, false));
seq_putc(m, '\n');
}
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- struct mem_cgroup *iter;
-
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_node_nr_lru_pages(
- iter, nid, stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ true));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, true));
seq_putc(m, '\n');
}
@@ -3769,9 +3755,11 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
#endif /* CONFIG_NUMA */
static const unsigned int memcg1_stats[] = {
- MEMCG_CACHE,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
+ NR_FILE_PAGES,
+ NR_ANON_MAPPED,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ NR_ANON_THPS,
+#endif
NR_SHMEM,
NR_FILE_MAPPED,
NR_FILE_DIRTY,
@@ -3782,7 +3770,9 @@ static const unsigned int memcg1_stats[] = {
static const char *const memcg1_stat_names[] = {
"cache",
"rss",
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"rss_huge",
+#endif
"shmem",
"mapped_file",
"dirty",
@@ -3808,11 +3798,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state_local(memcg, memcg1_stats[i]) *
- PAGE_SIZE);
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memcg1_stats[i] == NR_ANON_THPS)
+ nr *= HPAGE_PMD_NR;
+#endif
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -3858,23 +3853,17 @@ static int memcg_stat_show(struct seq_file *m, void *v)
{
pg_data_t *pgdat;
struct mem_cgroup_per_node *mz;
- struct zone_reclaim_stat *rstat;
- unsigned long recent_rotated[2] = {0, 0};
- unsigned long recent_scanned[2] = {0, 0};
+ unsigned long anon_cost = 0;
+ unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
- rstat = &mz->lruvec.reclaim_stat;
- recent_rotated[0] += rstat->recent_rotated[0];
- recent_rotated[1] += rstat->recent_rotated[1];
- recent_scanned[0] += rstat->recent_scanned[0];
- recent_scanned[1] += rstat->recent_scanned[1];
+ anon_cost += mz->lruvec.anon_cost;
+ file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
+ seq_printf(m, "file_cost %lu\n", file_cost);
}
#endif
@@ -4330,7 +4319,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
- /* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
@@ -4338,7 +4326,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
- READ_ONCE(memcg->high));
+ READ_ONCE(memcg->memory.high));
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4850,7 +4838,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
* limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
* memory-controlled cgroups to 64k.
*
- * However, there usually are many references to the oflline CSS after
+ * However, there usually are many references to the offline CSS after
* the cgroup has been destroyed, such as page cache or reclaimable
* slab objects, that don't need to hang on to the ID. We want to keep
* those dead CSS from occupying IDs, or we might quickly exhaust the
@@ -4990,19 +4978,22 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
unsigned int size;
int node;
int __maybe_unused i;
+ long error = -ENOMEM;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return NULL;
+ return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
1, MEM_CGROUP_ID_MAX,
GFP_KERNEL);
- if (memcg->id.id < 0)
+ if (memcg->id.id < 0) {
+ error = memcg->id.id;
goto fail;
+ }
memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
if (!memcg->vmstats_local)
@@ -5046,7 +5037,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
fail:
mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
- return NULL;
+ return ERR_PTR(error);
}
static struct cgroup_subsys_state * __ref
@@ -5057,11 +5048,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
long error = -ENOMEM;
memcg = mem_cgroup_alloc();
- if (!memcg)
- return ERR_PTR(error);
+ if (IS_ERR(memcg))
+ return ERR_CAST(memcg);
- WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
@@ -5108,7 +5100,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
fail:
mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(error);
}
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
@@ -5213,8 +5205,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
- WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg);
}
@@ -5305,8 +5298,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), swp_offset(ent));
- if (do_memsw_account())
- entry->val = ent.val;
+ entry->val = ent.val;
return page;
}
@@ -5340,8 +5332,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
page = find_get_entry(mapping, pgoff);
if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- if (do_memsw_account())
- *entry = swp;
+ *entry = swp;
page = find_get_page(swap_address_space(swp),
swp_offset(swp));
}
@@ -5372,10 +5363,8 @@ static int mem_cgroup_move_account(struct page *page,
{
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
- unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
- bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5393,30 +5382,47 @@ static int mem_cgroup_move_account(struct page *page,
if (page->mem_cgroup != from)
goto out_unlock;
- anon = PageAnon(page);
-
pgdat = page_pgdat(page);
from_vec = mem_cgroup_lruvec(from, pgdat);
to_vec = mem_cgroup_lruvec(to, pgdat);
- spin_lock_irqsave(&from->move_lock, flags);
+ lock_page_memcg(page);
- if (!anon && page_mapped(page)) {
- __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
- }
+ if (PageAnon(page)) {
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
+ if (PageTransHuge(page)) {
+ __mod_lruvec_state(from_vec, NR_ANON_THPS,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_THPS,
+ nr_pages);
+ }
- /*
- * move_lock grabbed above and caller set from->moving_account, so
- * mod_memcg_page_state will serialize updates to PageDirty.
- * So mapping should be stable for dirty pages.
- */
- if (!anon && PageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ }
+ } else {
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
- if (mapping_cap_account_dirty(mapping)) {
- __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
+ if (PageSwapBacked(page)) {
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
+ }
+
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
+ }
+
+ if (PageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
+ nr_pages);
+ }
}
}
@@ -5426,22 +5432,30 @@ static int mem_cgroup_move_account(struct page *page,
}
/*
+ * All state has been migrated, let's switch to the new memcg.
+ *
* It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
+ * is referenced, charged, isolated, and locked: we can't race
+ * with (un)charging, migration, LRU putback, or anything else
+ * that would rely on a stable page->mem_cgroup.
+ *
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * to save space. As soon as we switch page->mem_cgroup to a
+ * new memcg that isn't locked, the above state can change
+ * concurrently again. Make sure we're truly done with it.
*/
+ smp_mb();
- /* caller should have done css_get */
- page->mem_cgroup = to;
+ page->mem_cgroup = to; /* caller should have done css_get */
- spin_unlock_irqrestore(&from->move_lock, flags);
+ __unlock_page_memcg(from);
ret = 0;
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
+ mem_cgroup_charge_statistics(to, page, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
@@ -5600,9 +5614,9 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
precharge = mc.precharge;
mc.precharge = 0;
@@ -5885,9 +5899,9 @@ static void mem_cgroup_move_charge(void)
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
retry:
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
/*
- * Someone who are holding the mmap_sem might be waiting in
+ * Someone who are holding the mmap_lock might be waiting in
* waitq. So we cancel all extra charges, wake up all waiters,
* and retry. Because we cancel precharges, we might not be able
* to move enough charges, but moving charge is a best-effort
@@ -5904,7 +5918,7 @@ retry:
walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
NULL);
- up_read(&mc.mm->mmap_sem);
+ mmap_read_unlock(mc.mm);
atomic_dec(&mc.from->moving_account);
}
@@ -6012,7 +6026,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
- return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -6029,7 +6044,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
- WRITE_ONCE(memcg->high, high);
+ page_counter_set_high(&memcg->memory, high);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -6224,7 +6239,6 @@ static struct cftype memory_files[] = {
},
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
{
@@ -6427,125 +6441,63 @@ out:
}
/**
- * mem_cgroup_try_charge - try charging a page
+ * mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
- * @memcgp: charged memcg return
- * @compound: charge the page as compound or small page
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
*
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
- * Otherwise, an error code is returned.
- *
- * After page->mapping has been set up, the caller must finalize the
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ * Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound)
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
+ unsigned int nr_pages = hpage_nr_pages(page);
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
if (mem_cgroup_disabled())
goto out;
if (PageSwapCache(page)) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id;
+
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. The USED bit is protected by
- * the page lock, which serializes swap cache removal, which
+ * already charged pages, too. page->mem_cgroup is protected
+ * by the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (compound_head(page)->mem_cgroup)
goto out;
- if (do_swap_account) {
- swp_entry_t ent = { .val = page_private(page), };
- unsigned short id = lookup_swap_cgroup_id(ent);
-
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
+ id = lookup_swap_cgroup_id(ent);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
}
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
ret = try_charge(memcg, gfp_mask, nr_pages);
+ if (ret)
+ goto out_put;
- css_put(&memcg->css);
-out:
- *memcgp = memcg;
- return ret;
-}
-
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound)
-{
- struct mem_cgroup *memcg;
- int ret;
-
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
- memcg = *memcgp;
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
- return ret;
-}
-
-/**
- * mem_cgroup_commit_charge - commit a page charge
- * @page: page to charge
- * @memcg: memcg to charge the page to
- * @lrucare: page might be on LRU already
- * @compound: charge the page as compound or small page
- *
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
- * after page->mapping has been set up. This must happen atomically
- * as part of the page instantiation, i.e. under the page table lock
- * for anonymous pages, under the page lock for page and swap cache.
- *
- * In addition, the page must not be on the LRU during the commit, to
- * prevent racing with task migration. If it might be, use @lrucare.
- *
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
- */
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare, bool compound)
-{
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
-
- VM_BUG_ON_PAGE(!page->mapping, page);
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
-
- if (mem_cgroup_disabled())
- return;
- /*
- * Swap faults will attempt to charge the same page multiple
- * times. But reuse_swap_page() might have removed the page
- * from swapcache already, so we can't check PageSwapCache().
- */
- if (!memcg)
- return;
-
- commit_charge(page, memcg, lrucare);
+ commit_charge(page, memcg);
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
- if (do_memsw_account() && PageSwapCache(page)) {
+ if (PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
@@ -6554,42 +6506,18 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
*/
mem_cgroup_uncharge_swap(entry, nr_pages);
}
-}
-/**
- * mem_cgroup_cancel_charge - cancel a page charge
- * @page: page to charge
- * @memcg: memcg to charge the page to
- * @compound: charge the page as compound or small page
- *
- * Cancel a charge transaction started by mem_cgroup_try_charge().
- */
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
- bool compound)
-{
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
-
- if (mem_cgroup_disabled())
- return;
- /*
- * Swap faults will attempt to charge the same page multiple
- * times. But reuse_swap_page() might have removed the page
- * from swapcache already, so we can't check PageSwapCache().
- */
- if (!memcg)
- return;
-
- cancel_charge(memcg, nr_pages);
+out_put:
+ css_put(&memcg->css);
+out:
+ return ret;
}
struct uncharge_gather {
struct mem_cgroup *memcg;
+ unsigned long nr_pages;
unsigned long pgpgout;
- unsigned long nr_anon;
- unsigned long nr_file;
unsigned long nr_kmem;
- unsigned long nr_huge;
- unsigned long nr_shmem;
struct page *dummy_page;
};
@@ -6600,37 +6528,32 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
if (!mem_cgroup_is_root(ug->memcg)) {
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
local_irq_save(flags);
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
if (!mem_cgroup_is_root(ug->memcg))
- css_put_many(&ug->memcg->css, nr_pages);
+ css_put_many(&ug->memcg->css, ug->nr_pages);
}
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
+ unsigned long nr_pages;
+
VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
- !PageHWPoison(page) , page);
if (!page->mem_cgroup)
return;
@@ -6649,23 +6572,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
ug->memcg = page->mem_cgroup;
}
- if (!PageKmemcg(page)) {
- unsigned int nr_pages = 1;
+ nr_pages = compound_nr(page);
+ ug->nr_pages += nr_pages;
- if (PageTransHuge(page)) {
- nr_pages = compound_nr(page);
- ug->nr_huge += nr_pages;
- }
- if (PageAnon(page))
- ug->nr_anon += nr_pages;
- else {
- ug->nr_file += nr_pages;
- if (PageSwapBacked(page))
- ug->nr_shmem += nr_pages;
- }
+ if (!PageKmemcg(page)) {
ug->pgpgout++;
} else {
- ug->nr_kmem += compound_nr(page);
+ ug->nr_kmem += nr_pages;
__ClearPageKmemcg(page);
}
@@ -6702,8 +6615,7 @@ static void uncharge_list(struct list_head *page_list)
* mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
*
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
- * mem_cgroup_commit_charge().
+ * Uncharge a page previously charged with mem_cgroup_charge().
*/
void mem_cgroup_uncharge(struct page *page)
{
@@ -6726,7 +6638,7 @@ void mem_cgroup_uncharge(struct page *page)
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ * mem_cgroup_charge().
*/
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
@@ -6779,11 +6691,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
page_counter_charge(&memcg->memsw, nr_pages);
css_get_many(&memcg->css, nr_pages);
- commit_charge(newpage, memcg, false);
+ commit_charge(newpage, memcg);
local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage),
- nr_pages);
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
memcg_check_events(memcg, newpage);
local_irq_restore(flags);
}
@@ -6971,7 +6882,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
- if (!do_memsw_account())
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
memcg = page->mem_cgroup;
@@ -7000,7 +6911,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
- if (memcg != swap_memcg) {
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
page_counter_charge(&swap_memcg->memsw, nr_entries);
page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -7013,8 +6924,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
- -nr_entries);
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
memcg_check_events(memcg, page);
if (!mem_cgroup_is_root(memcg))
@@ -7037,7 +6947,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
struct mem_cgroup *memcg;
unsigned short oldid;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
memcg = page->mem_cgroup;
@@ -7053,7 +6963,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
- if (!mem_cgroup_is_root(memcg) &&
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -7081,14 +6991,11 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- if (!do_swap_account)
- return;
-
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg)) {
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->swap, nr_pages);
else
@@ -7104,7 +7011,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
long nr_swap_pages = get_nr_swap_pages();
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
@@ -7121,37 +7028,33 @@ bool mem_cgroup_swap_full(struct page *page)
if (vm_swap_full())
return true;
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
memcg = page->mem_cgroup;
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >=
- READ_ONCE(memcg->swap.max))
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ unsigned long usage = page_counter_read(&memcg->swap);
+
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+ usage * 2 >= READ_ONCE(memcg->swap.max))
return true;
+ }
return false;
}
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
-static int __init enable_swap_account(char *s)
+static int __init setup_swap_account(char *s)
{
if (!strcmp(s, "1"))
- really_do_swap_account = 1;
+ cgroup_memory_noswap = 0;
else if (!strcmp(s, "0"))
- really_do_swap_account = 0;
+ cgroup_memory_noswap = 1;
return 1;
}
-__setup("swapaccount=", enable_swap_account);
+__setup("swapaccount=", setup_swap_account);
static u64 swap_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -7161,6 +7064,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static int swap_high_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &high);
+ if (err)
+ return err;
+
+ page_counter_set_high(&memcg->swap, high);
+
+ return nbytes;
+}
+
static int swap_max_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -7188,6 +7114,8 @@ static int swap_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ seq_printf(m, "high %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
seq_printf(m, "fail %lu\n",
@@ -7203,6 +7131,12 @@ static struct cftype swap_files[] = {
.read_u64 = swap_current_read,
},
{
+ .name = "swap.high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_high_show,
+ .write = swap_high_write,
+ },
+ {
.name = "swap.max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = swap_max_show,
@@ -7217,7 +7151,7 @@ static struct cftype swap_files[] = {
{ } /* terminate */
};
-static struct cftype memsw_cgroup_files[] = {
+static struct cftype memsw_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -7246,13 +7180,16 @@ static struct cftype memsw_cgroup_files[] = {
static int __init mem_cgroup_swap_init(void)
{
- if (!mem_cgroup_disabled() && really_do_swap_account) {
- do_swap_account = 1;
- WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
- swap_files));
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
- memsw_cgroup_files));
- }
+ /* No memory control -> no swap control */
+ if (mem_cgroup_disabled())
+ cgroup_memory_noswap = true;
+
+ if (cgroup_memory_noswap)
+ return 0;
+
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
return 0;
}
subsys_initcall(mem_cgroup_swap_init);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a96364be8ab4..ababa368cb68 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -210,14 +210,17 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
struct task_struct *t = tk->tsk;
short addr_lsb = tk->size_shift;
- int ret;
+ int ret = 0;
- pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ if ((t->mm == current->mm) || !(flags & MF_ACTION_REQUIRED))
+ pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+ pfn, t->comm, t->pid);
- if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
- ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
- addr_lsb);
+ if (flags & MF_ACTION_REQUIRED) {
+ if (t->mm == current->mm)
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
+ (void __user *)tk->addr, addr_lsb);
+ /* send no signal to non-current processes */
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -1493,7 +1496,7 @@ static void memory_failure_work_func(struct work_struct *work)
unsigned long proc_flags;
int gotten;
- mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+ mf_cpu = container_of(work, struct memory_failure_cpu, work);
for (;;) {
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1507,6 +1510,19 @@ static void memory_failure_work_func(struct work_struct *work)
}
}
+/*
+ * Process memory_failure work queued on the specified CPU.
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
+ */
+void memory_failure_queue_kick(int cpu)
+{
+ struct memory_failure_cpu *mf_cpu;
+
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ cancel_work_sync(&mf_cpu->work);
+ memory_failure_work_func(&mf_cpu->work);
+}
+
static int __init memory_failure_init(void)
{
struct memory_failure_cpu *mf_cpu;
diff --git a/mm/memory.c b/mm/memory.c
index f703fe8c8346..dc7f3543b1fd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -80,7 +80,6 @@
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
#include "internal.h"
@@ -802,8 +801,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
get_page(page);
page_dup_rmap(page, false);
rss[mm_counter(page)]++;
- } else if (pte_devmap(pte)) {
- page = pte_page(pte);
}
out_set_pte:
@@ -1188,7 +1185,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
* none or trans huge it can change under us. This is
- * because MADV_DONTNEED holds the mmap_sem in read
+ * because MADV_DONTNEED holds the mmap_lock in read
* mode.
*/
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
@@ -1214,7 +1211,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
next = pud_addr_end(addr, end);
if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
if (next - addr != HPAGE_PUD_SIZE) {
- VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+ mmap_assert_locked(tlb->mm);
split_huge_pud(vma, pud, addr);
} else if (zap_huge_pud(tlb, vma, pud, addr))
goto next;
@@ -1595,7 +1592,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || end_addr >= vma->vm_end)
return -EFAULT;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
- BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
@@ -1639,7 +1636,7 @@ EXPORT_SYMBOL(vm_insert_pages);
* The page does not need to be reserved.
*
* Usually this function is called from f_op->mmap() handler
- * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
*
@@ -1653,7 +1650,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
if (!page_count(page))
return -EINVAL;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
- BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
@@ -2436,10 +2433,9 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/*
* Other thread has already handled the fault
- * and we don't need to do anything. If it's
- * not the case, the fault will be triggered
- * again on the same address.
+ * and update local tlb only
*/
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = false;
goto pte_unlock;
}
@@ -2463,13 +2459,14 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
locked = true;
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
- /* The PTE changed under us. Retry page fault. */
+ /* The PTE changed under us, update local tlb */
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = false;
goto pte_unlock;
}
/*
- * The same page can be mapped back since last copy attampt.
+ * The same page can be mapped back since last copy attempt.
* Try to copy again under PTL.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
@@ -2576,7 +2573,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
* mapping may be NULL here because some device drivers do not
* set page.mapping but still dirty their pages
*
- * Drop the mmap_sem before waiting on IO, if we can. The file
+ * Drop the mmap_lock before waiting on IO, if we can. The file
* is pinning the mapping, as per above.
*/
if ((dirtied || page_mkwrite) && mapping) {
@@ -2626,7 +2623,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
/*
* Handle the case of a page which we actually need to copy to a new page.
*
- * Called with mmap_sem locked and the old page referenced, but
+ * Called with mmap_lock locked and the old page referenced, but
* without the ptl held.
*
* High level logic flow:
@@ -2647,7 +2644,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- struct mem_cgroup *memcg;
struct mmu_notifier_range range;
if (unlikely(anon_vma_prepare(vma)))
@@ -2678,8 +2674,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
}
- if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
__SetPageUptodate(new_page);
@@ -2704,6 +2701,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
* Clear the pte entry and flush it first, before updating the
@@ -2713,7 +2711,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
lru_cache_add_active_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
@@ -2752,7 +2749,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
new_page = old_page;
page_copied = 1;
} else {
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
}
if (new_page)
@@ -2812,6 +2809,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
* pte_offset_map_lock.
*/
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
}
@@ -2889,9 +2887,9 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
@@ -2936,6 +2934,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
unlock_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
put_page(vmf->page);
@@ -3079,18 +3078,17 @@ void unmap_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL(unmap_mapping_range);
/*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
- * We return with the mmap_sem locked or unlocked in the same cases
+ * We return with the mmap_lock locked or unlocked in the same cases
* as does filemap_fault().
*/
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
- struct mem_cgroup *memcg;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -3131,10 +3129,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (page) {
+ int err;
+
__SetPageLocked(page);
__SetPageSwapBacked(page);
set_page_private(page, entry.val);
- lru_cache_add_anon(page);
+
+ /* Tell memcg to use swap ownership records */
+ SetPageSwapCache(page);
+ err = mem_cgroup_charge(page, vma->vm_mm,
+ GFP_KERNEL);
+ ClearPageSwapCache(page);
+ if (err)
+ goto out_page;
+
+ lru_cache_add(page);
swap_readpage(page, true);
}
} else {
@@ -3195,11 +3204,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_page;
}
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- ret = VM_FAULT_OOM;
- goto out_page;
- }
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* Back out if somebody else already faulted in this pte.
@@ -3247,11 +3252,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true, false);
activate_page(page);
}
@@ -3287,7 +3290,6 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
@@ -3301,14 +3303,13 @@ out_release:
}
/*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct mem_cgroup *memcg;
struct page *page;
vm_fault_t ret = 0;
pte_t entry;
@@ -3322,10 +3323,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
* parallel threads are excluded by other means.
*
- * Here we only have down_read(mmap_sem).
+ * Here we only have mmap_read_lock(mm).
*/
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
@@ -3341,8 +3342,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (!pte_none(*vmf->pte))
+ if (!pte_none(*vmf->pte)) {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
+ }
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock;
@@ -3361,9 +3364,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
- false))
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* The memory barrier inside __SetPageUptodate makes sure that
@@ -3373,13 +3376,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
__SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (!pte_none(*vmf->pte))
+ if (!pte_none(*vmf->pte)) {
+ update_mmu_cache(vma, vmf->address, vmf->pte);
goto release;
+ }
ret = check_stable_address_space(vma->vm_mm);
if (ret)
@@ -3388,14 +3394,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3406,7 +3410,6 @@ unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
goto unlock;
oom_free_page:
@@ -3416,7 +3419,7 @@ oom:
}
/*
- * The mmap_sem must have been held on entry, and may have been
+ * The mmap_lock must have been held on entry, and may have been
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
@@ -3611,7 +3614,6 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
* mapping. If needed, the fucntion allocates page table or use pre-allocated.
*
* @vmf: fault environment
- * @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
* Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
@@ -3622,8 +3624,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
*
* Return: %0 on success, %VM_FAULT_ code in case of error.
*/
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- struct page *page)
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -3631,9 +3632,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
vm_fault_t ret;
if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
- /* THP on COW? */
- VM_BUG_ON_PAGE(memcg, page);
-
ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
@@ -3646,18 +3644,20 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*vmf->pte)))
+ if (unlikely(!pte_none(*vmf->pte))) {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
return VM_FAULT_NOPAGE;
+ }
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
@@ -3706,7 +3706,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
if (!(vmf->vma->vm_flags & VM_SHARED))
ret = check_stable_address_space(vmf->vma->vm_mm);
if (!ret)
- ret = alloc_set_pte(vmf, vmf->memcg, page);
+ ret = alloc_set_pte(vmf, page);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
@@ -3866,11 +3866,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
- &vmf->memcg, false)) {
+ if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
+ cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3888,7 +3888,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
put_page(vmf->cow_page);
return ret;
}
@@ -3929,11 +3928,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
}
/*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults).
- * The mmap_sem may have been released depending on flags and our
+ * The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
- * If mmap_sem is released, vma may become invalid (for example
+ * If mmap_lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
@@ -4162,10 +4161,10 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
+ * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
* concurrent faults).
*
- * The mmap_sem may have been released depending on flags and our return value.
+ * The mmap_lock may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
@@ -4187,7 +4186,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
/*
* A regular pmd is established and it can't morph into a huge
* pmd from under us anymore at this point because we hold the
- * mmap_sem read mode and khugepaged takes it in write mode.
+ * mmap_lock read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
@@ -4224,8 +4223,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
- if (unlikely(!pte_same(*vmf->pte, entry)))
+ if (unlikely(!pte_same(*vmf->pte, entry))) {
+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
+ }
if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(vmf);
@@ -4253,7 +4254,7 @@ unlock:
/*
* By the time we get here, we already hold the mm semaphore
*
- * The mmap_sem may have been released depending on flags and our
+ * The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
@@ -4348,7 +4349,7 @@ retry_pud:
/*
* By the time we get here, we already hold the mm semaphore
*
- * The mmap_sem may have been released depending on flags and our
+ * The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
@@ -4434,19 +4435,11 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
-#ifndef __ARCH_HAS_5LEVEL_HACK
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
p4d_populate(mm, p4d, new);
} else /* Another has populated it */
pud_free(mm, new);
-#else
- if (!pgd_present(*p4d)) {
- mm_inc_nr_puds(mm);
- pgd_populate(mm, p4d, new);
- } else /* Another has populated it */
- pud_free(mm, new);
-#endif /* __ARCH_HAS_5LEVEL_HACK */
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -4665,7 +4658,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
- if (down_read_killable(&mm->mmap_sem))
+ if (mmap_read_lock_killable(mm))
return 0;
/* ignore errors, just check how much was successfully transferred */
@@ -4716,7 +4709,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
buf += bytes;
addr += bytes;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return buf - old_buf;
}
@@ -4773,7 +4766,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
/*
* we might be running from an atomic context so we cannot sleep
*/
- if (!down_read_trylock(&mm->mmap_sem))
+ if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, ip);
@@ -4792,7 +4785,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
free_page((unsigned long)buf);
}
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
@@ -4800,7 +4793,7 @@ void __might_fault(const char *file, int line)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
- * holding the mmap_sem, this is safe because kernel memory doesn't
+ * holding the mmap_lock, this is safe because kernel memory doesn't
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
@@ -4811,7 +4804,7 @@ void __might_fault(const char *file, int line)
__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
- might_lock_read(&current->mm->mmap_sem);
+ might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fc0aad0bc1f5..9b34e03e730a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -98,11 +98,14 @@ void mem_hotplug_done(void)
u64 max_mem_size = U64_MAX;
/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size,
+ const char *resource_name)
{
struct resource *res;
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- char *resource_name = "System RAM";
+
+ if (strcmp(resource_name, "System RAM"))
+ flags |= IORESOURCE_MEM_DRIVER_MANAGED;
/*
* Make sure value parsed from 'mem=' only restricts memory adding
@@ -862,10 +865,9 @@ static void reset_node_present_pages(pg_data_t *pgdat)
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
+static pg_data_t __ref *hotadd_new_pgdat(int nid)
{
struct pglist_data *pgdat;
- unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid);
if (!pgdat) {
@@ -879,13 +881,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
} else {
int cpu;
/*
- * Reset the nr_zones, order and classzone_idx before reuse.
- * Note that kswapd will init kswapd_classzone_idx properly
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
* when it starts in the near future.
*/
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
for_each_online_cpu(cpu) {
struct per_cpu_nodestat *p;
@@ -895,9 +897,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
}
/* we can use NODE_DATA(nid) from here */
-
pgdat->node_id = nid;
- pgdat->node_start_pfn = start_pfn;
+ pgdat->node_start_pfn = 0;
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
@@ -932,7 +933,6 @@ static void rollback_node_hotadd(int nid)
/**
* try_online_node - online a node if offlined
* @nid: the node ID
- * @start: start addr of the node
* @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
*
@@ -941,7 +941,7 @@ static void rollback_node_hotadd(int nid)
* 0 -> the node is already online
* -ENOMEM -> the node could not be allocated
*/
-static int __try_online_node(int nid, u64 start, bool set_node_online)
+static int __try_online_node(int nid, bool set_node_online)
{
pg_data_t *pgdat;
int ret = 1;
@@ -949,7 +949,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online)
if (node_online(nid))
return 0;
- pgdat = hotadd_new_pgdat(nid, start);
+ pgdat = hotadd_new_pgdat(nid);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
@@ -973,7 +973,7 @@ int try_online_node(int nid)
int ret;
mem_hotplug_begin();
- ret = __try_online_node(nid, 0, true);
+ ret = __try_online_node(nid, true);
mem_hotplug_done();
return ret;
}
@@ -1017,17 +1017,17 @@ int __ref add_memory_resource(int nid, struct resource *res)
if (ret)
return ret;
+ if (!node_possible(nid)) {
+ WARN(1, "node %d was absent from the node_possible_map\n", nid);
+ return -EINVAL;
+ }
+
mem_hotplug_begin();
- /*
- * Add new range to memblock so that when hotadd_new_pgdat() is called
- * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
- * this new range and calculate total pages correctly. The range will
- * be removed at hot-remove time.
- */
- memblock_add_node(start, size, nid);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_add_node(start, size, nid);
- ret = __try_online_node(nid, start, false);
+ ret = __try_online_node(nid, false);
if (ret < 0)
goto error;
new_node = ret;
@@ -1060,7 +1060,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
BUG_ON(ret);
/* create new memmap entry */
- firmware_map_add_hotplug(start, start + size, "System RAM");
+ if (!strcmp(res->name, "System RAM"))
+ firmware_map_add_hotplug(start, start + size, "System RAM");
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
@@ -1074,7 +1075,8 @@ error:
/* rollback pgdat allocation and others */
if (new_node)
rollback_node_hotadd(nid);
- memblock_remove(start, size);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_remove(start, size);
mem_hotplug_done();
return ret;
}
@@ -1085,7 +1087,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
- res = register_memory_resource(start, size);
+ res = register_memory_resource(start, size, "System RAM");
if (IS_ERR(res))
return PTR_ERR(res);
@@ -1107,82 +1109,57 @@ int add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(add_memory);
-#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
- * set and the size of the free page is given by page_order(). Using this,
- * the function determines if the pageblock contains only free pages.
- * Due to buddy contraints, a free page at least the size of a pageblock will
- * be located at the start of the pageblock
+ * Add special, driver-managed memory to the system as system RAM. Such
+ * memory is not exposed via the raw firmware-provided memmap as system
+ * RAM, instead, it is detected and added by a driver - during cold boot,
+ * after a reboot, and after kexec.
+ *
+ * Reasons why this memory should not be used for the initial memmap of a
+ * kexec kernel or for placing kexec images:
+ * - The booting kernel is in charge of determining how this memory will be
+ * used (e.g., use persistent memory as system RAM)
+ * - Coordination with a hypervisor is required before this memory
+ * can be used (e.g., inaccessible parts).
+ *
+ * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
+ * memory map") are created. Also, the created memory resource is flagged
+ * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
+ * this memory as well (esp., not place kexec images onto it).
+ *
+ * The resource_name (visible via /proc/iomem) has to have the format
+ * "System RAM ($DRIVER)".
*/
-static inline int pageblock_free(struct page *page)
+int add_memory_driver_managed(int nid, u64 start, u64 size,
+ const char *resource_name)
{
- return PageBuddy(page) && page_order(page) >= pageblock_order;
-}
+ struct resource *res;
+ int rc;
-/* Return the pfn of the start of the next active pageblock after a given pfn */
-static unsigned long next_active_pageblock(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
+ if (!resource_name ||
+ strstr(resource_name, "System RAM (") != resource_name ||
+ resource_name[strlen(resource_name) - 1] != ')')
+ return -EINVAL;
- /* Ensure the starting page is pageblock-aligned */
- BUG_ON(pfn & (pageblock_nr_pages - 1));
+ lock_device_hotplug();
- /* If the entire pageblock is free, move to the end of free page */
- if (pageblock_free(page)) {
- int order;
- /* be careful. we don't have locks, page_order can be changed.*/
- order = page_order(page);
- if ((order < MAX_ORDER) && (order >= pageblock_order))
- return pfn + (1 << order);
+ res = register_memory_resource(start, size, resource_name);
+ if (IS_ERR(res)) {
+ rc = PTR_ERR(res);
+ goto out_unlock;
}
- return pfn + pageblock_nr_pages;
-}
-
-static bool is_pageblock_removable_nolock(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
- struct zone *zone;
-
- /*
- * We have to be careful here because we are iterating over memory
- * sections which are not zone aware so we might end up outside of
- * the zone but still within the section.
- * We have to take care about the node as well. If the node is offline
- * its NODE_DATA will be NULL - see page_zone.
- */
- if (!node_online(page_to_nid(page)))
- return false;
-
- zone = page_zone(page);
- pfn = page_to_pfn(page);
- if (!zone_spans_pfn(zone, pfn))
- return false;
-
- return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
- MEMORY_OFFLINE);
-}
-
-/* Checks if this range of memory is likely to be hot-removable. */
-bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long end_pfn, pfn;
-
- end_pfn = min(start_pfn + nr_pages,
- zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
-
- /* Check the starting page of each pageblock within the range */
- for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
- if (!is_pageblock_removable_nolock(pfn))
- return false;
- cond_resched();
- }
+ rc = add_memory_resource(nid, res);
+ if (rc < 0)
+ release_memory_resource(res);
- /* All pageblocks in the memory block are likely to be hot-removable */
- return true;
+out_unlock:
+ unlock_device_hotplug();
+ return rc;
}
+EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* Confirm all pages in a range [start, end) belong to the same zone (skipping
* memory holes). When true, return the zone.
@@ -1224,11 +1201,17 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn,
/*
* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). We scan pfn because it's much
- * easier than scanning over linked list. This function returns the pfn
- * of the first found movable page if it's found, otherwise 0.
+ * non-lru movable pages and hugepages). Will skip over most unmovable
+ * pages (esp., pages that can be skipped when offlining), but bail out on
+ * definitely unmovable pages.
+ *
+ * Returns:
+ * 0 in case a movable page is found and movable_pfn was updated.
+ * -ENOENT in case no movable page was found.
+ * -EBUSY in case a definitely unmovable page was found.
*/
-static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
+static int scan_movable_pages(unsigned long start, unsigned long end,
+ unsigned long *movable_pfn)
{
unsigned long pfn;
@@ -1240,18 +1223,30 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
continue;
page = pfn_to_page(pfn);
if (PageLRU(page))
- return pfn;
+ goto found;
if (__PageMovable(page))
- return pfn;
+ goto found;
+
+ /*
+ * PageOffline() pages that are not marked __PageMovable() and
+ * have a reference count > 0 (after MEM_GOING_OFFLINE) are
+ * definitely unmovable. If their reference count would be 0,
+ * they could at least be skipped when offlining memory.
+ */
+ if (PageOffline(page) && page_count(page))
+ return -EBUSY;
if (!PageHuge(page))
continue;
head = compound_head(page);
if (page_huge_active(head))
- return pfn;
+ goto found;
skip = compound_nr(head) - (page - head);
pfn += skip - 1;
}
+ return -ENOENT;
+found:
+ *movable_pfn = pfn;
return 0;
}
@@ -1360,7 +1355,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
}
/*
- * Check all pages in range, recoreded as memory resource, are isolated.
+ * Check all pages in range, recorded as memory resource, are isolated.
*/
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
@@ -1372,11 +1367,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
static int __init cmdline_parse_movable_node(char *p)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
movable_node_enabled = true;
-#else
- pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
-#endif
return 0;
}
early_param("movable_node", cmdline_parse_movable_node);
@@ -1518,7 +1509,8 @@ static int __ref __offline_pages(unsigned long start_pfn,
}
do {
- for (pfn = start_pfn; pfn;) {
+ pfn = start_pfn;
+ do {
if (signal_pending(current)) {
ret = -EINTR;
reason = "signal backoff";
@@ -1528,14 +1520,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
cond_resched();
lru_add_drain_all();
- pfn = scan_movable_pages(pfn, end_pfn);
- if (pfn) {
+ ret = scan_movable_pages(pfn, end_pfn, &pfn);
+ if (!ret) {
/*
* TODO: fatal migration failures should bail
* out
*/
do_migrate_range(pfn, end_pfn);
}
+ } while (!ret);
+
+ if (ret != -ENOENT) {
+ reason = "unmovable page";
+ goto failed_removal_isolated;
}
/*
@@ -1750,8 +1747,12 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
arch_remove_memory(nid, start, size, NULL);
- memblock_free(start, size);
- memblock_remove(start, size);
+
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+ memblock_free(start, size);
+ memblock_remove(start, size);
+ }
+
__release_memory_resource(start, size);
try_offline_node(nid);
@@ -1797,4 +1798,41 @@ int remove_memory(int nid, u64 start, u64 size)
return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);
+
+/*
+ * Try to offline and remove a memory block. Might take a long time to
+ * finish in case memory is still in use. Primarily useful for memory devices
+ * that logically unplugged all memory (so it's no longer in use) and want to
+ * offline + remove the memory block.
+ */
+int offline_and_remove_memory(int nid, u64 start, u64 size)
+{
+ struct memory_block *mem;
+ int rc = -EINVAL;
+
+ if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
+ size != memory_block_size_bytes())
+ return rc;
+
+ lock_device_hotplug();
+ mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
+ if (mem)
+ rc = device_offline(&mem->dev);
+ /* Ignore if the device is already offline. */
+ if (rc > 0)
+ rc = 0;
+
+ /*
+ * In case we succeeded to offline the memory block, remove it.
+ * This cannot fail as it cannot get onlined in the meantime.
+ */
+ if (!rc) {
+ rc = try_remove_memory(nid, start, size);
+ WARN_ON_ONCE(rc);
+ }
+ unlock_device_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(offline_and_remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 48ba9729062e..381320671677 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -224,7 +224,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
* handle an empty nodemask with MPOL_PREFERRED here.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -368,7 +368,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
*/
@@ -398,17 +398,17 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
/*
* Rebind each vma in mm to new nodemask.
*
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next)
mpol_rebind_policy(vma->vm_policy, new);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ -764,7 +764,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
/*
* Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *pol)
@@ -789,7 +789,7 @@ static int vma_replace_policy(struct vm_area_struct *vma,
}
old = vma->vm_policy;
- vma->vm_policy = new; /* protected by mmap_sem */
+ vma->vm_policy = new; /* protected by mmap_lock */
mpol_put(old);
return 0;
@@ -927,15 +927,12 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
int locked = 1;
err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
- if (err == 0) {
- /* E.g. GUP interrupted by fatal signal */
- err = -EFAULT;
- } else if (err > 0) {
+ if (err > 0) {
err = page_to_nid(p);
put_page(p);
}
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -968,10 +965,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return -EFAULT;
}
if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -988,7 +985,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_ADDR) {
/*
* Take a refcount on the mpol, lookup_node()
- * wil drop the mmap_sem, so after calling
+ * wil drop the mmap_lock, so after calling
* lookup_node() only "pol" remains valid, "vma"
* is stale.
*/
@@ -1030,7 +1027,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
out:
mpol_cond_put(pol);
if (vma)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (pol_refcount)
mpol_put(pol_refcount);
return err;
@@ -1139,7 +1136,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (err)
return err;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1220,7 +1217,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (err < 0)
break;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (err < 0)
return err;
return busy;
@@ -1343,12 +1340,12 @@ static long do_mbind(unsigned long start, unsigned long len,
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
task_unlock(current);
if (err)
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
@@ -1385,7 +1382,7 @@ up_out:
putback_movable_pages(&pagelist);
}
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
return err;
@@ -2188,7 +2185,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
* mm_struct of the VMA to prevent it from going away. Should be used for
* all allocations for pages that will be mapped into user space. Returns
* NULL when no page can be allocated.
diff --git a/mm/migrate.c b/mm/migrate.c
index 7160c1556f79..f37729673558 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -490,11 +490,18 @@ int migrate_page_move_mapping(struct address_space *mapping,
* are mapped to swap space.
*/
if (newzone != oldzone) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
+ struct lruvec *old_lruvec, *new_lruvec;
+ struct mem_cgroup *memcg;
+
+ memcg = page_memcg(page);
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
+
+ __dec_lruvec_state(old_lruvec, NR_FILE_PAGES);
+ __inc_lruvec_state(new_lruvec, NR_FILE_PAGES);
if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
+ __dec_lruvec_state(old_lruvec, NR_SHMEM);
+ __inc_lruvec_state(new_lruvec, NR_SHMEM);
}
if (dirty && mapping_cap_account_dirty(mapping)) {
__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
@@ -797,11 +804,7 @@ recheck_buffers:
if (rc != MIGRATEPAGE_SUCCESS)
goto unlock_buffers;
- ClearPagePrivate(page);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- get_page(newpage);
+ attach_page_private(newpage, detach_page_private(page));
bh = head;
do {
@@ -810,8 +813,6 @@ recheck_buffers:
} while (bh != head);
- SetPagePrivate(newpage);
-
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
@@ -1032,7 +1033,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* to the LRU. Later, when the IO completes the pages are
* marked uptodate and unlocked. However, the queueing
* could be merging multiple pages for one bio (e.g.
- * mpage_readpages). If an allocation happens for the
+ * mpage_readahead). If an allocation happens for the
* second or third page, the process can end up locking
* the same page twice and deadlocking. Rather than
* trying to be clever about what pages can be locked,
@@ -1554,7 +1555,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
unsigned int follflags;
int err;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
err = -EFAULT;
vma = find_vma(mm, addr);
if (!vma || addr < vma->vm_start || !vma_migratable(vma))
@@ -1607,7 +1608,7 @@ out_putpage:
*/
put_page(page);
out:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -1732,7 +1733,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
{
unsigned long i;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (i = 0; i < nr_pages; i++) {
unsigned long addr = (unsigned long)(*pages);
@@ -1759,7 +1760,7 @@ set_status:
status++;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
/*
@@ -2119,7 +2120,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
* pmd before doing set_pmd_at(), nor to flush the TLB after
* set_pmd_at(). Clearing the pmd here would introduce a race
* condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
- * mmap_sem for reading. If the pmd is set to NULL at any given time,
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
* MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
* pmd.
*/
@@ -2674,7 +2675,7 @@ restore:
* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
*
* It is safe to update device page table after migrate_vma_pages() because
- * both destination and source page are still locked, and the mmap_sem is held
+ * both destination and source page are still locked, and the mmap_lock is held
* in read mode (hence no one can unmap the range being migrated).
*
* Once the caller is done cleaning up things and updating its page table (if it
@@ -2739,7 +2740,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
{
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
- struct mem_cgroup *memcg;
bool flush = false;
spinlock_t *ptl;
pte_t entry;
@@ -2772,10 +2772,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
* parallel threads are excluded by other means.
*
- * Here we only have down_read(mmap_sem).
+ * Here we only have mmap_read_lock(mm).
*/
if (pte_alloc(mm, pmdp))
goto abort;
@@ -2786,7 +2786,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (unlikely(anon_vma_prepare(vma)))
goto abort;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto abort;
/*
@@ -2832,7 +2832,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
if (!is_zone_device_page(page))
lru_cache_add_active_or_unevictable(page, vma);
get_page(page);
@@ -2854,7 +2853,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
unlock_abort:
pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
diff --git a/mm/mincore.c b/mm/mincore.c
index 0e6dd9948f1a..453ff112470f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -17,9 +17,9 @@
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -284,9 +284,9 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
* Do at most PAGE_SIZE entries per iteration, due to
* the temporary buffer size.
*/
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (retval <= 0)
break;
diff --git a/mm/mlock.c b/mm/mlock.c
index a72c1eeded77..f8736136fad7 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -49,7 +49,7 @@ EXPORT_SYMBOL(can_do_mlock);
* When lazy mlocking via vmscan, it is important to ensure that the
* vma's VM_LOCKED status is not concurrently being modified, otherwise we
* may have mlocked a page that is being munlocked. So lazy mlock must take
- * the mmap_sem for read, and verify that the vma really is locked
+ * the mmap_lock for read, and verify that the vma really is locked
* (see mm/rmap.c).
*/
@@ -381,7 +381,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
/*
* Initialize pte walk starting at the already pinned page where we
* are sure that there is a pte, as it was pinned under the same
- * mmap_sem write op.
+ * mmap_lock write op.
*/
pte = get_locked_pte(vma->vm_mm, start, &ptl);
/* Make sure we do not cross the page table boundary */
@@ -565,7 +565,7 @@ success:
mm->locked_vm += nr_pages;
/*
- * vm_flags is protected by the mmap_sem held in write mode.
+ * vm_flags is protected by the mmap_lock held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
@@ -686,7 +686,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
lock_limit >>= PAGE_SHIFT;
locked = len >> PAGE_SHIFT;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
locked += current->mm->locked_vm;
@@ -705,7 +705,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = apply_vma_lock_flags(start, len, flags);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
if (error)
return error;
@@ -742,10 +742,10 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
ret = apply_vma_lock_flags(start, len, 0);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return ret;
}
@@ -811,14 +811,14 @@ SYSCALL_DEFINE1(mlockall, int, flags)
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
ret = -ENOMEM;
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
ret = apply_mlockall_flags(flags);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
if (!ret && (flags & MCL_CURRENT))
mm_populate(0, TASK_SIZE);
@@ -829,10 +829,10 @@ SYSCALL_DEFINE0(munlockall)
{
int ret;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
ret = apply_mlockall_flags(0);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return ret;
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7da6991d9435..435e5f794b3b 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -67,26 +67,30 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
+ KASAN_TAG_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
SECTIONS_SHIFT,
NODES_SHIFT,
ZONES_SHIFT,
- LAST_CPUPID_SHIFT);
+ LAST_CPUPID_SHIFT,
+ KASAN_TAG_WIDTH);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
- "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
+ "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
(unsigned long)SECTIONS_PGSHIFT,
(unsigned long)NODES_PGSHIFT,
(unsigned long)ZONES_PGSHIFT,
- (unsigned long)LAST_CPUPID_PGSHIFT);
+ (unsigned long)LAST_CPUPID_PGSHIFT,
+ (unsigned long)KASAN_TAG_PGSHIFT);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
"Node/Zone ID: %lu -> %lu\n",
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
diff --git a/mm/mmap.c b/mm/mmap.c
index f609e9ec4a25..59a4682ebf3f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -132,7 +132,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
vm_flags &= ~VM_SHARED;
vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
}
- /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
+ /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
}
@@ -198,7 +198,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
bool downgraded = false;
LIST_HEAD(uf);
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
origbrk = mm->brk;
@@ -238,14 +238,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_sem to read.
+ * __do_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
/*
- * mm->brk must to be protected by write mmap_sem so update it
- * before downgrading mmap_sem. When __do_munmap() fails,
+ * mm->brk must to be protected by write mmap_lock so update it
+ * before downgrading mmap_lock. When __do_munmap() fails,
* mm->brk will be restored from origbrk.
*/
mm->brk = brk;
@@ -272,9 +272,9 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
if (downgraded)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
else
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
@@ -282,7 +282,7 @@ success:
out:
retval = origbrk;
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return retval;
}
@@ -505,7 +505,7 @@ static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
* After the update, the vma will be reinserted using
* anon_vma_interval_tree_post_update_vma().
*
- * The entire update must be protected by exclusive mmap_sem and by
+ * The entire update must be protected by exclusive mmap_lock and by
* the root anon_vma's mutex.
*/
static inline void
@@ -1207,7 +1207,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
}
/*
- * Rough compatbility check to quickly see if it's even worth looking
+ * Rough compatibility check to quickly see if it's even worth looking
* at sharing an anon_vma.
*
* They need to have the same vm_file, and the flags can only differ
@@ -1361,7 +1361,7 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
}
/*
- * The caller must hold down_write(&current->mm->mmap_sem).
+ * The caller must write-lock current->mm->mmap_lock.
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
@@ -2371,7 +2371,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_sem in read mode. We need the
+ * is required to hold the mmap_lock in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
@@ -2389,7 +2389,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_sem
+ * updates, but we only hold a shared mmap_lock
* lock here, so we need to protect against
* concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as
@@ -2451,7 +2451,7 @@ int expand_downwards(struct vm_area_struct *vma,
/*
* vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_sem in read mode. We need the
+ * is required to hold the mmap_lock in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
@@ -2469,7 +2469,7 @@ int expand_downwards(struct vm_area_struct *vma,
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_sem
+ * updates, but we only hold a shared mmap_lock
* lock here, so we need to protect against
* concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as
@@ -2828,7 +2828,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
detach_vmas_to_be_unmapped(mm, vma, prev, end);
if (downgrade)
- downgrade_write(&mm->mmap_sem);
+ mmap_write_downgrade(mm);
unmap_region(mm, vma, prev, start, end);
@@ -2850,20 +2850,20 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
ret = __do_munmap(mm, start, len, &uf, downgrade);
/*
- * Returning 1 indicates mmap_sem is downgraded.
+ * Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
* it to 0 before return.
*/
if (ret == 1) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
ret = 0;
} else
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
return ret;
@@ -2911,7 +2911,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
vma = find_vma(mm, start);
@@ -2974,7 +2974,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
prot, flags, pgoff, &populate, NULL);
fput(file);
out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
@@ -3074,12 +3074,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (!len)
return 0;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_brk_flags(addr, len, flags, &uf);
populate = ((mm->def_flags & VM_LOCKED) != 0);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
@@ -3107,12 +3107,12 @@ void exit_mmap(struct mm_struct *mm)
/*
* Manually reap the mm to free as much memory as possible.
* Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
- * this mm from further consideration. Taking mm->mmap_sem for
+ * this mm from further consideration. Taking mm->mmap_lock for
* write after setting MMF_OOM_SKIP will guarantee that the oom
- * reaper will not run on this mm again after mmap_sem is
+ * reaper will not run on this mm again after mmap_lock is
* dropped.
*
- * Nothing can be holding mm->mmap_sem here and the above call
+ * Nothing can be holding mm->mmap_lock here and the above call
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
*
@@ -3123,8 +3123,8 @@ void exit_mmap(struct mm_struct *mm)
(void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
}
if (mm->locked_vm) {
@@ -3437,7 +3437,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma,
}
/*
- * Called with mm->mmap_sem held for writing.
+ * Called with mm->mmap_lock held for writing.
* Insert a new vma covering the given region, with the given flags.
* Its pages are supplied by the given array of struct page *.
* The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
@@ -3474,7 +3474,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
+ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
/*
* We can safely modify head.next after taking the
* anon_vma->root->rwsem. If some other vma in this mm shares
@@ -3504,7 +3504,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
}
}
@@ -3513,11 +3513,11 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* operations that could ever happen on a certain mm. This includes
* vmtruncate, try_to_unmap, and all page faults.
*
- * The caller must take the mmap_sem in write mode before calling
+ * The caller must take the mmap_lock in write mode before calling
* mm_take_all_locks(). The caller isn't allowed to release the
- * mmap_sem until mm_drop_all_locks() returns.
+ * mmap_lock until mm_drop_all_locks() returns.
*
- * mmap_sem in write mode is required in order to block all operations
+ * mmap_lock in write mode is required in order to block all operations
* that could modify pagetables and free pages without need of
* altering the vma layout. It's also needed in write mode to avoid new
* anon_vmas to be associated with existing vmas.
@@ -3550,7 +3550,7 @@ int mm_take_all_locks(struct mm_struct *mm)
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(mm));
mutex_lock(&mm_all_locks_mutex);
@@ -3622,7 +3622,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
}
/*
- * The mmap_sem cannot be released by the caller until
+ * The mmap_lock cannot be released by the caller until
* mm_drop_all_locks() returns.
*/
void mm_drop_all_locks(struct mm_struct *mm)
@@ -3630,7 +3630,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(mm));
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
for (vma = mm->mmap; vma; vma = vma->vm_next) {
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index a3538cb2bcbe..03c33c93a582 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -301,7 +301,7 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
{
/*
* If there are parallel threads are doing PTE changes on same range
- * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB
+ * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
* flush by batching, one thread may end up seeing inconsistent PTEs
* and result in having stale TLB entries. So flush TLB forcefully
* if we detect parallel PTE batching threads.
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 06852b896fa6..352bb9f3ecc0 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -599,7 +599,7 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
/*
- * Same as mmu_notifier_register but here the caller must hold the mmap_sem in
+ * Same as mmu_notifier_register but here the caller must hold the mmap_lock in
* write mode. A NULL mn signals the notifier is being registered for itree
* mode.
*/
@@ -609,7 +609,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
struct mmu_notifier_subscriptions *subscriptions = NULL;
int ret;
- lockdep_assert_held_write(&mm->mmap_sem);
+ mmap_assert_write_locked(mm);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
if (IS_ENABLED(CONFIG_LOCKDEP)) {
@@ -623,7 +623,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
/*
* kmalloc cannot be called under mm_take_all_locks(), but we
* know that mm->notifier_subscriptions can't change while we
- * hold the write side of the mmap_sem.
+ * hold the write side of the mmap_lock.
*/
subscriptions = kzalloc(
sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
@@ -655,7 +655,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
* readers. acquire can only be used while holding the mmgrab or
* mmget, and is safe because once created the
* mmu_notifier_subscriptions is not freed until the mm is destroyed.
- * As above, users holding the mmap_sem or one of the
+ * As above, users holding the mmap_lock or one of the
* mm_take_all_locks() do not need to use acquire semantics.
*/
if (subscriptions)
@@ -689,7 +689,7 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register);
* @mn: The notifier to attach
* @mm: The mm to attach the notifier to
*
- * Must not hold mmap_sem nor any other VM related lock when calling
+ * Must not hold mmap_lock nor any other VM related lock when calling
* this registration function. Must also ensure mm_users can't go down
* to zero while this runs to avoid races with mmu_notifier_release,
* so mm has to be current->mm or the mm should be pinned safely such
@@ -708,9 +708,9 @@ int mmu_notifier_register(struct mmu_notifier *subscription,
{
int ret;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
ret = __mmu_notifier_register(subscription, mm);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);
@@ -750,7 +750,7 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
* are the same.
*
* Each call to mmu_notifier_get() must be paired with a call to
- * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem.
+ * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock.
*
* While the caller has a mmu_notifier get the mm pointer will remain valid,
* and can be converted to an active mm pointer via mmget_not_zero().
@@ -761,7 +761,7 @@ struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
struct mmu_notifier *subscription;
int ret;
- lockdep_assert_held_write(&mm->mmap_sem);
+ mmap_assert_write_locked(mm);
if (mm->notifier_subscriptions) {
subscription = find_get_mmu_notifier(mm, ops);
@@ -983,7 +983,7 @@ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
struct mmu_notifier_subscriptions *subscriptions;
int ret;
- might_lock(&mm->mmap_sem);
+ might_lock(&mm->mmap_lock);
subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
if (!subscriptions || !subscriptions->has_itree) {
@@ -1006,7 +1006,7 @@ int mmu_interval_notifier_insert_locked(
mm->notifier_subscriptions;
int ret;
- lockdep_assert_held_write(&mm->mmap_sem);
+ mmap_assert_write_locked(mm);
if (!subscriptions || !subscriptions->has_itree) {
ret = __mmu_notifier_register(NULL, mm);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 494192ca954b..ce8b8a5eacbb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -28,7 +28,7 @@
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -49,7 +49,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
/*
- * Can be called with only the mmap_sem for reading by
+ * Can be called with only the mmap_lock for reading by
* prot_numa so we must check the pmd isn't constantly
* changing from under us from pmd_none to pmd_trans_huge
* and/or the other way around.
@@ -59,7 +59,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/*
* The pmd points to a regular pte so the pmd can't change
- * from under us even if the mmap_sem is only hold for
+ * from under us even if the mmap_lock is only hold for
* reading.
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -228,7 +228,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
next = pmd_addr_end(addr, end);
/*
- * Automatic NUMA balancing walks the tables with mmap_sem
+ * Automatic NUMA balancing walks the tables with mmap_lock
* held for read. It's possible a parallel update to occur
* between pmd_trans_huge() and a pmd_none_or_clear_bad()
* check leading to a false positive and clearing.
@@ -477,7 +477,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
success:
/*
- * vm_flags and vm_page_prot are protected by the mmap_sem
+ * vm_flags and vm_page_prot are protected by the mmap_lock
* held in write mode.
*/
vma->vm_flags = newflags;
@@ -538,7 +538,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
reqprot = prot;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
/*
@@ -628,7 +628,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
prot = reqprot;
}
out:
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return error;
}
@@ -658,7 +658,7 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
if (init_val & ~PKEY_ACCESS_MASK)
return -EINVAL;
- down_write(&current->mm->mmap_sem);
+ mmap_write_lock(current->mm);
pkey = mm_pkey_alloc(current->mm);
ret = -ENOSPC;
@@ -672,7 +672,7 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
}
ret = pkey;
out:
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return ret;
}
@@ -680,9 +680,9 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
{
int ret;
- down_write(&current->mm->mmap_sem);
+ mmap_write_lock(current->mm);
ret = mm_pkey_free(current->mm, pkey);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
/*
* We could provie warnings or errors if any VMA still
diff --git a/mm/mremap.c b/mm/mremap.c
index a7e282ead438..5dd572d57ca9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -146,7 +146,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
/*
* We don't have to worry about the ordering of src and dst
- * pte locks because exclusive mmap_sem prevents deadlock.
+ * pte locks because exclusive mmap_lock prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
new_pte = pte_offset_map(new_pmd, new_addr);
@@ -213,7 +213,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* We don't have to worry about the ordering of src and dst
- * ptlocks because exclusive mmap_sem prevents deadlock.
+ * ptlocks because exclusive mmap_lock prevents deadlock.
*/
old_ptl = pmd_lock(vma->vm_mm, old_pmd);
new_ptl = pmd_lockptr(mm, new_pmd);
@@ -266,7 +266,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE) {
bool moved;
/* See comment in move_ptes() */
@@ -413,9 +413,20 @@ static unsigned long move_vma(struct vm_area_struct *vma,
/* Always put back VM_ACCOUNT since we won't unmap */
vma->vm_flags |= VM_ACCOUNT;
- vm_acct_memory(vma_pages(new_vma));
+ vm_acct_memory(new_len >> PAGE_SHIFT);
}
+ /*
+ * VMAs can actually be merged back together in copy_vma
+ * calling merge_vma. This can happen with anonymous vmas
+ * which have not yet been faulted, so if we were to consider
+ * this VMA split we'll end up adding VM_ACCOUNT on the
+ * next VMA, which is completely unrelated if this VMA
+ * was re-merged.
+ */
+ if (split && new_vma == vma)
+ split = 0;
+
/* We always clear VM_LOCKED[ONFAULT] on the old vma */
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
@@ -685,7 +696,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (!new_len)
return ret;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
@@ -699,7 +710,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
* __do_munmap does all the needed commit accounting, and
- * downgrades mmap_sem to read if so directed.
+ * downgrades mmap_lock to read if so directed.
*/
if (old_len >= new_len) {
int retval;
@@ -709,7 +720,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (retval < 0 && old_len != new_len) {
ret = retval;
goto out;
- /* Returning 1 indicates mmap_sem is downgraded to read. */
+ /* Returning 1 indicates mmap_lock is downgraded to read. */
} else if (retval == 1)
downgraded = true;
ret = addr;
@@ -774,16 +785,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
out:
if (offset_in_page(ret)) {
vm_unacct_memory(charged);
- locked = 0;
+ locked = false;
}
if (downgraded)
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
else
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
userfaultfd_unmap_complete(mm, &uf_unmap_early);
- mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
+ mremap_userfaultfd_complete(&uf, addr, ret, old_len);
userfaultfd_unmap_complete(mm, &uf_unmap);
return ret;
}
diff --git a/mm/msync.c b/mm/msync.c
index c3bd3e75f687..69c6d2029531 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -57,7 +57,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
* If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -ENOMEM at the end.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, start);
for (;;) {
struct file *file;
@@ -88,12 +88,12 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
if ((flags & MS_SYNC) && file &&
(vma->vm_flags & VM_SHARED)) {
get_file(file);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
error = vfs_fsync_range(file, fstart, fend, 1);
fput(file);
if (error || start >= end)
goto out;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, start);
} else {
if (start >= end) {
@@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
}
}
out_unlock:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out:
return error ? : unmapped_error;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 318df4e236c9..cdcad5d61dd1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -140,7 +140,7 @@ void vfree(const void *addr)
}
EXPORT_SYMBOL(vfree);
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
/*
* You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
@@ -150,24 +150,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
-void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
{
- return __vmalloc(size, flags, PAGE_KERNEL);
+ return __vmalloc(size, gfp_mask);
+}
+
+void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+ int node, const void *caller)
+{
+ return __vmalloc(size, gfp_mask);
}
static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
{
void *ret;
- ret = __vmalloc(size, flags, PAGE_KERNEL);
+ ret = __vmalloc(size, flags);
if (ret) {
struct vm_area_struct *vma;
- down_write(&current->mm->mmap_sem);
+ mmap_write_lock(current->mm);
vma = find_vma(current->mm, (unsigned long)ret);
if (vma)
vma->vm_flags |= VM_USERMAP;
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
}
return ret;
@@ -179,12 +188,6 @@ void *vmalloc_user(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_user);
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
- return __vmalloc_user_flags(size, flags | __GFP_ZERO);
-}
-EXPORT_SYMBOL(vmalloc_user_node_flags);
-
struct page *vmalloc_to_page(const void *addr)
{
return virt_to_page(addr);
@@ -230,7 +233,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);
@@ -248,8 +251,7 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc);
@@ -302,7 +304,7 @@ EXPORT_SYMBOL(vzalloc_node);
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
}
/**
@@ -314,7 +316,7 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_32);
@@ -351,7 +353,7 @@ void vunmap(const void *addr)
}
EXPORT_SYMBOL(vunmap);
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
BUG();
return NULL;
@@ -369,18 +371,6 @@ void vm_unmap_aliases(void)
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
-/*
- * Implement a stub for vmalloc_sync_[un]mapping() if the architecture
- * chose not to have one.
- */
-void __weak vmalloc_sync_mappings(void)
-{
-}
-
-void __weak vmalloc_sync_unmappings(void)
-{
-}
-
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
BUG();
@@ -443,7 +433,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Ok, looks good - let it rip.
*/
- flush_icache_range(mm->brk, brk);
+ flush_icache_user_range(mm->brk, brk);
return mm->brk = brk;
}
@@ -592,7 +582,7 @@ static void put_nommu_region(struct vm_region *region)
* add a VMA into a process's mm_struct in the appropriate place in the list
* and tree and add to the address space's page tree also if not an anonymous
* page
- * - should be called with mm->mmap_sem held writelocked
+ * - should be called with mm->mmap_lock held writelocked
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
@@ -706,7 +696,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
/*
* look up the first VMA in which addr resides, NULL if none
- * - should be called with mm->mmap_sem at least held readlocked
+ * - should be called with mm->mmap_lock at least held readlocked
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
@@ -752,7 +742,7 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
/*
* look up the first VMA exactly that exactly matches addr
- * - should be called with mm->mmap_sem at least held readlocked
+ * - should be called with mm->mmap_lock at least held readlocked
*/
static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long addr,
@@ -1287,7 +1277,7 @@ share:
/* we flush the region from the icache only when the first executable
* mapping of it is made */
if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
- flush_icache_range(region->vm_start, region->vm_end);
+ flush_icache_user_range(region->vm_start, region->vm_end);
region->vm_icache_flushed = true;
}
@@ -1552,9 +1542,9 @@ int vm_munmap(unsigned long addr, size_t len)
struct mm_struct *mm = current->mm;
int ret;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
ret = do_munmap(mm, addr, len, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL(vm_munmap);
@@ -1641,9 +1631,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
{
unsigned long ret;
- down_write(&current->mm->mmap_sem);
+ mmap_write_lock(current->mm);
ret = do_mremap(addr, old_len, new_len, flags, new_addr);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return ret;
}
@@ -1715,7 +1705,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct vm_area_struct *vma;
int write = gup_flags & FOLL_WRITE;
- if (down_read_killable(&mm->mmap_sem))
+ if (mmap_read_lock_killable(mm))
return 0;
/* the access must start within one of the target process's mappings */
@@ -1738,7 +1728,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
len = 0;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return len;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dfc357614e56..b4e9491cb320 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -254,7 +254,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
+ enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;
@@ -294,7 +294,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
- high_zoneidx, oc->nodemask)
+ highest_zoneidx, oc->nodemask)
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;
@@ -569,7 +569,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
bool ret = true;
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (!mmap_read_trylock(mm)) {
trace_skip_task_reaping(tsk->pid);
return false;
}
@@ -577,8 +577,8 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
/*
* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
* work on the mm anymore. The check for MMF_OOM_SKIP must run
- * under mmap_sem for reading because it serializes against the
- * down_write();up_write() cycle in exit_mmap().
+ * under mmap_lock for reading because it serializes against the
+ * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
*/
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
trace_skip_task_reaping(tsk->pid);
@@ -600,7 +600,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
out_finish:
trace_finish_task_reaping(tsk->pid);
out_unlock:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret;
}
@@ -611,7 +611,7 @@ static void oom_reap_task(struct task_struct *tsk)
int attempts = 0;
struct mm_struct *mm = tsk->signal->oom_mm;
- /* Retry the down_read_trylock(mmap_sem) a few times */
+ /* Retry the mmap_read_trylock(mm) a few times */
while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
@@ -629,7 +629,7 @@ done:
/*
* Hide this mm from OOM killer because it has been either reaped or
- * somebody can't call up_write(mmap_sem).
+ * somebody can't call mmap_write_unlock(mm).
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
@@ -898,7 +898,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
/*
* Kill all user processes sharing victim->mm in other thread groups, if
* any. They don't get access to memory reserves, though, to avoid
- * depletion of all memory. This prevents mm->mmap_sem livelock when an
+ * depletion of all memory. This prevents mm->mmap_lock livelock when an
* oom killed thread cannot exit because it requires the semaphore and
* its contended by another thread trying to allocate memory itself.
* That thread will now get access to memory reserves since it has a
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7326b54ab728..28b3e7a67565 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -257,7 +257,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
* requiring writeback.
*
* This number of dirtyable pages is the base value of which the
- * user-configurable dirty ratio is the effictive number of pages that
+ * user-configurable dirty ratio is the effective number of pages that
* are allowed to be actually dirtied. Per individual zone, or
* globally by using the sum of dirtyable pages over all zones.
*
@@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void)
* Calculate @dtc->thresh and ->bg_thresh considering
* vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
* must ensure that @dtc->avail is set before calling this function. The
- * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
+ * dirty limits will be lifted by 1/4 for real-time tasks.
*/
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
@@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ if (rt_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
@@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
else
dirty = vm_dirty_ratio * node_memory / 100;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ if (rt_task(tsk))
dirty += dirty / 4;
return dirty;
@@ -505,15 +504,13 @@ bool node_dirty_ok(struct pglist_data *pgdat)
unsigned long nr_pages = 0;
nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
- nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
nr_pages += node_page_state(pgdat, NR_WRITEBACK);
return nr_pages <= limit;
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -524,8 +521,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write,
}
int dirty_background_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -535,9 +531,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write,
return ret;
}
-int dirty_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
int ret;
@@ -551,8 +546,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
}
int dirty_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
int ret;
@@ -759,7 +753,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
*
* Return: @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * dirty balancing includes all PG_dirty and PG_writeback pages.
*/
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
@@ -1567,7 +1561,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
struct dirty_throttle_control *sdtc;
- unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long nr_reclaimable; /* = file_dirty */
long period;
long pause;
long max_pause;
@@ -1587,14 +1581,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
unsigned long m_thresh = 0;
unsigned long m_bg_thresh = 0;
- /*
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- */
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
gdtc->avail = global_dirtyable_memory();
gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
@@ -1653,8 +1640,12 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
(!mdtc ||
m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
- unsigned long intv = dirty_poll_interval(dirty, thresh);
- unsigned long m_intv = ULONG_MAX;
+ unsigned long intv;
+ unsigned long m_intv;
+
+free_running:
+ intv = dirty_poll_interval(dirty, thresh);
+ m_intv = ULONG_MAX;
current->dirty_paused_when = now;
current->nr_dirtied = 0;
@@ -1673,9 +1664,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
* Calculate global domain's pos_ratio and select the
* global dtc by default.
*/
- if (!strictlimit)
+ if (!strictlimit) {
wb_dirty_limits(gdtc);
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ gdtc->wb_dirty <
+ dirty_freerun_ceiling(gdtc->wb_thresh,
+ gdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be throttled
+ * when below the per-wb freerun ceiling.
+ */
+ goto free_running;
+ }
+
dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
((gdtc->dirty > gdtc->thresh) || strictlimit);
@@ -1689,9 +1691,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
* both global and memcg domains. Choose the one
* w/ lower pos_ratio.
*/
- if (!strictlimit)
+ if (!strictlimit) {
wb_dirty_limits(mdtc);
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ mdtc->wb_dirty <
+ dirty_freerun_ceiling(mdtc->wb_thresh,
+ mdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be
+ * throttled when below the per-wb
+ * freerun ceiling.
+ */
+ goto free_running;
+ }
dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
((mdtc->dirty > mdtc->thresh) || strictlimit);
@@ -1938,8 +1951,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* as we're trying to decide whether to put more under writeback.
*/
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
domain_dirty_limits(gdtc);
if (gdtc->dirty > gdtc->bg_thresh)
@@ -1972,7 +1984,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
unsigned int old_interval = dirty_writeback_interval;
int ret;
@@ -2164,7 +2176,6 @@ int write_cache_pages(struct address_space *mapping,
int error;
struct pagevec pvec;
int nr_pages;
- pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
@@ -2173,8 +2184,7 @@ int write_cache_pages(struct address_space *mapping,
pagevec_init(&pvec);
if (wbc->range_cyclic) {
- writeback_index = mapping->writeback_index; /* prev offset */
- index = writeback_index;
+ index = mapping->writeback_index; /* prev offset */
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 69827d4fa052..48eb0f1410d4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
+#include <linux/padata.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -302,14 +303,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif
};
-compound_page_dtor * const compound_page_dtors[] = {
- NULL,
- free_compound_page,
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
+ [NULL_COMPOUND_DTOR] = NULL,
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
- free_huge_page,
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- free_transhuge_page,
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};
@@ -335,7 +336,6 @@ static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
@@ -348,7 +348,6 @@ static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -609,8 +608,7 @@ static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page, const char *reason,
- unsigned long bad_flags)
+static void bad_page(struct page *page, const char *reason)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -639,10 +637,6 @@ static void bad_page(struct page *page, const char *reason,
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
__dump_page(page, reason);
- bad_flags &= page->flags;
- if (bad_flags)
- pr_alert("bad because of flags: %#lx(%pGp)\n",
- bad_flags, &bad_flags);
dump_page_owner(page);
print_modules();
@@ -1077,13 +1071,9 @@ static inline bool page_expected_state(struct page *page,
return true;
}
-static void free_pages_check_bad(struct page *page)
+static const char *page_bad_reason(struct page *page, unsigned long flags)
{
- const char *bad_reason;
- unsigned long bad_flags;
-
- bad_reason = NULL;
- bad_flags = 0;
+ const char *bad_reason = NULL;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
@@ -1091,24 +1081,32 @@ static void free_pages_check_bad(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+ if (unlikely(page->flags & flags)) {
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
+ else
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- bad_page(page, bad_reason, bad_flags);
+ return bad_reason;
}
-static inline int free_pages_check(struct page *page)
+static void check_free_page_bad(struct page *page)
+{
+ bad_page(page,
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
+}
+
+static inline int check_free_page(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return 0;
/* Something has gone sideways, find it */
- free_pages_check_bad(page);
+ check_free_page_bad(page);
return 1;
}
@@ -1130,7 +1128,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
case 1:
/* the first tail page: ->mapping may be compound_mapcount() */
if (unlikely(compound_mapcount(page))) {
- bad_page(page, "nonzero compound_mapcount", 0);
+ bad_page(page, "nonzero compound_mapcount");
goto out;
}
break;
@@ -1142,17 +1140,17 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
break;
default:
if (page->mapping != TAIL_MAPPING) {
- bad_page(page, "corrupted mapping in tail page", 0);
+ bad_page(page, "corrupted mapping in tail page");
goto out;
}
break;
}
if (unlikely(!PageTail(page))) {
- bad_page(page, "PageTail not set", 0);
+ bad_page(page, "PageTail not set");
goto out;
}
if (unlikely(compound_head(page) != head_page)) {
- bad_page(page, "compound_head not consistent", 0);
+ bad_page(page, "compound_head not consistent");
goto out;
}
ret = 0;
@@ -1194,7 +1192,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
- if (unlikely(free_pages_check(page + i))) {
+ if (unlikely(check_free_page(page + i))) {
bad++;
continue;
}
@@ -1206,7 +1204,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_enabled() && PageKmemcg(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free)
- bad += free_pages_check(page);
+ bad += check_free_page(page);
if (bad)
return false;
@@ -1253,7 +1251,7 @@ static bool free_pcp_prepare(struct page *page)
static bool bulkfree_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
- return free_pages_check(page);
+ return check_free_page(page);
else
return false;
}
@@ -1274,7 +1272,7 @@ static bool free_pcp_prepare(struct page *page)
static bool bulkfree_pcp_prepare(struct page *page)
{
- return free_pages_check(page);
+ return check_free_page(page);
}
#endif /* CONFIG_DEBUG_VM */
@@ -1499,45 +1497,49 @@ void __free_pages_core(struct page *page, unsigned int order)
__free_pages(page, order);
}
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+#ifdef CONFIG_NEED_MULTIPLE_NODES
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
-int __meminit early_pfn_to_nid(unsigned long pfn)
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ */
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
{
- static DEFINE_SPINLOCK(early_pfn_lock);
+ unsigned long start_pfn, end_pfn;
int nid;
- spin_lock(&early_pfn_lock);
- nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid < 0)
- nid = first_online_node;
- spin_unlock(&early_pfn_lock);
+ if (state->last_start <= pfn && pfn < state->last_end)
+ return state->last_nid;
+
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+ if (nid != NUMA_NO_NODE) {
+ state->last_start = start_pfn;
+ state->last_end = end_pfn;
+ state->last_nid = nid;
+ }
return nid;
}
-#endif
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+int __meminit early_pfn_to_nid(unsigned long pfn)
{
+ static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
+ spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid >= 0 && nid != node)
- return false;
- return true;
-}
+ if (nid < 0)
+ nid = first_online_node;
+ spin_unlock(&early_pfn_lock);
-#else
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return true;
+ return nid;
}
-#endif
-
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
@@ -1607,6 +1609,7 @@ void set_zone_contiguous(struct zone *zone)
if (!__pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, zone))
return;
+ cond_resched();
}
/* We confirm that there is no hole */
@@ -1691,7 +1694,6 @@ static void __init deferred_free_pages(unsigned long pfn,
} else if (!(pfn & nr_pgmask)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
- touch_nmi_watchdog();
} else {
nr_free++;
}
@@ -1721,7 +1723,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
continue;
} else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
- touch_nmi_watchdog();
} else {
page++;
}
@@ -1815,16 +1816,43 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
return nr_pages;
}
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+ void *arg)
+{
+ unsigned long spfn, epfn;
+ struct zone *zone = arg;
+ u64 i;
+
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so that we
+ * can avoid introducing any issues with the buddy allocator.
+ */
+ while (spfn < end_pfn) {
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ cond_resched();
+ }
+}
+
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ return 1;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long spfn = 0, epfn = 0;
unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
struct zone *zone;
- int zid;
+ int zid, max_threads;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1844,6 +1872,13 @@ static int __init deferred_init_memmap(void *data)
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
pgdat->first_deferred_pfn = ULONG_MAX;
+ /*
+ * Once we unlock here, the zone cannot be grown anymore, thus if an
+ * interrupt thread must allocate this early in boot, zone must be
+ * pre-grown prior to start of deferred page initialization.
+ */
+ pgdat_resize_unlock(pgdat, &flags);
+
/* Only the highest zone is deferred so find it */
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
zone = pgdat->node_zones + zid;
@@ -1856,21 +1891,30 @@ static int __init deferred_init_memmap(void *data)
first_init_pfn))
goto zone_empty;
- /*
- * Initialize and free pages in MAX_ORDER sized increments so
- * that we can avoid introducing any issues with the buddy
- * allocator.
- */
- while (spfn < epfn)
- nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
-zone_empty:
- pgdat_resize_unlock(pgdat, &flags);
+ max_threads = deferred_page_init_max_threads(cpumask);
+ while (spfn < epfn) {
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
+ struct padata_mt_job job = {
+ .thread_fn = deferred_init_memmap_chunk,
+ .fn_arg = zone,
+ .start = spfn,
+ .size = epfn_align - spfn,
+ .align = PAGES_PER_SECTION,
+ .min_chunk = PAGES_PER_SECTION,
+ .max_threads = max_threads,
+ };
+
+ padata_do_multithreaded(&job);
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ epfn_align);
+ }
+zone_empty:
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n",
- pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
+ pr_info("node %d deferred pages initialised in %ums\n",
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
@@ -1908,17 +1952,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
pgdat_resize_lock(pgdat, &flags);
/*
- * If deferred pages have been initialized while we were waiting for
- * the lock, return true, as the zone was grown. The caller will retry
- * this zone. We won't return to this function since the caller also
- * has this static branch.
- */
- if (!static_branch_unlikely(&deferred_pages)) {
- pgdat_resize_unlock(pgdat, &flags);
- return true;
- }
-
- /*
* If someone grew this zone while we were waiting for spinlock, return
* true, as there might be enough pages already.
*/
@@ -1946,6 +1979,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
first_deferred_pfn = spfn;
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ touch_nmi_watchdog();
/* We should only stop along section boundaries */
if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
@@ -2091,31 +2125,14 @@ static inline void expand(struct zone *zone, struct page *page,
static void check_new_page_bad(struct page *page)
{
- const char *bad_reason = NULL;
- unsigned long bad_flags = 0;
-
- if (unlikely(atomic_read(&page->_mapcount) != -1))
- bad_reason = "nonzero mapcount";
- if (unlikely(page->mapping != NULL))
- bad_reason = "non-NULL mapping";
- if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _refcount";
if (unlikely(page->flags & __PG_HWPOISON)) {
- bad_reason = "HWPoisoned (hardware-corrupted)";
- bad_flags = __PG_HWPOISON;
/* Don't complain about hwpoisoned pages */
page_mapcount_reset(page); /* remove PageBuddy */
return;
}
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
- }
-#ifdef CONFIG_MEMCG
- if (unlikely(page->mem_cgroup))
- bad_reason = "page still charged to cgroup";
-#endif
- bad_page(page, bad_reason, bad_flags);
+
+ bad_page(page,
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
}
/*
@@ -2400,6 +2417,14 @@ static inline void boost_watermark(struct zone *zone)
if (!watermark_boost_factor)
return;
+ /*
+ * Don't bother in zones that are unlikely to produce results.
+ * On small machines, including kdump capture kernels running
+ * in a small area, boosting the watermark can cause an out of
+ * memory situation immediately.
+ */
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
+ return;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
@@ -2600,7 +2625,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
int order;
bool ret;
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
ac->nodemask) {
/*
* Preserve at least one pageblock unless memory pressure
@@ -2759,6 +2784,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
{
struct page *page;
+#ifdef CONFIG_CMA
+ /*
+ * Balance movable allocations between regular and CMA areas by
+ * allocating from CMA when over half of the zone's free memory
+ * is in the CMA area.
+ */
+ if (migratetype == MIGRATE_MOVABLE &&
+ zone_page_state(zone, NR_FREE_CMA_PAGES) >
+ zone_page_state(zone, NR_FREE_PAGES) / 2) {
+ page = __rmqueue_cma_fallback(zone, order);
+ if (page)
+ return page;
+ }
+#endif
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
@@ -3455,7 +3494,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
* to check in the allocation paths if no pages are free.
*/
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, unsigned int alloc_flags,
+ int highest_zoneidx, unsigned int alloc_flags,
long free_pages)
{
long min = mark;
@@ -3500,7 +3539,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
* are not met, then a high-order request also cannot go ahead
* even if a suitable page happened to be free.
*/
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
return false;
/* If this is an order-0 request then the watermark is fine */
@@ -3533,14 +3572,15 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, unsigned int alloc_flags)
+ int highest_zoneidx, unsigned int alloc_flags)
{
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+ unsigned long mark, int highest_zoneidx,
+ unsigned int alloc_flags)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
long cma_pages = 0;
@@ -3558,22 +3598,23 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* the caller is !atomic then it'll uselessly search the free
* list. That corner case is then slower but it is harmless.
*/
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+ if (!order && (free_pages - cma_pages) >
+ mark + z->lowmem_reserve[highest_zoneidx])
return true;
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
free_pages);
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx)
+ unsigned long mark, int highest_zoneidx)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
free_pages);
}
@@ -3650,8 +3691,8 @@ retry:
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
struct page *page;
unsigned long mark;
@@ -3706,7 +3747,7 @@ retry:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
- ac_classzone_idx(ac), alloc_flags)) {
+ ac->highest_zoneidx, alloc_flags)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -3739,7 +3780,7 @@ retry:
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
- ac_classzone_idx(ac), alloc_flags))
+ ac->highest_zoneidx, alloc_flags))
goto try_this_zone;
continue;
@@ -3898,7 +3939,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
- if (ac->high_zoneidx < ZONE_NORMAL)
+ if (ac->highest_zoneidx < ZONE_NORMAL)
goto out;
if (pm_suspended_storage())
goto out;
@@ -4101,10 +4142,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
* Let's give them a good hope and keep retrying while the order-0
* watermarks are OK.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
- ac_classzone_idx(ac), alloc_flags))
+ ac->highest_zoneidx, alloc_flags))
return true;
}
return false;
@@ -4228,12 +4269,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
struct zoneref *z;
struct zone *zone;
pg_data_t *last_pgdat = NULL;
- enum zone_type high_zoneidx = ac->high_zoneidx;
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (last_pgdat != zone->zone_pgdat)
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
last_pgdat = zone->zone_pgdat;
}
}
@@ -4276,7 +4317,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_HARDER;
#ifdef CONFIG_CMA
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ if (gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
#endif
return alloc_flags;
@@ -4368,8 +4409,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* request even if all reclaimable pages are considered then we are
* screwed and have to go OOM.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
unsigned long min_wmark = min_wmark_pages(zone);
@@ -4383,7 +4424,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* reclaimable pages?
*/
wmark = __zone_watermark_ok(zone, order, min_wmark,
- ac_classzone_idx(ac), alloc_flags, available);
+ ac->highest_zoneidx, alloc_flags, available);
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
@@ -4502,7 +4543,7 @@ retry_cpuset:
* could end up iterating over non-eligible zones endlessly.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
@@ -4589,7 +4630,7 @@ retry:
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags */
@@ -4723,10 +4764,10 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
- ac->high_zoneidx = gfp_zone(gfp_mask);
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
+ ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
@@ -4762,7 +4803,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
* may get reset for allocations that ignore memory policies.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
}
/*
@@ -5310,7 +5351,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
@@ -5323,7 +5364,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
- global_node_page_state(NR_UNSTABLE_NFS),
global_node_page_state(NR_SLAB_RECLAIMABLE),
global_node_page_state(NR_SLAB_UNRECLAIMABLE),
global_node_page_state(NR_FILE_MAPPED),
@@ -5356,7 +5396,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
- " unstable:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -5378,7 +5417,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -5411,6 +5449,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" managed:%lukB"
" mlocked:%lukB"
" kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
" pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
@@ -5433,6 +5474,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ zone_page_state(zone, NR_KERNEL_SCS_KB),
+#endif
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
@@ -5531,36 +5575,17 @@ static int __parse_numa_zonelist_order(char *s)
return 0;
}
-static __init int setup_numa_zonelist_order(char *s)
-{
- if (!s)
- return 0;
-
- return __parse_numa_zonelist_order(s);
-}
-early_param("numa_zonelist_order", setup_numa_zonelist_order);
-
char numa_zonelist_order[] = "Node";
/*
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
- loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
- char *str;
- int ret;
-
- if (!write)
- return proc_dostring(table, write, buffer, length, ppos);
- str = memdup_user_nul(buffer, 16);
- if (IS_ERR(str))
- return PTR_ERR(str);
-
- ret = __parse_numa_zonelist_order(str);
- kfree(str);
- return ret;
+ if (write)
+ return __parse_numa_zonelist_order(buffer);
+ return proc_dostring(table, write, buffer, length, ppos);
}
@@ -5680,14 +5705,13 @@ static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
int node, load, nr_nodes = 0;
- nodemask_t used_mask;
+ nodemask_t used_mask = NODE_MASK_NONE;
int local_node, prev_node;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
load = nr_online_nodes;
prev_node = local_node;
- nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
@@ -5899,7 +5923,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static struct memblock_region *r;
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
@@ -5915,27 +5938,9 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return true;
}
}
-#endif
return false;
}
-#ifdef CONFIG_SPARSEMEM
-/* Skip PFNs that belong to non-present sections */
-static inline __meminit unsigned long next_pfn(unsigned long pfn)
-{
- const unsigned long section_nr = pfn_to_section_nr(++pfn);
-
- if (present_section_nr(section_nr))
- return pfn;
- return section_nr_to_pfn(next_present_section_nr(section_nr));
-}
-#else
-static inline __meminit unsigned long next_pfn(unsigned long pfn)
-{
- return pfn++;
-}
-#endif
-
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
@@ -5975,14 +5980,6 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* function. They do not exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn)) {
- pfn = next_pfn(pfn);
- continue;
- }
- if (!early_pfn_in_nid(pfn, nid)) {
- pfn++;
- continue;
- }
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -6098,9 +6095,23 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
void __meminit __weak memmap_init(unsigned long size, int nid,
- unsigned long zone, unsigned long start_pfn)
+ unsigned long zone,
+ unsigned long range_start_pfn)
{
- memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+ unsigned long start_pfn, end_pfn;
+ unsigned long range_end_pfn = range_start_pfn + size;
+ int i;
+
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+ end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+
+ if (end_pfn > start_pfn) {
+ size = end_pfn - start_pfn;
+ memmap_init_zone(size, nid, zone, start_pfn,
+ MEMMAP_EARLY, NULL);
+ }
+ }
}
static int zone_batchsize(struct zone *zone)
@@ -6252,10 +6263,25 @@ void __init setup_per_cpu_pageset(void)
{
struct pglist_data *pgdat;
struct zone *zone;
+ int __maybe_unused cpu;
for_each_populated_zone(zone)
setup_zone_pageset(zone);
+#ifdef CONFIG_NUMA
+ /*
+ * Unpopulated zones continue using the boot pagesets.
+ * The numa stats for these pagesets need to be reset.
+ * Otherwise, they will end up skewing the stats of
+ * the nodes these zones are associated with.
+ */
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
+ memset(pcp->vm_numa_stat_diff, 0,
+ sizeof(pcp->vm_numa_stat_diff));
+ }
+#endif
+
for_each_online_pgdat(pgdat)
pgdat->per_cpu_nodestats =
alloc_percpu(struct per_cpu_nodestat);
@@ -6298,57 +6324,6 @@ void __meminit init_currently_empty_zone(struct zone *zone,
zone->initialized = 1;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-
-/*
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- */
-int __meminit __early_pfn_to_nid(unsigned long pfn,
- struct mminit_pfnnid_cache *state)
-{
- unsigned long start_pfn, end_pfn;
- int nid;
-
- if (state->last_start <= pfn && pfn < state->last_end)
- return state->last_nid;
-
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != NUMA_NO_NODE) {
- state->last_start = start_pfn;
- state->last_end = end_pfn;
- state->last_nid = nid;
- }
-
- return nid;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-/**
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
- *
- * If an architecture guarantees that all ranges registered contain no holes
- * and may be freed, this this function may be used instead of calling
- * memblock_free_early_nid() manually.
- */
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
-{
- unsigned long start_pfn, end_pfn;
- int i, this_nid;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
- start_pfn = min(start_pfn, max_low_pfn);
- end_pfn = min(end_pfn, max_low_pfn);
-
- if (start_pfn < end_pfn)
- memblock_free_early_nid(PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT,
- this_nid);
- }
-}
-
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
* @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -6461,8 +6436,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn,
- unsigned long *ignored)
+ unsigned long *zone_end_pfn)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
@@ -6526,8 +6500,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *ignored)
+ unsigned long node_end_pfn)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
@@ -6574,45 +6547,9 @@ static unsigned long __init zone_absent_pages_in_node(int nid,
return nr_absent;
}
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline unsigned long __init zone_spanned_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn,
- unsigned long *zones_size)
-{
- unsigned int zone;
-
- *zone_start_pfn = node_start_pfn;
- for (zone = 0; zone < zone_type; zone++)
- *zone_start_pfn += zones_size[zone];
-
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
-
- return zones_size[zone_type];
-}
-
-static inline unsigned long __init zone_absent_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zholes_size)
-{
- if (!zholes_size)
- return 0;
-
- return zholes_size[zone_type];
-}
-
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zones_size,
- unsigned long *zholes_size)
+ unsigned long node_end_pfn)
{
unsigned long realtotalpages = 0, totalpages = 0;
enum zone_type i;
@@ -6620,17 +6557,21 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long spanned, absent;
unsigned long size, real_size;
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- &zone_start_pfn,
- &zone_end_pfn,
- zones_size);
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn, node_end_pfn,
- zholes_size);
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn);
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn);
+
+ size = spanned;
+ real_size = size - absent;
+
if (size)
zone->zone_start_pfn = zone_start_pfn;
else
@@ -6930,10 +6871,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
mem_map -= offset;
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
}
@@ -6950,30 +6889,25 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
#endif
-void __init free_area_init_node(int nid, unsigned long *zones_size,
- unsigned long node_start_pfn,
- unsigned long *zholes_size)
+static void __init free_area_init_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
+
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pgdat->node_id = nid;
- pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_start_pfn = start_pfn;
pgdat->per_cpu_nodestats = NULL;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
-#else
- start_pfn = node_start_pfn;
-#endif
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
- zones_size, zholes_size);
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
alloc_node_mem_map(pgdat);
pgdat_set_deferred_range(pgdat);
@@ -6981,6 +6915,11 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
free_area_init_core(pgdat);
}
+void __init free_area_init_memoryless_node(int nid)
+{
+ free_area_init_node(nid);
+}
+
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Initialize all valid struct pages in the range [spfn, epfn) and mark them
@@ -7064,8 +7003,6 @@ static inline void __init init_unavailable_mem(void)
}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
@@ -7129,24 +7066,6 @@ unsigned long __init node_map_pfn_alignment(void)
return ~accl_mask + 1;
}
-/* Find the lowest pfn for a node */
-static unsigned long __init find_min_pfn_for_node(int nid)
-{
- unsigned long min_pfn = ULONG_MAX;
- unsigned long start_pfn;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
- min_pfn = min(min_pfn, start_pfn);
-
- if (min_pfn == ULONG_MAX) {
- pr_warn("Could not find start_pfn for node %d\n", nid);
- return 0;
- }
-
- return min_pfn;
-}
-
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
@@ -7155,7 +7074,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
- return find_min_pfn_for_node(MAX_NUMNODES);
+ return PHYS_PFN(memblock_start_of_DRAM());
}
/*
@@ -7208,7 +7127,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (!memblock_is_hotpluggable(r))
continue;
- nid = r->nid;
+ nid = memblock_get_region_node(r);
usable_startpfn = PFN_DOWN(r->base);
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
@@ -7229,7 +7148,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (memblock_is_mirror(r))
continue;
- nid = r->nid;
+ nid = memblock_get_region_node(r);
usable_startpfn = memblock_region_memory_base_pfn(r);
@@ -7244,7 +7163,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
if (mem_below_4gb_not_mirrored)
- pr_warn("This configuration results in unmirrored kernel memory.");
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
goto out2;
}
@@ -7409,8 +7328,17 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
}
}
+/*
+ * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn sorted in the descending order
+ */
+bool __weak arch_has_descending_max_zone_pfns(void)
+{
+ return false;
+}
+
/**
- * free_area_init_nodes - Initialise all pg_data_t and zone data
+ * free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
@@ -7422,10 +7350,11 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
+void __init free_area_init(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
- int i, nid;
+ int i, nid, zone;
+ bool descending;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
@@ -7434,14 +7363,20 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
sizeof(arch_zone_highest_possible_pfn));
start_pfn = find_min_pfn_with_active_regions();
+ descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
+ if (descending)
+ zone = MAX_NR_ZONES - i - 1;
+ else
+ zone = i;
+
+ if (zone == ZONE_MOVABLE)
continue;
- end_pfn = max(max_zone_pfn[i], start_pfn);
- arch_zone_lowest_possible_pfn[i] = start_pfn;
- arch_zone_highest_possible_pfn[i] = end_pfn;
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
start_pfn = end_pfn;
}
@@ -7494,8 +7429,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
- free_area_init_node(nid, NULL,
- find_min_pfn_for_node(nid), NULL);
+ free_area_init_node(nid);
/* Any memory on that node */
if (pgdat->node_present_pages)
@@ -7560,8 +7494,6 @@ static int __init cmdline_parse_movablecore(char *p)
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
@@ -7684,13 +7616,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
dma_reserve = new_dma_reserve;
}
-void __init free_area_init(unsigned long *zones_size)
-{
- init_unavailable_mem();
- free_area_init_node(0, zones_size,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
-}
-
static int page_alloc_cpu_dead(unsigned int cpu)
{
@@ -7808,9 +7733,10 @@ static void setup_per_zone_lowmem_reserve(void)
idx--;
lower_zone = pgdat->node_zones + idx;
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
- sysctl_lowmem_reserve_ratio[idx] = 0;
+ if (!sysctl_lowmem_reserve_ratio[idx] ||
+ !zone_managed_pages(lower_zone)) {
lower_zone->lowmem_reserve[j] = 0;
+ continue;
} else {
lower_zone->lowmem_reserve[j] =
managed_pages / sysctl_lowmem_reserve_ratio[idx];
@@ -7875,9 +7801,9 @@ static void __setup_per_zone_wmarks(void)
mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
+ zone->watermark_boost = 0;
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
- zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -7963,7 +7889,7 @@ core_initcall(init_per_zone_wmark_min)
* changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -7978,20 +7904,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int rc;
-
- rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (rc)
- return rc;
-
- return 0;
-}
-
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8021,7 +7935,7 @@ static void setup_min_unmapped_ratio(void)
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8048,7 +7962,7 @@ static void setup_min_slab_ratio(void)
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8072,9 +7986,17 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
* if in function of the boot time zone sizes.
*/
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
+ int i;
+
proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
+ sysctl_lowmem_reserve_ratio[i] = 0;
+ }
+
setup_per_zone_lowmem_reserve();
return 0;
}
@@ -8094,7 +8016,7 @@ static void __zone_pcp_update(struct zone *zone)
* pagelist can have before it gets flushed back to buddy allocator.
*/
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int old_percpu_pagelist_fraction;
@@ -8238,7 +8160,7 @@ void *__init alloc_large_system_hash(const char *tablename,
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
} else if (get_order(size) >= MAX_ORDER || hashdist) {
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ table = __vmalloc(size, gfp_flags);
virt = true;
} else {
/*
@@ -8363,6 +8285,19 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;
+ /*
+ * We treat all PageOffline() pages as movable when offlining
+ * to give drivers a chance to decrement their reference count
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
+ * can be offlined as there are no direct references anymore.
+ * For actually unmovable PageOffline() where the driver does
+ * not support this, we will fail later when trying to actually
+ * move these pages that still have a reference count > 0.
+ * (false negatives in this function only)
+ */
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ continue;
+
if (__PageMovable(page) || PageLRU(page))
continue;
@@ -8402,7 +8337,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
- unsigned long nr_reclaimed;
+ unsigned int nr_reclaimed;
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
@@ -8594,6 +8529,7 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+EXPORT_SYMBOL(alloc_contig_range);
static int __alloc_contig_pages(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
@@ -8709,6 +8645,7 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
+EXPORT_SYMBOL(free_contig_range);
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
@@ -8781,6 +8718,17 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
offlined_pages++;
continue;
}
+ /*
+ * At this point all remaining PageOffline() pages have a
+ * reference count of 0 and can simply be skipped.
+ */
+ if (PageOffline(page)) {
+ BUG_ON(page_count(page));
+ BUG_ON(PageBuddy(page));
+ pfn++;
+ offlined_pages++;
+ continue;
+ }
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 295512465065..057c61df12db 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -4,6 +4,7 @@
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
+#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
@@ -30,13 +31,9 @@
*/
static struct page *page_idle_get_page(unsigned long pfn)
{
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
pg_data_t *pgdat;
- if (!pfn_valid(pfn))
- return NULL;
-
- page = pfn_to_page(pfn);
if (!page || !PageLRU(page) ||
!get_page_unless_zero(page))
return NULL;
diff --git a/mm/page_io.c b/mm/page_io.c
index 76965be1d40e..e8726f3e3820 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -25,7 +25,6 @@
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
-#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 2c11a38d6e87..f6d07c5f0d34 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -151,6 +151,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* a bit mask)
* MEMORY_OFFLINE - isolate to offline (!allocate) memory
* e.g., skip over PageHWPoison() pages
+ * and PageOffline() pages.
* REPORT_FAILURE - report details about the failure to
* isolate the range
*
@@ -259,6 +260,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
+ else if ((flags & MEMORY_OFFLINE) && PageOffline(page) &&
+ !page_count(page))
+ /*
+ * The responsible driver agreed to skip PageOffline()
+ * pages when offlining memory by dropping its
+ * reference in MEM_GOING_OFFLINE.
+ */
+ pfn++;
else
break;
}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 18ecde9f45b2..360461509423 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
continue;
page_owner = get_page_owner(page_ext);
- page_mt = gfpflags_to_migratetype(
- page_owner->gfp_mask);
+ page_mt = gfp_migratetype(page_owner->gfp_mask);
if (pageblock_mt != page_mt) {
if (is_migrate_cma(pageblock_mt))
count[MIGRATE_MOVABLE]++;
@@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
- page_mt = gfpflags_to_migratetype(page_owner->gfp_mask);
+ page_mt = gfp_migratetype(page_owner->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
@@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page)
page_owner = get_page_owner(page_ext);
gfp_mask = page_owner->gfp_mask;
- mt = gfpflags_to_migratetype(gfp_mask);
+ mt = gfp_migratetype(gfp_mask);
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
pr_alert("page_owner info is not present (never set?)\n");
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
index aa6d37f4dc22..2c385dd4ddbd 100644
--- a/mm/page_reporting.h
+++ b/mm/page_reporting.h
@@ -7,7 +7,7 @@
#include <linux/page-isolation.h>
#include <linux/jump_label.h>
#include <linux/slab.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <linux/scatterlist.h>
#define PAGE_REPORTING_MIN_ORDER pageblock_order
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 928df1638c30..e81640d9f177 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -373,7 +373,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
* caller-specific data to callbacks, @private should be helpful.
*
* Locking:
- * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
+ * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
* because these function traverse vma list and/or access to vma's data.
*/
int walk_page_range(struct mm_struct *mm, unsigned long start,
@@ -395,7 +395,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (!walk.mm)
return -EINVAL;
- lockdep_assert_held(&walk.mm->mmap_sem);
+ mmap_assert_locked(walk.mm);
vma = find_vma(walk.mm, start);
do {
@@ -453,7 +453,7 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
if (start >= end || !walk.mm)
return -EINVAL;
- lockdep_assert_held(&walk.mm->mmap_sem);
+ mmap_assert_locked(walk.mm);
return __walk_page_range(start, end, &walk);
}
@@ -472,7 +472,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
if (!walk.mm)
return -EINVAL;
- lockdep_assert_held(&walk.mm->mmap_sem);
+ mmap_assert_locked(walk.mm);
err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
if (err > 0)
@@ -498,11 +498,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
* Also see walk_page_range() for additional information.
*
* Locking:
- * This function can't require that the struct mm_struct::mmap_sem is held,
+ * This function can't require that the struct mm_struct::mmap_lock is held,
* since @mapping may be mapped by multiple processes. Instead
* @mapping->i_mmap_rwsem must be held. This might have implications in the
* callbacks, and it's up tho the caller to ensure that the
- * struct mm_struct::mmap_sem is not needed.
+ * struct mm_struct::mmap_lock is not needed.
*
* Also this means that a caller can't rely on the struct
* vm_area_struct::vm_flags to be constant across a call,
diff --git a/mm/percpu.c b/mm/percpu.c
index d7e3bc649f4e..696367b18222 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -80,6 +80,7 @@
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
@@ -481,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
if (size <= PAGE_SIZE)
return kzalloc(size, gfp);
else
- return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
+ return __vmalloc(size, gfp | __GFP_ZERO);
}
/**
@@ -1557,10 +1558,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
- /* whitelisted flags that can be passed to the backing allocators */
- gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
- bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
- bool do_warn = !(gfp & __GFP_NOWARN);
+ gfp_t pcpu_gfp;
+ bool is_atomic;
+ bool do_warn;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
const char *err;
@@ -1569,6 +1569,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
void __percpu *ptr;
size_t bits, bit_align;
+ gfp = current_gfp_context(gfp);
+ /* whitelisted flags that can be passed to the backing allocators */
+ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+ is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
+ do_warn = !(gfp & __GFP_NOWARN);
+
/*
* There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
* therefore alignment must be a minimum of that many bytes.
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 3d7c01e76efc..9578db83e312 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -2,15 +2,15 @@
/*
* mm/pgtable-generic.c
*
- * Generic pgtable methods declared in asm-generic/pgtable.h
+ * Generic pgtable methods declared in linux/pgtable.h
*
* Copyright (C) 2010 Linus Torvalds
*/
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
+#include <linux/pgtable.h>
#include <asm/tlb.h>
-#include <asm-generic/pgtable.h>
/*
* If a p?d_bad entry is found while walking page tables, report
@@ -53,7 +53,7 @@ void pmd_clear_bad(pmd_t *pmd)
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
- * Only sets the access flags (dirty, accessed), as well as write
+ * Only sets the access flags (dirty, accessed), as well as write
* permission. Furthermore, we know it always gets set to a "more
* permissive" setting, which allows most architectures to optimize
* this. We return whether the PTE actually changed, which in turn
@@ -194,7 +194,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
- pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
+ pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 74e957e302fe..cc85ce81914a 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -104,12 +104,12 @@ static int process_vm_rw_single_vec(unsigned long addr,
* access remotely because task/mm might not
* current/current->mm
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
flags, process_pages,
NULL, &locked);
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (pinned_pages <= 0)
return -EFAULT;
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 26208d0d03b7..ba88ec43ff21 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 0, pgd_val(val));
+
if (pgd_leaf(val))
st->note_page(st, addr, 0, pgd_val(val));
@@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 1, p4d_val(val));
+
if (p4d_leaf(val))
st->note_page(st, addr, 1, p4d_val(val));
@@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 2, pud_val(val));
+
if (pud_leaf(val))
st->note_page(st, addr, 2, pud_val(val));
@@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 3, pmd_val(val));
if (pmd_leaf(val))
st->note_page(st, addr, 3, pmd_val(val));
@@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
+ pte_t val = READ_ONCE(*pte);
+
+ if (st->effective_prot)
+ st->effective_prot(st, 4, pte_val(val));
- st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte)));
+ st->note_page(st, addr, 4, pte_val(val));
return 0;
}
@@ -126,13 +141,13 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
const struct ptdump_range *range = st->range;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
while (range->start != range->end) {
walk_page_range_novma(mm, range->start, range->end,
&ptdump_ops, pgd, st);
range++;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
diff --git a/mm/readahead.c b/mm/readahead.c
index 2fe72cd29b47..3c9a8dd7c56c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -22,6 +22,7 @@
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
+#include <linux/sched/mm.h>
#include "internal.h"
@@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
EXPORT_SYMBOL(read_cache_pages);
-static int read_pages(struct address_space *mapping, struct file *filp,
- struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
+static void read_pages(struct readahead_control *rac, struct list_head *pages,
+ bool skip_page)
{
+ const struct address_space_operations *aops = rac->mapping->a_ops;
+ struct page *page;
struct blk_plug plug;
- unsigned page_idx;
- int ret;
+
+ if (!readahead_count(rac))
+ goto out;
blk_start_plug(&plug);
- if (mapping->a_ops->readpages) {
- ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+ if (aops->readahead) {
+ aops->readahead(rac);
+ /* Clean up the remaining pages */
+ while ((page = readahead_page(rac))) {
+ unlock_page(page);
+ put_page(page);
+ }
+ } else if (aops->readpages) {
+ aops->readpages(rac->file, rac->mapping, pages,
+ readahead_count(rac));
/* Clean up the remaining pages */
put_pages_list(pages);
- goto out;
- }
-
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = lru_to_page(pages);
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
- mapping->a_ops->readpage(filp, page);
- put_page(page);
+ rac->_index += rac->_nr_pages;
+ rac->_nr_pages = 0;
+ } else {
+ while ((page = readahead_page(rac))) {
+ aops->readpage(rac->file, page);
+ put_page(page);
+ }
}
- ret = 0;
-out:
blk_finish_plug(&plug);
- return ret;
+ BUG_ON(!list_empty(pages));
+ BUG_ON(readahead_count(rac));
+
+out:
+ if (skip_page)
+ rac->_index++;
}
-/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
- * the pages first, then submits them for I/O. This avoids the very bad
- * behaviour which would occur if page allocations are causing VM writeback.
- * We really don't want to intermingle reads and writes like that.
+/**
+ * page_cache_readahead_unbounded - Start unchecked readahead.
+ * @mapping: File address space.
+ * @file: This instance of the open file; used for authentication.
+ * @index: First page index to read.
+ * @nr_to_read: The number of pages to read.
+ * @lookahead_size: Where to start the next readahead.
*
- * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ * This function is for filesystems to call when they want to start
+ * readahead beyond a file's stated i_size. This is almost certainly
+ * not the function you want to call. Use page_cache_async_readahead()
+ * or page_cache_sync_readahead() instead.
+ *
+ * Context: File is referenced by caller. Mutexes may be held by caller.
+ * May sleep, but will not reenter filesystem to reclaim memory.
*/
-unsigned int __do_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+void page_cache_readahead_unbounded(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read,
unsigned long lookahead_size)
{
- struct inode *inode = mapping->host;
- struct page *page;
- unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
- int page_idx;
- unsigned int nr_pages = 0;
- loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ struct readahead_control rac = {
+ .mapping = mapping,
+ .file = file,
+ ._index = index,
+ };
+ unsigned long i;
- if (isize == 0)
- goto out;
-
- end_index = ((isize - 1) >> PAGE_SHIFT);
+ /*
+ * Partway through the readahead operation, we will have added
+ * locked pages to the page cache, but will not yet have submitted
+ * them for I/O. Adding another page may need to allocate memory,
+ * which can trigger memory reclaim. Telling the VM we're in
+ * the middle of a filesystem operation will cause it to not
+ * touch file-backed pages, preventing a deadlock. Most (all?)
+ * filesystems already specify __GFP_NOFS in their mapping's
+ * gfp_mask, but let's be explicit here.
+ */
+ unsigned int nofs = memalloc_nofs_save();
/*
* Preallocate as many pages as we will need.
*/
- for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
- pgoff_t page_offset = offset + page_idx;
+ for (i = 0; i < nr_to_read; i++) {
+ struct page *page = xa_load(&mapping->i_pages, index + i);
- if (page_offset > end_index)
- break;
+ BUG_ON(index + i != rac._index + rac._nr_pages);
- page = xa_load(&mapping->i_pages, page_offset);
if (page && !xa_is_value(page)) {
/*
- * Page already present? Kick off the current batch of
- * contiguous pages before continuing with the next
- * batch.
+ * Page already present? Kick off the current batch
+ * of contiguous pages before continuing with the
+ * next batch. This page may be the one we would
+ * have intended to mark as Readahead, but we don't
+ * have a stable reference to this page, and it's
+ * not worth getting one just for that.
*/
- if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages,
- gfp_mask);
- nr_pages = 0;
+ read_pages(&rac, &page_pool, true);
continue;
}
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
- page->index = page_offset;
- list_add(&page->lru, &page_pool);
- if (page_idx == nr_to_read - lookahead_size)
+ if (mapping->a_ops->readpages) {
+ page->index = index + i;
+ list_add(&page->lru, &page_pool);
+ } else if (add_to_page_cache_lru(page, mapping, index + i,
+ gfp_mask) < 0) {
+ put_page(page);
+ read_pages(&rac, &page_pool, true);
+ continue;
+ }
+ if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
- nr_pages++;
+ rac._nr_pages++;
}
/*
@@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
- BUG_ON(!list_empty(&page_pool));
-out:
- return nr_pages;
+ read_pages(&rac, &page_pool, false);
+ memalloc_nofs_restore(nofs);
+}
+EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
+
+/*
+ * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ */
+void __do_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read,
+ unsigned long lookahead_size)
+{
+ struct inode *inode = mapping->host;
+ loff_t isize = i_size_read(inode);
+ pgoff_t end_index; /* The last page we want to read */
+
+ if (isize == 0)
+ return;
+
+ end_index = (isize - 1) >> PAGE_SHIFT;
+ if (index > end_index)
+ return;
+ /* Don't read past the page containing the last byte of the file */
+ if (nr_to_read > end_index - index)
+ nr_to_read = end_index - index + 1;
+
+ page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
+ lookahead_size);
}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
-int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+void force_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t index, unsigned long nr_to_read)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
struct file_ra_state *ra = &filp->f_ra;
unsigned long max_pages;
- if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
- return -EINVAL;
+ if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
+ !mapping->a_ops->readahead))
+ return;
/*
* If the request exceeds the readahead window, allow the read to
@@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
+ __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
- offset += this_chunk;
+ index += this_chunk;
nr_to_read -= this_chunk;
}
- return 0;
}
/*
@@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
*/
/*
- * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * Count contiguously cached pages from @index-1 to @index-@max,
* this count is a conservative estimation of
* - length of the sequential read sequence, or
* - thrashing threshold in memory tight systems
*/
static pgoff_t count_history_pages(struct address_space *mapping,
- pgoff_t offset, unsigned long max)
+ pgoff_t index, unsigned long max)
{
pgoff_t head;
rcu_read_lock();
- head = page_cache_prev_miss(mapping, offset - 1, max);
+ head = page_cache_prev_miss(mapping, index - 1, max);
rcu_read_unlock();
- return offset - 1 - head;
+ return index - 1 - head;
}
/*
@@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping,
*/
static int try_context_readahead(struct address_space *mapping,
struct file_ra_state *ra,
- pgoff_t offset,
+ pgoff_t index,
unsigned long req_size,
unsigned long max)
{
pgoff_t size;
- size = count_history_pages(mapping, offset, max);
+ size = count_history_pages(mapping, index, max);
/*
* not enough history pages:
@@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping,
* starts from beginning of file:
* it is a strong indication of long-run stream (or whole-file-read)
*/
- if (size >= offset)
+ if (size >= index)
size *= 2;
- ra->start = offset;
+ ra->start = index;
ra->size = min(size + req_size, max);
ra->async_size = 1;
@@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping,
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
-static unsigned long
-ondemand_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- bool hit_readahead_marker, pgoff_t offset,
- unsigned long req_size)
+static void ondemand_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ bool hit_readahead_marker, pgoff_t index,
+ unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
- pgoff_t prev_offset;
+ pgoff_t prev_index;
/*
* If the request exceeds the readahead window, allow the read to
@@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping,
/*
* start of file
*/
- if (!offset)
+ if (!index)
goto initial_readahead;
/*
- * It's the expected callback offset, assume sequential access.
+ * It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- if ((offset == (ra->start + ra->size - ra->async_size) ||
- offset == (ra->start + ra->size))) {
+ if ((index == (ra->start + ra->size - ra->async_size) ||
+ index == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_miss(mapping, offset + 1, max_pages);
+ start = page_cache_next_miss(mapping, index + 1, max_pages);
rcu_read_unlock();
- if (!start || start - offset > max_pages)
- return 0;
+ if (!start || start - index > max_pages)
+ return;
ra->start = start;
- ra->size = start - offset; /* old async_size */
+ ra->size = start - index; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
- * trivial case: (offset - prev_offset) == 1
- * unaligned reads: (offset - prev_offset) == 0
+ * trivial case: (index - prev_index) == 1
+ * unaligned reads: (index - prev_index) == 0
*/
- prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
- if (offset - prev_offset <= 1UL)
+ prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
+ if (index - prev_index <= 1UL)
goto initial_readahead;
/*
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
+ if (try_context_readahead(mapping, ra, index, req_size, max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
- return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+ __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+ return;
initial_readahead:
- ra->start = offset;
+ ra->start = index;
ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
@@ -478,7 +537,7 @@ readit:
* the resulted next readahead window into the current one.
* Take care of maximum IO pages as above.
*/
- if (offset == ra->start && ra->size == ra->async_size) {
+ if (index == ra->start && ra->size == ra->async_size) {
add_pages = get_next_ra_size(ra, max_pages);
if (ra->size + add_pages <= max_pages) {
ra->async_size = add_pages;
@@ -489,7 +548,7 @@ readit:
}
}
- return ra_submit(ra, mapping, filp);
+ ra_submit(ra, mapping, filp);
}
/**
@@ -497,9 +556,8 @@ readit:
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- * pagecache pages
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
*
* page_cache_sync_readahead() should be called when a cache miss happened:
* it will submit the read. The readahead logic may decide to piggyback more
@@ -508,7 +566,7 @@ readit:
*/
void page_cache_sync_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
- pgoff_t offset, unsigned long req_size)
+ pgoff_t index, unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
/* be dumb */
if (filp && (filp->f_mode & FMODE_RANDOM)) {
- force_page_cache_readahead(mapping, filp, offset, req_size);
+ force_page_cache_readahead(mapping, filp, index, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, false, offset, req_size);
+ ondemand_readahead(mapping, ra, filp, false, index, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
@@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @filp: passed on to ->readpage() and ->readpages()
- * @page: the page at @offset which has the PG_readahead flag set
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- * pagecache pages
+ * @page: The page at @index which triggered the readahead call.
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
*
* page_cache_async_readahead() should be called when a page is used which
- * has the PG_readahead flag; this is a marker to suggest that the application
+ * is marked as PageReadahead; this is a marker to suggest that the application
* has used up enough of the readahead window that we should start pulling in
* more pages.
*/
void
page_cache_async_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
- struct page *page, pgoff_t offset,
- unsigned long req_size)
+ struct page *page, pgoff_t index,
+ unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping,
return;
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+ ondemand_readahead(mapping, ra, filp, true, index, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index f79a206b271a..5fe2dedce1fc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,7 @@
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
- * mm->mmap_sem
+ * mm->mmap_lock
* page->flags PG_locked (lock_page) * (see huegtlbfs below)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
@@ -177,7 +177,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* to do any locking for the common case of already having
* an anon_vma.
*
- * This must be called with the mmap_sem held for reading.
+ * This must be called with the mmap_lock held for reading.
*/
int __anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -1114,6 +1114,11 @@ void do_page_add_anon_rmap(struct page *page,
bool compound = flags & RMAP_COMPOUND;
bool first;
+ if (unlikely(PageKsm(page)))
+ lock_page_memcg(page);
+ else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
if (compound) {
atomic_t *mapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1133,13 +1138,14 @@ void do_page_add_anon_rmap(struct page *page,
* disabled.
*/
if (compound)
- __inc_node_page_state(page, NR_ANON_THPS);
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
- if (unlikely(PageKsm(page)))
- return;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (unlikely(PageKsm(page))) {
+ unlock_page_memcg(page);
+ return;
+ }
/* address might be in next vma when migration races vma_adjust */
if (first)
@@ -1174,14 +1180,14 @@ void page_add_new_anon_rmap(struct page *page,
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
- __inc_node_page_state(page, NR_ANON_THPS);
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0);
}
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1230,13 +1236,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
int i, nr = 1;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
/* hugetlb pages are always mapped with pmds */
atomic_dec(compound_mapcount_ptr(page));
- goto out;
+ return;
}
/* page still mapped by someone else? */
@@ -1246,14 +1251,14 @@ static void page_remove_file_rmap(struct page *page, bool compound)
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- goto out;
+ return;
if (PageSwapBacked(page))
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
else
__dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ return;
}
/*
@@ -1265,8 +1270,6 @@ static void page_remove_file_rmap(struct page *page, bool compound)
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
-out:
- unlock_page_memcg(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1283,7 +1286,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_node_page_state(page, NR_ANON_THPS);
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/*
@@ -1310,7 +1313,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
clear_page_mlock(page);
if (nr)
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}
/**
@@ -1322,22 +1325,28 @@ static void page_remove_anon_compound_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page, bool compound)
{
- if (!PageAnon(page))
- return page_remove_file_rmap(page, compound);
+ lock_page_memcg(page);
- if (compound)
- return page_remove_anon_compound_rmap(page);
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page, compound);
+ goto out;
+ }
+
+ if (compound) {
+ page_remove_anon_compound_rmap(page);
+ goto out;
+ }
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
- return;
+ goto out;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __dec_node_page_state(page, NR_ANON_MAPPED);
+ __dec_lruvec_page_state(page, NR_ANON_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1354,6 +1363,8 @@ void page_remove_rmap(struct page *page, bool compound)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
+out:
+ unlock_page_memcg(page);
}
/*
@@ -1433,7 +1444,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (!PageTransCompound(page)) {
/*
* Holding pte lock, we do *not* need
- * mmap_sem here
+ * mmap_lock here
*/
mlock_vma_page(page);
}
@@ -1806,7 +1817,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
- * are holding mmap_sem. Users without mmap_sem are required to
+ * are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
anon_vma = page_anon_vma(page);
@@ -1826,7 +1837,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
*
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
@@ -1878,7 +1889,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
diff --git a/mm/shmem.c b/mm/shmem.c
index d722eb830317..a0dbe62f8042 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -82,7 +82,6 @@ static struct vfsmount *shm_mnt;
#include <linux/uuid.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include "internal.h"
@@ -605,11 +604,13 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
- pgoff_t index, void *expected, gfp_t gfp)
+ pgoff_t index, void *expected, gfp_t gfp,
+ struct mm_struct *charge_mm)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
unsigned long i = 0;
unsigned long nr = compound_nr(page);
+ int error;
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -621,6 +622,18 @@ static int shmem_add_to_page_cache(struct page *page,
page->mapping = mapping;
page->index = index;
+ if (!PageSwapCache(page)) {
+ error = mem_cgroup_charge(page, charge_mm, gfp);
+ if (error) {
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
+ goto error;
+ }
+ }
+ cgroup_throttle_swaprate(page, gfp);
+
do {
void *entry;
xas_lock_irq(&xas);
@@ -641,19 +654,22 @@ next:
__inc_node_page_state(page, NR_SHMEM_THPS);
}
mapping->nrpages += nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
+ __mod_lruvec_page_state(page, NR_SHMEM, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
- page->mapping = NULL;
- page_ref_sub(page, nr);
- return xas_error(&xas);
+ error = xas_error(&xas);
+ goto error;
}
return 0;
+error:
+ page->mapping = NULL;
+ page_ref_sub(page, nr);
+ return error;
}
/*
@@ -670,8 +686,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
error = shmem_replace_entry(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
- __dec_node_page_state(page, NR_FILE_PAGES);
- __dec_node_page_state(page, NR_SHMEM);
+ __dec_lruvec_page_state(page, NR_FILE_PAGES);
+ __dec_lruvec_page_state(page, NR_SHMEM);
xa_unlock_irq(&mapping->i_pages);
put_page(page);
BUG_ON(error);
@@ -952,7 +968,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
VM_BUG_ON_PAGE(PageWriteback(page), page);
if (shmem_punch_compound(page, start, end))
truncate_inode_page(mapping, page);
- else {
+ else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
/* Wipe the page and don't get stuck */
clear_highpage(page);
flush_dcache_page(page);
@@ -1578,8 +1594,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
xa_lock_irq(&swap_mapping->i_pages);
error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
if (!error) {
- __inc_node_page_state(newpage, NR_FILE_PAGES);
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
+ mem_cgroup_migrate(oldpage, newpage);
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -1591,8 +1608,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_migrate(oldpage, newpage);
- lru_cache_add_anon(newpage);
+ lru_cache_add(newpage);
*pagep = newpage;
}
@@ -1619,7 +1635,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
- struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
int error;
@@ -1664,31 +1679,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
goto failed;
}
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- false);
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap), gfp);
- /*
- * We already confirmed swap under page lock, and make
- * no memory allocation here, so usually no possibility
- * of error; but free_swap_and_cache() only trylocks a
- * page, so it is just possible that the entry has been
- * truncated or holepunched since swap was confirmed.
- * shmem_undo_range() will have done some of the
- * unaccounting, now delete_from_swap_cache() will do
- * the rest.
- */
- if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- delete_from_swap_cache(page);
- }
- }
+ error = shmem_add_to_page_cache(page, mapping, index,
+ swp_to_radix_entry(swap), gfp,
+ charge_mm);
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true, false);
-
spin_lock_irq(&info->lock);
info->swapped--;
shmem_recalc_inode(inode);
@@ -1734,7 +1730,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
- struct mem_cgroup *memcg;
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
@@ -1859,25 +1854,12 @@ alloc_nohuge:
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- PageTransHuge(page));
- if (error) {
- if (PageTransHuge(page)) {
- count_vm_event(THP_FILE_FALLBACK);
- count_vm_event(THP_FILE_FALLBACK_CHARGE);
- }
- goto unacct;
- }
error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL, gfp & GFP_RECLAIM_MASK);
- if (error) {
- mem_cgroup_cancel_charge(page, memcg,
- PageTransHuge(page));
+ NULL, gfp & GFP_RECLAIM_MASK,
+ charge_mm);
+ if (error)
goto unacct;
- }
- mem_cgroup_commit_charge(page, memcg, false,
- PageTransHuge(page));
- lru_cache_add_anon(page);
+ lru_cache_add(page);
spin_lock_irq(&info->lock);
info->alloced += compound_nr(page);
@@ -2179,7 +2161,11 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
- spin_lock_irq(&info->lock);
+ /*
+ * What serializes the accesses to info->flags?
+ * ipc_lock_object() when called from shmctl_do_lock(),
+ * no serialization needed when called from shm_destroy().
+ */
if (lock && !(info->flags & VM_LOCKED)) {
if (!user_shm_lock(inode->i_size, user))
goto out_nomem;
@@ -2194,7 +2180,6 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
retval = 0;
out_nomem:
- spin_unlock_irq(&info->lock);
return retval;
}
@@ -2311,7 +2296,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- struct mem_cgroup *memcg;
spinlock_t *ptl;
void *page_kaddr;
struct page *page;
@@ -2335,7 +2319,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
PAGE_SIZE);
kunmap_atomic(page_kaddr);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
*pagep = page;
shmem_inode_unacct_blocks(inode, 1);
@@ -2361,16 +2345,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (unlikely(offset >= max_off))
goto out_release;
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
- if (ret)
- goto out_release;
-
ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK);
+ gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
- goto out_release_uncharge;
-
- mem_cgroup_commit_charge(page, memcg, false, false);
+ goto out_release;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
if (dst_vma->vm_flags & VM_WRITE)
@@ -2391,19 +2369,19 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
ret = -EFAULT;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
+ goto out_release_unlock;
ret = -EEXIST;
if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
+ goto out_release_unlock;
- lru_cache_add_anon(page);
+ lru_cache_add(page);
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
info->alloced++;
inode->i_blocks += BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
inc_mm_counter(dst_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -2416,12 +2394,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
ret = 0;
out:
return ret;
-out_release_uncharge_unlock:
+out_release_unlock:
pte_unmap_unlock(dst_pte, ptl);
ClearPageDirty(page);
delete_from_page_cache(page);
-out_release_uncharge:
- mem_cgroup_cancel_charge(page, memcg, false);
out_release:
unlock_page(page);
put_page(page);
@@ -4160,7 +4136,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
loff_t size = vma->vm_end - vma->vm_start;
/*
- * Cloning a new file under mmap_sem leads to a lock ordering conflict
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
* between XFS directory reading and selinux: since this file is only
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
diff --git a/mm/slab.c b/mm/slab.c
index a89633603b2d..9350062ffc1a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3106,7 +3106,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(flags);
void *obj = NULL;
struct page *page;
int nid;
@@ -3124,7 +3124,7 @@ retry:
* Look through allowed nodes for objects available
* from existing per node queues.
*/
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
nid = zone_to_nid(zone);
if (cpuset_zone_allowed(zone, flags) &&
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23c7500eea7d..9e72ba224175 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1303,7 +1303,8 @@ void __init create_kmalloc_caches(slab_flags_t flags)
kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
kmalloc_info[i].name[KMALLOC_DMA],
kmalloc_info[i].size,
- SLAB_CACHE_DMA | flags, 0, 0);
+ SLAB_CACHE_DMA | flags, 0,
+ kmalloc_info[i].size);
}
}
#endif
diff --git a/mm/slob.c b/mm/slob.c
index fa53e9f73893..ac2aecfbc7a8 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -524,6 +524,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
{
return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
}
+EXPORT_SYMBOL(__kmalloc_track_caller);
#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
@@ -531,6 +532,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
{
return __do_kmalloc_node(size, gfp, node, caller);
}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif
void kfree(const void *block)
diff --git a/mm/slub.c b/mm/slub.c
index 332d4b459a90..b8f798b50d44 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -551,15 +551,32 @@ static void print_section(char *level, char *text, u8 *addr,
metadata_access_disable();
}
+/*
+ * See comment in calculate_sizes().
+ */
+static inline bool freeptr_outside_object(struct kmem_cache *s)
+{
+ return s->offset >= s->inuse;
+}
+
+/*
+ * Return offset of the end of info block which is inuse + free pointer if
+ * not overlapping with object.
+ */
+static inline unsigned int get_info_end(struct kmem_cache *s)
+{
+ if (freeptr_outside_object(s))
+ return s->inuse + sizeof(void *);
+ else
+ return s->inuse;
+}
+
static struct track *get_track(struct kmem_cache *s, void *object,
enum track_item alloc)
{
struct track *p;
- if (s->offset)
- p = object + s->offset + sizeof(void *);
- else
- p = object + s->inuse;
+ p = object + get_info_end(s);
return p + alloc;
}
@@ -662,6 +679,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
va_end(args);
}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void *freelist, void *nextfree)
+{
+ if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+ !check_valid_pointer(s, page, nextfree)) {
+ object_err(s, page, freelist, "Freechain corrupt");
+ freelist = NULL;
+ slab_fix(s, "Isolate corrupted freechain");
+ return true;
+ }
+
+ return false;
+}
+
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
@@ -686,10 +717,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
- if (s->offset)
- off = s->offset + sizeof(void *);
- else
- off = s->inuse;
+ off = get_info_end(s);
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
@@ -782,7 +810,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
* object address
* Bytes of the object to be managed.
* If the freepointer may overlay the object then the free
- * pointer is the first word of the object.
+ * pointer is at the middle of the object.
*
* Poisoning uses 0x6b (POISON_FREE) and the last byte is
* 0xa5 (POISON_END)
@@ -816,11 +844,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
- unsigned long off = s->inuse; /* The end of info */
-
- if (s->offset)
- /* Freepointer is placed after the object. */
- off += sizeof(void *);
+ unsigned long off = get_info_end(s); /* The end of info */
if (s->flags & SLAB_STORE_USER)
/* We also have user information there */
@@ -907,7 +931,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
check_pad_bytes(s, page, p);
}
- if (!s->offset && val == SLUB_RED_ACTIVE)
+ if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
/*
* Object and freepointer overlap. Cannot check
* freepointer while object is allocated.
@@ -1400,6 +1424,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void *freelist, void *nextfree)
+{
+ return false;
+}
#endif /* CONFIG_SLUB_DEBUG */
/*
@@ -1909,7 +1938,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(flags);
void *object;
unsigned int cpuset_mems_cookie;
@@ -1938,7 +1967,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(mempolicy_slab_node(), flags);
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
struct kmem_cache_node *n;
n = get_node(s, zone_to_nid(zone));
@@ -1984,7 +2013,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
#ifdef CONFIG_PREEMPTION
/*
- * Calculate the next globally unique transaction for disambiguiation
+ * Calculate the next globally unique transaction for disambiguation
* during cmpxchg. The transactions start with the cpu number and are then
* incremented by CONFIG_NR_CPUS.
*/
@@ -2083,6 +2112,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
void *prior;
unsigned long counters;
+ /*
+ * If 'nextfree' is invalid, it is possible that the object at
+ * 'freelist' is already corrupted. So isolate all objects
+ * starting at 'freelist'.
+ */
+ if (freelist_corrupted(s, page, freelist, nextfree))
+ break;
+
do {
prior = page->freelist;
counters = page->counters;
@@ -3533,6 +3570,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
{
slab_flags_t flags = s->flags;
unsigned int size = s->object_size;
+ unsigned int freepointer_area;
unsigned int order;
/*
@@ -3541,6 +3579,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
* the possible location of the free pointer.
*/
size = ALIGN(size, sizeof(void *));
+ /*
+ * This is the area of the object where a freepointer can be
+ * safely written. If redzoning adds more to the inuse size, we
+ * can't use that portion for writing the freepointer, so
+ * s->offset must be limited within this for the general case.
+ */
+ freepointer_area = size;
#ifdef CONFIG_SLUB_DEBUG
/*
@@ -3579,16 +3624,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*
* This is the case if we do RCU, have a constructor or
* destructor or are poisoning the objects.
+ *
+ * The assumption that s->offset >= s->inuse means free
+ * pointer is outside of the object is used in the
+ * freeptr_outside_object() function. If that is no
+ * longer true, the function needs to be modified.
*/
s->offset = size;
size += sizeof(void *);
- } else if (size > sizeof(void *)) {
+ } else if (freepointer_area > sizeof(void *)) {
/*
* Store freelist pointer near middle of object to keep
* it away from the edges of the object to avoid small
* sized over/underflows from neighboring allocations.
*/
- s->offset = ALIGN(size / 2, sizeof(void *));
+ s->offset = ALIGN(freepointer_area / 2, sizeof(void *));
}
#ifdef CONFIG_SLUB_DEBUG
@@ -3716,12 +3766,14 @@ error:
}
static void list_slab_objects(struct kmem_cache *s, struct page *page,
- const char *text)
+ const char *text, unsigned long *map)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map;
+
+ if (!map)
+ return;
slab_err(s, page, text, s->name);
slab_lock(page);
@@ -3734,8 +3786,6 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
print_tracking(s, p);
}
}
- put_map(map);
-
slab_unlock(page);
#endif
}
@@ -3749,6 +3799,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
LIST_HEAD(discard);
struct page *page, *h;
+ unsigned long *map = NULL;
+
+#ifdef CONFIG_SLUB_DEBUG
+ map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
+#endif
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
@@ -3758,11 +3813,16 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ "Objects remaining in %s on __kmem_cache_shutdown()",
+ map);
}
}
spin_unlock_irq(&n->list_lock);
+#ifdef CONFIG_SLUB_DEBUG
+ bitmap_free(map);
+#endif
+
list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
}
@@ -4385,6 +4445,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
return ret;
}
+EXPORT_SYMBOL(__kmalloc_track_caller);
#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
@@ -4415,6 +4476,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return ret;
}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif
#ifdef CONFIG_SYSFS
@@ -5631,7 +5693,8 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
*/
if (buffer)
buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
+ else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
+ !IS_ENABLED(CONFIG_SLUB_STATS))
buf = mbuf;
else {
buffer = (char *) get_zeroed_page(GFP_KERNEL);
@@ -5665,19 +5728,6 @@ static struct kobj_type slab_ktype = {
.release = kmem_cache_release,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
-{
- struct kobj_type *ktype = get_ktype(kobj);
-
- if (ktype == &slab_ktype)
- return 1;
- return 0;
-}
-
-static const struct kset_uevent_ops slab_uevent_ops = {
- .filter = uevent_filter,
-};
-
static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
@@ -5745,7 +5795,6 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
#ifdef CONFIG_MEMCG
kset_unregister(s->memcg_kset);
#endif
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
out:
kobject_put(&s->kobj);
}
@@ -5786,8 +5835,10 @@ static int sysfs_slab_add(struct kmem_cache *s)
s->kobj.kset = kset;
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- if (err)
+ if (err) {
+ kobject_put(&s->kobj);
goto out;
+ }
err = sysfs_create_group(&s->kobj, &slab_attr_group);
if (err)
@@ -5803,7 +5854,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
}
#endif
- kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
@@ -5884,7 +5934,7 @@ static int __init slab_sysfs_init(void)
mutex_lock(&slab_mutex);
- slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
+ slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
if (!slab_kset) {
mutex_unlock(&slab_mutex);
pr_err("Cannot register slab subsystem.\n");
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 200aef686722..0db7738d76e9 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,7 +29,6 @@
#include <linux/sched.h>
#include <asm/dma.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
/*
* Allocate a block of memory to be used to back the virtual memory map
diff --git a/mm/sparse.c b/mm/sparse.c
index 1aee5a481571..b2b9a3e34696 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -17,7 +17,6 @@
#include "internal.h"
#include <asm/dma.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
/*
* Permanent SPARSEMEM data:
@@ -288,7 +287,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
/*
* Mark all memblocks as present using memory_present(). This is a
- * convienence function that is useful for a number of arches
+ * convenience function that is useful for a number of arches
* to mark all of the systems memory as present during initialization.
*/
void __init memblocks_present(void)
diff --git a/mm/swap.c b/mm/swap.c
index bf9a79fed62d..dbcab84c6fce 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -35,6 +35,7 @@
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
+#include <linux/local_lock.h>
#include "internal.h"
@@ -44,14 +45,32 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
-static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
+/* Protecting only lru_rotate.pvec which requires disabling interrupts */
+struct lru_rotate {
+ local_lock_t lock;
+ struct pagevec pvec;
+};
+static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
+
+/*
+ * The following struct pagevec are grouped together because they are protected
+ * by disabling preemption (and interrupts remain enabled).
+ */
+struct lru_pvecs {
+ local_lock_t lock;
+ struct pagevec lru_add;
+ struct pagevec lru_deactivate_file;
+ struct pagevec lru_deactivate;
+ struct pagevec lru_lazyfree;
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+ struct pagevec activate_page;
#endif
+};
+static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
/*
* This path almost never happens for VM activity - pages are normally
@@ -83,8 +102,6 @@ static void __put_single_page(struct page *page)
static void __put_compound_page(struct page *page)
{
- compound_page_dtor *dtor;
-
/*
* __page_cache_release() is supposed to be called for thp, not for
* hugetlb. This is because hugetlb page does never have PageLRU set
@@ -93,8 +110,7 @@ static void __put_compound_page(struct page *page)
*/
if (!PageHuge(page))
__page_cache_release(page);
- dtor = get_compound_page_dtor(page);
- (*dtor)(page);
+ destroy_compound_page(page);
}
void __put_page(struct page *page)
@@ -225,7 +241,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
del_page_from_lru_list(page, lruvec, page_lru(page));
ClearPageActive(page);
add_page_to_lru_list_tail(page, lruvec, page_lru(page));
- (*pgmoved)++;
+ (*pgmoved) += hpage_nr_pages(page);
}
}
@@ -254,30 +270,57 @@ void rotate_reclaimable_page(struct page *page)
unsigned long flags;
get_page(page);
- local_irq_save(flags);
- pvec = this_cpu_ptr(&lru_rotate_pvecs);
+ local_lock_irqsave(&lru_rotate.lock, flags);
+ pvec = this_cpu_ptr(&lru_rotate.pvec);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&lru_rotate.lock, flags);
}
}
-static void update_page_reclaim_stat(struct lruvec *lruvec,
- int file, int rotated)
+void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ do {
+ unsigned long lrusize;
+
+ /* Record cost event */
+ if (file)
+ lruvec->file_cost += nr_pages;
+ else
+ lruvec->anon_cost += nr_pages;
+
+ /*
+ * Decay previous events
+ *
+ * Because workloads change over time (and to avoid
+ * overflow) we keep these statistics as a floating
+ * average, which ends up weighing recent refaults
+ * more than old ones.
+ */
+ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+
+ if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
+ lruvec->file_cost /= 2;
+ lruvec->anon_cost /= 2;
+ }
+ } while ((lruvec = parent_lruvec(lruvec)));
+}
- reclaim_stat->recent_scanned[file]++;
- if (rotated)
- reclaim_stat->recent_rotated[file]++;
+void lru_note_cost_page(struct page *page)
+{
+ lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+ page_is_file_lru(page), hpage_nr_pages(page));
}
static void __activate_page(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
+ int nr_pages = hpage_nr_pages(page);
del_page_from_lru_list(page, lruvec, lru);
SetPageActive(page);
@@ -285,15 +328,16 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
add_page_to_lru_list(page, lruvec, lru);
trace_mm_lru_activate(page);
- __count_vm_event(PGACTIVATE);
- update_page_reclaim_stat(lruvec, file, 1);
+ __count_vm_events(PGACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
+ nr_pages);
}
}
#ifdef CONFIG_SMP
static void activate_page_drain(int cpu)
{
- struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
+ struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
@@ -301,19 +345,21 @@ static void activate_page_drain(int cpu)
static bool need_activate_page_drain(int cpu)
{
- return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+ return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+ struct pagevec *pvec;
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.activate_page);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
- put_cpu_var(activate_page_pvecs);
+ local_unlock(&lru_pvecs.lock);
}
}
@@ -335,9 +381,12 @@ void activate_page(struct page *page)
static void __lru_cache_activate_page(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec;
int i;
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+
/*
* Search backwards on the optimistic assumption that the page being
* activated has just been added to this pagevec. Note that only
@@ -357,7 +406,7 @@ static void __lru_cache_activate_page(struct page *page)
}
}
- put_cpu_var(lru_add_pvec);
+ local_unlock(&lru_pvecs.lock);
}
/*
@@ -385,7 +434,7 @@ void mark_page_accessed(struct page *page)
} else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
- * activate_page_pvecs. Otherwise, assume the page is on a
+ * lru_pvecs.activate_page. Otherwise, assume the page is on a
* pagevec, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
@@ -402,35 +451,6 @@ void mark_page_accessed(struct page *page)
}
EXPORT_SYMBOL(mark_page_accessed);
-static void __lru_cache_add(struct page *page)
-{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
-
- get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
- __pagevec_lru_add(pvec);
- put_cpu_var(lru_add_pvec);
-}
-
-/**
- * lru_cache_add_anon - add a page to the page lists
- * @page: the page to add
- */
-void lru_cache_add_anon(struct page *page)
-{
- if (PageActive(page))
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-
-void lru_cache_add_file(struct page *page)
-{
- if (PageActive(page))
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-EXPORT_SYMBOL(lru_cache_add_file);
-
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
@@ -442,10 +462,19 @@ EXPORT_SYMBOL(lru_cache_add_file);
*/
void lru_cache_add(struct page *page)
{
+ struct pagevec *pvec;
+
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
- __lru_cache_add(page);
+
+ get_page(page);
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ __pagevec_lru_add(pvec);
+ local_unlock(&lru_pvecs.lock);
}
+EXPORT_SYMBOL(lru_cache_add);
/**
* lru_cache_add_active_or_unevictable
@@ -501,8 +530,9 @@ void lru_cache_add_active_or_unevictable(struct page *page,
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
- int lru, file;
+ int lru;
bool active;
+ int nr_pages = hpage_nr_pages(page);
if (!PageLRU(page))
return;
@@ -515,7 +545,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
return;
active = PageActive(page);
- file = page_is_file_lru(page);
lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru + active);
@@ -536,28 +565,31 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
* We moves tha page into tail of inactive.
*/
add_page_to_lru_list_tail(page, lruvec, lru);
- __count_vm_event(PGROTATED);
+ __count_vm_events(PGROTATED, nr_pages);
}
- if (active)
- __count_vm_event(PGDEACTIVATE);
- update_page_reclaim_stat(lruvec, file, 0);
+ if (active) {
+ __count_vm_events(PGDEACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ nr_pages);
+ }
}
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
+ int nr_pages = hpage_nr_pages(page);
del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
ClearPageActive(page);
ClearPageReferenced(page);
add_page_to_lru_list(page, lruvec, lru);
- __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
- update_page_reclaim_stat(lruvec, file, 0);
+ __count_vm_events(PGDEACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ nr_pages);
}
}
@@ -567,6 +599,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
bool active = PageActive(page);
+ int nr_pages = hpage_nr_pages(page);
del_page_from_lru_list(page, lruvec,
LRU_INACTIVE_ANON + active);
@@ -580,9 +613,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
ClearPageSwapBacked(page);
add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
- __count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
- count_memcg_page_event(page, PGLAZYFREE);
- update_page_reclaim_stat(lruvec, 1, 0);
+ __count_vm_events(PGLAZYFREE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
+ nr_pages);
}
}
@@ -593,30 +626,30 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
*/
void lru_add_drain_cpu(int cpu)
{
- struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
+ struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &per_cpu(lru_rotate_pvecs, cpu);
+ pvec = &per_cpu(lru_rotate.pvec, cpu);
if (pagevec_count(pvec)) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
- local_irq_save(flags);
+ local_lock_irqsave(&lru_rotate.lock, flags);
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&lru_rotate.lock, flags);
}
- pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
+ pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
+ pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -641,11 +674,14 @@ void deactivate_file_page(struct page *page)
return;
if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
+ struct pagevec *pvec;
+
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- put_cpu_var(lru_deactivate_file_pvecs);
+ local_unlock(&lru_pvecs.lock);
}
}
@@ -660,12 +696,14 @@ void deactivate_file_page(struct page *page)
void deactivate_page(struct page *page)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+ struct pagevec *pvec;
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- put_cpu_var(lru_deactivate_pvecs);
+ local_unlock(&lru_pvecs.lock);
}
}
@@ -680,19 +718,30 @@ void mark_page_lazyfree(struct page *page)
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
+ struct pagevec *pvec;
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
- put_cpu_var(lru_lazyfree_pvecs);
+ local_unlock(&lru_pvecs.lock);
}
}
void lru_add_drain(void)
{
- lru_add_drain_cpu(get_cpu());
- put_cpu();
+ local_lock(&lru_pvecs.lock);
+ lru_add_drain_cpu(smp_processor_id());
+ local_unlock(&lru_pvecs.lock);
+}
+
+void lru_add_drain_cpu_zone(struct zone *zone)
+{
+ local_lock(&lru_pvecs.lock);
+ lru_add_drain_cpu(smp_processor_id());
+ drain_local_pages(zone);
+ local_unlock(&lru_pvecs.lock);
}
#ifdef CONFIG_SMP
@@ -743,11 +792,11 @@ void lru_add_drain_all(void)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
- if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
+ if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+ pagevec_count(&per_cpu(lru_rotate.pvec, cpu)) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
@@ -890,8 +939,6 @@ EXPORT_SYMBOL(__pagevec_release);
void lru_add_page_tail(struct page *page, struct page *page_tail,
struct lruvec *lruvec, struct list_head *list)
{
- const int file = 0;
-
VM_BUG_ON_PAGE(!PageHead(page), page);
VM_BUG_ON_PAGE(PageCompound(page_tail), page);
VM_BUG_ON_PAGE(PageLRU(page_tail), page);
@@ -917,9 +964,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
add_page_to_lru_list_tail(page_tail, lruvec,
page_lru(page_tail));
}
-
- if (!PageUnevictable(page))
- update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -928,6 +972,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
{
enum lru_list lru;
int was_unevictable = TestClearPageUnevictable(page);
+ int nr_pages = hpage_nr_pages(page);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -962,16 +1007,14 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
if (page_evictable(page)) {
lru = page_lru(page);
- update_page_reclaim_stat(lruvec, page_is_file_lru(page),
- PageActive(page));
if (was_unevictable)
- count_vm_event(UNEVICTABLE_PGRESCUED);
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
lru = LRU_UNEVICTABLE;
ClearPageActive(page);
SetPageUnevictable(page);
if (!was_unevictable)
- count_vm_event(UNEVICTABLE_PGCULLED);
+ __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 45affaef3bc6..7f34343c075a 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -171,9 +171,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
unsigned long length;
struct swap_cgroup_ctrl *ctrl;
- if (!do_swap_account)
- return 0;
-
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array_size = length * sizeof(void *);
@@ -209,9 +206,6 @@ void swap_cgroup_swapoff(int type)
unsigned long i, length;
struct swap_cgroup_ctrl *ctrl;
- if (!do_swap_account)
- return;
-
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ebed37bbf7a3..e98ff460e9e9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -22,7 +22,6 @@
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
-#include <asm/pgtable.h>
/*
* swapper_space is a fiction, retained to simplify the path through
@@ -360,12 +359,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page = NULL, *new_page = NULL;
struct swap_info_struct *si;
- int err;
+ struct page *page;
+
*new_page_allocated = false;
- do {
+ for (;;) {
+ int err;
/*
* First check the swap cache. Since this is normally
* called after lookup_swap_cache() failed, re-calling
@@ -373,12 +373,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
si = get_swap_device(entry);
if (!si)
- break;
- found_page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
+ return NULL;
+ page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
put_swap_device(si);
- if (found_page)
- break;
+ if (page)
+ return page;
/*
* Just skip read ahead for unused swap slot.
@@ -389,54 +389,71 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* else swap_off will be aborted if we return NULL.
*/
if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
- break;
+ return NULL;
/*
- * Get a new page to read into from swap.
+ * Get a new page to read into from swap. Allocate it now,
+ * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
+ * cause any racers to loop around until we add it to cache.
*/
- if (!new_page) {
- new_page = alloc_page_vma(gfp_mask, vma, addr);
- if (!new_page)
- break; /* Out of memory */
- }
+ page = alloc_page_vma(gfp_mask, vma, addr);
+ if (!page)
+ return NULL;
/*
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
- if (err == -EEXIST) {
- /*
- * We might race against get_swap_page() and stumble
- * across a SWAP_HAS_CACHE swap_map entry whose page
- * has not been brought into the swapcache yet.
- */
- cond_resched();
- continue;
- } else if (err) /* swp entry is obsolete ? */
+ if (!err)
break;
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- __SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
- err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
- if (likely(!err)) {
- /* Initiate read into locked page */
- SetPageWorkingset(new_page);
- lru_cache_add_anon(new_page);
- *new_page_allocated = true;
- return new_page;
- }
- __ClearPageLocked(new_page);
+ put_page(page);
+ if (err != -EEXIST)
+ return NULL;
+
/*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
+ * We might race against __delete_from_swap_cache(), and
+ * stumble across a swap_map entry whose SWAP_HAS_CACHE
+ * has not yet been cleared. Or race against another
+ * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
+ * in swap_map, but not yet added its page to swap cache.
*/
- put_swap_page(new_page, entry);
- } while (err != -ENOMEM);
+ cond_resched();
+ }
+
+ /*
+ * The swap entry is ours to swap in. Prepare the new page.
+ */
+
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_KERNEL)) {
+ put_swap_page(page, entry);
+ goto fail_unlock;
+ }
+
+ if (mem_cgroup_charge(page, NULL, gfp_mask)) {
+ delete_from_swap_cache(page);
+ goto fail_unlock;
+ }
+
+ /* XXX: Move to lru_cache_add() when it supports new vs putback */
+ spin_lock_irq(&page_pgdat(page)->lru_lock);
+ lru_note_cost_page(page);
+ spin_unlock_irq(&page_pgdat(page)->lru_lock);
+
+ /* Caller will initiate read into locked page */
+ SetPageWorkingset(page);
+ lru_cache_add(page);
+ *new_page_allocated = true;
+ return page;
- if (new_page)
- put_page(new_page);
- return found_page;
+fail_unlock:
+ unlock_page(page);
+ put_page(page);
+ return NULL;
}
/*
@@ -509,10 +526,11 @@ static unsigned long swapin_nr_pages(unsigned long offset)
return 1;
hits = atomic_xchg(&swapin_readahead_hits, 0);
- pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+ pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
+ max_pages,
atomic_read(&last_readahead_pages));
if (!hits)
- prev_offset = offset;
+ WRITE_ONCE(prev_offset, offset);
atomic_set(&last_readahead_pages, pages);
return pages;
@@ -534,7 +552,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
*
- * Caller must hold read mmap_sem if vmf->vma is not NULL.
+ * Caller must hold read mmap_lock if vmf->vma is not NULL.
*/
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
@@ -716,7 +734,7 @@ static void swap_ra_info(struct vm_fault *vmf,
* Primitive swap readahead code. We simply read in a few pages whoes
* virtual addresses are around the fault address in the same vma.
*
- * Caller must hold read mmap_sem if vmf->vma is not NULL.
+ * Caller must hold read mmap_lock if vmf->vma is not NULL.
*
*/
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5871a2aa86a5..987276c557d1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -40,7 +40,6 @@
#include <linux/swap_slots.h>
#include <linux/sort.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
@@ -601,7 +600,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
{
struct percpu_cluster *cluster;
struct swap_cluster_info *ci;
- bool found_free;
unsigned long tmp, max;
new_cluster:
@@ -614,17 +612,17 @@ new_cluster:
} else if (!cluster_list_empty(&si->discard_clusters)) {
/*
* we don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them
+ * discarding, do discard now and reclaim them, then
+ * reread cluster_next_cpu since we dropped si->lock
*/
swap_do_scheduled_discard(si);
- *scan_base = *offset = si->cluster_next;
+ *scan_base = this_cpu_read(*si->cluster_next_cpu);
+ *offset = *scan_base;
goto new_cluster;
} else
return false;
}
- found_free = false;
-
/*
* Other CPUs can use our cluster if they can't find a free cluster,
* check if there is still free entry in the cluster
@@ -632,27 +630,23 @@ new_cluster:
tmp = cluster->next;
max = min_t(unsigned long, si->max,
(cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
- if (tmp >= max) {
- cluster_set_null(&cluster->index);
- goto new_cluster;
- }
- ci = lock_cluster(si, tmp);
- while (tmp < max) {
- if (!si->swap_map[tmp]) {
- found_free = true;
- break;
+ if (tmp < max) {
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
+ if (!si->swap_map[tmp])
+ break;
+ tmp++;
}
- tmp++;
+ unlock_cluster(ci);
}
- unlock_cluster(ci);
- if (!found_free) {
+ if (tmp >= max) {
cluster_set_null(&cluster->index);
goto new_cluster;
}
cluster->next = tmp + 1;
*offset = tmp;
*scan_base = tmp;
- return found_free;
+ return true;
}
static void __del_from_avail_list(struct swap_info_struct *p)
@@ -729,6 +723,34 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
}
}
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+ unsigned long prev;
+
+ if (!(si->flags & SWP_SOLIDSTATE)) {
+ si->cluster_next = next;
+ return;
+ }
+
+ prev = this_cpu_read(*si->cluster_next_cpu);
+ /*
+ * Cross the swap address space size aligned trunk, choose
+ * another trunk randomly to avoid lock contention on swap
+ * address space if possible.
+ */
+ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+ (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+ /* No free swap slots available */
+ if (si->highest_bit <= si->lowest_bit)
+ return;
+ next = si->lowest_bit +
+ prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+ next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+ next = max_t(unsigned int, next, si->lowest_bit);
+ }
+ this_cpu_write(*si->cluster_next_cpu, next);
+}
+
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[])
@@ -739,9 +761,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
int n_ret = 0;
-
- if (nr > SWAP_BATCH)
- nr = SWAP_BATCH;
+ bool scanned_many = false;
/*
* We try to cluster swap pages by allocating them sequentially
@@ -755,17 +775,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
*/
si->flags += SWP_SCANNING;
- scan_base = offset = si->cluster_next;
+ /*
+ * Use percpu scan base for SSD to reduce lock contention on
+ * cluster and swap cache. For HDD, sequential access is more
+ * important.
+ */
+ if (si->flags & SWP_SOLIDSTATE)
+ scan_base = this_cpu_read(*si->cluster_next_cpu);
+ else
+ scan_base = si->cluster_next;
+ offset = scan_base;
/* SSD algorithm */
if (si->cluster_info) {
- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
- goto checks;
- else
+ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
goto scan;
- }
-
- if (unlikely(!si->cluster_nr--)) {
+ } else if (unlikely(!si->cluster_nr--)) {
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
si->cluster_nr = SWAPFILE_CLUSTER - 1;
goto checks;
@@ -848,7 +873,6 @@ checks:
unlock_cluster(ci);
swap_range_alloc(si, offset, 1);
- si->cluster_next = offset + 1;
slots[n_ret++] = swp_entry(si->type, offset);
/* got enough slots or reach max slots? */
@@ -871,19 +895,33 @@ checks:
if (si->cluster_info) {
if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
goto checks;
- else
- goto done;
- }
- /* non-ssd case */
- ++offset;
-
- /* non-ssd case, still more slots in cluster? */
- if (si->cluster_nr && !si->swap_map[offset]) {
+ } else if (si->cluster_nr && !si->swap_map[++offset]) {
+ /* non-ssd case, still more slots in cluster? */
--si->cluster_nr;
goto checks;
}
+ /*
+ * Even if there's no free clusters available (fragmented),
+ * try to scan a little more quickly with lock held unless we
+ * have scanned too many slots already.
+ */
+ if (!scanned_many) {
+ unsigned long scan_limit;
+
+ if (offset < scan_base)
+ scan_limit = scan_base;
+ else
+ scan_limit = si->highest_bit;
+ for (; offset <= scan_limit && --latency_ration > 0;
+ offset++) {
+ if (!si->swap_map[offset])
+ goto checks;
+ }
+ }
+
done:
+ set_cluster_next(si, offset + 1);
si->flags -= SWP_SCANNING;
return n_ret;
@@ -901,6 +939,7 @@ scan:
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
+ scanned_many = true;
}
}
offset = si->lowest_bit;
@@ -916,6 +955,7 @@ scan:
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
+ scanned_many = true;
}
offset++;
}
@@ -1004,11 +1044,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
if (avail_pgs <= 0)
goto noswap;
- if (n_goal > SWAP_BATCH)
- n_goal = SWAP_BATCH;
-
- if (n_goal > avail_pgs)
- n_goal = avail_pgs;
+ n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
atomic_long_sub(n_goal * size, &nr_swap_pages);
@@ -1275,13 +1311,14 @@ unlock_out:
}
static unsigned char __swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+ swp_entry_t entry)
{
struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
+ unsigned char usage;
ci = lock_cluster_or_swap_info(p, offset);
- usage = __swap_entry_free_locked(p, offset, usage);
+ usage = __swap_entry_free_locked(p, offset, 1);
unlock_cluster_or_swap_info(p, ci);
if (!usage)
free_swap_slot(entry);
@@ -1316,7 +1353,7 @@ void swap_free(swp_entry_t entry)
p = _swap_info_get(entry);
if (p)
- __swap_entry_free(p, entry, 1);
+ __swap_entry_free(p, entry);
}
/*
@@ -1739,7 +1776,7 @@ int free_swap_and_cache(swp_entry_t entry)
p = _swap_info_get(entry);
if (p) {
- count = __swap_entry_free(p, entry, 1);
+ count = __swap_entry_free(p, entry);
if (count == SWAP_HAS_CACHE &&
!swap_page_trans_huge_swapped(p, entry))
__try_to_reclaim_swap(p, swp_offset(entry),
@@ -1854,7 +1891,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
struct page *swapcache;
- struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
@@ -1864,15 +1900,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- ret = -ENOMEM;
- goto out_nolock;
- }
-
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge(page, memcg, false);
ret = 0;
goto out;
}
@@ -1884,10 +1913,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, vma);
}
swap_free(entry);
@@ -1898,7 +1925,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
activate_page(page);
out:
pte_unmap_unlock(pte, ptl);
-out_nolock:
if (page != swapcache) {
unlock_page(page);
put_page(page);
@@ -1937,10 +1963,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap(pte);
swap_map = &si->swap_map[offset];
- vmf.vma = vma;
- vmf.address = addr;
- vmf.pmd = pmd;
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+ page = lookup_swap_cache(entry, vma, addr);
+ if (!page) {
+ vmf.vma = vma;
+ vmf.address = addr;
+ vmf.pmd = pmd;
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ &vmf);
+ }
if (!page) {
if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
goto try_next;
@@ -2070,7 +2100,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
struct vm_area_struct *vma;
int ret = 0;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma) {
ret = unuse_vma(vma, type, frontswap,
@@ -2080,7 +2110,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
}
cond_resched();
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret;
}
@@ -2650,6 +2680,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
+ free_percpu(p->cluster_next_cpu);
+ p->cluster_next_cpu = NULL;
vfree(swap_map);
kvfree(cluster_info);
kvfree(frontswap_map);
@@ -2757,20 +2789,24 @@ static int swap_show(struct seq_file *swap, void *v)
struct swap_info_struct *si = v;
struct file *file;
int len;
+ unsigned int bytes, inuse;
if (si == SEQ_START_TOKEN) {
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
return 0;
}
+ bytes = si->pages << (PAGE_SHIFT - 10);
+ inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
- seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
+ seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file_inode(file)->i_mode) ?
"partition" : "file\t",
- si->pages << (PAGE_SHIFT - 10),
- si->inuse_pages << (PAGE_SHIFT - 10),
+ bytes, bytes < 10000000 ? "\t" : "",
+ inuse, inuse < 10000000 ? "\t" : "",
si->prio);
return 0;
}
@@ -3202,11 +3238,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned long ci, nr_cluster;
p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next_cpu = alloc_percpu(unsigned int);
+ if (!p->cluster_next_cpu) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
/*
* select a random position to start with to help wear leveling
* SSD
*/
- p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+ for_each_possible_cpu(cpu) {
+ per_cpu(*p->cluster_next_cpu, cpu) =
+ 1 + prandom_u32_max(p->highest_bit);
+ }
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
@@ -3322,6 +3366,8 @@ bad_swap_unlock_inode:
bad_swap:
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
+ free_percpu(p->cluster_next_cpu);
+ p->cluster_next_cpu = NULL;
if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
set_blocksize(p->bdev, p->old_block_size);
blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -3654,7 +3700,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
spin_lock(&si->cont_lock);
offset &= ~PAGE_MASK;
- page = list_entry(head->lru.next, struct page, lru);
+ page = list_next_entry(head, lru);
map = kmap_atomic(page) + offset;
if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
@@ -3666,13 +3712,13 @@ static bool swap_count_continued(struct swap_info_struct *si,
*/
while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
BUG_ON(page == head);
map = kmap_atomic(page) + offset;
}
if (*map == SWAP_CONT_MAX) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
if (page == head) {
ret = false; /* add count continuation */
goto out;
@@ -3682,12 +3728,10 @@ init_map: *map = 0; /* we didn't zero the page */
}
*map += 1;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
- while (page != head) {
+ while ((page = list_prev_entry(page, lru)) != head) {
map = kmap_atomic(page) + offset;
*map = COUNT_CONTINUED;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
}
ret = true; /* incremented */
@@ -3698,7 +3742,7 @@ init_map: *map = 0; /* we didn't zero the page */
BUG_ON(count != COUNT_CONTINUED);
while (*map == COUNT_CONTINUED) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
BUG_ON(page == head);
map = kmap_atomic(page) + offset;
}
@@ -3707,13 +3751,11 @@ init_map: *map = 0; /* we didn't zero the page */
if (*map == 0)
count = 0;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
- while (page != head) {
+ while ((page = list_prev_entry(page, lru)) != head) {
map = kmap_atomic(page) + offset;
*map = SWAP_CONT_MAX | count;
count = COUNT_CONTINUED;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
}
ret = count == COUNT_CONTINUED;
}
@@ -3745,11 +3787,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
- gfp_t gfp_mask)
+void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
struct swap_info_struct *si, *next;
- if (!(gfp_mask & __GFP_IO) || !memcg)
+ int nid = page_to_nid(page);
+
+ if (!(gfp_mask & __GFP_IO))
return;
if (!blk_cgroup_congested())
@@ -3763,11 +3806,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
return;
spin_lock(&swap_avail_lock);
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
- avail_lists[node]) {
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
+ avail_lists[nid]) {
if (si->bdev) {
- blkcg_schedule_throttle(bdev_get_queue(si->bdev),
- true);
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
break;
}
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 512576e171ce..b80419320c7d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -56,7 +56,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep,
bool wp_copy)
{
- struct mem_cgroup *memcg;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
void *page_kaddr;
@@ -77,7 +76,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
PAGE_SIZE);
kunmap_atomic(page_kaddr);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
*pagep = page;
@@ -97,7 +96,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
goto out_release;
_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
@@ -124,7 +123,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
inc_mm_counter(dst_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
lru_cache_add_active_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -138,7 +136,6 @@ out:
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
out_release:
put_page(page);
goto out;
@@ -203,7 +200,7 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
* __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_sem held, it will release mmap_sem before returning.
+ * called with mmap_lock held, it will release mmap_lock before returning.
*/
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
@@ -231,7 +228,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* feature is not supported.
*/
if (zeropage) {
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -250,7 +247,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
retry:
/*
- * On routine entry dst_vma is set. If we had to drop mmap_sem and
+ * On routine entry dst_vma is set. If we had to drop mmap_lock and
* retry, dst_vma will be set to NULL and we must lookup again.
*/
if (!dst_vma) {
@@ -318,7 +315,7 @@ retry:
cond_resched();
if (unlikely(err == -ENOENT)) {
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
BUG_ON(!page);
err = copy_huge_page_from_user(page,
@@ -329,7 +326,7 @@ retry:
err = -EFAULT;
goto out;
}
- down_read(&dst_mm->mmap_sem);
+ mmap_read_lock(dst_mm);
dst_vma = NULL;
goto retry;
@@ -349,7 +346,7 @@ retry:
}
out_unlock:
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
out:
if (page) {
/*
@@ -360,7 +357,7 @@ out:
* private and shared mappings. See the routine
* restore_reserve_on_error for details. Unfortunately, we
* can not call restore_reserve_on_error now as it would
- * require holding mmap_sem.
+ * require holding mmap_lock.
*
* If a reservation for the page existed in the reservation
* map of a private mapping, the map was modified to indicate
@@ -488,7 +485,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
copied = 0;
page = NULL;
retry:
- down_read(&dst_mm->mmap_sem);
+ mmap_read_lock(dst_mm);
/*
* If memory mappings are changing because of non-cooperative
@@ -586,7 +583,7 @@ retry:
if (unlikely(err == -ENOENT)) {
void *page_kaddr;
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
BUG_ON(!page);
page_kaddr = kmap(page);
@@ -615,7 +612,7 @@ retry:
}
out_unlock:
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
out:
if (page)
put_page(page);
@@ -655,7 +652,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
/* Does the address range wrap, or is the span zero-sized? */
BUG_ON(start + len <= start);
- down_read(&dst_mm->mmap_sem);
+ mmap_read_lock(dst_mm);
/*
* If memory mappings are changing because of non-cooperative
@@ -689,6 +686,6 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0;
out_unlock:
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
return err;
}
diff --git a/mm/util.c b/mm/util.c
index 988d11e6c17c..c63c8e47be57 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -425,7 +425,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
* @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
*
* Assumes @task and @mm are valid (i.e. at least one reference on each), and
- * that mmap_sem is held as writer.
+ * that mmap_lock is held as writer.
*
* Return:
* * 0 on success
@@ -437,7 +437,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
unsigned long locked_vm, limit;
int ret = 0;
- lockdep_assert_held_write(&mm->mmap_sem);
+ mmap_assert_write_locked(mm);
locked_vm = mm->locked_vm;
if (inc) {
@@ -481,10 +481,10 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
if (pages == 0 || !mm)
return 0;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
ret = __account_locked_vm(mm, pages, inc, current,
capable(CAP_IPC_LOCK));
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
@@ -501,11 +501,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
ret = security_mmap_file(file, prot, flag);
if (!ret) {
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
&populate, &uf);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
@@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
- return __vmalloc_node_flags_caller(size, node, flags,
+ return __vmalloc_node(size, 1, flags, node,
__builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
@@ -604,6 +604,24 @@ void kvfree(const void *addr)
}
EXPORT_SYMBOL(kvfree);
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
@@ -717,9 +735,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-int overcommit_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
@@ -729,9 +746,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write,
return ret;
}
-int overcommit_kbytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
@@ -798,10 +814,6 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
long allowed;
- VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
- -(s64)vm_committed_as_batch * num_online_cpus(),
- "memory commitment underflow");
-
vm_acct_memory(pages);
/*
diff --git a/mm/vmacache.c b/mm/vmacache.c
index cdc32a3b02fa..d9092814c772 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -6,7 +6,6 @@
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
-#include <asm/pgtable.h>
/*
* Hash based on the pmd of addr if configured with MMU, which provides a good
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 399f219544f7..3091c2ca60df 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,6 +34,7 @@
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
+#include <linux/overflow.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
@@ -68,7 +69,8 @@ static void free_work(struct work_struct *w)
/*** Page table manipulation functions ***/
-static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
@@ -77,73 +79,118 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
}
-static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
+ int cleared;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_clear_huge(pmd))
+
+ cleared = pmd_clear_huge(pmd);
+ if (cleared || pmd_bad(*pmd))
+ *mask |= PGTBL_PMD_MODIFIED;
+
+ if (cleared)
continue;
if (pmd_none_or_clear_bad(pmd))
continue;
- vunmap_pte_range(pmd, addr, next);
+ vunmap_pte_range(pmd, addr, next, mask);
} while (pmd++, addr = next, addr != end);
}
-static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
+ int cleared;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_clear_huge(pud))
+
+ cleared = pud_clear_huge(pud);
+ if (cleared || pud_bad(*pud))
+ *mask |= PGTBL_PUD_MODIFIED;
+
+ if (cleared)
continue;
if (pud_none_or_clear_bad(pud))
continue;
- vunmap_pmd_range(pud, addr, next);
+ vunmap_pmd_range(pud, addr, next, mask);
} while (pud++, addr = next, addr != end);
}
-static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
+ int cleared;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- if (p4d_clear_huge(p4d))
+
+ cleared = p4d_clear_huge(p4d);
+ if (cleared || p4d_bad(*p4d))
+ *mask |= PGTBL_P4D_MODIFIED;
+
+ if (cleared)
continue;
if (p4d_none_or_clear_bad(p4d))
continue;
- vunmap_pud_range(p4d, addr, next);
+ vunmap_pud_range(p4d, addr, next, mask);
} while (p4d++, addr = next, addr != end);
}
-static void vunmap_page_range(unsigned long addr, unsigned long end)
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @start: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify
+ * should have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible
+ * for calling flush_cache_vunmap() on to-be-mapped areas before calling this
+ * function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
{
- pgd_t *pgd;
+ unsigned long end = start + size;
unsigned long next;
+ pgd_t *pgd;
+ unsigned long addr = start;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
+ start = addr;
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
if (pgd_none_or_clear_bad(pgd))
continue;
- vunmap_p4d_range(pgd, addr, next);
+ vunmap_p4d_range(pgd, addr, next, &mask);
} while (pgd++, addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
}
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
@@ -152,7 +199,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
* callers keep track of where we're up to.
*/
- pte = pte_alloc_kernel(pmd, addr);
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
do {
@@ -165,94 +212,117 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
return 0;
}
static int vmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
- pmd = pmd_alloc(&init_mm, pud, addr);
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
+ if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
- pud = pud_alloc(&init_mm, p4d, addr);
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
+ if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
- p4d = p4d_alloc(&init_mm, pgd, addr);
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
+ if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/*
- * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
- * will have pfns corresponding to the "pages" array.
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
*
- * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
*/
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+ pgprot_t prot, struct page **pages)
{
- pgd_t *pgd;
+ unsigned long start = addr;
+ unsigned long end = addr + size;
unsigned long next;
- unsigned long addr = start;
+ pgd_t *pgd;
int err = 0;
int nr = 0;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
+ err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
- return nr;
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return 0;
}
-static int vmap_page_range(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
+ struct page **pages)
{
int ret;
- ret = vmap_page_range_noflush(start, end, prot, pages);
- flush_cache_vmap(start, end);
+ ret = map_kernel_range_noflush(start, size, prot, pages);
+ flush_cache_vmap(start, start + size);
return ret;
}
@@ -1222,14 +1292,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
/*
- * Clear the pagetable entries of a given vmap_area
- */
-static void unmap_vmap_area(struct vmap_area *va)
-{
- vunmap_page_range(va->va_start, va->va_end);
-}
-
-/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
* before attempting to purge with a TLB flush.
*
@@ -1292,12 +1354,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
return false;
/*
- * First make sure the mappings are removed from all page-tables
- * before they are freed.
- */
- vmalloc_sync_unmappings();
-
- /*
* TODO: to calculate a flush range without looping.
* The list can be up to lazy_max_pages() elements.
*/
@@ -1390,7 +1446,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_vmap_area(va);
+ unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
@@ -1664,7 +1720,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
return vaddr;
}
-static void vb_free(const void *addr, unsigned long size)
+static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
unsigned long vb_idx;
@@ -1674,24 +1730,22 @@ static void vb_free(const void *addr, unsigned long size)
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
- flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+ flush_cache_vunmap(addr, addr + size);
order = get_order(size);
- offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
- offset >>= PAGE_SHIFT;
+ offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
- vb_idx = addr_to_vb_idx((unsigned long)addr);
+ vb_idx = addr_to_vb_idx(addr);
rcu_read_lock();
vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
rcu_read_unlock();
BUG_ON(!vb);
- vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+ unmap_kernel_range_noflush(addr, size);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range((unsigned long)addr,
- (unsigned long)addr + size);
+ flush_tlb_kernel_range(addr, addr + size);
spin_lock(&vb->lock);
@@ -1791,7 +1845,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
if (likely(count <= VMAP_MAX_ALLOC)) {
debug_check_no_locks_freed(mem, size);
- vb_free(mem, size);
+ vb_free(addr, size);
return;
}
@@ -1818,7 +1872,7 @@ EXPORT_SYMBOL(vm_unmap_ram);
*
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr;
@@ -1842,7 +1896,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
kasan_unpoison_vmalloc(mem, size);
- if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+ if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
@@ -1987,51 +2041,6 @@ void __init vmalloc_init(void)
}
/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vmap() on to-be-mapped areas
- * before calling this function.
- *
- * RETURNS:
- * The number of pages mapped on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
-{
- return vmap_page_range_noflush(addr, addr + size, prot, pages);
-}
-
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vunmap() on to-be-mapped areas
- * before calling this function and flush_tlb_kernel_range() after.
- */
-void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
-{
- vunmap_page_range(addr, addr + size);
-}
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
-
-/**
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
* @addr: start of the VM area to unmap
* @size: size of the VM area to unmap
@@ -2044,22 +2053,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
unsigned long end = addr + size;
flush_cache_vunmap(addr, end);
- vunmap_page_range(addr, end);
+ unmap_kernel_range_noflush(addr, size);
flush_tlb_kernel_range(addr, end);
}
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
-
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
-{
- unsigned long addr = (unsigned long)area->addr;
- unsigned long end = addr + get_vm_area_size(area);
- int err;
-
- err = vmap_page_range(addr, end, prot, pages);
-
- return err > 0 ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(map_vm_area);
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
@@ -2127,14 +2123,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return area;
}
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end)
-{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, __builtin_return_address(0));
-}
-EXPORT_SYMBOL_GPL(__get_vm_area);
-
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
@@ -2329,7 +2317,7 @@ static inline void __vfree_deferred(const void *addr)
* Use raw_cpu_ptr() because this can be called from preemptible
* context. Preemption is absolutely fine here, because the llist_add()
* implementation is lockless, so it works even if we are adding to
- * nother cpu's list. schedule_work() should be fine with this too.
+ * another cpu's list. schedule_work() should be fine with this too.
*/
struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
@@ -2440,7 +2428,8 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
- if (map_vm_area(area, prot, pages)) {
+ if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
+ pages) < 0) {
vunmap(area->addr);
return NULL;
}
@@ -2449,9 +2438,6 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
@@ -2469,7 +2455,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- PAGE_KERNEL, node, area->caller);
+ node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2503,8 +2489,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_vm_area(area, prot, pages))
+ if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
+ prot, pages) < 0)
goto fail;
+
return area->addr;
fail:
@@ -2572,27 +2560,16 @@ fail:
return NULL;
}
-/*
- * This is only for performance analysis of vmalloc and stress purpose.
- * It is required by vmalloc test module, therefore do not use it other
- * than that.
- */
-#ifdef CONFIG_TEST_VMALLOC_MODULE
-EXPORT_SYMBOL_GPL(__vmalloc_node_range);
-#endif
-
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
* @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level allocator with
+ * @gfp_mask flags. Map them into contiguous kernel virtual space.
*
* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
* and __GFP_NOFAIL are not supported
@@ -2602,35 +2579,28 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller)
+void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, 0, node, caller);
+ gfp_mask, PAGE_KERNEL, 0, node, caller);
}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node);
+#endif
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
-static inline void *__vmalloc_node_flags(unsigned long size,
- int node, gfp_t flags)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
- node, __builtin_return_address(0));
-}
-
-
-void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
- void *caller)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
-}
-
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
@@ -2645,8 +2615,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL);
+ return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
@@ -2665,8 +2635,8 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc);
@@ -2703,8 +2673,8 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_KERNEL, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -2717,39 +2687,16 @@ EXPORT_SYMBOL(vmalloc_node);
* allocator and map them into contiguous kernel virtual space.
* The memory allocated is set to zero.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc_node() instead.
- *
* Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc_node(unsigned long size, int node)
{
- return __vmalloc_node_flags(size, node,
- GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node);
/**
- * vmalloc_user_node_flags - allocate memory for userspace on a specific node
- * @size: allocation size
- * @node: numa node
- * @flags: flags for the page level allocator
- *
- * The resulting memory area is zeroed so it can be mapped to userspace
- * without leaking data.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
- flags | __GFP_ZERO, PAGE_KERNEL,
- VM_USERMAP, node,
- __builtin_return_address(0));
-}
-EXPORT_SYMBOL(vmalloc_user_node_flags);
-
-/**
* vmalloc_exec - allocate virtually contiguous, executable memory
* @size: allocation size
*
@@ -2792,8 +2739,8 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -3054,6 +3001,7 @@ finished:
* @vma: vma to cover
* @uaddr: target user address to start at
* @kaddr: virtual address of vmalloc kernel memory
+ * @pgoff: offset from @kaddr to start at
* @size: size of map area
*
* Returns: 0 for success, -Exxx on failure
@@ -3066,9 +3014,15 @@ finished:
* Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
- void *kaddr, unsigned long size)
+ void *kaddr, unsigned long pgoff,
+ unsigned long size)
{
struct vm_struct *area;
+ unsigned long off;
+ unsigned long end_index;
+
+ if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
+ return -EINVAL;
size = PAGE_ALIGN(size);
@@ -3082,8 +3036,10 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
return -EINVAL;
- if (kaddr + size > area->addr + get_vm_area_size(area))
+ if (check_add_overflow(size, off, &end_index) ||
+ end_index > get_vm_area_size(area))
return -EINVAL;
+ kaddr += off;
do {
struct page *page = vmalloc_to_page(kaddr);
@@ -3122,26 +3078,11 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
{
return remap_vmalloc_range_partial(vma, vma->vm_start,
- addr + (pgoff << PAGE_SHIFT),
+ addr, pgoff,
vma->vm_end - vma->vm_start);
}
EXPORT_SYMBOL(remap_vmalloc_range);
-/*
- * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
- * not to have one.
- *
- * The purpose of this function is to make sure the vmalloc area
- * mappings are identical in all page-tables in the system.
- */
-void __weak vmalloc_sync_mappings(void)
-{
-}
-
-void __weak vmalloc_sync_unmappings(void)
-{
-}
-
static int f(pte_t *pte, unsigned long addr, void *data)
{
pte_t ***p = data;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b06868fc4926..b6d84326bdf2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -79,6 +79,12 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
+ /*
+ * Scan pressure balancing between anon and file LRUs
+ */
+ unsigned long anon_cost;
+ unsigned long file_cost;
+
/* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
@@ -161,7 +167,7 @@ struct scan_control {
#endif
/*
- * From 0 .. 100. Higher means more swappy.
+ * From 0 .. 200. Higher means more swappy.
*/
int vm_swappiness = 60;
/*
@@ -676,7 +682,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
freed += ret;
/*
* Bail out if someone want to register a new shrinker to
- * prevent the regsitration from being stalled for long periods
+ * prevent the registration from being stalled for long periods
* by parallel ongoing shrinking.
*/
if (rwsem_is_contended(&shrinker_rwsem)) {
@@ -1066,17 +1072,17 @@ static void page_check_dirty_writeback(struct page *page,
/*
* shrink_page_list() returns the number of reclaimed pages
*/
-static unsigned long shrink_page_list(struct list_head *page_list,
- struct pglist_data *pgdat,
- struct scan_control *sc,
- enum ttu_flags ttu_flags,
- struct reclaim_stat *stat,
- bool ignore_references)
+static unsigned int shrink_page_list(struct list_head *page_list,
+ struct pglist_data *pgdat,
+ struct scan_control *sc,
+ enum ttu_flags ttu_flags,
+ struct reclaim_stat *stat,
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
- unsigned nr_reclaimed = 0;
- unsigned pgactivate = 0;
+ unsigned int nr_reclaimed = 0;
+ unsigned int pgactivate = 0;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1295,11 +1301,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_mapped(page)) {
enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+ bool was_swapbacked = PageSwapBacked(page);
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
+
if (!try_to_unmap(page, flags)) {
stat->nr_unmap_fail += nr_pages;
+ if (!was_swapbacked && PageSwapBacked(page))
+ stat->nr_lazyfree_fail += nr_pages;
goto activate_locked;
}
}
@@ -1349,6 +1359,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
+ stat->nr_pageout += hpage_nr_pages(page);
+
if (PageWriteback(page))
goto keep;
if (PageDirty(page))
@@ -1438,7 +1450,7 @@ free_it:
* appear not as the counts should be low
*/
if (unlikely(PageTransHuge(page)))
- (*get_compound_page_dtor(page))(page);
+ destroy_compound_page(page);
else
list_add(&page->lru, &free_pages);
continue;
@@ -1483,7 +1495,7 @@ keep:
return nr_reclaimed;
}
-unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list)
{
struct scan_control sc = {
@@ -1491,8 +1503,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
- struct reclaim_stat dummy_stat;
- unsigned long ret;
+ struct reclaim_stat stat;
+ unsigned int nr_reclaimed;
struct page *page, *next;
LIST_HEAD(clean_pages);
@@ -1504,11 +1516,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
}
- ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, &dummy_stat, true);
+ nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+ TTU_IGNORE_ACCESS, &stat, true);
list_splice(&clean_pages, page_list);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
- return ret;
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
+ /*
+ * Since lazyfree pages are isolated from file LRU from the beginning,
+ * they will rotate back to anonymous LRU in the end if it failed to
+ * discard so isolated count will be mismatched.
+ * Compensate the isolated count for both LRU lists.
+ */
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
+ stat.nr_lazyfree_fail);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
+ -stat.nr_lazyfree_fail);
+ return nr_reclaimed;
}
/*
@@ -1591,7 +1613,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
/*
* Update LRU sizes after isolating pages. The LRU size updates must
- * be complete before mem_cgroup_update_lru_size due to a santity check.
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
*/
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
enum lru_list lru, unsigned long *nr_zone_taken)
@@ -1602,10 +1624,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
if (!nr_zone_taken[zid])
continue;
- __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
-#ifdef CONFIG_MEMCG
- mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
-#endif
+ update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
}
}
@@ -1625,7 +1644,6 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
* @sc: The scan_control struct for this reclaim session
- * @mode: One of the LRU isolation modes
* @lru: LRU list id for isolating
*
* returns how many pages were moved onto *@dst.
@@ -1860,7 +1878,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
- (*get_compound_page_dtor(page))(page);
+ destroy_compound_page(page);
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
@@ -1879,13 +1897,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
/*
* If a kernel thread (such as nfsd for loop-back mounts) services
- * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
* In that case we should only throttle if the backing device it is
* writing to is congested. In other cases it is safe to throttle.
*/
static int current_may_throttle(void)
{
- return !(current->flags & PF_LESS_THROTTLE) ||
+ return !(current->flags & PF_LOCAL_THROTTLE) ||
current->backing_dev_info == NULL ||
bdi_write_congested(current->backing_dev_info);
}
@@ -1900,13 +1918,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
- unsigned long nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0;
unsigned long nr_taken;
struct reclaim_stat stat;
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;
while (unlikely(too_many_isolated(pgdat, file, sc))) {
@@ -1930,12 +1947,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- reclaim_stat->recent_scanned[file] += nr_taken;
-
item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ __count_vm_events(PGSCAN_ANON + file, nr_scanned);
+
spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
@@ -1946,16 +1963,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
+ move_pages_to_lru(lruvec, &page_list);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ lru_note_cost(lruvec, file, stat.nr_pageout);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
- reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
- reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
-
- move_pages_to_lru(lruvec, &page_list);
-
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2002,7 +2018,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
int file = is_file_lru(lru);
@@ -2016,7 +2031,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
@@ -2043,7 +2057,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
- nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -2054,6 +2067,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
+ nr_rotated += hpage_nr_pages(page);
list_add(&page->lru, &l_active);
continue;
}
@@ -2068,13 +2082,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Move pages back to the lru list.
*/
spin_lock_irq(&pgdat->lru_lock);
- /*
- * Count referenced pages from currently used mappings as rotated,
- * even though only some of them are actually re-activated. This
- * helps balance scan pressure between file and anonymous pages in
- * get_scan_count.
- */
- reclaim_stat->recent_rotated[file] += nr_rotated;
nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2096,7 +2103,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
unsigned long reclaim_pages(struct list_head *page_list)
{
int nid = NUMA_NO_NODE;
- unsigned long nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0;
LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
struct page *page;
@@ -2230,14 +2237,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
- unsigned long anon, file;
unsigned long ap, fp;
enum lru_list lru;
@@ -2287,57 +2291,35 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
-
- /*
- * With swappiness at 100, anonymous and file have the same priority.
- * This scanning priority is essentially the inverse of IO cost.
- */
- anon_prio = swappiness;
- file_prio = 200 - anon_prio;
-
/*
- * OK, so we have swap space and a fair amount of page cache
- * pages. We use the recently rotated / recently scanned
- * ratios to determine how valuable each cache is.
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
*
- * Because workloads change over time (and to avoid overflow)
- * we keep these statistics as a floating average, which ends
- * up weighing recent references more than old ones.
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
*
- * anon in [0], file in [1]
+ * With swappiness at 100, anon and file have equal IO cost.
*/
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
- anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
- file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
- spin_lock_irq(&pgdat->lru_lock);
- if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
- reclaim_stat->recent_scanned[0] /= 2;
- reclaim_stat->recent_rotated[0] /= 2;
- }
-
- if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
- reclaim_stat->recent_scanned[1] /= 2;
- reclaim_stat->recent_rotated[1] /= 2;
- }
-
- /*
- * The amount of pressure on anon vs file pages is inversely
- * proportional to the fraction of recently scanned pages on
- * each list that were recently referenced and in active use.
- */
- ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
- ap /= reclaim_stat->recent_rotated[0] + 1;
-
- fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
- fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&pgdat->lru_lock);
+ fp = (200 - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
fraction[0] = ap;
fraction[1] = fp;
- denominator = ap + fp + 1;
+ denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
@@ -2389,7 +2371,7 @@ out:
/*
* Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decremeting
+ * reclaim moving forwards, avoiding decrementing
* sc->priority further than desirable.
*/
scan = max(scan, SWAP_CLUSTER_MAX);
@@ -2567,7 +2549,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
* true if more pages should be reclaimed such that when the page allocator
- * calls try_to_compact_zone() that it will have enough free pages to succeed.
+ * calls try_to_compact_pages() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
@@ -2698,6 +2680,14 @@ again:
nr_scanned = sc->nr_scanned;
/*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&pgdat->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ /*
* Target desirable inactive:active list ratios for the anon
* and file LRU lists.
*/
@@ -3132,8 +3122,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
- if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL)
- WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL);
+ if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3386,7 +3376,7 @@ static void age_active_anon(struct pglist_data *pgdat,
} while (memcg);
}
-static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
{
int i;
struct zone *zone;
@@ -3398,7 +3388,7 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
* start prematurely when there is no boosting and a lower
* zone is balanced.
*/
- for (i = classzone_idx; i >= 0; i--) {
+ for (i = highest_zoneidx; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
@@ -3412,9 +3402,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
/*
* Returns true if there is an eligible zone balanced for the request order
- * and classzone_idx
+ * and highest_zoneidx
*/
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long mark = -1;
@@ -3424,19 +3414,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+ if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
return true;
}
/*
- * If a node has no populated zone within classzone_idx, it does not
+ * If a node has no populated zone within highest_zoneidx, it does not
* need balancing by definition. This can happen if a zone-restricted
* allocation tries to wake a remote kswapd.
*/
@@ -3462,7 +3452,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
*
* Returns true if kswapd is ready to sleep
*/
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
+ int highest_zoneidx)
{
/*
* The throttled processes are normally woken up in balance_pgdat() as
@@ -3484,7 +3475,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- if (pgdat_balanced(pgdat, order, classzone_idx)) {
+ if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
clear_pgdat_congested(pgdat);
return true;
}
@@ -3548,7 +3539,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
-static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long nr_soft_reclaimed;
@@ -3576,7 +3567,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
@@ -3594,7 +3585,7 @@ restart:
bool balanced;
bool ret;
- sc.reclaim_idx = classzone_idx;
+ sc.reclaim_idx = highest_zoneidx;
/*
* If the number of buffer_heads exceeds the maximum allowed
@@ -3624,7 +3615,7 @@ restart:
* on the grounds that the normal reclaim should be enough to
* re-evaluate if boosting is required when kswapd next wakes.
*/
- balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+ balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
if (!balanced && nr_boost_reclaim) {
nr_boost_reclaim = 0;
goto restart;
@@ -3724,7 +3715,7 @@ out:
if (boosted) {
unsigned long flags;
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
if (!zone_boosts[i])
continue;
@@ -3739,7 +3730,7 @@ out:
* As there is now likely space, wakeup kcompact to defragment
* pageblocks.
*/
- wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+ wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
}
snapshot_refaults(NULL, pgdat);
@@ -3757,22 +3748,22 @@ out:
}
/*
- * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
- * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
- * a valid index then either kswapd runs for first time or kswapd couldn't sleep
- * after previous reclaim attempt (node is still unbalanced). In that case
- * return the zone index of the previous kswapd reclaim cycle.
+ * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
+ * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
+ * not a valid index then either kswapd runs for first time or kswapd couldn't
+ * sleep after previous reclaim attempt (node is still unbalanced). In that
+ * case return the zone index of the previous kswapd reclaim cycle.
*/
-static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type prev_classzone_idx)
+static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
+ enum zone_type prev_highest_zoneidx)
{
- enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+ enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
- return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx;
+ return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
- unsigned int classzone_idx)
+ unsigned int highest_zoneidx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3789,7 +3780,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* eligible zone balanced that it's also unlikely that compaction will
* succeed.
*/
- if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
/*
* Compaction records what page blocks it recently failed to
* isolate pages from and skips them in the future scanning.
@@ -3802,18 +3793,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* We have freed the memory, now we should compact it to make
* allocation of the requested order possible.
*/
- wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
+ wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
remaining = schedule_timeout(HZ/10);
/*
- * If woken prematurely then reset kswapd_classzone_idx and
+ * If woken prematurely then reset kswapd_highest_zoneidx and
* order. The values will either be from a wakeup request or
* the previous request that slept prematurely.
*/
if (remaining) {
- WRITE_ONCE(pgdat->kswapd_classzone_idx,
- kswapd_classzone_idx(pgdat, classzone_idx));
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
+ kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx));
if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
@@ -3828,7 +3820,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* go fully to sleep until explicitly woken up.
*/
if (!remaining &&
- prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+ prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3870,7 +3862,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
static int kswapd(void *p)
{
unsigned int alloc_order, reclaim_order;
- unsigned int classzone_idx = MAX_NR_ZONES - 1;
+ unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -3894,22 +3886,24 @@ static int kswapd(void *p)
set_freezable();
WRITE_ONCE(pgdat->kswapd_order, 0);
- WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
for ( ; ; ) {
bool ret;
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
- classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+ highest_zoneidx = kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx);
kswapd_try_sleep:
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
- classzone_idx);
+ highest_zoneidx);
- /* Read the new order and classzone_idx */
+ /* Read the new order and highest_zoneidx */
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
- classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+ highest_zoneidx = kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx);
WRITE_ONCE(pgdat->kswapd_order, 0);
- WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3930,9 +3924,10 @@ kswapd_try_sleep:
* but kcompactd is woken to compact for the original
* request (alloc_order).
*/
- trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
alloc_order);
- reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+ reclaim_order = balance_pgdat(pgdat, alloc_order,
+ highest_zoneidx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
@@ -3950,7 +3945,7 @@ kswapd_try_sleep:
* needed.
*/
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
- enum zone_type classzone_idx)
+ enum zone_type highest_zoneidx)
{
pg_data_t *pgdat;
enum zone_type curr_idx;
@@ -3962,10 +3957,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
return;
pgdat = zone->zone_pgdat;
- curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+ curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
- if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx)
- WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx);
+ if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
if (READ_ONCE(pgdat->kswapd_order) < order)
WRITE_ONCE(pgdat->kswapd_order, order);
@@ -3975,8 +3970,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- (pgdat_balanced(pgdat, order, classzone_idx) &&
- !pgdat_watermark_boosted(pgdat, classzone_idx))) {
+ (pgdat_balanced(pgdat, order, highest_zoneidx) &&
+ !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd
@@ -3985,11 +3980,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
* ratelimit its work.
*/
if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
- wakeup_kcompactd(pgdat, order, classzone_idx);
+ wakeup_kcompactd(pgdat, order, highest_zoneidx);
return;
}
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
gfp_flags);
wake_up_interruptible(&pgdat->kswapd_wait);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 96d21a792b57..3fb23a21f6dd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -76,7 +76,7 @@ static void invalid_numa_statistics(void)
static DEFINE_MUTEX(vm_numa_stat_lock);
int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int ret, oldval;
@@ -1108,7 +1108,7 @@ int fragmentation_index(struct zone *zone, unsigned int order)
TEXT_FOR_HIGHMEM(xx) xx "_movable",
const char * const vmstat_text[] = {
- /* enum zone_stat_item countes */
+ /* enum zone_stat_item counters */
"nr_free_pages",
"nr_zone_inactive_anon",
"nr_zone_active_anon",
@@ -1119,6 +1119,9 @@ const char * const vmstat_text[] = {
"nr_mlock",
"nr_page_table_pages",
"nr_kernel_stack",
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ "nr_shadow_call_stack",
+#endif
"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
@@ -1162,7 +1165,6 @@ const char * const vmstat_text[] = {
"nr_file_hugepages",
"nr_file_pmdmapped",
"nr_anon_transparent_hugepages",
- "nr_unstable",
"nr_vmscan_write",
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
@@ -1201,6 +1203,10 @@ const char * const vmstat_text[] = {
"pgscan_kswapd",
"pgscan_direct",
"pgscan_direct_throttle",
+ "pgscan_anon",
+ "pgscan_file",
+ "pgsteal_anon",
+ "pgsteal_file",
#ifdef CONFIG_NUMA
"zone_reclaim_failed",
@@ -1590,6 +1596,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
zone->present_pages,
zone_managed_pages(zone));
+ /* If unpopulated, no other information is useful */
+ if (!populated_zone(zone)) {
+ seq_putc(m, '\n');
+ return;
+ }
+
seq_printf(m,
"\n protection: (%ld",
zone->lowmem_reserve[0]);
@@ -1597,12 +1609,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
seq_putc(m, ')');
- /* If unpopulated, no other information is useful */
- if (!populated_zone(zone)) {
- seq_putc(m, '\n');
- return;
- }
-
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
seq_printf(m, "\n %-12s %lu", zone_stat_name(i),
zone_page_state(zone, i));
@@ -1723,6 +1729,14 @@ static int vmstat_show(struct seq_file *m, void *arg)
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
+
+ if (off == NR_VMSTAT_ITEMS - 1) {
+ /*
+ * We've come to the end - add any deprecated counters to avoid
+ * breaking userspace which might depend on them being present.
+ */
+ seq_puts(m, "nr_unstable 0\n");
+ }
return 0;
}
@@ -1751,7 +1765,7 @@ static void refresh_vm_stats(struct work_struct *work)
}
int vmstat_refresh(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
long val;
int err;
@@ -2055,24 +2069,14 @@ static int unusable_show(struct seq_file *m, void *arg)
return 0;
}
-static const struct seq_operations unusable_op = {
+static const struct seq_operations unusable_sops = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = unusable_show,
};
-static int unusable_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &unusable_op);
-}
-
-static const struct file_operations unusable_file_ops = {
- .open = unusable_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(unusable);
static void extfrag_show_print(struct seq_file *m,
pg_data_t *pgdat, struct zone *zone)
@@ -2107,24 +2111,14 @@ static int extfrag_show(struct seq_file *m, void *arg)
return 0;
}
-static const struct seq_operations extfrag_op = {
+static const struct seq_operations extfrag_sops = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = extfrag_show,
};
-static int extfrag_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &extfrag_op);
-}
-
-static const struct file_operations extfrag_file_ops = {
- .open = extfrag_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(extfrag);
static int __init extfrag_debug_init(void)
{
@@ -2133,10 +2127,10 @@ static int __init extfrag_debug_init(void)
extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
- &unusable_file_ops);
+ &unusable_fops);
debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
- &extfrag_file_ops);
+ &extfrag_fops);
return 0;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 474186b76ced..d481ea452eeb 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -277,8 +277,8 @@ void workingset_refault(struct page *page, void *shadow)
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
+ unsigned long workingset_size;
struct pglist_data *pgdat;
- unsigned long active_file;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
@@ -310,7 +310,6 @@ void workingset_refault(struct page *page, void *shadow)
goto out;
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
refault = atomic_long_read(&eviction_lruvec->inactive_age);
- active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
/*
* Calculate the refault distance
@@ -345,10 +344,18 @@ void workingset_refault(struct page *page, void *shadow)
/*
* Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * don't activate pages that couldn't stay resident even if
+ * all the memory was available to the page cache. Whether
+ * cache can compete with anon or not depends on having swap.
*/
- if (refault_distance > active_file)
+ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_ANON);
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_ACTIVE_ANON);
+ }
+ if (refault_distance > workingset_size)
goto out;
SetPageActive(page);
@@ -358,6 +365,10 @@ void workingset_refault(struct page *page, void *shadow)
/* Page was active prior to eviction */
if (workingset) {
SetPageWorkingset(page);
+ /* XXX: Move to lru_cache_add() when it supports new vs putback */
+ spin_lock_irq(&page_pgdat(page)->lru_lock);
+ lru_note_cost_page(page);
+ spin_unlock_irq(&page_pgdat(page)->lru_lock);
inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
}
out:
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 42f31c4b53ad..460b0feced26 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -43,6 +43,7 @@
#include <linux/spinlock.h>
#include <linux/zpool.h>
#include <linux/magic.h>
+#include <linux/kmemleak.h>
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -215,6 +216,8 @@ static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
(gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
if (slots) {
+ /* It will be freed separately in free_handle(). */
+ kmemleak_not_leak(slots);
memset(slots->slot, 0, sizeof(slots->slot));
slots->pool = (unsigned long)pool;
rwlock_init(&slots->lock);
@@ -318,16 +321,16 @@ static inline void free_handle(unsigned long handle)
slots = handle_to_slots(handle);
write_lock(&slots->lock);
*(unsigned long *)handle = 0;
- write_unlock(&slots->lock);
- if (zhdr->slots == slots)
+ if (zhdr->slots == slots) {
+ write_unlock(&slots->lock);
return; /* simple case, nothing else to do */
+ }
/* we are freeing a foreign handle if we are here */
zhdr->foreign_handles--;
is_free = true;
- read_lock(&slots->lock);
if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
- read_unlock(&slots->lock);
+ write_unlock(&slots->lock);
return;
}
for (i = 0; i <= BUDDY_MASK; i++) {
@@ -336,7 +339,7 @@ static inline void free_handle(unsigned long handle)
break;
}
}
- read_unlock(&slots->lock);
+ write_unlock(&slots->lock);
if (is_free) {
struct z3fold_pool *pool = slots_to_pool(slots);
@@ -422,6 +425,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
zhdr->start_middle = 0;
zhdr->cpu = -1;
zhdr->foreign_handles = 0;
+ zhdr->mapped_count = 0;
zhdr->slots = slots;
zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
diff --git a/mm/zbud.c b/mm/zbud.c
index de5dd4ddaa82..bc93aa4e46fc 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -243,7 +243,7 @@ static struct zbud_header *init_zbud_page(struct page *page)
zhdr->last_chunks = 0;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_LIST_HEAD(&zhdr->lru);
- zhdr->under_reclaim = 0;
+ zhdr->under_reclaim = false;
return zhdr;
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2f836a2b993f..952a01e45c6a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -39,8 +39,8 @@
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/pgtable.h>
#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
@@ -293,7 +293,7 @@ struct zspage {
};
struct mapping_area {
-#ifdef CONFIG_PGTABLE_MAPPING
+#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
char *vm_buf; /* copy buffer for objects that span pages */
@@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
}
-#ifdef CONFIG_PGTABLE_MAPPING
+#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
/*
@@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area)
static inline void *__zs_map_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
{
- BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
+ unsigned long addr = (unsigned long)area->vm->addr;
+
+ BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0);
area->vm_addr = area->vm->addr;
return area->vm_addr + off;
}
@@ -1151,7 +1153,7 @@ static inline void __zs_unmap_object(struct mapping_area *area,
unmap_kernel_range(addr, PAGE_SIZE * 2);
}
-#else /* CONFIG_PGTABLE_MAPPING */
+#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
static inline int __zs_cpu_up(struct mapping_area *area)
{
@@ -1233,7 +1235,7 @@ out:
pagefault_enable();
}
-#endif /* CONFIG_PGTABLE_MAPPING */
+#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
static int zs_cpu_prepare(unsigned int cpu)
{