summary refs log tree commit diff stats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 3
-rw-r--r-- mm/Makefile 2
-rw-r--r-- mm/backing-dev.c 9
-rw-r--r-- mm/cma.c 64
-rw-r--r-- mm/compaction.c 73
-rw-r--r-- mm/debug.c 10
-rw-r--r-- mm/debug_vm_pgtable.c 86
-rw-r--r-- mm/dmapool.c 3
-rw-r--r-- mm/early_ioremap.c 12
-rw-r--r-- mm/filemap.c 926
-rw-r--r-- mm/frame_vector.c 240
-rw-r--r-- mm/gup.c 5
-rw-r--r-- mm/highmem.c 21
-rw-r--r-- mm/huge_memory.c 44
-rw-r--r-- mm/hugetlb.c 510
-rw-r--r-- mm/hugetlb_cgroup.c 16
-rw-r--r-- mm/internal.h 9
-rw-r--r-- mm/kasan/common.c 235
-rw-r--r-- mm/kasan/generic.c 41
-rw-r--r-- mm/kasan/hw_tags.c 18
-rw-r--r-- mm/kasan/kasan.h 150
-rw-r--r-- mm/kasan/quarantine.c 22
-rw-r--r-- mm/kasan/report.c 23
-rw-r--r-- mm/kasan/report_generic.c 8
-rw-r--r-- mm/kasan/report_hw_tags.c 8
-rw-r--r-- mm/kasan/report_sw_tags.c 8
-rw-r--r-- mm/kasan/shadow.c 83
-rw-r--r-- mm/kasan/sw_tags.c 20
-rw-r--r-- mm/kfence/Makefile 6
-rw-r--r-- mm/kfence/core.c 850
-rw-r--r-- mm/kfence/kfence.h 106
-rw-r--r-- mm/kfence/kfence_test.c 858
-rw-r--r-- mm/kfence/report.c 268
-rw-r--r-- mm/khugepaged.c 28
-rw-r--r-- mm/kmemleak.c 3
-rw-r--r-- mm/list_lru.c 12
-rw-r--r-- mm/madvise.c 18
-rw-r--r-- mm/memcontrol.c 296
-rw-r--r-- mm/memory-failure.c 40
-rw-r--r-- mm/memory.c 90
-rw-r--r-- mm/memory_hotplug.c 173
-rw-r--r-- mm/mempolicy.c 18
-rw-r--r-- mm/mempool.c 2
-rw-r--r-- mm/memremap.c 23
-rw-r--r-- mm/migrate.c 10
-rw-r--r-- mm/mincore.c 5
-rw-r--r-- mm/mlock.c 5
-rw-r--r-- mm/mmap.c 4
-rw-r--r-- mm/mmu_notifier.c 23
-rw-r--r-- mm/mprotect.c 7
-rw-r--r-- mm/mremap.c 8
-rw-r--r-- mm/oom_kill.c 5
-rw-r--r-- mm/page-writeback.c 16
-rw-r--r-- mm/page_alloc.c 232
-rw-r--r-- mm/page_io.c 17
-rw-r--r-- mm/page_owner.c 4
-rw-r--r-- mm/page_reporting.c 2
-rw-r--r-- mm/percpu.c 36
-rw-r--r-- mm/pgtable-generic.c 5
-rw-r--r-- mm/rmap.c 57
-rw-r--r-- mm/shmem.c 206
-rw-r--r-- mm/slab.c 61
-rw-r--r-- mm/slab.h 20
-rw-r--r-- mm/slab_common.c 63
-rw-r--r-- mm/slob.c 2
-rw-r--r-- mm/slub.c 234
-rw-r--r-- mm/swap.c 88
-rw-r--r-- mm/swap_slots.c 3
-rw-r--r-- mm/swap_state.c 38
-rw-r--r-- mm/swapfile.c 21
-rw-r--r-- mm/truncate.c 131
-rw-r--r-- mm/vmscan.c 98
-rw-r--r-- mm/vmstat.c 49
-rw-r--r-- mm/workingset.c 7
-rw-r--r-- mm/z3fold.c 28
-rw-r--r-- mm/zbud.c 1
-rw-r--r-- mm/zpool.c 13
-rw-r--r-- mm/zsmalloc.c 22
-rw-r--r-- mm/zswap.c 57
79 files changed, 4740 insertions, 2278 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f730605b8dcf..24c045b24b95 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -804,9 +804,6 @@ config DEVICE_PRIVATE
config VMAP_PFN
bool
-config FRAME_VECTOR
- bool
-
config ARCH_USES_HIGH_VMA_FLAGS
bool
config ARCH_HAS_PKEYS
diff --git a/mm/Makefile b/mm/Makefile
index b6cd2fffa492..72227b24a616 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KASAN) += kasan/
+obj-$(CONFIG_KFENCE) += kfence/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
@@ -110,7 +111,6 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
-obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e33797579338..576220acd686 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -8,6 +8,7 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
+#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
@@ -32,6 +33,8 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -69,7 +72,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
global_dirty_limits(&background_thresh, &dirty_thresh);
wb_thresh = wb_calc_thresh(wb, dirty_thresh);
-#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
"BdiWriteback: %10lu kB\n"
"BdiReclaimable: %10lu kB\n"
@@ -98,7 +100,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_more_io,
nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->wb.state);
-#undef K
return 0;
}
@@ -146,8 +147,6 @@ static ssize_t read_ahead_kb_store(struct device *dev,
return count;
}
-#define K(pages) ((pages) << (PAGE_SHIFT - 10))
-
#define BDI_SHOW(name, expr) \
static ssize_t name##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -580,7 +579,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
{
struct bdi_writeback *wb;
- might_sleep_if(gfpflags_allow_blocking(gfp));
+ might_alloc(gfp);
if (!memcg_css->parent)
return &bdi->wb;
diff --git a/mm/cma.c b/mm/cma.c
index 20c4f6f40037..54eee2119822 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -94,34 +94,29 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
static void __init cma_activate_area(struct cma *cma)
{
- unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
- unsigned i = cma->count >> pageblock_order;
+ unsigned long base_pfn = cma->base_pfn, pfn;
struct zone *zone;
cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
if (!cma->bitmap)
goto out_error;
- WARN_ON_ONCE(!pfn_valid(pfn));
- zone = page_zone(pfn_to_page(pfn));
-
- do {
- unsigned j;
-
- base_pfn = pfn;
- for (j = pageblock_nr_pages; j; --j, pfn++) {
- WARN_ON_ONCE(!pfn_valid(pfn));
- /*
- * alloc_contig_range requires the pfn range
- * specified to be in the same zone. Make this
- * simple by forcing the entire CMA resv range
- * to be in the same zone.
- */
- if (page_zone(pfn_to_page(pfn)) != zone)
- goto not_in_zone;
- }
- init_cma_reserved_pageblock(pfn_to_page(base_pfn));
- } while (--i);
+ /*
+ * alloc_contig_range() requires the pfn range specified to be in the
+ * same zone. Simplify by forcing the entire CMA resv range to be in the
+ * same zone.
+ */
+ WARN_ON_ONCE(!pfn_valid(base_pfn));
+ zone = page_zone(pfn_to_page(base_pfn));
+ for (pfn = base_pfn + 1; pfn < base_pfn + cma->count; pfn++) {
+ WARN_ON_ONCE(!pfn_valid(pfn));
+ if (page_zone(pfn_to_page(pfn)) != zone)
+ goto not_in_zone;
+ }
+
+ for (pfn = base_pfn; pfn < base_pfn + cma->count;
+ pfn += pageblock_nr_pages)
+ init_cma_reserved_pageblock(pfn_to_page(pfn));
mutex_init(&cma->lock);
@@ -135,6 +130,10 @@ static void __init cma_activate_area(struct cma *cma)
not_in_zone:
bitmap_free(cma->bitmap);
out_error:
+ /* Expose all pages to the buddy, they are useless for CMA. */
+ for (pfn = base_pfn; pfn < base_pfn + cma->count; pfn++)
+ free_reserved_page(pfn_to_page(pfn));
+ totalcma_pages -= cma->count;
cma->count = 0;
pr_err("CMA area %s could not be activated\n", cma->name);
return;
@@ -336,6 +335,23 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
limit = highmem_start;
}
+ /*
+ * If there is enough memory, try a bottom-up allocation first.
+ * It will place the new cma area close to the start of the node
+ * and guarantee that the compaction is moving pages out of the
+ * cma area and not into it.
+ * Avoid using first 4GB to not interfere with constrained zones
+ * like DMA/DMA32.
+ */
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+ if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) {
+ memblock_set_bottom_up(true);
+ addr = memblock_alloc_range_nid(size, alignment, SZ_4G,
+ limit, nid, true);
+ memblock_set_bottom_up(false);
+ }
+#endif
+
if (!addr) {
addr = memblock_alloc_range_nid(size, alignment, base,
limit, nid, true);
@@ -484,8 +500,8 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
}
if (ret && !no_warn) {
- pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
- __func__, count, ret);
+ pr_err("%s: %s: alloc failed, req-size: %zu pages, ret: %d\n",
+ __func__, cma->name, count, ret);
cma_debug_show_areas(cma);
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 190ccdaa6c19..e04f4476e68e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -137,7 +137,6 @@ EXPORT_SYMBOL(__SetPageMovable);
void __ClearPageMovable(struct page *page)
{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageMovable(page), page);
/*
* Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
@@ -988,14 +987,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(!get_page_unless_zero(page)))
goto isolate_fail;
- if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
+ if (!__isolate_lru_page_prepare(page, isolate_mode))
goto isolate_fail_put;
/* Try isolate the page */
if (!TestClearPageLRU(page))
goto isolate_fail_put;
- rcu_read_lock();
lruvec = mem_cgroup_page_lruvec(page, pgdat);
/* If we already hold the lock, we can skip some rechecking */
@@ -1005,7 +1003,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
locked = lruvec;
- rcu_read_unlock();
lruvec_memcg_debug(lruvec, page);
@@ -1026,15 +1023,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
SetPageLRU(page);
goto isolate_fail_put;
}
- } else
- rcu_read_unlock();
+ }
/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
low_pfn += compound_nr(page) - 1;
/* Successfully isolated */
- del_page_from_lru_list(page, lruvec, page_lru(page));
+ del_page_from_lru_list(page, lruvec);
mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_is_file_lru(page),
thp_nr_pages(page));
@@ -1288,7 +1284,7 @@ static void
fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
{
unsigned long start_pfn, end_pfn;
- struct page *page = pfn_to_page(pfn);
+ struct page *page;
/* Do not search around if there are enough pages already */
if (cc->nr_freepages >= cc->nr_migratepages)
@@ -1299,8 +1295,12 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long
return;
/* Pageblock boundaries */
- start_pfn = pageblock_start_pfn(pfn);
- end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
+ start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn);
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone));
+
+ page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone);
+ if (!page)
+ return;
/* Scan before */
if (start_pfn != pfn) {
@@ -1402,7 +1402,8 @@ fast_isolate_freepages(struct compact_control *cc)
pfn = page_to_pfn(freepage);
if (pfn >= highest)
- highest = pageblock_start_pfn(pfn);
+ highest = max(pageblock_start_pfn(pfn),
+ cc->zone->zone_start_pfn);
if (pfn >= low_pfn) {
cc->fast_search_fail = 0;
@@ -1472,7 +1473,8 @@ fast_isolate_freepages(struct compact_control *cc)
} else {
if (cc->direct_compaction && pfn_valid(min_pfn)) {
page = pageblock_pfn_to_page(min_pfn,
- pageblock_end_pfn(min_pfn),
+ min(pageblock_end_pfn(min_pfn),
+ zone_end_pfn(cc->zone)),
cc->zone);
cc->free_pfn = min_pfn;
}
@@ -1702,6 +1704,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
unsigned long pfn = cc->migrate_pfn;
unsigned long high_pfn;
int order;
+ bool found_block = false;
/* Skip hints are relied on to avoid repeats on the fast search */
if (cc->ignore_skip_hint)
@@ -1744,7 +1747,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
for (order = cc->order - 1;
- order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
order--) {
struct free_area *area = &cc->zone->free_area[order];
struct list_head *freelist;
@@ -1759,7 +1762,11 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
list_for_each_entry(freepage, freelist, lru) {
unsigned long free_pfn;
- nr_scanned++;
+ if (nr_scanned++ >= limit) {
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+
free_pfn = page_to_pfn(freepage);
if (free_pfn < high_pfn) {
/*
@@ -1768,12 +1775,8 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* the list assumes an entry is deleted, not
* reordered.
*/
- if (get_pageblock_skip(freepage)) {
- if (list_is_last(freelist, &freepage->lru))
- break;
-
+ if (get_pageblock_skip(freepage))
continue;
- }
/* Reorder to so a future search skips recent pages */
move_freelist_tail(freelist, freepage);
@@ -1781,15 +1784,10 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
update_fast_start_pfn(cc, free_pfn);
pfn = pageblock_start_pfn(free_pfn);
cc->fast_search_fail = 0;
+ found_block = true;
set_pageblock_skip(freepage);
break;
}
-
- if (nr_scanned >= limit) {
- cc->fast_search_fail++;
- move_freelist_tail(freelist, freepage);
- break;
- }
}
spin_unlock_irqrestore(&cc->zone->lock, flags);
}
@@ -1800,9 +1798,10 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc)
* If fast scanning failed then use a cached entry for a page block
* that had free pages as the basis for starting a linear scan.
*/
- if (pfn == cc->migrate_pfn)
+ if (!found_block) {
+ cc->fast_search_fail++;
pfn = reinit_migrate_pfn(cc);
-
+ }
return pfn;
}
@@ -1926,20 +1925,28 @@ static bool kswapd_is_running(pg_data_t *pgdat)
/*
* A zone's fragmentation score is the external fragmentation wrt to the
- * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value
- * in the range [0, 100].
+ * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100].
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+ return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+}
+
+/*
+ * A weighted zone's fragmentation score is the external fragmentation
+ * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It
+ * returns a value in the range [0, 100].
*
* The scaling factor ensures that proactive compaction focuses on larger
* zones like ZONE_NORMAL, rather than smaller, specialized zones like
* ZONE_DMA32. For smaller zones, the score value remains close to zero,
* and thus never exceeds the high threshold for proactive compaction.
*/
-static unsigned int fragmentation_score_zone(struct zone *zone)
+static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
{
unsigned long score;
- score = zone->present_pages *
- extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+ score = zone->present_pages * fragmentation_score_zone(zone);
return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
}
@@ -1959,7 +1966,7 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat)
struct zone *zone;
zone = &pgdat->node_zones[zoneid];
- score += fragmentation_score_zone(zone);
+ score += fragmentation_score_zone_weighted(zone);
}
return score;
diff --git a/mm/debug.c b/mm/debug.c
index 8a40b3fefbeb..0bdda8407f71 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -110,6 +110,11 @@ void __dump_page(struct page *page, const char *reason)
head_compound_mapcount(head));
}
}
+
+#ifdef CONFIG_MEMCG
+ if (head->memcg_data)
+ pr_warn("memcg:%lx\n", head->memcg_data);
+#endif
if (PageKsm(page))
type = "ksm ";
else if (PageAnon(page))
@@ -180,11 +185,6 @@ hex_only:
if (reason)
pr_warn("page dumped because: %s\n", reason);
-
-#ifdef CONFIG_MEMCG
- if (!page_poisoned && page->memcg_data)
- pr_warn("pages's memcg:%lx\n", page->memcg_data);
-#endif
}
void dump_page(struct page *page, const char *reason)
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index c05d9dcf7891..a9bd6ce1ba02 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -58,11 +58,23 @@
#define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK)
#define RANDOM_NZVALUE GENMASK(7, 0)
-static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pte_basic_tests(unsigned long pfn, int idx)
{
+ pgprot_t prot = protection_map[idx];
pte_t pte = pfn_pte(pfn, prot);
+ unsigned long val = idx, *ptr = &val;
+
+ pr_debug("Validating PTE basic (%pGv)\n", ptr);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pte() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pte_dirty(pte_wrprotect(pte)));
- pr_debug("Validating PTE basic\n");
WARN_ON(!pte_same(pte, pte));
WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte))));
WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte))));
@@ -70,6 +82,8 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte))));
WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte))));
WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte))));
+ WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte))));
+ WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte))));
}
static void __init pte_advanced_tests(struct mm_struct *mm,
@@ -129,14 +143,27 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pmd_basic_tests(unsigned long pfn, int idx)
{
+ pgprot_t prot = protection_map[idx];
pmd_t pmd = pfn_pmd(pfn, prot);
+ unsigned long val = idx, *ptr = &val;
if (!has_transparent_hugepage())
return;
- pr_debug("Validating PMD basic\n");
+ pr_debug("Validating PMD basic (%pGv)\n", ptr);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pmd() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pmd_dirty(pmd_wrprotect(pmd)));
+
+
WARN_ON(!pmd_same(pmd, pmd));
WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd))));
WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd))));
@@ -144,6 +171,8 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd))));
WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd))));
WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd))));
+ WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd))));
+ WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd))));
/*
* A huge page does not point to next level page table
* entry. Hence this must qualify as pmd_bad().
@@ -249,19 +278,35 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
}
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx)
{
+ pgprot_t prot = protection_map[idx];
pud_t pud = pfn_pud(pfn, prot);
+ unsigned long val = idx, *ptr = &val;
if (!has_transparent_hugepage())
return;
- pr_debug("Validating PUD basic\n");
+ pr_debug("Validating PUD basic (%pGv)\n", ptr);
+
+ /*
+ * This test needs to be executed after the given page table entry
+ * is created with pfn_pud() to make sure that protection_map[idx]
+ * does not have the dirty bit enabled from the beginning. This is
+ * important for platforms like arm64 where (!PTE_RDONLY) indicate
+ * dirty bit being set.
+ */
+ WARN_ON(pud_dirty(pud_wrprotect(pud)));
+
WARN_ON(!pud_same(pud, pud));
WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud))));
+ WARN_ON(!pud_dirty(pud_mkdirty(pud_mkclean(pud))));
+ WARN_ON(pud_dirty(pud_mkclean(pud_mkdirty(pud))));
WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud))));
WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud))));
WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud))));
+ WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud))));
+ WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud))));
if (mm_pmd_folded(mm))
return;
@@ -359,7 +404,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
#endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
static void __init pud_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pud_t *pudp,
unsigned long pfn, unsigned long vaddr,
@@ -372,8 +417,8 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
-static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { }
-static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { }
+static void __init pmd_basic_tests(unsigned long pfn, int idx) { }
+static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { }
static void __init pmd_advanced_tests(struct mm_struct *mm,
struct vm_area_struct *vma, pmd_t *pmdp,
unsigned long pfn, unsigned long vaddr,
@@ -899,6 +944,7 @@ static int __init debug_vm_pgtable(void)
unsigned long vaddr, pte_aligned, pmd_aligned;
unsigned long pud_aligned, p4d_aligned, pgd_aligned;
spinlock_t *ptl = NULL;
+ int idx;
pr_info("Validating architecture page table helpers\n");
prot = vm_get_page_prot(VMFLAGS);
@@ -963,9 +1009,25 @@ static int __init debug_vm_pgtable(void)
saved_pmdp = pmd_offset(pudp, 0UL);
saved_ptep = pmd_pgtable(pmd);
- pte_basic_tests(pte_aligned, prot);
- pmd_basic_tests(pmd_aligned, prot);
- pud_basic_tests(pud_aligned, prot);
+ /*
+ * Iterate over the protection_map[] to make sure that all
+ * the basic page table transformation validations just hold
+ * true irrespective of the starting protection value for a
+ * given page table entry.
+ */
+ for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) {
+ pte_basic_tests(pte_aligned, idx);
+ pmd_basic_tests(pmd_aligned, idx);
+ pud_basic_tests(mm, pud_aligned, idx);
+ }
+
+ /*
+ * Both P4D and PGD level tests are very basic which do not
+ * involve creating page table entries from the protection
+ * value and the given pfn. Hence just keep them out from
+ * the above iteration for now to save some test execution
+ * time.
+ */
p4d_basic_tests(p4d_aligned, prot);
pgd_basic_tests(pgd_aligned, prot);
diff --git a/mm/dmapool.c b/mm/dmapool.c
index a97c97232337..f3791532fef2 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -28,6 +28,7 @@
#include <linux/mutex.h>
#include <linux/poison.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/spinlock.h>
@@ -319,7 +320,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
size_t offset;
void *retval;
- might_sleep_if(gfpflags_allow_blocking(mem_flags));
+ might_alloc(mem_flags);
spin_lock_irqsave(&pool->lock, flags);
list_for_each_entry(page, &pool->page_list, page_list) {
diff --git a/mm/early_ioremap.c b/mm/early_ioremap.c
index a0018ad1a1f6..164607c7cdf1 100644
--- a/mm/early_ioremap.c
+++ b/mm/early_ioremap.c
@@ -181,17 +181,17 @@ void __init early_iounmap(void __iomem *addr, unsigned long size)
}
}
- if (WARN(slot < 0, "early_iounmap(%p, %08lx) not found slot\n",
- addr, size))
+ if (WARN(slot < 0, "%s(%p, %08lx) not found slot\n",
+ __func__, addr, size))
return;
if (WARN(prev_size[slot] != size,
- "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
- addr, size, slot, prev_size[slot]))
+ "%s(%p, %08lx) [%d] size not consistent %08lx\n",
+ __func__, addr, size, slot, prev_size[slot]))
return;
- WARN(early_ioremap_debug, "early_iounmap(%p, %08lx) [%d]\n",
- addr, size, slot);
+ WARN(early_ioremap_debug, "%s(%p, %08lx) [%d]\n",
+ __func__, addr, size, slot);
virt_addr = (unsigned long)addr;
if (WARN_ON(virt_addr < fix_to_virt(FIX_BTMAP_BEGIN)))
diff --git a/mm/filemap.c b/mm/filemap.c
index 6ff2a3fb0dc7..43700480d897 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -206,9 +206,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
if (PageSwapBacked(page)) {
__mod_lruvec_page_state(page, NR_SHMEM, -nr);
if (PageTransHuge(page))
- __dec_lruvec_page_state(page, NR_SHMEM_THPS);
+ __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr);
} else if (PageTransHuge(page)) {
- __dec_lruvec_page_state(page, NR_FILE_THPS);
+ __mod_lruvec_page_state(page, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
}
@@ -777,7 +777,6 @@ EXPORT_SYMBOL(file_write_and_wait_range);
* replace_page_cache_page - replace a pagecache page with a new one
* @old: page to be replaced
* @new: page to replace with
- * @gfp_mask: allocation mode
*
* This function replaces a page in the pagecache with a new one. On
* success it acquires the pagecache reference for the new page and
@@ -786,10 +785,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
* caller must do that.
*
* The remove + add is atomic. This function cannot fail.
- *
- * Return: %0
*/
-int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
+void replace_page_cache_page(struct page *old, struct page *new)
{
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *) = mapping->a_ops->freepage;
@@ -824,8 +821,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
if (freepage)
freepage(old);
put_page(old);
-
- return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
@@ -1348,61 +1343,26 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
-static int __wait_on_page_locked_async(struct page *page,
- struct wait_page_queue *wait, bool set)
-{
- struct wait_queue_head *q = page_waitqueue(page);
- int ret = 0;
-
- wait->page = page;
- wait->bit_nr = PG_locked;
-
- spin_lock_irq(&q->lock);
- __add_wait_queue_entry_tail(q, &wait->wait);
- SetPageWaiters(page);
- if (set)
- ret = !trylock_page(page);
- else
- ret = PageLocked(page);
- /*
- * If we were successful now, we know we're still on the
- * waitqueue as we're still under the lock. This means it's
- * safe to remove and return success, we know the callback
- * isn't going to trigger.
- */
- if (!ret)
- __remove_wait_queue(q, &wait->wait);
- else
- ret = -EIOCBQUEUED;
- spin_unlock_irq(&q->lock);
- return ret;
-}
-
-static int wait_on_page_locked_async(struct page *page,
- struct wait_page_queue *wait)
-{
- if (!PageLocked(page))
- return 0;
- return __wait_on_page_locked_async(compound_head(page), wait, false);
-}
-
/**
* put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
* @page: The page to wait for.
+ * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
*
* The caller should hold a reference on @page. They expect the page to
* become unlocked relatively soon, but do not wish to hold up migration
* (for example) by holding the reference while waiting for the page to
* come unlocked. After this function returns, the caller should not
* dereference @page.
+ *
+ * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
*/
-void put_and_wait_on_page_locked(struct page *page)
+int put_and_wait_on_page_locked(struct page *page, int state)
{
wait_queue_head_t *q;
page = compound_head(page);
q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+ return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
}
/**
@@ -1558,7 +1518,28 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
int __lock_page_async(struct page *page, struct wait_page_queue *wait)
{
- return __wait_on_page_locked_async(page, wait, true);
+ struct wait_queue_head *q = page_waitqueue(page);
+ int ret = 0;
+
+ wait->page = page;
+ wait->bit_nr = PG_locked;
+
+ spin_lock_irq(&q->lock);
+ __add_wait_queue_entry_tail(q, &wait->wait);
+ SetPageWaiters(page);
+ ret = !trylock_page(page);
+ /*
+ * If we were successful now, we know we're still on the
+ * waitqueue as we're still under the lock. This means it's
+ * safe to remove and return success, we know the callback
+ * isn't going to trigger.
+ */
+ if (!ret)
+ __remove_wait_queue(q, &wait->wait);
+ else
+ ret = -EIOCBQUEUED;
+ spin_unlock_irq(&q->lock);
+ return ret;
}
/*
@@ -1677,8 +1658,8 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
}
EXPORT_SYMBOL(page_cache_prev_miss);
-/**
- * find_get_entry - find and get a page cache entry
+/*
+ * mapping_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
@@ -1690,7 +1671,8 @@ EXPORT_SYMBOL(page_cache_prev_miss);
*
* Return: The head page or shadow entry, %NULL if nothing is found.
*/
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index)
+static struct page *mapping_get_entry(struct address_space *mapping,
+ pgoff_t index)
{
XA_STATE(xas, &mapping->i_pages, index);
struct page *page;
@@ -1727,39 +1709,6 @@ out:
}
/**
- * find_lock_entry - Locate and lock a page cache entry.
- * @mapping: The address_space to search.
- * @index: The page cache index.
- *
- * Looks up the page at @mapping & @index. If there is a page in the
- * cache, the head page is returned locked and with an increased refcount.
- *
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Context: May sleep.
- * Return: The head page or shadow entry, %NULL if nothing is found.
- */
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index)
-{
- struct page *page;
-
-repeat:
- page = find_get_entry(mapping, index);
- if (page && !xa_is_value(page)) {
- lock_page(page);
- /* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
- goto repeat;
- }
- VM_BUG_ON_PAGE(!thp_contains(page, index), page);
- }
- return page;
-}
-
-/**
* pagecache_get_page - Find and get a reference to a page.
* @mapping: The address_space to search.
* @index: The page index.
@@ -1774,6 +1723,8 @@ repeat:
* * %FGP_LOCK - The page is returned locked.
* * %FGP_HEAD - If the page is present and a THP, return the head page
* rather than the exact page specified by the index.
+ * * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
+ * instead of allocating a new page to replace it.
* * %FGP_CREAT - If no page is present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
@@ -1797,9 +1748,12 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
struct page *page;
repeat:
- page = find_get_entry(mapping, index);
- if (xa_is_value(page))
+ page = mapping_get_entry(mapping, index);
+ if (xa_is_value(page)) {
+ if (fgp_flags & FGP_ENTRY)
+ return page;
page = NULL;
+ }
if (!page)
goto no_page;
@@ -1871,18 +1825,53 @@ no_page:
}
EXPORT_SYMBOL(pagecache_get_page);
+static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
+ xa_mark_t mark)
+{
+ struct page *page;
+
+retry:
+ if (mark == XA_PRESENT)
+ page = xas_find(xas, max);
+ else
+ page = xas_find_marked(xas, max, mark);
+
+ if (xas_retry(xas, page))
+ goto retry;
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ if (!page || xa_is_value(page))
+ return page;
+
+ if (!page_cache_get_speculative(page))
+ goto reset;
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(xas))) {
+ put_page(page);
+ goto reset;
+ }
+
+ return page;
+reset:
+ xas_reset(xas);
+ goto retry;
+}
+
/**
* find_get_entries - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page cache index
- * @nr_entries: The maximum number of entries
- * @entries: Where the resulting entries are placed
+ * @end: The final page index (inclusive).
+ * @pvec: Where the resulting entries are placed.
* @indices: The cache indices corresponding to the entries in @entries
*
- * find_get_entries() will search for and return a group of up to
- * @nr_entries entries in the mapping. The entries are placed at
- * @entries. find_get_entries() takes a reference against any actual
- * pages it returns.
+ * find_get_entries() will search for and return a batch of entries in
+ * the mapping. The entries are placed in @pvec. find_get_entries()
+ * takes a reference on any actual pages it returns.
*
* The search returns a group of mapping-contiguous page cache entries
* with ascending indexes. There may be holes in the indices due to
@@ -1898,60 +1887,97 @@ EXPORT_SYMBOL(pagecache_get_page);
*
* Return: the number of pages and shadow entries which were found.
*/
-unsigned find_get_entries(struct address_space *mapping,
- pgoff_t start, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices)
+unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
+ pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
{
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned int ret = 0;
-
- if (!nr_entries)
- return 0;
+ unsigned nr_entries = PAGEVEC_SIZE;
rcu_read_lock();
- xas_for_each(&xas, page, ULONG_MAX) {
- if (xas_retry(&xas, page))
- continue;
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
- if (xa_is_value(page))
- goto export;
-
- if (!page_cache_get_speculative(page))
- goto retry;
-
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
-
+ while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
/*
* Terminate early on finding a THP, to allow the caller to
* handle it all at once; but continue if this is hugetlbfs.
*/
- if (PageTransHuge(page) && !PageHuge(page)) {
+ if (!xa_is_value(page) && PageTransHuge(page) &&
+ !PageHuge(page)) {
page = find_subpage(page, xas.xa_index);
nr_entries = ret + 1;
}
-export:
+
indices[ret] = xas.xa_index;
- entries[ret] = page;
+ pvec->pages[ret] = page;
if (++ret == nr_entries)
break;
- continue;
-put_page:
- put_page(page);
-retry:
- xas_reset(&xas);
}
rcu_read_unlock();
+
+ pvec->nr = ret;
return ret;
}
/**
+ * find_lock_entries - Find a batch of pagecache entries.
+ * @mapping: The address_space to search.
+ * @start: The starting page cache index.
+ * @end: The final page index (inclusive).
+ * @pvec: Where the resulting entries are placed.
+ * @indices: The cache indices of the entries in @pvec.
+ *
+ * find_lock_entries() will return a batch of entries from @mapping.
+ * Swap, shadow and DAX entries are included. Pages are returned
+ * locked and with an incremented refcount. Pages which are locked by
+ * somebody else or under writeback are skipped. Only the head page of
+ * a THP is returned. Pages which are partially outside the range are
+ * not returned.
+ *
+ * The entries have ascending indexes. The indices may not be consecutive
+ * due to not-present entries, THP pages, pages which could not be locked
+ * or pages under writeback.
+ *
+ * Return: The number of entries which were found.
+ */
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+ pgoff_t end, struct pagevec *pvec, pgoff_t *indices)
+{
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
+
+ rcu_read_lock();
+ while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
+ if (!xa_is_value(page)) {
+ if (page->index < start)
+ goto put;
+ VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
+ if (page->index + thp_nr_pages(page) - 1 > end)
+ goto put;
+ if (!trylock_page(page))
+ goto put;
+ if (page->mapping != mapping || PageWriteback(page))
+ goto unlock;
+ VM_BUG_ON_PAGE(!thp_contains(page, xas.xa_index),
+ page);
+ }
+ indices[pvec->nr] = xas.xa_index;
+ if (!pagevec_add(pvec, page))
+ break;
+ goto next;
+unlock:
+ unlock_page(page);
+put:
+ put_page(page);
+next:
+ if (!xa_is_value(page) && PageTransHuge(page))
+ xas_set(&xas, page->index + thp_nr_pages(page));
+ }
+ rcu_read_unlock();
+
+ return pagevec_count(pvec);
+}
+
+/**
* find_get_pages_range - gang pagecache lookup
* @mapping: The address_space to search
* @start: The starting page index
@@ -1984,30 +2010,16 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
return 0;
rcu_read_lock();
- xas_for_each(&xas, page, end) {
- if (xas_retry(&xas, page))
- continue;
+ while ((page = find_get_entry(&xas, end, XA_PRESENT))) {
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(page))
continue;
- if (!page_cache_get_speculative(page))
- goto retry;
-
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
-
pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
*start = xas.xa_index + 1;
goto out;
}
- continue;
-put_page:
- put_page(page);
-retry:
- xas_reset(&xas);
}
/*
@@ -2081,7 +2093,7 @@ retry:
EXPORT_SYMBOL(find_get_pages_contig);
/**
- * find_get_pages_range_tag - find and return pages in given range matching @tag
+ * find_get_pages_range_tag - Find and return head pages matching @tag.
* @mapping: the address_space to search
* @index: the starting page index
* @end: The final page index (inclusive)
@@ -2089,8 +2101,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
*
- * Like find_get_pages, except we only return pages which are tagged with
- * @tag. We update @index to index the next page for the traversal.
+ * Like find_get_pages(), except we only return head pages which are tagged
+ * with @tag. @index is updated to the index immediately after the last
+ * page we return, ready for the next iteration.
*
* Return: the number of pages which were found.
*/
@@ -2106,9 +2119,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
return 0;
rcu_read_lock();
- xas_for_each_marked(&xas, page, end, tag) {
- if (xas_retry(&xas, page))
- continue;
+ while ((page = find_get_entry(&xas, end, tag))) {
/*
* Shadow entries should never be tagged, but this iteration
* is lockless so there is a window for page reclaim to evict
@@ -2117,23 +2128,11 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
if (xa_is_value(page))
continue;
- if (!page_cache_get_speculative(page))
- goto retry;
-
- /* Has the page moved or been split? */
- if (unlikely(page != xas_reload(&xas)))
- goto put_page;
-
- pages[ret] = find_subpage(page, xas.xa_index);
+ pages[ret] = page;
if (++ret == nr_pages) {
- *index = xas.xa_index + 1;
+ *index = page->index + thp_nr_pages(page);
goto out;
}
- continue;
-put_page:
- put_page(page);
-retry:
- xas_reset(&xas);
}
/*
@@ -2173,287 +2172,267 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
ra->ra_pages /= 4;
}
-static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
+/*
+ * filemap_get_read_batch - Get a batch of pages for read
+ *
+ * Get a batch of pages which represent a contiguous range of bytes
+ * in the file. No tail pages will be returned. If @index is in the
+ * middle of a THP, the entire THP will be returned. The last page in
+ * the batch may have Readahead set or be not Uptodate so that the
+ * caller can take the appropriate action.
+ */
+static void filemap_get_read_batch(struct address_space *mapping,
+ pgoff_t index, pgoff_t max, struct pagevec *pvec)
{
- if (iocb->ki_flags & IOCB_WAITQ)
- return lock_page_async(page, iocb->ki_waitq);
- else if (iocb->ki_flags & IOCB_NOWAIT)
- return trylock_page(page) ? 0 : -EAGAIN;
- else
- return lock_page_killable(page);
+ XA_STATE(xas, &mapping->i_pages, index);
+ struct page *head;
+
+ rcu_read_lock();
+ for (head = xas_load(&xas); head; head = xas_next(&xas)) {
+ if (xas_retry(&xas, head))
+ continue;
+ if (xas.xa_index > max || xa_is_value(head))
+ break;
+ if (!page_cache_get_speculative(head))
+ goto retry;
+
+ /* Has the page moved or been split? */
+ if (unlikely(head != xas_reload(&xas)))
+ goto put_page;
+
+ if (!pagevec_add(pvec, head))
+ break;
+ if (!PageUptodate(head))
+ break;
+ if (PageReadahead(head))
+ break;
+ xas.xa_index = head->index + thp_nr_pages(head) - 1;
+ xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
+ continue;
+put_page:
+ put_page(head);
+retry:
+ xas_reset(&xas);
+ }
+ rcu_read_unlock();
}
-static struct page *
-generic_file_buffered_read_readpage(struct kiocb *iocb,
- struct file *filp,
- struct address_space *mapping,
- struct page *page)
+static int filemap_read_page(struct file *file, struct address_space *mapping,
+ struct page *page)
{
- struct file_ra_state *ra = &filp->f_ra;
int error;
- if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
- unlock_page(page);
- put_page(page);
- return ERR_PTR(-EAGAIN);
- }
-
/*
- * A previous I/O error may have been due to temporary
- * failures, eg. multipath errors.
- * PG_error will be set again if readpage fails.
+ * A previous I/O error may have been due to temporary failures,
+ * eg. multipath errors. PG_error will be set again if readpage
+ * fails.
*/
ClearPageError(page);
/* Start the actual read. The read will unlock the page. */
- error = mapping->a_ops->readpage(filp, page);
+ error = mapping->a_ops->readpage(file, page);
+ if (error)
+ return error;
- if (unlikely(error)) {
- put_page(page);
- return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
- }
+ error = wait_on_page_locked_killable(page);
+ if (error)
+ return error;
+ if (PageUptodate(page))
+ return 0;
+ if (!page->mapping) /* page truncated */
+ return AOP_TRUNCATED_PAGE;
+ shrink_readahead_size_eio(&file->f_ra);
+ return -EIO;
+}
- if (!PageUptodate(page)) {
- error = lock_page_for_iocb(iocb, page);
- if (unlikely(error)) {
- put_page(page);
- return ERR_PTR(error);
- }
- if (!PageUptodate(page)) {
- if (page->mapping == NULL) {
- /*
- * invalidate_mapping_pages got it
- */
- unlock_page(page);
- put_page(page);
- return NULL;
- }
- unlock_page(page);
- shrink_readahead_size_eio(ra);
- put_page(page);
- return ERR_PTR(-EIO);
- }
- unlock_page(page);
+static bool filemap_range_uptodate(struct address_space *mapping,
+ loff_t pos, struct iov_iter *iter, struct page *page)
+{
+ int count;
+
+ if (PageUptodate(page))
+ return true;
+ /* pipes can't handle partially uptodate pages */
+ if (iov_iter_is_pipe(iter))
+ return false;
+ if (!mapping->a_ops->is_partially_uptodate)
+ return false;
+ if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page)))
+ return false;
+
+ count = iter->count;
+ if (page_offset(page) > pos) {
+ count -= page_offset(page) - pos;
+ pos = 0;
+ } else {
+ pos -= page_offset(page);
}
- return page;
+ return mapping->a_ops->is_partially_uptodate(page, pos, count);
}
-static struct page *
-generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
- struct file *filp,
- struct iov_iter *iter,
- struct page *page,
- loff_t pos, loff_t count)
+static int filemap_update_page(struct kiocb *iocb,
+ struct address_space *mapping, struct iov_iter *iter,
+ struct page *page)
{
- struct address_space *mapping = filp->f_mapping;
- struct inode *inode = mapping->host;
int error;
- /*
- * See comment in do_read_cache_page on why
- * wait_on_page_locked is used to avoid unnecessarily
- * serialisations and why it's safe.
- */
- if (iocb->ki_flags & IOCB_WAITQ) {
- error = wait_on_page_locked_async(page,
- iocb->ki_waitq);
- } else {
- error = wait_on_page_locked_killable(page);
- }
- if (unlikely(error)) {
- put_page(page);
- return ERR_PTR(error);
+ if (!trylock_page(page)) {
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
+ return -EAGAIN;
+ if (!(iocb->ki_flags & IOCB_WAITQ)) {
+ put_and_wait_on_page_locked(page, TASK_KILLABLE);
+ return AOP_TRUNCATED_PAGE;
+ }
+ error = __lock_page_async(page, iocb->ki_waitq);
+ if (error)
+ return error;
}
- if (PageUptodate(page))
- return page;
- if (inode->i_blkbits == PAGE_SHIFT ||
- !mapping->a_ops->is_partially_uptodate)
- goto page_not_up_to_date;
- /* pipes can't handle partially uptodate pages */
- if (unlikely(iov_iter_is_pipe(iter)))
- goto page_not_up_to_date;
- if (!trylock_page(page))
- goto page_not_up_to_date;
- /* Did it get truncated before we got the lock? */
if (!page->mapping)
- goto page_not_up_to_date_locked;
- if (!mapping->a_ops->is_partially_uptodate(page,
- pos & ~PAGE_MASK, count))
- goto page_not_up_to_date_locked;
- unlock_page(page);
- return page;
-
-page_not_up_to_date:
- /* Get exclusive access to the page ... */
- error = lock_page_for_iocb(iocb, page);
- if (unlikely(error)) {
- put_page(page);
- return ERR_PTR(error);
- }
+ goto truncated;
-page_not_up_to_date_locked:
- /* Did it get truncated before we got the lock? */
- if (!page->mapping) {
- unlock_page(page);
- put_page(page);
- return NULL;
- }
+ error = 0;
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
+ goto unlock;
- /* Did somebody else fill it already? */
- if (PageUptodate(page)) {
- unlock_page(page);
- return page;
- }
+ error = -EAGAIN;
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
+ goto unlock;
- return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+ error = filemap_read_page(iocb->ki_filp, mapping, page);
+ if (error == AOP_TRUNCATED_PAGE)
+ put_page(page);
+ return error;
+truncated:
+ unlock_page(page);
+ put_page(page);
+ return AOP_TRUNCATED_PAGE;
+unlock:
+ unlock_page(page);
+ return error;
}
-static struct page *
-generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
- struct iov_iter *iter)
+static int filemap_create_page(struct file *file,
+ struct address_space *mapping, pgoff_t index,
+ struct pagevec *pvec)
{
- struct file *filp = iocb->ki_filp;
- struct address_space *mapping = filp->f_mapping;
- pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
struct page *page;
int error;
- if (iocb->ki_flags & IOCB_NOIO)
- return ERR_PTR(-EAGAIN);
-
- /*
- * Ok, it wasn't cached, so we need to create a new
- * page..
- */
page = page_cache_alloc(mapping);
if (!page)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
- if (error) {
- put_page(page);
- return error != -EEXIST ? ERR_PTR(error) : NULL;
- }
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
+ if (error == -EEXIST)
+ error = AOP_TRUNCATED_PAGE;
+ if (error)
+ goto error;
+
+ error = filemap_read_page(file, mapping, page);
+ if (error)
+ goto error;
- return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+ pagevec_add(pvec, page);
+ return 0;
+error:
+ put_page(page);
+ return error;
+}
+
+static int filemap_readahead(struct kiocb *iocb, struct file *file,
+ struct address_space *mapping, struct page *page,
+ pgoff_t last_index)
+{
+ if (iocb->ki_flags & IOCB_NOIO)
+ return -EAGAIN;
+ page_cache_async_readahead(mapping, &file->f_ra, file, page,
+ page->index, last_index - page->index);
+ return 0;
}
-static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
- struct iov_iter *iter,
- struct page **pages,
- unsigned int nr)
+static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
+ struct pagevec *pvec)
{
struct file *filp = iocb->ki_filp;
struct address_space *mapping = filp->f_mapping;
struct file_ra_state *ra = &filp->f_ra;
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
- pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
- int i, j, nr_got, err = 0;
+ pgoff_t last_index;
+ struct page *page;
+ int err = 0;
- nr = min_t(unsigned long, last_index - index, nr);
-find_page:
+ last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
+retry:
if (fatal_signal_pending(current))
return -EINTR;
- nr_got = find_get_pages_contig(mapping, index, nr, pages);
- if (nr_got)
- goto got_pages;
-
- if (iocb->ki_flags & IOCB_NOIO)
- return -EAGAIN;
-
- page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
-
- nr_got = find_get_pages_contig(mapping, index, nr, pages);
- if (nr_got)
- goto got_pages;
-
- pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
- err = PTR_ERR_OR_ZERO(pages[0]);
- if (!IS_ERR_OR_NULL(pages[0]))
- nr_got = 1;
-got_pages:
- for (i = 0; i < nr_got; i++) {
- struct page *page = pages[i];
- pgoff_t pg_index = index + i;
- loff_t pg_pos = max(iocb->ki_pos,
- (loff_t) pg_index << PAGE_SHIFT);
- loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
-
- if (PageReadahead(page)) {
- if (iocb->ki_flags & IOCB_NOIO) {
- for (j = i; j < nr_got; j++)
- put_page(pages[j]);
- nr_got = i;
- err = -EAGAIN;
- break;
- }
- page_cache_async_readahead(mapping, ra, filp, page,
- pg_index, last_index - pg_index);
- }
-
- if (!PageUptodate(page)) {
- if ((iocb->ki_flags & IOCB_NOWAIT) ||
- ((iocb->ki_flags & IOCB_WAITQ) && i)) {
- for (j = i; j < nr_got; j++)
- put_page(pages[j]);
- nr_got = i;
- err = -EAGAIN;
- break;
- }
+ filemap_get_read_batch(mapping, index, last_index, pvec);
+ if (!pagevec_count(pvec)) {
+ if (iocb->ki_flags & IOCB_NOIO)
+ return -EAGAIN;
+ page_cache_sync_readahead(mapping, ra, filp, index,
+ last_index - index);
+ filemap_get_read_batch(mapping, index, last_index, pvec);
+ }
+ if (!pagevec_count(pvec)) {
+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+ return -EAGAIN;
+ err = filemap_create_page(filp, mapping,
+ iocb->ki_pos >> PAGE_SHIFT, pvec);
+ if (err == AOP_TRUNCATED_PAGE)
+ goto retry;
+ return err;
+ }
- page = generic_file_buffered_read_pagenotuptodate(iocb,
- filp, iter, page, pg_pos, pg_count);
- if (IS_ERR_OR_NULL(page)) {
- for (j = i + 1; j < nr_got; j++)
- put_page(pages[j]);
- nr_got = i;
- err = PTR_ERR_OR_ZERO(page);
- break;
- }
- }
+ page = pvec->pages[pagevec_count(pvec) - 1];
+ if (PageReadahead(page)) {
+ err = filemap_readahead(iocb, filp, mapping, page, last_index);
+ if (err)
+ goto err;
+ }
+ if (!PageUptodate(page)) {
+ if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
+ iocb->ki_flags |= IOCB_NOWAIT;
+ err = filemap_update_page(iocb, mapping, iter, page);
+ if (err)
+ goto err;
}
- if (likely(nr_got))
- return nr_got;
- if (err)
- return err;
- /*
- * No pages and no error means we raced and should retry:
- */
- goto find_page;
+ return 0;
+err:
+ if (err < 0)
+ put_page(page);
+ if (likely(--pvec->nr))
+ return 0;
+ if (err == AOP_TRUNCATED_PAGE)
+ goto retry;
+ return err;
}
/**
- * generic_file_buffered_read - generic file read routine
- * @iocb: the iocb to read
- * @iter: data destination
- * @written: already copied
- *
- * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level stuff.
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
*
- * This is really ugly. But the goto's actually try to clarify some
- * of the logic when it comes to error handling etc.
+ * Copies data from the page cache. If the data is not currently present,
+ * uses the readahead and readpage address_space operations to fetch it.
*
- * Return:
- * * total number of bytes copied, including those the were already @written
- * * negative error code if nothing was copied
+ * Return: Total number of bytes copied, including those already read by
+ * the caller. If an error happens before any bytes are copied, returns
+ * a negative error number.
*/
-ssize_t generic_file_buffered_read(struct kiocb *iocb,
- struct iov_iter *iter, ssize_t written)
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+ ssize_t already_read)
{
struct file *filp = iocb->ki_filp;
struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
- unsigned int nr_pages = min_t(unsigned int, 512,
- ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
- (iocb->ki_pos >> PAGE_SHIFT));
- int i, pg_nr, error = 0;
+ struct pagevec pvec;
+ int i, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
@@ -2463,14 +2442,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
return 0;
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
-
- if (nr_pages > ARRAY_SIZE(pages_onstack))
- pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
-
- if (!pages) {
- pages = pages_onstack;
- nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
- }
+ pagevec_init(&pvec);
do {
cond_resched();
@@ -2480,16 +2452,12 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
* can no longer safely return -EIOCBQUEUED. Hence mark
* an async read NOWAIT at that point.
*/
- if ((iocb->ki_flags & IOCB_WAITQ) && written)
+ if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
iocb->ki_flags |= IOCB_NOWAIT;
- i = 0;
- pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
- pages, nr_pages);
- if (pg_nr < 0) {
- error = pg_nr;
+ error = filemap_get_pages(iocb, iter, &pvec);
+ if (error < 0)
break;
- }
/*
* i_size must be checked after we know the pages are Uptodate.
@@ -2502,13 +2470,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
isize = i_size_read(inode);
if (unlikely(iocb->ki_pos >= isize))
goto put_pages;
-
end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
- while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
- (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
- put_page(pages[--pg_nr]);
-
/*
* Once we start copying data, we don't want to be touching any
* cachelines that might be contended:
@@ -2521,27 +2484,35 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
*/
if (iocb->ki_pos >> PAGE_SHIFT !=
ra->prev_pos >> PAGE_SHIFT)
- mark_page_accessed(pages[0]);
- for (i = 1; i < pg_nr; i++)
- mark_page_accessed(pages[i]);
+ mark_page_accessed(pvec.pages[0]);
- for (i = 0; i < pg_nr; i++) {
- unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
- unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
- PAGE_SIZE - offset);
- unsigned int copied;
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ size_t page_size = thp_size(page);
+ size_t offset = iocb->ki_pos & (page_size - 1);
+ size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+ page_size - offset);
+ size_t copied;
+ if (end_offset < page_offset(page))
+ break;
+ if (i > 0)
+ mark_page_accessed(page);
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
* before reading the page on the kernel side.
*/
- if (writably_mapped)
- flush_dcache_page(pages[i]);
+ if (writably_mapped) {
+ int j;
+
+ for (j = 0; j < thp_nr_pages(page); j++)
+ flush_dcache_page(page + j);
+ }
- copied = copy_page_to_iter(pages[i], offset, bytes, iter);
+ copied = copy_page_to_iter(page, offset, bytes, iter);
- written += copied;
+ already_read += copied;
iocb->ki_pos += copied;
ra->prev_pos = iocb->ki_pos;
@@ -2551,18 +2522,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
}
}
put_pages:
- for (i = 0; i < pg_nr; i++)
- put_page(pages[i]);
+ for (i = 0; i < pagevec_count(&pvec); i++)
+ put_page(pvec.pages[i]);
+ pagevec_reinit(&pvec);
} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
file_accessed(filp);
- if (pages != pages_onstack)
- kfree(pages);
-
- return written ? written : error;
+ return already_read ? already_read : error;
}
-EXPORT_SYMBOL_GPL(generic_file_buffered_read);
+EXPORT_SYMBOL_GPL(filemap_read);
/**
* generic_file_read_iter - generic filesystem read routine
@@ -2592,7 +2561,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
ssize_t retval = 0;
if (!count)
- goto out; /* skip atime */
+ return 0; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct file *file = iocb->ki_filp;
@@ -2610,7 +2579,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos,
iocb->ki_pos + count - 1);
if (retval < 0)
- goto out;
+ return retval;
}
file_accessed(file);
@@ -2620,7 +2589,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += retval;
count -= retval;
}
- iov_iter_revert(iter, count - iov_iter_count(iter));
+ if (retval != -EIOCBQUEUED)
+ iov_iter_revert(iter, count - iov_iter_count(iter));
/*
* Btrfs can have a short DIO read if we encounter
@@ -2633,15 +2603,116 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
*/
if (retval < 0 || !count || iocb->ki_pos >= size ||
IS_DAX(inode))
- goto out;
+ return retval;
}
- retval = generic_file_buffered_read(iocb, iter, retval);
-out:
- return retval;
+ return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);
+static inline loff_t page_seek_hole_data(struct xa_state *xas,
+ struct address_space *mapping, struct page *page,
+ loff_t start, loff_t end, bool seek_data)
+{
+ const struct address_space_operations *ops = mapping->a_ops;
+ size_t offset, bsz = i_blocksize(mapping->host);
+
+ if (xa_is_value(page) || PageUptodate(page))
+ return seek_data ? start : end;
+ if (!ops->is_partially_uptodate)
+ return seek_data ? end : start;
+
+ xas_pause(xas);
+ rcu_read_unlock();
+ lock_page(page);
+ if (unlikely(page->mapping != mapping))
+ goto unlock;
+
+ offset = offset_in_thp(page, start) & ~(bsz - 1);
+
+ do {
+ if (ops->is_partially_uptodate(page, offset, bsz) == seek_data)
+ break;
+ start = (start + bsz) & ~(bsz - 1);
+ offset += bsz;
+ } while (offset < thp_size(page));
+unlock:
+ unlock_page(page);
+ rcu_read_lock();
+ return start;
+}
+
+static inline
+unsigned int seek_page_size(struct xa_state *xas, struct page *page)
+{
+ if (xa_is_value(page))
+ return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
+ return thp_size(page);
+}
+
+/**
+ * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
+ * @mapping: Address space to search.
+ * @start: First byte to consider.
+ * @end: Limit of search (exclusive).
+ * @whence: Either SEEK_HOLE or SEEK_DATA.
+ *
+ * If the page cache knows which blocks contain holes and which blocks
+ * contain data, your filesystem can use this function to implement
+ * SEEK_HOLE and SEEK_DATA. This is useful for filesystems which are
+ * entirely memory-based such as tmpfs, and filesystems which support
+ * unwritten extents.
+ *
+ * Return: The requested offset on success, or -ENXIO if @whence specifies
+ * SEEK_DATA and there is no data after @start. There is an implicit hole
+ * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
+ * and @end contain data.
+ */
+loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
+ loff_t end, int whence)
+{
+ XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
+ pgoff_t max = (end - 1) / PAGE_SIZE;
+ bool seek_data = (whence == SEEK_DATA);
+ struct page *page;
+
+ if (end <= start)
+ return -ENXIO;
+
+ rcu_read_lock();
+ while ((page = find_get_entry(&xas, max, XA_PRESENT))) {
+ loff_t pos = xas.xa_index * PAGE_SIZE;
+
+ if (start < pos) {
+ if (!seek_data)
+ goto unlock;
+ start = pos;
+ }
+
+ pos += seek_page_size(&xas, page);
+ start = page_seek_hole_data(&xas, mapping, page, start, pos,
+ seek_data);
+ if (start < pos)
+ goto unlock;
+ if (!xa_is_value(page))
+ put_page(page);
+ }
+ rcu_read_unlock();
+
+ if (seek_data)
+ return -ENXIO;
+ goto out;
+
+unlock:
+ rcu_read_unlock();
+ if (!xa_is_value(page))
+ put_page(page);
+out:
+ if (start > end)
+ return end;
+ return start;
+}
+
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
/*
@@ -3431,7 +3502,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
}
iocb->ki_pos = pos;
}
- iov_iter_revert(from, write_len - iov_iter_count(from));
+ if (written != -EIOCBQUEUED)
+ iov_iter_revert(from, write_len - iov_iter_count(from));
out:
return written;
}
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
deleted file mode 100644
index 10f82d5643b6..000000000000
--- a/mm/frame_vector.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/mm.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/pagemap.h>
-#include <linux/sched.h>
-
-/**
- * get_vaddr_frames() - map virtual addresses to pfns
- * @start: starting user address
- * @nr_frames: number of pages / pfns from start to map
- * @gup_flags: flags modifying lookup behaviour
- * @vec: structure which receives pages / pfns of the addresses mapped.
- * It should have space for at least nr_frames entries.
- *
- * This function maps virtual addresses from @start and fills @vec structure
- * with page frame numbers or page pointers to corresponding pages (choice
- * depends on the type of the vma underlying the virtual address). If @start
- * belongs to a normal vma, the function grabs reference to each of the pages
- * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't
- * touch page structures and the caller must make sure pfns aren't reused for
- * anything else while he is using them.
- *
- * The function returns number of pages mapped which may be less than
- * @nr_frames. In particular we stop mapping if there are more vmas of
- * different type underlying the specified range of virtual addresses.
- * When the function isn't able to map a single page, it returns error.
- *
- * This function takes care of grabbing mmap_lock as necessary.
- */
-int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
- unsigned int gup_flags, struct frame_vector *vec)
-{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- int ret = 0;
- int err;
- int locked;
-
- if (nr_frames == 0)
- return 0;
-
- if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
- nr_frames = vec->nr_allocated;
-
- start = untagged_addr(start);
-
- mmap_read_lock(mm);
- locked = 1;
- vma = find_vma_intersection(mm, start, start + 1);
- if (!vma) {
- ret = -EFAULT;
- goto out;
- }
-
- /*
- * While get_vaddr_frames() could be used for transient (kernel
- * controlled lifetime) pinning of memory pages all current
- * users establish long term (userspace controlled lifetime)
- * page pinning. Treat get_vaddr_frames() like
- * get_user_pages_longterm() and disallow it for filesystem-dax
- * mappings.
- */
- if (vma_is_fsdax(vma)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
-
- if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
- vec->got_ref = true;
- vec->is_pfns = false;
- ret = pin_user_pages_locked(start, nr_frames,
- gup_flags, (struct page **)(vec->ptrs), &locked);
- goto out;
- }
-
- vec->got_ref = false;
- vec->is_pfns = true;
- do {
- unsigned long *nums = frame_vector_pfns(vec);
-
- while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) {
- err = follow_pfn(vma, start, &nums[ret]);
- if (err) {
- if (ret == 0)
- ret = err;
- goto out;
- }
- start += PAGE_SIZE;
- ret++;
- }
- /*
- * We stop if we have enough pages or if VMA doesn't completely
- * cover the tail page.
- */
- if (ret >= nr_frames || start < vma->vm_end)
- break;
- vma = find_vma_intersection(mm, start, start + 1);
- } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
-out:
- if (locked)
- mmap_read_unlock(mm);
- if (!ret)
- ret = -EFAULT;
- if (ret > 0)
- vec->nr_frames = ret;
- return ret;
-}
-EXPORT_SYMBOL(get_vaddr_frames);
-
-/**
- * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired
- * them
- * @vec: frame vector to put
- *
- * Drop references to pages if get_vaddr_frames() acquired them. We also
- * invalidate the frame vector so that it is prepared for the next call into
- * get_vaddr_frames().
- */
-void put_vaddr_frames(struct frame_vector *vec)
-{
- struct page **pages;
-
- if (!vec->got_ref)
- goto out;
- pages = frame_vector_pages(vec);
- /*
- * frame_vector_pages() might needed to do a conversion when
- * get_vaddr_frames() got pages but vec was later converted to pfns.
- * But it shouldn't really fail to convert pfns back...
- */
- if (WARN_ON(IS_ERR(pages)))
- goto out;
-
- unpin_user_pages(pages, vec->nr_frames);
- vec->got_ref = false;
-out:
- vec->nr_frames = 0;
-}
-EXPORT_SYMBOL(put_vaddr_frames);
-
-/**
- * frame_vector_to_pages - convert frame vector to contain page pointers
- * @vec: frame vector to convert
- *
- * Convert @vec to contain array of page pointers. If the conversion is
- * successful, return 0. Otherwise return an error. Note that we do not grab
- * page references for the page structures.
- */
-int frame_vector_to_pages(struct frame_vector *vec)
-{
- int i;
- unsigned long *nums;
- struct page **pages;
-
- if (!vec->is_pfns)
- return 0;
- nums = frame_vector_pfns(vec);
- for (i = 0; i < vec->nr_frames; i++)
- if (!pfn_valid(nums[i]))
- return -EINVAL;
- pages = (struct page **)nums;
- for (i = 0; i < vec->nr_frames; i++)
- pages[i] = pfn_to_page(nums[i]);
- vec->is_pfns = false;
- return 0;
-}
-EXPORT_SYMBOL(frame_vector_to_pages);
-
-/**
- * frame_vector_to_pfns - convert frame vector to contain pfns
- * @vec: frame vector to convert
- *
- * Convert @vec to contain array of pfns.
- */
-void frame_vector_to_pfns(struct frame_vector *vec)
-{
- int i;
- unsigned long *nums;
- struct page **pages;
-
- if (vec->is_pfns)
- return;
- pages = (struct page **)(vec->ptrs);
- nums = (unsigned long *)pages;
- for (i = 0; i < vec->nr_frames; i++)
- nums[i] = page_to_pfn(pages[i]);
- vec->is_pfns = true;
-}
-EXPORT_SYMBOL(frame_vector_to_pfns);
-
-/**
- * frame_vector_create() - allocate & initialize structure for pinned pfns
- * @nr_frames: number of pfns slots we should reserve
- *
- * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns
- * pfns.
- */
-struct frame_vector *frame_vector_create(unsigned int nr_frames)
-{
- struct frame_vector *vec;
- int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames;
-
- if (WARN_ON_ONCE(nr_frames == 0))
- return NULL;
- /*
- * This is absurdly high. It's here just to avoid strange effects when
- * arithmetics overflows.
- */
- if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2))
- return NULL;
- /*
- * Avoid higher order allocations, use vmalloc instead. It should
- * be rare anyway.
- */
- vec = kvmalloc(size, GFP_KERNEL);
- if (!vec)
- return NULL;
- vec->nr_allocated = nr_frames;
- vec->nr_frames = 0;
- return vec;
-}
-EXPORT_SYMBOL(frame_vector_create);
-
-/**
- * frame_vector_destroy() - free memory allocated to carry frame vector
- * @vec: Frame vector to free
- *
- * Free structure allocated by frame_vector_create() to carry frames.
- */
-void frame_vector_destroy(struct frame_vector *vec)
-{
- /* Make sure put_vaddr_frames() got called properly... */
- VM_BUG_ON(vec->nr_frames > 0);
- kvfree(vec);
-}
-EXPORT_SYMBOL(frame_vector_destroy);
diff --git a/mm/gup.c b/mm/gup.c
index e4c224cd9661..e40579624f10 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -78,9 +78,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
* considered failure, and furthermore, a likely bug in the caller, so a warning
* is also emitted.
*/
-static __maybe_unused struct page *try_grab_compound_head(struct page *page,
- int refs,
- unsigned int flags)
+__maybe_unused struct page *try_grab_compound_head(struct page *page,
+ int refs, unsigned int flags)
{
if (flags & FOLL_GET)
return try_get_compound_head(page, refs);
diff --git a/mm/highmem.c b/mm/highmem.c
index 874b732b120c..6ef8f5e05e7e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -368,20 +368,24 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
BUG_ON(end1 > page_size(page) || end2 > page_size(page));
+ if (start1 >= end1)
+ start1 = end1 = 0;
+ if (start2 >= end2)
+ start2 = end2 = 0;
+
for (i = 0; i < compound_nr(page); i++) {
void *kaddr = NULL;
- if (start1 < PAGE_SIZE || start2 < PAGE_SIZE)
- kaddr = kmap_atomic(page + i);
-
if (start1 >= PAGE_SIZE) {
start1 -= PAGE_SIZE;
end1 -= PAGE_SIZE;
} else {
unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
- if (end1 > start1)
+ if (end1 > start1) {
+ kaddr = kmap_atomic(page + i);
memset(kaddr + start1, 0, this_end - start1);
+ }
end1 -= this_end;
start1 = 0;
}
@@ -392,8 +396,11 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
} else {
unsigned this_end = min_t(unsigned, end2, PAGE_SIZE);
- if (end2 > start2)
+ if (end2 > start2) {
+ if (!kaddr)
+ kaddr = kmap_atomic(page + i);
memset(kaddr + start2, 0, this_end - start2);
+ }
end2 -= this_end;
start2 = 0;
}
@@ -611,7 +618,7 @@ void __kmap_local_sched_out(void)
int idx;
/* With debug all even slots are unmapped and act as guard */
- if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
+ if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) {
WARN_ON_ONCE(!pte_none(pteval));
continue;
}
@@ -647,7 +654,7 @@ void __kmap_local_sched_in(void)
int idx;
/* With debug all even slots are unmapped and act as guard */
- if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
+ if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL) && !(i & 0x01)) {
WARN_ON_ONCE(!pte_none(pteval));
continue;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 91ca9b103ee5..ae907a9c2050 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -386,7 +386,11 @@ static int __init hugepage_init(void)
struct kobject *hugepage_kobj;
if (!has_transparent_hugepage()) {
- transparent_hugepage_flags = 0;
+ /*
+ * Hardware doesn't support hugepages, hence disable
+ * DAX PMD support.
+ */
+ transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
return -EINVAL;
}
@@ -636,6 +640,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
@@ -663,9 +668,9 @@ release:
* available
* never: never stall for any thp allocation
*/
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
- const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+ const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
/* Always do synchronous compaction */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
@@ -690,20 +695,19 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
}
/* Caller must hold page table lock. */
-static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
+static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
{
pmd_t entry;
if (!pmd_none(*pmd))
- return false;
+ return;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
if (pgtable)
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
- return true;
}
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
@@ -749,6 +753,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
haddr, vmf->pmd, zero_page);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
spin_unlock(vmf->ptl);
}
} else {
@@ -757,7 +762,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
}
return ret;
}
- gfp = alloc_hugepage_direct_gfpmask(vma);
+ gfp = vma_thp_gfp_mask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
@@ -1095,9 +1100,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* best effort that the pinned pages won't be replaced by another
* random page during the coming copy-on-write.
*/
- if (unlikely(is_cow_mapping(vma->vm_flags) &&
- atomic_read(&src_mm->has_pinned) &&
- page_maybe_dma_pinned(src_page))) {
+ if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
pte_free(dst_mm, pgtable);
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
@@ -1209,9 +1212,7 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
/* Please refer to comments in copy_huge_pmd() */
- if (unlikely(is_cow_mapping(vma->vm_flags) &&
- atomic_read(&src_mm->has_pinned) &&
- page_maybe_dma_pinned(pud_page(pud)))) {
+ if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
__split_huge_pud(vma, src_pud, addr);
@@ -1439,7 +1440,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page);
+ put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
goto out;
}
@@ -1475,7 +1476,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- put_and_wait_on_page_locked(page);
+ put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
goto out;
}
@@ -2176,7 +2177,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
lock_page_memcg(page);
if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
/* Last compound_mapcount is gone. */
- __dec_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS,
+ -HPAGE_PMD_NR);
if (TestClearPageDoubleMap(page)) {
/* No need in mapcount reference anymore */
for (i = 0; i < HPAGE_PMD_NR; i++)
@@ -2465,7 +2467,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
int i;
/* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(head);
+ split_page_memcg(head, nr);
if (PageAnon(head) && PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2751,10 +2753,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
+ int nr = thp_nr_pages(head);
+
if (PageSwapBacked(head))
- __dec_lruvec_page_state(head, NR_SHMEM_THPS);
+ __mod_lruvec_page_state(head, NR_SHMEM_THPS,
+ -nr);
else
- __dec_lruvec_page_state(head, NR_FILE_THPS);
+ __mod_lruvec_page_state(head, NR_FILE_THPS,
+ -nr);
}
__split_huge_page(page, list, end);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 905a7d549b00..a86a58ef132d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -79,34 +79,29 @@ DEFINE_SPINLOCK(hugetlb_lock);
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
-static inline bool PageHugeFreed(struct page *head)
-{
- return page_private(head + 4) == -1UL;
-}
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
-static inline void SetPageHugeFreed(struct page *head)
+static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
- set_page_private(head + 4, -1UL);
-}
+ if (spool->count)
+ return false;
+ if (spool->max_hpages != -1)
+ return spool->used_hpages == 0;
+ if (spool->min_hpages != -1)
+ return spool->rsv_hpages == spool->min_hpages;
-static inline void ClearPageHugeFreed(struct page *head)
-{
- set_page_private(head + 4, 0);
+ return true;
}
-/* Forward declaration */
-static int hugetlb_acct_memory(struct hstate *h, long delta);
-
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
{
- bool free = (spool->count == 0) && (spool->used_hpages == 0);
-
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
* remain, give up any reservations based on minimum size and
* free the subpool */
- if (free) {
+ if (subpool_is_free(spool)) {
if (spool->min_hpages != -1)
hugetlb_acct_memory(spool->hstate,
-spool->min_hpages);
@@ -285,6 +280,17 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
nrg->reservation_counter =
&h_cg->rsvd_hugepage[hstate_index(h)];
nrg->css = &h_cg->css;
+ /*
+ * The caller will hold exactly one h_cg->css reference for the
+ * whole contiguous reservation region. But this area might be
+ * scattered when there are already some file_regions reside in
+ * it. As a result, many file_regions may share only one css
+ * reference. In order to ensure that one file_region must hold
+ * exactly one h_cg->css reference, we should do css_get for
+ * each file_region and leave the reference held by caller
+ * untouched.
+ */
+ css_get(&h_cg->css);
if (!resv->pages_per_hpage)
resv->pages_per_hpage = pages_per_huge_page(h);
/* pages_per_hpage should be the same for all entries in
@@ -298,6 +304,14 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
#endif
}
+static void put_uncharge_info(struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (rg->css)
+ css_put(rg->css);
+#endif
+}
+
static bool has_same_uncharge_info(struct file_region *rg,
struct file_region *org)
{
@@ -321,6 +335,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
prg->to = rg->to;
list_del(&rg->link);
+ put_uncharge_info(rg);
kfree(rg);
rg = prg;
@@ -332,10 +347,29 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
nrg->from = rg->from;
list_del(&rg->link);
+ put_uncharge_info(rg);
kfree(rg);
}
}
+static inline long
+hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
+ long to, struct hstate *h, struct hugetlb_cgroup *cg,
+ long *regions_needed)
+{
+ struct file_region *nrg;
+
+ if (!regions_needed) {
+ nrg = get_file_region_entry_from_cache(map, from, to);
+ record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(map, nrg);
+ } else
+ *regions_needed += 1;
+
+ return to - from;
+}
+
/*
* Must be called with resv->lock held.
*
@@ -351,7 +385,7 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
long add = 0;
struct list_head *head = &resv->regions;
long last_accounted_offset = f;
- struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+ struct file_region *rg = NULL, *trg = NULL;
if (regions_needed)
*regions_needed = 0;
@@ -374,24 +408,17 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
/* When we find a region that starts beyond our range, we've
* finished.
*/
- if (rg->from > t)
+ if (rg->from >= t)
break;
/* Add an entry for last_accounted_offset -> rg->from, and
* update last_accounted_offset.
*/
- if (rg->from > last_accounted_offset) {
- add += rg->from - last_accounted_offset;
- if (!regions_needed) {
- nrg = get_file_region_entry_from_cache(
- resv, last_accounted_offset, rg->from);
- record_hugetlb_cgroup_uncharge_info(h_cg, h,
- resv, nrg);
- list_add(&nrg->link, rg->link.prev);
- coalesce_file_region(resv, nrg);
- } else
- *regions_needed += 1;
- }
+ if (rg->from > last_accounted_offset)
+ add += hugetlb_resv_map_add(resv, rg,
+ last_accounted_offset,
+ rg->from, h, h_cg,
+ regions_needed);
last_accounted_offset = rg->to;
}
@@ -399,17 +426,9 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
/* Handle the case where our range extends beyond
* last_accounted_offset.
*/
- if (last_accounted_offset < t) {
- add += t - last_accounted_offset;
- if (!regions_needed) {
- nrg = get_file_region_entry_from_cache(
- resv, last_accounted_offset, t);
- record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
- list_add(&nrg->link, rg->link.prev);
- coalesce_file_region(resv, nrg);
- } else
- *regions_needed += 1;
- }
+ if (last_accounted_offset < t)
+ add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
+ t, h, h_cg, regions_needed);
VM_BUG_ON(add < 0);
return add;
@@ -664,7 +683,7 @@ retry:
del += t - f;
hugetlb_cgroup_uncharge_file_region(
- resv, rg, t - f);
+ resv, rg, t - f, false);
/* New entry for end of split region */
nrg->from = t;
@@ -685,7 +704,7 @@ retry:
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
del += rg->to - rg->from;
hugetlb_cgroup_uncharge_file_region(resv, rg,
- rg->to - rg->from);
+ rg->to - rg->from, true);
list_del(&rg->link);
kfree(rg);
continue;
@@ -693,13 +712,13 @@ retry:
if (f <= rg->from) { /* Trim beginning of region */
hugetlb_cgroup_uncharge_file_region(resv, rg,
- t - rg->from);
+ t - rg->from, false);
del += t - rg->from;
rg->from = t;
} else { /* Trim end of region */
hugetlb_cgroup_uncharge_file_region(resv, rg,
- rg->to - f);
+ rg->to - f, false);
del += rg->to - f;
rg->to = f;
@@ -1043,7 +1062,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
- SetPageHugeFreed(page);
+ SetHPageFreed(page);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -1060,7 +1079,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
- ClearPageHugeFreed(page);
+ ClearHPageFreed(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
@@ -1133,7 +1152,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
- SetPagePrivate(page);
+ SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
@@ -1224,8 +1243,7 @@ static void destroy_compound_gigantic_page(struct page *page,
struct page *p = page + 1;
atomic_set(compound_mapcount_ptr(page), 0);
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
clear_compound_head(p);
@@ -1312,14 +1330,16 @@ static inline void destroy_compound_gigantic_page(struct page *page,
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
+ struct page *subpage = page;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
- for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ for (i = 0; i < pages_per_huge_page(h);
+ i++, subpage = mem_map_next(subpage, page, i)) {
+ subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
@@ -1353,52 +1373,6 @@ struct hstate *size_to_hstate(unsigned long size)
return NULL;
}
-/*
- * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
- * to hstate->hugepage_activelist.)
- *
- * This function can be called for tail pages, but never returns true for them.
- */
-bool page_huge_active(struct page *page)
-{
- return PageHeadHuge(page) && PagePrivate(&page[1]);
-}
-
-/* never called for tail page */
-void set_page_huge_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
- SetPagePrivate(&page[1]);
-}
-
-static void clear_page_huge_active(struct page *page)
-{
- VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
- ClearPagePrivate(&page[1]);
-}
-
-/*
- * Internal hugetlb specific page flag. Do not use outside of the hugetlb
- * code
- */
-static inline bool PageHugeTemporary(struct page *page)
-{
- if (!PageHuge(page))
- return false;
-
- return (unsigned long)page[2].mapping == -1U;
-}
-
-static inline void SetPageHugeTemporary(struct page *page)
-{
- page[2].mapping = (void *)-1U;
-}
-
-static inline void ClearPageHugeTemporary(struct page *page)
-{
- page[2].mapping = NULL;
-}
-
static void __free_huge_page(struct page *page)
{
/*
@@ -1407,24 +1381,23 @@ static void __free_huge_page(struct page *page)
*/
struct hstate *h = page_hstate(page);
int nid = page_to_nid(page);
- struct hugepage_subpool *spool =
- (struct hugepage_subpool *)page_private(page);
+ struct hugepage_subpool *spool = hugetlb_page_subpool(page);
bool restore_reserve;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
- set_page_private(page, 0);
+ hugetlb_set_page_subpool(page, NULL);
page->mapping = NULL;
- restore_reserve = PagePrivate(page);
- ClearPagePrivate(page);
+ restore_reserve = HPageRestoreReserve(page);
+ ClearHPageRestoreReserve(page);
/*
- * If PagePrivate() was set on page, page allocation consumed a
+ * If HPageRestoreReserve was set on page, page allocation consumed a
* reservation. If the page was associated with a subpool, there
* would have been a page reserved in the subpool before allocation
* via hugepage_subpool_get_pages(). Since we are 'restoring' the
- * reservtion, do not call hugepage_subpool_put_pages() as this will
+ * reservation, do not call hugepage_subpool_put_pages() as this will
* remove the reserved page from the subpool.
*/
if (!restore_reserve) {
@@ -1439,7 +1412,7 @@ static void __free_huge_page(struct page *page)
}
spin_lock(&hugetlb_lock);
- clear_page_huge_active(page);
+ ClearHPageMigratable(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
@@ -1447,9 +1420,9 @@ static void __free_huge_page(struct page *page)
if (restore_reserve)
h->resv_huge_pages++;
- if (PageHugeTemporary(page)) {
+ if (HPageTemporary(page)) {
list_del(&page->lru);
- ClearPageHugeTemporary(page);
+ ClearHPageTemporary(page);
update_and_free_page(h, page);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
@@ -1516,12 +1489,13 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+ hugetlb_set_page_subpool(page, NULL);
set_hugetlb_cgroup(page, NULL);
set_hugetlb_cgroup_rsvd(page, NULL);
spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
- ClearPageHugeFreed(page);
+ ClearHPageFreed(page);
spin_unlock(&hugetlb_lock);
}
@@ -1553,9 +1527,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
-
- if (hpage_pincount_available(page))
- atomic_set(compound_pincount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
}
/*
@@ -1794,7 +1766,7 @@ retry:
* We should make sure that the page is already on the free list
* when it is dissolved.
*/
- if (unlikely(!PageHugeFreed(head))) {
+ if (unlikely(!HPageFreed(head))) {
spin_unlock(&hugetlb_lock);
cond_resched();
@@ -1885,7 +1857,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
* codeflow
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
- SetPageHugeTemporary(page);
+ SetHPageTemporary(page);
spin_unlock(&hugetlb_lock);
put_page(page);
return NULL;
@@ -1916,7 +1888,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
* We do not account these pages as surplus because they are only
* temporary and will be released properly on the last reference
*/
- SetPageHugeTemporary(page);
+ SetHPageTemporary(page);
return page;
}
@@ -2254,24 +2226,24 @@ static long vma_add_reservation(struct hstate *h,
* This routine is called to restore a reservation on error paths. In the
* specific error paths, a huge page was allocated (via alloc_huge_page)
* and is about to be freed. If a reservation for the page existed,
- * alloc_huge_page would have consumed the reservation and set PagePrivate
- * in the newly allocated page. When the page is freed via free_huge_page,
- * the global reservation count will be incremented if PagePrivate is set.
- * However, free_huge_page can not adjust the reserve map. Adjust the
- * reserve map here to be consistent with global reserve count adjustments
- * to be made by free_huge_page.
+ * alloc_huge_page would have consumed the reservation and set
+ * HPageRestoreReserve in the newly allocated page. When the page is freed
+ * via free_huge_page, the global reservation count will be incremented if
+ * HPageRestoreReserve is set. However, free_huge_page can not adjust the
+ * reserve map. Adjust the reserve map here to be consistent with global
+ * reserve count adjustments to be made by free_huge_page.
*/
static void restore_reserve_on_error(struct hstate *h,
struct vm_area_struct *vma, unsigned long address,
struct page *page)
{
- if (unlikely(PagePrivate(page))) {
+ if (unlikely(HPageRestoreReserve(page))) {
long rc = vma_needs_reservation(h, vma, address);
if (unlikely(rc < 0)) {
/*
* Rare out of memory condition in reserve map
- * manipulation. Clear PagePrivate so that
+ * manipulation. Clear HPageRestoreReserve so that
* global reserve count will not be incremented
* by free_huge_page. This will make it appear
* as though the reservation for this page was
@@ -2280,7 +2252,7 @@ static void restore_reserve_on_error(struct hstate *h,
* is better than inconsistent global huge page
* accounting of reserve counts.
*/
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
} else if (rc) {
rc = vma_add_reservation(h, vma, address);
if (unlikely(rc < 0))
@@ -2288,7 +2260,7 @@ static void restore_reserve_on_error(struct hstate *h,
* See above comment about rare out of
* memory condition.
*/
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
} else
vma_end_reservation(h, vma, address);
}
@@ -2369,7 +2341,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
if (!page)
goto out_uncharge_cgroup;
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
- SetPagePrivate(page);
+ SetHPageRestoreReserve(page);
h->resv_huge_pages--;
}
spin_lock(&hugetlb_lock);
@@ -2387,7 +2359,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock);
- set_page_private(page, (unsigned long)spool);
+ hugetlb_set_page_subpool(page, spool);
map_commit = vma_commit_reservation(h, vma, addr);
if (unlikely(map_chg > map_commit)) {
@@ -2476,7 +2448,7 @@ static void __init gather_bootmem_prealloc(void)
struct hstate *h = m->hstate;
WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, h->order);
+ prep_compound_huge_page(page, huge_page_order(h));
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* free it into the hugepage allocator */
@@ -2488,7 +2460,7 @@ static void __init gather_bootmem_prealloc(void)
* side-effects, like CommitLimit going negative.
*/
if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, 1 << h->order);
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
@@ -2520,7 +2492,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
if (hstate_is_gigantic(h)) {
if (hugetlb_cma_size) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
- break;
+ goto free;
}
if (!alloc_bootmem_huge_page(h))
break;
@@ -2538,7 +2510,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
-
+free:
kfree(node_alloc_noretry);
}
@@ -2988,8 +2960,10 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
return -ENOMEM;
retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
- if (retval)
+ if (retval) {
kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ }
return retval;
}
@@ -3159,6 +3133,9 @@ static int __init hugetlb_init(void)
{
int i;
+ BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
+ __NR_HPAGEFLAGS);
+
if (!hugepages_supported()) {
if (hugetlb_max_hstate || default_hstate_max_huge_pages)
pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
@@ -3239,7 +3216,7 @@ void __init hugetlb_add_hstate(unsigned int order)
BUG_ON(order == 0);
h = &hstates[hugetlb_max_hstate++];
h->order = order;
- h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
+ h->mask = ~(huge_page_size(h) - 1);
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
@@ -3408,8 +3385,7 @@ static unsigned int allowed_mems_nr(struct hstate *h)
mpol_allowed = policy_nodemask_current(gfp_mask);
for_each_node_mask(node, cpuset_current_mems_allowed) {
- if (!mpol_allowed ||
- (mpol_allowed && node_isset(node, *mpol_allowed)))
+ if (!mpol_allowed || node_isset(node, *mpol_allowed))
nr += array[node];
}
@@ -3515,7 +3491,7 @@ void hugetlb_report_meminfo(struct seq_file *m)
for_each_hstate(h) {
unsigned long count = h->nr_huge_pages;
- total += (PAGE_SIZE << huge_page_order(h)) * count;
+ total += huge_page_size(h) * count;
if (h == &default_hstate)
seq_printf(m,
@@ -3528,10 +3504,10 @@ void hugetlb_report_meminfo(struct seq_file *m)
h->free_huge_pages,
h->resv_huge_pages,
h->surplus_huge_pages,
- (PAGE_SIZE << huge_page_order(h)) / 1024);
+ huge_page_size(h) / SZ_1K);
}
- seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
+ seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K);
}
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
@@ -3565,7 +3541,7 @@ void hugetlb_show_meminfo(void)
h->nr_huge_pages_node[nid],
h->free_huge_pages_node[nid],
h->surplus_huge_pages_node[nid],
- 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
+ huge_page_size(h) / SZ_1K);
}
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
@@ -3589,6 +3565,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
{
int ret = -ENOMEM;
+ if (!delta)
+ return 0;
+
spin_lock(&hugetlb_lock);
/*
* When cpuset is configured, it breaks the strict hugetlb page
@@ -3685,15 +3664,13 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
- struct hstate *hstate = hstate_vma(vma);
-
- return 1UL << huge_page_shift(hstate);
+ return huge_page_size(hstate_vma(vma));
}
/*
* We cannot handle pagefaults against hugetlb pages at all. They cause
* handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get
+ * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get
* this far.
*/
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
@@ -3772,21 +3749,32 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
return false;
}
+static void
+hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
+ struct page *new_page)
+{
+ __SetPageUptodate(new_page);
+ set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
+ hugepage_add_new_anon_rmap(new_page, vma, addr);
+ hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
+ ClearHPageRestoreReserve(new_page);
+ SetHPageMigratable(new_page);
+}
+
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma)
{
pte_t *src_pte, *dst_pte, entry, dst_entry;
struct page *ptepage;
unsigned long addr;
- int cow;
+ bool cow = is_cow_mapping(vma->vm_flags);
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ unsigned long npages = pages_per_huge_page(h);
struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
- cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-
if (cow) {
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
vma->vm_start,
@@ -3831,6 +3819,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
entry = huge_ptep_get(src_pte);
dst_entry = huge_ptep_get(dst_pte);
+again:
if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
/*
* Skip if src entry none. Also, skip in the
@@ -3854,6 +3843,52 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
}
set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
} else {
+ entry = huge_ptep_get(src_pte);
+ ptepage = pte_page(entry);
+ get_page(ptepage);
+
+ /*
+ * This is a rare case where we see pinned hugetlb
+ * pages while they're prone to COW. We need to do the
+ * COW earlier during fork.
+ *
+ * When pre-allocating the page or copying data, we
+ * need to be without the pgtable locks since we could
+ * sleep during the process.
+ */
+ if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+ pte_t src_pte_old = entry;
+ struct page *new;
+
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ /* Do not use reserve as it's private owned */
+ new = alloc_huge_page(vma, addr, 1);
+ if (IS_ERR(new)) {
+ put_page(ptepage);
+ ret = PTR_ERR(new);
+ break;
+ }
+ copy_user_huge_page(new, ptepage, addr, vma,
+ npages);
+ put_page(ptepage);
+
+ /* Install the new huge page if src pte stable */
+ dst_ptl = huge_pte_lock(h, dst, dst_pte);
+ src_ptl = huge_pte_lockptr(h, src, src_pte);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ entry = huge_ptep_get(src_pte);
+ if (!pte_same(src_pte_old, entry)) {
+ put_page(new);
+ /* dst_entry won't change as in child */
+ goto again;
+ }
+ hugetlb_install_page(vma, dst_pte, addr, new);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ continue;
+ }
+
if (cow) {
/*
* No need to notify as we are downgrading page
@@ -3864,12 +3899,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
}
- entry = huge_ptep_get(src_pte);
- ptepage = pte_page(entry);
- get_page(ptepage);
+
page_dup_rmap(ptepage, true);
set_huge_pte_at(dst, addr, dst_pte, entry);
- hugetlb_count_add(pages_per_huge_page(h), dst);
+ hugetlb_count_add(npages, dst);
}
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
@@ -4017,7 +4050,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
/*
* This is called when the original mapper is failing to COW a MAP_PRIVATE
- * mappping it owns the reserve page for. The intention is to unmap the page
+ * mapping it owns the reserve page for. The intention is to unmap the page
* from other VMAs and let the children be SIGKILLed if they are faulting the
* same region.
*/
@@ -4196,7 +4229,7 @@ retry_avoidcopy:
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) {
- ClearPagePrivate(new_page);
+ ClearHPageRestoreReserve(new_page);
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
@@ -4205,7 +4238,7 @@ retry_avoidcopy:
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
- set_page_huge_active(new_page);
+ SetHPageMigratable(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -4263,7 +4296,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
if (err)
return err;
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
/*
* set page dirty so that it will not be removed from cache/file
@@ -4425,7 +4458,7 @@ retry:
goto backout;
if (anon_rmap) {
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, vma, haddr);
} else
page_dup_rmap(page, true);
@@ -4442,12 +4475,12 @@ retry:
spin_unlock(ptl);
/*
- * Only make newly allocated pages active. Existing pages found
- * in the pagecache could be !page_huge_active() if they have been
- * isolated for migration.
+ * Only set HPageMigratable in newly allocated pages. Existing pages
+ * found in the pagecache may not have HPageMigratable set if they have
+ * been isolated for migration.
*/
if (new_page)
- set_page_huge_active(page);
+ SetHPageMigratable(page);
unlock_page(page);
out:
@@ -4477,7 +4510,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
}
#else
/*
- * For uniprocesor systems we always use a single mutex, so just
+ * For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
@@ -4739,7 +4772,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
if (vm_shared) {
page_dup_rmap(page, true);
} else {
- ClearPagePrivate(page);
+ ClearHPageRestoreReserve(page);
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
}
@@ -4758,7 +4791,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
- set_page_huge_active(page);
+ SetHPageMigratable(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4773,6 +4806,20 @@ out_release_nounlock:
goto out;
}
+static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
+ int refs, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ int nr;
+
+ for (nr = 0; nr < refs; nr++) {
+ if (likely(pages))
+ pages[nr] = mem_map_offset(page, nr);
+ if (vmas)
+ vmas[nr] = vma;
+ }
+}
+
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
@@ -4782,7 +4829,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long vaddr = *position;
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
- int err = -EFAULT;
+ int err = -EFAULT, refs;
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
@@ -4902,20 +4949,29 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
continue;
}
-same_page:
+ refs = min3(pages_per_huge_page(h) - pfn_offset,
+ (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
+
+ if (pages || vmas)
+ record_subpages_vmas(mem_map_offset(page, pfn_offset),
+ vma, refs,
+ likely(pages) ? pages + i : NULL,
+ vmas ? vmas + i : NULL);
+
if (pages) {
- pages[i] = mem_map_offset(page, pfn_offset);
/*
- * try_grab_page() should always succeed here, because:
- * a) we hold the ptl lock, and b) we've just checked
- * that the huge page is present in the page tables. If
- * the huge page is present, then the tail pages must
- * also be present. The ptl prevents the head page and
- * tail pages from being rearranged in any way. So this
- * page must be available at this point, unless the page
- * refcount overflowed:
+ * try_grab_compound_head() should always succeed here,
+ * because: a) we hold the ptl lock, and b) we've just
+ * checked that the huge page is present in the page
+ * tables. If the huge page is present, then the tail
+ * pages must also be present. The ptl prevents the
+ * head page and tail pages from being rearranged in
+ * any way. So this page must be available at this
+ * point, unless the page refcount overflowed:
*/
- if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+ if (WARN_ON_ONCE(!try_grab_compound_head(pages[i],
+ refs,
+ flags))) {
spin_unlock(ptl);
remainder = 0;
err = -ENOMEM;
@@ -4923,21 +4979,10 @@ same_page:
}
}
- if (vmas)
- vmas[i] = vma;
-
- vaddr += PAGE_SIZE;
- ++pfn_offset;
- --remainder;
- ++i;
- if (vaddr < vma->vm_end && remainder &&
- pfn_offset < pages_per_huge_page(h)) {
- /*
- * We use pfn_offset to avoid touching the pageframes
- * of this compound page.
- */
- goto same_page;
- }
+ vaddr += (refs << PAGE_SHIFT);
+ remainder -= refs;
+ i += refs;
+
spin_unlock(ptl);
}
*nr_pages = remainder;
@@ -5051,12 +5096,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
return pages << h->order;
}
-int hugetlb_reserve_pages(struct inode *inode,
+/* Return true if reservation was successful, false otherwise. */
+bool hugetlb_reserve_pages(struct inode *inode,
long from, long to,
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long ret, chg, add = -1;
+ long chg, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
@@ -5066,7 +5112,7 @@ int hugetlb_reserve_pages(struct inode *inode,
/* This should never happen */
if (from > to) {
VM_WARN(1, "%s called with a negative range\n", __func__);
- return -EINVAL;
+ return false;
}
/*
@@ -5075,7 +5121,7 @@ int hugetlb_reserve_pages(struct inode *inode,
* without using reserves
*/
if (vm_flags & VM_NORESERVE)
- return 0;
+ return true;
/*
* Shared mappings base their reservation on the number of pages that
@@ -5097,7 +5143,7 @@ int hugetlb_reserve_pages(struct inode *inode,
/* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
- return -ENOMEM;
+ return false;
chg = to - from;
@@ -5105,18 +5151,12 @@ int hugetlb_reserve_pages(struct inode *inode,
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
- if (chg < 0) {
- ret = chg;
+ if (chg < 0)
goto out_err;
- }
- ret = hugetlb_cgroup_charge_cgroup_rsvd(
- hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
-
- if (ret < 0) {
- ret = -ENOMEM;
+ if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
+ chg * pages_per_huge_page(h), &h_cg) < 0)
goto out_err;
- }
if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
@@ -5131,19 +5171,15 @@ int hugetlb_reserve_pages(struct inode *inode,
* reservations already in place (gbl_reserve).
*/
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
- if (gbl_reserve < 0) {
- ret = -ENOSPC;
+ if (gbl_reserve < 0)
goto out_uncharge_cgroup;
- }
/*
* Check enough hugepages are available for the reservation.
* Hand the pages back to the subpool if there are not
*/
- ret = hugetlb_acct_memory(h, gbl_reserve);
- if (ret < 0) {
+ if (hugetlb_acct_memory(h, gbl_reserve) < 0)
goto out_put_pages;
- }
/*
* Account for the reservations made. Shared mappings record regions
@@ -5161,7 +5197,6 @@ int hugetlb_reserve_pages(struct inode *inode,
if (unlikely(add < 0)) {
hugetlb_acct_memory(h, -gbl_reserve);
- ret = add;
goto out_put_pages;
} else if (unlikely(chg > add)) {
/*
@@ -5173,6 +5208,10 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
long rsv_adjust;
+ /*
+ * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
+ * reference to h_cg->css. See comment below for detail.
+ */
hugetlb_cgroup_uncharge_cgroup_rsvd(
hstate_index(h),
(chg - add) * pages_per_huge_page(h), h_cg);
@@ -5180,9 +5219,18 @@ int hugetlb_reserve_pages(struct inode *inode,
rsv_adjust = hugepage_subpool_put_pages(spool,
chg - add);
hugetlb_acct_memory(h, -rsv_adjust);
+ } else if (h_cg) {
+ /*
+ * The file_regions will hold their own reference to
+ * h_cg->css. So we should release the reference held
+ * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
+ * done.
+ */
+ hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
- return 0;
+ return true;
+
out_put_pages:
/* put back original number of pages, chg */
(void)hugepage_subpool_put_pages(spool, chg);
@@ -5198,7 +5246,7 @@ out_err:
region_abort(resv_map, from, to, regions_needed);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
- return ret;
+ return false;
}
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
@@ -5259,7 +5307,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
*/
if (pmd_index(addr) != pmd_index(saddr) ||
vm_flags != svm_flags ||
- sbase < svma->vm_start || svma->vm_end < s_end)
+ !range_in_vma(svma, sbase, s_end))
return 0;
return saddr;
@@ -5286,21 +5334,23 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long a_start, a_end;
+ unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
+ v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
- if (!(vma->vm_flags & VM_MAYSHARE))
+ /*
+ * vma needs to span at least one aligned PUD size, and the start,end
+ * range must be at least partially within it.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
+ (*end <= v_start) || (*start >= v_end))
return;
/* Extend the range to be PUD aligned for a worst case scenario */
- a_start = ALIGN_DOWN(*start, PUD_SIZE);
- a_end = ALIGN(*end, PUD_SIZE);
+ if (*start > v_start)
+ *start = ALIGN_DOWN(*start, PUD_SIZE);
- /*
- * Intersect the range with the vma range, since pmd sharing won't be
- * across vma after all
- */
- *start = max(vma->vm_start, a_start);
- *end = min(vma->vm_end, a_end);
+ if (*end < v_end)
+ *end = ALIGN(*end, PUD_SIZE);
}
/*
@@ -5583,12 +5633,13 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
bool ret = true;
spin_lock(&hugetlb_lock);
- if (!PageHeadHuge(page) || !page_huge_active(page) ||
+ if (!PageHeadHuge(page) ||
+ !HPageMigratable(page) ||
!get_page_unless_zero(page)) {
ret = false;
goto unlock;
}
- clear_page_huge_active(page);
+ ClearHPageMigratable(page);
list_move_tail(&page->lru, list);
unlock:
spin_unlock(&hugetlb_lock);
@@ -5597,9 +5648,8 @@ unlock:
void putback_active_hugepage(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
- set_page_huge_active(page);
+ SetHPageMigratable(page);
list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
spin_unlock(&hugetlb_lock);
put_page(page);
@@ -5622,12 +5672,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
* here as well otherwise the global surplus count will not match
* the per-node's.
*/
- if (PageHugeTemporary(newpage)) {
+ if (HPageTemporary(newpage)) {
int old_nid = page_to_nid(oldpage);
int new_nid = page_to_nid(newpage);
- SetPageHugeTemporary(oldpage);
- ClearPageHugeTemporary(newpage);
+ SetHPageTemporary(oldpage);
+ ClearHPageTemporary(newpage);
spin_lock(&hugetlb_lock);
if (h->surplus_huge_pages_node[old_nid]) {
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 9182848dda3e..603a131e262d 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -113,7 +113,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
rsvd_parent);
limit = round_down(PAGE_COUNTER_MAX,
- 1 << huge_page_order(&hstates[idx]));
+ pages_per_huge_page(&hstates[idx]));
ret = page_counter_set_max(
hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx),
@@ -391,7 +391,8 @@ void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
struct file_region *rg,
- unsigned long nr_pages)
+ unsigned long nr_pages,
+ bool region_del)
{
if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
return;
@@ -400,7 +401,12 @@ void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
!resv->reservation_counter) {
page_counter_uncharge(rg->reservation_counter,
nr_pages * resv->pages_per_hpage);
- css_put(rg->css);
+ /*
+ * Only do css_put(rg->css) when we delete the entire region
+ * because one file_region must hold exactly one css reference.
+ */
+ if (region_del)
+ css_put(rg->css);
}
}
@@ -460,7 +466,7 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
counter = &h_cg->hugepage[idx];
limit = round_down(PAGE_COUNTER_MAX,
- 1 << huge_page_order(&hstates[idx]));
+ pages_per_huge_page(&hstates[idx]));
switch (MEMFILE_ATTR(cft->private)) {
case RES_RSVD_USAGE:
@@ -507,7 +513,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return ret;
idx = MEMFILE_IDX(of_cft(of)->private);
- nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
+ nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));
switch (MEMFILE_ATTR(of_cft(of)->private)) {
case RES_RSVD_LIMIT:
diff --git a/mm/internal.h b/mm/internal.h
index 25d2b2439f19..1432feec62df 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -60,8 +60,8 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
force_page_cache_ra(&ractl, &file->f_ra, nr_to_read);
}
-struct page *find_get_entry(struct address_space *mapping, pgoff_t index);
-struct page *find_lock_entry(struct address_space *mapping, pgoff_t index);
+unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
+ pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
/**
* page_evictable - test whether a page is evictable
@@ -296,11 +296,6 @@ static inline unsigned int buddy_order(struct page *page)
*/
#define buddy_order_unsafe(page) READ_ONCE(page_private(page))
-static inline bool is_cow_mapping(vm_flags_t flags)
-{
- return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-}
-
/*
* These three helpers classifies VMAs for virtual memory accounting.
*/
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index b25167664ead..b5e08d4cefec 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -60,7 +60,7 @@ void kasan_disable_current(void)
void __kasan_unpoison_range(const void *address, size_t size)
{
- unpoison_range(address, size);
+ kasan_unpoison(address, size);
}
#if CONFIG_KASAN_STACK
@@ -69,7 +69,7 @@ void kasan_unpoison_task_stack(struct task_struct *task)
{
void *base = task_stack_page(task);
- unpoison_range(base, THREAD_SIZE);
+ kasan_unpoison(base, THREAD_SIZE);
}
/* Unpoison the stack for the current task beyond a watermark sp value. */
@@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
*/
void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
- unpoison_range(base, watermark - base);
+ kasan_unpoison(base, watermark - base);
}
#endif /* CONFIG_KASAN_STACK */
@@ -105,18 +105,17 @@ void __kasan_alloc_pages(struct page *page, unsigned int order)
if (unlikely(PageHighMem(page)))
return;
- tag = random_tag();
+ tag = kasan_random_tag();
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- unpoison_range(page_address(page), PAGE_SIZE << order);
+ kasan_unpoison(page_address(page), PAGE_SIZE << order);
}
void __kasan_free_pages(struct page *page, unsigned int order)
{
if (likely(!PageHighMem(page)))
- poison_range(page_address(page),
- PAGE_SIZE << order,
- KASAN_FREE_PAGE);
+ kasan_poison(page_address(page), PAGE_SIZE << order,
+ KASAN_FREE_PAGE);
}
/*
@@ -211,6 +210,11 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
*size = optimal_size;
}
+void __kasan_cache_create_kmalloc(struct kmem_cache *cache)
+{
+ cache->kasan_info.is_kmalloc = true;
+}
+
size_t __kasan_metadata_size(struct kmem_cache *cache)
{
if (!kasan_stack_collection_enabled())
@@ -246,18 +250,19 @@ void __kasan_poison_slab(struct page *page)
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
- poison_range(page_address(page), page_size(page),
+ kasan_poison(page_address(page), page_size(page),
KASAN_KMALLOC_REDZONE);
}
void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
- unpoison_range(object, cache->object_size);
+ kasan_unpoison(object, cache->object_size);
}
void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
- poison_range(object, cache->object_size, KASAN_KMALLOC_REDZONE);
+ kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
+ KASAN_KMALLOC_REDZONE);
}
/*
@@ -274,27 +279,18 @@ void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
* based on objects indexes, so that objects that are next to each other
* get different tags.
*/
-static u8 assign_tag(struct kmem_cache *cache, const void *object,
- bool init, bool keep_tag)
+static inline u8 assign_tag(struct kmem_cache *cache,
+ const void *object, bool init)
{
if (IS_ENABLED(CONFIG_KASAN_GENERIC))
return 0xff;
/*
- * 1. When an object is kmalloc()'ed, two hooks are called:
- * kasan_slab_alloc() and kasan_kmalloc(). We assign the
- * tag only in the first one.
- * 2. We reuse the same tag for krealloc'ed objects.
- */
- if (keep_tag)
- return get_tag(object);
-
- /*
* If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
* set, assign a tag when the object is being allocated (init == false).
*/
if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
- return init ? KASAN_TAG_KERNEL : random_tag();
+ return init ? KASAN_TAG_KERNEL : kasan_random_tag();
/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
@@ -305,7 +301,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object,
* For SLUB assign a random tag during slab creation, otherwise reuse
* the already assigned tag.
*/
- return init ? random_tag() : get_tag(object);
+ return init ? kasan_random_tag() : get_tag(object);
#endif
}
@@ -321,13 +317,13 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
}
/* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
- object = set_tag(object, assign_tag(cache, object, true, false));
+ object = set_tag(object, assign_tag(cache, object, true));
return (void *)object;
}
-static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
- unsigned long ip, bool quarantine)
+static inline bool ____kasan_slab_free(struct kmem_cache *cache,
+ void *object, unsigned long ip, bool quarantine)
{
u8 tag;
void *tagged_object;
@@ -336,6 +332,9 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
tagged_object = object;
object = kasan_reset_tag(object);
+ if (is_kfence_address(object))
+ return false;
+
if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
object)) {
kasan_report_invalid_free(tagged_object, ip);
@@ -346,22 +345,21 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
return false;
- if (check_invalid_free(tagged_object)) {
+ if (!kasan_byte_accessible(tagged_object)) {
kasan_report_invalid_free(tagged_object, ip);
return true;
}
- poison_range(object, cache->object_size, KASAN_KMALLOC_FREE);
-
- if (!kasan_stack_collection_enabled())
- return false;
+ kasan_poison(object, round_up(cache->object_size, KASAN_GRANULE_SIZE),
+ KASAN_KMALLOC_FREE);
if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
return false;
- kasan_set_free_info(cache, object, tag);
+ if (kasan_stack_collection_enabled())
+ kasan_set_free_info(cache, object, tag);
- return quarantine_put(cache, object);
+ return kasan_quarantine_put(cache, object);
}
bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
@@ -369,6 +367,31 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
return ____kasan_slab_free(cache, object, ip, true);
}
+static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
+{
+ if (ptr != page_address(virt_to_head_page(ptr))) {
+ kasan_report_invalid_free(ptr, ip);
+ return true;
+ }
+
+ if (!kasan_byte_accessible(ptr)) {
+ kasan_report_invalid_free(ptr, ip);
+ return true;
+ }
+
+ /*
+ * The object will be poisoned by kasan_free_pages() or
+ * kasan_slab_free_mempool().
+ */
+
+ return false;
+}
+
+void __kasan_kfree_large(void *ptr, unsigned long ip)
+{
+ ____kasan_kfree_large(ptr, ip);
+}
+
void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
{
struct page *page;
@@ -382,88 +405,147 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
* KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
*/
if (unlikely(!PageSlab(page))) {
- if (ptr != page_address(page)) {
- kasan_report_invalid_free(ptr, ip);
+ if (____kasan_kfree_large(ptr, ip))
return;
- }
- poison_range(ptr, page_size(page), KASAN_FREE_PAGE);
+ kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE);
} else {
____kasan_slab_free(page->slab_cache, ptr, ip, false);
}
}
-static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+static void set_alloc_info(struct kmem_cache *cache, void *object,
+ gfp_t flags, bool is_kmalloc)
{
struct kasan_alloc_meta *alloc_meta;
+ /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */
+ if (cache->kasan_info.is_kmalloc && !is_kmalloc)
+ return;
+
alloc_meta = kasan_get_alloc_meta(cache, object);
if (alloc_meta)
kasan_set_track(&alloc_meta->alloc_track, flags);
}
-static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags, bool keep_tag)
+void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
+ void *object, gfp_t flags)
{
- unsigned long redzone_start;
- unsigned long redzone_end;
u8 tag;
+ void *tagged_object;
if (gfpflags_allow_blocking(flags))
- quarantine_reduce();
+ kasan_quarantine_reduce();
if (unlikely(object == NULL))
return NULL;
- redzone_start = round_up((unsigned long)(object + size),
- KASAN_GRANULE_SIZE);
- redzone_end = round_up((unsigned long)object + cache->object_size,
- KASAN_GRANULE_SIZE);
- tag = assign_tag(cache, object, false, keep_tag);
+ if (is_kfence_address(object))
+ return (void *)object;
- /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */
- unpoison_range(set_tag(object, tag), size);
- poison_range((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
+ /*
+ * Generate and assign random tag for tag-based modes.
+ * Tag is ignored in set_tag() for the generic mode.
+ */
+ tag = assign_tag(cache, object, false);
+ tagged_object = set_tag(object, tag);
+
+ /*
+ * Unpoison the whole object.
+ * For kmalloc() allocations, kasan_kmalloc() will do precise poisoning.
+ */
+ kasan_unpoison(tagged_object, cache->object_size);
+ /* Save alloc info (if possible) for non-kmalloc() allocations. */
if (kasan_stack_collection_enabled())
- set_alloc_info(cache, (void *)object, flags);
+ set_alloc_info(cache, (void *)object, flags, false);
- return set_tag(object, tag);
+ return tagged_object;
}
-void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
- void *object, gfp_t flags)
+static inline void *____kasan_kmalloc(struct kmem_cache *cache,
+ const void *object, size_t size, gfp_t flags)
{
- return ____kasan_kmalloc(cache, object, cache->object_size, flags, false);
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+
+ if (gfpflags_allow_blocking(flags))
+ kasan_quarantine_reduce();
+
+ if (unlikely(object == NULL))
+ return NULL;
+
+ if (is_kfence_address(kasan_reset_tag(object)))
+ return (void *)object;
+
+ /*
+ * The object has already been unpoisoned by kasan_slab_alloc() for
+ * kmalloc() or by kasan_krealloc() for krealloc().
+ */
+
+ /*
+ * The redzone has byte-level precision for the generic mode.
+ * Partially poison the last object granule to cover the unaligned
+ * part of the redzone.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ kasan_poison_last_granule((void *)object, size);
+
+ /* Poison the aligned part of the redzone. */
+ redzone_start = round_up((unsigned long)(object + size),
+ KASAN_GRANULE_SIZE);
+ redzone_end = round_up((unsigned long)(object + cache->object_size),
+ KASAN_GRANULE_SIZE);
+ kasan_poison((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_KMALLOC_REDZONE);
+
+ /*
+ * Save alloc info (if possible) for kmalloc() allocations.
+ * This also rewrites the alloc info when called from kasan_krealloc().
+ */
+ if (kasan_stack_collection_enabled())
+ set_alloc_info(cache, (void *)object, flags, true);
+
+ /* Keep the tag that was set by kasan_slab_alloc(). */
+ return (void *)object;
}
void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object,
size_t size, gfp_t flags)
{
- return ____kasan_kmalloc(cache, object, size, flags, true);
+ return ____kasan_kmalloc(cache, object, size, flags);
}
EXPORT_SYMBOL(__kasan_kmalloc);
void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
gfp_t flags)
{
- struct page *page;
unsigned long redzone_start;
unsigned long redzone_end;
if (gfpflags_allow_blocking(flags))
- quarantine_reduce();
+ kasan_quarantine_reduce();
if (unlikely(ptr == NULL))
return NULL;
- page = virt_to_page(ptr);
+ /*
+ * The object has already been unpoisoned by kasan_alloc_pages() for
+ * alloc_pages() or by kasan_krealloc() for krealloc().
+ */
+
+ /*
+ * The redzone has byte-level precision for the generic mode.
+ * Partially poison the last object granule to cover the unaligned
+ * part of the redzone.
+ */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ kasan_poison_last_granule(ptr, size);
+
+ /* Poison the aligned part of the redzone. */
redzone_start = round_up((unsigned long)(ptr + size),
KASAN_GRANULE_SIZE);
- redzone_end = (unsigned long)ptr + page_size(page);
-
- unpoison_range(ptr, size);
- poison_range((void *)redzone_start, redzone_end - redzone_start,
+ redzone_end = (unsigned long)ptr + page_size(virt_to_page(ptr));
+ kasan_poison((void *)redzone_start, redzone_end - redzone_start,
KASAN_PAGE_REDZONE);
return (void *)ptr;
@@ -476,18 +558,27 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
if (unlikely(object == ZERO_SIZE_PTR))
return (void *)object;
+ /*
+ * Unpoison the object's data.
+ * Part of it might already have been unpoisoned, but it's unknown
+ * how big that part is.
+ */
+ kasan_unpoison(object, size);
+
page = virt_to_head_page(object);
+ /* Piggy-back on kmalloc() instrumentation to poison the redzone. */
if (unlikely(!PageSlab(page)))
return __kasan_kmalloc_large(object, size, flags);
else
- return ____kasan_kmalloc(page->slab_cache, object, size,
- flags, true);
+ return ____kasan_kmalloc(page->slab_cache, object, size, flags);
}
-void __kasan_kfree_large(void *ptr, unsigned long ip)
+bool __kasan_check_byte(const void *address, unsigned long ip)
{
- if (ptr != page_address(virt_to_head_page(ptr)))
- kasan_report_invalid_free(ptr, ip);
- /* The object will be poisoned by kasan_free_pages(). */
+ if (!kasan_byte_accessible(address)) {
+ kasan_report((unsigned long)address, 1, false, ip);
+ return false;
+ }
+ return true;
}
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 5106b84b07d4..2e55e0f82f39 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -14,6 +14,7 @@
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
+#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/linkage.h>
#include <linux/memblock.h>
@@ -158,7 +159,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
return memory_is_poisoned_n(addr, size);
}
-static __always_inline bool check_memory_region_inline(unsigned long addr,
+static __always_inline bool check_region_inline(unsigned long addr,
size_t size, bool write,
unsigned long ret_ip)
{
@@ -179,37 +180,37 @@ static __always_inline bool check_memory_region_inline(unsigned long addr,
return !kasan_report(addr, size, write, ret_ip);
}
-bool check_memory_region(unsigned long addr, size_t size, bool write,
- unsigned long ret_ip)
+bool kasan_check_range(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip)
{
- return check_memory_region_inline(addr, size, write, ret_ip);
+ return check_region_inline(addr, size, write, ret_ip);
}
-bool check_invalid_free(void *addr)
+bool kasan_byte_accessible(const void *addr)
{
s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
- return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE;
+ return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE;
}
void kasan_cache_shrink(struct kmem_cache *cache)
{
- quarantine_remove_cache(cache);
+ kasan_quarantine_remove_cache(cache);
}
void kasan_cache_shutdown(struct kmem_cache *cache)
{
if (!__kmem_cache_empty(cache))
- quarantine_remove_cache(cache);
+ kasan_quarantine_remove_cache(cache);
}
static void register_global(struct kasan_global *global)
{
size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE);
- unpoison_range(global->beg, global->size);
+ kasan_unpoison(global->beg, global->size);
- poison_range(global->beg + aligned_size,
+ kasan_poison(global->beg + aligned_size,
global->size_with_redzone - aligned_size,
KASAN_GLOBAL_REDZONE);
}
@@ -231,7 +232,7 @@ EXPORT_SYMBOL(__asan_unregister_globals);
#define DEFINE_ASAN_LOAD_STORE(size) \
void __asan_load##size(unsigned long addr) \
{ \
- check_memory_region_inline(addr, size, false, _RET_IP_);\
+ check_region_inline(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_load##size); \
__alias(__asan_load##size) \
@@ -239,7 +240,7 @@ EXPORT_SYMBOL(__asan_unregister_globals);
EXPORT_SYMBOL(__asan_load##size##_noabort); \
void __asan_store##size(unsigned long addr) \
{ \
- check_memory_region_inline(addr, size, true, _RET_IP_); \
+ check_region_inline(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__asan_store##size); \
__alias(__asan_store##size) \
@@ -254,7 +255,7 @@ DEFINE_ASAN_LOAD_STORE(16);
void __asan_loadN(unsigned long addr, size_t size)
{
- check_memory_region(addr, size, false, _RET_IP_);
+ kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__asan_loadN);
@@ -264,7 +265,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort);
void __asan_storeN(unsigned long addr, size_t size)
{
- check_memory_region(addr, size, true, _RET_IP_);
+ kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__asan_storeN);
@@ -290,11 +291,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
- unpoison_range((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
- poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ kasan_unpoison((const void *)(addr + rounded_down_size),
+ size - rounded_down_size);
+ kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
KASAN_ALLOCA_LEFT);
- poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
KASAN_ALLOCA_RIGHT);
}
EXPORT_SYMBOL(__asan_alloca_poison);
@@ -305,7 +306,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
if (unlikely(!stack_top || stack_top > stack_bottom))
return;
- unpoison_range(stack_top, stack_bottom - stack_top);
+ kasan_unpoison(stack_top, stack_bottom - stack_top);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
@@ -331,7 +332,7 @@ void kasan_record_aux_stack(void *addr)
struct kasan_alloc_meta *alloc_meta;
void *object;
- if (!(page && PageSlab(page)))
+ if (is_kfence_address(addr) || !(page && PageSlab(page)))
return;
cache = page->slab_cache;
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index d558799b25b3..2aad21fda156 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -48,7 +48,7 @@ EXPORT_SYMBOL(kasan_flag_enabled);
/* Whether to collect alloc/free stack traces. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
-/* Whether panic or disable tag checking on fault. */
+/* Whether to panic or print a report and disable tag checking on fault. */
bool kasan_flag_panic __ro_after_init;
/* kasan=off/on */
@@ -185,3 +185,19 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
return &alloc_meta->free_track[0];
}
+
+#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+void kasan_set_tagging_report_once(bool state)
+{
+ hw_set_tagging_report_once(state);
+}
+EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once);
+
+void kasan_enable_tagging(void)
+{
+ hw_enable_tagging();
+}
+EXPORT_SYMBOL_GPL(kasan_enable_tagging);
+
+#endif
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8c706e7652f2..8c55634d6edd 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -3,6 +3,7 @@
#define __MM_KASAN_KASAN_H
#include <linux/kasan.h>
+#include <linux/kfence.h>
#include <linux/stackdepot.h>
#ifdef CONFIG_KASAN_HW_TAGS
@@ -36,6 +37,12 @@ extern bool kasan_flag_panic __ro_after_init;
#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
+#ifdef CONFIG_KASAN_HW_TAGS
+#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */
+#else
+#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */
+#endif
+
#ifdef CONFIG_KASAN_GENERIC
#define KASAN_FREE_PAGE 0xFF /* page was freed */
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
@@ -195,14 +202,14 @@ static inline bool addr_has_metadata(const void *addr)
}
/**
- * check_memory_region - Check memory region, and report if invalid access.
+ * kasan_check_range - Check memory region, and report if invalid access.
* @addr: the accessed address
* @size: the accessed size
* @write: true if access is a write access
* @ret_ip: return address
* @return: true if access was valid, false if invalid
*/
-bool check_memory_region(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(unsigned long addr, size_t size, bool write,
unsigned long ret_ip);
#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
@@ -215,19 +222,19 @@ static inline bool addr_has_metadata(const void *addr)
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
-void print_tags(u8 addr_tag, const void *addr);
+void kasan_print_tags(u8 addr_tag, const void *addr);
#else
-static inline void print_tags(u8 addr_tag, const void *addr) { }
+static inline void kasan_print_tags(u8 addr_tag, const void *addr) { }
#endif
-void *find_first_bad_addr(void *addr, size_t size);
-const char *get_bug_type(struct kasan_access_info *info);
-void metadata_fetch_row(char *buffer, void *row);
+void *kasan_find_first_bad_addr(void *addr, size_t size);
+const char *kasan_get_bug_type(struct kasan_access_info *info);
+void kasan_metadata_fetch_row(char *buffer, void *row);
#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK
-void print_address_stack_frame(const void *addr);
+void kasan_print_address_stack_frame(const void *addr);
#else
-static inline void print_address_stack_frame(const void *addr) { }
+static inline void kasan_print_address_stack_frame(const void *addr) { }
#endif
bool kasan_report(unsigned long addr, size_t size,
@@ -244,13 +251,13 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
-bool quarantine_put(struct kmem_cache *cache, void *object);
-void quarantine_reduce(void);
-void quarantine_remove_cache(struct kmem_cache *cache);
+bool kasan_quarantine_put(struct kmem_cache *cache, void *object);
+void kasan_quarantine_reduce(void);
+void kasan_quarantine_remove_cache(struct kmem_cache *cache);
#else
-static inline bool quarantine_put(struct kmem_cache *cache, void *object) { return false; }
-static inline void quarantine_reduce(void) { }
-static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+static inline bool kasan_quarantine_put(struct kmem_cache *cache, void *object) { return false; }
+static inline void kasan_quarantine_reduce(void) { }
+static inline void kasan_quarantine_remove_cache(struct kmem_cache *cache) { }
#endif
#ifndef arch_kasan_set_tag
@@ -274,6 +281,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifndef arch_init_tags
#define arch_init_tags(max_tag)
#endif
+#ifndef arch_set_tagging_report_once
+#define arch_set_tagging_report_once(state)
+#endif
#ifndef arch_get_random_tag
#define arch_get_random_tag() (0xFF)
#endif
@@ -286,51 +296,129 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_enable_tagging() arch_enable_tagging()
#define hw_init_tags(max_tag) arch_init_tags(max_tag)
+#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state)
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag))
+#else /* CONFIG_KASAN_HW_TAGS */
+
+#define hw_enable_tagging()
+#define hw_set_tagging_report_once(state)
+
#endif /* CONFIG_KASAN_HW_TAGS */
+#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST)
+
+void kasan_set_tagging_report_once(bool state);
+void kasan_enable_tagging(void);
+
+#else /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
+
+static inline void kasan_set_tagging_report_once(bool state) { }
+static inline void kasan_enable_tagging(void) { }
+
+#endif /* CONFIG_KASAN_HW_TAGS && CONFIG_KASAN_KUNIT_TEST */
+
#ifdef CONFIG_KASAN_SW_TAGS
-u8 random_tag(void);
+u8 kasan_random_tag(void);
#elif defined(CONFIG_KASAN_HW_TAGS)
-static inline u8 random_tag(void) { return hw_get_random_tag(); }
+static inline u8 kasan_random_tag(void) { return hw_get_random_tag(); }
#else
-static inline u8 random_tag(void) { return 0; }
+static inline u8 kasan_random_tag(void) { return 0; }
#endif
#ifdef CONFIG_KASAN_HW_TAGS
-static inline void poison_range(const void *address, size_t size, u8 value)
+static inline void kasan_poison(const void *addr, size_t size, u8 value)
{
- hw_set_mem_tag_range(kasan_reset_tag(address),
- round_up(size, KASAN_GRANULE_SIZE), value);
+ addr = kasan_reset_tag(addr);
+
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(addr))
+ return;
+
+ if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
+ return;
+ if (WARN_ON(size & KASAN_GRANULE_MASK))
+ return;
+
+ hw_set_mem_tag_range((void *)addr, size, value);
}
-static inline void unpoison_range(const void *address, size_t size)
+static inline void kasan_unpoison(const void *addr, size_t size)
{
- hw_set_mem_tag_range(kasan_reset_tag(address),
- round_up(size, KASAN_GRANULE_SIZE), get_tag(address));
+ u8 tag = get_tag(addr);
+
+ addr = kasan_reset_tag(addr);
+
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(addr))
+ return;
+
+ if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
+ return;
+ size = round_up(size, KASAN_GRANULE_SIZE);
+
+ hw_set_mem_tag_range((void *)addr, size, tag);
}
-static inline bool check_invalid_free(void *addr)
+static inline bool kasan_byte_accessible(const void *addr)
{
u8 ptr_tag = get_tag(addr);
- u8 mem_tag = hw_get_mem_tag(addr);
+ u8 mem_tag = hw_get_mem_tag((void *)addr);
- return (mem_tag == KASAN_TAG_INVALID) ||
- (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag);
+ return (mem_tag != KASAN_TAG_INVALID) &&
+ (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag);
}
#else /* CONFIG_KASAN_HW_TAGS */
-void poison_range(const void *address, size_t size, u8 value);
-void unpoison_range(const void *address, size_t size);
-bool check_invalid_free(void *addr);
+/**
+ * kasan_poison - mark the memory range as inaccessible
+ * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size - range size, must be aligned to KASAN_GRANULE_SIZE
+ * @value - value that's written to metadata for the range
+ *
+ * A misaligned @addr or @size triggers a warning and leaves the range as is.
+ */
+void kasan_poison(const void *addr, size_t size, u8 value);
+
+/**
+ * kasan_unpoison - mark the memory range as accessible
+ * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size - range size, can be unaligned
+ *
+ * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before
+ * marking the range.
+ * For the generic mode, the last granule of the memory range gets partially
+ * unpoisoned based on the @size.
+ */
+void kasan_unpoison(const void *addr, size_t size);
+
+bool kasan_byte_accessible(const void *addr);
#endif /* CONFIG_KASAN_HW_TAGS */
+#ifdef CONFIG_KASAN_GENERIC
+
+/**
+ * kasan_poison_last_granule - mark the last granule of the memory range as
+ * inaccessible
+ * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE
+ * @size - range size
+ *
+ * This function is only available for the generic mode, as it's the only mode
+ * that has partially poisoned memory granules.
+ */
+void kasan_poison_last_granule(const void *address, size_t size);
+
+#else /* CONFIG_KASAN_GENERIC */
+
+static inline void kasan_poison_last_granule(const void *address, size_t size) { }
+
+#endif /* CONFIG_KASAN_GENERIC */
+
/*
* Exported functions for interfaces called from assembly or from generated
* code. Declarations here to avoid warning about missing declarations.
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 55783125a767..728fb24c5683 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -168,7 +168,7 @@ static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
qlist_init(q);
}
-bool quarantine_put(struct kmem_cache *cache, void *object)
+bool kasan_quarantine_put(struct kmem_cache *cache, void *object)
{
unsigned long flags;
struct qlist_head *q;
@@ -184,11 +184,11 @@ bool quarantine_put(struct kmem_cache *cache, void *object)
/*
* Note: irq must be disabled until after we move the batch to the
- * global quarantine. Otherwise quarantine_remove_cache() can miss
- * some objects belonging to the cache if they are in our local temp
- * list. quarantine_remove_cache() executes on_each_cpu() at the
- * beginning which ensures that it either sees the objects in per-cpu
- * lists or in the global quarantine.
+ * global quarantine. Otherwise kasan_quarantine_remove_cache() can
+ * miss some objects belonging to the cache if they are in our local
+ * temp list. kasan_quarantine_remove_cache() executes on_each_cpu()
+ * at the beginning which ensures that it either sees the objects in
+ * per-cpu lists or in the global quarantine.
*/
local_irq_save(flags);
@@ -222,7 +222,7 @@ bool quarantine_put(struct kmem_cache *cache, void *object)
return true;
}
-void quarantine_reduce(void)
+void kasan_quarantine_reduce(void)
{
size_t total_size, new_quarantine_size, percpu_quarantines;
unsigned long flags;
@@ -234,7 +234,7 @@ void quarantine_reduce(void)
return;
/*
- * srcu critical section ensures that quarantine_remove_cache()
+ * srcu critical section ensures that kasan_quarantine_remove_cache()
* will not miss objects belonging to the cache while they are in our
* local to_free list. srcu is chosen because (1) it gives us private
* grace period domain that does not interfere with anything else,
@@ -309,15 +309,15 @@ static void per_cpu_remove_cache(void *arg)
}
/* Free all quarantined objects belonging to cache. */
-void quarantine_remove_cache(struct kmem_cache *cache)
+void kasan_quarantine_remove_cache(struct kmem_cache *cache)
{
unsigned long flags, i;
struct qlist_head to_free = QLIST_INIT;
/*
* Must be careful to not miss any objects that are being moved from
- * per-cpu list to the global quarantine in quarantine_put(),
- * nor objects being freed in quarantine_reduce(). on_each_cpu()
+ * per-cpu list to the global quarantine in kasan_quarantine_put(),
+ * nor objects being freed in kasan_quarantine_reduce(). on_each_cpu()
* achieves the first goal, while synchronize_srcu() achieves the
* second.
*/
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index c0fb21797550..87b271206163 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -25,6 +25,7 @@
#include <linux/module.h>
#include <linux/sched/task_stack.h>
#include <linux/uaccess.h>
+#include <trace/events/error_report.h>
#include <asm/sections.h>
@@ -61,7 +62,7 @@ __setup("kasan_multi_shot", kasan_set_multi_shot);
static void print_error_description(struct kasan_access_info *info)
{
pr_err("BUG: KASAN: %s in %pS\n",
- get_bug_type(info), (void *)info->ip);
+ kasan_get_bug_type(info), (void *)info->ip);
if (info->access_size)
pr_err("%s of size %zu at addr %px by task %s/%d\n",
info->is_write ? "Write" : "Read", info->access_size,
@@ -84,8 +85,9 @@ static void start_report(unsigned long *flags)
pr_err("==================================================================\n");
}
-static void end_report(unsigned long *flags)
+static void end_report(unsigned long *flags, unsigned long addr)
{
+ trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
spin_unlock_irqrestore(&report_lock, *flags);
@@ -247,7 +249,7 @@ static void print_address_description(void *addr, u8 tag)
dump_page(page, "kasan: bad access detected");
}
- print_address_stack_frame(addr);
+ kasan_print_address_stack_frame(addr);
}
static bool meta_row_is_guilty(const void *row, const void *addr)
@@ -293,7 +295,7 @@ static void print_memory_metadata(const void *addr)
* function, because generic functions may try to
* access kasan mapping for the passed address.
*/
- metadata_fetch_row(&metadata[0], row);
+ kasan_metadata_fetch_row(&metadata[0], row);
print_hex_dump(KERN_ERR, buffer,
DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1,
@@ -331,7 +333,7 @@ static void kasan_update_kunit_status(struct kunit *cur_test)
}
kasan_data = (struct kunit_kasan_expectation *)resource->data;
- kasan_data->report_found = true;
+ WRITE_ONCE(kasan_data->report_found, true);
kunit_put_resource(resource);
}
#endif /* IS_ENABLED(CONFIG_KUNIT) */
@@ -350,12 +352,12 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
start_report(&flags);
pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- print_tags(tag, object);
+ kasan_print_tags(tag, object);
pr_err("\n");
print_address_description(object, tag);
pr_err("\n");
print_memory_metadata(object);
- end_report(&flags);
+ end_report(&flags, (unsigned long)object);
}
static void __kasan_report(unsigned long addr, size_t size, bool is_write,
@@ -378,7 +380,8 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
info.access_addr = tagged_addr;
if (addr_has_metadata(untagged_addr))
- info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
+ info.first_bad_addr =
+ kasan_find_first_bad_addr(tagged_addr, size);
else
info.first_bad_addr = untagged_addr;
info.access_size = size;
@@ -389,7 +392,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
print_error_description(&info);
if (addr_has_metadata(untagged_addr))
- print_tags(get_tag(tagged_addr), info.first_bad_addr);
+ kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr);
pr_err("\n");
if (addr_has_metadata(untagged_addr)) {
@@ -400,7 +403,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
dump_stack();
}
- end_report(&flags);
+ end_report(&flags, addr);
}
bool kasan_report(unsigned long addr, size_t size, bool is_write,
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
index 8a9c889872da..41f374585144 100644
--- a/mm/kasan/report_generic.c
+++ b/mm/kasan/report_generic.c
@@ -30,7 +30,7 @@
#include "kasan.h"
#include "../slab.h"
-void *find_first_bad_addr(void *addr, size_t size)
+void *kasan_find_first_bad_addr(void *addr, size_t size)
{
void *p = addr;
@@ -105,7 +105,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info)
return bug_type;
}
-const char *get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_access_info *info)
{
/*
* If access_size is a negative number, then it has reason to be
@@ -123,7 +123,7 @@ const char *get_bug_type(struct kasan_access_info *info)
return get_wild_bug_type(info);
}
-void metadata_fetch_row(char *buffer, void *row)
+void kasan_metadata_fetch_row(char *buffer, void *row)
{
memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
}
@@ -263,7 +263,7 @@ static bool __must_check get_address_stack_frame_info(const void *addr,
return true;
}
-void print_address_stack_frame(const void *addr)
+void kasan_print_address_stack_frame(const void *addr)
{
unsigned long offset;
const char *frame_descr;
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
index 57114f0e14d1..42b2168755d6 100644
--- a/mm/kasan/report_hw_tags.c
+++ b/mm/kasan/report_hw_tags.c
@@ -15,17 +15,17 @@
#include "kasan.h"
-const char *get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_access_info *info)
{
return "invalid-access";
}
-void *find_first_bad_addr(void *addr, size_t size)
+void *kasan_find_first_bad_addr(void *addr, size_t size)
{
return kasan_reset_tag(addr);
}
-void metadata_fetch_row(char *buffer, void *row)
+void kasan_metadata_fetch_row(char *buffer, void *row)
{
int i;
@@ -33,7 +33,7 @@ void metadata_fetch_row(char *buffer, void *row)
buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE);
}
-void print_tags(u8 addr_tag, const void *addr)
+void kasan_print_tags(u8 addr_tag, const void *addr)
{
u8 memory_tag = hw_get_mem_tag((void *)addr);
diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c
index 1b026793ad57..3d20d3451d9e 100644
--- a/mm/kasan/report_sw_tags.c
+++ b/mm/kasan/report_sw_tags.c
@@ -29,7 +29,7 @@
#include "kasan.h"
#include "../slab.h"
-const char *get_bug_type(struct kasan_access_info *info)
+const char *kasan_get_bug_type(struct kasan_access_info *info)
{
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
struct kasan_alloc_meta *alloc_meta;
@@ -72,7 +72,7 @@ const char *get_bug_type(struct kasan_access_info *info)
return "invalid-access";
}
-void *find_first_bad_addr(void *addr, size_t size)
+void *kasan_find_first_bad_addr(void *addr, size_t size)
{
u8 tag = get_tag(addr);
void *p = kasan_reset_tag(addr);
@@ -83,12 +83,12 @@ void *find_first_bad_addr(void *addr, size_t size)
return p;
}
-void metadata_fetch_row(char *buffer, void *row)
+void kasan_metadata_fetch_row(char *buffer, void *row)
{
memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
}
-void print_tags(u8 addr_tag, const void *addr)
+void kasan_print_tags(u8 addr_tag, const void *addr)
{
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 7c2c08c55f32..63f43443f5d7 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -13,6 +13,7 @@
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
+#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/memory.h>
#include <linux/mm.h>
@@ -27,20 +28,20 @@
bool __kasan_check_read(const volatile void *p, unsigned int size)
{
- return check_memory_region((unsigned long)p, size, false, _RET_IP_);
+ return kasan_check_range((unsigned long)p, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_read);
bool __kasan_check_write(const volatile void *p, unsigned int size)
{
- return check_memory_region((unsigned long)p, size, true, _RET_IP_);
+ return kasan_check_range((unsigned long)p, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__kasan_check_write);
#undef memset
void *memset(void *addr, int c, size_t len)
{
- if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
+ if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_))
return NULL;
return __memset(addr, c, len);
@@ -50,8 +51,8 @@ void *memset(void *addr, int c, size_t len)
#undef memmove
void *memmove(void *dest, const void *src, size_t len)
{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+ !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
return NULL;
return __memmove(dest, src, len);
@@ -61,18 +62,14 @@ void *memmove(void *dest, const void *src, size_t len)
#undef memcpy
void *memcpy(void *dest, const void *src, size_t len)
{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) ||
+ !kasan_check_range((unsigned long)dest, len, true, _RET_IP_))
return NULL;
return __memcpy(dest, src, len);
}
-/*
- * Poisons the shadow memory for 'size' bytes starting from 'addr'.
- * Memory addresses should be aligned to KASAN_GRANULE_SIZE.
- */
-void poison_range(const void *address, size_t size, u8 value)
+void kasan_poison(const void *addr, size_t size, u8 value)
{
void *shadow_start, *shadow_end;
@@ -81,36 +78,62 @@ void poison_range(const void *address, size_t size, u8 value)
* some of the callers (e.g. kasan_poison_object_data) pass tagged
* addresses to this function.
*/
- address = kasan_reset_tag(address);
- size = round_up(size, KASAN_GRANULE_SIZE);
+ addr = kasan_reset_tag(addr);
- shadow_start = kasan_mem_to_shadow(address);
- shadow_end = kasan_mem_to_shadow(address + size);
+ /* Skip KFENCE memory if called explicitly outside of sl*b. */
+ if (is_kfence_address(addr))
+ return;
+
+ if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
+ return;
+ if (WARN_ON(size & KASAN_GRANULE_MASK))
+ return;
+
+ shadow_start = kasan_mem_to_shadow(addr);
+ shadow_end = kasan_mem_to_shadow(addr + size);
__memset(shadow_start, value, shadow_end - shadow_start);
}
+EXPORT_SYMBOL(kasan_poison);
-void unpoison_range(const void *address, size_t size)
+#ifdef CONFIG_KASAN_GENERIC
+void kasan_poison_last_granule(const void *addr, size_t size)
{
- u8 tag = get_tag(address);
+ if (size & KASAN_GRANULE_MASK) {
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(addr + size);
+ *shadow = size & KASAN_GRANULE_MASK;
+ }
+}
+#endif
+
+void kasan_unpoison(const void *addr, size_t size)
+{
+ u8 tag = get_tag(addr);
/*
* Perform shadow offset calculation based on untagged address, as
* some of the callers (e.g. kasan_unpoison_object_data) pass tagged
* addresses to this function.
*/
- address = kasan_reset_tag(address);
+ addr = kasan_reset_tag(addr);
+
+ /*
+ * Skip KFENCE memory if called explicitly outside of sl*b. Also note
+ * that calls to ksize(), where size is not a multiple of machine-word
+ * size, would otherwise poison the invalid portion of the word.
+ */
+ if (is_kfence_address(addr))
+ return;
- poison_range(address, size, tag);
+ if (WARN_ON((unsigned long)addr & KASAN_GRANULE_MASK))
+ return;
- if (size & KASAN_GRANULE_MASK) {
- u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+ /* Unpoison all granules that cover the object. */
+ kasan_poison(addr, round_up(size, KASAN_GRANULE_SIZE), tag);
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- *shadow = tag;
- else /* CONFIG_KASAN_GENERIC */
- *shadow = size & KASAN_GRANULE_MASK;
- }
+ /* Partially poison the last granule for the generic mode. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ kasan_poison_last_granule(addr, size);
}
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -286,7 +309,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
* // vmalloc() allocates memory
* // let a = area->addr
* // we reach kasan_populate_vmalloc
- * // and call unpoison_range:
+ * // and call kasan_unpoison:
* STORE shadow(a), unpoison_val
* ...
* STORE shadow(a+99), unpoison_val x = LOAD p
@@ -321,7 +344,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size)
return;
size = round_up(size, KASAN_GRANULE_SIZE);
- poison_range(start, size, KASAN_VMALLOC_INVALID);
+ kasan_poison(start, size, KASAN_VMALLOC_INVALID);
}
void kasan_unpoison_vmalloc(const void *start, unsigned long size)
@@ -329,7 +352,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size)
if (!is_vmalloc_or_module_addr(start))
return;
- unpoison_range(start, size);
+ kasan_unpoison(start, size);
}
static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c
index 5dcd830805b2..94c2d33be333 100644
--- a/mm/kasan/sw_tags.c
+++ b/mm/kasan/sw_tags.c
@@ -57,7 +57,7 @@ void __init kasan_init_sw_tags(void)
* sequence has in fact positive effect, since interrupts that randomly skew
* PRNG at unpredictable points do only good.
*/
-u8 random_tag(void)
+u8 kasan_random_tag(void)
{
u32 state = this_cpu_read(prng_state);
@@ -67,7 +67,7 @@ u8 random_tag(void)
return (u8)(state % (KASAN_TAG_MAX + 1));
}
-bool check_memory_region(unsigned long addr, size_t size, bool write,
+bool kasan_check_range(unsigned long addr, size_t size, bool write,
unsigned long ret_ip)
{
u8 tag;
@@ -118,24 +118,24 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
return true;
}
-bool check_invalid_free(void *addr)
+bool kasan_byte_accessible(const void *addr)
{
u8 tag = get_tag(addr);
u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
- return (shadow_byte == KASAN_TAG_INVALID) ||
- (tag != KASAN_TAG_KERNEL && tag != shadow_byte);
+ return (shadow_byte != KASAN_TAG_INVALID) &&
+ (tag == KASAN_TAG_KERNEL || tag == shadow_byte);
}
#define DEFINE_HWASAN_LOAD_STORE(size) \
void __hwasan_load##size##_noabort(unsigned long addr) \
{ \
- check_memory_region(addr, size, false, _RET_IP_); \
+ kasan_check_range(addr, size, false, _RET_IP_); \
} \
EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
void __hwasan_store##size##_noabort(unsigned long addr) \
{ \
- check_memory_region(addr, size, true, _RET_IP_); \
+ kasan_check_range(addr, size, true, _RET_IP_); \
} \
EXPORT_SYMBOL(__hwasan_store##size##_noabort)
@@ -147,19 +147,19 @@ DEFINE_HWASAN_LOAD_STORE(16);
void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
{
- check_memory_region(addr, size, false, _RET_IP_);
+ kasan_check_range(addr, size, false, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_loadN_noabort);
void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
{
- check_memory_region(addr, size, true, _RET_IP_);
+ kasan_check_range(addr, size, true, _RET_IP_);
}
EXPORT_SYMBOL(__hwasan_storeN_noabort);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
{
- poison_range((void *)addr, size, tag);
+ kasan_poison((void *)addr, size, tag);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
new file mode 100644
index 000000000000..6872cd5e5390
--- /dev/null
+++ b/mm/kfence/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_KFENCE) := core.o report.o
+
+CFLAGS_kfence_test.o := -g -fno-omit-frame-pointer -fno-optimize-sibling-calls
+obj-$(CONFIG_KFENCE_KUNIT_TEST) += kfence_test.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
new file mode 100644
index 000000000000..d53c91f881a4
--- /dev/null
+++ b/mm/kfence/core.c
@@ -0,0 +1,850 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE guarded object allocator and fault handling.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kfence: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kfence.h>
+#include <linux/kmemleak.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/memblock.h>
+#include <linux/moduleparam.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/*
+ * Like WARN_ON(), but additionally clears kfence_enabled when the condition
+ * fires, on the assumption that the error is irrecoverable. Evaluates to the
+ * result of WARN_ON(cond), so it can be used directly as a condition.
+ */
+#define KFENCE_WARN_ON(cond) \
+ ({ \
+ const bool __cond = WARN_ON(cond); \
+ if (unlikely(__cond)) \
+ WRITE_ONCE(kfence_enabled, false); \
+ __cond; \
+ })
+
+/* === Data ================================================================= */
+
+static bool kfence_enabled __read_mostly;
+
+static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kfence."
+
+/*
+ * Setter for the "kfence.sample_interval" parameter. Parses @val as an
+ * unsigned long and stores it in the variable behind @kp->arg. Writing 0
+ * switches KFENCE off. A non-zero value is rejected with -EINVAL when KFENCE
+ * is currently off and the system is no longer booting. Parse errors from
+ * kstrtoul() are returned unchanged.
+ */
+static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
+{
+	unsigned long interval;
+	int err;
+
+	err = kstrtoul(val, 0, &interval);
+	if (err < 0)
+		return err;
+
+	if (interval == 0) {
+		/* Using 0 to indicate KFENCE is disabled. */
+		WRITE_ONCE(kfence_enabled, false);
+	} else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING) {
+		/* Cannot (re-)enable KFENCE on-the-fly. */
+		return -EINVAL;
+	}
+
+	*((unsigned long *)kp->arg) = interval;
+	return 0;
+}
+
+/*
+ * Getter for the sample interval parameter: prints "0" whenever KFENCE is
+ * disabled, otherwise defers to param_get_ulong() for the stored value.
+ */
+static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
+{
+	const bool enabled = READ_ONCE(kfence_enabled);
+
+	if (enabled)
+		return param_get_ulong(buffer, kp);
+
+	return sprintf(buffer, "0\n");
+}
+
+static const struct kernel_param_ops sample_interval_param_ops = {
+ .set = param_set_sample_interval,
+ .get = param_get_sample_interval,
+};
+module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+
+/* The pool of pages used for guard pages and objects. */
+char *__kfence_pool __ro_after_init;
+EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
+
+/*
+ * Per-object metadata, with one-to-one mapping of object metadata to
+ * backing pages (in __kfence_pool).
+ */
+static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
+struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* Freelist with available objects. */
+static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
+static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
+
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* The static key to set up a KFENCE allocation. */
+DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
+#endif
+
+/* Gates the allocation, ensuring only one succeeds in a given period. */
+atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+
+/* Statistics counters for debugfs. */
+enum kfence_counter_id {
+ KFENCE_COUNTER_ALLOCATED,
+ KFENCE_COUNTER_ALLOCS,
+ KFENCE_COUNTER_FREES,
+ KFENCE_COUNTER_ZOMBIES,
+ KFENCE_COUNTER_BUGS,
+ KFENCE_COUNTER_COUNT,
+};
+static atomic_long_t counters[KFENCE_COUNTER_COUNT];
+static const char *const counter_names[] = {
+ [KFENCE_COUNTER_ALLOCATED] = "currently allocated",
+ [KFENCE_COUNTER_ALLOCS] = "total allocations",
+ [KFENCE_COUNTER_FREES] = "total frees",
+ [KFENCE_COUNTER_ZOMBIES] = "zombie allocations",
+ [KFENCE_COUNTER_BUGS] = "total bugs",
+};
+static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
+
+/* === Internals ============================================================ */
+
+/*
+ * Protect the page containing @addr. Returns true on success; on failure a
+ * warning is raised through KFENCE_WARN_ON() (which also disables KFENCE) and
+ * false is returned.
+ */
+static bool kfence_protect(unsigned long addr)
+{
+	const unsigned long page = ALIGN_DOWN(addr, PAGE_SIZE);
+	const bool failed = KFENCE_WARN_ON(!kfence_protect_page(page, true));
+
+	return !failed;
+}
+
+/*
+ * Unprotect the page containing @addr. Returns true on success; on failure a
+ * warning is raised through KFENCE_WARN_ON() (which also disables KFENCE) and
+ * false is returned.
+ */
+static bool kfence_unprotect(unsigned long addr)
+{
+	const unsigned long page = ALIGN_DOWN(addr, PAGE_SIZE);
+	const bool failed = KFENCE_WARN_ON(!kfence_protect_page(page, false));
+
+	return !failed;
+}
+
+/*
+ * Map @addr to the metadata of the object slot it belongs to. Returns NULL if
+ * @addr is not a KFENCE address, or if the computed slot index is outside
+ * [0, CONFIG_KFENCE_NUM_OBJECTS).
+ *
+ * The checks do not affect performance; only called from slow-paths.
+ */
+static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
+{
+	const unsigned long pool_start = (unsigned long)__kfence_pool;
+	long slot;
+
+	if (!is_kfence_address((void *)addr))
+		return NULL;
+
+	/*
+	 * Each slot spans two pages, and the first two pages of the pool
+	 * belong to no slot, hence the "- 1". For an address at the edge of
+	 * __kfence_pool the result may be an invalid index, in which case we
+	 * would report an "invalid access" error.
+	 */
+	slot = (addr - pool_start) / (PAGE_SIZE * 2) - 1;
+	if (slot >= 0 && slot < CONFIG_KFENCE_NUM_OBJECTS)
+		return &kfence_metadata[slot];
+
+	return NULL;
+}
+
+/*
+ * Return the address of the pool page backing @meta, or 0 (after a
+ * KFENCE_WARN_ON()) if @meta does not point into kfence_metadata or if the
+ * address stored in @meta does not lie in the expected page.
+ *
+ * The checks do not affect performance; only called from slow-paths.
+ */
+static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
+{
+	const struct kfence_metadata *const first = kfence_metadata;
+	const struct kfence_metadata *const end = kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS;
+	unsigned long pageaddr;
+
+	/* Only call with a pointer into kfence_metadata. */
+	if (KFENCE_WARN_ON(meta < first || meta >= end))
+		return 0;
+
+	/* Slot i lives (i + 1) * 2 pages into the pool. */
+	pageaddr = (unsigned long)&__kfence_pool[(meta - first + 1) * PAGE_SIZE * 2];
+
+	/*
+	 * This metadata object only ever maps to 1 page; verify that the stored
+	 * address is in the expected range.
+	 */
+	if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
+		return 0;
+
+	return pageaddr;
+}
+
+/*
+ * Update the object's metadata state, including updating the alloc/free stacks
+ * depending on the state transition.
+ *
+ * A transition to KFENCE_OBJECT_FREED records into meta->free_track; every
+ * other transition records into meta->alloc_track. The caller must hold
+ * meta->lock.
+ */
+static noinline void metadata_update_state(struct kfence_metadata *meta,
+ enum kfence_object_state next)
+{
+ struct kfence_track *track =
+ next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
+
+ lockdep_assert_held(&meta->lock);
+
+ /*
+ * Skip over 1 function (this one); noinline ensures we do not
+ * accidentally skip over the caller by never inlining.
+ */
+ track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+ /* Record which task performed the transition. */
+ track->pid = task_pid_nr(current);
+
+ /*
+ * Pairs with READ_ONCE() in
+ * kfence_shutdown_cache(),
+ * kfence_handle_page_fault().
+ */
+ WRITE_ONCE(meta->state, next);
+}
+
+/*
+ * Store the address-dependent canary pattern at @addr. Always returns true so
+ * that for_each_canary() keeps iterating.
+ */
+static inline bool set_canary_byte(u8 *addr)
+{
+	const u8 pattern = KFENCE_CANARY_PATTERN(addr);
+
+	*addr = pattern;
+	return true;
+}
+
+/*
+ * Verify the canary byte at @addr. Returns true if it still holds the expected
+ * pattern; otherwise bumps the bug counter, reports a memory corruption for
+ * the object that @addr maps to, and returns false.
+ */
+static inline bool check_canary_byte(u8 *addr)
+{
+	if (unlikely(*addr != KFENCE_CANARY_PATTERN(addr))) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+		kfence_report_error((unsigned long)addr, false, NULL,
+				    addr_to_metadata((unsigned long)addr),
+				    KFENCE_ERROR_CORRUPTION);
+		return false;
+	}
+
+	return true;
+}
+
+/*
+ * Apply @fn to every canary byte of @meta's page, i.e. all bytes of the page
+ * that lie before the object and all bytes that lie after it. The caller must
+ * hold meta->lock.
+ *
+ * __always_inline this to ensure we won't do an indirect call to fn.
+ */
+static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
+{
+	const unsigned long page_start = ALIGN_DOWN(meta->addr, PAGE_SIZE);
+	const unsigned long page_end = page_start + PAGE_SIZE;
+	unsigned long pos;
+
+	lockdep_assert_held(&meta->lock);
+
+	/*
+	 * Each side is walked until fn() returns false. The right side is
+	 * still walked when the left side stopped early: if
+	 * check_canary_byte() generates an error, showing both sides might
+	 * give more clues as to what the error is about when displaying which
+	 * bytes were corrupted.
+	 */
+
+	/* Bytes to the left of the object. */
+	pos = page_start;
+	while (pos < meta->addr && fn((u8 *)pos))
+		pos++;
+
+	/* Bytes to the right of the object. */
+	pos = meta->addr + meta->size;
+	while (pos < page_end && fn((u8 *)pos))
+		pos++;
+}
+
+/*
+ * Allocate a guarded object of @size bytes on behalf of @cache.
+ *
+ * Takes the first entry off kfence_freelist, places the object either at the
+ * start of its page or (randomly) towards the end of the page aligned down to
+ * cache->align, fills the rest of the page with canary bytes and sets up the
+ * struct page fields read by the slab allocators. The object is zeroed if
+ * slab_want_init_on_alloc() says so, and the cache constructor is run if
+ * there is one.
+ *
+ * Returns the object address, or NULL if the freelist is empty or the
+ * object's lock could not be taken.
+ */
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+{
+ struct kfence_metadata *meta = NULL;
+ unsigned long flags;
+ struct page *page;
+ void *addr;
+
+ /* Try to obtain a free object. */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ if (!list_empty(&kfence_freelist)) {
+ meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
+ list_del_init(&meta->list);
+ }
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+ if (!meta)
+ return NULL;
+
+ if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
+ /*
+ * This is extremely unlikely -- we are reporting on a
+ * use-after-free, which locked meta->lock, and the reporting
+ * code via printk calls kmalloc() which ends up in
+ * kfence_alloc() and tries to grab the same object that we're
+ * reporting on. While it has never been observed, lockdep does
+ * report that there is a possibility of deadlock. Fix it by
+ * using trylock and bailing out gracefully.
+ */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ /* Put the object back on the freelist. */
+ list_add_tail(&meta->list, &kfence_freelist);
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+ return NULL;
+ }
+
+ /* Start from the beginning of the page (the "left" placement). */
+ meta->addr = metadata_to_pageaddr(meta);
+ /* Unprotect if we're reusing this page. */
+ if (meta->state == KFENCE_OBJECT_FREED)
+ kfence_unprotect(meta->addr);
+
+ /*
+ * Note: for allocations made before RNG initialization, will always
+ * return zero. We still benefit from enabling KFENCE as early as
+ * possible, even when the RNG is not yet available, as this will allow
+ * KFENCE to detect bugs due to earlier allocations. The only downside
+ * is that the out-of-bounds accesses detected are deterministic for
+ * such allocations.
+ */
+ if (prandom_u32_max(2)) {
+ /* Allocate on the "right" side, re-calculate address. */
+ meta->addr += PAGE_SIZE - size;
+ meta->addr = ALIGN_DOWN(meta->addr, cache->align);
+ }
+
+ addr = (void *)meta->addr;
+
+ /* Update remaining metadata. */
+ metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+ /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
+ WRITE_ONCE(meta->cache, cache);
+ meta->size = size;
+ /* meta->addr and meta->size must be final: they delimit the canaries. */
+ for_each_canary(meta, set_canary_byte);
+
+ /* Set required struct page fields. */
+ page = virt_to_page(meta->addr);
+ page->slab_cache = cache;
+ if (IS_ENABLED(CONFIG_SLUB))
+ page->objects = 1;
+ if (IS_ENABLED(CONFIG_SLAB))
+ page->s_mem = addr;
+
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ /* Memory initialization. */
+
+ /*
+ * We check slab_want_init_on_alloc() ourselves, rather than letting
+ * SL*B do the initialization, as otherwise we might overwrite KFENCE's
+ * redzone.
+ */
+ if (unlikely(slab_want_init_on_alloc(gfp, cache)))
+ memzero_explicit(addr, size);
+ if (cache->ctor)
+ cache->ctor(addr);
+
+ if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS))
+ kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
+
+ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
+ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
+
+ return addr;
+}
+
+/*
+ * Free the guarded object at @addr, described by @meta.
+ *
+ * If @meta is not in the allocated state, or @addr is not the start of the
+ * object, this is reported as an invalid free and nothing else happens.
+ * Otherwise the canary bytes are checked, the object is marked freed and its
+ * page is protected.
+ *
+ * With @zombie false the object is also cleared if slab_want_init_on_free()
+ * says so, and is put back on the freelist. With @zombie true (see
+ * kfence_shutdown_cache()) it stays off the freelist and only the zombie
+ * counter is incremented.
+ */
+static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
+{
+ struct kcsan_scoped_access assert_page_exclusive;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+
+ if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
+ /* Invalid or double-free, bail out. */
+ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+ kfence_report_error((unsigned long)addr, false, NULL, meta,
+ KFENCE_ERROR_INVALID_FREE);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+ return;
+ }
+
+ /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
+ kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
+ KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
+ &assert_page_exclusive);
+
+ if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
+ kfence_unprotect((unsigned long)addr); /* To check canary bytes. */
+
+ /* Restore page protection if there was an OOB access. */
+ if (meta->unprotected_page) {
+ kfence_protect(meta->unprotected_page);
+ meta->unprotected_page = 0;
+ }
+
+ /* Check canary bytes for memory corruption. */
+ for_each_canary(meta, check_canary_byte);
+
+ /*
+ * Clear memory if init-on-free is set. While we protect the page, the
+ * data is still there, and after a use-after-free is detected, we
+ * unprotect the page, so the data is still accessible.
+ */
+ if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
+ memzero_explicit(addr, meta->size);
+
+ /* Mark the object as freed. */
+ metadata_update_state(meta, KFENCE_OBJECT_FREED);
+
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ /* Protect to detect use-after-frees. */
+ kfence_protect((unsigned long)addr);
+
+ kcsan_end_scoped_access(&assert_page_exclusive);
+ if (!zombie) {
+ /* Add it to the tail of the freelist for reuse. */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ KFENCE_WARN_ON(!list_empty(&meta->list));
+ list_add_tail(&meta->list, &kfence_freelist);
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+ atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
+ atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
+ } else {
+ /* See kfence_shutdown_cache(). */
+ atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
+ }
+}
+
+/*
+ * RCU callback queued by __kfence_free(): recovers the metadata from its
+ * embedded rcu_head and performs the actual (non-zombie) free.
+ */
+static void rcu_guarded_free(struct rcu_head *h)
+{
+ struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);
+
+ kfence_guarded_free((void *)meta->addr, meta, false);
+}
+
+/*
+ * Set up the pool allocated by kfence_alloc_pool(): mark the object pages as
+ * slab pages, protect the guard pages, and put every metadata entry on the
+ * freelist.
+ *
+ * Returns true on success. Returns false if there is no pool; on any other
+ * failure the part of the pool from the current address onwards is handed
+ * back with memblock_free_late(), __kfence_pool is cleared and false is
+ * returned.
+ */
+static bool __init kfence_init_pool(void)
+{
+ unsigned long addr = (unsigned long)__kfence_pool;
+ struct page *pages;
+ int i;
+
+ if (!__kfence_pool)
+ return false;
+
+ if (!arch_kfence_init_pool())
+ goto err;
+
+ pages = virt_to_page(addr);
+
+ /*
+ * Set up object pages: they must have PG_slab set, to avoid freeing
+ * these as real pages.
+ *
+ * We also want to avoid inserting kfence_free() in the kfree()
+ * fast-path in SLUB, and therefore need to ensure kfree() correctly
+ * enters __slab_free() slow-path.
+ */
+ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+ /* Page 0 and all odd pages are guard pages, not object pages. */
+ if (!i || (i % 2))
+ continue;
+
+ /* Verify we do not have a compound head page. */
+ if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
+ goto err;
+
+ __SetPageSlab(&pages[i]);
+ }
+
+ /*
+ * Protect the first 2 pages. The first page is mostly unnecessary, and
+ * merely serves as an extended guard page. However, adding one
+ * additional page in the beginning gives us an even number of pages,
+ * which simplifies the mapping of address to metadata index.
+ */
+ for (i = 0; i < 2; i++) {
+ if (unlikely(!kfence_protect(addr)))
+ goto err;
+
+ addr += PAGE_SIZE;
+ }
+
+ /* From here on, addr walks the object pages, two pages per object. */
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ struct kfence_metadata *meta = &kfence_metadata[i];
+
+ /* Initialize metadata. */
+ INIT_LIST_HEAD(&meta->list);
+ raw_spin_lock_init(&meta->lock);
+ meta->state = KFENCE_OBJECT_UNUSED;
+ meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
+ list_add_tail(&meta->list, &kfence_freelist);
+
+ /* Protect the right redzone. */
+ if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
+ goto err;
+
+ addr += 2 * PAGE_SIZE;
+ }
+
+ /*
+ * The pool is live and will never be deallocated from this point on.
+ * Remove the pool object from the kmemleak object tree, as it would
+ * otherwise overlap with allocations returned by kfence_alloc(), which
+ * are registered with kmemleak through the slab post-alloc hook.
+ */
+ kmemleak_free(__kfence_pool);
+
+ return true;
+
+err:
+ /*
+ * Only release unprotected pages, and do not try to go back and change
+ * page attributes due to risk of failing to do so as well. If changing
+ * page attributes for some pages fails, it is very likely that it also
+ * fails for the first page, and therefore expect addr==__kfence_pool in
+ * most failure cases.
+ */
+ memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
+ __kfence_pool = NULL;
+ return false;
+}
+
+/* === DebugFS Interface ==================================================== */
+
+/*
+ * seq_file show callback for the "stats" debugfs file: prints whether KFENCE
+ * is enabled, followed by one "name: value" line per statistics counter.
+ */
+static int stats_show(struct seq_file *seq, void *v)
+{
+	int id = 0;
+
+	seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
+
+	while (id < KFENCE_COUNTER_COUNT) {
+		seq_printf(seq, "%s: %ld\n", counter_names[id], atomic_long_read(&counters[id]));
+		id++;
+	}
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(stats);
+
+/*
+ * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
+ * start_object() and next_object() return the object index + 1, because NULL is used
+ * to stop iteration.
+ */
+static void *start_object(struct seq_file *seq, loff_t *pos)
+{
+	/* Past the last object: nothing (more) to show. */
+	if (*pos >= CONFIG_KFENCE_NUM_OBJECTS)
+		return NULL;
+
+	/* Encode the index as index + 1 so that index 0 is not NULL. */
+	return (void *)((long)*pos + 1);
+}
+
+/* seq_file stop callback; deliberately empty. */
+static void stop_object(struct seq_file *seq, void *v)
+{
+}
+
+/*
+ * seq_file next callback: advances *pos by one and returns the new index + 1,
+ * or NULL once all CONFIG_KFENCE_NUM_OBJECTS objects have been visited.
+ */
+static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
+{
+	const loff_t next = ++*pos;
+
+	if (next >= CONFIG_KFENCE_NUM_OBJECTS)
+		return NULL;
+
+	return (void *)((long)next + 1);
+}
+
+/*
+ * seq_file show callback: prints the object whose index + 1 is encoded in @v,
+ * holding the object's lock while printing, followed by a separator line.
+ */
+static int show_object(struct seq_file *seq, void *v)
+{
+	const long index = (long)v - 1;
+	struct kfence_metadata *meta = &kfence_metadata[index];
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&meta->lock, flags);
+	kfence_print_object(seq, meta);
+	raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+	seq_puts(seq, "---------------------------------\n");
+	return 0;
+}
+
+static const struct seq_operations object_seqops = {
+ .start = start_object,
+ .next = next_object,
+ .stop = stop_object,
+ .show = show_object,
+};
+
+static int open_objects(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &object_seqops);
+}
+
+/*
+ * File operations for the "objects" debugfs file. open_objects() sets up the
+ * seq_file state with seq_open(), so .release must be seq_release() to free
+ * that state again when the file is closed; without it, the state allocated
+ * on every open() is never freed.
+ */
+static const struct file_operations objects_fops = {
+	.open = open_objects,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+/*
+ * Create the "kfence" debugfs directory with its "stats" and "objects" files.
+ * Always returns 0.
+ */
+static int __init kfence_debugfs_init(void)
+{
+	struct dentry *dir;
+
+	dir = debugfs_create_dir("kfence", NULL);
+	debugfs_create_file("stats", 0444, dir, NULL, &stats_fops);
+	debugfs_create_file("objects", 0400, dir, NULL, &objects_fops);
+
+	return 0;
+}
+
+late_initcall(kfence_debugfs_init);
+
+/* === Allocation Gate Timer ================================================ */
+
+/*
+ * Set up delayed work, which will enable and disable the static key. We need to
+ * use a work queue (rather than a simple timer), since enabling and disabling a
+ * static key cannot be done from an interrupt.
+ *
+ * Note: Toggling a static branch currently causes IPIs, and here we'll end up
+ * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
+ * more aggressive sampling intervals), we could get away with a variant that
+ * avoids IPIs, at the cost of not immediately capturing allocations if the
+ * instructions remain cached.
+ */
+static struct delayed_work kfence_timer;
+/*
+ * Work callback for kfence_timer. Opens the allocation gate by setting
+ * kfence_allocation_gate to 0 and re-arms itself to run again after
+ * kfence_sample_interval milliseconds. Does nothing, and does not re-arm,
+ * once KFENCE has been disabled.
+ */
+static void toggle_allocation_gate(struct work_struct *work)
+{
+ if (!READ_ONCE(kfence_enabled))
+ return;
+
+ /* Enable static key, and await allocation to happen. */
+ atomic_set(&kfence_allocation_gate, 0);
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+ static_branch_enable(&kfence_allocation_key);
+ /*
+ * Await an allocation. Timeout after 1 second, in case the kernel stops
+ * doing allocations, to avoid stalling this worker task for too long.
+ */
+ {
+ unsigned long end_wait = jiffies + HZ;
+
+ do {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ /* __kfence_alloc() makes the gate non-zero when it runs. */
+ if (atomic_read(&kfence_allocation_gate) != 0)
+ break;
+ schedule_timeout(1);
+ } while (time_before(jiffies, end_wait));
+ __set_current_state(TASK_RUNNING);
+ }
+ /* Disable static key and reset timer. */
+ static_branch_disable(&kfence_allocation_key);
+#endif
+ schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
+}
+static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
+
+/* === Public interface ===================================================== */
+
+/*
+ * Allocate the KFENCE pool from memblock, unless the sample interval is 0.
+ * On allocation failure an error is logged and __kfence_pool stays NULL.
+ */
+void __init kfence_alloc_pool(void)
+{
+	if (kfence_sample_interval == 0)
+		return;
+
+	__kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+	if (__kfence_pool)
+		return;
+
+	pr_err("failed to allocate pool\n");
+}
+
+/*
+ * Initialize KFENCE: set up the pool, mark KFENCE enabled and kick off the
+ * allocation gate timer. Does nothing if the sample interval is 0; logs an
+ * error and leaves KFENCE disabled if pool initialization fails.
+ */
+void __init kfence_init(void)
+{
+ /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
+ if (!kfence_sample_interval)
+ return;
+
+ if (!kfence_init_pool()) {
+ pr_err("%s failed\n", __func__);
+ return;
+ }
+
+ WRITE_ONCE(kfence_enabled, true);
+ /* Delay 0: run toggle_allocation_gate() as soon as possible. */
+ schedule_delayed_work(&kfence_timer, 0);
+ pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
+ CONFIG_KFENCE_NUM_OBJECTS, (void *)__kfence_pool,
+ (void *)(__kfence_pool + KFENCE_POOL_SIZE));
+}
+
+/*
+ * Called when cache @s is being destroyed. Makes two passes over all objects:
+ * the first turns objects of @s that are still allocated into "zombies" (freed
+ * but not returned to the freelist); the second clears the cache pointer of
+ * all freed objects that still refer to @s.
+ */
+void kfence_shutdown_cache(struct kmem_cache *s)
+{
+ unsigned long flags;
+ struct kfence_metadata *meta;
+ int i;
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ bool in_use;
+
+ meta = &kfence_metadata[i];
+
+ /*
+ * If we observe some inconsistent cache and state pair where we
+ * should have returned false here, cache destruction is racing
+ * with either kmem_cache_alloc() or kmem_cache_free(). Taking
+ * the lock will not help, as different critical section
+ * serialization will have the same outcome.
+ */
+ if (READ_ONCE(meta->cache) != s ||
+ READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
+ continue;
+
+ /* Re-check under the lock before acting on the object. */
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ if (in_use) {
+ /*
+ * This cache still has allocations, and we should not
+ * release them back into the freelist so they can still
+ * safely be used and retain the kernel's default
+ * behaviour of keeping the allocations alive (leak the
+ * cache); however, they effectively become "zombie
+ * allocations" as the KFENCE objects are the only ones
+ * still in use and the owning cache is being destroyed.
+ *
+ * We mark them freed, so that any subsequent use shows
+ * more useful error messages that will include stack
+ * traces of the user of the object, the original
+ * allocation, and caller to shutdown_cache().
+ */
+ kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
+ }
+ }
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ meta = &kfence_metadata[i];
+
+ /* See above. */
+ if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
+ continue;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
+ meta->cache = NULL;
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+ }
+}
+
+/*
+ * Try to serve an allocation of @size bytes for cache @s from the KFENCE pool.
+ * Returns NULL if the allocation gate is closed, KFENCE is disabled, @size is
+ * larger than a page, or kfence_guarded_alloc() fails.
+ */
+void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
+{
+	/*
+	 * allocation_gate only needs to become non-zero, so it doesn't make
+	 * sense to continue writing to it and pay the associated contention
+	 * cost, in case we have a large number of concurrent allocations.
+	 */
+	if (atomic_read(&kfence_allocation_gate) != 0)
+		return NULL;
+	/* Only the caller that moves the gate from 0 to 1 may proceed. */
+	if (atomic_inc_return(&kfence_allocation_gate) > 1)
+		return NULL;
+
+	if (!READ_ONCE(kfence_enabled) || size > PAGE_SIZE)
+		return NULL;
+
+	return kfence_guarded_alloc(s, size, flags);
+}
+
+/*
+ * Return the size recorded for the object that @addr maps to, or 0 if @addr
+ * does not map to any object metadata.
+ */
+size_t kfence_ksize(const void *addr)
+{
+	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+	if (!meta)
+		return 0;
+
+	/*
+	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
+	 * either a use-after-free or invalid access.
+	 */
+	return meta->size;
+}
+
+/*
+ * Return the start address recorded for the object that @addr maps to, or
+ * NULL if @addr does not map to any object metadata.
+ */
+void *kfence_object_start(const void *addr)
+{
+	const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+	if (!meta)
+		return NULL;
+
+	/*
+	 * Read locklessly -- if there is a race with __kfence_alloc(), this is
+	 * either a use-after-free or invalid access.
+	 */
+	return (void *)meta->addr;
+}
+
+/*
+ * Free the KFENCE object at @addr, either immediately or, for
+ * SLAB_TYPESAFE_BY_RCU caches, through an RCU callback.
+ *
+ * NOTE(review): addr_to_metadata() returns NULL for addresses that do not map
+ * to an object slot, and meta is dereferenced here without a check --
+ * presumably callers only pass addresses inside an object page; confirm.
+ */
+void __kfence_free(void *addr)
+{
+ struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+ /*
+ * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
+ * the object, as the object page may be recycled for other-typed
+ * objects once it has been freed. meta->cache may be NULL if the cache
+ * was destroyed.
+ */
+ if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
+ call_rcu(&meta->rcu_head, rcu_guarded_free);
+ else
+ kfence_guarded_free(addr, meta, false);
+}
+
+/*
+ * Handle a page fault at @addr. Returns false if @addr is not a KFENCE
+ * address. Otherwise the faulting page is unprotected so the access can
+ * proceed, and the result of that unprotect is returned.
+ *
+ * While KFENCE is enabled, the fault is first counted and reported: as
+ * out-of-bounds if it hit a guard page next to an allocated object, as
+ * use-after-free if it hit an object page, and as an invalid access if no
+ * object could be associated with it.
+ */
+bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
+{
+ const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
+ struct kfence_metadata *to_report = NULL;
+ enum kfence_error_type error_type;
+ unsigned long flags;
+
+ if (!is_kfence_address((void *)addr))
+ return false;
+
+ if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
+ return kfence_unprotect(addr); /* ... unprotect and proceed. */
+
+ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+
+ if (page_index % 2) {
+ /* This is a redzone, report a buffer overflow. */
+ struct kfence_metadata *meta;
+ int distance = 0;
+
+ /* Candidate 1: the object in the page to the left of the redzone. */
+ meta = addr_to_metadata(addr - PAGE_SIZE);
+ if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+ to_report = meta;
+ /* Data race ok; distance calculation approximate. */
+ distance = addr - data_race(meta->addr + meta->size);
+ }
+
+ /* Candidate 2: the object in the page to the right; prefer it if closer. */
+ meta = addr_to_metadata(addr + PAGE_SIZE);
+ if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+ /* Data race ok; distance calculation approximate. */
+ if (!to_report || distance > data_race(meta->addr) - addr)
+ to_report = meta;
+ }
+
+ if (!to_report)
+ goto out;
+
+ raw_spin_lock_irqsave(&to_report->lock, flags);
+ to_report->unprotected_page = addr;
+ error_type = KFENCE_ERROR_OOB;
+
+ /*
+ * If the object was freed before we took the lock we can still
+ * report this as an OOB -- the report will simply show the
+ * stacktrace of the free as well.
+ */
+ } else {
+ to_report = addr_to_metadata(addr);
+ if (!to_report)
+ goto out;
+
+ raw_spin_lock_irqsave(&to_report->lock, flags);
+ error_type = KFENCE_ERROR_UAF;
+ /*
+ * We may race with __kfence_alloc(), and it is possible that a
+ * freed object may be reallocated. We simply report this as a
+ * use-after-free, with the stack trace showing the place where
+ * the object was re-allocated.
+ */
+ }
+
+out:
+ if (to_report) {
+ kfence_report_error(addr, is_write, regs, to_report, error_type);
+ raw_spin_unlock_irqrestore(&to_report->lock, flags);
+ } else {
+ /* This may be a UAF or OOB access, but we can't be sure. */
+ kfence_report_error(addr, is_write, regs, NULL, KFENCE_ERROR_INVALID);
+ }
+
+ return kfence_unprotect(addr); /* Unprotect and let access proceed. */
+}
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
new file mode 100644
index 000000000000..24065321ff8a
--- /dev/null
+++ b/mm/kfence/kfence.h
@@ -0,0 +1,106 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Kernel Electric-Fence (KFENCE). For more info please see
+ * Documentation/dev-tools/kfence.rst.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef MM_KFENCE_KFENCE_H
+#define MM_KFENCE_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "../slab.h" /* for struct kmem_cache */
+
+/*
+ * Get the canary byte pattern for @addr. Use a pattern that varies based on the
+ * lower 3 bits of the address, to detect memory corruptions with higher
+ * probability, where similar constants are used.
+ */
+#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
+
+/* Maximum stack depth for reports. */
+#define KFENCE_STACK_DEPTH 64
+
+/* KFENCE object states. */
+enum kfence_object_state {
+ KFENCE_OBJECT_UNUSED, /* Object is unused. */
+ KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */
+ KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */
+};
+
+/* Alloc/free tracking information. */
+struct kfence_track {
+ pid_t pid; /* PID of the task that did the alloc/free. */
+ int num_stack_entries; /* Number of valid entries in stack_entries. */
+ unsigned long stack_entries[KFENCE_STACK_DEPTH]; /* Saved stack trace. */
+};
+
+/* KFENCE metadata per guarded allocation. */
+struct kfence_metadata {
+ struct list_head list; /* Freelist node; access under kfence_freelist_lock. */
+ struct rcu_head rcu_head; /* For delayed freeing. */
+
+ /*
+ * Lock protecting below data; to ensure consistency of the below data,
+ * since the following may execute concurrently: __kfence_alloc(),
+ * __kfence_free(), kfence_handle_page_fault(). However, note that we
+ * cannot grab the same metadata off the freelist twice, and multiple
+ * __kfence_alloc() cannot run concurrently on the same metadata.
+ */
+ raw_spinlock_t lock;
+
+ /* The current state of the object; see above. */
+ enum kfence_object_state state;
+
+ /*
+ * Allocated object address; cannot be calculated from size, because of
+ * alignment requirements.
+ *
+ * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant.
+ */
+ unsigned long addr;
+
+ /*
+ * The size of the original allocation.
+ */
+ size_t size;
+
+ /*
+ * The kmem_cache cache of the last allocation; NULL if never allocated
+ * or the cache has already been destroyed.
+ */
+ struct kmem_cache *cache;
+
+ /*
+ * In case of an invalid access, the page that was unprotected; we
+ * optimistically only store one address.
+ */
+ unsigned long unprotected_page;
+
+ /* Allocation and free stack information. */
+ struct kfence_track alloc_track;
+ struct kfence_track free_track;
+};
+
+extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* KFENCE error types for report generation. */
+enum kfence_error_type {
+ KFENCE_ERROR_OOB, /* Detected an out-of-bounds access. */
+ KFENCE_ERROR_UAF, /* Detected a use-after-free access. */
+ KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */
+ KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */
+ KFENCE_ERROR_INVALID_FREE, /* Invalid free. */
+};
+
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
+ const struct kfence_metadata *meta, enum kfence_error_type type);
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
+
+#endif /* MM_KFENCE_KFENCE_H */
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
new file mode 100644
index 000000000000..4acf4251ee04
--- /dev/null
+++ b/mm/kfence/kfence_test.c
@@ -0,0 +1,858 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Test cases for KFENCE memory safety error detector. Since the interface with
+ * which KFENCE's reports are obtained is via the console, this is the output we
+ * should verify. For each test case checks the presence (or absence) of
+ * generated reports. Relies on 'console' tracepoint to capture reports as they
+ * appear in the kernel log.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Alexander Potapenko <glider@google.com>
+ * Marco Elver <elver@google.com>
+ */
+
+#include <kunit/test.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kfence.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/tracepoint.h>
+#include <trace/events/printk.h>
+
+#include "kfence.h"
+
+/* Report as observed from console. */
+static struct {
+ spinlock_t lock; /* Taken around every update of @nlines and @lines. */
+ int nlines; /* Number of entries in @lines captured so far. */
+ char lines[2][256]; /* [0]: "BUG: KFENCE: ..." title; [1]: access-info line. */
+} observed = {
+ .lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/*
+ * Probe for console output: obtains observed lines of interest.
+ *
+ * A line containing both "BUG: KFENCE: " and "test_" (re)starts a capture and
+ * is stored in observed.lines[0]. Once such a title has been seen, the next
+ * line containing "at 0x" or "of 0x" is stored in observed.lines[1], which
+ * completes the report. All other lines are ignored.
+ */
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+ unsigned long flags;
+ int nlines;
+
+ spin_lock_irqsave(&observed.lock, flags);
+ nlines = observed.nlines;
+
+ if (strnstr(buf, "BUG: KFENCE: ", len) && strnstr(buf, "test_", len)) {
+ /*
+ * KFENCE report and related to the test.
+ *
+ * The provided @buf is not NUL-terminated; copy no more than
+ * @len bytes and let strscpy() add the missing NUL-terminator.
+ */
+ strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+ nlines = 1;
+ } else if (nlines == 1 && (strnstr(buf, "at 0x", len) || strnstr(buf, "of 0x", len))) {
+ /* Second line of the report: the access information. */
+ strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+ }
+
+ WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+ spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* True once every line of a test-related report has been captured. */
+static bool report_available(void)
+{
+	const int captured = READ_ONCE(observed.nlines);
+
+	return captured == ARRAY_SIZE(observed.lines);
+}
+
+/* Information we expect in a report. */
+struct expect_report {
+ enum kfence_error_type type; /* The type of error. */
+ void *fn; /* Function pointer to expected function where access occurred. */
+ char *addr; /* Address at which the bad access occurred. */
+ bool is_write; /* Is access a write. */
+};
+
+/* Name of the access kind expected by @r, as it appears in report text. */
+static const char *get_access_type(const struct expect_report *r)
+{
+	if (r->is_write)
+		return "write";
+
+	return "read";
+}
+
+/*
+ * Check observed report matches information in @r.
+ *
+ * Builds the expected title line and access-info line from @r, then checks
+ * that each is a substring of the corresponding captured console line.
+ * Returns false if no complete report has been captured.
+ */
+static bool report_matches(const struct expect_report *r)
+{
+ bool ret = false;
+ unsigned long flags;
+ typeof(observed.lines) expect;
+ const char *end;
+ char *cur;
+
+ /* Double-checked locking. */
+ if (!report_available())
+ return false;
+
+ /* Generate expected report contents. */
+
+ /* Title */
+ cur = expect[0];
+ end = &expect[0][sizeof(expect[0]) - 1];
+ switch (r->type) {
+ case KFENCE_ERROR_OOB:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: out-of-bounds %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_UAF:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: use-after-free %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: memory corruption");
+ break;
+ case KFENCE_ERROR_INVALID:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid %s",
+ get_access_type(r));
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ cur += scnprintf(cur, end - cur, "BUG: KFENCE: invalid free");
+ break;
+ }
+
+ scnprintf(cur, end - cur, " in %pS", r->fn);
+ /* The exact offset won't match, remove it; also strip module name. */
+ cur = strchr(expect[0], '+');
+ if (cur)
+ *cur = '\0';
+
+ /* Access information */
+ cur = expect[1];
+ end = &expect[1][sizeof(expect[1]) - 1];
+
+ switch (r->type) {
+ case KFENCE_ERROR_OOB:
+ cur += scnprintf(cur, end - cur, "Out-of-bounds %s at", get_access_type(r));
+ break;
+ case KFENCE_ERROR_UAF:
+ cur += scnprintf(cur, end - cur, "Use-after-free %s at", get_access_type(r));
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ cur += scnprintf(cur, end - cur, "Corrupted memory at");
+ break;
+ case KFENCE_ERROR_INVALID:
+ cur += scnprintf(cur, end - cur, "Invalid %s at", get_access_type(r));
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ cur += scnprintf(cur, end - cur, "Invalid free of");
+ break;
+ }
+
+ cur += scnprintf(cur, end - cur, " 0x%p", (void *)r->addr);
+
+ /* Re-check under the lock so both lines belong to the same report. */
+ spin_lock_irqsave(&observed.lock, flags);
+ if (!report_available())
+ goto out; /* A new report is being captured. */
+
+ /* Finally match expected output to what we actually observed. */
+ ret = strstr(observed.lines[0], expect[0]) && strstr(observed.lines[1], expect[1]);
+out:
+ spin_unlock_irqrestore(&observed.lock, flags);
+ return ret;
+}
+
+/* ===== Test cases ===== */
+
+#define TEST_PRIV_WANT_MEMCACHE ((void *)1)
+
+/* Cache used by tests; if NULL, allocate from kmalloc instead. */
+static struct kmem_cache *test_cache;
+
+/*
+ * Create the "test" kmem_cache in test_cache if the current test asked for a
+ * memcache (test->priv == TEST_PRIV_WANT_MEMCACHE); otherwise do nothing.
+ * Always returns @size unchanged, so the call can be used as an initializer.
+ */
+static size_t setup_test_cache(struct kunit *test, size_t size, slab_flags_t flags,
+ void (*ctor)(void *))
+{
+ if (test->priv != TEST_PRIV_WANT_MEMCACHE)
+ return size;
+
+ kunit_info(test, "%s: size=%zu, ctor=%ps\n", __func__, size, ctor);
+
+ /*
+ * Use SLAB_NOLEAKTRACE to prevent merging with existing caches. Any
+ * other flag in SLAB_NEVER_MERGE also works. Use SLAB_ACCOUNT to
+ * allocate via memcg, if enabled.
+ */
+ flags |= SLAB_NOLEAKTRACE | SLAB_ACCOUNT;
+ test_cache = kmem_cache_create("test", size, 1, flags, ctor);
+ KUNIT_ASSERT_TRUE_MSG(test, test_cache, "could not create cache");
+
+ return size;
+}
+
+/* Destroy the per-test cache, if one was created, and forget about it. */
+static void test_cache_destroy(void)
+{
+	if (test_cache) {
+		kmem_cache_destroy(test_cache);
+		test_cache = NULL;
+	}
+}
+
+/* Alignment of the GFP_KERNEL kmalloc cache that serves allocations of @size. */
+static inline size_t kmalloc_cache_alignment(size_t size)
+{
+	const struct kmem_cache *s = kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)];
+
+	return s->align;
+}
+
+/*
+ * Free @ptr to wherever test_alloc() got it from: test_cache if one is set up,
+ * kmalloc otherwise. Must always inline to match stack trace against caller.
+ */
+static __always_inline void test_free(void *ptr)
+{
+	if (!test_cache) {
+		kfree(ptr);
+		return;
+	}
+
+	kmem_cache_free(test_cache, ptr);
+}
+
+/*
+ * Whether test_alloc() should return a KFENCE allocation and, if so, on which
+ * side of its page the object (and therefore the closest guard page) should
+ * be.
+ */
+enum allocation_policy {
+ ALLOCATE_ANY, /* KFENCE, any side. */
+ ALLOCATE_LEFT, /* KFENCE, left side of page. */
+ ALLOCATE_RIGHT, /* KFENCE, right side of page. */
+ ALLOCATE_NONE, /* No KFENCE allocation. */
+};
+
+/*
+ * Try to get a guarded allocation from KFENCE. Uses either kmalloc() or the
+ * current test_cache if set up.
+ *
+ * Allocates and frees in a loop until an allocation satisfying @policy is
+ * obtained, which is then returned. If none is obtained before the timeout
+ * expires, the test is failed with a KUnit assertion.
+ */
+static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocation_policy policy)
+{
+ void *alloc;
+ unsigned long timeout, resched_after;
+ const char *policy_name;
+
+ switch (policy) {
+ case ALLOCATE_ANY:
+ policy_name = "any";
+ break;
+ case ALLOCATE_LEFT:
+ policy_name = "left";
+ break;
+ case ALLOCATE_RIGHT:
+ policy_name = "right";
+ break;
+ case ALLOCATE_NONE:
+ policy_name = "none";
+ break;
+ }
+
+ kunit_info(test, "%s: size=%zu, gfp=%x, policy=%s, cache=%i\n", __func__, size, gfp,
+ policy_name, !!test_cache);
+
+ /*
+ * 100x the sample interval should be more than enough to ensure we get
+ * a KFENCE allocation eventually.
+ */
+ timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+ /*
+ * Especially for non-preemption kernels, ensure the allocation-gate
+ * timer can catch up: after @resched_after, every failed allocation
+ * attempt yields, to ensure the allocation-gate timer is scheduled.
+ */
+ resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
+ do {
+ if (test_cache)
+ alloc = kmem_cache_alloc(test_cache, gfp);
+ else
+ alloc = kmalloc(size, gfp);
+
+ if (is_kfence_address(alloc)) {
+ struct page *page = virt_to_head_page(alloc);
+ /* NOTE(review): the kmalloc cache is looked up with GFP_KERNEL, not @gfp. */
+ struct kmem_cache *s = test_cache ?: kmalloc_caches[kmalloc_type(GFP_KERNEL)][kmalloc_index(size)];
+
+ /*
+ * Verify that various helpers return the right values
+ * even for KFENCE objects; these are required so that
+ * memcg accounting works correctly.
+ */
+ KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U);
+ KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1);
+
+ /* Left side means page-aligned; right side means not page-aligned. */
+ if (policy == ALLOCATE_ANY)
+ return alloc;
+ if (policy == ALLOCATE_LEFT && IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+ return alloc;
+ if (policy == ALLOCATE_RIGHT &&
+ !IS_ALIGNED((unsigned long)alloc, PAGE_SIZE))
+ return alloc;
+ } else if (policy == ALLOCATE_NONE)
+ return alloc;
+
+ /* Not what was asked for; give it back and try again. */
+ test_free(alloc);
+
+ if (time_after(jiffies, resched_after))
+ cond_resched();
+ } while (time_before(jiffies, timeout));
+
+ KUNIT_ASSERT_TRUE_MSG(test, false, "failed to allocate from KFENCE");
+ return NULL; /* Unreachable. */
+}
+
+/*
+ * Read one byte before a left-side object and one byte past a right-side
+ * object; each access must produce a matching out-of-bounds read report.
+ */
+static void test_out_of_bounds_read(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_out_of_bounds_read,
+ .is_write = false,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+
+ /*
+ * If we don't have our own cache, adjust based on alignment, so that we
+ * actually access guard pages on either side.
+ */
+ if (!test_cache)
+ size = kmalloc_cache_alignment(size);
+
+ /* Test both sides. */
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf - 1;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ expect.addr = buf + size;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+}
+
+/*
+ * Write one byte before a left-side object and expect a matching
+ * out-of-bounds write report. Only the left side is exercised here.
+ */
+static void test_out_of_bounds_write(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_out_of_bounds_write,
+ .is_write = true,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf - 1;
+ WRITE_ONCE(*expect.addr, 42);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ test_free(buf);
+}
+
+/* Read from a KFENCE object after freeing it; expect a use-after-free report. */
+static void test_use_after_free_read(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_use_after_free_read,
+ .is_write = false,
+ };
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ test_free(expect.addr);
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/* Free the same KFENCE object twice; expect an invalid-free report. */
+static void test_double_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID_FREE,
+ .fn = test_double_free,
+ };
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ test_free(expect.addr);
+ test_free(expect.addr); /* Double-free. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Free a pointer one byte into a KFENCE object; expect an invalid-free report
+ * for that address. The object itself is then freed through its real address.
+ */
+static void test_invalid_addr_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID_FREE,
+ .fn = test_invalid_addr_free,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ expect.addr = buf + 1; /* Free on invalid address. */
+ test_free(expect.addr); /* Invalid address free. */
+ test_free(buf); /* No error. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Write just outside the object on the side facing away from the requested
+ * page edge, then free the object; the memory-corruption report is expected
+ * only after the free.
+ */
+static void test_corruption(struct kunit *test)
+{
+ size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_CORRUPTION,
+ .fn = test_corruption,
+ };
+ char *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+
+ /* Test both sides. */
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT);
+ expect.addr = buf + size;
+ WRITE_ONCE(*expect.addr, 42);
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ expect.addr = buf - 1;
+ WRITE_ONCE(*expect.addr, 42);
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * KFENCE is unable to detect an OOB if the allocation's alignment requirements
+ * leave a gap between the object and the guard page. Specifically, an
+ * allocation of e.g. 73 bytes is aligned on 8 and 128 bytes for SLUB or SLAB
+ * respectively. Therefore it is impossible for the allocated object to
+ * contiguously line up with the right guard page.
+ *
+ * However, we test that an access to memory beyond the gap results in KFENCE
+ * detecting an OOB access.
+ *
+ * This test always allocates via kmalloc(); it never sets up test_cache.
+ */
+static void test_kmalloc_aligned_oob_read(struct kunit *test)
+{
+ const size_t size = 73;
+ const size_t align = kmalloc_cache_alignment(size);
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_OOB,
+ .fn = test_kmalloc_aligned_oob_read,
+ .is_write = false,
+ };
+ char *buf;
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+
+ /*
+ * The object is offset to the right, so there won't be an OOB to the
+ * left of it.
+ */
+ READ_ONCE(*(buf - 1));
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /*
+ * @buf must be aligned on @align, therefore buf + size belongs to the
+ * same page -> no OOB.
+ */
+ READ_ONCE(*(buf + size));
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /* Overflowing by @align bytes will result in an OOB. */
+ expect.addr = buf + size + align;
+ READ_ONCE(*expect.addr);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+
+ test_free(buf);
+}
+
+/*
+ * Modify the byte directly after a right-side 73-byte kmalloc() object. No
+ * report is expected at the time of the write; a memory-corruption report is
+ * expected once the object is freed.
+ */
+static void test_kmalloc_aligned_oob_write(struct kunit *test)
+{
+ const size_t size = 73;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_CORRUPTION,
+ .fn = test_kmalloc_aligned_oob_write,
+ };
+ char *buf;
+
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT);
+ /*
+ * The object is offset to the right, so we won't get a page
+ * fault immediately after it.
+ */
+ expect.addr = buf + size;
+ /* Increment, so the stored value is guaranteed to change. */
+ WRITE_ONCE(*expect.addr, READ_ONCE(*expect.addr) + 1);
+ KUNIT_EXPECT_FALSE(test, report_available());
+ test_free(buf);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test cache shrinking and destroying with KFENCE: shrink the cache while a
+ * KFENCE object from it is still allocated, then free the object. No report
+ * must be generated. The cache itself is destroyed by test_exit().
+ */
+static void test_shrink_memcache(struct kunit *test)
+{
+ const size_t size = 32;
+ void *buf;
+
+ setup_test_cache(test, size, 0, NULL);
+ KUNIT_EXPECT_TRUE(test, test_cache);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ kmem_cache_shrink(test_cache);
+ test_free(buf);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/* Cache constructor used by tests: stamp the first 8 bytes of @obj with 'x'. */
+static void ctor_set_x(void *obj)
+{
+	char *bytes = obj;
+	int i;
+
+	/* Every object has at least 8 bytes. */
+	for (i = 0; i < 8; i++)
+		bytes[i] = 'x';
+}
+
+/*
+ * Ensure that SL*B does not modify KFENCE objects on bulk free.
+ *
+ * Each iteration uses a random size in [8, 308) and alternates between no
+ * constructor and ctor_set_x(), then bulk-frees a mix of KFENCE and non-KFENCE
+ * objects and checks that no report was generated.
+ */
+static void test_free_bulk(struct kunit *test)
+{
+ int iter;
+
+ for (iter = 0; iter < 5; iter++) {
+ const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0,
+ (iter & 1) ? ctor_set_x : NULL);
+ void *objects[] = {
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_LEFT),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE),
+ };
+
+ /*
+ * NOTE(review): in the non-memcache variant test_cache is NULL
+ * here; presumably kmem_cache_free_bulk() accepts that -- confirm.
+ */
+ kmem_cache_free_bulk(test_cache, ARRAY_SIZE(objects), objects);
+ KUNIT_ASSERT_FALSE(test, report_available());
+ test_cache_destroy();
+ }
+}
+
+/*
+ * Test init-on-free works: fill an object with non-zero bytes, free it, and
+ * check that every byte reads back as zero. The first read must also produce
+ * a use-after-free report. Does nothing unless
+ * CONFIG_INIT_ON_FREE_DEFAULT_ON is enabled.
+ */
+static void test_init_on_free(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_init_on_free,
+ .is_write = false,
+ };
+ int i;
+
+ if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
+ return;
+ /* Assume it hasn't been disabled on command line. */
+
+ setup_test_cache(test, size, 0, NULL);
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ for (i = 0; i < size; i++)
+ expect.addr[i] = i + 1;
+ test_free(expect.addr);
+
+ for (i = 0; i < size; i++) {
+ /*
+ * This may fail if the page was recycled by KFENCE and then
+ * written to again -- this however, is near impossible with a
+ * default config.
+ */
+ KUNIT_EXPECT_EQ(test, expect.addr[i], (char)0);
+
+ if (!i) /* Only check first access to not fail test if page is ever re-protected. */
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+ }
+}
+
+/*
+ * Ensure that constructors work properly: a KFENCE object from a cache created
+ * with ctor_set_x() must start with the 8 'x' bytes the constructor writes,
+ * and allocating and freeing it must not generate a report.
+ */
+static void test_memcache_ctor(struct kunit *test)
+{
+ const size_t size = 32;
+ char *buf;
+ int i;
+
+ setup_test_cache(test, size, 0, ctor_set_x);
+ buf = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+
+ for (i = 0; i < 8; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)'x');
+
+ test_free(buf);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/*
+ * Test that memory is zeroed if requested: dirty a KFENCE object, free it,
+ * then keep allocating with __GFP_ZERO until the same address comes back and
+ * check that it reads as all zeroes.
+ *
+ * Skipped when CONFIG_KFENCE_SAMPLE_INTERVAL > 100, and abandoned with a
+ * warning if the same object is not returned after CONFIG_KFENCE_NUM_OBJECTS
+ * further attempts.
+ */
+static void test_gfpzero(struct kunit *test)
+{
+ const size_t size = PAGE_SIZE; /* PAGE_SIZE so we can use ALLOCATE_ANY. */
+ char *buf1, *buf2;
+ int i;
+
+ if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
+ kunit_warn(test, "skipping ... would take too long\n");
+ return;
+ }
+
+ setup_test_cache(test, size, 0, NULL);
+ buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ for (i = 0; i < size; i++)
+ buf1[i] = i + 1;
+ test_free(buf1);
+
+ /* Try to get same address again -- this can take a while. */
+ for (i = 0;; i++) {
+ buf2 = test_alloc(test, size, GFP_KERNEL | __GFP_ZERO, ALLOCATE_ANY);
+ if (buf1 == buf2)
+ break;
+ test_free(buf2);
+
+ if (i == CONFIG_KFENCE_NUM_OBJECTS) {
+ kunit_warn(test, "giving up ... cannot get same object back\n");
+ return;
+ }
+ }
+
+ for (i = 0; i < size; i++)
+ KUNIT_EXPECT_EQ(test, buf2[i], (char)0);
+
+ test_free(buf2);
+
+ KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/*
+ * Read from an address inside the KFENCE pool that was not obtained from an
+ * allocation (byte 10 of __kfence_pool); expect an "invalid read" report.
+ */
+static void test_invalid_access(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .type = KFENCE_ERROR_INVALID,
+ .fn = test_invalid_access,
+ .addr = &__kfence_pool[10],
+ .is_write = false,
+ };
+
+ READ_ONCE(__kfence_pool[10]);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test SLAB_TYPESAFE_BY_RCU works: an object freed inside an RCU read-side
+ * critical section must stay readable without a report; after rcu_barrier()
+ * the same read must produce a use-after-free report.
+ */
+static void test_memcache_typesafe_by_rcu(struct kunit *test)
+{
+ const size_t size = 32;
+ struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_memcache_typesafe_by_rcu,
+ .is_write = false,
+ };
+
+ setup_test_cache(test, size, SLAB_TYPESAFE_BY_RCU, NULL);
+ KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+
+ expect.addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
+ *expect.addr = 42;
+
+ rcu_read_lock();
+ test_free(expect.addr);
+ KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+ /*
+ * Up to this point, memory should not have been freed yet, and
+ * therefore there should be no KFENCE report from the above access.
+ */
+ rcu_read_unlock();
+
+ /* Above access to @expect.addr should not have generated a report! */
+ KUNIT_EXPECT_FALSE(test, report_available());
+
+ /* Only after rcu_barrier() is the memory guaranteed to be freed. */
+ rcu_barrier();
+
+ /* Expect use-after-free. */
+ KUNIT_EXPECT_EQ(test, *expect.addr, (char)42);
+ KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test krealloc(): grow, shrink and finally free (size 0) a KFENCE object,
+ * checking that the contents are preserved and that no report is generated
+ * along the way. A final read through the original KFENCE address must then
+ * produce a use-after-free report.
+ */
+static void test_krealloc(struct kunit *test)
+{
+ const size_t size = 32;
+ const struct expect_report expect = {
+ .type = KFENCE_ERROR_UAF,
+ .fn = test_krealloc,
+ .addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY),
+ .is_write = false,
+ };
+ char *buf = expect.addr;
+ int i;
+
+ KUNIT_EXPECT_FALSE(test, test_cache);
+ KUNIT_EXPECT_EQ(test, ksize(buf), size); /* Precise size match after KFENCE alloc. */
+ for (i = 0; i < size; i++)
+ buf[i] = i + 1;
+
+ /* Check that we successfully change the size. */
+ buf = krealloc(buf, size * 3, GFP_KERNEL); /* Grow. */
+ /* Note: Might no longer be a KFENCE alloc. */
+ KUNIT_EXPECT_GE(test, ksize(buf), size * 3);
+ for (i = 0; i < size; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+ for (; i < size * 3; i++) /* Fill the extra bytes. */
+ buf[i] = i + 1;
+
+ buf = krealloc(buf, size * 2, GFP_KERNEL); /* Shrink. */
+ KUNIT_EXPECT_GE(test, ksize(buf), size * 2);
+ for (i = 0; i < size * 2; i++)
+ KUNIT_EXPECT_EQ(test, buf[i], (char)(i + 1));
+
+ buf = krealloc(buf, 0, GFP_KERNEL); /* Free. */
+ KUNIT_EXPECT_EQ(test, (unsigned long)buf, (unsigned long)ZERO_SIZE_PTR);
+ KUNIT_ASSERT_FALSE(test, report_available()); /* No reports yet! */
+
+ READ_ONCE(*expect.addr); /* Ensure krealloc() actually freed earlier KFENCE object. */
+ KUNIT_ASSERT_TRUE(test, report_matches(&expect));
+}
+
+/*
+ * Test that some objects from a bulk allocation belong to KFENCE pool.
+ *
+ * Repeatedly bulk-allocates from test_cache until at least one returned
+ * object is a KFENCE address or the timeout expires. No report must be
+ * generated by any of this.
+ */
+static void test_memcache_alloc_bulk(struct kunit *test)
+{
+	const size_t size = 32;
+	bool pass = false;
+	unsigned long timeout;
+
+	setup_test_cache(test, size, 0, NULL);
+	KUNIT_EXPECT_TRUE(test, test_cache); /* Want memcache. */
+	/*
+	 * 100x the sample interval should be more than enough to ensure we get
+	 * a KFENCE allocation eventually.
+	 */
+	timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
+	do {
+		void *objects[100];
+		int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
+						   objects);
+
+		if (num) {
+			/* Only the first @num entries are known to be filled in. */
+			for (i = 0; i < num; i++) {
+				if (is_kfence_address(objects[i])) {
+					pass = true;
+					break;
+				}
+			}
+			kmem_cache_free_bulk(test_cache, num, objects);
+		}
+		/*
+		 * kmem_cache_alloc_bulk() disables interrupts, and calling it
+		 * in a tight loop may not give KFENCE a chance to switch the
+		 * static branch. Call cond_resched() to let KFENCE chime in;
+		 * do so after a failed bulk allocation as well.
+		 */
+		cond_resched();
+	} while (!pass && time_before(jiffies, timeout));
+
+	KUNIT_EXPECT_TRUE(test, pass);
+	KUNIT_EXPECT_FALSE(test, report_available());
+}
+
+/*
+ * KUnit does not provide a way to provide arguments to tests, and we encode
+ * additional info in the name. Set up 2 tests per test case, one using the
+ * default allocator, and another using a custom memcache (suffix '-memcache').
+ */
+#define KFENCE_KUNIT_CASE(test_name) \
+ { .run_case = test_name, .name = #test_name }, \
+ { .run_case = test_name, .name = #test_name "-memcache" }
+
+static struct kunit_case kfence_test_cases[] = {
+ KFENCE_KUNIT_CASE(test_out_of_bounds_read),
+ KFENCE_KUNIT_CASE(test_out_of_bounds_write),
+ KFENCE_KUNIT_CASE(test_use_after_free_read),
+ KFENCE_KUNIT_CASE(test_double_free),
+ KFENCE_KUNIT_CASE(test_invalid_addr_free),
+ KFENCE_KUNIT_CASE(test_corruption),
+ KFENCE_KUNIT_CASE(test_free_bulk),
+ KFENCE_KUNIT_CASE(test_init_on_free),
+ KUNIT_CASE(test_kmalloc_aligned_oob_read),
+ KUNIT_CASE(test_kmalloc_aligned_oob_write),
+ KUNIT_CASE(test_shrink_memcache),
+ KUNIT_CASE(test_memcache_ctor),
+ KUNIT_CASE(test_invalid_access),
+ KUNIT_CASE(test_gfpzero),
+ KUNIT_CASE(test_memcache_typesafe_by_rcu),
+ KUNIT_CASE(test_krealloc),
+ KUNIT_CASE(test_memcache_alloc_bulk),
+ {},
+};
+
+/* ===== End test cases ===== */
+
+/*
+ * Per-test setup: discard any previously captured report and record in
+ * test->priv whether this test wants its own memcache.
+ */
+static int test_init(struct kunit *test)
+{
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&observed.lock, flags);
+	observed.nlines = 0;
+	for (i = 0; i < ARRAY_SIZE(observed.lines); i++)
+		observed.lines[i][0] = '\0';
+	spin_unlock_irqrestore(&observed.lock, flags);
+
+	/* Any test with 'memcache' in its name will want a memcache. */
+	test->priv = strstr(test->name, "memcache") ? TEST_PRIV_WANT_MEMCACHE : NULL;
+
+	return 0;
+}
+
+/* Per-test teardown: drop the memcache the test may have created. */
+static void test_exit(struct kunit *test)
+{
+	test_cache_destroy();
+}
+
+static struct kunit_suite kfence_test_suite = {
+ .name = "kfence",
+ .test_cases = kfence_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+};
+static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL };
+
+/* for_each_kernel_tracepoint() callback: attach probe_console() to "console". */
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	check_trace_callback_type_console(probe_console);
+
+	if (strcmp(tp->name, "console") != 0)
+		return;
+
+	WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+/* for_each_kernel_tracepoint() callback: detach probe_console() from "console". */
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	if (strcmp(tp->name, "console") != 0)
+		return;
+
+	tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+/*
+ * We only want to do tracepoints setup and teardown once, therefore we have to
+ * customize the init and exit functions and cannot rely on kunit_test_suite().
+ *
+ * Returns the result of __kunit_test_suites_init(). If that reports an error,
+ * the console probe is detached again before returning.
+ */
+static int __init kfence_test_init(void)
+{
+	int ret;
+
+	/*
+	 * Because we want to be able to build the test as a module, we need to
+	 * iterate through all known tracepoints, since the static registration
+	 * won't work here.
+	 */
+	for_each_kernel_tracepoint(register_tracepoints, NULL);
+
+	ret = __kunit_test_suites_init(kfence_test_suites);
+	if (ret) {
+		/* Do not leave probe_console() attached after a failed init. */
+		for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+		tracepoint_synchronize_unregister();
+	}
+
+	return ret;
+}
+
+/*
+ * Tear down in reverse order of kfence_test_init(): suites first, then the
+ * console probe, then wait for the unregistration to complete.
+ */
+static void kfence_test_exit(void)
+{
+	__kunit_test_suites_exit(kfence_test_suites);
+
+	for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+	tracepoint_synchronize_unregister();
+}
+
+late_initcall(kfence_test_init);
+module_exit(kfence_test_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Alexander Potapenko <glider@google.com>, Marco Elver <elver@google.com>");
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
new file mode 100644
index 000000000000..e3f71451ad9e
--- /dev/null
+++ b/mm/kfence/report.c
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE reporting.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#include <stdarg.h>
+
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/printk.h>
+#include <linux/sched/debug.h>
+#include <linux/seq_file.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <trace/events/error_report.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* May be overridden by <asm/kfence.h>. */
+#ifndef ARCH_FUNC_PREFIX
+#define ARCH_FUNC_PREFIX ""
+#endif
+
+extern bool no_hash_pointers;
+
+/* printf-style output to @seq, or to the console when @seq is NULL. */
+__printf(2, 3)
+static void seq_con_printf(struct seq_file *seq, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	if (!seq)
+		vprintk(fmt, ap);
+	else
+		seq_vprintf(seq, fmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Get the number of stack entries to skip to get out of MM internals. @type is
+ * optional, and if set to NULL, assumes an allocation or free stack.
+ *
+ * Returns 0 for access errors (UAF, OOB, INVALID). Otherwise returns the index
+ * of the entry following the first allocator entry point found (kfree*,
+ * kmem_cache_free*, __kmalloc*, kmem_cache_alloc*). If none is found, returns
+ * the index following the last KFENCE-internal or __slab_free entry. Returns 0
+ * if the result would be past the end of the trace.
+ */
+static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries,
+ const enum kfence_error_type *type)
+{
+ char buf[64];
+ int skipnr, fallback = 0;
+
+ if (type) {
+ /* Depending on error type, find different stack entries. */
+ switch (*type) {
+ case KFENCE_ERROR_UAF:
+ case KFENCE_ERROR_OOB:
+ case KFENCE_ERROR_INVALID:
+ /*
+ * kfence_handle_page_fault() may be called with pt_regs
+ * set to NULL; in that case we'll simply show the full
+ * stack trace.
+ */
+ return 0;
+ case KFENCE_ERROR_CORRUPTION:
+ case KFENCE_ERROR_INVALID_FREE:
+ break;
+ }
+ }
+
+ for (skipnr = 0; skipnr < num_entries; skipnr++) {
+ int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
+
+ /*
+ * The strncmp() is bounded by the length of the symbol in @buf,
+ * so it matches when @buf equals, or is a leading part of, the
+ * "__slab_free" name.
+ */
+ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") ||
+ !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) {
+ /*
+ * In case of tail calls from any of the below
+ * to any of the above.
+ */
+ fallback = skipnr + 1;
+ }
+
+ /* Also the *_bulk() variants by only checking prefixes. */
+ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") ||
+ str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc"))
+ goto found;
+ }
+ /* No allocator entry point seen; use the fallback if it is in range. */
+ if (fallback < num_entries)
+ return fallback;
+found:
+ /* Skip the matched entry itself as well. */
+ skipnr++;
+ return skipnr < num_entries ? skipnr : 0;
+}
+
+/*
+ * Print the allocation stack (@show_alloc) or the free stack recorded in @meta
+ * to @seq, or to the console if @seq is NULL, leaving out the leading
+ * allocator-internal entries.
+ */
+static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta,
+			       bool show_alloc)
+{
+	const struct kfence_track *track;
+	int idx;
+
+	if (show_alloc)
+		track = &meta->alloc_track;
+	else
+		track = &meta->free_track;
+
+	if (!track->num_stack_entries) {
+		seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation");
+		return;
+	}
+
+	/* Skip allocation/free internals stack. */
+	idx = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL);
+
+	/* stack_trace_seq_print() does not exist; open code our own. */
+	while (idx < track->num_stack_entries) {
+		seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Print a description of the object tracked by @meta to @seq, or to the
+ * console if @seq is NULL: its index, address range, size, cache name and
+ * allocating task, followed by the allocation stack and, for freed objects,
+ * the freeing task and free stack. Unused objects get a single line.
+ *
+ * The caller must hold @meta->lock.
+ */
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta)
+{
+ const int size = abs(meta->size);
+ const unsigned long start = meta->addr;
+ const struct kmem_cache *const cache = meta->cache;
+
+ lockdep_assert_held(&meta->lock);
+
+ if (meta->state == KFENCE_OBJECT_UNUSED) {
+ seq_con_printf(seq, "kfence-#%td unused\n", meta - kfence_metadata);
+ return;
+ }
+
+ /* The cache name falls back to "<destroyed>" if the cache or its name is NULL. */
+ seq_con_printf(seq,
+ "kfence-#%td [0x%p-0x%p"
+ ", size=%d, cache=%s] allocated by task %d:\n",
+ meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size,
+ (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
+ kfence_print_stack(seq, meta, true);
+
+ if (meta->state == KFENCE_OBJECT_FREED) {
+ seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid);
+ kfence_print_stack(seq, meta, false);
+ }
+}
+
+/*
+ * Show bytes at @address that are different from the expected canary values,
+ * up to @bytes_to_show bytes. Bytes that match the canary are shown as ".".
+ * Differing bytes are shown as their value if no_hash_pointers is set, and as
+ * "!" otherwise.
+ */
+static void print_diff_canary(unsigned long address, size_t bytes_to_show,
+ const struct kfence_metadata *meta)
+{
+ const unsigned long show_until_addr = address + bytes_to_show;
+ const u8 *cur, *end;
+
+ /* Do not show contents of object nor read into following guard page. */
+ end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr)
+ : min(show_until_addr, PAGE_ALIGN(address)));
+
+ pr_cont("[");
+ for (cur = (const u8 *)address; cur < end; cur++) {
+ if (*cur == KFENCE_CANARY_PATTERN(cur))
+ pr_cont(" .");
+ else if (no_hash_pointers)
+ pr_cont(" 0x%02x", *cur);
+ else /* Do not leak kernel memory in non-debug builds. */
+ pr_cont(" !");
+ }
+ pr_cont(" ]");
+}
+
+/* Name of the access kind as printed in reports. */
+static const char *get_access_type(bool is_write)
+{
+	if (is_write)
+		return "write";
+
+	return "read";
+}
+
+/*
+ * Print a KFENCE error report of @type for the access or free at @address.
+ *
+ * The stack trace is taken from @regs if non-NULL; otherwise the current stack
+ * is saved and leading allocator-internal entries are skipped. @meta may only
+ * be NULL for KFENCE_ERROR_INVALID (otherwise this warns and returns); when it
+ * is non-NULL, the caller must hold @meta->lock.
+ *
+ * After printing, this panics if panic_on_warn is set and otherwise taints
+ * the kernel.
+ */
+void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs,
+ const struct kfence_metadata *meta, enum kfence_error_type type)
+{
+ unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
+ const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1;
+ int num_stack_entries;
+ int skipnr = 0;
+
+ if (regs) {
+ num_stack_entries = stack_trace_save_regs(regs, stack_entries, KFENCE_STACK_DEPTH, 0);
+ } else {
+ num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
+ skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
+ }
+
+ /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */
+ if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta))
+ return;
+
+ if (meta)
+ lockdep_assert_held(&meta->lock);
+ /*
+ * Because we may generate reports in printk-unfriendly parts of the
+ * kernel, such as scheduler code, the use of printk() could deadlock.
+ * Until such time that all printing code here is safe in all parts of
+ * the kernel, accept the risk, and just get our message out (given the
+ * system might already behave unpredictably due to the memory error).
+ * As such, also disable lockdep to hide warnings, and avoid disabling
+ * lockdep for the rest of the kernel.
+ */
+ lockdep_off();
+
+ pr_err("==================================================================\n");
+ /* Print report header. */
+ switch (type) {
+ case KFENCE_ERROR_OOB: {
+ const bool left_of_object = address < meta->addr;
+
+ pr_err("BUG: KFENCE: out-of-bounds %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Out-of-bounds %s at 0x%p (%luB %s of kfence-#%td):\n",
+ get_access_type(is_write), (void *)address,
+ left_of_object ? meta->addr - address : address - meta->addr,
+ left_of_object ? "left" : "right", object_index);
+ break;
+ }
+ case KFENCE_ERROR_UAF:
+ pr_err("BUG: KFENCE: use-after-free %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Use-after-free %s at 0x%p (in kfence-#%td):\n",
+ get_access_type(is_write), (void *)address, object_index);
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Corrupted memory at 0x%p ", (void *)address);
+ print_diff_canary(address, 16, meta);
+ pr_cont(" (in kfence-#%td):\n", object_index);
+ break;
+ case KFENCE_ERROR_INVALID:
+ pr_err("BUG: KFENCE: invalid %s in %pS\n\n", get_access_type(is_write),
+ (void *)stack_entries[skipnr]);
+ pr_err("Invalid %s at 0x%p:\n", get_access_type(is_write),
+ (void *)address);
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Invalid free of 0x%p (in kfence-#%td):\n", (void *)address,
+ object_index);
+ break;
+ }
+
+ /* Print stack trace and object info. */
+ stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0);
+
+ if (meta) {
+ pr_err("\n");
+ kfence_print_object(NULL, meta);
+ }
+
+ /* Print report footer. */
+ pr_err("\n");
+ /* Registers are only dumped when pointers are not hashed. */
+ if (no_hash_pointers && regs)
+ show_regs(regs);
+ else
+ dump_stack_print_info(KERN_ERR);
+ trace_error_report_end(ERROR_DETECTOR_KFENCE, address);
+ pr_err("==================================================================\n");
+
+ lockdep_on();
+
+ if (panic_on_warn)
+ panic("panic_on_warn set ...\n");
+
+ /* We encountered a memory unsafety error, taint the kernel! */
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
+}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fb0fdaec34d5..a7d6cb912b05 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -442,18 +442,28 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
static bool hugepage_vma_check(struct vm_area_struct *vma,
unsigned long vm_flags)
{
- if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
- (vm_flags & VM_NOHUGEPAGE) ||
+ /* Explicitly disabled through madvise. */
+ if ((vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
- if (shmem_file(vma->vm_file) ||
- (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
- vma->vm_file &&
- (vm_flags & VM_DENYWRITE))) {
+ /* Enabled via shmem mount options or sysfs settings. */
+ if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
+
+ /* THP settings require madvise. */
+ if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+ return false;
+
+ /* Read-only file mappings need to be aligned for THP to work. */
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+ (vm_flags & VM_DENYWRITE)) {
+ return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+ HPAGE_PMD_NR);
+ }
+
if (!vma->anon_vma || vma->vm_ops)
return false;
if (vma_is_temporary_stack(vma))
@@ -1643,6 +1653,7 @@ static void collapse_file(struct mm_struct *mm,
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
bool is_shmem = shmem_file(file);
+ int nr;
VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
@@ -1854,11 +1865,12 @@ out_unlock:
put_page(page);
goto xa_unlocked;
}
+ nr = thp_nr_pages(new_page);
if (is_shmem)
- __inc_lruvec_page_state(new_page, NR_SHMEM_THPS);
+ __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr);
else {
- __inc_lruvec_page_state(new_page, NR_FILE_THPS);
+ __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr);
filemap_nr_thps_inc(mapping);
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index c0014d3b91c1..fe6e3ae8e8c6 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -97,6 +97,7 @@
#include <linux/atomic.h>
#include <linux/kasan.h>
+#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/memory_hotplug.h>
@@ -589,7 +590,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
atomic_set(&object->use_count, 1);
object->flags = OBJECT_ALLOCATED;
object->pointer = ptr;
- object->size = size;
+ object->size = kfence_ksize((void *)ptr) ?: size;
object->excess_ref = 0;
object->min_count = min_count;
object->count = 0; /* white color initially */
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fe230081690b..6f067b6b935f 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -373,21 +373,13 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
/*
* This is called when shrinker has already been unregistered,
- * and nobody can use it. So, there is no need to use kvfree_rcu_local().
+ * and nobody can use it. So, there is no need to use kvfree_rcu().
*/
memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
kvfree(memcg_lrus);
}
-static void kvfree_rcu_local(struct rcu_head *head)
-{
- struct list_lru_memcg *mlru;
-
- mlru = container_of(head, struct list_lru_memcg, rcu);
- kvfree(mlru);
-}
-
static int memcg_update_list_lru_node(struct list_lru_node *nlru,
int old_size, int new_size)
{
@@ -419,7 +411,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
- call_rcu(&old->rcu, kvfree_rcu_local);
+ kvfree_rcu(old, rcu);
return 0;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 0938fd3ad228..01fef79ac761 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -539,8 +539,9 @@ static inline bool can_do_pageout(struct vm_area_struct *vma)
* otherwise we'd be including shared non-exclusive mappings, which
* opens a side channel.
*/
- return inode_owner_or_capable(file_inode(vma->vm_file)) ||
- inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+ return inode_owner_or_capable(&init_user_ns,
+ file_inode(vma->vm_file)) ||
+ file_permission(vma->vm_file, MAY_WRITE) == 0;
}
static long madvise_pageout(struct vm_area_struct *vma,
@@ -1197,12 +1198,22 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto release_task;
}
- mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS);
+ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR_OR_NULL(mm)) {
ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
goto release_task;
}
+ /*
+ * Require CAP_SYS_NICE for influencing process performance. Note that
+ * only non-destructive hints are currently supported.
+ */
+ if (!capable(CAP_SYS_NICE)) {
+ ret = -EPERM;
+ goto release_mm;
+ }
+
total_len = iov_iter_count(&iter);
while (iov_iter_count(&iter)) {
@@ -1217,6 +1228,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
if (ret == 0)
ret = total_len - iov_iter_count(&iter);
+release_mm:
mmput(mm);
release_task:
put_task_struct(task);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 913c2b9e5c72..e064ac0d850a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
#ifdef CONFIG_MEMCG_KMEM
extern spinlock_t css_set_lock;
+static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned int nr_pages);
+static void __memcg_kmem_uncharge(struct mem_cgroup *memcg,
+ unsigned int nr_pages);
+
static void obj_cgroup_release(struct percpu_ref *ref)
{
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
@@ -447,8 +452,7 @@ static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
for_each_node(nid) {
pn = mem_cgroup_nodeinfo(memcg, nid);
map = rcu_dereference_protected(pn->shrinker_map, true);
- if (map)
- kvfree(map);
+ kvfree(map);
rcu_assign_pointer(pn->shrinker_map, NULL);
}
}
@@ -1043,29 +1047,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
}
EXPORT_SYMBOL(get_mem_cgroup_from_mm);
-/**
- * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
- * @page: page from which memcg should be extracted.
- *
- * Obtain a reference on page->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned.
- */
-struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
-{
- struct mem_cgroup *memcg = page_memcg(page);
-
- if (mem_cgroup_disabled())
- return NULL;
-
- rcu_read_lock();
- /* Page should not get uncharged and freed memcg under us. */
- if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- rcu_read_unlock();
- return memcg;
-}
-EXPORT_SYMBOL(get_mem_cgroup_from_page);
-
static __always_inline struct mem_cgroup *active_memcg(void)
{
if (in_interrupt())
@@ -1080,13 +1061,9 @@ static __always_inline struct mem_cgroup *get_active_memcg(void)
rcu_read_lock();
memcg = active_memcg();
- if (memcg) {
- /* current->active_memcg must hold a ref. */
- if (WARN_ON_ONCE(!css_tryget(&memcg->css)))
- memcg = root_mem_cgroup;
- else
- memcg = current->active_memcg;
- }
+ /* remote memcg must hold a ref. */
+ if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
+ memcg = root_mem_cgroup;
rcu_read_unlock();
return memcg;
@@ -1346,20 +1323,19 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
* lock_page_lruvec - lock and return lruvec for a given page.
* @page: the page
*
- * This series functions should be used in either conditions:
- * PageLRU is cleared or unset
- * or page->_refcount is zero
- * or page is locked.
+ * These functions are safe to use under any of the following conditions:
+ * - page locked
+ * - PageLRU cleared
+ * - lock_page_memcg()
+ * - page->_refcount is zero
*/
struct lruvec *lock_page_lruvec(struct page *page)
{
struct lruvec *lruvec;
struct pglist_data *pgdat = page_pgdat(page);
- rcu_read_lock();
lruvec = mem_cgroup_page_lruvec(page, pgdat);
spin_lock(&lruvec->lru_lock);
- rcu_read_unlock();
lruvec_memcg_debug(lruvec, page);
@@ -1371,10 +1347,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
struct lruvec *lruvec;
struct pglist_data *pgdat = page_pgdat(page);
- rcu_read_lock();
lruvec = mem_cgroup_page_lruvec(page, pgdat);
spin_lock_irq(&lruvec->lru_lock);
- rcu_read_unlock();
lruvec_memcg_debug(lruvec, page);
@@ -1386,10 +1360,8 @@ struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
struct lruvec *lruvec;
struct pglist_data *pgdat = page_pgdat(page);
- rcu_read_lock();
lruvec = mem_cgroup_page_lruvec(page, pgdat);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
- rcu_read_unlock();
lruvec_memcg_debug(lruvec, page);
@@ -1512,72 +1484,73 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
struct memory_stat {
const char *name;
- unsigned int ratio;
unsigned int idx;
};
-static struct memory_stat memory_stats[] = {
- { "anon", PAGE_SIZE, NR_ANON_MAPPED },
- { "file", PAGE_SIZE, NR_FILE_PAGES },
- { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
- { "pagetables", PAGE_SIZE, NR_PAGETABLE },
- { "percpu", 1, MEMCG_PERCPU_B },
- { "sock", PAGE_SIZE, MEMCG_SOCK },
- { "shmem", PAGE_SIZE, NR_SHMEM },
- { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
- { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
- { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+static const struct memory_stat memory_stats[] = {
+ { "anon", NR_ANON_MAPPED },
+ { "file", NR_FILE_PAGES },
+ { "kernel_stack", NR_KERNEL_STACK_KB },
+ { "pagetables", NR_PAGETABLE },
+ { "percpu", MEMCG_PERCPU_B },
+ { "sock", MEMCG_SOCK },
+ { "shmem", NR_SHMEM },
+ { "file_mapped", NR_FILE_MAPPED },
+ { "file_dirty", NR_FILE_DIRTY },
+ { "file_writeback", NR_WRITEBACK },
+#ifdef CONFIG_SWAP
+ { "swapcached", NR_SWAPCACHE },
+#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /*
- * The ratio will be initialized in memory_stats_init(). Because
- * on some architectures, the macro of HPAGE_PMD_SIZE is not
- * constant(e.g. powerpc).
- */
- { "anon_thp", 0, NR_ANON_THPS },
- { "file_thp", 0, NR_FILE_THPS },
- { "shmem_thp", 0, NR_SHMEM_THPS },
+ { "anon_thp", NR_ANON_THPS },
+ { "file_thp", NR_FILE_THPS },
+ { "shmem_thp", NR_SHMEM_THPS },
#endif
- { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
- { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
- { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
- { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
- { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
-
- /*
- * Note: The slab_reclaimable and slab_unreclaimable must be
- * together and slab_reclaimable must be in front.
- */
- { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
- { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+ { "inactive_anon", NR_INACTIVE_ANON },
+ { "active_anon", NR_ACTIVE_ANON },
+ { "inactive_file", NR_INACTIVE_FILE },
+ { "active_file", NR_ACTIVE_FILE },
+ { "unevictable", NR_UNEVICTABLE },
+ { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B },
+ { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B },
/* The memory events */
- { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
- { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
- { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
- { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
- { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
- { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
- { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+ { "workingset_refault_anon", WORKINGSET_REFAULT_ANON },
+ { "workingset_refault_file", WORKINGSET_REFAULT_FILE },
+ { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON },
+ { "workingset_activate_file", WORKINGSET_ACTIVATE_FILE },
+ { "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
+ { "workingset_restore_file", WORKINGSET_RESTORE_FILE },
+ { "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
};
-static int __init memory_stats_init(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memory_stats[i].idx == NR_ANON_THPS ||
- memory_stats[i].idx == NR_FILE_THPS ||
- memory_stats[i].idx == NR_SHMEM_THPS)
- memory_stats[i].ratio = HPAGE_PMD_SIZE;
-#endif
- VM_BUG_ON(!memory_stats[i].ratio);
- VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+/* Translate stat items to the correct unit for memory.stat output */
+static int memcg_page_state_unit(int item)
+{
+ switch (item) {
+ case MEMCG_PERCPU_B:
+ case NR_SLAB_RECLAIMABLE_B:
+ case NR_SLAB_UNRECLAIMABLE_B:
+ case WORKINGSET_REFAULT_ANON:
+ case WORKINGSET_REFAULT_FILE:
+ case WORKINGSET_ACTIVATE_ANON:
+ case WORKINGSET_ACTIVATE_FILE:
+ case WORKINGSET_RESTORE_ANON:
+ case WORKINGSET_RESTORE_FILE:
+ case WORKINGSET_NODERECLAIM:
+ return 1;
+ case NR_KERNEL_STACK_KB:
+ return SZ_1K;
+ default:
+ return PAGE_SIZE;
}
+}
- return 0;
+static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
+ int item)
+{
+ return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-pure_initcall(memory_stats_init);
static char *memory_stat_format(struct mem_cgroup *memcg)
{
@@ -1602,13 +1575,12 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
- size = memcg_page_state(memcg, memory_stats[i].idx);
- size *= memory_stats[i].ratio;
+ size = memcg_page_state_output(memcg, memory_stats[i].idx);
seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
- size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+ size += memcg_page_state_output(memcg,
+ NR_SLAB_RECLAIMABLE_B);
seq_buf_printf(&s, "slab %llu\n", size);
}
}
@@ -2935,9 +2907,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp)
+ gfp_t gfp, bool new_page)
{
unsigned int objects = objs_per_slab_page(s, page);
+ unsigned long memcg_data;
void *vec;
vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
@@ -2945,11 +2918,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
if (!vec)
return -ENOMEM;
- if (!set_page_objcgs(page, vec))
+ memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
+ if (new_page) {
+ /*
+ * If the slab page is brand new and nobody can yet access
+ * its memcg_data, no synchronization is required and
+ * memcg_data can be simply assigned.
+ */
+ page->memcg_data = memcg_data;
+ } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) {
+ /*
+ * If the slab page is already in use, somebody can allocate
+ * and assign obj_cgroups in parallel. In this case the existing
+ * objcg vector should be reused.
+ */
kfree(vec);
- else
- kmemleak_not_leak(vec);
+ return 0;
+ }
+ kmemleak_not_leak(vec);
return 0;
}
@@ -3077,8 +3064,8 @@ static void memcg_free_cache_id(int id)
*
* Returns 0 on success, an error code on failure.
*/
-int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned int nr_pages)
+static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned int nr_pages)
{
struct page_counter *counter;
int ret;
@@ -3110,7 +3097,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
* @memcg: memcg to uncharge
* @nr_pages: number of pages to uncharge
*/
-void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->kmem, nr_pages);
@@ -3300,24 +3287,21 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
- * Because page_memcg(head) is not set on compound tails, set it now.
+ * Because page_memcg(head) is not set on tails, set it now.
*/
-void mem_cgroup_split_huge_fixup(struct page *head)
+void split_page_memcg(struct page *head, unsigned int nr)
{
struct mem_cgroup *memcg = page_memcg(head);
int i;
- if (mem_cgroup_disabled())
+ if (mem_cgroup_disabled() || !memcg)
return;
- for (i = 1; i < HPAGE_PMD_NR; i++) {
- css_get(&memcg->css);
- head[i].memcg_data = (unsigned long)memcg;
- }
+ for (i = 1; i < nr; i++)
+ head[i].memcg_data = head->memcg_data;
+ css_get_many(&memcg->css, nr - 1);
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_MEMCG_SWAP
/**
@@ -4072,10 +4056,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state_local(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memcg1_stats[i] == NR_ANON_THPS)
- nr *= HPAGE_PMD_NR;
-#endif
seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
}
@@ -4106,10 +4086,6 @@ static int memcg_stat_show(struct seq_file *m, void *v)
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
nr = memcg_page_state(memcg, memcg1_stats[i]);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memcg1_stats[i] == NR_ANON_THPS)
- nr *= HPAGE_PMD_NR;
-#endif
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
(u64)nr * PAGE_SIZE);
}
@@ -4897,7 +4873,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
- ret = inode_permission(file_inode(cfile.file), MAY_READ);
+ ret = file_permission(cfile.file, MAY_READ);
if (ret < 0)
goto out_put_cfile;
@@ -5193,7 +5169,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
return 1;
}
- pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_cpu) {
free_percpu(pn->lruvec_stat_local);
@@ -5642,7 +5618,6 @@ static int mem_cgroup_move_account(struct page *page,
__mod_lruvec_state(to_vec, NR_ANON_THPS,
nr_pages);
}
-
}
} else {
__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
@@ -6393,6 +6368,12 @@ static int memory_stat_show(struct seq_file *m, void *v)
}
#ifdef CONFIG_NUMA
+static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
+ int item)
+{
+ return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
+}
+
static int memory_numa_stat_show(struct seq_file *m, void *v)
{
int i;
@@ -6410,8 +6391,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
struct lruvec *lruvec;
lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- size = lruvec_page_state(lruvec, memory_stats[i].idx);
- size *= memory_stats[i].ratio;
+ size = lruvec_page_state_output(lruvec,
+ memory_stats[i].idx);
seq_printf(m, " N%d=%llu", nid, size);
}
seq_putc(m, '\n');
@@ -6760,7 +6741,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
memcg_check_events(memcg, page);
local_irq_enable();
- if (PageSwapCache(page)) {
+ /*
+ * Cgroup1's unified memory+swap counter has been charged with the
+ * new swapcache page, finish the transfer by uncharging the swap
+ * slot. The swap slot would also get uncharged when it dies, but
+ * it can stick around indefinitely and we'd count the page twice
+ * the entire time.
+ *
+ * Cgroup2 has separate resource counters for memory and swap,
+ * so this is a non-issue here. Memory and swap charge lifetimes
+ * correspond 1:1 to page and swap slot lifetimes: we charge the
+ * page to memory here, and uncharge swap when the slot is freed.
+ */
+ if (do_memsw_account() && PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
@@ -6851,31 +6844,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
css_put(&ug->memcg->css);
}
-static void uncharge_list(struct list_head *page_list)
-{
- struct uncharge_gather ug;
- struct list_head *next;
-
- uncharge_gather_clear(&ug);
-
- /*
- * Note that the list can be a single page->lru; hence the
- * do-while loop instead of a simple list_for_each_entry().
- */
- next = page_list->next;
- do {
- struct page *page;
-
- page = list_entry(next, struct page, lru);
- next = page->lru.next;
-
- uncharge_page(page, &ug);
- } while (next != page_list);
-
- if (ug.memcg)
- uncharge_batch(&ug);
-}
-
/**
* mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
@@ -6907,11 +6875,17 @@ void mem_cgroup_uncharge(struct page *page)
*/
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
+ struct uncharge_gather ug;
+ struct page *page;
+
if (mem_cgroup_disabled())
return;
- if (!list_empty(page_list))
- uncharge_list(page_list);
+ uncharge_gather_clear(&ug);
+ list_for_each_entry(page, page_list, lru)
+ uncharge_page(page, &ug);
+ if (ug.memcg)
+ uncharge_batch(&ug);
}
/**
@@ -7078,6 +7052,14 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
+ /*
+ * The s32 type (see struct batched_lruvec_stat) is currently used for
+ * per-memcg-per-cpu caching of per-node statistics. For this to work, the
+ * overfill threshold must not exceed S32_MAX / PAGE_SIZE; check that at
+ * build time.
+ */
+ BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
+
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e9481632fcd1..24210c9bd843 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -243,9 +243,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
pfn, t->comm, t->pid);
if (flags & MF_ACTION_REQUIRED) {
- WARN_ON_ONCE(t != current);
- ret = force_sig_mceerr(BUS_MCEERR_AR,
+ if (t == current)
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
(void __user *)tk->addr, addr_lsb);
+ else
+ /* Signal other processes sharing the page if they have PF_MCE_EARLY set. */
+ ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
+ addr_lsb, t);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -440,26 +444,26 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
* Determine whether a given process is "early kill" process which expects
* to be signaled when some page under the process is hwpoisoned.
* Return task_struct of the dedicated thread (main thread unless explicitly
- * specified) if the process is "early kill," and otherwise returns NULL.
+ * specified) if the process is "early kill" and otherwise returns NULL.
*
- * Note that the above is true for Action Optional case, but not for Action
- * Required case where SIGBUS should sent only to the current thread.
+ * Note that the above is true for the Action Optional case. For the Action
+ * Required case, only the current thread needs to be signaled with SIGBUS;
+ * the error is Action Optional for other, non-current processes sharing the
+ * same error page. If such a process is "early kill", the task_struct of its
+ * dedicated thread is also returned.
*/
static struct task_struct *task_early_kill(struct task_struct *tsk,
int force_early)
{
if (!tsk->mm)
return NULL;
- if (force_early) {
- /*
- * Comparing ->mm here because current task might represent
- * a subthread, while tsk always points to the main thread.
- */
- if (tsk->mm == current->mm)
- return current;
- else
- return NULL;
- }
+ /*
+ * Comparing ->mm here because current task might represent
+ * a subthread, while tsk always points to the main thread.
+ */
+ if (force_early && tsk->mm == current->mm)
+ return current;
+
return find_early_kill_thread(tsk);
}
@@ -1308,6 +1312,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
*/
put_page(page);
+ /* device metadata space is not recoverable */
+ if (!pgmap_pfn_valid(pgmap, pfn)) {
+ rc = -ENXIO;
+ goto out;
+ }
+
/*
* Prevent the inode from being freed while we are interrogating
* the address_space, typically this would be handled by
diff --git a/mm/memory.c b/mm/memory.c
index c32318dc11d4..5efa07fb6cdc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -809,12 +809,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
struct page **prealloc, pte_t pte, struct page *page)
{
- struct mm_struct *src_mm = src_vma->vm_mm;
struct page *new_page;
- if (!is_cow_mapping(src_vma->vm_flags))
- return 1;
-
/*
* What we want to do is to check whether this page may
* have been pinned by the parent process. If so,
@@ -828,9 +824,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* the page count. That might give false positives for
* for pinning, but it will work correctly.
*/
- if (likely(!atomic_read(&src_mm->has_pinned)))
- return 1;
- if (likely(!page_maybe_dma_pinned(page)))
+ if (likely(!page_needs_cow_for_dma(src_vma, page)))
return 1;
new_page = *prealloc;
@@ -2177,11 +2171,11 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
spinlock_t *ptl;
int err = 0;
- pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
arch_enter_lazy_mmu_mode();
@@ -2195,7 +2189,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -2394,18 +2388,18 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
pte_fn_t fn, void *data, bool create,
pgtbl_mod_mask *mask)
{
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
int err = 0;
spinlock_t *ptl;
if (create) {
- pte = (mm == &init_mm) ?
+ mapped_pte = pte = (mm == &init_mm) ?
pte_alloc_kernel_track(pmd, addr, mask) :
pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
} else {
- pte = (mm == &init_mm) ?
+ mapped_pte = pte = (mm == &init_mm) ?
pte_offset_kernel(pmd, addr) :
pte_offset_map_lock(mm, pmd, addr, &ptl);
}
@@ -2428,7 +2422,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
arch_leave_lazy_mmu_mode();
if (mm != &init_mm)
- pte_unmap_unlock(pte-1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -2902,7 +2896,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
- entry = pte_sw_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
@@ -3104,6 +3097,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
return handle_userfault(vmf, VM_UFFD_WP);
}
+ /*
+ * Userfaultfd write-protect can defer flushes. Ensure the TLB
+ * is flushed in this case before copying.
+ */
+ if (unlikely(userfaultfd_wp(vmf->vma) &&
+ mm_tlb_flush_pending(vmf->vma->vm_mm)))
+ flush_tlb_page(vmf->vma, vmf->address);
+
vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
if (!vmf->page) {
/*
@@ -3560,7 +3561,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
__SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
- entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
@@ -3745,8 +3745,6 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
if (prefault && arch_wants_old_prefaulted_pte())
entry = pte_mkold(entry);
- else
- entry = pte_sw_mkyoung(entry);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -4798,28 +4796,68 @@ out:
return ret;
}
+/**
+ * generic_access_phys - generic implementation for iomem mmap access
+ * @vma: the vma to access
+ * @addr: userspace address, not relative offset within @vma
+ * @buf: buffer to read/write
+ * @len: length of transfer
+ * @write: set to FOLL_WRITE when writing, otherwise reading
+ *
+ * This is a generic implementation for &vm_operations_struct.access for an
+ * iomem mapping. This callback is used by access_process_vm() when the @vma is
+ * not page based.
+ */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
void *buf, int len, int write)
{
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
- int offset = addr & (PAGE_SIZE-1);
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ int offset = offset_in_page(addr);
+ int ret = -EINVAL;
- if (follow_phys(vma, addr, write, &prot, &phys_addr))
+ if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+ return -EINVAL;
+
+retry:
+ if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+ return -EINVAL;
+ pte = *ptep;
+ pte_unmap_unlock(ptep, ptl);
+
+ prot = pgprot_val(pte_pgprot(pte));
+ phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+
+ if ((write & FOLL_WRITE) && !pte_write(pte))
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (!maddr)
return -ENOMEM;
+ if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
+ goto out_unmap;
+
+ if (!pte_same(pte, *ptep)) {
+ pte_unmap_unlock(ptep, ptl);
+ iounmap(maddr);
+
+ goto retry;
+ }
+
if (write)
memcpy_toio(maddr + offset, buf, len);
else
memcpy_fromio(buf, maddr + offset, len);
+ ret = len;
+ pte_unmap_unlock(ptep, ptl);
+out_unmap:
iounmap(maddr);
- return len;
+ return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
@@ -5137,17 +5175,19 @@ long copy_huge_page_from_user(struct page *dst_page,
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+ struct page *subpage = dst_page;
- for (i = 0; i < pages_per_huge_page; i++) {
+ for (i = 0; i < pages_per_huge_page;
+ i++, subpage = mem_map_next(subpage, dst_page, i)) {
if (allow_pagefault)
- page_kaddr = kmap(dst_page + i);
+ page_kaddr = kmap(subpage);
else
- page_kaddr = kmap_atomic(dst_page + i);
+ page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
(const void __user *)(src + i * PAGE_SIZE),
PAGE_SIZE);
if (allow_pagefault)
- kunmap(dst_page + i);
+ kunmap(subpage);
else
kunmap_atomic(page_kaddr);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f9d57b9be8c7..0cdbbfbc5757 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -67,17 +67,17 @@ void put_online_mems(void)
bool movable_node_enabled = false;
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-int memhp_default_online_type = MMOP_OFFLINE;
+int mhp_default_online_type = MMOP_OFFLINE;
#else
-int memhp_default_online_type = MMOP_ONLINE;
+int mhp_default_online_type = MMOP_ONLINE;
#endif
static int __init setup_memhp_default_state(char *str)
{
- const int online_type = memhp_online_type_from_str(str);
+ const int online_type = mhp_online_type_from_str(str);
if (online_type >= 0)
- memhp_default_online_type = online_type;
+ mhp_default_online_type = online_type;
return 1;
}
@@ -107,6 +107,9 @@ static struct resource *register_memory_resource(u64 start, u64 size,
if (strcmp(resource_name, "System RAM"))
flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
+ if (!mhp_range_allowed(start, size, true))
+ return ERR_PTR(-E2BIG);
+
/*
* Make sure value parsed from 'mem=' only restricts memory adding
* while booting, so that memory hotplug won't be impacted. Please
@@ -284,21 +287,53 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
return 0;
}
-static int check_hotplug_memory_addressable(unsigned long pfn,
- unsigned long nr_pages)
+/*
+ * Return page for the valid pfn only if the page is online. All pfn
+ * walkers which rely on the fully initialized page->flags and others
+ * should use this rather than pfn_valid && pfn_to_page
+ */
+struct page *pfn_to_online_page(unsigned long pfn)
{
- const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+ unsigned long nr = pfn_to_section_nr(pfn);
+ struct dev_pagemap *pgmap;
+ struct mem_section *ms;
- if (max_addr >> MAX_PHYSMEM_BITS) {
- const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
- WARN(1,
- "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
- (u64)PFN_PHYS(pfn), max_addr, max_allowed);
- return -E2BIG;
- }
+ if (nr >= NR_MEM_SECTIONS)
+ return NULL;
- return 0;
+ ms = __nr_to_section(nr);
+ if (!online_section(ms))
+ return NULL;
+
+ /*
+ * Save some code text when online_section() +
+ * pfn_section_valid() are sufficient.
+ */
+ if (IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) && !pfn_valid(pfn))
+ return NULL;
+
+ if (!pfn_section_valid(ms, pfn))
+ return NULL;
+
+ if (!online_device_section(ms))
+ return pfn_to_page(pfn);
+
+ /*
+ * Slowpath: when ZONE_DEVICE collides with
+ * ZONE_{NORMAL,MOVABLE} within the same section some pfns in
+ * the section may be 'offline' but 'valid'. Only
+ * get_dev_pagemap() can determine sub-section online status.
+ */
+ pgmap = get_dev_pagemap(pfn, NULL);
+ put_dev_pagemap(pgmap);
+
+ /* The presence of a pgmap indicates ZONE_DEVICE offline pfn */
+ if (pgmap)
+ return NULL;
+
+ return pfn_to_page(pfn);
}
+EXPORT_SYMBOL_GPL(pfn_to_online_page);
/*
* Reasonably generic function for adding memory. It is
@@ -317,9 +352,7 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
if (WARN_ON_ONCE(!params->pgprot.pgprot))
return -EINVAL;
- err = check_hotplug_memory_addressable(pfn, nr_pages);
- if (err)
- return err;
+ VM_BUG_ON(!mhp_range_allowed(PFN_PHYS(pfn), nr_pages * PAGE_SIZE, false));
if (altmap) {
/*
@@ -445,20 +478,19 @@ static void update_pgdat_span(struct pglist_data *pgdat)
for (zone = pgdat->node_zones;
zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
- unsigned long zone_end_pfn = zone->zone_start_pfn +
- zone->spanned_pages;
+ unsigned long end_pfn = zone_end_pfn(zone);
/* No need to lock the zones, they can't change. */
if (!zone->spanned_pages)
continue;
if (!node_end_pfn) {
node_start_pfn = zone->zone_start_pfn;
- node_end_pfn = zone_end_pfn;
+ node_end_pfn = end_pfn;
continue;
}
- if (zone_end_pfn > node_end_pfn)
- node_end_pfn = zone_end_pfn;
+ if (end_pfn > node_end_pfn)
+ node_end_pfn = end_pfn;
if (zone->zone_start_pfn < node_start_pfn)
node_start_pfn = zone->zone_start_pfn;
}
@@ -678,6 +710,14 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
}
+
+static void section_taint_zone_device(unsigned long pfn)
+{
+ struct mem_section *ms = __pfn_to_section(pfn);
+
+ ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
+}
+
/*
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
@@ -708,12 +748,25 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
pgdat_resize_unlock(pgdat, &flags);
/*
+ * Subsection population requires care in pfn_to_online_page().
+ * Set the taint to enable the slow path detection of
+ * ZONE_DEVICE pages in an otherwise ZONE_{NORMAL,MOVABLE}
+ * section.
+ */
+ if (zone_is_zone_device(zone)) {
+ if (!IS_ALIGNED(start_pfn, PAGES_PER_SECTION))
+ section_taint_zone_device(start_pfn);
+ if (!IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))
+ section_taint_zone_device(start_pfn + nr_pages);
+ }
+
+ /*
* TODO now we have a visible range of pages which are not associated
* with their zone properly. Not nice but set_pfnblock_flags_mask
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
- memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+ memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0,
MEMINIT_HOTPLUG, altmap, migratetype);
set_zone_contiguous(zone);
@@ -1007,7 +1060,7 @@ static int check_hotplug_memory_range(u64 start, u64 size)
static int online_memory_block(struct memory_block *mem, void *arg)
{
- mem->online_type = memhp_default_online_type;
+ mem->online_type = mhp_default_online_type;
return device_online(&mem->dev);
}
@@ -1019,7 +1072,7 @@ static int online_memory_block(struct memory_block *mem, void *arg)
*/
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
- struct mhp_params params = { .pgprot = PAGE_KERNEL };
+ struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
u64 start, size;
bool new_node = false;
int ret;
@@ -1084,11 +1137,11 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
* In case we're allowed to merge the resource, flag it and trigger
* merging now that adding succeeded.
*/
- if (mhp_flags & MEMHP_MERGE_RESOURCE)
+ if (mhp_flags & MHP_MERGE_RESOURCE)
merge_system_ram_resource(res);
/* online pages if requested */
- if (memhp_default_online_type != MMOP_OFFLINE)
+ if (mhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
@@ -1180,6 +1233,61 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+/*
+ * Platforms should define arch_get_mappable_range(), which provides
+ * the maximum possible addressable physical memory range for which
+ * the linear mapping could be created. The address range returned by
+ * the platform must adhere to the following semantics.
+ *
+ * - range.start <= range.end
+ * - Range includes both end points [range.start..range.end]
+ *
+ * There is also a fallback definition provided here, allowing the
+ * entire possible physical address range in case any platform does
+ * not define arch_get_mappable_range().
+ */
+struct range __weak arch_get_mappable_range(void)
+{
+ struct range mhp_range = {
+ .start = 0UL,
+ .end = -1ULL,
+ };
+ return mhp_range;
+}
+
+struct range mhp_get_pluggable_range(bool need_mapping)
+{
+ const u64 max_phys = (1ULL << MAX_PHYSMEM_BITS) - 1;
+ struct range mhp_range;
+
+ if (need_mapping) {
+ mhp_range = arch_get_mappable_range();
+ if (mhp_range.start > max_phys) {
+ mhp_range.start = 0;
+ mhp_range.end = 0;
+ }
+ mhp_range.end = min_t(u64, mhp_range.end, max_phys);
+ } else {
+ mhp_range.start = 0;
+ mhp_range.end = max_phys;
+ }
+ return mhp_range;
+}
+EXPORT_SYMBOL_GPL(mhp_get_pluggable_range);
+
+bool mhp_range_allowed(u64 start, u64 size, bool need_mapping)
+{
+ struct range mhp_range = mhp_get_pluggable_range(need_mapping);
+ u64 end = start + size;
+
+ if (start < end && start >= mhp_range.start && (end - 1) <= mhp_range.end)
+ return true;
+
+ pr_warn("Hotplug memory [%#llx-%#llx] exceeds maximum addressable range [%#llx-%#llx]\n",
+ start, end, mhp_range.start, mhp_range.end);
+ return false;
+}
+
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* Confirm all pages in a range [start, end) belong to the same zone (skipping
@@ -1260,7 +1368,14 @@ static int scan_movable_pages(unsigned long start, unsigned long end,
if (!PageHuge(page))
continue;
head = compound_head(page);
- if (page_huge_active(head))
+ /*
+ * This test is racy as we hold no reference or lock. The
+ * hugetlb page could have been freed and head is no longer
+ * a hugetlb page before the following check. In such unlikely
+ * cases false positives and negatives are possible. Calling
+ * code must deal with these scenarios.
+ */
+ if (HPageMigratable(head))
goto found;
skip = compound_nr(head) - (page - head);
pfn += skip - 1;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2c3a86502053..ab51132547b8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -677,7 +677,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long flags = qp->flags;
/* range check first */
- VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+ VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);
if (!qp->first) {
qp->first = vma;
@@ -875,6 +875,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
+ if (flags & MPOL_F_NUMA_BALANCING) {
+ if (new && new->mode == MPOL_BIND) {
+ new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
+ } else {
+ ret = -EINVAL;
+ mpol_put(new);
+ goto out;
+ }
+ }
+
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
mpol_put(new);
@@ -2486,6 +2496,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
break;
case MPOL_BIND:
+ /* Optimize placement among multiple nodes via NUMA balancing */
+ if (pol->flags & MPOL_F_MORON) {
+ if (node_isset(thisnid, pol->v.nodes))
+ break;
+ goto out;
+ }
/*
* allows binding to multiple nodes.
diff --git a/mm/mempool.c b/mm/mempool.c
index 624ed51b060f..79959fac27d7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element)
static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_slab_free_mempool(element, _RET_IP_);
+ kasan_slab_free_mempool(element);
else if (pool->alloc == mempool_alloc_pages)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 16b2fb482da1..7aa7d6e80ee5 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -80,6 +80,21 @@ static unsigned long pfn_first(struct dev_pagemap *pgmap, int range_id)
return pfn + vmem_altmap_offset(pgmap_altmap(pgmap));
}
+bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
+{
+ int i;
+
+ for (i = 0; i < pgmap->nr_range; i++) {
+ struct range *range = &pgmap->ranges[i];
+
+ if (pfn >= PHYS_PFN(range->start) &&
+ pfn <= PHYS_PFN(range->end))
+ return pfn >= pfn_first(pgmap, i);
+ }
+
+ return false;
+}
+
static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
{
const struct range *range = &pgmap->ranges[range_id];
@@ -185,6 +200,7 @@ static void dev_pagemap_percpu_release(struct percpu_ref *ref)
static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
int range_id, int nid)
{
+ const bool is_private = pgmap->type == MEMORY_DEVICE_PRIVATE;
struct range *range = &pgmap->ranges[range_id];
struct dev_pagemap *conflict_pgmap;
int error, is_ram;
@@ -230,6 +246,11 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
if (error)
goto err_pfn_remap;
+ if (!mhp_range_allowed(range->start, range_len(range), !is_private)) {
+ error = -EINVAL;
+ goto err_pfn_remap;
+ }
+
mem_hotplug_begin();
/*
@@ -243,7 +264,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
* the CPU, we do want the linear mapping and thus use
* arch_add_memory().
*/
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ if (is_private) {
error = add_pages(nid, PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), params);
} else {
diff --git a/mm/migrate.c b/mm/migrate.c
index 20ca887ea769..62b81d5257aa 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -331,7 +331,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
if (!get_page_unless_zero(page))
goto out;
pte_unmap_unlock(ptep, ptl);
- put_and_wait_on_page_locked(page);
+ put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
return;
out:
pte_unmap_unlock(ptep, ptl);
@@ -365,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
if (!get_page_unless_zero(page))
goto unlock;
spin_unlock(ptl);
- put_and_wait_on_page_locked(page);
+ put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
return;
unlock:
spin_unlock(ptl);
@@ -500,6 +500,12 @@ int migrate_page_move_mapping(struct address_space *mapping,
__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
}
+#ifdef CONFIG_SWAP
+ if (PageSwapCache(page)) {
+ __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
+ }
+#endif
if (dirty && mapping_can_writeback(mapping)) {
__mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
diff --git a/mm/mincore.c b/mm/mincore.c
index 02db1a834021..9122676b54d6 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -166,8 +166,9 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
* for writing; otherwise we'd be including shared non-exclusive
* mappings, which opens a side channel.
*/
- return inode_owner_or_capable(file_inode(vma->vm_file)) ||
- inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+ return inode_owner_or_capable(&init_user_ns,
+ file_inode(vma->vm_file)) ||
+ file_permission(vma->vm_file, MAY_WRITE) == 0;
}
static const struct mm_walk_ops mincore_walk_ops = {
diff --git a/mm/mlock.c b/mm/mlock.c
index 55b3b3672977..f8f8cc32d03d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -278,8 +278,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
*/
if (TestClearPageLRU(page)) {
lruvec = relock_page_lruvec_irq(page, lruvec);
- del_page_from_lru_list(page, lruvec,
- page_lru(page));
+ del_page_from_lru_list(page, lruvec);
continue;
} else
__munlock_isolation_failed(page);
@@ -623,7 +622,7 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
vma = find_vma(mm, start);
if (vma == NULL)
- vma = mm->mmap;
+ return 0;
for (; vma ; vma = vma->vm_next) {
if (start >= vma->vm_end)
diff --git a/mm/mmap.c b/mm/mmap.c
index 90673febce6a..3f287599a7a3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -189,7 +189,6 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
struct list_head *uf);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
- unsigned long retval;
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
struct vm_area_struct *next;
@@ -281,9 +280,8 @@ success:
return brk;
out:
- retval = origbrk;
mmap_write_unlock(mm);
- return retval;
+ return origbrk;
}
static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 61ee40ed804e..459d195d2ff6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -501,10 +501,33 @@ static int mn_hlist_invalidate_range_start(
"");
WARN_ON(mmu_notifier_range_blockable(range) ||
_ret != -EAGAIN);
+ /*
+ * We call all the notifiers on any EAGAIN,
+ * there is no way for a notifier to know if
+ * its start method failed, thus a start that
+ * does EAGAIN can't also do end.
+ */
+ WARN_ON(ops->invalidate_range_end);
ret = _ret;
}
}
}
+
+ if (ret) {
+ /*
+ * Must be non-blocking to get here. If there are multiple
+ * notifiers and one or more failed start, any that succeeded
+ * start are expecting their end to be called. Do so now.
+ */
+ hlist_for_each_entry_rcu(subscription, &subscriptions->list,
+ hlist, srcu_read_lock_held(&srcu)) {
+ if (!subscription->ops->invalidate_range_end)
+ continue;
+
+ subscription->ops->invalidate_range_end(subscription,
+ range);
+ }
+ }
srcu_read_unlock(&srcu, id);
return ret;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ab709023e9aa..94188df1ee55 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -617,10 +617,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
if (tmp > end)
tmp = end;
- if (vma->vm_ops && vma->vm_ops->mprotect)
+ if (vma->vm_ops && vma->vm_ops->mprotect) {
error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
- if (error)
- goto out;
+ if (error)
+ goto out;
+ }
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
diff --git a/mm/mremap.c b/mm/mremap.c
index 47192691fe32..ec8f840399ed 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -593,6 +593,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
/* We always clear VM_LOCKED[ONFAULT] on the old vma */
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ /*
+ * anon_vma links of the old vma are no longer needed after its page
+ * table has been moved.
+ */
+ if (new_vma != vma && vma->vm_start == old_addr &&
+ vma->vm_end == (old_addr + old_len))
+ unlink_anon_vmas(vma);
+
/* Because we won't unmap we don't need to touch locked_vm */
return new_addr;
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c9a33ffe38b7..9efaf430cfd3 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -395,9 +395,8 @@ static int dump_task(struct task_struct *p, void *arg)
task = find_lock_task_mm(p);
if (!task) {
/*
- * This is a kthread or all of p's threads have already
- * detached their mm's. There's no need to report
- * them; they can't be oom killed anyway.
+ * All of p's threads have already detached their mm's. There's
+ * no need to report them; they can't be oom killed anyway.
*/
return 0;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eb34d204d4ee..9e35b636a393 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2833,6 +2833,22 @@ void wait_on_page_writeback(struct page *page)
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+/*
+ * Wait for a page to complete writeback. Returns -EINTR if we get a
+ * fatal signal while waiting.
+ */
+int wait_on_page_writeback_killable(struct page *page)
+{
+ while (PageWriteback(page)) {
+ trace_wait_on_page_writeback(page, page_mapping(page));
+ if (wait_on_page_bit_killable(page, PG_writeback))
+ return -EINTR;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
+
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
* @page: The page to wait on.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef5070fed76b..cfc72873961d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1282,6 +1282,12 @@ static __always_inline bool free_pages_prepare(struct page *page,
kernel_poison_pages(page, 1 << order);
/*
+ * With hardware tag-based KASAN, memory tags must be set before the
+ * page becomes unavailable via debug_pagealloc or arch_free_page.
+ */
+ kasan_free_nondeferred_pages(page, order);
+
+ /*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
* happen after this.
@@ -1290,8 +1296,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
debug_pagealloc_unmap_pages(page, 1 << order);
- kasan_free_nondeferred_pages(page, order);
-
return true;
}
@@ -2168,6 +2172,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
}
adjust_managed_page_count(page, pageblock_nr_pages);
+ page_zone(page)->cma_pages += pageblock_nr_pages;
}
#endif
@@ -3309,6 +3314,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, 1 << order);
+ split_page_memcg(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -5584,10 +5590,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_WRITEBACK)),
K(node_page_state(pgdat, NR_SHMEM)),
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
- K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
- * HPAGE_PMD_NR),
- K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
+ K(node_page_state(pgdat, NR_SHMEM_THPS)),
+ K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)),
+ K(node_page_state(pgdat, NR_ANON_THPS)),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
node_page_state(pgdat, NR_KERNEL_STACK_KB),
@@ -6122,7 +6127,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
* zone stats (e.g., nr_isolate_pageblock) are touched.
*/
-void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, unsigned long zone_end_pfn,
enum meminit_context context,
struct vmem_altmap *altmap, int migratetype)
@@ -6259,24 +6264,97 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
}
-void __meminit __weak memmap_init(unsigned long size, int nid,
- unsigned long zone,
- unsigned long range_start_pfn)
+#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+/*
+ * Only struct pages that correspond to ranges defined by memblock.memory
+ * are zeroed and initialized by going through __init_single_page() during
+ * memmap_init_zone().
+ *
+ * But, there could be struct pages that correspond to holes in
+ * memblock.memory. This can happen because of the following reasons:
+ * - physical memory bank size is not necessarily the exact multiple of the
+ * arbitrary section size
+ * - early reserved memory may not be listed in memblock.memory
+ * - memory layouts defined with memmap= kernel parameter may not align
+ * nicely with memmap sections
+ *
+ * Explicitly initialize those struct pages so that:
+ * - PG_Reserved is set
+ * - zone and node links point to zone and node that span the page if the
+ * hole is in the middle of a zone
+ * - zone and node links point to adjacent zone/node if the hole falls on
+ * the zone boundary; the pages in such holes will be prepended to the
+ * zone/node above the hole except for the trailing pages in the last
+ * section that will be appended to the zone/node below.
+ */
+static u64 __meminit init_unavailable_range(unsigned long spfn,
+ unsigned long epfn,
+ int zone, int node)
{
+ unsigned long pfn;
+ u64 pgcnt = 0;
+
+ for (pfn = spfn; pfn < epfn; pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
+ continue;
+ }
+ __init_single_page(pfn_to_page(pfn), pfn, zone, node);
+ __SetPageReserved(pfn_to_page(pfn));
+ pgcnt++;
+ }
+
+ return pgcnt;
+}
+#else
+static inline u64 init_unavailable_range(unsigned long spfn, unsigned long epfn,
+ int zone, int node)
+{
+ return 0;
+}
+#endif
+
+void __meminit __weak memmap_init_zone(struct zone *zone)
+{
+ unsigned long zone_start_pfn = zone->zone_start_pfn;
+ unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages;
+ int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone);
+ static unsigned long hole_pfn;
unsigned long start_pfn, end_pfn;
- unsigned long range_end_pfn = range_start_pfn + size;
- int i;
+ u64 pgcnt = 0;
for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
- start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
- end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+ start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn);
+ end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn);
- if (end_pfn > start_pfn) {
- size = end_pfn - start_pfn;
- memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn,
- MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
- }
+ if (end_pfn > start_pfn)
+ memmap_init_range(end_pfn - start_pfn, nid,
+ zone_id, start_pfn, zone_end_pfn,
+ MEMINIT_EARLY, NULL, MIGRATE_MOVABLE);
+
+ if (hole_pfn < start_pfn)
+ pgcnt += init_unavailable_range(hole_pfn, start_pfn,
+ zone_id, nid);
+ hole_pfn = end_pfn;
}
+
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * Initialize the hole in the range [zone_end_pfn, section_end].
+ * If zone boundary falls in the middle of a section, this hole
+ * will be re-initialized during the call to this function for the
+ * higher zone.
+ */
+ end_pfn = round_up(zone_end_pfn, PAGES_PER_SECTION);
+ if (hole_pfn < end_pfn)
+ pgcnt += init_unavailable_range(hole_pfn, end_pfn,
+ zone_id, nid);
+#endif
+
+ if (pgcnt)
+ pr_info(" %s zone: %llu pages in unavailable ranges\n",
+ zone->name, pgcnt);
}
static int zone_batchsize(struct zone *zone)
@@ -6768,25 +6846,22 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l
return usemapsize / 8;
}
-static void __ref setup_usemap(struct pglist_data *pgdat,
- struct zone *zone,
- unsigned long zone_start_pfn,
- unsigned long zonesize)
+static void __ref setup_usemap(struct zone *zone)
{
- unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
+ unsigned long usemapsize = usemap_size(zone->zone_start_pfn,
+ zone->spanned_pages);
zone->pageblock_flags = NULL;
if (usemapsize) {
zone->pageblock_flags =
memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
- pgdat->node_id);
+ zone_to_nid(zone));
if (!zone->pageblock_flags)
panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
- usemapsize, zone->name, pgdat->node_id);
+ usemapsize, zone->name, zone_to_nid(zone));
}
}
#else
-static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
- unsigned long zone_start_pfn, unsigned long zonesize) {}
+static inline void setup_usemap(struct zone *zone) {}
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -6933,7 +7008,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, freesize, memmap_pages;
- unsigned long zone_start_pfn = zone->zone_start_pfn;
size = zone->spanned_pages;
freesize = zone->present_pages;
@@ -6981,9 +7055,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat)
continue;
set_pageblock_order();
- setup_usemap(pgdat, zone, zone_start_pfn, size);
- init_currently_empty_zone(zone, zone_start_pfn, size);
- memmap_init(size, nid, j, zone_start_pfn);
+ setup_usemap(zone);
+ init_currently_empty_zone(zone, zone->zone_start_pfn, size);
+ memmap_init_zone(zone);
}
}
@@ -7077,88 +7151,6 @@ void __init free_area_init_memoryless_node(int nid)
free_area_init_node(nid);
}
-#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
-/*
- * Initialize all valid struct pages in the range [spfn, epfn) and mark them
- * PageReserved(). Return the number of struct pages that were initialized.
- */
-static u64 __init init_unavailable_range(unsigned long spfn, unsigned long epfn)
-{
- unsigned long pfn;
- u64 pgcnt = 0;
-
- for (pfn = spfn; pfn < epfn; pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
- continue;
- }
- /*
- * Use a fake node/zone (0) for now. Some of these pages
- * (in memblock.reserved but not in memblock.memory) will
- * get re-initialized via reserve_bootmem_region() later.
- */
- __init_single_page(pfn_to_page(pfn), pfn, 0, 0);
- __SetPageReserved(pfn_to_page(pfn));
- pgcnt++;
- }
-
- return pgcnt;
-}
-
-/*
- * Only struct pages that are backed by physical memory are zeroed and
- * initialized by going through __init_single_page(). But, there are some
- * struct pages which are reserved in memblock allocator and their fields
- * may be accessed (for example page_to_pfn() on some configuration accesses
- * flags). We must explicitly initialize those struct pages.
- *
- * This function also addresses a similar issue where struct pages are left
- * uninitialized because the physical address range is not covered by
- * memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=, or when the highest physical
- * address (max_pfn) does not end on a section boundary.
- */
-static void __init init_unavailable_mem(void)
-{
- phys_addr_t start, end;
- u64 i, pgcnt;
- phys_addr_t next = 0;
-
- /*
- * Loop through unavailable ranges not covered by memblock.memory.
- */
- pgcnt = 0;
- for_each_mem_range(i, &start, &end) {
- if (next < start)
- pgcnt += init_unavailable_range(PFN_DOWN(next),
- PFN_UP(start));
- next = end;
- }
-
- /*
- * Early sections always have a fully populated memmap for the whole
- * section - see pfn_valid(). If the last section has holes at the
- * end and that section is marked "online", the memmap will be
- * considered initialized. Make sure that memmap has a well defined
- * state.
- */
- pgcnt += init_unavailable_range(PFN_DOWN(next),
- round_up(max_pfn, PAGES_PER_SECTION));
-
- /*
- * Struct pages that do not have backing memory. This could be because
- * firmware is using some of this memory, or for some other reasons.
- */
- if (pgcnt)
- pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
-}
-#else
-static inline void __init init_unavailable_mem(void)
-{
-}
-#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
-
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
@@ -7582,7 +7574,6 @@ void __init free_area_init(unsigned long *max_zone_pfn)
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
- init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid);
@@ -7698,17 +7689,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
return pages;
}
-#ifdef CONFIG_HIGHMEM
-void free_highmem_page(struct page *page)
-{
- __free_reserved_page(page);
- totalram_pages_inc();
- atomic_long_inc(&page_zone(page)->managed_pages);
- totalhigh_pages_inc();
-}
-#endif
-
-
void __init mem_init_print_info(const char *str)
{
unsigned long physpages, codesize, datasize, rosize, bss_size;
diff --git a/mm/page_io.c b/mm/page_io.c
index 92f7941c6d01..c493ce9ebcf5 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -41,9 +41,9 @@ void end_swap_bio_write(struct bio *bio)
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
*/
set_page_dirty(page);
- pr_alert("Write-error on swap-device (%u:%u:%llu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
ClearPageReclaim(page);
}
end_page_writeback(page);
@@ -106,9 +106,9 @@ static void end_swap_bio_read(struct bio *bio)
if (bio->bi_status) {
SetPageError(page);
ClearPageUptodate(page);
- pr_alert("Read-error on swap-device (%u:%u:%llu)\n",
- MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
- (unsigned long long)bio->bi_iter.bi_sector);
+ pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n",
+ MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+ (unsigned long long)bio->bi_iter.bi_sector);
goto out;
}
@@ -254,11 +254,6 @@ out:
return ret;
}
-static sector_t swap_page_sector(struct page *page)
-{
- return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
-}
-
static inline void count_swpout_vm_event(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/page_owner.c b/mm/page_owner.c
index af464bb7fbe7..d15c7c4994f5 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -263,8 +263,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
struct page *page;
struct page_ext *page_ext;
struct page_owner *page_owner;
- unsigned long pfn = zone->zone_start_pfn, block_end_pfn;
- unsigned long end_pfn = pfn + zone->spanned_pages;
+ unsigned long pfn, block_end_pfn;
+ unsigned long end_pfn = zone_end_pfn(zone);
unsigned long count[MIGRATE_TYPES] = { 0, };
int pageblock_mt, page_mt;
int i;
diff --git a/mm/page_reporting.c b/mm/page_reporting.c
index cd8e13d41df4..c50d93ffa252 100644
--- a/mm/page_reporting.c
+++ b/mm/page_reporting.c
@@ -211,7 +211,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone,
}
/* Rotate any leftover pages to the head of the freelist */
- if (&next->lru != list && !list_is_first(&next->lru, list))
+ if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list))
list_rotate_to_front(&next->lru, list);
spin_unlock_irq(&zone->lock);
diff --git a/mm/percpu.c b/mm/percpu.c
index ad7a37ee74ef..6596a0a4286e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -69,6 +69,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/bitmap.h>
+#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
@@ -2662,13 +2663,14 @@ early_param("percpu_alloc", percpu_alloc_setup);
* On success, pointer to the new allocation_info is returned. On
* failure, ERR_PTR value is returned.
*/
-static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
+ static struct cpumask mask __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
@@ -2681,6 +2683,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
/* this function may be called multiple times */
memset(group_map, 0, sizeof(group_map));
memset(group_cnt, 0, sizeof(group_cnt));
+ cpumask_clear(&mask);
/* calculate size_sum and ensure dyn_size is enough for early alloc */
size_sum = PFN_ALIGN(static_size + reserved_size +
@@ -2702,24 +2705,27 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
upa--;
max_upa = upa;
+ cpumask_copy(&mask, cpu_possible_mask);
+
/* group cpus according to their proximity */
- for_each_possible_cpu(cpu) {
- group = 0;
- next_group:
- for_each_possible_cpu(tcpu) {
- if (cpu == tcpu)
- break;
- if (group_map[tcpu] == group && cpu_distance_fn &&
- (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
- cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
- group++;
- nr_groups = max(nr_groups, group + 1);
- goto next_group;
- }
- }
+ for (group = 0; !cpumask_empty(&mask); group++) {
+ /* pop the group's first cpu */
+ cpu = cpumask_first(&mask);
group_map[cpu] = group;
group_cnt[group]++;
+ cpumask_clear_cpu(cpu, &mask);
+
+ for_each_cpu(tcpu, &mask) {
+ if (!cpu_distance_fn ||
+ (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
+ cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
+ group_map[tcpu] = group;
+ group_cnt[group]++;
+ cpumask_clear_cpu(tcpu, &mask);
+ }
+ }
}
+ nr_groups = group;
/*
* Wasted space is caused by a ratio imbalance of upa to group_cnt.
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 9578db83e312..c2210e1cdb51 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -135,8 +135,9 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ VM_BUG_ON(!pmd_present(*pmdp));
+ /* Below assumes pmd_present() is true */
+ VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
diff --git a/mm/rmap.c b/mm/rmap.c
index 08c56aaf72eb..b0fc27e77d6d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -168,7 +168,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
*
* Anon-vma allocations are very subtle, because we may have
* optimistically looked up an anon_vma in page_lock_anon_vma_read()
- * and that may actually touch the spinlock even in the newly
+ * and that may actually touch the rwsem even in the newly
* allocated vma (it depends on RCU to make sure that the
* anon_vma isn't actually destroyed).
*
@@ -359,7 +359,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
goto out_error_free_anon_vma;
/*
- * The root anon_vma's spinlock is the lock actually used when we
+ * The root anon_vma's rwsem is the lock actually used when we
* lock any of the anon_vmas in this anon_vma tree.
*/
anon_vma->root = pvma->anon_vma->root;
@@ -413,8 +413,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
- if (vma->anon_vma)
+ if (vma->anon_vma) {
vma->anon_vma->degree--;
+
+ /*
+ * The vma would still be needed after unlink, and its anon_vma will be
+ * prepared when a fault is handled.
+ */
+ vma->anon_vma = NULL;
+ }
unlock_anon_vma_root(root);
/*
@@ -455,8 +462,8 @@ void __init anon_vma_init(void)
* Getting a lock on a stable anon_vma from a page off the LRU is tricky!
*
* Since there is no serialization what so ever against page_remove_rmap()
- * the best this function can do is return a locked anon_vma that might
- * have been relevant to this page.
+ * the best this function can do is return a refcount-increased anon_vma
+ * that might have been relevant to this page.
*
* The page might have been remapped to a different anon_vma or the anon_vma
* returned may already be freed (and even reused).
@@ -1079,8 +1086,7 @@ static void __page_check_anon_rmap(struct page *page,
* be set up correctly at this point.
*
* We have exclusion against page_add_anon_rmap because the caller
- * always holds the page locked, except if called from page_dup_rmap,
- * in which case the page is already known to be setup.
+ * always holds the page locked.
*
* We have exclusion against page_add_new_anon_rmap because those pages
* are initially only visible via the pagetables, and the pte is locked
@@ -1144,7 +1150,7 @@ void do_page_add_anon_rmap(struct page *page,
* disabled.
*/
if (compound)
- __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
@@ -1186,7 +1192,7 @@ void page_add_new_anon_rmap(struct page *page,
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
- __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, nr);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1211,16 +1217,20 @@ void page_add_file_rmap(struct page *page, bool compound)
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ int nr_pages = thp_nr_pages(page);
+
+ for (i = 0, nr = 0; i < nr_pages; i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
if (PageSwapBacked(page))
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
+ nr_pages);
else
- __inc_node_page_state(page, NR_FILE_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+ nr_pages);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1252,16 +1262,20 @@ static void page_remove_file_rmap(struct page *page, bool compound)
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
+ int nr_pages = thp_nr_pages(page);
+
+ for (i = 0, nr = 0; i < nr_pages; i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
return;
if (PageSwapBacked(page))
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED,
+ -nr_pages);
else
- __dec_node_page_state(page, NR_FILE_PMDMAPPED);
+ __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED,
+ -nr_pages);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
return;
@@ -1292,7 +1306,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page));
if (TestClearPageDoubleMap(page)) {
/*
@@ -1722,9 +1736,9 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
return vma_is_temporary_stack(vma);
}
-static int page_mapcount_is_zero(struct page *page)
+static int page_not_mapped(struct page *page)
{
- return !total_mapcount(page);
+ return !page_mapped(page);
}
/**
@@ -1742,7 +1756,7 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
- .done = page_mapcount_is_zero,
+ .done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1766,11 +1780,6 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags)
return !page_mapcount(page) ? true : false;
}
-static int page_not_mapped(struct page *page)
-{
- return !page_mapped(page);
-};
-
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
diff --git a/mm/shmem.c b/mm/shmem.c
index 1b254fbfdf52..b2db4ed0fbc7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -713,7 +713,7 @@ next:
}
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
- __inc_lruvec_page_state(page, NR_SHMEM_THPS);
+ __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
}
mapping->nrpages += nr;
__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
@@ -842,7 +842,6 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
void shmem_unlock_mapping(struct address_space *mapping)
{
struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index = 0;
pagevec_init(&pvec);
@@ -850,16 +849,8 @@ void shmem_unlock_mapping(struct address_space *mapping)
* Minor point, but we might as well stop if someone else SHM_LOCKs it.
*/
while (!mapping_unevictable(mapping)) {
- /*
- * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
- * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
- */
- pvec.nr = find_get_entries(mapping, index,
- PAGEVEC_SIZE, pvec.pages, indices);
- if (!pvec.nr)
+ if (!pagevec_lookup(&pvec, mapping, &index))
break;
- index = indices[pvec.nr - 1] + 1;
- pagevec_remove_exceptionals(&pvec);
check_move_unevictable_pages(&pvec);
pagevec_release(&pvec);
cond_resched();
@@ -916,18 +907,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
pagevec_init(&pvec);
index = start;
- while (index < end) {
- pvec.nr = find_get_entries(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
- if (!pvec.nr)
- break;
+ while (index < end && find_lock_entries(mapping, index, end - 1,
+ &pvec, indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
index = indices[i];
- if (index >= end)
- break;
if (xa_is_value(page)) {
if (unfalloc)
@@ -936,18 +921,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index, page);
continue;
}
+ index += thp_nr_pages(page) - 1;
- VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
-
- if (!trylock_page(page))
- continue;
-
- if ((!unfalloc || !PageUptodate(page)) &&
- page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- if (shmem_punch_compound(page, start, end))
- truncate_inode_page(mapping, page);
- }
+ if (!unfalloc || !PageUptodate(page))
+ truncate_inode_page(mapping, page);
unlock_page(page);
}
pagevec_remove_exceptionals(&pvec);
@@ -988,10 +965,8 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
while (index < end) {
cond_resched();
- pvec.nr = find_get_entries(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
- if (!pvec.nr) {
+ if (!find_get_entries(mapping, index, end - 1, &pvec,
+ indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
break;
@@ -1003,9 +978,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
struct page *page = pvec.pages[i];
index = indices[i];
- if (index >= end)
- break;
-
if (xa_is_value(page)) {
if (unfalloc)
continue;
@@ -1060,7 +1032,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
-static int shmem_getattr(const struct path *path, struct kstat *stat,
+static int shmem_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = path->dentry->d_inode;
@@ -1072,7 +1045,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
}
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
if (is_huge_enabled(sb_info))
stat->blksize = HPAGE_PMD_SIZE;
@@ -1080,14 +1053,15 @@ static int shmem_getattr(const struct path *path, struct kstat *stat,
return 0;
}
-static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
int error;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
@@ -1141,9 +1115,9 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(inode, inode->i_mode);
+ error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
return error;
}
@@ -1531,6 +1505,30 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
return page;
}
+/*
+ * Make sure huge_gfp is always more limited than limit_gfp.
+ * Some of the flags set permissions, while others set limitations.
+ */
+static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
+{
+ gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
+ gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
+ gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
+
+ /* Allow allocations only from the originally specified zones. */
+ result |= zoneflags;
+
+ /*
+ * Minimize the result gfp by taking the union with the deny flags,
+ * and the intersection of the allow flags.
+ */
+ result |= (limit_gfp & denyflags);
+ result |= (huge_gfp & limit_gfp) & allowflags;
+
+ return result;
+}
+
static struct page *shmem_alloc_hugepage(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
@@ -1545,8 +1543,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex);
- page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
- HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+ page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+ true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -1802,6 +1800,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
+ gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
@@ -1819,7 +1818,8 @@ repeat:
sbinfo = SHMEM_SB(inode->i_sb);
charge_mm = vma ? vma->vm_mm : current->mm;
- page = find_lock_entry(mapping, index);
+ page = pagecache_get_page(mapping, index,
+ FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
if (xa_is_value(page)) {
error = shmem_swapin_page(inode, index, &page,
sgp, gfp, vma, fault_type);
@@ -1887,7 +1887,9 @@ repeat:
}
alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+ huge_gfp = vma_thp_gfp_mask(vma);
+ huge_gfp = limit_gfp_mask(huge_gfp, gfp);
+ page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
alloc_nohuge:
page = shmem_alloc_and_acct_page(gfp, inode,
@@ -2303,7 +2305,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode = new_inode(sb);
if (inode) {
inode->i_ino = ino;
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_generation = prandom_u32();
@@ -2674,86 +2676,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the page cache.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
- pgoff_t index, pgoff_t end, int whence)
-{
- struct page *page;
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
- bool done = false;
- int i;
-
- pagevec_init(&pvec);
- pvec.nr = 1; /* start small: we may be there already */
- while (!done) {
- pvec.nr = find_get_entries(mapping, index,
- pvec.nr, pvec.pages, indices);
- if (!pvec.nr) {
- if (whence == SEEK_DATA)
- index = end;
- break;
- }
- for (i = 0; i < pvec.nr; i++, index++) {
- if (index < indices[i]) {
- if (whence == SEEK_HOLE) {
- done = true;
- break;
- }
- index = indices[i];
- }
- page = pvec.pages[i];
- if (page && !xa_is_value(page)) {
- if (!PageUptodate(page))
- page = NULL;
- }
- if (index >= end ||
- (page && whence == SEEK_DATA) ||
- (!page && whence == SEEK_HOLE)) {
- done = true;
- break;
- }
- }
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- pvec.nr = PAGEVEC_SIZE;
- cond_resched();
- }
- return index;
-}
-
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- pgoff_t start, end;
- loff_t new_offset;
if (whence != SEEK_DATA && whence != SEEK_HOLE)
return generic_file_llseek_size(file, offset, whence,
MAX_LFS_FILESIZE, i_size_read(inode));
+ if (offset < 0)
+ return -ENXIO;
+
inode_lock(inode);
/* We're holding i_mutex so we can access i_size directly */
-
- if (offset < 0 || offset >= inode->i_size)
- offset = -ENXIO;
- else {
- start = offset >> PAGE_SHIFT;
- end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- new_offset = shmem_seek_hole_data(mapping, start, end, whence);
- new_offset <<= PAGE_SHIFT;
- if (new_offset > offset) {
- if (new_offset < inode->i_size)
- offset = new_offset;
- else if (whence == SEEK_DATA)
- offset = -ENXIO;
- else
- offset = inode->i_size;
- }
- }
-
+ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
if (offset >= 0)
offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
inode_unlock(inode);
@@ -2917,7 +2853,8 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
* File creation. Allocate an inode, and we're done..
*/
static int
-shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
@@ -2946,7 +2883,8 @@ out_iput:
}
static int
-shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
struct inode *inode;
int error = -ENOSPC;
@@ -2969,20 +2907,22 @@ out_iput:
return error;
}
-static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
- if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+ if ((error = shmem_mknod(&init_user_ns, dir, dentry,
+ mode | S_IFDIR, 0)))
return error;
inc_nlink(dir);
return 0;
}
-static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
- return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+ return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
}
/*
@@ -3062,7 +3002,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru
return 0;
}
-static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+static int shmem_whiteout(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry)
{
struct dentry *whiteout;
int error;
@@ -3071,7 +3012,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
if (!whiteout)
return -ENOMEM;
- error = shmem_mknod(old_dir, whiteout,
+ error = shmem_mknod(&init_user_ns, old_dir, whiteout,
S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
dput(whiteout);
if (error)
@@ -3094,7 +3035,10 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
* it exists so that the VFS layer correctly free's it when it
* gets overwritten.
*/
-static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+static int shmem_rename2(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
@@ -3111,7 +3055,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
if (flags & RENAME_WHITEOUT) {
int error;
- error = shmem_whiteout(old_dir, old_dentry);
+ error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
if (error)
return error;
}
@@ -3135,7 +3079,8 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
return 0;
}
-static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
int error;
int len;
@@ -3273,6 +3218,7 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
}
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
diff --git a/mm/slab.c b/mm/slab.c
index dcc55e78f353..ae651bf540b7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -100,6 +100,7 @@
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
+#include <linux/kfence.h>
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
@@ -272,7 +273,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define STATS_DEC_ACTIVE(x) ((x)->num_active--)
#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
#define STATS_INC_GROWN(x) ((x)->grown++)
-#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y))
+#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y))
#define STATS_SET_HIGH(x) \
do { \
if ((x)->num_active > (x)->high_mark) \
@@ -296,7 +297,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent)
#define STATS_DEC_ACTIVE(x) do { } while (0)
#define STATS_INC_ALLOCED(x) do { } while (0)
#define STATS_INC_GROWN(x) do { } while (0)
-#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
+#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0)
#define STATS_SET_HIGH(x) do { } while (0)
#define STATS_INC_ERR(x) do { } while (0)
#define STATS_INC_NODEALLOCS(x) do { } while (0)
@@ -332,7 +333,7 @@ static int obj_offset(struct kmem_cache *cachep)
static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
{
BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
- return (unsigned long long*) (objp + obj_offset(cachep) -
+ return (unsigned long long *) (objp + obj_offset(cachep) -
sizeof(unsigned long long));
}
@@ -580,7 +581,7 @@ static int transfer_objects(struct array_cache *to,
if (!nr)
return 0;
- memcpy(to->entry + to->avail, from->entry + from->avail -nr,
+ memcpy(to->entry + to->avail, from->entry + from->avail - nr,
sizeof(void *) *nr);
from->avail -= nr;
@@ -1379,7 +1380,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- account_slab_page(page, cachep->gfporder, cachep);
+ account_slab_page(page, cachep->gfporder, cachep, flags);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
@@ -1790,8 +1791,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
}
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -2738,7 +2738,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
#else
#define kfree_debugcheck(x) do { } while(0)
-#define cache_free_debugcheck(x,objp,z) (objp)
+#define cache_free_debugcheck(x, objp, z) (objp)
#endif
static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
@@ -2992,7 +2992,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
{
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
- if (!objp)
+ if (!objp || is_kfence_address(objp))
return objp;
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
@@ -3025,7 +3025,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
return objp;
}
#else
-#define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
+#define cache_alloc_debugcheck_after(a, b, objp, d) (objp)
#endif
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
@@ -3209,7 +3209,7 @@ must_grow:
}
static __always_inline void *
-slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
+slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size,
unsigned long caller)
{
unsigned long save_flags;
@@ -3222,6 +3222,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (unlikely(!cachep))
return NULL;
+ ptr = kfence_alloc(cachep, orig_size, flags);
+ if (unlikely(ptr))
+ goto out_hooks;
+
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
@@ -3254,6 +3258,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
memset(ptr, 0, cachep->object_size);
+out_hooks:
slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
return ptr;
}
@@ -3291,7 +3296,7 @@ __do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
#endif /* CONFIG_NUMA */
static __always_inline void *
-slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
+slab_alloc(struct kmem_cache *cachep, gfp_t flags, size_t orig_size, unsigned long caller)
{
unsigned long save_flags;
void *objp;
@@ -3302,6 +3307,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
if (unlikely(!cachep))
return NULL;
+ objp = kfence_alloc(cachep, orig_size, flags);
+ if (unlikely(objp))
+ goto out;
+
cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
@@ -3312,6 +3321,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
memset(objp, 0, cachep->object_size);
+out:
slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
return objp;
}
@@ -3417,11 +3427,17 @@ free_done:
static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
+ if (is_kfence_address(objp)) {
+ kmemleak_free_recursive(objp, cachep->flags);
+ __kfence_free(objp);
+ return;
+ }
+
if (unlikely(slab_want_init_on_free(cachep)))
memset(objp, 0, cachep->object_size);
/* Put the object into the quarantine, don't touch it for now. */
- if (kasan_slab_free(cachep, objp, _RET_IP_))
+ if (kasan_slab_free(cachep, objp))
return;
/* Use KCSAN to help debug racy use-after-free. */
@@ -3483,7 +3499,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
- void *ret = slab_alloc(cachep, flags, _RET_IP_);
+ void *ret = slab_alloc(cachep, flags, cachep->object_size, _RET_IP_);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3516,7 +3532,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
local_irq_disable();
for (i = 0; i < size; i++) {
- void *objp = __do_cache_alloc(s, flags);
+ void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
if (unlikely(!objp))
goto error;
@@ -3549,7 +3565,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
{
void *ret;
- ret = slab_alloc(cachep, flags, _RET_IP_);
+ ret = slab_alloc(cachep, flags, size, _RET_IP_);
ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
@@ -3575,7 +3591,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
- void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
@@ -3593,7 +3609,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
{
void *ret;
- ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
+ ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc_node(_RET_IP_, ret,
@@ -3674,7 +3690,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
cachep = kmalloc_slab(size, flags);
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
- ret = slab_alloc(cachep, flags, caller);
+ ret = slab_alloc(cachep, flags, size, caller);
ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(caller, ret,
@@ -3717,7 +3733,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
__cache_free(cachep, objp, _RET_IP_);
local_irq_restore(flags);
- trace_kmem_cache_free(_RET_IP_, objp);
+ trace_kmem_cache_free(_RET_IP_, objp, cachep->name);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -4173,7 +4189,10 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
BUG_ON(objnr >= cachep->num);
/* Find offset within object. */
- offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
+ if (is_kfence_address(ptr))
+ offset = ptr - kfence_object_start(ptr);
+ else
+ offset = ptr - index_to_obj(cachep, page, objnr) - obj_offset(cachep);
/* Allow address range falling entirely within usercopy region. */
if (offset >= cachep->useroffset &&
diff --git a/mm/slab.h b/mm/slab.h
index ecad9b57bc44..076582f58f68 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -110,8 +110,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *));
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *));
+ slab_flags_t flags, const char *name);
#else
static inline struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
@@ -119,8 +118,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
{ return NULL; }
static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -240,7 +238,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
- gfp_t gfp);
+ gfp_t gfp, bool new_page);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
@@ -317,7 +315,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
page = virt_to_head_page(p[i]);
if (!page_objcgs(page) &&
- memcg_alloc_page_obj_cgroups(page, s, flags)) {
+ memcg_alloc_page_obj_cgroups(page, s, flags,
+ false)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
@@ -381,7 +380,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
}
static inline int memcg_alloc_page_obj_cgroups(struct page *page,
- struct kmem_cache *s, gfp_t gfp)
+ struct kmem_cache *s, gfp_t gfp,
+ bool new_page)
{
return 0;
}
@@ -422,8 +422,12 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
}
static __always_inline void account_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+ struct kmem_cache *s,
+ gfp_t gfp)
{
+ if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
+ memcg_alloc_page_obj_cgroups(page, s, gfp, true);
+
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
PAGE_SIZE << order);
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index adbace4256ef..88e833986332 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -12,6 +12,7 @@
#include <linux/memory.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/kfence.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/uaccess.h>
@@ -197,7 +198,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
size = ALIGN(size, sizeof(void *));
align = calculate_alignment(flags, align, size);
size = ALIGN(size, align);
- flags = kmem_cache_flags(size, flags, name, NULL);
+ flags = kmem_cache_flags(size, flags, name);
if (flags & SLAB_NEVER_MERGE)
return NULL;
@@ -309,9 +310,6 @@ kmem_cache_create_usercopy(const char *name,
const char *cache_name;
int err;
- get_online_cpus();
- get_online_mems();
-
mutex_lock(&slab_mutex);
err = kmem_cache_sanity_check(name, size);
@@ -360,9 +358,6 @@ kmem_cache_create_usercopy(const char *name,
out_unlock:
mutex_unlock(&slab_mutex);
- put_online_mems();
- put_online_cpus();
-
if (err) {
if (flags & SLAB_PANIC)
panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
@@ -436,6 +431,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
rcu_barrier();
list_for_each_entry_safe(s, s2, &to_destroy, list) {
+ kfence_shutdown_cache(s);
#ifdef SLAB_SUPPORTS_SYSFS
sysfs_slab_release(s);
#else
@@ -461,6 +457,7 @@ static int shutdown_cache(struct kmem_cache *s)
list_add_tail(&s->list, &slab_caches_to_rcu_destroy);
schedule_work(&slab_caches_to_rcu_destroy_work);
} else {
+ kfence_shutdown_cache(s);
#ifdef SLAB_SUPPORTS_SYSFS
sysfs_slab_unlink(s);
sysfs_slab_release(s);
@@ -486,9 +483,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (unlikely(!s))
return;
- get_online_cpus();
- get_online_mems();
-
mutex_lock(&slab_mutex);
s->refcount--;
@@ -503,9 +497,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
}
out_unlock:
mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -522,12 +513,10 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret;
- get_online_cpus();
- get_online_mems();
+
kasan_cache_shrink(cachep);
ret = __kmem_cache_shrink(cachep);
- put_online_mems();
- put_online_cpus();
+
return ret;
}
EXPORT_SYMBOL(kmem_cache_shrink);
@@ -654,6 +643,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
panic("Out of memory when creating slab %s\n", name);
create_boot_cache(s, name, size, flags, useroffset, usersize);
+ kasan_cache_create_kmalloc(s);
list_add(&s->list, &slab_caches);
s->refcount = 1;
return s;
@@ -912,8 +902,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
page = alloc_pages(flags, order);
if (likely(page)) {
ret = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
@@ -1146,16 +1136,27 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
void *ret;
size_t ks;
- ks = ksize(p);
+ /* Don't use instrumented ksize to allow precise KASAN poisoning. */
+ if (likely(!ZERO_OR_NULL_PTR(p))) {
+ if (!kasan_check_byte(p))
+ return NULL;
+ ks = kfence_ksize(p) ?: __ksize(p);
+ } else
+ ks = 0;
+ /* If the object still fits, repoison it precisely. */
if (ks >= new_size) {
p = kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}
ret = kmalloc_track_caller(new_size, flags);
- if (ret && p)
- memcpy(ret, p, ks);
+ if (ret && p) {
+ /* Disable KASAN checks as the object's redzone is accessed. */
+ kasan_disable_current();
+ memcpy(ret, kasan_reset_tag(p), ks);
+ kasan_enable_current();
+ }
return ret;
}
@@ -1232,22 +1233,24 @@ size_t ksize(const void *objp)
size_t size;
/*
- * We need to check that the pointed to object is valid, and only then
- * unpoison the shadow memory below. We use __kasan_check_read(), to
- * generate a more useful report at the time ksize() is called (rather
- * than later where behaviour is undefined due to potential
- * use-after-free or double-free).
+ * We need to first check that the pointer to the object is valid, and
+ * only then unpoison the memory. The report printed from ksize() is
+ * more useful than when it's printed later, when the behaviour could
+ * be undefined due to a potential use-after-free or double-free.
+ *
+ * We use kasan_check_byte(), which is supported for the hardware
+ * tag-based KASAN mode, unlike kasan_check_read/write().
*
- * If the pointed to memory is invalid we return 0, to avoid users of
+ * If the pointed to memory is invalid, we return 0 to avoid users of
* ksize() writing to and potentially corrupting the memory region.
*
* We want to perform the check before __ksize(), to avoid potentially
* crashing in __ksize() due to accessing invalid metadata.
*/
- if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
return 0;
- size = __ksize(objp);
+ size = kfence_ksize(objp) ?: __ksize(objp);
/*
* We assume that ksize callers could use whole allocated area,
* so we need to unpoison this area.
diff --git a/mm/slob.c b/mm/slob.c
index ef87ada8705d..0578429b991b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -673,7 +673,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
__kmem_cache_free(b, c->size);
}
- trace_kmem_cache_free(_RET_IP_, b);
+ trace_kmem_cache_free(_RET_IP_, b, c->name);
}
EXPORT_SYMBOL(kmem_cache_free);
diff --git a/mm/slub.c b/mm/slub.c
index f5baf429654f..3021ce9bf1b3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -27,6 +27,7 @@
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
+#include <linux/kfence.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
@@ -235,6 +236,14 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
#endif
}
+/*
+ * Tracks for which NUMA nodes we have kmem_cache_nodes allocated.
+ * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily
+ * differ during memory hotplug/hotremove operations.
+ * Protected by slab_mutex.
+ */
+static nodemask_t slab_nodes;
+
/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -1400,7 +1409,6 @@ __setup("slub_debug", setup_slub_debug);
* @object_size: the size of an object without meta data
* @flags: flags to set
* @name: name of the cache
- * @ctor: constructor function
*
* Debug option(s) are applied to @flags. In addition to the debug
* option(s), if a slab name (or multiple) is specified i.e.
@@ -1408,13 +1416,21 @@ __setup("slub_debug", setup_slub_debug);
* then only the select slabs will receive the debug option(s).
*/
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
char *iter;
size_t len;
char *next_block;
slab_flags_t block_flags;
+ slab_flags_t slub_debug_local = slub_debug;
+
+ /*
+ * If the slab cache is for debugging (e.g. kmemleak) then
+ * don't store user (stack trace) information by default,
+ * but let the user enable it via the command line below.
+ */
+ if (flags & SLAB_NOLEAKTRACE)
+ slub_debug_local &= ~SLAB_STORE_USER;
len = strlen(name);
next_block = slub_debug_string;
@@ -1449,7 +1465,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
}
}
- return flags | slub_debug;
+ return flags | slub_debug_local;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
@@ -1474,8 +1490,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
struct page *page) {}
slab_flags_t kmem_cache_flags(unsigned int object_size,
- slab_flags_t flags, const char *name,
- void (*ctor)(void *))
+ slab_flags_t flags, const char *name)
{
return flags;
}
@@ -1514,7 +1529,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
static __always_inline void kfree_hook(void *x)
{
kmemleak_free(x);
- kasan_kfree_large(x, _RET_IP_);
+ kasan_kfree_large(x);
}
static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
@@ -1544,7 +1559,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
/* KASAN might put x into memory quarantine, delaying its reuse */
- return kasan_slab_free(s, x, _RET_IP_);
+ return kasan_slab_free(s, x);
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
@@ -1556,6 +1571,11 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void *old_tail = *tail ? *tail : *head;
int rsize;
+ if (is_kfence_address(next)) {
+ slab_free_hook(s, next);
+ return true;
+ }
+
/* Head and tail of the reconstructed freelist */
*head = NULL;
*tail = NULL;
@@ -1771,7 +1791,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
page->objects = oo_objects(oo);
- account_slab_page(page, oo_order(oo), s);
+ account_slab_page(page, oo_order(oo), s, flags);
page->slab_cache = s;
__SetPageSlab(page);
@@ -1973,7 +1993,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
t = acquire_slab(s, n, page, object == NULL, &objects);
if (!t)
- continue; /* cmpxchg raced */
+ break;
available += objects;
if (!object) {
@@ -2153,9 +2173,9 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
{
enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
- int lock = 0;
+ int lock = 0, free_delta = 0;
enum slab_modes l = M_NONE, m = M_NONE;
- void *nextfree;
+ void *nextfree, *freelist_iter, *freelist_tail;
int tail = DEACTIVATE_TO_HEAD;
struct page new;
struct page old;
@@ -2166,45 +2186,34 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
}
/*
- * Stage one: Free all available per cpu objects back
- * to the page freelist while it is still frozen. Leave the
- * last one.
- *
- * There is no need to take the list->lock because the page
- * is still frozen.
+ * Stage one: Count the objects on cpu's freelist as free_delta and
+ * remember the last object in freelist_tail for later splicing.
*/
- while (freelist && (nextfree = get_freepointer(s, freelist))) {
- void *prior;
- unsigned long counters;
+ freelist_tail = NULL;
+ freelist_iter = freelist;
+ while (freelist_iter) {
+ nextfree = get_freepointer(s, freelist_iter);
/*
* If 'nextfree' is invalid, it is possible that the object at
- * 'freelist' is already corrupted. So isolate all objects
- * starting at 'freelist'.
+ * 'freelist_iter' is already corrupted. So isolate all objects
+ * starting at 'freelist_iter' by skipping them.
*/
- if (freelist_corrupted(s, page, &freelist, nextfree))
+ if (freelist_corrupted(s, page, &freelist_iter, nextfree))
break;
- do {
- prior = page->freelist;
- counters = page->counters;
- set_freepointer(s, freelist, prior);
- new.counters = counters;
- new.inuse--;
- VM_BUG_ON(!new.frozen);
-
- } while (!__cmpxchg_double_slab(s, page,
- prior, counters,
- freelist, new.counters,
- "drain percpu freelist"));
+ freelist_tail = freelist_iter;
+ free_delta++;
- freelist = nextfree;
+ freelist_iter = nextfree;
}
/*
- * Stage two: Ensure that the page is unfrozen while the
- * list presence reflects the actual number of objects
- * during unfreeze.
+ * Stage two: Unfreeze the page while splicing the per-cpu
+ * freelist to the head of page's freelist.
+ *
+ * Ensure that the page is unfrozen while the list presence
+ * reflects the actual number of objects during unfreeze.
*
* We setup the list membership and then perform a cmpxchg
* with the count. If there is a mismatch then the page
@@ -2217,15 +2226,15 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
*/
redo:
- old.freelist = page->freelist;
- old.counters = page->counters;
+ old.freelist = READ_ONCE(page->freelist);
+ old.counters = READ_ONCE(page->counters);
VM_BUG_ON(!old.frozen);
/* Determine target state of the slab */
new.counters = old.counters;
- if (freelist) {
- new.inuse--;
- set_freepointer(s, freelist, old.freelist);
+ if (freelist_tail) {
+ new.inuse -= free_delta;
+ set_freepointer(s, freelist_tail, old.freelist);
new.freelist = freelist;
} else
new.freelist = old.freelist;
@@ -2672,7 +2681,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
* ignore the node constraint
*/
if (unlikely(node != NUMA_NO_NODE &&
- !node_state(node, N_NORMAL_MEMORY)))
+ !node_isset(node, slab_nodes)))
node = NUMA_NO_NODE;
goto new_slab;
}
@@ -2683,7 +2692,7 @@ redo:
* same as above but node_match() being false already
* implies node != NUMA_NO_NODE
*/
- if (!node_state(node, N_NORMAL_MEMORY)) {
+ if (!node_isset(node, slab_nodes)) {
node = NUMA_NO_NODE;
goto redo;
} else {
@@ -2806,7 +2815,7 @@ static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
* Otherwise we can simply pick the next object from the lockless free list.
*/
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
- gfp_t gfpflags, int node, unsigned long addr)
+ gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
void *object;
struct kmem_cache_cpu *c;
@@ -2817,6 +2826,11 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
return NULL;
+
+ object = kfence_alloc(s, orig_size, gfpflags);
+ if (unlikely(object))
+ goto out;
+
redo:
/*
* Must read kmem_cache cpu data via this cpu ptr. Preemption is
@@ -2889,20 +2903,21 @@ redo:
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(kasan_reset_tag(object), 0, s->object_size);
+out:
slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
return object;
}
static __always_inline void *slab_alloc(struct kmem_cache *s,
- gfp_t gfpflags, unsigned long addr)
+ gfp_t gfpflags, unsigned long addr, size_t orig_size)
{
- return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
+ return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);
}
void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);
trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
s->size, gfpflags);
@@ -2914,7 +2929,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
- void *ret = slab_alloc(s, gfpflags, _RET_IP_);
+ void *ret = slab_alloc(s, gfpflags, _RET_IP_, size);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
@@ -2925,7 +2940,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace);
#ifdef CONFIG_NUMA
void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, s->object_size);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
s->object_size, s->size, gfpflags, node);
@@ -2939,7 +2954,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
gfp_t gfpflags,
int node, size_t size)
{
- void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_);
+ void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_, size);
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
@@ -2973,6 +2988,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
stat(s, FREE_SLOWPATH);
+ if (kfence_free(head))
+ return;
+
if (kmem_cache_debug(s) &&
!free_debug_processing(s, page, head, tail, cnt, addr))
return;
@@ -3157,7 +3175,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
if (!s)
return;
slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_);
- trace_kmem_cache_free(_RET_IP_, x);
+ trace_kmem_cache_free(_RET_IP_, x, s->name);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3217,6 +3235,13 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
df->s = cache_from_obj(s, object); /* Support for memcg */
}
+ if (is_kfence_address(object)) {
+ slab_free_hook(df->s, object);
+ __kfence_free(object);
+ p[size] = NULL; /* mark object processed */
+ return size;
+ }
+
/* Start new detached freelist */
df->page = page;
set_freepointer(df->s, object, NULL);
@@ -3266,7 +3291,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (!df.page)
continue;
- slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_);
+ slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -3292,8 +3317,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
c = this_cpu_ptr(s->cpu_slab);
for (i = 0; i < size; i++) {
- void *object = c->freelist;
+ void *object = kfence_alloc(s, s->object_size, flags);
+ if (unlikely(object)) {
+ p[i] = object;
+ continue;
+ }
+
+ object = c->freelist;
if (unlikely(!object)) {
/*
* We may have removed an object from c->freelist using
@@ -3548,8 +3579,7 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
- GFP_KERNEL);
+ n = kasan_slab_alloc(kmem_cache_node, n, GFP_KERNEL);
page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse = 1;
page->frozen = 0;
@@ -3586,7 +3616,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
- for_each_node_state(node, N_NORMAL_MEMORY) {
+ for_each_node_mask(node, slab_nodes) {
struct kmem_cache_node *n;
if (slab_state == DOWN) {
@@ -3797,7 +3827,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
{
- s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor);
+ s->flags = kmem_cache_flags(s->size, flags, s->name);
#ifdef CONFIG_SLAB_FREELIST_HARDENED
s->random = get_random_long();
#endif
@@ -4018,7 +4048,7 @@ void *__kmalloc(size_t size, gfp_t flags)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, flags, _RET_IP_);
+ ret = slab_alloc(s, flags, _RET_IP_, size);
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
@@ -4039,8 +4069,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
page = alloc_pages_node(node, flags, order);
if (page) {
ptr = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- PAGE_SIZE << order);
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
return kmalloc_large_node_hook(ptr, size, flags);
@@ -4066,7 +4096,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc_node(s, flags, node, _RET_IP_);
+ ret = slab_alloc_node(s, flags, node, _RET_IP_, size);
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
@@ -4092,6 +4122,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
struct kmem_cache *s;
unsigned int offset;
size_t object_size;
+ bool is_kfence = is_kfence_address(ptr);
ptr = kasan_reset_tag(ptr);
@@ -4104,10 +4135,13 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
to_user, 0, n);
/* Find offset within object. */
- offset = (ptr - page_address(page)) % s->size;
+ if (is_kfence)
+ offset = ptr - kfence_object_start(ptr);
+ else
+ offset = (ptr - page_address(page)) % s->size;
/* Adjust for redzone and reject if within the redzone. */
- if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
+ if (!is_kfence && kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
if (offset < s->red_left_pad)
usercopy_abort("SLUB object in left red zone",
s->name, to_user, offset, n);
@@ -4171,8 +4205,8 @@ void kfree(const void *x)
BUG_ON(!PageCompound(page));
kfree_hook(object);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
- -(PAGE_SIZE << order));
+ mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(page, order);
return;
}
@@ -4267,8 +4301,6 @@ static int slab_mem_going_offline_callback(void *arg)
static void slab_mem_offline_callback(void *arg)
{
- struct kmem_cache_node *n;
- struct kmem_cache *s;
struct memory_notify *marg = arg;
int offline_node;
@@ -4282,21 +4314,12 @@ static void slab_mem_offline_callback(void *arg)
return;
mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_caches, list) {
- n = get_node(s, offline_node);
- if (n) {
- /*
- * if n->nr_slabs > 0, slabs still exist on the node
- * that is going down. We were unable to free them,
- * and offline_pages() function shouldn't call this
- * callback. So, we must fail.
- */
- BUG_ON(slabs_node(s, offline_node));
-
- s->node[offline_node] = NULL;
- kmem_cache_free(kmem_cache_node, n);
- }
- }
+ node_clear(offline_node, slab_nodes);
+ /*
+ * We no longer free kmem_cache_node structures here, as it would be
+ * racy with all get_node() users, and infeasible to protect them with
+ * slab_mutex.
+ */
mutex_unlock(&slab_mutex);
}
@@ -4323,6 +4346,12 @@ static int slab_mem_going_online_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
/*
+ * The structure may already exist if the node was previously
+ * onlined and offlined.
+ */
+ if (get_node(s, nid))
+ continue;
+ /*
* XXX: kmem_cache_alloc_node will fallback to other nodes
* since memory is not yet available from the node that
* is brought up.
@@ -4335,6 +4364,11 @@ static int slab_mem_going_online_callback(void *arg)
init_kmem_cache_node(n);
s->node[nid] = n;
}
+ /*
+ * Any cache created after this point will also have kmem_cache_node
+ * initialized for the new node.
+ */
+ node_set(nid, slab_nodes);
out:
mutex_unlock(&slab_mutex);
return ret;
@@ -4415,6 +4449,7 @@ void __init kmem_cache_init(void)
{
static __initdata struct kmem_cache boot_kmem_cache,
boot_kmem_cache_node;
+ int node;
if (debug_guardpage_minorder())
slub_max_order = 0;
@@ -4422,6 +4457,13 @@ void __init kmem_cache_init(void)
kmem_cache_node = &boot_kmem_cache_node;
kmem_cache = &boot_kmem_cache;
+ /*
+ * Initialize the nodemask for which we will allocate per node
+ * structures. Here we don't need to take slab_mutex yet.
+ */
+ for_each_node_state(node, N_NORMAL_MEMORY)
+ node_set(node, slab_nodes);
+
create_boot_cache(kmem_cache_node, "kmem_cache_node",
sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
@@ -4516,7 +4558,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc(s, gfpflags, caller);
+ ret = slab_alloc(s, gfpflags, caller, size);
/* Honor the call site pointer we received. */
trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -4547,7 +4589,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
if (unlikely(ZERO_OR_NULL_PTR(s)))
return s;
- ret = slab_alloc_node(s, gfpflags, node, caller);
+ ret = slab_alloc_node(s, gfpflags, node, caller, size);
/* Honor the call site pointer we received. */
trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
@@ -4932,22 +4974,6 @@ enum slab_stat_type {
#define SO_OBJECTS (1 << SL_OBJECTS)
#define SO_TOTAL (1 << SL_TOTAL)
-#ifdef CONFIG_MEMCG
-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON);
-
-static int __init setup_slub_memcg_sysfs(char *str)
-{
- int v;
-
- if (get_option(&str, &v) > 0)
- memcg_sysfs_enabled = v;
-
- return 1;
-}
-
-__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
-#endif
-
static ssize_t show_slab_objects(struct kmem_cache *s,
char *buf, unsigned long flags)
{
diff --git a/mm/swap.c b/mm/swap.c
index 2cca7141470c..31b844d4ed94 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -83,9 +83,8 @@ static void __page_cache_release(struct page *page)
unsigned long flags;
lruvec = lock_page_lruvec_irqsave(page, &flags);
- VM_BUG_ON_PAGE(!PageLRU(page), page);
- __ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_off_lru(page));
+ del_page_from_lru_list(page, lruvec);
+ __clear_page_lru_flags(page);
unlock_page_lruvec_irqrestore(lruvec, flags);
}
__ClearPageWaiters(page);
@@ -229,9 +228,9 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
{
if (!PageUnevictable(page)) {
- del_page_from_lru_list(page, lruvec, page_lru(page));
+ del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
- add_page_to_lru_list_tail(page, lruvec, page_lru(page));
+ add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, thp_nr_pages(page));
}
}
@@ -308,13 +307,11 @@ void lru_note_cost_page(struct page *page)
static void __activate_page(struct page *page, struct lruvec *lruvec)
{
if (!PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
int nr_pages = thp_nr_pages(page);
- del_page_from_lru_list(page, lruvec, lru);
+ del_page_from_lru_list(page, lruvec);
SetPageActive(page);
- lru += LRU_ACTIVE;
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, lruvec);
trace_mm_lru_activate(page);
__count_vm_events(PGACTIVATE, nr_pages);
@@ -519,8 +516,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
*/
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
{
- int lru;
- bool active;
+ bool active = PageActive(page);
int nr_pages = thp_nr_pages(page);
if (PageUnevictable(page))
@@ -530,10 +526,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
if (page_mapped(page))
return;
- active = PageActive(page);
- lru = page_lru_base_type(page);
-
- del_page_from_lru_list(page, lruvec, lru + active);
+ del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
@@ -543,14 +536,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
* It can make readahead confusing. But race window
* is _really_ small and it's non-critical problem.
*/
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, lruvec);
SetPageReclaim(page);
} else {
/*
* The page's writeback ends up during pagevec
* We moves tha page into tail of inactive.
*/
- add_page_to_lru_list_tail(page, lruvec, lru);
+ add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, nr_pages);
}
@@ -564,13 +557,12 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
if (PageActive(page) && !PageUnevictable(page)) {
- int lru = page_lru_base_type(page);
int nr_pages = thp_nr_pages(page);
- del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, lruvec);
__count_vm_events(PGDEACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
@@ -582,11 +574,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
{
if (PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- bool active = PageActive(page);
int nr_pages = thp_nr_pages(page);
- del_page_from_lru_list(page, lruvec,
- LRU_INACTIVE_ANON + active);
+ del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
ClearPageReferenced(page);
/*
@@ -595,7 +585,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
* anonymous pages
*/
ClearPageSwapBacked(page);
- add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
+ add_page_to_lru_list(page, lruvec);
__count_vm_events(PGLAZYFREE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
@@ -918,9 +908,8 @@ void release_pages(struct page **pages, int nr)
if (prev_lruvec != lruvec)
lock_batch = 0;
- VM_BUG_ON_PAGE(!PageLRU(page), page);
- __ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_off_lru(page));
+ del_page_from_lru_list(page, lruvec);
+ __clear_page_lru_flags(page);
}
__ClearPageWaiters(page);
@@ -958,7 +947,6 @@ EXPORT_SYMBOL(__pagevec_release);
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
{
- enum lru_list lru;
int was_unevictable = TestClearPageUnevictable(page);
int nr_pages = thp_nr_pages(page);
@@ -994,19 +982,17 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
smp_mb__after_atomic();
if (page_evictable(page)) {
- lru = page_lru(page);
if (was_unevictable)
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
- lru = LRU_UNEVICTABLE;
ClearPageActive(page);
SetPageUnevictable(page);
if (!was_unevictable)
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
- add_page_to_lru_list(page, lruvec, lru);
- trace_mm_lru_insertion(page, lru);
+ add_page_to_lru_list(page, lruvec);
+ trace_mm_lru_insertion(page);
}
/*
@@ -1032,45 +1018,11 @@ void __pagevec_lru_add(struct pagevec *pvec)
}
/**
- * pagevec_lookup_entries - gang pagecache lookup
- * @pvec: Where the resulting entries are placed
- * @mapping: The address_space to search
- * @start: The starting entry index
- * @nr_entries: The maximum number of pages
- * @indices: The cache indices corresponding to the entries in @pvec
- *
- * pagevec_lookup_entries() will search for and return a group of up
- * to @nr_pages pages and shadow entries in the mapping. All
- * entries are placed in @pvec. pagevec_lookup_entries() takes a
- * reference against actual pages in @pvec.
- *
- * The search returns a group of mapping-contiguous entries with
- * ascending indexes. There may be holes in the indices due to
- * not-present entries.
- *
- * Only one subpage of a Transparent Huge Page is returned in one call:
- * allowing truncate_inode_pages_range() to evict the whole THP without
- * cycling through a pagevec of extra references.
- *
- * pagevec_lookup_entries() returns the number of entries which were
- * found.
- */
-unsigned pagevec_lookup_entries(struct pagevec *pvec,
- struct address_space *mapping,
- pgoff_t start, unsigned nr_entries,
- pgoff_t *indices)
-{
- pvec->nr = find_get_entries(mapping, start, nr_entries,
- pvec->pages, indices);
- return pagevec_count(pvec);
-}
-
-/**
* pagevec_remove_exceptionals - pagevec exceptionals pruning
* @pvec: The pagevec to prune
*
- * pagevec_lookup_entries() fills both pages and exceptional radix
- * tree entries into the pagevec. This function prunes all
+ * find_get_entries() fills both pages and XArray value entries (aka
+ * exceptional entries) into the pagevec. This function prunes all
* exceptionals from @pvec without leaving holes, so that it can be
* passed on to page-only pagevec operations.
*/
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0357fbe70645..be9de6d5b516 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -193,8 +193,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type,
cache->slots_ret = NULL;
}
spin_unlock_irq(&cache->free_lock);
- if (slots)
- kvfree(slots);
+ kvfree(slots);
}
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 751c1ef2fe0e..3cdee7b11da9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -68,32 +68,6 @@ static struct {
unsigned long find_total;
} swap_cache_info;
-unsigned long total_swapcache_pages(void)
-{
- unsigned int i, j, nr;
- unsigned long ret = 0;
- struct address_space *spaces;
- struct swap_info_struct *si;
-
- for (i = 0; i < MAX_SWAPFILES; i++) {
- swp_entry_t entry = swp_entry(i, 1);
-
- /* Avoid get_swap_device() to warn for bad swap entry */
- if (!swp_swap_info(entry))
- continue;
- /* Prevent swapoff to free swapper_spaces */
- si = get_swap_device(entry);
- if (!si)
- continue;
- nr = nr_swapper_spaces[i];
- spaces = swapper_spaces[i];
- for (j = 0; j < nr; j++)
- ret += spaces[j].nrpages;
- put_swap_device(si);
- }
- return ret;
-}
-
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
void show_swap_cache_info(void)
@@ -113,11 +87,9 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
pgoff_t idx = swp_offset(entry);
struct page *page;
- page = find_get_entry(address_space, idx);
+ page = xa_load(&address_space->i_pages, idx);
if (xa_is_value(page))
return page;
- if (page)
- put_page(page);
return NULL;
}
@@ -163,6 +135,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
address_space->nrexceptional -= nr_shadows;
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
+ __mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
ADD_CACHE_INFO(add_total, nr);
unlock:
xas_unlock_irq(&xas);
@@ -203,6 +176,7 @@ void __delete_from_swap_cache(struct page *page,
address_space->nrexceptional += nr;
address_space->nrpages -= nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
ADD_CACHE_INFO(del_total, nr);
}
@@ -429,7 +403,8 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
{
swp_entry_t swp;
struct swap_info_struct *si;
- struct page *page = find_get_entry(mapping, index);
+ struct page *page = pagecache_get_page(mapping, index,
+ FGP_ENTRY | FGP_HEAD, 0);
if (!page)
return page;
@@ -537,7 +512,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
workingset_refault(page, shadow);
/* Caller will initiate read into locked page */
- SetPageWorkingset(page);
lru_cache_add(page);
*new_page_allocated = true;
return page;
@@ -927,7 +901,7 @@ static struct attribute *swap_attrs[] = {
NULL,
};
-static struct attribute_group swap_attr_group = {
+static const struct attribute_group swap_attr_group = {
.attrs = swap_attrs,
};
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 96799a2f6957..084a5b9a18e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -219,6 +219,19 @@ offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
BUG();
}
+sector_t swap_page_sector(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_extent *se;
+ sector_t sector;
+ pgoff_t offset;
+ /*
+ * Look up the swap extent covering this page's offset, add the offset's
+ * distance from the extent's first page to the extent's first block, then
+ * shift the result left by (PAGE_SHIFT - 9) to rescale it.
+ * NOTE(review): presumably start_block counts page-sized blocks and the
+ * return value is in 512-byte sectors - confirm against struct swap_extent.
+ */
+ offset = __page_file_index(page);
+ se = offset_to_swap_extent(sis, offset);
+ sector = se->start_block + (offset - se->start_page);
+ return sector << (PAGE_SHIFT - 9);
+}
+
/*
* swap allocation tell device that a cluster of swap can now be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -1157,13 +1170,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
return p;
bad_offset:
- pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
goto out;
bad_device:
- pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val);
goto out;
bad_nofile:
- pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
return NULL;
}
@@ -1180,7 +1193,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
return p;
bad_free:
- pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
+ pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
out:
return NULL;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 8aa4907e06e0..455944264663 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -57,11 +57,10 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
* exceptional entries similar to what pagevec_remove_exceptionals does.
*/
static void truncate_exceptional_pvec_entries(struct address_space *mapping,
- struct pagevec *pvec, pgoff_t *indices,
- pgoff_t end)
+ struct pagevec *pvec, pgoff_t *indices)
{
int i, j;
- bool dax, lock;
+ bool dax;
/* Handled by shmem itself */
if (shmem_mapping(mapping))
@@ -75,8 +74,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
return;
dax = dax_mapping(mapping);
- lock = !dax && indices[j] < end;
- if (lock)
+ if (!dax)
xa_lock_irq(&mapping->i_pages);
for (i = j; i < pagevec_count(pvec); i++) {
@@ -88,9 +86,6 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
continue;
}
- if (index >= end)
- continue;
-
if (unlikely(dax)) {
dax_delete_mapping_entry(mapping, index);
continue;
@@ -99,7 +94,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
__clear_shadow_entry(mapping, index, page);
}
- if (lock)
+ if (!dax)
xa_unlock_irq(&mapping->i_pages);
pvec->nr = j;
}
@@ -326,51 +321,19 @@ void truncate_inode_pages_range(struct address_space *mapping,
pagevec_init(&pvec);
index = start;
- while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- indices)) {
- /*
- * Pagevec array has exceptional entries and we may also fail
- * to lock some pages. So we store pages that can be deleted
- * in a new pagevec.
- */
- struct pagevec locked_pvec;
-
- pagevec_init(&locked_pvec);
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
-
- /* We rely upon deletion not changing page->index */
- index = indices[i];
- if (index >= end)
- break;
-
- if (xa_is_value(page))
- continue;
-
- if (!trylock_page(page))
- continue;
- WARN_ON(page_to_index(page) != index);
- if (PageWriteback(page)) {
- unlock_page(page);
- continue;
- }
- if (page->mapping != mapping) {
- unlock_page(page);
- continue;
- }
- pagevec_add(&locked_pvec, page);
- }
- for (i = 0; i < pagevec_count(&locked_pvec); i++)
- truncate_cleanup_page(mapping, locked_pvec.pages[i]);
- delete_from_page_cache_batch(mapping, &locked_pvec);
- for (i = 0; i < pagevec_count(&locked_pvec); i++)
- unlock_page(locked_pvec.pages[i]);
- truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
+ while (index < end && find_lock_entries(mapping, index, end - 1,
+ &pvec, indices)) {
+ index = indices[pagevec_count(&pvec) - 1] + 1;
+ truncate_exceptional_pvec_entries(mapping, &pvec, indices);
+ for (i = 0; i < pagevec_count(&pvec); i++)
+ truncate_cleanup_page(mapping, pvec.pages[i]);
+ delete_from_page_cache_batch(mapping, &pvec);
+ for (i = 0; i < pagevec_count(&pvec); i++)
+ unlock_page(pvec.pages[i]);
pagevec_release(&pvec);
cond_resched();
- index++;
}
+
if (partial_start) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
@@ -413,8 +376,8 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = start;
for ( ; ; ) {
cond_resched();
- if (!pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+ if (!find_get_entries(mapping, index, end - 1, &pvec,
+ indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
break;
@@ -422,23 +385,12 @@ void truncate_inode_pages_range(struct address_space *mapping,
index = start;
continue;
}
- if (index == start && indices[0] >= end) {
- /* All gone out of hole to be punched, we're done */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- break;
- }
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
- if (index >= end) {
- /* Restart punch to make sure all gone */
- index = start - 1;
- break;
- }
if (xa_is_value(page))
continue;
@@ -449,7 +401,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
truncate_inode_page(mapping, page);
unlock_page(page);
}
- truncate_exceptional_pvec_entries(mapping, &pvec, indices, end);
+ truncate_exceptional_pvec_entries(mapping, &pvec, indices);
pagevec_release(&pvec);
index++;
}
@@ -539,55 +491,19 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
int i;
pagevec_init(&pvec);
- while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
- indices)) {
+ while (find_lock_entries(mapping, index, end, &pvec, indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
- if (index > end)
- break;
if (xa_is_value(page)) {
invalidate_exceptional_entry(mapping, index,
page);
continue;
}
-
- if (!trylock_page(page))
- continue;
-
- WARN_ON(page_to_index(page) != index);
-
- /* Middle of THP: skip */
- if (PageTransTail(page)) {
- unlock_page(page);
- continue;
- } else if (PageTransHuge(page)) {
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- /*
- * 'end' is in the middle of THP. Don't
- * invalidate the page as the part outside of
- * 'end' could be still useful.
- */
- if (index > end) {
- unlock_page(page);
- continue;
- }
-
- /* Take a pin outside pagevec */
- get_page(page);
-
- /*
- * Drop extra pins before trying to invalidate
- * the huge page.
- */
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- }
+ index += thp_nr_pages(page) - 1;
ret = invalidate_inode_page(page);
unlock_page(page);
@@ -601,9 +517,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
if (nr_pagevec)
(*nr_pagevec)++;
}
-
- if (PageTransHuge(page))
- put_page(page);
count += ret;
}
pagevec_remove_exceptionals(&pvec);
@@ -725,16 +638,12 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pagevec_init(&pvec);
index = start;
- while (index <= end && pagevec_lookup_entries(&pvec, mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
- indices)) {
+ while (find_get_entries(mapping, index, end, &pvec, indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
- if (index > end)
- break;
if (xa_is_value(page)) {
if (!invalidate_exceptional_entry2(mapping,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b1b574ad199d..562e87cbd7a1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -310,7 +310,8 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
* @lru: lru to use
* @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
*/
-unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
+static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
+ int zone_idx)
{
unsigned long size = 0;
int zid;
@@ -1539,19 +1540,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
* page: page to consider
* mode: one of the LRU isolation modes defined above
*
- * returns 0 on success, -ve errno on failure.
+ * returns true on success, false on failure.
*/
-int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
+bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
{
- int ret = -EBUSY;
-
/* Only take pages on the LRU. */
if (!PageLRU(page))
- return ret;
+ return false;
/* Compaction should not handle unevictable pages but CMA can do so */
if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
- return ret;
+ return false;
/*
* To minimise LRU disruption, the caller can indicate that it only
@@ -1564,7 +1563,7 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
if (mode & ISOLATE_ASYNC_MIGRATE) {
/* All the caller can do on PageWriteback is block */
if (PageWriteback(page))
- return ret;
+ return false;
if (PageDirty(page)) {
struct address_space *mapping;
@@ -1580,20 +1579,20 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
* from the page cache.
*/
if (!trylock_page(page))
- return ret;
+ return false;
mapping = page_mapping(page);
migrate_dirty = !mapping || mapping->a_ops->migratepage;
unlock_page(page);
if (!migrate_dirty)
- return ret;
+ return false;
}
}
if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
- return ret;
+ return false;
- return 0;
+ return true;
}
/*
@@ -1677,35 +1676,31 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* only when the page is being freed somewhere else.
*/
scan += nr_pages;
- switch (__isolate_lru_page_prepare(page, mode)) {
- case 0:
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- if (unlikely(!get_page_unless_zero(page)))
- goto busy;
-
- if (!TestClearPageLRU(page)) {
- /*
- * This page may in other isolation path,
- * but we still hold lru_lock.
- */
- put_page(page);
- goto busy;
- }
-
- nr_taken += nr_pages;
- nr_zone_taken[page_zonenum(page)] += nr_pages;
- list_move(&page->lru, dst);
- break;
+ if (!__isolate_lru_page_prepare(page, mode)) {
+ /* It is being freed elsewhere */
+ list_move(&page->lru, src);
+ continue;
+ }
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ if (unlikely(!get_page_unless_zero(page))) {
+ list_move(&page->lru, src);
+ continue;
+ }
- default:
-busy:
- /* else it is being freed elsewhere */
+ if (!TestClearPageLRU(page)) {
+ /* Another thread is already isolating this page */
+ put_page(page);
list_move(&page->lru, src);
+ continue;
}
+
+ nr_taken += nr_pages;
+ nr_zone_taken[page_zonenum(page)] += nr_pages;
+ list_move(&page->lru, dst);
}
/*
@@ -1772,7 +1767,7 @@ int isolate_lru_page(struct page *page)
get_page(page);
lruvec = lock_page_lruvec_irq(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
+ del_page_from_lru_list(page, lruvec);
unlock_page_lruvec_irq(lruvec);
ret = 0;
}
@@ -1829,7 +1824,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
struct page *page;
- enum lru_list lru;
while (!list_empty(list)) {
page = lru_to_page(list);
@@ -1856,8 +1850,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
SetPageLRU(page);
if (unlikely(put_page_testzero(page))) {
- __ClearPageLRU(page);
- __ClearPageActive(page);
+ __clear_page_lru_flags(page);
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&lruvec->lru_lock);
@@ -1874,11 +1867,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
* inhibits memcg migration).
*/
VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
- lru = page_lru(page);
+ add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
-
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_add(&page->lru, &lruvec->lists[lru]);
nr_moved += nr_pages;
if (PageActive(page))
workingset_age_nonresident(lruvec, nr_pages);
@@ -4095,8 +4085,13 @@ module_init(kswapd_init)
*/
int node_reclaim_mode __read_mostly;
-#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */
+/*
+ * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
+ * ABI. New bits are OK, but existing bits can never change.
+ */
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
* Priority for NODE_RECLAIM. This determines the fraction of pages
@@ -4292,12 +4287,9 @@ void check_move_unevictable_pages(struct pagevec *pvec)
lruvec = relock_page_lruvec_irq(page, lruvec);
if (page_evictable(page) && PageUnevictable(page)) {
- enum lru_list lru = page_lru_base_type(page);
-
- VM_BUG_ON_PAGE(PageActive(page), page);
+ del_page_from_lru_list(page, lruvec);
ClearPageUnevictable(page);
- del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
- add_page_to_lru_list(page, lruvec, lru);
+ add_page_to_lru_list(page, lruvec);
pgrescued += nr_pages;
}
SetPageLRU(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..74b2c374b86c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -342,6 +342,12 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long t;
if (vmstat_item_in_bytes(item)) {
+ /*
+ * Only cgroups use subpage accounting right now; at
+ * the global level, these items still change in
+ * multiples of whole pages. Store them as pages
+ * internally to keep the per-cpu counters compact.
+ */
VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
delta >>= PAGE_SHIFT;
}
@@ -551,6 +557,12 @@ static inline void mod_node_state(struct pglist_data *pgdat,
long o, n, t, z;
if (vmstat_item_in_bytes(item)) {
+ /*
+ * Only cgroups use subpage accounting right now; at
+ * the global level, these items still change in
+ * multiples of whole pages. Store them as pages
+ * internally to keep the per-cpu counters compact.
+ */
VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
delta >>= PAGE_SHIFT;
}
@@ -1215,6 +1227,9 @@ const char * const vmstat_text[] = {
"nr_shadow_call_stack",
#endif
"nr_page_table_pages",
+#ifdef CONFIG_SWAP
+ "nr_swapcached",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1619,8 +1634,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
if (is_zone_first_populated(pgdat, zone)) {
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+ unsigned long pages = node_page_state_pages(pgdat, i);
+
+ if (vmstat_item_print_in_thp(i))
+ pages /= HPAGE_PMD_NR;
seq_printf(m, "\n %-12s %lu", node_stat_name(i),
- node_page_state_pages(pgdat, i));
+ pages);
}
}
seq_printf(m,
@@ -1630,14 +1649,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n high %lu"
"\n spanned %lu"
"\n present %lu"
- "\n managed %lu",
+ "\n managed %lu"
+ "\n cma %lu",
zone_page_state(zone, NR_FREE_PAGES),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
zone->spanned_pages,
zone->present_pages,
- zone_managed_pages(zone));
+ zone_managed_pages(zone),
+ zone_cma_pages(zone));
seq_printf(m,
"\n protection: (%ld",
@@ -1740,8 +1761,11 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
v += NR_VM_NUMA_STAT_ITEMS;
#endif
- for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
v[i] = global_node_page_state_pages(i);
+ if (vmstat_item_print_in_thp(i))
+ v[i] /= HPAGE_PMD_NR;
+ }
v += NR_VM_NODE_STAT_ITEMS;
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
@@ -1882,16 +1906,12 @@ static void vmstat_update(struct work_struct *w)
*/
static bool need_update(int cpu)
{
+ pg_data_t *last_pgdat = NULL;
struct zone *zone;
for_each_populated_zone(zone) {
struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
-
- BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
-#ifdef CONFIG_NUMA
- BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 2);
-#endif
-
+ struct per_cpu_nodestat *n;
/*
* The fast way of checking if there are any vmstat diffs.
*/
@@ -1903,6 +1923,13 @@ static bool need_update(int cpu)
sizeof(p->vm_numa_stat_diff[0])))
return true;
#endif
+ if (last_pgdat == zone->zone_pgdat)
+ continue;
+ last_pgdat = zone->zone_pgdat;
+ n = per_cpu_ptr(zone->zone_pgdat->per_cpu_nodestats, cpu);
+ if (memchr_inv(n->vm_node_stat_diff, 0, NR_VM_NODE_STAT_ITEMS *
+ sizeof(n->vm_node_stat_diff[0])))
+ return true;
}
return false;
}
@@ -1953,6 +1980,8 @@ static void vmstat_shepherd(struct work_struct *w)
if (!delayed_work_pending(dw) && need_update(cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
+
+ cond_resched();
}
put_online_cpus();
diff --git a/mm/workingset.c b/mm/workingset.c
index 10e96de945b3..cd39902c1062 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -263,10 +263,10 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(!PageLocked(page), page);
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
eviction = atomic_long_read(&lruvec->nonresident_age);
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
@@ -461,6 +461,8 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
+ if (!nodes)
+ return SHRINK_EMPTY;
/*
* Approximate a reasonable limit for the nodes
@@ -503,9 +505,6 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
- if (!nodes)
- return SHRINK_EMPTY;
-
if (nodes <= max_nodes)
return 0;
return nodes - max_nodes;
diff --git a/mm/z3fold.c b/mm/z3fold.c
index dacb0d70fa61..9d889ad2bb86 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -413,16 +413,10 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
if (!slots)
return NULL;
+ memset(zhdr, 0, sizeof(*zhdr));
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
- zhdr->first_chunks = 0;
- zhdr->middle_chunks = 0;
- zhdr->last_chunks = 0;
- zhdr->first_num = 0;
- zhdr->start_middle = 0;
zhdr->cpu = -1;
- zhdr->foreign_handles = 0;
- zhdr->mapped_count = 0;
zhdr->slots = slots;
zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
@@ -541,8 +535,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
spin_unlock(&pool->stale_lock);
}
-static void __attribute__((__unused__))
- release_z3fold_page(struct kref *ref)
+static void release_z3fold_page(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
@@ -1353,8 +1346,22 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
page = list_entry(pos, struct page, lru);
zhdr = page_address(page);
- if (test_bit(PAGE_HEADLESS, &page->private))
+ if (test_bit(PAGE_HEADLESS, &page->private)) {
+ /*
+ * For non-headless pages, we wait to do this
+ * until we have the page lock to avoid racing
+ * with __z3fold_alloc(). Headless pages don't
+ * have a lock (and __z3fold_alloc() will never
+ * see them), but we still need to test and set
+ * PAGE_CLAIMED to avoid racing with
+ * z3fold_free(), so just do it now before
+ * leaving the loop.
+ */
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ continue;
+
break;
+ }
if (kref_get_unless_zero(&zhdr->refcount) == 0) {
zhdr = NULL;
@@ -1778,6 +1785,7 @@ static u64 z3fold_zpool_total_size(void *pool)
static struct zpool_driver z3fold_zpool_driver = {
.type = "z3fold",
+ .sleep_mapped = true,
.owner = THIS_MODULE,
.create = z3fold_zpool_create,
.destroy = z3fold_zpool_destroy,
diff --git a/mm/zbud.c b/mm/zbud.c
index c49966ece674..7ec5f27a68b0 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -203,6 +203,7 @@ static u64 zbud_zpool_total_size(void *pool)
static struct zpool_driver zbud_zpool_driver = {
.type = "zbud",
+ .sleep_mapped = true,
.owner = THIS_MODULE,
.create = zbud_zpool_create,
.destroy = zbud_zpool_destroy,
diff --git a/mm/zpool.c b/mm/zpool.c
index 3744a2d1a624..5ed71207ced7 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -23,6 +23,7 @@ struct zpool {
void *pool;
const struct zpool_ops *ops;
bool evictable;
+ bool can_sleep_mapped;
struct list_head list;
};
@@ -183,6 +184,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
zpool->pool = driver->create(name, gfp, ops, zpool);
zpool->ops = ops;
zpool->evictable = driver->shrink && ops && ops->evict;
+ zpool->can_sleep_mapped = driver->sleep_mapped;
if (!zpool->pool) {
pr_err("couldn't create %s pool\n", type);
@@ -393,6 +395,17 @@ bool zpool_evictable(struct zpool *zpool)
return zpool->evictable;
}
+/**
+ * zpool_can_sleep_mapped - Test if the caller may sleep while a handle is mapped.
+ * @zpool: The zpool to test
+ *
+ * Reports the can_sleep_mapped flag stored in @zpool.
+ *
+ * Returns: true if zpool can sleep while mapped; false otherwise.
+ */
+bool zpool_can_sleep_mapped(struct zpool *zpool)
+{
+ return zpool->can_sleep_mapped;
+}
+
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>");
MODULE_DESCRIPTION("Common API for compressed memory storage");
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 7289f502ffac..30c358b72025 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -357,7 +357,7 @@ static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags)
{
- return kmem_cache_alloc(pool->zspage_cachep,
+ return kmem_cache_zalloc(pool->zspage_cachep,
flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
}
@@ -816,7 +816,7 @@ static int get_pages_per_zspage(int class_size)
static struct zspage *get_zspage(struct page *page)
{
- struct zspage *zspage = (struct zspage *)page->private;
+ struct zspage *zspage = (struct zspage *)page_private(page);
BUG_ON(zspage->magic != ZSPAGE_MAGIC);
return zspage;
@@ -1064,7 +1064,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
if (!zspage)
return NULL;
- memset(zspage, 0, sizeof(struct zspage));
zspage->magic = ZSPAGE_MAGIC;
migrate_lock_init(zspage);
@@ -2213,11 +2212,13 @@ static unsigned long zs_can_compact(struct size_class *class)
return obj_wasted * class->pages_per_zspage;
}
-static void __zs_compact(struct zs_pool *pool, struct size_class *class)
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
{
struct zs_compact_control cc;
struct zspage *src_zspage;
struct zspage *dst_zspage = NULL;
+ unsigned long pages_freed = 0;
spin_lock(&class->lock);
while ((src_zspage = isolate_zspage(class, true))) {
@@ -2247,7 +2248,7 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
putback_zspage(class, dst_zspage);
if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
free_zspage(pool, class, src_zspage);
- pool->stats.pages_compacted += class->pages_per_zspage;
+ pages_freed += class->pages_per_zspage;
}
spin_unlock(&class->lock);
cond_resched();
@@ -2258,12 +2259,15 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
putback_zspage(class, src_zspage);
spin_unlock(&class->lock);
+
+ return pages_freed;
}
unsigned long zs_compact(struct zs_pool *pool)
{
int i;
struct size_class *class;
+ unsigned long pages_freed = 0;
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
@@ -2271,10 +2275,11 @@ unsigned long zs_compact(struct zs_pool *pool)
continue;
if (class->index != i)
continue;
- __zs_compact(pool, class);
+ pages_freed += __zs_compact(pool, class);
}
+ atomic_long_add(pages_freed, &pool->stats.pages_compacted);
- return pool->stats.pages_compacted;
+ return pages_freed;
}
EXPORT_SYMBOL_GPL(zs_compact);
@@ -2291,13 +2296,12 @@ static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
shrinker);
- pages_freed = pool->stats.pages_compacted;
/*
* Compact classes and calculate compaction delta.
* Can run concurrently with a manually triggered
* (by user) compaction.
*/
- pages_freed = zs_compact(pool) - pages_freed;
+ pages_freed = zs_compact(pool);
return pages_freed ? pages_freed : SHRINK_STOP;
}
diff --git a/mm/zswap.c b/mm/zswap.c
index 182f6ad5aa69..578d9f256920 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -935,13 +935,19 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- u8 *src;
+ u8 *src, *tmp = NULL;
unsigned int dlen;
int ret;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
+ if (!zpool_can_sleep_mapped(pool)) {
+ tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+ if (!tmp)
+ return -ENOMEM;
+ }
+
/* extract swpentry from data */
zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
swpentry = zhdr->swpentry; /* here */
@@ -955,6 +961,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
/* entry was invalidated */
spin_unlock(&tree->lock);
zpool_unmap_handle(pool, handle);
+ kfree(tmp);
return 0;
}
spin_unlock(&tree->lock);
@@ -979,6 +986,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
dlen = PAGE_SIZE;
src = (u8 *)zhdr + sizeof(struct zswap_header);
+ if (!zpool_can_sleep_mapped(pool)) {
+
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+
+ zpool_unmap_handle(pool, handle);
+ }
+
mutex_lock(acomp_ctx->mutex);
sg_init_one(&input, src, entry->length);
sg_init_table(&output, 1);
@@ -1022,10 +1037,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
/*
* if we get here due to ZSWAP_SWAPCACHE_EXIST
- * a load may happening concurrently
- * it is safe and okay to not free the entry
+ * a load may be happening concurrently.
+ * it is safe and okay to not free the entry.
* if we free the entry in the following put
- * it it either okay to return !0
+ * it is also okay to return !0
*/
fail:
spin_lock(&tree->lock);
@@ -1033,7 +1048,11 @@ fail:
spin_unlock(&tree->lock);
end:
- zpool_unmap_handle(pool, handle);
+ if (zpool_can_sleep_mapped(pool))
+ zpool_unmap_handle(pool, handle);
+ else
+ kfree(tmp);
+
return ret;
}
@@ -1235,7 +1254,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
struct zswap_entry *entry;
struct scatterlist input, output;
struct crypto_acomp_ctx *acomp_ctx;
- u8 *src, *dst;
+ u8 *src, *dst, *tmp;
unsigned int dlen;
int ret;
@@ -1253,15 +1272,33 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
dst = kmap_atomic(page);
zswap_fill_page(dst, entry->value);
kunmap_atomic(dst);
+ ret = 0;
goto freeentry;
}
+ if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+
+ tmp = kmalloc(entry->length, GFP_ATOMIC);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto freeentry;
+ }
+ }
+
/* decompress */
dlen = PAGE_SIZE;
src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
if (zpool_evictable(entry->pool->zpool))
src += sizeof(struct zswap_header);
+ if (!zpool_can_sleep_mapped(entry->pool->zpool)) {
+
+ memcpy(tmp, src, entry->length);
+ src = tmp;
+
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ }
+
acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
mutex_lock(acomp_ctx->mutex);
sg_init_one(&input, src, entry->length);
@@ -1271,7 +1308,11 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
mutex_unlock(acomp_ctx->mutex);
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ if (zpool_can_sleep_mapped(entry->pool->zpool))
+ zpool_unmap_handle(entry->pool->zpool, entry->handle);
+ else
+ kfree(tmp);
+
BUG_ON(ret);
freeentry:
@@ -1279,7 +1320,7 @@ freeentry:
zswap_entry_put(tree, entry);
spin_unlock(&tree->lock);
- return 0;
+ return ret;
}
/* frees an entry in zswap */