diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 42 | ||||
-rw-r--r-- | mm/Kconfig.debug | 28 | ||||
-rw-r--r-- | mm/Makefile | 4 | ||||
-rw-r--r-- | mm/backing-dev.c | 8 | ||||
-rw-r--r-- | mm/cma.c | 6 | ||||
-rw-r--r-- | mm/compaction.c | 123 | ||||
-rw-r--r-- | mm/debug.c | 4 | ||||
-rw-r--r-- | mm/filemap.c | 631 | ||||
-rw-r--r-- | mm/gup.c | 384 | ||||
-rw-r--r-- | mm/gup_test.c (renamed from mm/gup_benchmark.c) | 111 | ||||
-rw-r--r-- | mm/gup_test.h | 32 | ||||
-rw-r--r-- | mm/highmem.c | 324 | ||||
-rw-r--r-- | mm/huge_memory.c | 113 | ||||
-rw-r--r-- | mm/hugetlb.c | 28 | ||||
-rw-r--r-- | mm/init-mm.c | 1 | ||||
-rw-r--r-- | mm/internal.h | 5 | ||||
-rw-r--r-- | mm/kasan/Makefile | 25 | ||||
-rw-r--r-- | mm/kasan/common.c | 822 | ||||
-rw-r--r-- | mm/kasan/generic.c | 75 | ||||
-rw-r--r-- | mm/kasan/generic_report.c | 165 | ||||
-rw-r--r-- | mm/kasan/hw_tags.c | 204 | ||||
-rw-r--r-- | mm/kasan/init.c | 17 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 173 | ||||
-rw-r--r-- | mm/kasan/quarantine.c | 31 | ||||
-rw-r--r-- | mm/kasan/report.c | 317 | ||||
-rw-r--r-- | mm/kasan/report_generic.c | 327 | ||||
-rw-r--r-- | mm/kasan/report_hw_tags.c | 42 | ||||
-rw-r--r-- | mm/kasan/report_sw_tags.c (renamed from mm/kasan/tags_report.c) | 29 | ||||
-rw-r--r-- | mm/kasan/shadow.c | 504 | ||||
-rw-r--r-- | mm/kasan/sw_tags.c (renamed from mm/kasan/tags.c) | 39 | ||||
-rw-r--r-- | mm/khugepaged.c | 60 | ||||
-rw-r--r-- | mm/ksm.c | 50 | ||||
-rw-r--r-- | mm/madvise.c | 14 | ||||
-rw-r--r-- | mm/mapping_dirty_helpers.c | 6 | ||||
-rw-r--r-- | mm/memblock.c | 85 | ||||
-rw-r--r-- | mm/memcontrol.c | 415 | ||||
-rw-r--r-- | mm/memory-failure.c | 226 | ||||
-rw-r--r-- | mm/memory.c | 60 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 34 | ||||
-rw-r--r-- | mm/mempolicy.c | 8 | ||||
-rw-r--r-- | mm/mempool.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 185 | ||||
-rw-r--r-- | mm/mlock.c | 63 | ||||
-rw-r--r-- | mm/mm_init.c | 1 | ||||
-rw-r--r-- | mm/mmap.c | 24 | ||||
-rw-r--r-- | mm/mmap_lock.c | 230 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 7 | ||||
-rw-r--r-- | mm/mmzone.c | 15 | ||||
-rw-r--r-- | mm/mprotect.c | 7 | ||||
-rw-r--r-- | mm/mremap.c | 280 | ||||
-rw-r--r-- | mm/nommu.c | 8 | ||||
-rw-r--r-- | mm/oom_kill.c | 14 | ||||
-rw-r--r-- | mm/page_alloc.c | 487 | ||||
-rw-r--r-- | mm/page_counter.c | 4 | ||||
-rw-r--r-- | mm/page_ext.c | 12 | ||||
-rw-r--r-- | mm/page_idle.c | 4 | ||||
-rw-r--r-- | mm/page_io.c | 6 | ||||
-rw-r--r-- | mm/page_isolation.c | 12 | ||||
-rw-r--r-- | mm/page_owner.c | 17 | ||||
-rw-r--r-- | mm/page_poison.c | 58 | ||||
-rw-r--r-- | mm/page_vma_mapped.c | 9 | ||||
-rw-r--r-- | mm/process_vm_access.c | 2 | ||||
-rw-r--r-- | mm/ptdump.c | 13 | ||||
-rw-r--r-- | mm/rmap.c | 21 | ||||
-rw-r--r-- | mm/shmem.c | 39 | ||||
-rw-r--r-- | mm/slab.c | 10 | ||||
-rw-r--r-- | mm/slab.h | 47 | ||||
-rw-r--r-- | mm/slab_common.c | 15 | ||||
-rw-r--r-- | mm/slob.c | 6 | ||||
-rw-r--r-- | mm/slub.c | 189 | ||||
-rw-r--r-- | mm/swap.c | 220 | ||||
-rw-r--r-- | mm/swap_state.c | 7 | ||||
-rw-r--r-- | mm/swapfile.c | 25 | ||||
-rw-r--r-- | mm/truncate.c | 12 | ||||
-rw-r--r-- | mm/util.c | 12 | ||||
-rw-r--r-- | mm/vmalloc.c | 105 | ||||
-rw-r--r-- | mm/vmscan.c | 228 | ||||
-rw-r--r-- | mm/vmstat.c | 6 | ||||
-rw-r--r-- | mm/workingset.c | 12 | ||||
-rw-r--r-- | mm/z3fold.c | 191 | ||||
-rw-r--r-- | mm/zsmalloc.c | 11 | ||||
-rw-r--r-- | mm/zswap.c | 189 |
82 files changed, 4872 insertions, 3507 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 390165ffbb0f..f730605b8dcf 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -713,26 +713,24 @@ config ZSMALLOC_STAT select DEBUG_FS help This option enables code in the zsmalloc to collect various - statistics about whats happening in zsmalloc and exports that + statistics about what's happening in zsmalloc and exports that information to userspace via debugfs. If unsure, say N. config GENERIC_EARLY_IOREMAP bool -config MAX_STACK_SIZE_MB - int "Maximum user stack size for 32-bit processes (MB)" - default 80 +config STACK_MAX_DEFAULT_SIZE_MB + int "Default maximum user stack size for 32-bit processes (MB)" + default 100 range 8 2048 depends on STACK_GROWSUP && (!64BIT || COMPAT) help This is the maximum stack size in Megabytes in the VM layout of 32-bit user processes when the stack grows upwards (currently only on parisc - arch). The stack will be located at the highest memory address minus - the given value, unless the RLIMIT_STACK hard limit is changed to a - smaller value in which case that is used. + arch) when the RLIMIT_STACK hard limit is unlimited. - A sane initial value is 80 MB. + A sane initial value is 100 MB. config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" @@ -821,13 +819,28 @@ config PERCPU_STATS information includes global and per chunk statistics, which can be used to help understand percpu memory usage. -config GUP_BENCHMARK - bool "Enable infrastructure for get_user_pages() and related calls benchmarking" +config GUP_TEST + bool "Enable infrastructure for get_user_pages()-related unit tests" + depends on DEBUG_FS help - Provides /sys/kernel/debug/gup_benchmark that helps with testing - performance of get_user_pages() and related calls. + Provides /sys/kernel/debug/gup_test, which in turn provides a way + to make ioctl calls that can launch kernel-based unit tests for + the get_user_pages*() and pin_user_pages*() family of API calls. 
- See tools/testing/selftests/vm/gup_benchmark.c + These tests include benchmark testing of the _fast variants of + get_user_pages*() and pin_user_pages*(), as well as smoke tests of + the non-_fast variants. + + There is also a sub-test that allows running dump_page() on any + of up to eight pages (selected by command line args) within the + range of user-space addresses. These pages are either pinned via + pin_user_pages*(), or pinned via get_user_pages*(), as specified + by other command line arguments. + + See tools/testing/selftests/vm/gup_test.c + +comment "GUP_TEST needs to have DEBUG_FS enabled" + depends on !GUP_TEST && !DEBUG_FS config GUP_GET_PTE_LOW_HIGH bool @@ -859,4 +872,7 @@ config ARCH_HAS_HUGEPD config MAPPING_DIRTY_HELPERS bool +config KMAP_LOCAL + bool + endmenu diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 864f129f1937..1e73717802f8 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -64,7 +64,6 @@ config PAGE_OWNER config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_POISONING_NO_SANITY if HIBERNATION help Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. The filling of the memory helps @@ -75,30 +74,11 @@ config PAGE_POISONING Note that "poison" here is not the same thing as the "HWPoison" for CONFIG_MEMORY_FAILURE. This is software poisoning only. - If unsure, say N - -config PAGE_POISONING_NO_SANITY - depends on PAGE_POISONING - bool "Only poison, don't sanity check" - help - Skip the sanity checking on alloc, only fill the pages with - poison on free. This reduces some of the overhead of the - poisoning feature. - - If you are only interested in sanitization, say Y. Otherwise - say N. + If you are only interested in sanitization of freed pages without + checking the poison pattern on alloc, you can boot the kernel with + "init_on_free=1" instead of enabling this. 
-config PAGE_POISONING_ZERO - bool "Use zero for poisoning instead of debugging value" - depends on PAGE_POISONING - help - Instead of using the existing poison value, fill the pages with - zeros. This makes it harder to detect when errors are occurring - due to sanitization but the zeroing at free means that it is - no longer necessary to write zeros when GFP_ZERO is used on - allocation. - - If unsure, say N + If unsure, say N config DEBUG_PAGE_REF bool "Enable tracepoint to track down page reference manipulation" diff --git a/mm/Makefile b/mm/Makefile index d73aed0fc99c..b6cd2fffa492 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o $(mmu-y) + debug.o gup.o mmap_lock.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o @@ -90,7 +90,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o -obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o +obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 408d5051d05b..e33797579338 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -150,11 +150,11 @@ static ssize_t read_ahead_kb_store(struct device *dev, #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ - struct device_attribute *attr, char *page) \ + struct device_attribute *attr, char *buf) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ - return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ + return sysfs_emit(buf, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); @@ -200,11 
+200,11 @@ BDI_SHOW(max_ratio, bdi->max_ratio) static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, - char *page) + char *buf) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); - return snprintf(page, PAGE_SIZE-1, "%d\n", 0); + return sysfs_emit(buf, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); @@ -38,7 +38,6 @@ struct cma cma_areas[MAX_CMA_AREAS]; unsigned cma_area_count; -static DEFINE_MUTEX(cma_mutex); phys_addr_t cma_get_base(const struct cma *cma) { @@ -454,10 +453,9 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, mutex_unlock(&cma->lock); pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); - mutex_lock(&cma_mutex); ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0)); - mutex_unlock(&cma_mutex); + if (ret == 0) { page = pfn_to_page(pfn); break; @@ -512,7 +510,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) if (!cma || !pages) return false; - pr_debug("%s(page %p)\n", __func__, (void *)pages); + pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count); pfn = page_to_pfn(pages); diff --git a/mm/compaction.c b/mm/compaction.c index 13cb7a961b31..e5acb9714436 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -157,7 +157,7 @@ EXPORT_SYMBOL(__ClearPageMovable); * allocation success. 
1 << compact_defer_shift, compactions are skipped up * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT */ -void defer_compaction(struct zone *zone, int order) +static void defer_compaction(struct zone *zone, int order) { zone->compact_considered = 0; zone->compact_defer_shift++; @@ -172,7 +172,7 @@ void defer_compaction(struct zone *zone, int order) } /* Returns true if compaction should be skipped this time */ -bool compaction_deferred(struct zone *zone, int order) +static bool compaction_deferred(struct zone *zone, int order) { unsigned long defer_limit = 1UL << zone->compact_defer_shift; @@ -209,7 +209,7 @@ void compaction_defer_reset(struct zone *zone, int order, } /* Returns true if restarting compaction after many failures */ -bool compaction_restarting(struct zone *zone, int order) +static bool compaction_restarting(struct zone *zone, int order) { if (order < zone->compact_order_failed) return false; @@ -237,7 +237,7 @@ static void reset_cached_positions(struct zone *zone) } /* - * Compound pages of >= pageblock_order should consistenly be skipped until + * Compound pages of >= pageblock_order should consistently be skipped until * released. It is always pointless to compact pages of such order (if they are * migratable), and the pageblocks they occupy cannot contain any free pages. */ @@ -804,7 +804,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, unsigned long nr_scanned = 0, nr_isolated = 0; struct lruvec *lruvec; unsigned long flags = 0; - bool locked = false; + struct lruvec *locked = NULL; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; bool skip_on_failure = false; @@ -868,11 +868,20 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * contention, to give chance to IRQs. Abort completely if * a fatal signal is pending. 
*/ - if (!(low_pfn % SWAP_CLUSTER_MAX) - && compact_unlock_should_abort(&pgdat->lru_lock, - flags, &locked, cc)) { - low_pfn = 0; - goto fatal_pending; + if (!(low_pfn % SWAP_CLUSTER_MAX)) { + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + + if (fatal_signal_pending(current)) { + cc->contended = true; + + low_pfn = 0; + goto fatal_pending; + } + + cond_resched(); } if (!pfn_valid_within(low_pfn)) @@ -890,6 +899,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { low_pfn = end_pfn; + page = NULL; goto isolate_abort; } valid_page = page; @@ -943,9 +953,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, - flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } if (!isolate_movable_page(page, isolate_mode)) @@ -971,10 +980,34 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page)) goto isolate_fail; + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. 
+ */ + if (unlikely(!get_page_unless_zero(page))) + goto isolate_fail; + + if (__isolate_lru_page_prepare(page, isolate_mode) != 0) + goto isolate_fail_put; + + /* Try isolate the page */ + if (!TestClearPageLRU(page)) + goto isolate_fail_put; + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + /* If we already hold the lock, we can skip some rechecking */ - if (!locked) { - locked = compact_lock_irqsave(&pgdat->lru_lock, - &flags, cc); + if (lruvec != locked) { + if (locked) + unlock_page_lruvec_irqrestore(locked, flags); + + compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); + locked = lruvec; + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); /* Try get exclusive access under lock */ if (!skip_updated) { @@ -983,10 +1016,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_abort; } - /* Recheck PageLRU and PageCompound under lock */ - if (!PageLRU(page)) - goto isolate_fail; - /* * Page become compound since the non-locked check, * and it's on LRU. It can only be a THP so the order @@ -994,15 +1023,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (unlikely(PageCompound(page) && !cc->alloc_contig)) { low_pfn += compound_nr(page) - 1; - goto isolate_fail; + SetPageLRU(page); + goto isolate_fail_put; } - } - - lruvec = mem_cgroup_page_lruvec(page, pgdat); - - /* Try isolate the page */ - if (__isolate_lru_page(page, isolate_mode) != 0) - goto isolate_fail; + } else + rcu_read_unlock(); /* The whole page is taken off the LRU; skip the tail pages. 
*/ if (PageCompound(page)) @@ -1032,6 +1057,15 @@ isolate_success: } continue; + +isolate_fail_put: + /* Avoid potential deadlock in freeing page under lru_lock */ + if (locked) { + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; + } + put_page(page); + isolate_fail: if (!skip_on_failure) continue; @@ -1043,8 +1077,8 @@ isolate_fail: */ if (nr_isolated) { if (locked) { - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - locked = false; + unlock_page_lruvec_irqrestore(locked, flags); + locked = NULL; } putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; @@ -1068,9 +1102,15 @@ isolate_fail: if (unlikely(low_pfn > end_pfn)) low_pfn = end_pfn; + page = NULL; + isolate_abort: if (locked) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(locked, flags); + if (page) { + SetPageLRU(page); + put_page(page); + } /* * Updated the cached scanner pfn once the pageblock has been scanned @@ -2070,13 +2110,6 @@ static enum compact_result compact_finished(struct compact_control *cc) return ret; } -/* - * compaction_suitable: Is this suitable to run compaction on this zone now? - * Returns - * COMPACT_SKIPPED - If there are too few free pages for compaction - * COMPACT_SUCCESS - If the allocation would succeed without compaction - * COMPACT_CONTINUE - If compaction should run now - */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int highest_zoneidx, @@ -2120,6 +2153,13 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? 
+ * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_SUCCESS - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int highest_zoneidx) @@ -2275,7 +2315,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) { int err; - unsigned long start_pfn = cc->migrate_pfn; + unsigned long iteration_start_pfn = cc->migrate_pfn; /* * Avoid multiple rescans which can happen if a page cannot be @@ -2287,7 +2327,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) */ cc->rescan = false; if (pageblock_start_pfn(last_migrated_pfn) == - pageblock_start_pfn(start_pfn)) { + pageblock_start_pfn(iteration_start_pfn)) { cc->rescan = true; } @@ -2311,8 +2351,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto check_drain; case ISOLATE_SUCCESS: update_cached = false; - last_migrated_pfn = start_pfn; - ; + last_migrated_pfn = iteration_start_pfn; } err = migrate_pages(&cc->migratepages, compaction_alloc, diff --git a/mm/debug.c b/mm/debug.c index ccca576b2899..8a40b3fefbeb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,8 +182,8 @@ hex_only: pr_warn("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG - if (!page_poisoned && page->mem_cgroup) - pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); + if (!page_poisoned && page->memcg_data) + pr_warn("pages's memcg:%lx\n", page->memcg_data); #endif } diff --git a/mm/filemap.c b/mm/filemap.c index 0b2067b3c328..5c9d564317a5 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -102,8 +102,8 @@ * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) * ->i_pages lock (try_to_unmap_one) - * ->pgdat->lru_lock (follow_page->mark_page_accessed) - * ->pgdat->lru_lock (check_pte_range->isolate_lru_page) + * 
->lruvec->lru_lock (follow_page->mark_page_accessed) + * ->lruvec->lru_lock (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) @@ -204,9 +204,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageSwapBacked(page)) { __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) - __dec_node_page_state(page, NR_SHMEM_THPS); + __dec_lruvec_page_state(page, NR_SHMEM_THPS); } else if (PageTransHuge(page)) { - __dec_node_page_state(page, NR_FILE_THPS); + __dec_lruvec_page_state(page, NR_FILE_THPS); filemap_nr_thps_dec(mapping); } @@ -1359,7 +1359,7 @@ static int __wait_on_page_locked_async(struct page *page, else ret = PageLocked(page); /* - * If we were succesful now, we know we're still on the + * If we were successful now, we know we're still on the * waitqueue as we're still under the lock. This means it's * safe to remove and return success, we know the callback * isn't going to trigger. @@ -1583,19 +1583,20 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, else wait_on_page_locked(page); return 0; - } else { - if (flags & FAULT_FLAG_KILLABLE) { - int ret; + } + if (flags & FAULT_FLAG_KILLABLE) { + int ret; - ret = __lock_page_killable(page); - if (ret) { - mmap_read_unlock(mm); - return 0; - } - } else - __lock_page(page); - return 1; + ret = __lock_page_killable(page); + if (ret) { + mmap_read_unlock(mm); + return 0; + } + } else { + __lock_page(page); } + return 1; + } /** @@ -2166,6 +2167,259 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) ra->ra_pages /= 4; } +static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) +{ + if (iocb->ki_flags & IOCB_WAITQ) + return lock_page_async(page, iocb->ki_waitq); + else if (iocb->ki_flags & IOCB_NOWAIT) + return trylock_page(page) ? 
0 : -EAGAIN; + else + return lock_page_killable(page); +} + +static struct page * +generic_file_buffered_read_readpage(struct kiocb *iocb, + struct file *filp, + struct address_space *mapping, + struct page *page) +{ + struct file_ra_state *ra = &filp->f_ra; + int error; + + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EAGAIN); + } + + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); + /* Start the actual read. The read will unlock the page. */ + error = mapping->a_ops->readpage(filp, page); + + if (unlikely(error)) { + put_page(page); + return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; + } + + if (!PageUptodate(page)) { + error = lock_page_for_iocb(iocb, page); + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + if (!PageUptodate(page)) { + if (page->mapping == NULL) { + /* + * invalidate_mapping_pages got it + */ + unlock_page(page); + put_page(page); + return NULL; + } + unlock_page(page); + shrink_readahead_size_eio(ra); + put_page(page); + return ERR_PTR(-EIO); + } + unlock_page(page); + } + + return page; +} + +static struct page * +generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, + struct file *filp, + struct iov_iter *iter, + struct page *page, + loff_t pos, loff_t count) +{ + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; + int error; + + /* + * See comment in do_read_cache_page on why + * wait_on_page_locked is used to avoid unnecessarily + * serialisations and why it's safe. 
+ */ + if (iocb->ki_flags & IOCB_WAITQ) { + error = wait_on_page_locked_async(page, + iocb->ki_waitq); + } else { + error = wait_on_page_locked_killable(page); + } + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + if (PageUptodate(page)) + return page; + + if (inode->i_blkbits == PAGE_SHIFT || + !mapping->a_ops->is_partially_uptodate) + goto page_not_up_to_date; + /* pipes can't handle partially uptodate pages */ + if (unlikely(iov_iter_is_pipe(iter))) + goto page_not_up_to_date; + if (!trylock_page(page)) + goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; + if (!mapping->a_ops->is_partially_uptodate(page, + pos & ~PAGE_MASK, count)) + goto page_not_up_to_date_locked; + unlock_page(page); + return page; + +page_not_up_to_date: + /* Get exclusive access to the page ... */ + error = lock_page_for_iocb(iocb, page); + if (unlikely(error)) { + put_page(page); + return ERR_PTR(error); + } + +page_not_up_to_date_locked: + /* Did it get truncated before we got the lock? */ + if (!page->mapping) { + unlock_page(page); + put_page(page); + return NULL; + } + + /* Did somebody else fill it already? */ + if (PageUptodate(page)) { + unlock_page(page); + return page; + } + + return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +} + +static struct page * +generic_file_buffered_read_no_cached_page(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + struct page *page; + int error; + + if (iocb->ki_flags & IOCB_NOIO) + return ERR_PTR(-EAGAIN); + + /* + * Ok, it wasn't cached, so we need to create a new + * page.. 
+ */ + page = page_cache_alloc(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (error) { + put_page(page); + return error != -EEXIST ? ERR_PTR(error) : NULL; + } + + return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +} + +static int generic_file_buffered_read_get_pages(struct kiocb *iocb, + struct iov_iter *iter, + struct page **pages, + unsigned int nr) +{ + struct file *filp = iocb->ki_filp; + struct address_space *mapping = filp->f_mapping; + struct file_ra_state *ra = &filp->f_ra; + pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; + pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; + int i, j, nr_got, err = 0; + + nr = min_t(unsigned long, last_index - index, nr); +find_page: + if (fatal_signal_pending(current)) + return -EINTR; + + nr_got = find_get_pages_contig(mapping, index, nr, pages); + if (nr_got) + goto got_pages; + + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + + page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); + + nr_got = find_get_pages_contig(mapping, index, nr, pages); + if (nr_got) + goto got_pages; + + pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); + err = PTR_ERR_OR_ZERO(pages[0]); + if (!IS_ERR_OR_NULL(pages[0])) + nr_got = 1; +got_pages: + for (i = 0; i < nr_got; i++) { + struct page *page = pages[i]; + pgoff_t pg_index = index + i; + loff_t pg_pos = max(iocb->ki_pos, + (loff_t) pg_index << PAGE_SHIFT); + loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; + + if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_NOIO) { + for (j = i; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = -EAGAIN; + break; + } + page_cache_async_readahead(mapping, ra, filp, page, + pg_index, last_index - pg_index); + } + + if (!PageUptodate(page)) { + if ((iocb->ki_flags & IOCB_NOWAIT) || + ((iocb->ki_flags & IOCB_WAITQ) && i)) { + for (j = i; j < 
nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = -EAGAIN; + break; + } + + page = generic_file_buffered_read_pagenotuptodate(iocb, + filp, iter, page, pg_pos, pg_count); + if (IS_ERR_OR_NULL(page)) { + for (j = i + 1; j < nr_got; j++) + put_page(pages[j]); + nr_got = i; + err = PTR_ERR_OR_ZERO(page); + break; + } + } + } + + if (likely(nr_got)) + return nr_got; + if (err) + return err; + /* + * No pages and no error means we raced and should retry: + */ + goto find_page; +} + /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read @@ -2186,294 +2440,120 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; + struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct file_ra_state *ra = &filp->f_ra; - loff_t *ppos = &iocb->ki_pos; - pgoff_t index; - pgoff_t last_index; - pgoff_t prev_index; - unsigned long offset; /* offset into pagecache page */ - unsigned int prev_offset; - int error = 0; - - if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; + unsigned int nr_pages = min_t(unsigned int, 512, + ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (iocb->ki_pos >> PAGE_SHIFT)); + int i, pg_nr, error = 0; + bool writably_mapped; + loff_t isize, end_offset; + + if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; + if (unlikely(!iov_iter_count(iter))) + return 0; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - index = *ppos >> PAGE_SHIFT; - prev_index = ra->prev_pos >> PAGE_SHIFT; - prev_offset = ra->prev_pos & (PAGE_SIZE-1); - last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; + if (nr_pages > ARRAY_SIZE(pages_onstack)) + pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - /* - * If we've already successfully copied some data, then we 
- * can no longer safely return -EIOCBQUEUED. Hence mark - * an async read NOWAIT at that point. - */ - if (written && (iocb->ki_flags & IOCB_WAITQ)) - iocb->ki_flags |= IOCB_NOWAIT; - - for (;;) { - struct page *page; - pgoff_t end_index; - loff_t isize; - unsigned long nr, ret; + if (!pages) { + pages = pages_onstack; + nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); + } + do { cond_resched(); -find_page: - if (fatal_signal_pending(current)) { - error = -EINTR; - goto out; - } - page = find_get_page(mapping, index); - if (!page) { - if (iocb->ki_flags & IOCB_NOIO) - goto would_block; - page_cache_sync_readahead(mapping, - ra, filp, - index, last_index - index); - page = find_get_page(mapping, index); - if (unlikely(page == NULL)) - goto no_cached_page; - } - if (PageReadahead(page)) { - if (iocb->ki_flags & IOCB_NOIO) { - put_page(page); - goto out; - } - page_cache_async_readahead(mapping, - ra, filp, page, - index, last_index - index); - } - if (!PageUptodate(page)) { - /* - * See comment in do_read_cache_page on why - * wait_on_page_locked is used to avoid unnecessarily - * serialisations and why it's safe. - */ - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = wait_on_page_locked_async(page, - iocb->ki_waitq); - } else { - if (iocb->ki_flags & IOCB_NOWAIT) { - put_page(page); - goto would_block; - } - error = wait_on_page_locked_killable(page); - } - if (unlikely(error)) - goto readpage_error; - if (PageUptodate(page)) - goto page_ok; - - if (inode->i_blkbits == PAGE_SHIFT || - !mapping->a_ops->is_partially_uptodate) - goto page_not_up_to_date; - /* pipes can't handle partially uptodate pages */ - if (unlikely(iov_iter_is_pipe(iter))) - goto page_not_up_to_date; - if (!trylock_page(page)) - goto page_not_up_to_date; - /* Did it get truncated before we got the lock? 
*/ - if (!page->mapping) - goto page_not_up_to_date_locked; - if (!mapping->a_ops->is_partially_uptodate(page, - offset, iter->count)) - goto page_not_up_to_date_locked; - unlock_page(page); + /* + * If we've already successfully copied some data, then we + * can no longer safely return -EIOCBQUEUED. Hence mark + * an async read NOWAIT at that point. + */ + if ((iocb->ki_flags & IOCB_WAITQ) && written) + iocb->ki_flags |= IOCB_NOWAIT; + + i = 0; + pg_nr = generic_file_buffered_read_get_pages(iocb, iter, + pages, nr_pages); + if (pg_nr < 0) { + error = pg_nr; + break; } -page_ok: + /* - * i_size must be checked after we know the page is Uptodate. + * i_size must be checked after we know the pages are Uptodate. * * Checking i_size after the check allows us to calculate * the correct value for "nr", which means the zero-filled * part of the page is not copied back to userspace (unless * another truncate extends the file - this is desired though). */ - isize = i_size_read(inode); - end_index = (isize - 1) >> PAGE_SHIFT; - if (unlikely(!isize || index > end_index)) { - put_page(page); - goto out; - } + if (unlikely(iocb->ki_pos >= isize)) + goto put_pages; - /* nr is the maximum number of bytes to copy from this page */ - nr = PAGE_SIZE; - if (index == end_index) { - nr = ((isize - 1) & ~PAGE_MASK) + 1; - if (nr <= offset) { - put_page(page); - goto out; - } - } - nr = nr - offset; + end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - /* If users can be writing to this page using arbitrary - * virtual addresses, take care about potential aliasing - * before reading the page on the kernel side. - */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > + (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) + put_page(pages[--pg_nr]); /* - * When a sequential read accesses a page several times, - * only mark it as accessed the first time. 
+ * Once we start copying data, we don't want to be touching any + * cachelines that might be contended: */ - if (prev_index != index || offset != prev_offset) - mark_page_accessed(page); - prev_index = index; + writably_mapped = mapping_writably_mapped(mapping); /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... + * When a sequential read accesses a page several times, only + * mark it as accessed the first time. */ + if (iocb->ki_pos >> PAGE_SHIFT != + ra->prev_pos >> PAGE_SHIFT) + mark_page_accessed(pages[0]); + for (i = 1; i < pg_nr; i++) + mark_page_accessed(pages[i]); + + for (i = 0; i < pg_nr; i++) { + unsigned int offset = iocb->ki_pos & ~PAGE_MASK; + unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, + PAGE_SIZE - offset); + unsigned int copied; - ret = copy_page_to_iter(page, offset, nr, iter); - offset += ret; - index += offset >> PAGE_SHIFT; - offset &= ~PAGE_MASK; - prev_offset = offset; - - put_page(page); - written += ret; - if (!iov_iter_count(iter)) - goto out; - if (ret < nr) { - error = -EFAULT; - goto out; - } - continue; - -page_not_up_to_date: - /* Get exclusive access to the page ... */ - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = lock_page_async(page, iocb->ki_waitq); - } else { - error = lock_page_killable(page); - } - if (unlikely(error)) - goto readpage_error; - -page_not_up_to_date_locked: - /* Did it get truncated before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - put_page(page); - continue; - } - - /* Did somebody else fill it already? */ - if (PageUptodate(page)) { - unlock_page(page); - goto page_ok; - } - -readpage: - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { - unlock_page(page); - put_page(page); - goto would_block; - } - /* - * A previous I/O error may have been due to temporary - * failures, eg. multipath errors. - * PG_error will be set again if readpage fails. 
- */ - ClearPageError(page); - /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(filp, page); + /* + * If users can be writing to this page using arbitrary + * virtual addresses, take care about potential aliasing + * before reading the page on the kernel side. + */ + if (writably_mapped) + flush_dcache_page(pages[i]); - if (unlikely(error)) { - if (error == AOP_TRUNCATED_PAGE) { - put_page(page); - error = 0; - goto find_page; - } - goto readpage_error; - } + copied = copy_page_to_iter(pages[i], offset, bytes, iter); - if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_WAITQ) { - if (written) { - put_page(page); - goto out; - } - error = lock_page_async(page, iocb->ki_waitq); - } else { - error = lock_page_killable(page); - } + written += copied; + iocb->ki_pos += copied; + ra->prev_pos = iocb->ki_pos; - if (unlikely(error)) - goto readpage_error; - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_mapping_pages got it - */ - unlock_page(page); - put_page(page); - goto find_page; - } - unlock_page(page); - shrink_readahead_size_eio(ra); - error = -EIO; - goto readpage_error; + if (copied < bytes) { + error = -EFAULT; + break; } - unlock_page(page); } +put_pages: + for (i = 0; i < pg_nr; i++) + put_page(pages[i]); + } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); - goto page_ok; - -readpage_error: - /* UHHUH! A synchronous read error occurred. Report it */ - put_page(page); - goto out; - -no_cached_page: - /* - * Ok, it wasn't cached, so we need to create a new - * page.. 
- */ - page = page_cache_alloc(mapping); - if (!page) { - error = -ENOMEM; - goto out; - } - error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (error) { - put_page(page); - if (error == -EEXIST) { - error = 0; - goto find_page; - } - goto out; - } - goto readpage; - } + file_accessed(filp); -would_block: - error = -EAGAIN; -out: - ra->prev_pos = prev_index; - ra->prev_pos <<= PAGE_SHIFT; - ra->prev_pos |= prev_offset; + if (pages != pages_onstack) + kfree(pages); - *ppos = ((loff_t)index << PAGE_SHIFT) + offset; - file_accessed(filp); return written ? written : error; } EXPORT_SYMBOL_GPL(generic_file_buffered_read); @@ -2904,14 +2984,14 @@ EXPORT_SYMBOL(filemap_map_pages); vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { + struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct page *page = vmf->page; - struct inode *inode = file_inode(vmf->vma->vm_file); vm_fault_t ret = VM_FAULT_LOCKED; - sb_start_pagefault(inode->i_sb); + sb_start_pagefault(mapping->host->i_sb); file_update_time(vmf->vma->vm_file); lock_page(page); - if (page->mapping != inode->i_mapping) { + if (page->mapping != mapping) { unlock_page(page); ret = VM_FAULT_NOPAGE; goto out; @@ -2924,7 +3004,7 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) set_page_dirty(page); wait_for_stable_page(page); out: - sb_end_pagefault(inode->i_sb); + sb_end_pagefault(mapping->host->i_sb); return ret; } @@ -3167,10 +3247,9 @@ void dio_warn_stale_pagecache(struct file *filp) { static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); char pathname[128]; - struct inode *inode = file_inode(filp); char *path; - errseq_set(&inode->i_mapping->wb_err, -EIO); + errseq_set(&filp->f_mapping->wb_err, -EIO); if (__ratelimit(&_rs)) { path = file_path(filp, pathname, sizeof(pathname)); if (IS_ERR(path)) @@ -3197,7 +3276,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) { /* If 
there are pages to writeback, return */ - if (filemap_range_has_page(inode->i_mapping, pos, + if (filemap_range_has_page(file->f_mapping, pos, pos + write_len - 1)) return -EAGAIN; } else { @@ -123,6 +123,28 @@ static __maybe_unused struct page *try_grab_compound_head(struct page *page, return NULL; } +static void put_compound_head(struct page *page, int refs, unsigned int flags) +{ + if (flags & FOLL_PIN) { + mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, + refs); + + if (hpage_pincount_available(page)) + hpage_pincount_sub(page, refs); + else + refs *= GUP_PIN_COUNTING_BIAS; + } + + VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); + /* + * Calling put_page() for each ref is unnecessarily slow. Only the last + * ref needs a put_page(). + */ + if (refs > 1) + page_ref_sub(page, refs - 1); + put_page(page); +} + /** * try_grab_page() - elevate a page's refcount by a flag-dependent amount * @@ -177,41 +199,6 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) return true; } -#ifdef CONFIG_DEV_PAGEMAP_OPS -static bool __unpin_devmap_managed_user_page(struct page *page) -{ - int count, refs = 1; - - if (!page_is_devmap_managed(page)) - return false; - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - count = page_ref_sub_return(page, refs); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1); - /* - * devmap page refcounts are 1-based, rather than 0-based: if - * refcount is 1, then the page is free and the refcount is - * stable because nobody holds a reference on the page. 
- */ - if (count == 1) - free_devmap_managed_page(page); - else if (!count) - __put_page(page); - - return true; -} -#else -static bool __unpin_devmap_managed_user_page(struct page *page) -{ - return false; -} -#endif /* CONFIG_DEV_PAGEMAP_OPS */ - /** * unpin_user_page() - release a dma-pinned page * @page: pointer to page to be released @@ -223,28 +210,7 @@ static bool __unpin_devmap_managed_user_page(struct page *page) */ void unpin_user_page(struct page *page) { - int refs = 1; - - page = compound_head(page); - - /* - * For devmap managed pages we need to catch refcount transition from - * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the - * page is free and we need to inform the device driver through - * callback. See include/linux/memremap.h and HMM for details. - */ - if (__unpin_devmap_managed_user_page(page)) - return; - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, 1); - else - refs = GUP_PIN_COUNTING_BIAS; - - if (page_ref_sub_and_test(page, refs)) - __put_page(page); - - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1); + put_compound_head(compound_head(page), 1, FOLL_PIN); } EXPORT_SYMBOL(unpin_user_page); @@ -923,6 +889,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma)) return -EFAULT; + if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) + return -EOPNOTSUPP; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -1060,10 +1029,14 @@ static long __get_user_pages(struct mm_struct *mm, goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) { + if (!vma) { ret = -EFAULT; goto out; } + ret = check_vma_flags(vma, gup_flags); + if (ret) + goto out; + if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -1567,26 +1540,6 @@ struct page *get_dump_page(unsigned long addr) } #endif /* CONFIG_ELF_CORE */ -#if defined(CONFIG_FS_DAX) || defined 
(CONFIG_CMA) -static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - long i; - struct vm_area_struct *vma_prev = NULL; - - for (i = 0; i < nr_pages; i++) { - struct vm_area_struct *vma = vmas[i]; - - if (vma == vma_prev) - continue; - - vma_prev = vma; - - if (vma_is_fsdax(vma)) - return true; - } - return false; -} - #ifdef CONFIG_CMA static long check_and_migrate_cma_pages(struct mm_struct *mm, unsigned long start, @@ -1705,63 +1658,23 @@ static long __gup_longterm_locked(struct mm_struct *mm, struct vm_area_struct **vmas, unsigned int gup_flags) { - struct vm_area_struct **vmas_tmp = vmas; unsigned long flags = 0; - long rc, i; + long rc; - if (gup_flags & FOLL_LONGTERM) { - if (!pages) - return -EINVAL; - - if (!vmas_tmp) { - vmas_tmp = kcalloc(nr_pages, - sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas_tmp) - return -ENOMEM; - } + if (gup_flags & FOLL_LONGTERM) flags = memalloc_nocma_save(); - } - rc = __get_user_pages_locked(mm, start, nr_pages, pages, - vmas_tmp, NULL, gup_flags); + rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, + gup_flags); if (gup_flags & FOLL_LONGTERM) { - if (rc < 0) - goto out; - - if (check_dax_vmas(vmas_tmp, rc)) { - if (gup_flags & FOLL_PIN) - unpin_user_pages(pages, rc); - else - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; - } - - rc = check_and_migrate_cma_pages(mm, start, rc, pages, - vmas_tmp, gup_flags); -out: + if (rc > 0) + rc = check_and_migrate_cma_pages(mm, start, rc, pages, + vmas, gup_flags); memalloc_nocma_restore(flags); } - - if (vmas_tmp != vmas) - kfree(vmas_tmp); return rc; } -#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ -static __always_inline long __gup_longterm_locked(struct mm_struct *mm, - unsigned long start, - unsigned long nr_pages, - struct page **pages, - struct vm_area_struct **vmas, - unsigned int flags) -{ - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, flags); -} -#endif /* CONFIG_FS_DAX 
|| CONFIG_CMA */ static bool is_valid_gup_flags(unsigned int gup_flags) { @@ -1932,7 +1845,19 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages); /** - * get_user_pages_locked() is suitable to replace the form: + * get_user_pages_locked() - variant of get_user_pages() + * + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying lookup behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. + * + * It is suitable to replace the form: * * mmap_read_lock(mm); * do_something() @@ -1948,16 +1873,6 @@ EXPORT_SYMBOL(get_user_pages); * if (locked) * mmap_read_unlock(mm); * - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying lookup behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @locked: pointer to lock flag indicating whether lock is held and - * subsequently whether VM_FAULT_RETRY functionality can be - * utilised. Lock must initially be held. - * * We can leverage the VM_FAULT_RETRY functionality in the page fault * paths better by using either get_user_pages_locked() or * get_user_pages_unlocked(). 
@@ -2063,84 +1978,6 @@ EXPORT_SYMBOL(get_user_pages_unlocked); */ #ifdef CONFIG_HAVE_FAST_GUP -static void put_compound_head(struct page *page, int refs, unsigned int flags) -{ - if (flags & FOLL_PIN) { - mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, - refs); - - if (hpage_pincount_available(page)) - hpage_pincount_sub(page, refs); - else - refs *= GUP_PIN_COUNTING_BIAS; - } - - VM_BUG_ON_PAGE(page_ref_count(page) < refs, page); - /* - * Calling put_page() for each ref is unnecessarily slow. Only the last - * ref needs a put_page(). - */ - if (refs > 1) - page_ref_sub(page, refs - 1); - put_page(page); -} - -#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH - -/* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking any - * locks. For this we would like to load the pointers atomically, but sometimes - * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What - * we do have is the guarantee that a PTE will only either go from not present - * to present, or present to not present or both -- it will not switch to a - * completely different present page without a TLB flush in between; something - * that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. - * We load pte_high *after* loading pte_low, which ensures we don't see an older - * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't - * picked up a changed pte high. We might have gotten rubbish values from - * pte_low and pte_high, but we are guaranteed that pte_low will not have the - * present bit set *unless* it is 'l'. 
Because get_user_pages_fast() only - * operates on present ptes we're safe. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - pte_t pte; - - do { - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - } while (unlikely(pte.pte_low != ptep->pte_low)); - - return pte; -} -#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ -/* - * We require that the PTE can be read atomically. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - return ptep_get(ptep); -} -#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ - static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start, unsigned int flags, struct page **pages) @@ -2166,7 +2003,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, ptem = ptep = pte_offset_map(&pmd, addr); do { - pte_t pte = gup_get_pte(ptep); + pte_t pte = ptep_get_lockless(ptep); struct page *head, *page; /* @@ -2677,13 +2514,61 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages, return ret; } -static int internal_get_user_pages_fast(unsigned long start, int nr_pages, +static unsigned long lockless_pages_from_mm(unsigned long start, + unsigned long end, + unsigned int gup_flags, + struct page **pages) +{ + unsigned long flags; + int nr_pinned = 0; + unsigned seq; + + if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || + !gup_fast_permitted(start, end)) + return 0; + + if (gup_flags & FOLL_PIN) { + seq = raw_read_seqcount(¤t->mm->write_protect_seq); + if (seq & 1) + return 0; + } + + /* + * Disable interrupts. The nested form is used, in order to allow full, + * general purpose use of this routine. + * + * With interrupts disabled, we block page table pages from being freed + * from under us. See struct mmu_table_batch comments in + * include/asm-generic/tlb.h for more details. + * + * We do not adopt an rcu_read_lock() here as we also want to block IPIs + * that come from THPs splitting. 
+ */ + local_irq_save(flags); + gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); + local_irq_restore(flags); + + /* + * When pinning pages for DMA there could be a concurrent write protect + * from fork() via copy_page_range(), in this case always fail fast GUP. + */ + if (gup_flags & FOLL_PIN) { + if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { + unpin_user_pages(pages, nr_pinned); + return 0; + } + } + return nr_pinned; +} + +static int internal_get_user_pages_fast(unsigned long start, + unsigned long nr_pages, unsigned int gup_flags, struct page **pages) { - unsigned long addr, len, end; - unsigned long flags; - int nr_pinned = 0, ret = 0; + unsigned long len, end; + unsigned long nr_pinned; + int ret; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | @@ -2697,54 +2582,33 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, might_lock_read(¤t->mm->mmap_lock); start = untagged_addr(start) & PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - - if (end <= start) + len = nr_pages << PAGE_SHIFT; + if (check_add_overflow(start, len, &end)) return 0; if (unlikely(!access_ok((void __user *)start, len))) return -EFAULT; - /* - * Disable interrupts. The nested form is used, in order to allow - * full, general purpose use of this routine. - * - * With interrupts disabled, we block page table pages from being - * freed from under us. See struct mmu_table_batch comments in - * include/asm-generic/tlb.h for more details. - * - * We do not adopt an rcu_read_lock(.) here as we also want to - * block IPIs that come from THPs splitting. 
- */ - if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { - unsigned long fast_flags = gup_flags; + nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages); + if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) + return nr_pinned; - local_irq_save(flags); - gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned); - local_irq_restore(flags); - ret = nr_pinned; - } - - if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) { - /* Try to get the remaining pages with get_user_pages */ - start += nr_pinned << PAGE_SHIFT; - pages += nr_pinned; - - ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, - gup_flags, pages); - - /* Have to be a bit careful with return values */ - if (nr_pinned > 0) { - if (ret < 0) - ret = nr_pinned; - else - ret += nr_pinned; - } + /* Slow path: try to get the remaining pages with get_user_pages */ + start += nr_pinned << PAGE_SHIFT; + pages += nr_pinned; + ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, + pages); + if (ret < 0) { + /* + * The caller has to unpin the pages we already pinned so + * returning -errno is not an option + */ + if (nr_pinned) + return nr_pinned; + return ret; } - - return ret; + return ret + nr_pinned; } + /** * get_user_pages_fast_only() - pin user pages in memory * @start: starting user address diff --git a/mm/gup_benchmark.c b/mm/gup_test.c index 8b3e5b5cd8fa..e3cf78e5873e 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_test.c @@ -4,40 +4,34 @@ #include <linux/uaccess.h> #include <linux/ktime.h> #include <linux/debugfs.h> - -#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark) -#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark) -#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark) -#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark) -#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark) - -struct gup_benchmark { - __u64 get_delta_usec; - __u64 put_delta_usec; - __u64 addr; - __u64 size; - __u32 
nr_pages_per_call; - __u32 flags; - __u64 expansion[10]; /* For future use */ -}; +#include "gup_test.h" static void put_back_pages(unsigned int cmd, struct page **pages, - unsigned long nr_pages) + unsigned long nr_pages, unsigned int gup_test_flags) { unsigned long i; switch (cmd) { case GUP_FAST_BENCHMARK: - case GUP_BENCHMARK: + case GUP_BASIC_TEST: for (i = 0; i < nr_pages; i++) put_page(pages[i]); break; case PIN_FAST_BENCHMARK: - case PIN_BENCHMARK: + case PIN_BASIC_TEST: case PIN_LONGTERM_BENCHMARK: unpin_user_pages(pages, nr_pages); break; + case DUMP_USER_PAGES_TEST: + if (gup_test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) { + unpin_user_pages(pages, nr_pages); + } else { + for (i = 0; i < nr_pages; i++) + put_page(pages[i]); + + } + break; } } @@ -49,14 +43,14 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, switch (cmd) { case PIN_FAST_BENCHMARK: - case PIN_BENCHMARK: + case PIN_BASIC_TEST: case PIN_LONGTERM_BENCHMARK: for (i = 0; i < nr_pages; i++) { page = pages[i]; if (WARN(!page_maybe_dma_pinned(page), "pages[%lu] is NOT dma-pinned\n", i)) { - dump_page(page, "gup_benchmark failure"); + dump_page(page, "gup_test failure"); break; } } @@ -64,8 +58,39 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages, } } -static int __gup_benchmark_ioctl(unsigned int cmd, - struct gup_benchmark *gup) +static void dump_pages_test(struct gup_test *gup, struct page **pages, + unsigned long nr_pages) +{ + unsigned int index_to_dump; + unsigned int i; + + /* + * Zero out any user-supplied page index that is out of range. Remember: + * .which_pages[] contains a 1-based set of page indices. 
+ */ + for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) { + if (gup->which_pages[i] > nr_pages) { + pr_warn("ZEROING due to out of range: .which_pages[%u]: %u\n", + i, gup->which_pages[i]); + gup->which_pages[i] = 0; + } + } + + for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) { + index_to_dump = gup->which_pages[i]; + + if (index_to_dump) { + index_to_dump--; // Decode from 1-based, to 0-based + pr_info("---- page #%u, starting from user virt addr: 0x%llx\n", + index_to_dump, gup->addr); + dump_page(pages[index_to_dump], + "gup_test: dump_pages() test"); + } + } +} + +static int __gup_test_ioctl(unsigned int cmd, + struct gup_test *gup) { ktime_t start_time, end_time; unsigned long i, nr_pages, addr, next; @@ -109,7 +134,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = get_user_pages_fast(addr, nr, gup->flags, pages + i); break; - case GUP_BENCHMARK: + case GUP_BASIC_TEST: nr = get_user_pages(addr, nr, gup->flags, pages + i, NULL); break; @@ -117,7 +142,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, nr = pin_user_pages_fast(addr, nr, gup->flags, pages + i); break; - case PIN_BENCHMARK: + case PIN_BASIC_TEST: nr = pin_user_pages(addr, nr, gup->flags, pages + i, NULL); break; @@ -126,6 +151,14 @@ static int __gup_benchmark_ioctl(unsigned int cmd, gup->flags | FOLL_LONGTERM, pages + i, NULL); break; + case DUMP_USER_PAGES_TEST: + if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) + nr = pin_user_pages(addr, nr, gup->flags, + pages + i, NULL); + else + nr = get_user_pages(addr, nr, gup->flags, + pages + i, NULL); + break; default: ret = -EINVAL; goto unlock; @@ -149,9 +182,12 @@ static int __gup_benchmark_ioctl(unsigned int cmd, */ verify_dma_pinned(cmd, pages, nr_pages); + if (cmd == DUMP_USER_PAGES_TEST) + dump_pages_test(gup, pages, nr_pages); + start_time = ktime_get(); - put_back_pages(cmd, pages, nr_pages); + put_back_pages(cmd, pages, nr_pages, gup->flags); end_time = ktime_get(); gup->put_delta_usec = ktime_us_delta(end_time, start_time); 
@@ -164,18 +200,19 @@ free_pages: return ret; } -static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, +static long gup_test_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { - struct gup_benchmark gup; + struct gup_test gup; int ret; switch (cmd) { case GUP_FAST_BENCHMARK: - case GUP_BENCHMARK: case PIN_FAST_BENCHMARK: - case PIN_BENCHMARK: case PIN_LONGTERM_BENCHMARK: + case GUP_BASIC_TEST: + case PIN_BASIC_TEST: + case DUMP_USER_PAGES_TEST: break; default: return -EINVAL; @@ -184,7 +221,7 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, if (copy_from_user(&gup, (void __user *)arg, sizeof(gup))) return -EFAULT; - ret = __gup_benchmark_ioctl(cmd, &gup); + ret = __gup_test_ioctl(cmd, &gup); if (ret) return ret; @@ -194,17 +231,17 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd, return 0; } -static const struct file_operations gup_benchmark_fops = { +static const struct file_operations gup_test_fops = { .open = nonseekable_open, - .unlocked_ioctl = gup_benchmark_ioctl, + .unlocked_ioctl = gup_test_ioctl, }; -static int gup_benchmark_init(void) +static int __init gup_test_init(void) { - debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL, - &gup_benchmark_fops); + debugfs_create_file_unsafe("gup_test", 0600, NULL, NULL, + &gup_test_fops); return 0; } -late_initcall(gup_benchmark_init); +late_initcall(gup_test_init); diff --git a/mm/gup_test.h b/mm/gup_test.h new file mode 100644 index 000000000000..90a6713d50eb --- /dev/null +++ b/mm/gup_test.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __GUP_TEST_H +#define __GUP_TEST_H + +#include <linux/types.h> + +#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test) +#define PIN_FAST_BENCHMARK _IOWR('g', 2, struct gup_test) +#define PIN_LONGTERM_BENCHMARK _IOWR('g', 3, struct gup_test) +#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test) +#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test) +#define 
DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test) + +#define GUP_TEST_MAX_PAGES_TO_DUMP 8 + +#define GUP_TEST_FLAG_DUMP_PAGES_USE_PIN 0x1 + +struct gup_test { + __u64 get_delta_usec; + __u64 put_delta_usec; + __u64 addr; + __u64 size; + __u32 nr_pages_per_call; + __u32 flags; + /* + * Each non-zero entry is the number of the page (1-based: first page is + * page 1, so that zero entries mean "do nothing") from the .addr base. + */ + __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP]; +}; + +#endif /* __GUP_TEST_H */ diff --git a/mm/highmem.c b/mm/highmem.c index 1352a27951e3..c3a9ea7875ef 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -31,10 +31,6 @@ #include <asm/tlbflush.h> #include <linux/vmalloc.h> -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) -DEFINE_PER_CPU(int, __kmap_atomic_idx); -#endif - /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -108,9 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); - -unsigned int nr_free_highpages (void) +unsigned int __nr_free_highpages (void) { struct zone *zone; unsigned int pages = 0; @@ -147,7 +141,7 @@ pte_t * pkmap_page_table; do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif -struct page *kmap_to_page(void *vaddr) +struct page *__kmap_to_page(void *vaddr) { unsigned long addr = (unsigned long)vaddr; @@ -158,7 +152,7 @@ struct page *kmap_to_page(void *vaddr) return virt_to_page(addr); } -EXPORT_SYMBOL(kmap_to_page); +EXPORT_SYMBOL(__kmap_to_page); static void flush_all_zero_pkmaps(void) { @@ -200,10 +194,7 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } -/** - * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings - */ -void kmap_flush_unused(void) +void __kmap_flush_unused(void) { lock_kmap(); 
flush_all_zero_pkmaps(); @@ -367,9 +358,312 @@ void kunmap_high(struct page *page) if (need_wakeup) wake_up(pkmap_map_wait); } - EXPORT_SYMBOL(kunmap_high); -#endif /* CONFIG_HIGHMEM */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void zero_user_segments(struct page *page, unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +{ + unsigned int i; + + BUG_ON(end1 > page_size(page) || end2 > page_size(page)); + + for (i = 0; i < compound_nr(page); i++) { + void *kaddr = NULL; + + if (start1 < PAGE_SIZE || start2 < PAGE_SIZE) + kaddr = kmap_atomic(page + i); + + if (start1 >= PAGE_SIZE) { + start1 -= PAGE_SIZE; + end1 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end1, PAGE_SIZE); + + if (end1 > start1) + memset(kaddr + start1, 0, this_end - start1); + end1 -= this_end; + start1 = 0; + } + + if (start2 >= PAGE_SIZE) { + start2 -= PAGE_SIZE; + end2 -= PAGE_SIZE; + } else { + unsigned this_end = min_t(unsigned, end2, PAGE_SIZE); + + if (end2 > start2) + memset(kaddr + start2, 0, this_end - start2); + end2 -= this_end; + start2 = 0; + } + + if (kaddr) { + kunmap_atomic(kaddr); + flush_dcache_page(page + i); + } + + if (!end1 && !end2) + break; + } + + BUG_ON((start1 | start2 | end1 | end2) != 0); +} +EXPORT_SYMBOL(zero_user_segments); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* CONFIG_HIGHMEM */ + +#ifdef CONFIG_KMAP_LOCAL + +#include <asm/kmap_size.h> + +/* + * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second + * slot is unused which acts as a guard page + */ +#ifdef CONFIG_DEBUG_KMAP_LOCAL +# define KM_INCR 2 +#else +# define KM_INCR 1 +#endif + +static inline int kmap_local_idx_push(void) +{ + WARN_ON_ONCE(in_irq() && !irqs_disabled()); + current->kmap_ctrl.idx += KM_INCR; + BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); + return current->kmap_ctrl.idx - 1; +} + +static inline int kmap_local_idx(void) +{ + return current->kmap_ctrl.idx - 1; +} + +static inline void kmap_local_idx_pop(void) +{ + current->kmap_ctrl.idx -= 
KM_INCR; + BUG_ON(current->kmap_ctrl.idx < 0); +} + +#ifndef arch_kmap_local_post_map +# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) +#endif + +#ifndef arch_kmap_local_pre_unmap +# define arch_kmap_local_pre_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_post_unmap +# define arch_kmap_local_post_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_unmap_idx +#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_high_get +static inline void *arch_kmap_local_high_get(struct page *page) +{ + return NULL; +} +#endif + +/* Unmap a local mapping which was obtained by kmap_high_get() */ +static inline bool kmap_high_unmap_local(unsigned long vaddr) +{ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + return true; + } +#endif + return false; +} + +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +static pte_t *__kmap_pte; + +static pte_t *kmap_get_pte(void) +{ + if (!__kmap_pte) + __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + return __kmap_pte; +} + +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + pte_t pteval, *kmap_pte = kmap_get_pte(); + unsigned long vaddr; + int idx; + + /* + * Disable migration so resulting virtual address is stable + * accross preemption. 
+ */ + migrate_disable(); + preempt_disable(); + idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + BUG_ON(!pte_none(*(kmap_pte - idx))); + pteval = pfn_pte(pfn, prot); + set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(vaddr, pteval); + current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; + preempt_enable(); + + return (void *)vaddr; +} +EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); + +void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + void *kmap; + + /* + * To broaden the usage of the actual kmap_local() machinery always map + * pages when debugging is enabled and the architecture has no problems + * with alias mappings. + */ + if (!IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) && !PageHighMem(page)) + return page_address(page); + + /* Try kmap_high_get() if architecture has it enabled */ + kmap = arch_kmap_local_high_get(page); + if (kmap) + return kmap; + + return __kmap_local_pfn_prot(page_to_pfn(page), prot); +} +EXPORT_SYMBOL(__kmap_local_page_prot); + +void kunmap_local_indexed(void *vaddr) +{ + unsigned long addr = (unsigned long) vaddr & PAGE_MASK; + pte_t *kmap_pte = kmap_get_pte(); + int idx; + + if (addr < __fix_to_virt(FIX_KMAP_END) || + addr > __fix_to_virt(FIX_KMAP_BEGIN)) { + if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP)) { + /* This _should_ never happen! See above. */ + WARN_ON_ONCE(1); + return; + } + /* + * Handle mappings which were obtained by kmap_high_get() + * first as the virtual address of such mappings is below + * PAGE_OFFSET. Warn for all other addresses which are in + * the user space part of the virtual address space. 
+ */ + if (!kmap_high_unmap_local(addr)) + WARN_ON_ONCE(addr < PAGE_OFFSET); + return; + } + + preempt_disable(); + idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); + WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); + kmap_local_idx_pop(); + preempt_enable(); + migrate_enable(); +} +EXPORT_SYMBOL(kunmap_local_indexed); + +/* + * Invoked before switch_to(). This is safe even when during or after + * clearing the maps an interrupt which needs a kmap_local happens because + * the task::kmap_ctrl.idx is not modified by the unmapping code so a + * nested kmap_local will use the next unused index and restore the index + * on unmap. The already cleared kmaps of the outgoing task are irrelevant + * because the interrupt context does not know about them. The same applies + * when scheduling back in for an interrupt which happens before the + * restore is complete. + */ +void __kmap_local_sched_out(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Clear kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* + * This is a horrible hack for XTENSA to calculate the + * coloured PTE index. Uses the PFN encoded into the pteval + * and the map index calculation because the actual mapped + * virtual address is not stored in task::kmap_ctrl. + * For any sane architecture this is optimized out. 
+ */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + } +} + +void __kmap_local_sched_in(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Restore kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* See comment in __kmap_local_sched_out() */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(addr, pteval); + } +} + +void kmap_local_fork(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) + memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); +} + +#endif #if defined(HASHED_PAGE_VIRTUAL) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ec2bb93f7431..9237976abe72 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -163,12 +163,17 @@ static struct shrinker huge_zero_page_shrinker = { static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { + const char *output; + if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] madvise never\n"); - else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [madvise] never\n"); + output = "[always] madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always [madvise] never"; else - return sprintf(buf, "always madvise [never]\n"); + output = "always madvise [never]"; 
+ + return sysfs_emit(buf, "%s\n", output); } static ssize_t enabled_store(struct kobject *kobj, @@ -200,11 +205,11 @@ static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); ssize_t single_hugepage_flag_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf, - enum transparent_hugepage_flag flag) + struct kobj_attribute *attr, char *buf, + enum transparent_hugepage_flag flag) { - return sprintf(buf, "%d\n", - !!test_bit(flag, &transparent_hugepage_flags)); + return sysfs_emit(buf, "%d\n", + !!test_bit(flag, &transparent_hugepage_flags)); } ssize_t single_hugepage_flag_store(struct kobject *kobj, @@ -232,15 +237,24 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj, static ssize_t defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "[always] defer defer+madvise madvise never\n"); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always [defer] defer+madvise madvise never\n"); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer [defer+madvise] madvise never\n"); - if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags)) - return sprintf(buf, "always defer defer+madvise [madvise] never\n"); - return sprintf(buf, "always defer defer+madvise madvise [never]\n"); + const char *output; + + if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, + &transparent_hugepage_flags)) + output = "[always] defer defer+madvise madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, + &transparent_hugepage_flags)) + output = "always [defer] defer+madvise madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always defer [defer+madvise] 
madvise never"; + else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + output = "always defer defer+madvise [madvise] never"; + else + output = "always defer defer+madvise madvise [never]"; + + return sysfs_emit(buf, "%s\n", output); } static ssize_t defrag_store(struct kobject *kobj, @@ -281,10 +295,10 @@ static struct kobj_attribute defrag_attr = __ATTR(defrag, 0644, defrag_show, defrag_store); static ssize_t use_zero_page_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); + TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); } static ssize_t use_zero_page_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -296,9 +310,9 @@ static struct kobj_attribute use_zero_page_attr = __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); static ssize_t hpage_pmd_size_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE); + return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE); } static struct kobj_attribute hpage_pmd_size_attr = __ATTR_RO(hpage_pmd_size); @@ -470,7 +484,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) #ifdef CONFIG_MEMCG static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); if (memcg) @@ -2321,7 +2335,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; bool unmap_success; @@ -2345,6 +2359,27 @@ 
static void remap_page(struct page *page, unsigned int nr) } } +static void lru_add_page_tail(struct page *head, struct page *tail, + struct lruvec *lruvec, struct list_head *list) +{ + VM_BUG_ON_PAGE(!PageHead(head), head); + VM_BUG_ON_PAGE(PageCompound(tail), head); + VM_BUG_ON_PAGE(PageLRU(tail), head); + lockdep_assert_held(&lruvec->lru_lock); + + if (list) { + /* page reclaim is reclaiming a huge page */ + VM_WARN_ON(PageLRU(head)); + get_page(tail); + list_add_tail(&tail->lru, list); + } else { + /* head is still on lru (and we have it frozen) */ + VM_WARN_ON(!PageLRU(head)); + SetPageLRU(tail); + list_add_tail(&tail->lru, &head->lru); + } +} + static void __split_huge_page_tail(struct page *head, int tail, struct lruvec *lruvec, struct list_head *list) { @@ -2356,7 +2391,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * Clone page flags before unfreezing refcount. * * After successful get_page_unless_zero() might follow flags change, - * for exmaple lock_page() which set PG_waiters. + * for example lock_page() which set PG_waiters. 
*/ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & @@ -2411,18 +2446,15 @@ static void __split_huge_page_tail(struct page *head, int tail, } static void __split_huge_page(struct page *page, struct list_head *list, - pgoff_t end, unsigned long flags) + pgoff_t end) { struct page *head = compound_head(page); - pg_data_t *pgdat = page_pgdat(head); struct lruvec *lruvec; struct address_space *swap_cache = NULL; unsigned long offset = 0; unsigned int nr = thp_nr_pages(head); int i; - lruvec = mem_cgroup_page_lruvec(head, pgdat); - /* complete memcg works before add pages to LRU */ mem_cgroup_split_huge_fixup(head); @@ -2434,6 +2466,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, xa_lock(&swap_cache->i_pages); } + /* lock lru list/PageCompound, ref freezed by page_ref_freeze */ + lruvec = lock_page_lruvec(head); + for (i = nr - 1; i >= 1; i--) { __split_huge_page_tail(head, i, lruvec, list); /* Some pages can be beyond i_size: drop them from page cache */ @@ -2453,6 +2488,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); + unlock_page_lruvec(lruvec); + /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); @@ -2470,8 +2507,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, page_ref_add(head, 2); xa_unlock(&head->mapping->i_pages); } - - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + local_irq_enable(); remap_page(head, nr); @@ -2617,12 +2653,10 @@ bool can_split_huge_page(struct page *page, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct page *head = compound_head(page); - struct pglist_data *pgdata = NODE_DATA(page_to_nid(head)); struct deferred_split *ds_queue = get_deferred_split_queue(head); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int count, mapcount, extra_pins, ret; - unsigned long flags; pgoff_t end; 
VM_BUG_ON_PAGE(is_huge_zero_page(head), head); @@ -2683,9 +2717,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) unmap_page(head); VM_BUG_ON_PAGE(compound_mapcount(head), head); - /* prevent PageLRU to go away from under us, and freeze lru stats */ - spin_lock_irqsave(&pgdata->lru_lock, flags); - + /* block interrupt reentry in xa_lock and spinlock */ + local_irq_disable(); if (mapping) { XA_STATE(xas, &mapping->i_pages, page_index(head)); @@ -2710,12 +2743,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&ds_queue->split_queue_lock); if (mapping) { if (PageSwapBacked(head)) - __dec_node_page_state(head, NR_SHMEM_THPS); + __dec_lruvec_page_state(head, NR_SHMEM_THPS); else - __dec_node_page_state(head, NR_FILE_THPS); + __dec_lruvec_page_state(head, NR_FILE_THPS); } - __split_huge_page(page, list, end, flags); + __split_huge_page(page, list, end); ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { @@ -2729,7 +2762,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) spin_unlock(&ds_queue->split_queue_lock); fail: if (mapping) xa_unlock(&mapping->i_pages); - spin_unlock_irqrestore(&pgdata->lru_lock, flags); + local_irq_enable(); remap_page(head, thp_nr_pages(head)); ret = -EBUSY; } @@ -2764,7 +2797,7 @@ void deferred_split_huge_page(struct page *page) { struct deferred_split *ds_queue = get_deferred_split_queue(page); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); #endif unsigned long flags; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d029d938d26d..cbf32d2824fd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1944,13 +1944,14 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. 
*/ -static int gather_surplus_pages(struct hstate *h, int delta) +static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { struct list_head surplus_list; struct page *page, *tmp; - int ret, i; - int needed, allocated; + int ret; + long i; + long needed, allocated; bool alloc_ok = true; needed = (h->resv_huge_pages + delta) - h->free_huge_pages; @@ -2014,8 +2015,7 @@ retry: * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. */ - put_page_testzero(page); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_PAGE(!put_page_testzero(page), page); enqueue_huge_page(h, page); } free: @@ -2760,7 +2760,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, else nr_huge_pages = h->nr_huge_pages_node[nid]; - return sprintf(buf, "%lu\n", nr_huge_pages); + return sysfs_emit(buf, "%lu\n", nr_huge_pages); } static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, @@ -2833,7 +2833,8 @@ HSTATE_ATTR(nr_hugepages); * huge page alloc/free. 
*/ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, + char *buf) { return nr_hugepages_show_common(kobj, attr, buf); } @@ -2851,7 +2852,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); - return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); + return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages); } static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, @@ -2889,7 +2890,7 @@ static ssize_t free_hugepages_show(struct kobject *kobj, else free_huge_pages = h->free_huge_pages_node[nid]; - return sprintf(buf, "%lu\n", free_huge_pages); + return sysfs_emit(buf, "%lu\n", free_huge_pages); } HSTATE_ATTR_RO(free_hugepages); @@ -2897,7 +2898,7 @@ static ssize_t resv_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct hstate *h = kobj_to_hstate(kobj, NULL); - return sprintf(buf, "%lu\n", h->resv_huge_pages); + return sysfs_emit(buf, "%lu\n", h->resv_huge_pages); } HSTATE_ATTR_RO(resv_hugepages); @@ -2914,7 +2915,7 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj, else surplus_huge_pages = h->surplus_huge_pages_node[nid]; - return sprintf(buf, "%lu\n", surplus_huge_pages); + return sysfs_emit(buf, "%lu\n", surplus_huge_pages); } HSTATE_ATTR_RO(surplus_hugepages); @@ -3198,8 +3199,6 @@ void __init hugetlb_add_hstate(unsigned int order) h = &hstates[hugetlb_max_hstate++]; h->order = order; h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - h->nr_huge_pages = 0; - h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); @@ -3673,7 +3672,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, - .split = hugetlb_vm_op_split, + .may_split = 
hugetlb_vm_op_split, .pagesize = hugetlb_vm_op_pagesize, }; @@ -5115,6 +5114,7 @@ int hugetlb_reserve_pages(struct inode *inode, if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); + ret = add; goto out_put_pages; } else if (unlikely(chg > add)) { /* diff --git a/mm/init-mm.c b/mm/init-mm.c index 3a613c85f9ed..153162669f80 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -31,6 +31,7 @@ struct mm_struct init_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), diff --git a/mm/internal.h b/mm/internal.h index c43ccdddb0f6..25d2b2439f19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -199,8 +199,13 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void free_unref_page(struct page *page); +extern void free_unref_page_list(struct list_head *list); + extern void zone_pcp_update(struct zone *zone); extern void zone_pcp_reset(struct zone *zone); +extern void zone_pcp_disable(struct zone *zone); +extern void zone_pcp_enable(struct zone *zone); #if defined CONFIG_COMPACTION || defined CONFIG_CMA diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 370d970e5ab5..9fe39a66388a 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -6,12 +6,15 @@ KCOV_INSTRUMENT := n # Disable ftrace to avoid recursion. 
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report_generic.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report_hw_tags.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_report_sw_tags.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_hw_tags.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_sw_tags.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 @@ -22,13 +25,17 @@ CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME) -CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME) -CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME) -CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report_generic.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_report_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) -obj-$(CONFIG_KASAN) := common.o init.o report.o -obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o -obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o +obj-$(CONFIG_KASAN) := common.o report.o +obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o +obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o +obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o diff --git 
a/mm/kasan/common.c b/mm/kasan/common.c index 950fd372a07e..b25167664ead 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -1,24 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains common generic and tag-based KASAN code. + * This file contains common KASAN code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov <andreyknvl@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ #include <linux/export.h> #include <linux/init.h> #include <linux/kasan.h> #include <linux/kernel.h> -#include <linux/kmemleak.h> #include <linux/linkage.h> #include <linux/memblock.h> #include <linux/memory.h> @@ -31,12 +25,8 @@ #include <linux/stacktrace.h> #include <linux/string.h> #include <linux/types.h> -#include <linux/vmalloc.h> #include <linux/bug.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> - #include "kasan.h" #include "../slab.h" @@ -56,6 +46,7 @@ void kasan_set_track(struct kasan_track *track, gfp_t flags) track->stack = kasan_save_stack(flags); } +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) void kasan_enable_current(void) { current->kasan_depth++; @@ -65,106 +56,20 @@ void kasan_disable_current(void) { current->kasan_depth--; } +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ -bool __kasan_check_read(const volatile void *p, unsigned int size) -{ - return check_memory_region((unsigned long)p, size, false, _RET_IP_); -} -EXPORT_SYMBOL(__kasan_check_read); - -bool __kasan_check_write(const volatile void *p, unsigned int size) -{ - return check_memory_region((unsigned long)p, size, true, _RET_IP_); -} -EXPORT_SYMBOL(__kasan_check_write); - -#undef memset -void *memset(void *addr, int c, size_t len) -{ - if 
(!check_memory_region((unsigned long)addr, len, true, _RET_IP_)) - return NULL; - - return __memset(addr, c, len); -} - -#ifdef __HAVE_ARCH_MEMMOVE -#undef memmove -void *memmove(void *dest, const void *src, size_t len) -{ - if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || - !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) - return NULL; - - return __memmove(dest, src, len); -} -#endif - -#undef memcpy -void *memcpy(void *dest, const void *src, size_t len) +void __kasan_unpoison_range(const void *address, size_t size) { - if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || - !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) - return NULL; - - return __memcpy(dest, src, len); -} - -/* - * Poisons the shadow memory for 'size' bytes starting from 'addr'. - * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. - */ -void kasan_poison_shadow(const void *address, size_t size, u8 value) -{ - void *shadow_start, *shadow_end; - - /* - * Perform shadow offset calculation based on untagged address, as - * some of the callers (e.g. kasan_poison_object_data) pass tagged - * addresses to this function. - */ - address = reset_tag(address); - - shadow_start = kasan_mem_to_shadow(address); - shadow_end = kasan_mem_to_shadow(address + size); - - __memset(shadow_start, value, shadow_end - shadow_start); -} - -void kasan_unpoison_shadow(const void *address, size_t size) -{ - u8 tag = get_tag(address); - - /* - * Perform shadow offset calculation based on untagged address, as - * some of the callers (e.g. kasan_unpoison_object_data) pass tagged - * addresses to this function. 
- */ - address = reset_tag(address); - - kasan_poison_shadow(address, size, tag); - - if (size & KASAN_SHADOW_MASK) { - u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); - - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - *shadow = tag; - else - *shadow = size & KASAN_SHADOW_MASK; - } -} - -static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) -{ - void *base = task_stack_page(task); - size_t size = sp - base; - - kasan_unpoison_shadow(base, size); + unpoison_range(address, size); } +#if CONFIG_KASAN_STACK /* Unpoison the entire stack for a task. */ void kasan_unpoison_task_stack(struct task_struct *task) { - __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE); + void *base = task_stack_page(task); + + unpoison_range(base, THREAD_SIZE); } /* Unpoison the stack for the current task beyond a watermark sp value. */ @@ -177,10 +82,22 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - kasan_unpoison_shadow(base, watermark - base); + unpoison_range(base, watermark - base); +} +#endif /* CONFIG_KASAN_STACK */ + +/* + * Only allow cache merging when stack collection is disabled and no metadata + * is present. 
+ */ +slab_flags_t __kasan_never_merge(void) +{ + if (kasan_stack_collection_enabled()) + return SLAB_KASAN; + return 0; } -void kasan_alloc_pages(struct page *page, unsigned int order) +void __kasan_alloc_pages(struct page *page, unsigned int order) { u8 tag; unsigned long i; @@ -191,13 +108,13 @@ void kasan_alloc_pages(struct page *page, unsigned int order) tag = random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); + unpoison_range(page_address(page), PAGE_SIZE << order); } -void kasan_free_pages(struct page *page, unsigned int order) +void __kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) - kasan_poison_shadow(page_address(page), + poison_range(page_address(page), PAGE_SIZE << order, KASAN_FREE_PAGE); } @@ -208,9 +125,6 @@ void kasan_free_pages(struct page *page, unsigned int order) */ static inline unsigned int optimal_redzone(unsigned int object_size) { - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - return 0; - return object_size <= 64 - 16 ? 16 : object_size <= 128 - 32 ? 32 : @@ -221,88 +135,129 @@ static inline unsigned int optimal_redzone(unsigned int object_size) object_size <= (1 << 16) - 1024 ? 1024 : 2048; } -void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags) +void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) { - unsigned int orig_size = *size; - unsigned int redzone_size; - int redzone_adjust; + unsigned int ok_size; + unsigned int optimal_size; - /* Add alloc meta. */ - cache->kasan_info.alloc_meta_offset = *size; - *size += sizeof(struct kasan_alloc_meta); + /* + * SLAB_KASAN is used to mark caches as ones that are sanitized by + * KASAN. Currently this flag is used in two places: + * 1. In slab_ksize() when calculating the size of the accessible + * memory within the object. + * 2. 
In slab_common.c to prevent merging of sanitized caches. + */ + *flags |= SLAB_KASAN; - /* Add free meta. */ - if (IS_ENABLED(CONFIG_KASAN_GENERIC) && - (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta))) { - cache->kasan_info.free_meta_offset = *size; - *size += sizeof(struct kasan_free_meta); - } + if (!kasan_stack_collection_enabled()) + return; - redzone_size = optimal_redzone(cache->object_size); - redzone_adjust = redzone_size - (*size - cache->object_size); - if (redzone_adjust > 0) - *size += redzone_adjust; + ok_size = *size; - *size = min_t(unsigned int, KMALLOC_MAX_SIZE, - max(*size, cache->object_size + redzone_size)); + /* Add alloc meta into redzone. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); /* - * If the metadata doesn't fit, don't enable KASAN at all. + * If alloc meta doesn't fit, don't add it. + * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal + * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for + * larger sizes. */ - if (*size <= cache->kasan_info.alloc_meta_offset || - *size <= cache->kasan_info.free_meta_offset) { + if (*size > KMALLOC_MAX_SIZE) { cache->kasan_info.alloc_meta_offset = 0; - cache->kasan_info.free_meta_offset = 0; - *size = orig_size; + *size = ok_size; + /* Continue, since free meta might still fit. */ + } + + /* Only the generic mode uses free meta or flexible redzones. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; return; } - *flags |= SLAB_KASAN; + /* + * Add free meta into redzone when it's not possible to store + * it in the object. This is the case when: + * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can + * be touched after it was freed, or + * 2. Object has a constructor, which means it's expected to + * retain its content until the next allocation, or + * 3. Object is too small. 
+ * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. + */ + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + ok_size = *size; + + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + + /* If free meta doesn't fit, don't add it. */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + *size = ok_size; + } + } + + /* Calculate size with optimal redzone. */ + optimal_size = cache->object_size + optimal_redzone(cache->object_size); + /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ + if (optimal_size > KMALLOC_MAX_SIZE) + optimal_size = KMALLOC_MAX_SIZE; + /* Use optimal size if the size with added metas is not large enough. */ + if (*size < optimal_size) + *size = optimal_size; } -size_t kasan_metadata_size(struct kmem_cache *cache) +size_t __kasan_metadata_size(struct kmem_cache *cache) { + if (!kasan_stack_collection_enabled()) + return 0; return (cache->kasan_info.alloc_meta_offset ? sizeof(struct kasan_alloc_meta) : 0) + (cache->kasan_info.free_meta_offset ? 
sizeof(struct kasan_free_meta) : 0); } -struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, - const void *object) +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object) { - return (void *)object + cache->kasan_info.alloc_meta_offset; + if (!cache->kasan_info.alloc_meta_offset) + return NULL; + return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset; } -struct kasan_free_meta *get_free_info(struct kmem_cache *cache, - const void *object) +#ifdef CONFIG_KASAN_GENERIC +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object) { BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - return (void *)object + cache->kasan_info.free_meta_offset; + if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) + return NULL; + return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; } +#endif -void kasan_poison_slab(struct page *page) +void __kasan_poison_slab(struct page *page) { unsigned long i; for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); - kasan_poison_shadow(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE); + poison_range(page_address(page), page_size(page), + KASAN_KMALLOC_REDZONE); } -void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) +void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - kasan_unpoison_shadow(object, cache->object_size); + unpoison_range(object, cache->object_size); } -void kasan_poison_object_data(struct kmem_cache *cache, void *object) +void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { - kasan_poison_shadow(object, - round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), - KASAN_KMALLOC_REDZONE); + poison_range(object, cache->object_size, KASAN_KMALLOC_REDZONE); } /* @@ -322,6 +277,9 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object) static u8 assign_tag(struct kmem_cache *cache, const void *object, bool init, 
bool keep_tag) { + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + return 0xff; + /* * 1. When an object is kmalloc()'ed, two hooks are called: * kasan_slab_alloc() and kasan_kmalloc(). We assign the @@ -351,50 +309,32 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object, #endif } -void * __must_check kasan_init_slab_obj(struct kmem_cache *cache, +void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_info; + struct kasan_alloc_meta *alloc_meta; - if (!(cache->flags & SLAB_KASAN)) - return (void *)object; - - alloc_info = get_alloc_info(cache, object); - __memset(alloc_info, 0, sizeof(*alloc_info)); + if (kasan_stack_collection_enabled()) { + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); + } - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - object = set_tag(object, - assign_tag(cache, object, true, false)); + /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ + object = set_tag(object, assign_tag(cache, object, true, false)); return (void *)object; } -static inline bool shadow_invalid(u8 tag, s8 shadow_byte) -{ - if (IS_ENABLED(CONFIG_KASAN_GENERIC)) - return shadow_byte < 0 || - shadow_byte >= KASAN_SHADOW_SCALE_SIZE; - - /* else CONFIG_KASAN_SW_TAGS: */ - if ((u8)shadow_byte == KASAN_TAG_INVALID) - return true; - if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte)) - return true; - - return false; -} - -static bool __kasan_slab_free(struct kmem_cache *cache, void *object, +static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine) { - s8 shadow_byte; u8 tag; void *tagged_object; - unsigned long rounded_up_size; tag = get_tag(object); tagged_object = object; - object = reset_tag(object); + object = kasan_reset_tag(object); if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) != object)) { @@ -406,37 +346,67 @@ static bool __kasan_slab_free(struct 
kmem_cache *cache, void *object, if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; - shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); - if (shadow_invalid(tag, shadow_byte)) { + if (check_invalid_free(tagged_object)) { kasan_report_invalid_free(tagged_object, ip); return true; } - rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); + poison_range(object, cache->object_size, KASAN_KMALLOC_FREE); - if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) || - unlikely(!(cache->flags & SLAB_KASAN))) + if (!kasan_stack_collection_enabled()) + return false; + + if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine)) return false; kasan_set_free_info(cache, object, tag); - quarantine_put(get_free_info(cache, object), cache); + return quarantine_put(cache, object); +} - return IS_ENABLED(CONFIG_KASAN_GENERIC); +bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +{ + return ____kasan_slab_free(cache, object, ip, true); } -bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) +void __kasan_slab_free_mempool(void *ptr, unsigned long ip) { - return __kasan_slab_free(cache, object, ip, true); + struct page *page; + + page = virt_to_head_page(ptr); + + /* + * Even though this function is only called for kmem_cache_alloc and + * kmalloc backed mempool allocations, those allocations can still be + * !PageSlab() when the size provided to kmalloc is larger than + * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc. 
+ */ + if (unlikely(!PageSlab(page))) { + if (ptr != page_address(page)) { + kasan_report_invalid_free(ptr, ip); + return; + } + poison_range(ptr, page_size(page), KASAN_FREE_PAGE); + } else { + ____kasan_slab_free(page->slab_cache, ptr, ip, false); + } +} + +static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); } -static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, +static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, gfp_t flags, bool keep_tag) { unsigned long redzone_start; unsigned long redzone_end; - u8 tag = 0xff; + u8 tag; if (gfpflags_allow_blocking(flags)) quarantine_reduce(); @@ -445,38 +415,36 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, return NULL; redzone_start = round_up((unsigned long)(object + size), - KASAN_SHADOW_SCALE_SIZE); + KASAN_GRANULE_SIZE); redzone_end = round_up((unsigned long)object + cache->object_size, - KASAN_SHADOW_SCALE_SIZE); - - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - tag = assign_tag(cache, object, false, keep_tag); + KASAN_GRANULE_SIZE); + tag = assign_tag(cache, object, false, keep_tag); - /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ - kasan_unpoison_shadow(set_tag(object, tag), size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ + unpoison_range(set_tag(object, tag), size); + poison_range((void *)redzone_start, redzone_end - redzone_start, + KASAN_KMALLOC_REDZONE); - if (cache->flags & SLAB_KASAN) - kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags); + if (kasan_stack_collection_enabled()) + set_alloc_info(cache, (void *)object, flags); return set_tag(object, tag); } -void * __must_check 
kasan_slab_alloc(struct kmem_cache *cache, void *object, - gfp_t flags) +void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, + void *object, gfp_t flags) { - return __kasan_kmalloc(cache, object, cache->object_size, flags, false); + return ____kasan_kmalloc(cache, object, cache->object_size, flags, false); } -void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object, - size_t size, gfp_t flags) +void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object, + size_t size, gfp_t flags) { - return __kasan_kmalloc(cache, object, size, flags, true); + return ____kasan_kmalloc(cache, object, size, flags, true); } -EXPORT_SYMBOL(kasan_kmalloc); +EXPORT_SYMBOL(__kasan_kmalloc); -void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, +void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) { struct page *page; @@ -491,17 +459,17 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, page = virt_to_page(ptr); redzone_start = round_up((unsigned long)(ptr + size), - KASAN_SHADOW_SCALE_SIZE); + KASAN_GRANULE_SIZE); redzone_end = (unsigned long)ptr + page_size(page); - kasan_unpoison_shadow(ptr, size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_PAGE_REDZONE); + unpoison_range(ptr, size); + poison_range((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE); return (void *)ptr; } -void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) +void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags) { struct page *page; @@ -511,421 +479,15 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags) page = virt_to_head_page(object); if (unlikely(!PageSlab(page))) - return kasan_kmalloc_large(object, size, flags); + return __kasan_kmalloc_large(object, size, flags); else - return __kasan_kmalloc(page->slab_cache, object, size, + return 
____kasan_kmalloc(page->slab_cache, object, size, flags, true); } -void kasan_poison_kfree(void *ptr, unsigned long ip) -{ - struct page *page; - - page = virt_to_head_page(ptr); - - if (unlikely(!PageSlab(page))) { - if (ptr != page_address(page)) { - kasan_report_invalid_free(ptr, ip); - return; - } - kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); - } else { - __kasan_slab_free(page->slab_cache, ptr, ip, false); - } -} - -void kasan_kfree_large(void *ptr, unsigned long ip) +void __kasan_kfree_large(void *ptr, unsigned long ip) { if (ptr != page_address(virt_to_head_page(ptr))) kasan_report_invalid_free(ptr, ip); - /* The object will be poisoned by page_alloc. */ -} - -#ifndef CONFIG_KASAN_VMALLOC -int kasan_module_alloc(void *addr, size_t size) -{ - void *ret; - size_t scaled_size; - size_t shadow_size; - unsigned long shadow_start; - - shadow_start = (unsigned long)kasan_mem_to_shadow(addr); - scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT; - shadow_size = round_up(scaled_size, PAGE_SIZE); - - if (WARN_ON(!PAGE_ALIGNED(shadow_start))) - return -EINVAL; - - ret = __vmalloc_node_range(shadow_size, 1, shadow_start, - shadow_start + shadow_size, - GFP_KERNEL, - PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, - __builtin_return_address(0)); - - if (ret) { - __memset(ret, KASAN_SHADOW_INIT, shadow_size); - find_vm_area(addr)->flags |= VM_KASAN; - kmemleak_ignore(ret); - return 0; - } - - return -ENOMEM; -} - -void kasan_free_shadow(const struct vm_struct *vm) -{ - if (vm->flags & VM_KASAN) - vfree(kasan_mem_to_shadow(vm->addr)); -} -#endif - -#ifdef CONFIG_MEMORY_HOTPLUG -static bool shadow_mapped(unsigned long addr) -{ - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - if (pgd_none(*pgd)) - return false; - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) - return false; - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) - return false; - - /* - * We can't use pud_large() or pud_huge(), the 
first one is - * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse - * pud_bad(), if pud is bad then it's bad because it's huge. - */ - if (pud_bad(*pud)) - return true; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return false; - - if (pmd_bad(*pmd)) - return true; - pte = pte_offset_kernel(pmd, addr); - return !pte_none(*pte); -} - -static int __meminit kasan_mem_notifier(struct notifier_block *nb, - unsigned long action, void *data) -{ - struct memory_notify *mem_data = data; - unsigned long nr_shadow_pages, start_kaddr, shadow_start; - unsigned long shadow_end, shadow_size; - - nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT; - start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn); - shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr); - shadow_size = nr_shadow_pages << PAGE_SHIFT; - shadow_end = shadow_start + shadow_size; - - if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) || - WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT))) - return NOTIFY_BAD; - - switch (action) { - case MEM_GOING_ONLINE: { - void *ret; - - /* - * If shadow is mapped already than it must have been mapped - * during the boot. This could happen if we onlining previously - * offlined memory. - */ - if (shadow_mapped(shadow_start)) - return NOTIFY_OK; - - ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, - shadow_end, GFP_KERNEL, - PAGE_KERNEL, VM_NO_GUARD, - pfn_to_nid(mem_data->start_pfn), - __builtin_return_address(0)); - if (!ret) - return NOTIFY_BAD; - - kmemleak_ignore(ret); - return NOTIFY_OK; - } - case MEM_CANCEL_ONLINE: - case MEM_OFFLINE: { - struct vm_struct *vm; - - /* - * shadow_start was either mapped during boot by kasan_init() - * or during memory online by __vmalloc_node_range(). - * In the latter case we can use vfree() to free shadow. - * Non-NULL result of the find_vm_area() will tell us if - * that was the second case. 
- * - * Currently it's not possible to free shadow mapped - * during boot by kasan_init(). It's because the code - * to do that hasn't been written yet. So we'll just - * leak the memory. - */ - vm = find_vm_area((void *)shadow_start); - if (vm) - vfree((void *)shadow_start); - } - } - - return NOTIFY_OK; -} - -static int __init kasan_memhotplug_init(void) -{ - hotplug_memory_notifier(kasan_mem_notifier, 0); - - return 0; -} - -core_initcall(kasan_memhotplug_init); -#endif - -#ifdef CONFIG_KASAN_VMALLOC -static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, - void *unused) -{ - unsigned long page; - pte_t pte; - - if (likely(!pte_none(*ptep))) - return 0; - - page = __get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - - memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); - pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); - - spin_lock(&init_mm.page_table_lock); - if (likely(pte_none(*ptep))) { - set_pte_at(&init_mm, addr, ptep, pte); - page = 0; - } - spin_unlock(&init_mm.page_table_lock); - if (page) - free_page(page); - return 0; -} - -int kasan_populate_vmalloc(unsigned long addr, unsigned long size) -{ - unsigned long shadow_start, shadow_end; - int ret; - - if (!is_vmalloc_or_module_addr((void *)addr)) - return 0; - - shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr); - shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); - shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size); - shadow_end = ALIGN(shadow_end, PAGE_SIZE); - - ret = apply_to_page_range(&init_mm, shadow_start, - shadow_end - shadow_start, - kasan_populate_vmalloc_pte, NULL); - if (ret) - return ret; - - flush_cache_vmap(shadow_start, shadow_end); - - /* - * We need to be careful about inter-cpu effects here. 
Consider: - * - * CPU#0 CPU#1 - * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ; - * p[99] = 1; - * - * With compiler instrumentation, that ends up looking like this: - * - * CPU#0 CPU#1 - * // vmalloc() allocates memory - * // let a = area->addr - * // we reach kasan_populate_vmalloc - * // and call kasan_unpoison_shadow: - * STORE shadow(a), unpoison_val - * ... - * STORE shadow(a+99), unpoison_val x = LOAD p - * // rest of vmalloc process <data dependency> - * STORE p, a LOAD shadow(x+99) - * - * If there is no barrier between the end of unpoisioning the shadow - * and the store of the result to p, the stores could be committed - * in a different order by CPU#0, and CPU#1 could erroneously observe - * poison in the shadow. - * - * We need some sort of barrier between the stores. - * - * In the vmalloc() case, this is provided by a smp_wmb() in - * clear_vm_uninitialized_flag(). In the per-cpu allocator and in - * get_vm_area() and friends, the caller gets shadow allocated but - * doesn't have any pages mapped into the virtual address space that - * has been reserved. Mapping those pages in will involve taking and - * releasing a page-table lock, which will provide the barrier. - */ - - return 0; -} - -/* - * Poison the shadow for a vmalloc region. Called as part of the - * freeing process at the time the region is freed. - */ -void kasan_poison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - size = round_up(size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID); -} - -void kasan_unpoison_vmalloc(const void *start, unsigned long size) -{ - if (!is_vmalloc_or_module_addr(start)) - return; - - kasan_unpoison_shadow(start, size); + /* The object will be poisoned by kasan_free_pages(). 
*/ } - -static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, - void *unused) -{ - unsigned long page; - - page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); - - spin_lock(&init_mm.page_table_lock); - - if (likely(!pte_none(*ptep))) { - pte_clear(&init_mm, addr, ptep); - free_page(page); - } - spin_unlock(&init_mm.page_table_lock); - - return 0; -} - -/* - * Release the backing for the vmalloc region [start, end), which - * lies within the free region [free_region_start, free_region_end). - * - * This can be run lazily, long after the region was freed. It runs - * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap - * infrastructure. - * - * How does this work? - * ------------------- - * - * We have a region that is page aligned, labelled as A. - * That might not map onto the shadow in a way that is page-aligned: - * - * start end - * v v - * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc - * -------- -------- -------- -------- -------- - * | | | | | - * | | | /-------/ | - * \-------\|/------/ |/---------------/ - * ||| || - * |??AAAAAA|AAAAAAAA|AA??????| < shadow - * (1) (2) (3) - * - * First we align the start upwards and the end downwards, so that the - * shadow of the region aligns with shadow page boundaries. In the - * example, this gives us the shadow page (2). This is the shadow entirely - * covered by this allocation. - * - * Then we have the tricky bits. We want to know if we can free the - * partially covered shadow pages - (1) and (3) in the example. For this, - * we are given the start and end of the free region that contains this - * allocation. 
Extending our previous example, we could have: - * - * free_region_start free_region_end - * | start end | - * v v v v - * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc - * -------- -------- -------- -------- -------- - * | | | | | - * | | | /-------/ | - * \-------\|/------/ |/---------------/ - * ||| || - * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow - * (1) (2) (3) - * - * Once again, we align the start of the free region up, and the end of - * the free region down so that the shadow is page aligned. So we can free - * page (1) - we know no allocation currently uses anything in that page, - * because all of it is in the vmalloc free region. But we cannot free - * page (3), because we can't be sure that the rest of it is unused. - * - * We only consider pages that contain part of the original region for - * freeing: we don't try to free other pages from the free region or we'd - * end up trying to free huge chunks of virtual address space. - * - * Concurrency - * ----------- - * - * How do we know that we're not freeing a page that is simultaneously - * being used for a fresh allocation in kasan_populate_vmalloc(_pte)? - * - * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running - * at the same time. While we run under free_vmap_area_lock, the population - * code does not. - * - * free_vmap_area_lock instead operates to ensure that the larger range - * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and - * the per-cpu region-finding algorithm both run under free_vmap_area_lock, - * no space identified as free will become used while we are running. This - * means that so long as we are careful with alignment and only free shadow - * pages entirely covered by the free region, we will not run in to any - * trouble - any simultaneous allocations will be for disjoint regions. 
- */ -void kasan_release_vmalloc(unsigned long start, unsigned long end, - unsigned long free_region_start, - unsigned long free_region_end) -{ - void *shadow_start, *shadow_end; - unsigned long region_start, region_end; - unsigned long size; - - region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); - region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); - - free_region_start = ALIGN(free_region_start, - PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); - - if (start != region_start && - free_region_start < region_start) - region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE; - - free_region_end = ALIGN_DOWN(free_region_end, - PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); - - if (end != region_end && - free_region_end > region_end) - region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE; - - shadow_start = kasan_mem_to_shadow((void *)region_start); - shadow_end = kasan_mem_to_shadow((void *)region_end); - - if (shadow_end > shadow_start) { - size = shadow_end - shadow_start; - apply_to_existing_page_range(&init_mm, - (unsigned long)shadow_start, - size, kasan_depopulate_vmalloc_pte, - NULL); - flush_tlb_kernel_range((unsigned long)shadow_start, - (unsigned long)shadow_end); - } -} -#endif diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 248264b9cb76..1dd5a0f99372 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -7,15 +7,8 @@ * * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov <andreyknvl@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - #include <linux/export.h> #include <linux/interrupt.h> #include <linux/init.h> @@ -51,7 +44,7 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr) s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr); if (unlikely(shadow_value)) { - s8 last_accessible_byte = addr & KASAN_SHADOW_MASK; + s8 last_accessible_byte = addr & KASAN_GRANULE_MASK; return unlikely(last_accessible_byte >= shadow_value); } @@ -67,7 +60,7 @@ static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, * Access crosses 8(shadow size)-byte boundary. Such access maps * into 2 shadow bytes, so we need to check them both. */ - if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) + if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1)) return *shadow_addr || memory_is_poisoned_1(addr + size - 1); return memory_is_poisoned_1(addr + size - 1); @@ -78,7 +71,7 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr) u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); /* Unaligned 16-bytes access maps into 3 shadow bytes. 
*/ - if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) + if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE))) return *shadow_addr || memory_is_poisoned_1(addr + 15); return *shadow_addr; @@ -139,7 +132,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte); if (unlikely(ret != (unsigned long)last_shadow || - ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow))) + ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow))) return true; } return false; @@ -192,6 +185,13 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, return check_memory_region_inline(addr, size, write, ret_ip); } +bool check_invalid_free(void *addr) +{ + s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); + + return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE; +} + void kasan_cache_shrink(struct kmem_cache *cache) { quarantine_remove_cache(cache); @@ -205,13 +205,13 @@ void kasan_cache_shutdown(struct kmem_cache *cache) static void register_global(struct kasan_global *global) { - size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); + size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE); - kasan_unpoison_shadow(global->beg, global->size); + unpoison_range(global->beg, global->size); - kasan_poison_shadow(global->beg + aligned_size, - global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); + poison_range(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); } void __asan_register_globals(struct kasan_global *globals, size_t size) @@ -279,10 +279,10 @@ EXPORT_SYMBOL(__asan_handle_no_return); /* Emitted by compiler to poison alloca()ed objects. 
*/ void __asan_alloca_poison(unsigned long addr, size_t size) { - size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); + size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE); size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) - rounded_up_size; - size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE); + size_t rounded_down_size = round_down(size, KASAN_GRANULE_SIZE); const void *left_redzone = (const void *)(addr - KASAN_ALLOCA_REDZONE_SIZE); @@ -290,13 +290,12 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); - kasan_unpoison_shadow((const void *)(addr + rounded_down_size), - size - rounded_down_size); - kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); - kasan_poison_shadow(right_redzone, - padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); + unpoison_range((const void *)(addr + rounded_down_size), + size - rounded_down_size); + poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT); + poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -306,7 +305,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); + unpoison_range(stack_top, stack_bottom - stack_top); } EXPORT_SYMBOL(__asan_allocas_unpoison); @@ -329,7 +328,7 @@ void kasan_record_aux_stack(void *addr) { struct page *page = kasan_addr_to_page(addr); struct kmem_cache *cache; - struct kasan_alloc_meta *alloc_info; + struct kasan_alloc_meta *alloc_meta; void *object; if (!(page && PageSlab(page))) @@ -337,13 +336,10 @@ void kasan_record_aux_stack(void *addr) cache = page->slab_cache; object = nearest_obj(cache, page, addr); - alloc_info = get_alloc_info(cache, object); + alloc_meta = 
kasan_get_alloc_meta(cache, object); - /* - * record the last two call_rcu() call stacks. - */ - alloc_info->aux_stack[1] = alloc_info->aux_stack[0]; - alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); + alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0]; + alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT); } void kasan_set_free_info(struct kmem_cache *cache, @@ -351,12 +347,12 @@ void kasan_set_free_info(struct kmem_cache *cache, { struct kasan_free_meta *free_meta; - free_meta = get_free_info(cache, object); - kasan_set_track(&free_meta->free_track, GFP_NOWAIT); + free_meta = kasan_get_free_meta(cache, object); + if (!free_meta) + return; - /* - * the object was freed and has free track set - */ + kasan_set_track(&free_meta->free_track, GFP_NOWAIT); + /* The object was freed and has free track set. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK; } @@ -365,5 +361,6 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, { if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK) return NULL; - return &get_free_info(cache, object)->free_track; + /* Free meta must be present with KASAN_KMALLOC_FREETRACK. */ + return &kasan_get_free_meta(cache, object)->free_track; } diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c deleted file mode 100644 index a38c7a9e192a..000000000000 --- a/mm/kasan/generic_report.c +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This file contains generic KASAN specific error reporting code. - * - * Copyright (c) 2014 Samsung Electronics Co., Ltd. - * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> - * - * Some code borrowed from https://github.com/xairy/kasan-prototype by - * Andrey Konovalov <andreyknvl@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * - */ - -#include <linux/bitops.h> -#include <linux/ftrace.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/printk.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/stackdepot.h> -#include <linux/stacktrace.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/kasan.h> -#include <linux/module.h> - -#include <asm/sections.h> - -#include "kasan.h" -#include "../slab.h" - -void *find_first_bad_addr(void *addr, size_t size) -{ - void *p = addr; - - while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) - p += KASAN_SHADOW_SCALE_SIZE; - return p; -} - -static const char *get_shadow_bug_type(struct kasan_access_info *info) -{ - const char *bug_type = "unknown-crash"; - u8 *shadow_addr; - - shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); - - /* - * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look - * at the next shadow byte to determine the type of the bad access. - */ - if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1) - shadow_addr++; - - switch (*shadow_addr) { - case 0 ... KASAN_SHADOW_SCALE_SIZE - 1: - /* - * In theory it's still possible to see these shadow values - * due to a data race in the kernel code. 
- */ - bug_type = "out-of-bounds"; - break; - case KASAN_PAGE_REDZONE: - case KASAN_KMALLOC_REDZONE: - bug_type = "slab-out-of-bounds"; - break; - case KASAN_GLOBAL_REDZONE: - bug_type = "global-out-of-bounds"; - break; - case KASAN_STACK_LEFT: - case KASAN_STACK_MID: - case KASAN_STACK_RIGHT: - case KASAN_STACK_PARTIAL: - bug_type = "stack-out-of-bounds"; - break; - case KASAN_FREE_PAGE: - case KASAN_KMALLOC_FREE: - case KASAN_KMALLOC_FREETRACK: - bug_type = "use-after-free"; - break; - case KASAN_ALLOCA_LEFT: - case KASAN_ALLOCA_RIGHT: - bug_type = "alloca-out-of-bounds"; - break; - case KASAN_VMALLOC_INVALID: - bug_type = "vmalloc-out-of-bounds"; - break; - } - - return bug_type; -} - -static const char *get_wild_bug_type(struct kasan_access_info *info) -{ - const char *bug_type = "unknown-crash"; - - if ((unsigned long)info->access_addr < PAGE_SIZE) - bug_type = "null-ptr-deref"; - else if ((unsigned long)info->access_addr < TASK_SIZE) - bug_type = "user-memory-access"; - else - bug_type = "wild-memory-access"; - - return bug_type; -} - -const char *get_bug_type(struct kasan_access_info *info) -{ - /* - * If access_size is a negative number, then it has reason to be - * defined as out-of-bounds bug type. - * - * Casting negative numbers to size_t would indeed turn up as - * a large size_t and its value will be larger than ULONG_MAX/2, - * so that this can qualify as out-of-bounds. 
- */ - if (info->access_addr + info->access_size < info->access_addr) - return "out-of-bounds"; - - if (addr_has_shadow(info->access_addr)) - return get_shadow_bug_type(info); - return get_wild_bug_type(info); -} - -#define DEFINE_ASAN_REPORT_LOAD(size) \ -void __asan_report_load##size##_noabort(unsigned long addr) \ -{ \ - kasan_report(addr, size, false, _RET_IP_); \ -} \ -EXPORT_SYMBOL(__asan_report_load##size##_noabort) - -#define DEFINE_ASAN_REPORT_STORE(size) \ -void __asan_report_store##size##_noabort(unsigned long addr) \ -{ \ - kasan_report(addr, size, true, _RET_IP_); \ -} \ -EXPORT_SYMBOL(__asan_report_store##size##_noabort) - -DEFINE_ASAN_REPORT_LOAD(1); -DEFINE_ASAN_REPORT_LOAD(2); -DEFINE_ASAN_REPORT_LOAD(4); -DEFINE_ASAN_REPORT_LOAD(8); -DEFINE_ASAN_REPORT_LOAD(16); -DEFINE_ASAN_REPORT_STORE(1); -DEFINE_ASAN_REPORT_STORE(2); -DEFINE_ASAN_REPORT_STORE(4); -DEFINE_ASAN_REPORT_STORE(8); -DEFINE_ASAN_REPORT_STORE(16); - -void __asan_report_load_n_noabort(unsigned long addr, size_t size) -{ - kasan_report(addr, size, false, _RET_IP_); -} -EXPORT_SYMBOL(__asan_report_load_n_noabort); - -void __asan_report_store_n_noabort(unsigned long addr, size_t size) -{ - kasan_report(addr, size, true, _RET_IP_); -} -EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c new file mode 100644 index 000000000000..55bd6f09c70f --- /dev/null +++ b/mm/kasan/hw_tags.c @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains core hardware tag-based KASAN code. + * + * Copyright (c) 2020 Google, Inc. 
+ * Author: Andrey Konovalov <andreyknvl@google.com> + */ + +#define pr_fmt(fmt) "kasan: " fmt + +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/static_key.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "kasan.h" + +enum kasan_arg_mode { + KASAN_ARG_MODE_DEFAULT, + KASAN_ARG_MODE_OFF, + KASAN_ARG_MODE_PROD, + KASAN_ARG_MODE_FULL, +}; + +enum kasan_arg_stacktrace { + KASAN_ARG_STACKTRACE_DEFAULT, + KASAN_ARG_STACKTRACE_OFF, + KASAN_ARG_STACKTRACE_ON, +}; + +enum kasan_arg_fault { + KASAN_ARG_FAULT_DEFAULT, + KASAN_ARG_FAULT_REPORT, + KASAN_ARG_FAULT_PANIC, +}; + +static enum kasan_arg_mode kasan_arg_mode __ro_after_init; +static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init; +static enum kasan_arg_fault kasan_arg_fault __ro_after_init; + +/* Whether KASAN is enabled at all. */ +DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled); +EXPORT_SYMBOL(kasan_flag_enabled); + +/* Whether to collect alloc/free stack traces. */ +DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace); + +/* Whether panic or disable tag checking on fault. 
*/ +bool kasan_flag_panic __ro_after_init; + +/* kasan.mode=off/prod/full */ +static int __init early_kasan_mode(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_mode = KASAN_ARG_MODE_OFF; + else if (!strcmp(arg, "prod")) + kasan_arg_mode = KASAN_ARG_MODE_PROD; + else if (!strcmp(arg, "full")) + kasan_arg_mode = KASAN_ARG_MODE_FULL; + else + return -EINVAL; + + return 0; +} +early_param("kasan.mode", early_kasan_mode); + +/* kasan.stacktrace=off/on */ +static int __init early_kasan_flag_stacktrace(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.stacktrace", early_kasan_flag_stacktrace); + +/* kasan.fault=report/panic */ +static int __init early_kasan_fault(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "report")) + kasan_arg_fault = KASAN_ARG_FAULT_REPORT; + else if (!strcmp(arg, "panic")) + kasan_arg_fault = KASAN_ARG_FAULT_PANIC; + else + return -EINVAL; + + return 0; +} +early_param("kasan.fault", early_kasan_fault); + +/* kasan_init_hw_tags_cpu() is called for each CPU. */ +void kasan_init_hw_tags_cpu(void) +{ + /* + * There's no need to check that the hardware is MTE-capable here, + * as this function is only called for MTE-capable hardware. + */ + + /* If KASAN is disabled, do nothing. */ + if (kasan_arg_mode == KASAN_ARG_MODE_OFF) + return; + + hw_init_tags(KASAN_TAG_MAX); + hw_enable_tagging(); +} + +/* kasan_init_hw_tags() is called once on boot CPU. */ +void __init kasan_init_hw_tags(void) +{ + /* If hardware doesn't support MTE, do nothing. */ + if (!system_supports_mte()) + return; + + /* Choose KASAN mode if kasan boot parameter is not provided. 
*/ + if (kasan_arg_mode == KASAN_ARG_MODE_DEFAULT) { + if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) + kasan_arg_mode = KASAN_ARG_MODE_FULL; + else + kasan_arg_mode = KASAN_ARG_MODE_PROD; + } + + /* Preset parameter values based on the mode. */ + switch (kasan_arg_mode) { + case KASAN_ARG_MODE_DEFAULT: + /* Shouldn't happen as per the check above. */ + WARN_ON(1); + return; + case KASAN_ARG_MODE_OFF: + /* If KASAN is disabled, do nothing. */ + return; + case KASAN_ARG_MODE_PROD: + static_branch_enable(&kasan_flag_enabled); + break; + case KASAN_ARG_MODE_FULL: + static_branch_enable(&kasan_flag_enabled); + static_branch_enable(&kasan_flag_stacktrace); + break; + } + + /* Now, optionally override the presets. */ + + switch (kasan_arg_stacktrace) { + case KASAN_ARG_STACKTRACE_DEFAULT: + break; + case KASAN_ARG_STACKTRACE_OFF: + static_branch_disable(&kasan_flag_stacktrace); + break; + case KASAN_ARG_STACKTRACE_ON: + static_branch_enable(&kasan_flag_stacktrace); + break; + } + + switch (kasan_arg_fault) { + case KASAN_ARG_FAULT_DEFAULT: + break; + case KASAN_ARG_FAULT_REPORT: + kasan_flag_panic = false; + break; + case KASAN_ARG_FAULT_PANIC: + kasan_flag_panic = true; + break; + } + + pr_info("KernelAddressSanitizer initialized\n"); +} + +void kasan_set_free_info(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->free_track[0], GFP_NOWAIT); +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->free_track[0]; +} diff --git a/mm/kasan/init.c b/mm/kasan/init.c index fe6be0be1f76..bc0ad208b3a7 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -1,14 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains some kasan initialization 
code. + * This file contains KASAN shadow initialization code. * * Copyright (c) 2015 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ #include <linux/memblock.h> @@ -446,9 +441,8 @@ void kasan_remove_zero_shadow(void *start, unsigned long size) addr = (unsigned long)kasan_mem_to_shadow(start); end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT); - if (WARN_ON((unsigned long)start % - (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || - WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) || + WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE)) return; for (; addr < end; addr = next) { @@ -481,9 +475,8 @@ int kasan_add_zero_shadow(void *start, unsigned long size) shadow_start = kasan_mem_to_shadow(start); shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT); - if (WARN_ON((unsigned long)start % - (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) || - WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE))) + if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) || + WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE)) return -EINVAL; ret = kasan_populate_early_shadow(shadow_start, shadow_end); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ac499456740f..cc4d9e1d49b1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -5,8 +5,32 @@ #include <linux/kasan.h> #include <linux/stackdepot.h> -#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) -#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1) +#ifdef CONFIG_KASAN_HW_TAGS +#include <linux/static_key.h> +DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace); +static inline bool kasan_stack_collection_enabled(void) +{ + return static_branch_unlikely(&kasan_flag_stacktrace); +} +#else +static inline bool 
kasan_stack_collection_enabled(void) +{ + return true; +} +#endif + +extern bool kasan_flag_panic __ro_after_init; + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) +#else +#include <asm/mte-kasan.h> +#define KASAN_GRANULE_SIZE MTE_GRANULE_SIZE +#endif + +#define KASAN_GRANULE_MASK (KASAN_GRANULE_SIZE - 1) + +#define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT) #define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */ #define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ @@ -56,6 +80,13 @@ #define KASAN_ABI_VERSION 1 #endif +/* Metadata layout customization. */ +#define META_BYTES_PER_BLOCK 1 +#define META_BLOCKS_PER_ROW 16 +#define META_BYTES_PER_ROW (META_BLOCKS_PER_ROW * META_BYTES_PER_BLOCK) +#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) +#define META_ROWS_AROUND_ADDR 2 + struct kasan_access_info { const void *access_addr; const void *first_bad_addr; @@ -124,20 +155,33 @@ struct kasan_alloc_meta { struct qlist_node { struct qlist_node *next; }; + +/* + * Generic mode either stores free meta in the object itself or in the redzone + * after the object. In the former case free meta offset is 0, in the latter + * case it has some sane value smaller than INT_MAX. Use INT_MAX as free meta + * offset when free meta isn't present. + */ +#define KASAN_NO_FREE_META INT_MAX + struct kasan_free_meta { +#ifdef CONFIG_KASAN_GENERIC /* This field is used while the object is in the quarantine. * Otherwise it might be used for the allocator freelist. 
*/ struct qlist_node quarantine_link; -#ifdef CONFIG_KASAN_GENERIC struct kasan_track free_track; #endif }; -struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, - const void *object); -struct kasan_free_meta *get_free_info(struct kmem_cache *cache, - const void *object); +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object); +#ifdef CONFIG_KASAN_GENERIC +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object); +#endif + +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { @@ -145,13 +189,11 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) << KASAN_SHADOW_SCALE_SHIFT); } -static inline bool addr_has_shadow(const void *addr) +static inline bool addr_has_metadata(const void *addr) { return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } -void kasan_poison_shadow(const void *address, size_t size, u8 value); - /** * check_memory_region - Check memory region, and report if invalid access. 
* @addr: the accessed address @@ -163,8 +205,30 @@ void kasan_poison_shadow(const void *address, size_t size, u8 value); bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip); +#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +static inline bool addr_has_metadata(const void *addr) +{ + return true; +} + +#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ + +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +void print_tags(u8 addr_tag, const void *addr); +#else +static inline void print_tags(u8 addr_tag, const void *addr) { } +#endif + void *find_first_bad_addr(void *addr, size_t size); const char *get_bug_type(struct kasan_access_info *info); +void metadata_fetch_row(char *buffer, void *row); + +#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK +void print_address_stack_frame(const void *addr); +#else +static inline void print_address_stack_frame(const void *addr) { } +#endif bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); @@ -180,49 +244,92 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) -void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); +bool quarantine_put(struct kmem_cache *cache, void *object); void quarantine_reduce(void); void quarantine_remove_cache(struct kmem_cache *cache); #else -static inline void quarantine_put(struct kasan_free_meta *info, - struct kmem_cache *cache) { } +static inline bool quarantine_put(struct kmem_cache *cache, void *object) { return false; } static inline void quarantine_reduce(void) { } static inline void quarantine_remove_cache(struct kmem_cache *cache) { } #endif -#ifdef CONFIG_KASAN_SW_TAGS +#ifndef arch_kasan_set_tag +static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) +{ + return addr; +} +#endif +#ifndef arch_kasan_get_tag +#define arch_kasan_get_tag(addr) 0 
+#endif -void print_tags(u8 addr_tag, const void *addr); +#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag))) +#define get_tag(addr) arch_kasan_get_tag(addr) -u8 random_tag(void); +#ifdef CONFIG_KASAN_HW_TAGS + +#ifndef arch_enable_tagging +#define arch_enable_tagging() +#endif +#ifndef arch_init_tags +#define arch_init_tags(max_tag) +#endif +#ifndef arch_get_random_tag +#define arch_get_random_tag() (0xFF) +#endif +#ifndef arch_get_mem_tag +#define arch_get_mem_tag(addr) (0xFF) +#endif +#ifndef arch_set_mem_tag_range +#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr)) +#endif + +#define hw_enable_tagging() arch_enable_tagging() +#define hw_init_tags(max_tag) arch_init_tags(max_tag) +#define hw_get_random_tag() arch_get_random_tag() +#define hw_get_mem_tag(addr) arch_get_mem_tag(addr) +#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag)) +#endif /* CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_SW_TAGS +u8 random_tag(void); +#elif defined(CONFIG_KASAN_HW_TAGS) +static inline u8 random_tag(void) { return hw_get_random_tag(); } #else +static inline u8 random_tag(void) { return 0; } +#endif -static inline void print_tags(u8 addr_tag, const void *addr) { } +#ifdef CONFIG_KASAN_HW_TAGS -static inline u8 random_tag(void) +static inline void poison_range(const void *address, size_t size, u8 value) { - return 0; + hw_set_mem_tag_range(kasan_reset_tag(address), + round_up(size, KASAN_GRANULE_SIZE), value); } -#endif +static inline void unpoison_range(const void *address, size_t size) +{ + hw_set_mem_tag_range(kasan_reset_tag(address), + round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); +} -#ifndef arch_kasan_set_tag -static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) +static inline bool check_invalid_free(void *addr) { - return addr; + u8 ptr_tag = get_tag(addr); + u8 mem_tag = hw_get_mem_tag(addr); + + return (mem_tag == KASAN_TAG_INVALID) || + (ptr_tag != KASAN_TAG_KERNEL && 
ptr_tag != mem_tag); } -#endif -#ifndef arch_kasan_reset_tag -#define arch_kasan_reset_tag(addr) ((void *)(addr)) -#endif -#ifndef arch_kasan_get_tag -#define arch_kasan_get_tag(addr) 0 -#endif -#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag))) -#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr)) -#define get_tag(addr) arch_kasan_get_tag(addr) +#else /* CONFIG_KASAN_HW_TAGS */ + +void poison_range(const void *address, size_t size, u8 value); +void unpoison_range(const void *address, size_t size); +bool check_invalid_free(void *addr); + +#endif /* CONFIG_KASAN_HW_TAGS */ /* * Exported functions for interfaces called from assembly or from generated diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 0e3f8494628f..55783125a767 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -6,16 +6,6 @@ * Copyright (C) 2016 Google, Inc. * * Based on code by Dmitry Chernenkov. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * */ #include <linux/gfp.h> @@ -147,7 +137,12 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) if (IS_ENABLED(CONFIG_SLAB)) local_irq_save(flags); + /* + * As the object now gets freed from the quarantine, assume that its + * free track is no longer valid. 
+ */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE; + ___cache_free(cache, object, _THIS_IP_); if (IS_ENABLED(CONFIG_SLAB)) @@ -173,11 +168,19 @@ static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) qlist_init(q); } -void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) +bool quarantine_put(struct kmem_cache *cache, void *object) { unsigned long flags; struct qlist_head *q; struct qlist_head temp = QLIST_INIT; + struct kasan_free_meta *meta = kasan_get_free_meta(cache, object); + + /* + * If there's no metadata for this object, don't put it into + * quarantine. + */ + if (!meta) + return false; /* * Note: irq must be disabled until after we move the batch to the @@ -192,9 +195,9 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) q = this_cpu_ptr(&cpu_quarantine); if (q->offline) { local_irq_restore(flags); - return; + return false; } - qlist_put(q, &info->quarantine_link, cache->size); + qlist_put(q, &meta->quarantine_link, cache->size); if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) { qlist_move_all(q, &temp); @@ -215,6 +218,8 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) } local_irq_restore(flags); + + return true; } void quarantine_reduce(void) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 00a53f1355ae..c0fb21797550 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -1,17 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains common generic and tag-based KASAN error reporting code. + * This file contains common KASAN error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov <andreyknvl@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ #include <linux/bitops.h> @@ -38,12 +33,6 @@ #include "kasan.h" #include "../slab.h" -/* Shadow layout customization. */ -#define SHADOW_BYTES_PER_BLOCK 1 -#define SHADOW_BLOCKS_PER_ROW 16 -#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK) -#define SHADOW_ROWS_AROUND_ADDR 2 - static unsigned long kasan_flags; #define KASAN_BIT_REPORTED 0 @@ -73,9 +62,14 @@ static void print_error_description(struct kasan_access_info *info) { pr_err("BUG: KASAN: %s in %pS\n", get_bug_type(info), (void *)info->ip); - pr_err("%s of size %zu at addr %px by task %s/%d\n", - info->is_write ? "Write" : "Read", info->access_size, - info->access_addr, current->comm, task_pid_nr(current)); + if (info->access_size) + pr_err("%s of size %zu at addr %px by task %s/%d\n", + info->is_write ? "Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); + else + pr_err("%s at addr %px by task %s/%d\n", + info->is_write ? 
"Write" : "Read", + info->access_addr, current->comm, task_pid_nr(current)); } static DEFINE_SPINLOCK(report_lock); @@ -105,6 +99,10 @@ static void end_report(unsigned long *flags) panic_on_warn = 0; panic("panic_on_warn set ...\n"); } +#ifdef CONFIG_KASAN_HW_TAGS + if (kasan_flag_panic) + panic("kasan.fault=panic set ...\n"); +#endif kasan_enable_current(); } @@ -167,36 +165,45 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } -static void describe_object(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object_stacks(struct kmem_cache *cache, void *object, + const void *addr, u8 tag) { - struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + struct kasan_alloc_meta *alloc_meta; + struct kasan_track *free_track; - if (cache->flags & SLAB_KASAN) { - struct kasan_track *free_track; + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) { + print_track(&alloc_meta->alloc_track, "Allocated"); + pr_err("\n"); + } - print_track(&alloc_info->alloc_track, "Allocated"); + free_track = kasan_get_free_track(cache, object, tag); + if (free_track) { + print_track(free_track, "Freed"); pr_err("\n"); - free_track = kasan_get_free_track(cache, object, tag); - if (free_track) { - print_track(free_track, "Freed"); - pr_err("\n"); - } + } #ifdef CONFIG_KASAN_GENERIC - if (alloc_info->aux_stack[0]) { - pr_err("Last call_rcu():\n"); - print_stack(alloc_info->aux_stack[0]); - pr_err("\n"); - } - if (alloc_info->aux_stack[1]) { - pr_err("Second to last call_rcu():\n"); - print_stack(alloc_info->aux_stack[1]); - pr_err("\n"); - } -#endif + if (!alloc_meta) + return; + if (alloc_meta->aux_stack[0]) { + pr_err("Last potentially related work creation:\n"); + print_stack(alloc_meta->aux_stack[0]); + pr_err("\n"); } + if (alloc_meta->aux_stack[1]) { + pr_err("Second to last potentially related work creation:\n"); + 
print_stack(alloc_meta->aux_stack[1]); + pr_err("\n"); + } +#endif +} +static void describe_object(struct kmem_cache *cache, void *object, + const void *addr, u8 tag) +{ + if (kasan_stack_collection_enabled()) + describe_object_stacks(cache, object, addr, tag); describe_object_addr(cache, object, addr); } @@ -216,168 +223,6 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -static bool __must_check tokenize_frame_descr(const char **frame_descr, - char *token, size_t max_tok_len, - unsigned long *value) -{ - const char *sep = strchr(*frame_descr, ' '); - - if (sep == NULL) - sep = *frame_descr + strlen(*frame_descr); - - if (token != NULL) { - const size_t tok_len = sep - *frame_descr; - - if (tok_len + 1 > max_tok_len) { - pr_err("KASAN internal error: frame description too long: %s\n", - *frame_descr); - return false; - } - - /* Copy token (+ 1 byte for '\0'). */ - strlcpy(token, *frame_descr, tok_len + 1); - } - - /* Advance frame_descr past separator. */ - *frame_descr = sep + 1; - - if (value != NULL && kstrtoul(token, 10, value)) { - pr_err("KASAN internal error: not a valid number: %s\n", token); - return false; - } - - return true; -} - -static void print_decoded_frame_descr(const char *frame_descr) -{ - /* - * We need to parse the following string: - * "n alloc_1 alloc_2 ... alloc_n" - * where alloc_i looks like - * "offset size len name" - * or "offset size len name:line". - */ - - char token[64]; - unsigned long num_objects; - - if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), - &num_objects)) - return; - - pr_err("\n"); - pr_err("this frame has %lu %s:\n", num_objects, - num_objects == 1 ? 
"object" : "objects"); - - while (num_objects--) { - unsigned long offset; - unsigned long size; - - /* access offset */ - if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), - &offset)) - return; - /* access size */ - if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), - &size)) - return; - /* name length (unused) */ - if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL)) - return; - /* object name */ - if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), - NULL)) - return; - - /* Strip line number; without filename it's not very helpful. */ - strreplace(token, ':', '\0'); - - /* Finally, print object information. */ - pr_err(" [%lu, %lu) '%s'", offset, offset + size, token); - } -} - -static bool __must_check get_address_stack_frame_info(const void *addr, - unsigned long *offset, - const char **frame_descr, - const void **frame_pc) -{ - unsigned long aligned_addr; - unsigned long mem_ptr; - const u8 *shadow_bottom; - const u8 *shadow_ptr; - const unsigned long *frame; - - BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP)); - - /* - * NOTE: We currently only support printing frame information for - * accesses to the task's own stack. 
- */ - if (!object_is_on_stack(addr)) - return false; - - aligned_addr = round_down((unsigned long)addr, sizeof(long)); - mem_ptr = round_down(aligned_addr, KASAN_SHADOW_SCALE_SIZE); - shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr); - shadow_bottom = kasan_mem_to_shadow(end_of_stack(current)); - - while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) { - shadow_ptr--; - mem_ptr -= KASAN_SHADOW_SCALE_SIZE; - } - - while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) { - shadow_ptr--; - mem_ptr -= KASAN_SHADOW_SCALE_SIZE; - } - - if (shadow_ptr < shadow_bottom) - return false; - - frame = (const unsigned long *)(mem_ptr + KASAN_SHADOW_SCALE_SIZE); - if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) { - pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n", - frame[0]); - return false; - } - - *offset = (unsigned long)addr - (unsigned long)frame; - *frame_descr = (const char *)frame[1]; - *frame_pc = (void *)frame[2]; - - return true; -} - -static void print_address_stack_frame(const void *addr) -{ - unsigned long offset; - const char *frame_descr; - const void *frame_pc; - - if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) - return; - - if (!get_address_stack_frame_info(addr, &offset, &frame_descr, - &frame_pc)) - return; - - /* - * get_address_stack_frame_info only returns true if the given addr is - * on the current task's stack. 
- */ - pr_err("\n"); - pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n", - addr, current->comm, task_pid_nr(current), offset); - pr_err(" %pS\n", frame_pc); - - if (!frame_descr) - return; - - print_decoded_frame_descr(frame_descr); -} - static void print_address_description(void *addr, u8 tag) { struct page *page = kasan_addr_to_page(addr); @@ -405,62 +250,68 @@ static void print_address_description(void *addr, u8 tag) print_address_stack_frame(addr); } -static bool row_is_guilty(const void *row, const void *guilty) +static bool meta_row_is_guilty(const void *row, const void *addr) { - return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW); + return (row <= addr) && (addr < row + META_MEM_BYTES_PER_ROW); } -static int shadow_pointer_offset(const void *row, const void *shadow) +static int meta_pointer_offset(const void *row, const void *addr) { - /* The length of ">ff00ff00ff00ff00: " is - * 3 + (BITS_PER_LONG/8)*2 chars. + /* + * Memory state around the buggy address: + * ff00ff00ff00ff00: 00 00 00 05 fe fe fe fe fe fe fe fe fe fe fe fe + * ... + * + * The length of ">ff00ff00ff00ff00: " is + * 3 + (BITS_PER_LONG / 8) * 2 chars. + * The length of each granule metadata is 2 bytes + * plus 1 byte for space. 
*/ - return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 + - (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1; + return 3 + (BITS_PER_LONG / 8) * 2 + + (addr - row) / KASAN_GRANULE_SIZE * 3 + 1; } -static void print_shadow_for_address(const void *addr) +static void print_memory_metadata(const void *addr) { int i; - const void *shadow = kasan_mem_to_shadow(addr); - const void *shadow_row; + void *row; - shadow_row = (void *)round_down((unsigned long)shadow, - SHADOW_BYTES_PER_ROW) - - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW; + row = (void *)round_down((unsigned long)addr, META_MEM_BYTES_PER_ROW) + - META_ROWS_AROUND_ADDR * META_MEM_BYTES_PER_ROW; pr_err("Memory state around the buggy address:\n"); - for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) { - const void *kaddr = kasan_shadow_to_mem(shadow_row); - char buffer[4 + (BITS_PER_LONG/8)*2]; - char shadow_buf[SHADOW_BYTES_PER_ROW]; + for (i = -META_ROWS_AROUND_ADDR; i <= META_ROWS_AROUND_ADDR; i++) { + char buffer[4 + (BITS_PER_LONG / 8) * 2]; + char metadata[META_BYTES_PER_ROW]; snprintf(buffer, sizeof(buffer), - (i == 0) ? ">%px: " : " %px: ", kaddr); + (i == 0) ? ">%px: " : " %px: ", row); + /* * We should not pass a shadow pointer to generic * function, because generic functions may try to * access kasan mapping for the passed address. 
*/ - memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW); + metadata_fetch_row(&metadata[0], row); + print_hex_dump(KERN_ERR, buffer, - DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1, - shadow_buf, SHADOW_BYTES_PER_ROW, 0); + DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1, + metadata, META_BYTES_PER_ROW, 0); - if (row_is_guilty(shadow_row, shadow)) - pr_err("%*c\n", - shadow_pointer_offset(shadow_row, shadow), - '^'); + if (meta_row_is_guilty(row, addr)) + pr_err("%*c\n", meta_pointer_offset(row, addr), '^'); - shadow_row += SHADOW_BYTES_PER_ROW; + row += META_MEM_BYTES_PER_ROW; } } static bool report_enabled(void) { +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (current->kasan_depth) return false; +#endif if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) return true; return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); @@ -490,7 +341,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) unsigned long flags; u8 tag = get_tag(object); - object = reset_tag(object); + object = kasan_reset_tag(object); #if IS_ENABLED(CONFIG_KUNIT) if (current->kunit_test) @@ -503,7 +354,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) pr_err("\n"); print_address_description(object, tag); pr_err("\n"); - print_shadow_for_address(object); + print_memory_metadata(object); end_report(&flags); } @@ -523,10 +374,10 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, disable_trace_on_warning(); tagged_addr = (void *)addr; - untagged_addr = reset_tag(tagged_addr); + untagged_addr = kasan_reset_tag(tagged_addr); info.access_addr = tagged_addr; - if (addr_has_shadow(untagged_addr)) + if (addr_has_metadata(untagged_addr)) info.first_bad_addr = find_first_bad_addr(tagged_addr, size); else info.first_bad_addr = untagged_addr; @@ -537,14 +388,14 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, start_report(&flags); print_error_description(&info); - if (addr_has_shadow(untagged_addr)) + if 
(addr_has_metadata(untagged_addr)) print_tags(get_tag(tagged_addr), info.first_bad_addr); pr_err("\n"); - if (addr_has_shadow(untagged_addr)) { + if (addr_has_metadata(untagged_addr)) { print_address_description(untagged_addr, get_tag(tagged_addr)); pr_err("\n"); - print_shadow_for_address(info.first_bad_addr); + print_memory_metadata(info.first_bad_addr); } else { dump_stack(); } @@ -604,6 +455,6 @@ void kasan_non_canonical_hook(unsigned long addr) else bug_type = "maybe wild-memory-access"; pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type, - orig_addr, orig_addr + KASAN_SHADOW_MASK); + orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1); } #endif diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c new file mode 100644 index 000000000000..8a9c889872da --- /dev/null +++ b/mm/kasan/report_generic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains generic KASAN specific error reporting code. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov <andreyknvl@gmail.com> + */ + +#include <linux/bitops.h> +#include <linux/ftrace.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/sched.h> +#include <linux/sched/task_stack.h> +#include <linux/slab.h> +#include <linux/stackdepot.h> +#include <linux/stacktrace.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/kasan.h> +#include <linux/module.h> + +#include <asm/sections.h> + +#include "kasan.h" +#include "../slab.h" + +void *find_first_bad_addr(void *addr, size_t size) +{ + void *p = addr; + + while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p))) + p += KASAN_GRANULE_SIZE; + return p; +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + u8 *shadow_addr; + + shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr); + + /* + * If shadow byte value is in [0, KASAN_GRANULE_SIZE) we can look + * at the next shadow byte to determine the type of the bad access. + */ + if (*shadow_addr > 0 && *shadow_addr <= KASAN_GRANULE_SIZE - 1) + shadow_addr++; + + switch (*shadow_addr) { + case 0 ... KASAN_GRANULE_SIZE - 1: + /* + * In theory it's still possible to see these shadow values + * due to a data race in the kernel code. 
+ */ + bug_type = "out-of-bounds"; + break; + case KASAN_PAGE_REDZONE: + case KASAN_KMALLOC_REDZONE: + bug_type = "slab-out-of-bounds"; + break; + case KASAN_GLOBAL_REDZONE: + bug_type = "global-out-of-bounds"; + break; + case KASAN_STACK_LEFT: + case KASAN_STACK_MID: + case KASAN_STACK_RIGHT: + case KASAN_STACK_PARTIAL: + bug_type = "stack-out-of-bounds"; + break; + case KASAN_FREE_PAGE: + case KASAN_KMALLOC_FREE: + case KASAN_KMALLOC_FREETRACK: + bug_type = "use-after-free"; + break; + case KASAN_ALLOCA_LEFT: + case KASAN_ALLOCA_RIGHT: + bug_type = "alloca-out-of-bounds"; + break; + case KASAN_VMALLOC_INVALID: + bug_type = "vmalloc-out-of-bounds"; + break; + } + + return bug_type; +} + +static const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +const char *get_bug_type(struct kasan_access_info *info) +{ + /* + * If access_size is a negative number, then it has reason to be + * defined as out-of-bounds bug type. + * + * Casting negative numbers to size_t would indeed turn up as + * a large size_t and its value will be larger than ULONG_MAX/2, + * so that this can qualify as out-of-bounds. 
+ */ + if (info->access_addr + info->access_size < info->access_addr) + return "out-of-bounds"; + + if (addr_has_metadata(info->access_addr)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +void metadata_fetch_row(char *buffer, void *row) +{ + memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); +} + +#if CONFIG_KASAN_STACK +static bool __must_check tokenize_frame_descr(const char **frame_descr, + char *token, size_t max_tok_len, + unsigned long *value) +{ + const char *sep = strchr(*frame_descr, ' '); + + if (sep == NULL) + sep = *frame_descr + strlen(*frame_descr); + + if (token != NULL) { + const size_t tok_len = sep - *frame_descr; + + if (tok_len + 1 > max_tok_len) { + pr_err("KASAN internal error: frame description too long: %s\n", + *frame_descr); + return false; + } + + /* Copy token (+ 1 byte for '\0'). */ + strlcpy(token, *frame_descr, tok_len + 1); + } + + /* Advance frame_descr past separator. */ + *frame_descr = sep + 1; + + if (value != NULL && kstrtoul(token, 10, value)) { + pr_err("KASAN internal error: not a valid number: %s\n", token); + return false; + } + + return true; +} + +static void print_decoded_frame_descr(const char *frame_descr) +{ + /* + * We need to parse the following string: + * "n alloc_1 alloc_2 ... alloc_n" + * where alloc_i looks like + * "offset size len name" + * or "offset size len name:line". + */ + + char token[64]; + unsigned long num_objects; + + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &num_objects)) + return; + + pr_err("\n"); + pr_err("this frame has %lu %s:\n", num_objects, + num_objects == 1 ? 
"object" : "objects"); + + while (num_objects--) { + unsigned long offset; + unsigned long size; + + /* access offset */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &offset)) + return; + /* access size */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + &size)) + return; + /* name length (unused) */ + if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL)) + return; + /* object name */ + if (!tokenize_frame_descr(&frame_descr, token, sizeof(token), + NULL)) + return; + + /* Strip line number; without filename it's not very helpful. */ + strreplace(token, ':', '\0'); + + /* Finally, print object information. */ + pr_err(" [%lu, %lu) '%s'", offset, offset + size, token); + } +} + +static bool __must_check get_address_stack_frame_info(const void *addr, + unsigned long *offset, + const char **frame_descr, + const void **frame_pc) +{ + unsigned long aligned_addr; + unsigned long mem_ptr; + const u8 *shadow_bottom; + const u8 *shadow_ptr; + const unsigned long *frame; + + BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP)); + + /* + * NOTE: We currently only support printing frame information for + * accesses to the task's own stack. 
+ */ + if (!object_is_on_stack(addr)) + return false; + + aligned_addr = round_down((unsigned long)addr, sizeof(long)); + mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE); + shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr); + shadow_bottom = kasan_mem_to_shadow(end_of_stack(current)); + + while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) { + shadow_ptr--; + mem_ptr -= KASAN_GRANULE_SIZE; + } + + while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) { + shadow_ptr--; + mem_ptr -= KASAN_GRANULE_SIZE; + } + + if (shadow_ptr < shadow_bottom) + return false; + + frame = (const unsigned long *)(mem_ptr + KASAN_GRANULE_SIZE); + if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) { + pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n", + frame[0]); + return false; + } + + *offset = (unsigned long)addr - (unsigned long)frame; + *frame_descr = (const char *)frame[1]; + *frame_pc = (void *)frame[2]; + + return true; +} + +void print_address_stack_frame(const void *addr) +{ + unsigned long offset; + const char *frame_descr; + const void *frame_pc; + + if (!get_address_stack_frame_info(addr, &offset, &frame_descr, + &frame_pc)) + return; + + /* + * get_address_stack_frame_info only returns true if the given addr is + * on the current task's stack. 
+ */ + pr_err("\n"); + pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n", + addr, current->comm, task_pid_nr(current), offset); + pr_err(" %pS\n", frame_pc); + + if (!frame_descr) + return; + + print_decoded_frame_descr(frame_descr); +} +#endif /* CONFIG_KASAN_STACK */ + +#define DEFINE_ASAN_REPORT_LOAD(size) \ +void __asan_report_load##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, false, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_load##size##_noabort) + +#define DEFINE_ASAN_REPORT_STORE(size) \ +void __asan_report_store##size##_noabort(unsigned long addr) \ +{ \ + kasan_report(addr, size, true, _RET_IP_); \ +} \ +EXPORT_SYMBOL(__asan_report_store##size##_noabort) + +DEFINE_ASAN_REPORT_LOAD(1); +DEFINE_ASAN_REPORT_LOAD(2); +DEFINE_ASAN_REPORT_LOAD(4); +DEFINE_ASAN_REPORT_LOAD(8); +DEFINE_ASAN_REPORT_LOAD(16); +DEFINE_ASAN_REPORT_STORE(1); +DEFINE_ASAN_REPORT_STORE(2); +DEFINE_ASAN_REPORT_STORE(4); +DEFINE_ASAN_REPORT_STORE(8); +DEFINE_ASAN_REPORT_STORE(16); + +void __asan_report_load_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_load_n_noabort); + +void __asan_report_store_n_noabort(unsigned long addr, size_t size) +{ + kasan_report(addr, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__asan_report_store_n_noabort); diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c new file mode 100644 index 000000000000..57114f0e14d1 --- /dev/null +++ b/mm/kasan/report_hw_tags.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains hardware tag-based KASAN specific error reporting code. + * + * Copyright (c) 2020 Google, Inc. 
+ * Author: Andrey Konovalov <andreyknvl@google.com> + */ + +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "kasan.h" + +const char *get_bug_type(struct kasan_access_info *info) +{ + return "invalid-access"; +} + +void *find_first_bad_addr(void *addr, size_t size) +{ + return kasan_reset_tag(addr); +} + +void metadata_fetch_row(char *buffer, void *row) +{ + int i; + + for (i = 0; i < META_BYTES_PER_ROW; i++) + buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE); +} + +void print_tags(u8 addr_tag, const void *addr) +{ + u8 memory_tag = hw_get_mem_tag((void *)addr); + + pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", + addr_tag, memory_tag); +} diff --git a/mm/kasan/tags_report.c b/mm/kasan/report_sw_tags.c index bee43717d6f0..1b026793ad57 100644 --- a/mm/kasan/tags_report.c +++ b/mm/kasan/report_sw_tags.c @@ -1,17 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains tag-based KASAN specific error reporting code. + * This file contains software tag-based KASAN specific error reporting code. * * Copyright (c) 2014 Samsung Electronics Co., Ltd. * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> * * Some code borrowed from https://github.com/xairy/kasan-prototype by * Andrey Konovalov <andreyknvl@gmail.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- * */ #include <linux/bitops.h> @@ -46,16 +41,19 @@ const char *get_bug_type(struct kasan_access_info *info) int i; tag = get_tag(info->access_addr); - addr = reset_tag(info->access_addr); + addr = kasan_reset_tag(info->access_addr); page = kasan_addr_to_page(addr); if (page && PageSlab(page)) { cache = page->slab_cache; object = nearest_obj(cache, page, (void *)addr); - alloc_meta = get_alloc_info(cache, object); + alloc_meta = kasan_get_alloc_meta(cache, object); - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) - if (alloc_meta->free_pointer_tag[i] == tag) - return "use-after-free"; + if (alloc_meta) { + for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { + if (alloc_meta->free_pointer_tag[i] == tag) + return "use-after-free"; + } + } return "out-of-bounds"; } @@ -77,14 +75,19 @@ const char *get_bug_type(struct kasan_access_info *info) void *find_first_bad_addr(void *addr, size_t size) { u8 tag = get_tag(addr); - void *p = reset_tag(addr); + void *p = kasan_reset_tag(addr); void *end = p + size; while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p)) - p += KASAN_SHADOW_SCALE_SIZE; + p += KASAN_GRANULE_SIZE; return p; } +void metadata_fetch_row(char *buffer, void *row) +{ + memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); +} + void print_tags(u8 addr_tag, const void *addr) { u8 *shadow = (u8 *)kasan_mem_to_shadow(addr); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c new file mode 100644 index 000000000000..7c2c08c55f32 --- /dev/null +++ b/mm/kasan/shadow.c @@ -0,0 +1,504 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This file contains KASAN runtime code that manages shadow memory for + * generic and software tag-based KASAN modes. + * + * Copyright (c) 2014 Samsung Electronics Co., Ltd. 
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com> + * + * Some code borrowed from https://github.com/xairy/kasan-prototype by + * Andrey Konovalov <andreyknvl@gmail.com> + */ + +#include <linux/init.h> +#include <linux/kasan.h> +#include <linux/kernel.h> +#include <linux/kmemleak.h> +#include <linux/memory.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/vmalloc.h> + +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +#include "kasan.h" + +bool __kasan_check_read(const volatile void *p, unsigned int size) +{ + return check_memory_region((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(__kasan_check_read); + +bool __kasan_check_write(const volatile void *p, unsigned int size) +{ + return check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(__kasan_check_write); + +#undef memset +void *memset(void *addr, int c, size_t len) +{ + if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_)) + return NULL; + + return __memset(addr, c, len); +} + +#ifdef __HAVE_ARCH_MEMMOVE +#undef memmove +void *memmove(void *dest, const void *src, size_t len) +{ + if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || + !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) + return NULL; + + return __memmove(dest, src, len); +} +#endif + +#undef memcpy +void *memcpy(void *dest, const void *src, size_t len) +{ + if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || + !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) + return NULL; + + return __memcpy(dest, src, len); +} + +/* + * Poisons the shadow memory for 'size' bytes starting from 'addr'. + * Memory addresses should be aligned to KASAN_GRANULE_SIZE. + */ +void poison_range(const void *address, size_t size, u8 value) +{ + void *shadow_start, *shadow_end; + + /* + * Perform shadow offset calculation based on untagged address, as + * some of the callers (e.g. 
kasan_poison_object_data) pass tagged + * addresses to this function. + */ + address = kasan_reset_tag(address); + size = round_up(size, KASAN_GRANULE_SIZE); + + shadow_start = kasan_mem_to_shadow(address); + shadow_end = kasan_mem_to_shadow(address + size); + + __memset(shadow_start, value, shadow_end - shadow_start); +} + +void unpoison_range(const void *address, size_t size) +{ + u8 tag = get_tag(address); + + /* + * Perform shadow offset calculation based on untagged address, as + * some of the callers (e.g. kasan_unpoison_object_data) pass tagged + * addresses to this function. + */ + address = kasan_reset_tag(address); + + poison_range(address, size, tag); + + if (size & KASAN_GRANULE_MASK) { + u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); + + if (IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + *shadow = tag; + else /* CONFIG_KASAN_GENERIC */ + *shadow = size & KASAN_GRANULE_MASK; + } +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static bool shadow_mapped(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + if (pgd_none(*pgd)) + return false; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return false; + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return false; + + /* + * We can't use pud_large() or pud_huge(), the first one is + * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse + * pud_bad(), if pud is bad then it's bad because it's huge. 
+ */ + if (pud_bad(*pud)) + return true; + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return false; + + if (pmd_bad(*pmd)) + return true; + pte = pte_offset_kernel(pmd, addr); + return !pte_none(*pte); +} + +static int __meminit kasan_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct memory_notify *mem_data = data; + unsigned long nr_shadow_pages, start_kaddr, shadow_start; + unsigned long shadow_end, shadow_size; + + nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT; + start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn); + shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr); + shadow_size = nr_shadow_pages << PAGE_SHIFT; + shadow_end = shadow_start + shadow_size; + + if (WARN_ON(mem_data->nr_pages % KASAN_GRANULE_SIZE) || + WARN_ON(start_kaddr % KASAN_MEMORY_PER_SHADOW_PAGE)) + return NOTIFY_BAD; + + switch (action) { + case MEM_GOING_ONLINE: { + void *ret; + + /* + * If shadow is mapped already then it must have been mapped + * during the boot. This could happen if we are onlining + * previously offlined memory. + */ + if (shadow_mapped(shadow_start)) + return NOTIFY_OK; + + ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, + shadow_end, GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, + pfn_to_nid(mem_data->start_pfn), + __builtin_return_address(0)); + if (!ret) + return NOTIFY_BAD; + + kmemleak_ignore(ret); + return NOTIFY_OK; + } + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: { + struct vm_struct *vm; + + /* + * shadow_start was either mapped during boot by kasan_init() + * or during memory online by __vmalloc_node_range(). + * In the latter case we can use vfree() to free shadow. + * Non-NULL result of the find_vm_area() will tell us if + * that was the second case. + * + * Currently it's not possible to free shadow mapped + * during boot by kasan_init(). It's because the code + * to do that hasn't been written yet. So we'll just + * leak the memory.
+ */ + vm = find_vm_area((void *)shadow_start); + if (vm) + vfree((void *)shadow_start); + } + } + + return NOTIFY_OK; +} + +static int __init kasan_memhotplug_init(void) +{ + hotplug_memory_notifier(kasan_mem_notifier, 0); + + return 0; +} + +core_initcall(kasan_memhotplug_init); +#endif + +#ifdef CONFIG_KASAN_VMALLOC + +static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, + void *unused) +{ + unsigned long page; + pte_t pte; + + if (likely(!pte_none(*ptep))) + return 0; + + page = __get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE); + pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL); + + spin_lock(&init_mm.page_table_lock); + if (likely(pte_none(*ptep))) { + set_pte_at(&init_mm, addr, ptep, pte); + page = 0; + } + spin_unlock(&init_mm.page_table_lock); + if (page) + free_page(page); + return 0; +} + +int kasan_populate_vmalloc(unsigned long addr, unsigned long size) +{ + unsigned long shadow_start, shadow_end; + int ret; + + if (!is_vmalloc_or_module_addr((void *)addr)) + return 0; + + shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr); + shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); + shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size); + shadow_end = ALIGN(shadow_end, PAGE_SIZE); + + ret = apply_to_page_range(&init_mm, shadow_start, + shadow_end - shadow_start, + kasan_populate_vmalloc_pte, NULL); + if (ret) + return ret; + + flush_cache_vmap(shadow_start, shadow_end); + + /* + * We need to be careful about inter-cpu effects here. Consider: + * + * CPU#0 CPU#1 + * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ; + * p[99] = 1; + * + * With compiler instrumentation, that ends up looking like this: + * + * CPU#0 CPU#1 + * // vmalloc() allocates memory + * // let a = area->addr + * // we reach kasan_populate_vmalloc + * // and call unpoison_range: + * STORE shadow(a), unpoison_val + * ... 
+ * STORE shadow(a+99), unpoison_val x = LOAD p + * // rest of vmalloc process <data dependency> + * STORE p, a LOAD shadow(x+99) + * + * If there is no barrier between the end of unpoisoning the shadow + * and the store of the result to p, the stores could be committed + * in a different order by CPU#0, and CPU#1 could erroneously observe + * poison in the shadow. + * + * We need some sort of barrier between the stores. + * + * In the vmalloc() case, this is provided by a smp_wmb() in + * clear_vm_uninitialized_flag(). In the per-cpu allocator and in + * get_vm_area() and friends, the caller gets shadow allocated but + * doesn't have any pages mapped into the virtual address space that + * has been reserved. Mapping those pages in will involve taking and + * releasing a page-table lock, which will provide the barrier. + */ + + return 0; +} + +/* + * Poison the shadow for a vmalloc region. Called as part of the + * freeing process at the time the region is freed. + */ +void kasan_poison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + size = round_up(size, KASAN_GRANULE_SIZE); + poison_range(start, size, KASAN_VMALLOC_INVALID); +} + +void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + unpoison_range(start, size); +} + +static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, + void *unused) +{ + unsigned long page; + + page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT); + + spin_lock(&init_mm.page_table_lock); + + if (likely(!pte_none(*ptep))) { + pte_clear(&init_mm, addr, ptep); + free_page(page); + } + spin_unlock(&init_mm.page_table_lock); + + return 0; +} + +/* + * Release the backing for the vmalloc region [start, end), which + * lies within the free region [free_region_start, free_region_end). + * + * This can be run lazily, long after the region was freed.
It runs + * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap + * infrastructure. + * + * How does this work? + * ------------------- + * + * We have a region that is page aligned, labelled as A. + * That might not map onto the shadow in a way that is page-aligned: + * + * start end + * v v + * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc + * -------- -------- -------- -------- -------- + * | | | | | + * | | | /-------/ | + * \-------\|/------/ |/---------------/ + * ||| || + * |??AAAAAA|AAAAAAAA|AA??????| < shadow + * (1) (2) (3) + * + * First we align the start upwards and the end downwards, so that the + * shadow of the region aligns with shadow page boundaries. In the + * example, this gives us the shadow page (2). This is the shadow entirely + * covered by this allocation. + * + * Then we have the tricky bits. We want to know if we can free the + * partially covered shadow pages - (1) and (3) in the example. For this, + * we are given the start and end of the free region that contains this + * allocation. Extending our previous example, we could have: + * + * free_region_start free_region_end + * | start end | + * v v v v + * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc + * -------- -------- -------- -------- -------- + * | | | | | + * | | | /-------/ | + * \-------\|/------/ |/---------------/ + * ||| || + * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow + * (1) (2) (3) + * + * Once again, we align the start of the free region up, and the end of + * the free region down so that the shadow is page aligned. So we can free + * page (1) - we know no allocation currently uses anything in that page, + * because all of it is in the vmalloc free region. But we cannot free + * page (3), because we can't be sure that the rest of it is unused. 
+ * + * We only consider pages that contain part of the original region for + * freeing: we don't try to free other pages from the free region or we'd + * end up trying to free huge chunks of virtual address space. + * + * Concurrency + * ----------- + * + * How do we know that we're not freeing a page that is simultaneously + * being used for a fresh allocation in kasan_populate_vmalloc(_pte)? + * + * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running + * at the same time. While we run under free_vmap_area_lock, the population + * code does not. + * + * free_vmap_area_lock instead operates to ensure that the larger range + * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and + * the per-cpu region-finding algorithm both run under free_vmap_area_lock, + * no space identified as free will become used while we are running. This + * means that so long as we are careful with alignment and only free shadow + * pages entirely covered by the free region, we will not run into any + * trouble - any simultaneous allocations will be for disjoint regions.
+ */ +void kasan_release_vmalloc(unsigned long start, unsigned long end, + unsigned long free_region_start, + unsigned long free_region_end) +{ + void *shadow_start, *shadow_end; + unsigned long region_start, region_end; + unsigned long size; + + region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE); + region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE); + + free_region_start = ALIGN(free_region_start, KASAN_MEMORY_PER_SHADOW_PAGE); + + if (start != region_start && + free_region_start < region_start) + region_start -= KASAN_MEMORY_PER_SHADOW_PAGE; + + free_region_end = ALIGN_DOWN(free_region_end, KASAN_MEMORY_PER_SHADOW_PAGE); + + if (end != region_end && + free_region_end > region_end) + region_end += KASAN_MEMORY_PER_SHADOW_PAGE; + + shadow_start = kasan_mem_to_shadow((void *)region_start); + shadow_end = kasan_mem_to_shadow((void *)region_end); + + if (shadow_end > shadow_start) { + size = shadow_end - shadow_start; + apply_to_existing_page_range(&init_mm, + (unsigned long)shadow_start, + size, kasan_depopulate_vmalloc_pte, + NULL); + flush_tlb_kernel_range((unsigned long)shadow_start, + (unsigned long)shadow_end); + } +} + +#else /* CONFIG_KASAN_VMALLOC */ + +int kasan_module_alloc(void *addr, size_t size) +{ + void *ret; + size_t scaled_size; + size_t shadow_size; + unsigned long shadow_start; + + shadow_start = (unsigned long)kasan_mem_to_shadow(addr); + scaled_size = (size + KASAN_GRANULE_SIZE - 1) >> + KASAN_SHADOW_SCALE_SHIFT; + shadow_size = round_up(scaled_size, PAGE_SIZE); + + if (WARN_ON(!PAGE_ALIGNED(shadow_start))) + return -EINVAL; + + ret = __vmalloc_node_range(shadow_size, 1, shadow_start, + shadow_start + shadow_size, + GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, + __builtin_return_address(0)); + + if (ret) { + __memset(ret, KASAN_SHADOW_INIT, shadow_size); + find_vm_area(addr)->flags |= VM_KASAN; + kmemleak_ignore(ret); + return 0; + } + + return -ENOMEM; +} + +void kasan_free_shadow(const struct vm_struct *vm) +{ + if 
(vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); +} + +#endif diff --git a/mm/kasan/tags.c b/mm/kasan/sw_tags.c index e02a36a51f42..5dcd830805b2 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/sw_tags.c @@ -1,17 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This file contains core tag-based KASAN code. + * This file contains core software tag-based KASAN code. * * Copyright (c) 2018 Google, Inc. * Author: Andrey Konovalov <andreyknvl@google.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#define pr_fmt(fmt) "kasan: " fmt #include <linux/export.h> #include <linux/interrupt.h> @@ -40,12 +35,14 @@ static DEFINE_PER_CPU(u32, prng_state); -void kasan_init_tags(void) +void __init kasan_init_sw_tags(void) { int cpu; for_each_possible_cpu(cpu) per_cpu(prng_state, cpu) = (u32)get_cycles(); + + pr_info("KernelAddressSanitizer initialized\n"); } /* @@ -70,11 +67,6 @@ u8 random_tag(void) return (u8)(state % (KASAN_TAG_MAX + 1)); } -void *kasan_reset_tag(const void *addr) -{ - return reset_tag(addr); -} - bool check_memory_region(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { @@ -110,7 +102,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, if (tag == KASAN_TAG_KERNEL) return true; - untagged_addr = reset_tag((const void *)addr); + untagged_addr = kasan_reset_tag((const void *)addr); if (unlikely(untagged_addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { return !kasan_report(addr, size, write, ret_ip); @@ -126,6 +118,15 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, return true; } +bool check_invalid_free(void *addr) +{ + u8 tag = get_tag(addr); + u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); + + return (shadow_byte == KASAN_TAG_INVALID) || + (tag != 
KASAN_TAG_KERNEL && tag != shadow_byte); +} + #define DEFINE_HWASAN_LOAD_STORE(size) \ void __hwasan_load##size##_noabort(unsigned long addr) \ { \ @@ -158,7 +159,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - kasan_poison_shadow((void *)addr, size, tag); + poison_range((void *)addr, size, tag); } EXPORT_SYMBOL(__hwasan_tag_memory); @@ -168,7 +169,9 @@ void kasan_set_free_info(struct kmem_cache *cache, struct kasan_alloc_meta *alloc_meta; u8 idx = 0; - alloc_meta = get_alloc_info(cache, object); + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY idx = alloc_meta->free_track_idx; @@ -185,7 +188,9 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, struct kasan_alloc_meta *alloc_meta; int i = 0; - alloc_meta = get_alloc_info(cache, object); + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4e3dff13eb70..67ab391a5373 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -90,6 +90,8 @@ static struct kmem_cache *mm_slot_cache __read_mostly; * @hash: hash collision list * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head * @mm: the mm that this information is valid for + * @nr_pte_mapped_thp: number of pte mapped THP + * @pte_mapped_thp: address array corresponding pte mapped THP */ struct mm_slot { struct hlist_node hash; @@ -124,18 +126,18 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); + return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs); } static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + 
unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; khugepaged_scan_sleep_millisecs = msecs; @@ -152,18 +154,18 @@ static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); + return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs); } static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; khugepaged_alloc_sleep_millisecs = msecs; @@ -180,17 +182,17 @@ static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_pages_to_scan); + return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + unsigned int pages; int err; - unsigned long pages; - err = kstrtoul(buf, 10, &pages); - if (err || !pages || pages > UINT_MAX) + err = kstrtouint(buf, 10, &pages); + if (err || !pages) return -EINVAL; khugepaged_pages_to_scan = pages; @@ -205,7 +207,7 @@ static ssize_t pages_collapsed_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_pages_collapsed); + return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed); } static struct kobj_attribute pages_collapsed_attr = __ATTR_RO(pages_collapsed); @@ -214,7 +216,7 @@ static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_full_scans); + return sysfs_emit(buf, "%u\n", khugepaged_full_scans); } static struct kobj_attribute 
full_scans_attr = __ATTR_RO(full_scans); @@ -223,7 +225,7 @@ static ssize_t khugepaged_defrag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return single_hugepage_flag_show(kobj, attr, buf, - TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); } static ssize_t khugepaged_defrag_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -248,7 +250,7 @@ static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_max_ptes_none); + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none); } static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -273,7 +275,7 @@ static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", khugepaged_max_ptes_swap); + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap); } static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj, @@ -297,10 +299,10 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr = khugepaged_max_ptes_swap_store); static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj, - struct kobj_attribute *attr, - char *buf) + struct kobj_attribute *attr, + char *buf) { - return sprintf(buf, "%u\n", khugepaged_max_ptes_shared); + return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared); } static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj, @@ -1273,7 +1275,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * PTEs are armed with uffd write protection. * Here we can also mark the new huge pmd as * write protected if any of the small ones is - * marked but that could bring uknown + * marked but that could bring unknown * userfault messages that falls outside of * the registered range. So, just be simple. 
*/ @@ -1414,7 +1416,11 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm, } /** - * Try to collapse a pte-mapped THP for mm at address haddr. + * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at + * address haddr. + * + * @mm: process address space where collapse happens + * @addr: THP collapse address * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with @@ -1605,6 +1611,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * + * @mm: process address space where collapse happens + * @file: file that collapse on + * @start: collapse start address + * @hpage: new allocated huge page for collapse + * @node: appointed node the new huge page allocate from + * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; * - scan page cache replacing old pages with the new one @@ -1845,9 +1857,9 @@ out_unlock: } if (is_shmem) - __inc_node_page_state(new_page, NR_SHMEM_THPS); + __inc_lruvec_page_state(new_page, NR_SHMEM_THPS); else { - __inc_node_page_state(new_page, NR_FILE_THPS); + __inc_lruvec_page_state(new_page, NR_FILE_THPS); filemap_nr_thps_inc(mapping); } @@ -2833,18 +2833,18 @@ static void wait_while_offlining(void) static ssize_t sleep_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); + return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs); } static ssize_t sleep_millisecs_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - unsigned long msecs; + unsigned int msecs; int err; - err = kstrtoul(buf, 10, &msecs); - if (err || msecs > UINT_MAX) + err = kstrtouint(buf, 10, &msecs); + if (err) return -EINVAL; ksm_thread_sleep_millisecs = msecs; @@ -2857,18 +2857,18 @@ 
KSM_ATTR(sleep_millisecs); static ssize_t pages_to_scan_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); + return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan); } static ssize_t pages_to_scan_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + unsigned int nr_pages; int err; - unsigned long nr_pages; - err = kstrtoul(buf, 10, &nr_pages); - if (err || nr_pages > UINT_MAX) + err = kstrtouint(buf, 10, &nr_pages); + if (err) return -EINVAL; ksm_thread_pages_to_scan = nr_pages; @@ -2880,17 +2880,17 @@ KSM_ATTR(pages_to_scan); static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_run); + return sysfs_emit(buf, "%lu\n", ksm_run); } static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { + unsigned int flags; int err; - unsigned long flags; - err = kstrtoul(buf, 10, &flags); - if (err || flags > UINT_MAX) + err = kstrtouint(buf, 10, &flags); + if (err) return -EINVAL; if (flags > KSM_RUN_UNMERGE) return -EINVAL; @@ -2927,9 +2927,9 @@ KSM_ATTR(run); #ifdef CONFIG_NUMA static ssize_t merge_across_nodes_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_merge_across_nodes); + return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes); } static ssize_t merge_across_nodes_store(struct kobject *kobj, @@ -2984,9 +2984,9 @@ KSM_ATTR(merge_across_nodes); #endif static ssize_t use_zero_pages_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_use_zero_pages); + return sysfs_emit(buf, "%u\n", ksm_use_zero_pages); } static ssize_t use_zero_pages_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -3008,7 +3008,7 @@ KSM_ATTR(use_zero_pages); static ssize_t 
max_page_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_max_page_sharing); + return sysfs_emit(buf, "%u\n", ksm_max_page_sharing); } static ssize_t max_page_sharing_store(struct kobject *kobj, @@ -3049,21 +3049,21 @@ KSM_ATTR(max_page_sharing); static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_pages_shared); + return sysfs_emit(buf, "%lu\n", ksm_pages_shared); } KSM_ATTR_RO(pages_shared); static ssize_t pages_sharing_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_pages_sharing); + return sysfs_emit(buf, "%lu\n", ksm_pages_sharing); } KSM_ATTR_RO(pages_sharing); static ssize_t pages_unshared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_pages_unshared); + return sysfs_emit(buf, "%lu\n", ksm_pages_unshared); } KSM_ATTR_RO(pages_unshared); @@ -3080,21 +3080,21 @@ static ssize_t pages_volatile_show(struct kobject *kobj, */ if (ksm_pages_volatile < 0) ksm_pages_volatile = 0; - return sprintf(buf, "%ld\n", ksm_pages_volatile); + return sysfs_emit(buf, "%ld\n", ksm_pages_volatile); } KSM_ATTR_RO(pages_volatile); static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_stable_node_dups); + return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups); } KSM_ATTR_RO(stable_node_dups); static ssize_t stable_node_chains_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_stable_node_chains); + return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains); } KSM_ATTR_RO(stable_node_chains); @@ -3103,7 +3103,7 @@ stable_node_chains_prune_millisecs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); + return 
sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs); } static ssize_t @@ -3127,7 +3127,7 @@ KSM_ATTR(stable_node_chains_prune_millisecs); static ssize_t full_scans_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", ksm_scan.seqnr); + return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr); } KSM_ATTR_RO(full_scans); diff --git a/mm/madvise.c b/mm/madvise.c index 13f5677b9322..6a660858784b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -877,7 +877,6 @@ static long madvise_remove(struct vm_area_struct *vma, static int madvise_inject_error(int behavior, unsigned long start, unsigned long end) { - struct zone *zone; unsigned long size; if (!capable(CAP_SYS_ADMIN)) @@ -908,24 +907,13 @@ static int madvise_inject_error(int behavior, } else { pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n", pfn, start); - /* - * Drop the page reference taken by get_user_pages_fast(). In - * the absence of MF_COUNT_INCREASED the memory_failure() - * routine is responsible for pinning the page to prevent it - * from being released back to the page allocator. 
- */ - put_page(page); - ret = memory_failure(pfn, 0); + ret = memory_failure(pfn, MF_COUNT_INCREASED); } if (ret) return ret; } - /* Ensure that all poisoned pages are removed from per-cpu lists */ - for_each_populated_zone(zone) - drain_all_pages(zone); - return 0; } #endif diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 2c7d03675903..b59054ef2e10 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -23,7 +23,8 @@ struct wp_walk { /** * wp_pte - Write-protect a pte * @pte: Pointer to the pte - * @addr: The virtual page address + * @addr: The start of protecting virtual address + * @end: The end of protecting virtual address * @walk: pagetable walk callback argument * * The function write-protects a pte and records the range in @@ -74,7 +75,8 @@ struct clean_walk { * clean_record_pte - Clean a pte and record its address space offset in a * bitmap * @pte: Pointer to the pte - * @addr: The virtual page address + * @addr: The start of virtual address to be clean + * @end: The end of virtual address to be clean * @walk: pagetable walk callback argument * * The function cleans a pte and records the range in diff --git a/mm/memblock.c b/mm/memblock.c index b68ee86788af..d24bcfa88d2f 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -871,7 +871,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) * @base: base address of the region * @size: size of the region * @set: set or clear the flag - * @flag: the flag to udpate + * @flag: the flag to update * * This function isolates region [@base, @base + @size), and sets/clears flag * @@ -1419,6 +1419,9 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size, phys_addr_t start, phys_addr_t end) { + memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n", + __func__, (u64)size, (u64)align, &start, &end, + (void *)_RET_IP_); return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE, false); } @@ -1926,6 +1929,85 @@ 
static int __init early_memblock(char *p) } early_param("memblock", early_memblock); +static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn) +{ + struct page *start_pg, *end_pg; + phys_addr_t pg, pgend; + + /* + * Convert start_pfn/end_pfn to a struct page pointer. + */ + start_pg = pfn_to_page(start_pfn - 1) + 1; + end_pg = pfn_to_page(end_pfn - 1) + 1; + + /* + * Convert to physical addresses, and round start upwards and end + * downwards. + */ + pg = PAGE_ALIGN(__pa(start_pg)); + pgend = __pa(end_pg) & PAGE_MASK; + + /* + * If there are free pages between these, free the section of the + * memmap array. + */ + if (pg < pgend) + memblock_free(pg, pgend - pg); +} + +/* + * The mem_map array can get very big. Free the unused area of the memory map. + */ +static void __init free_unused_memmap(void) +{ + unsigned long start, end, prev_end = 0; + int i; + + if (!IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) || + IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + return; + + /* + * This relies on each bank being in address order. + * The banks are sorted previously in bootmem_init(). + */ + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) { +#ifdef CONFIG_SPARSEMEM + /* + * Take care not to free memmap entries that don't exist + * due to SPARSEMEM sections which aren't present. + */ + start = min(start, ALIGN(prev_end, PAGES_PER_SECTION)); +#else + /* + * Align down here since the VM subsystem insists that the + * memmap entries are valid from the bank start aligned to + * MAX_ORDER_NR_PAGES. + */ + start = round_down(start, MAX_ORDER_NR_PAGES); +#endif + + /* + * If we had a previous bank, and there is a space + * between the current bank and the previous, free it. + */ + if (prev_end && prev_end < start) + free_memmap(prev_end, start); + + /* + * Align up here since the VM subsystem insists that the + * memmap entries are valid from the bank end aligned to + * MAX_ORDER_NR_PAGES. 
+ */ + prev_end = ALIGN(end, MAX_ORDER_NR_PAGES); + } + +#ifdef CONFIG_SPARSEMEM + if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) + free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); +#endif +} + static void __init __free_pages_memory(unsigned long start, unsigned long end) { int order; @@ -2012,6 +2094,7 @@ unsigned long __init memblock_free_all(void) { unsigned long pages; + free_unused_memmap(); reset_all_zones_managed_pages(); pages = free_low_memory_core_early(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29459a6ce1c7..605f671203ef 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -20,6 +20,9 @@ * Lockless page tracking & accounting * Unified hierarchy configuration model * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner + * + * Per memcg lru locking + * Copyright (C) 2020 Alibaba, Inc, Alex Shi */ #include <linux/page_counter.h> @@ -533,7 +536,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; @@ -560,16 +563,7 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - memcg = page->mem_cgroup; - - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. 
- */ - if ((unsigned long) memcg & 0x1UL) - memcg = NULL; + memcg = page_memcg_check(page); while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); @@ -623,14 +617,9 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, if (mz->usage_in_excess < mz_node->usage_in_excess) { p = &(*p)->rb_left; rightmost = false; - } - - /* - * We can't avoid mem cgroups that are over their soft - * limit by the same amount - */ - else if (mz->usage_in_excess >= mz_node->usage_in_excess) + } else { p = &(*p)->rb_right; + } } if (rightmost) @@ -858,7 +847,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, + int val) +{ + struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg = page_memcg(head); + pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; + + /* Untracked pages have no memcg, no lruvec. 
Update only the node */ + if (!memcg) { + __mod_node_page_state(pgdat, idx, val); + return; + } + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + __mod_lruvec_state(lruvec, idx, val); +} +EXPORT_SYMBOL(__mod_lruvec_page_state); + +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; @@ -882,17 +890,6 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) rcu_read_unlock(); } -void mod_memcg_obj_state(void *p, int idx, int val) -{ - struct mem_cgroup *memcg; - - rcu_read_lock(); - memcg = mem_cgroup_from_obj(p); - if (memcg) - mod_memcg_state(memcg, idx, val); - rcu_read_unlock(); -} - /** * __count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup @@ -1055,7 +1052,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm); */ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); if (mem_cgroup_disabled()) return NULL; @@ -1157,12 +1154,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && !reclaim) pos = prev; - if (!root->use_hierarchy && root != root_mem_cgroup) { - if (prev) - goto out; - return root; - } - rcu_read_lock(); if (reclaim) { @@ -1242,7 +1233,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, out_unlock: rcu_read_unlock(); -out: if (prev && prev != root) css_put(&prev->css); @@ -1335,43 +1325,74 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, return ret; } +#ifdef CONFIG_DEBUG_VM +void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) +{ + struct mem_cgroup *memcg; + + if (mem_cgroup_disabled()) + return; + + memcg = page_memcg(page); + + if (!memcg) + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page); + else + VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page); +} +#endif + /** - * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page + * 
lock_page_lruvec - lock and return lruvec for a given page. * @page: the page - * @pgdat: pgdat of the page * - * This function relies on page->mem_cgroup being stable - see the - * access rules in commit_charge(). + * This series functions should be used in either conditions: + * PageLRU is cleared or unset + * or page->_refcount is zero + * or page is locked. */ -struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) +struct lruvec *lock_page_lruvec(struct page *page) { - struct mem_cgroup_per_node *mz; - struct mem_cgroup *memcg; struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); - if (mem_cgroup_disabled()) { - lruvec = &pgdat->__lruvec; - goto out; - } + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock(&lruvec->lru_lock); + rcu_read_unlock(); - memcg = page->mem_cgroup; - /* - * Swapcache readahead pages are added to the LRU - and - * possibly migrated - before they are charged. - */ - if (!memcg) - memcg = root_mem_cgroup; + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irq(struct page *page) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irq(&lruvec->lru_lock); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); + + return lruvec; +} + +struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) +{ + struct lruvec *lruvec; + struct pglist_data *pgdat = page_pgdat(page); + + rcu_read_lock(); + lruvec = mem_cgroup_page_lruvec(page, pgdat); + spin_lock_irqsave(&lruvec->lru_lock, *flags); + rcu_read_unlock(); + + lruvec_memcg_debug(lruvec, page); - mz = mem_cgroup_page_nodeinfo(memcg, page); - lruvec = &mz->lruvec; -out: - /* - * Since a node can be onlined after the mem_cgroup was created, - * we have to be prepared to initialize lruvec->zone here; - * and if offlined then reonlined, we need to 
reinitialize it. - */ - if (unlikely(lruvec->pgdat != pgdat)) - lruvec->pgdat = pgdat; return lruvec; } @@ -1499,6 +1520,7 @@ static struct memory_stat memory_stats[] = { { "anon", PAGE_SIZE, NR_ANON_MAPPED }, { "file", PAGE_SIZE, NR_FILE_PAGES }, { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, + { "pagetables", PAGE_SIZE, NR_PAGETABLE }, { "percpu", 1, MEMCG_PERCPU_B }, { "sock", PAGE_SIZE, MEMCG_SOCK }, { "shmem", PAGE_SIZE, NR_SHMEM }, @@ -1512,6 +1534,8 @@ static struct memory_stat memory_stats[] = { * constant(e.g. powerpc). */ { "anon_thp", 0, NR_ANON_THPS }, + { "file_thp", 0, NR_FILE_THPS }, + { "shmem_thp", 0, NR_SHMEM_THPS }, #endif { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, @@ -1542,7 +1566,9 @@ static int __init memory_stats_init(void) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_ANON_THPS) + if (memory_stats[i].idx == NR_ANON_THPS || + memory_stats[i].idx == NR_FILE_THPS || + memory_stats[i].idx == NR_SHMEM_THPS) memory_stats[i].ratio = HPAGE_PMD_SIZE; #endif VM_BUG_ON(!memory_stats[i].ratio); @@ -2114,7 +2140,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } /** - * lock_page_memcg - lock a page->mem_cgroup binding + * lock_page_memcg - lock a page and memcg binding * @page: the page * * This function protects unlocked LRU pages from being moved to @@ -2146,15 +2172,21 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = head->mem_cgroup; + memcg = page_memcg(head); if (unlikely(!memcg)) return NULL; +#ifdef CONFIG_PROVE_LOCKING + local_irq_save(flags); + might_lock(&memcg->move_lock); + local_irq_restore(flags); +#endif + if (atomic_read(&memcg->moving_account) <= 0) return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != head->mem_cgroup) { + if (memcg != page_memcg(head)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } 
@@ -2192,14 +2224,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) } /** - * unlock_page_memcg - unlock a page->mem_cgroup binding + * unlock_page_memcg - unlock a page and memcg binding * @page: the page */ void unlock_page_memcg(struct page *page) { struct page *head = compound_head(page); - __unlock_page_memcg(head->mem_cgroup); + __unlock_page_memcg(page_memcg(head)); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2889,16 +2921,16 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page->mem_cgroup, page); + VM_BUG_ON_PAGE(page_memcg(page), page); /* - * Any of the following ensures page->mem_cgroup stability: + * Any of the following ensures page's memcg stability: * * - the page lock * - LRU isolation * - lock_page_memcg() * - exclusive reference */ - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; } #ifdef CONFIG_MEMCG_KMEM @@ -2913,8 +2945,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (cmpxchg(&page->obj_cgroups, NULL, - (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) + if (!set_page_objcgs(page, vec)) kfree(vec); else kmemleak_not_leak(vec); @@ -2925,6 +2956,12 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * + * A passed kernel object can be a slab object or a generic kernel page, so + * different mechanisms for getting the memory cgroup pointer should be used. + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller + * can not know for sure how the kernel object is implemented. + * mem_cgroup_from_obj() can be safely used in such cases. + * * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. 
*/ @@ -2938,35 +2975,30 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) page = virt_to_head_page(p); /* - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer - * or a pointer to obj_cgroup vector. In the latter case the lowest - * bit of the pointer is set. - * The page->mem_cgroup pointer can be asynchronously changed - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed - * from a valid memcg pointer to objcg vector or back. - */ - if (!page->mem_cgroup) - return NULL; - - /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in * the page->obj_cgroups. */ - if (page_has_obj_cgroups(page)) { + if (page_objcgs_check(page)) { struct obj_cgroup *objcg; unsigned int off; off = obj_to_index(page->slab_cache, page, p); - objcg = page_obj_cgroups(page)[off]; + objcg = page_objcgs(page)[off]; if (objcg) return obj_cgroup_memcg(objcg); return NULL; } - /* All other pages use page->mem_cgroup */ - return page->mem_cgroup; + /* + * page_memcg_check() is used here, because page_has_obj_cgroups() + * check above could fail because the object cgroups vector wasn't set + * at that moment, but it can be set concurrently. + * page_memcg_check(page) will guarantee that a proper memory + * cgroup pointer or NULL will be returned. 
+ */ + return page_memcg_check(page); } __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) @@ -2987,6 +3019,7 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) objcg = rcu_dereference(memcg->objcg); if (objcg && obj_cgroup_tryget(objcg)) break; + objcg = NULL; } rcu_read_unlock(); @@ -3104,8 +3137,8 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { - page->mem_cgroup = memcg; - __SetPageKmemcg(page); + page->memcg_data = (unsigned long)memcg | + MEMCG_DATA_KMEM; return 0; } css_put(&memcg->css); @@ -3120,7 +3153,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); unsigned int nr_pages = 1 << order; if (!memcg) @@ -3128,12 +3161,8 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); __memcg_kmem_uncharge(memcg, nr_pages); - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&memcg->css); - - /* slab pages do not have PageKmemcg flag set */ - if (PageKmemcg(page)) - __ClearPageKmemcg(page); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -3246,8 +3275,10 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) * independently later. */ rcu_read_lock(); +retry: memcg = obj_cgroup_memcg(objcg); - css_get(&memcg->css); + if (unlikely(!css_tryget(&memcg->css))) + goto retry; rcu_read_unlock(); nr_pages = size >> PAGE_SHIFT; @@ -3272,14 +3303,12 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) #endif /* CONFIG_MEMCG_KMEM */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * Because tail pages are not marked as "used", set it. 
We're under - * pgdat->lru_lock and migration entries setup in all page mappings. + * Because page_memcg(head) is not set on compound tails, set it now. */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct mem_cgroup *memcg = head->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(head); int i; if (mem_cgroup_disabled()) @@ -3287,7 +3316,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { css_get(&memcg->css); - head[i].mem_cgroup = memcg; + head[i].memcg_data = (unsigned long)memcg; } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3470,22 +3499,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, } /* - * Test whether @memcg has children, dead or alive. Note that this - * function doesn't care whether @memcg has use_hierarchy enabled and - * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsibility. - */ -static inline bool memcg_has_children(struct mem_cgroup *memcg) -{ - bool ret; - - rcu_read_lock(); - ret = css_next_child(NULL, &memcg->css); - rcu_read_unlock(); - return ret; -} - -/* * Reclaims as many pages from the given memcg as possible. * * Caller is responsible for holding css reference for memcg. @@ -3533,37 +3546,20 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_css(css)->use_hierarchy; + return 1; } static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - - if (memcg->use_hierarchy == val) + if (val == 1) return 0; - /* - * If parent's use_hierarchy is set, we can't make any modifications - * in the child subtrees. 
If it is unset, then the change can - * occur, provided the current cgroup has no children. - * - * For the root cgroup, parent_mem is NULL, we allow value to be - * set if there are no children. - */ - if ((!parent_memcg || !parent_memcg->use_hierarchy) && - (val == 1 || val == 0)) { - if (!memcg_has_children(memcg)) - memcg->use_hierarchy = val; - else - retval = -EBUSY; - } else - retval = -EINVAL; + pr_warn_once("Non-hierarchical mode is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); - return retval; + return -EINVAL; } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3712,12 +3708,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg) static_branch_enable(&memcg_kmem_enabled_key); - /* - * A memory cgroup is considered kmem-online as soon as it gets - * kmemcg_id. Setting the id after enabling static branching will - * guarantee no one starts accounting before all call sites are - * patched. 
- */ memcg->kmemcg_id = memcg_id; memcg->kmem_state = KMEM_ONLINE; @@ -3757,8 +3747,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) child = mem_cgroup_from_css(css); BUG_ON(child->kmemcg_id != kmemcg_id); child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; } rcu_read_unlock(); @@ -4669,7 +4657,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; @@ -5349,38 +5337,22 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; - } - if (!parent) { - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); - } else if (parent->use_hierarchy) { - memcg->use_hierarchy = true; + page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { - page_counter_init(&memcg->memory, &root_mem_cgroup->memory); - page_counter_init(&memcg->swap, &root_mem_cgroup->swap); - page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); - page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); - /* - * Deeper hierachy with use_hierarchy == false doesn't make - * much sense so let cgroup subsystem know about this - * unfortunate state in our controller. 
- */ - if (parent != root_mem_cgroup) - memory_cgrp_subsys.broken_hierarchy = true; - } + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); - /* The following stuff does not apply to the root */ - if (!parent) { root_mem_cgroup = memcg; return &memcg->css; } + /* The following stuff does not apply to the root */ error = memcg_online_kmem(memcg); if (error) goto fail; @@ -5646,14 +5618,14 @@ static int mem_cgroup_move_account(struct page *page, /* * Prevent mem_cgroup_migrate() from looking at - * page->mem_cgroup of its source page while we change it. + * page's memory cgroup of its source page while we change it. */ ret = -EBUSY; if (!trylock_page(page)) goto out; ret = -EINVAL; - if (page->mem_cgroup != from) + if (page_memcg(page) != from) goto out_unlock; pgdat = page_pgdat(page); @@ -5708,13 +5680,13 @@ static int mem_cgroup_move_account(struct page *page, /* * All state has been migrated, let's switch to the new memcg. * - * It is safe to change page->mem_cgroup here because the page + * It is safe to change page's memcg here because the page * is referenced, charged, isolated, and locked: we can't race * with (un)charging, migration, LRU putback, or anything else - * that would rely on a stable page->mem_cgroup. + * that would rely on a stable page's memory cgroup. * * Note that lock_page_memcg is a memcg lock, not a page lock, - * to save space. As soon as we switch page->mem_cgroup to a + * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. 
*/ @@ -5723,7 +5695,7 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - page->mem_cgroup = to; + page->memcg_data = (unsigned long)to; __unlock_page_memcg(from); @@ -5789,7 +5761,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; @@ -5833,7 +5805,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -6217,24 +6189,6 @@ static void mem_cgroup_move_task(void) } #endif -/* - * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify whether we're attached to the default hierarchy on each mount - * attempt. - */ -static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) -{ - /* - * use_hierarchy is forced on the default hierarchy. cgroup core - * guarantees that @root doesn't have any children, so turning it - * on for the root memcg is enough. 
- */ - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - root_mem_cgroup->use_hierarchy = true; - else - root_mem_cgroup->use_hierarchy = false; -} - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6572,7 +6526,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, - .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, @@ -6779,12 +6732,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. page->mem_cgroup is protected - * by the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. + * already charged pages, too. page and memcg binding is + * protected by the page lock, which serializes swap cache + * removal, which in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound_head(page)->mem_cgroup) + if (page_memcg(compound_head(page))) goto out; id = lookup_swap_cgroup_id(ent); @@ -6868,21 +6821,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page->mem_cgroup) + if (!page_memcg(page)) return; /* * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully + * page_memcg(page) at this point, we have fully * exclusive access to the page. 
*/ - if (ug->memcg != page->mem_cgroup) { + if (ug->memcg != page_memcg(page)) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page->mem_cgroup; + ug->memcg = page_memcg(page); /* pairs with css_put in uncharge_batch */ css_get(&ug->memcg->css); @@ -6891,15 +6844,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) nr_pages = compound_nr(page); ug->nr_pages += nr_pages; - if (!PageKmemcg(page)) { - ug->pgpgout++; - } else { + if (PageMemcgKmem(page)) ug->nr_kmem += nr_pages; - __ClearPageKmemcg(page); - } + else + ug->pgpgout++; ug->dummy_page = page; - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&ug->memcg->css); } @@ -6942,7 +6893,7 @@ void mem_cgroup_uncharge(struct page *page) return; /* Don't touch page->lru of any random page, pre-check: */ - if (!page->mem_cgroup) + if (!page_memcg(page)) return; uncharge_gather_clear(&ug); @@ -6992,11 +6943,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) return; /* Page cache replacement: new page already charged? 
*/ - if (newpage->mem_cgroup) + if (page_memcg(newpage)) return; - /* Swapcache readahead pages can get replaced before being charged */ - memcg = oldpage->mem_cgroup; + memcg = page_memcg(oldpage); + VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); if (!memcg) return; @@ -7188,12 +7139,15 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); + if (mem_cgroup_disabled()) + return; + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; - memcg = page->mem_cgroup; + memcg = page_memcg(page); - /* Readahead page, never charged */ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) return; @@ -7212,7 +7166,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(oldid, page); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - page->mem_cgroup = NULL; + page->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); @@ -7252,12 +7206,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; + if (mem_cgroup_disabled()) + return 0; + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; - memcg = page->mem_cgroup; + memcg = page_memcg(page); - /* Readahead page, never charged */ + VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) return 0; @@ -7336,7 +7293,7 @@ bool mem_cgroup_swap_full(struct page *page) if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg) return false; @@ -7354,9 +7311,9 @@ bool mem_cgroup_swap_full(struct page *page) static int __init setup_swap_account(char *s) { if (!strcmp(s, "1")) - cgroup_memory_noswap = 0; + cgroup_memory_noswap = false; else if (!strcmp(s, "0")) - cgroup_memory_noswap = 1; + cgroup_memory_noswap = true; return 1; } __setup("swapaccount=", setup_swap_account); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 
5d880d4eb9a2..5a38e9eade94 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -263,8 +263,8 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) } /* - * When a unknown page type is encountered drain as many buffers as possible - * in the hope to turn the page into a LRU or free page, which we can handle. + * Unknown page type encountered. Try to check whether it can turn PageLRU by + * lru_add_drain_all, or a free page by reclaiming slabs when possible. */ void shake_page(struct page *p, int access) { @@ -273,9 +273,6 @@ void shake_page(struct page *p, int access) if (!PageSlab(p)) { lru_add_drain_all(); - if (PageLRU(p)) - return; - drain_all_pages(page_zone(p)); if (PageLRU(p) || is_free_buddy_page(p)) return; } @@ -809,7 +806,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) */ static int me_huge_page(struct page *p, unsigned long pfn) { - int res = 0; + int res; struct page *hpage = compound_head(p); struct address_space *mapping; @@ -820,6 +817,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) if (mapping) { res = truncate_error_page(hpage, pfn, mapping); } else { + res = MF_FAILED; unlock_page(hpage); /* * migration entry prevents later access on error anonymous @@ -828,8 +826,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) */ if (PageAnon(hpage)) put_page(hpage); - dissolve_free_huge_page(p); - res = MF_RECOVERED; + if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; + } lock_page(hpage); } @@ -946,13 +946,13 @@ static int page_action(struct page_state *ps, struct page *p, } /** - * get_hwpoison_page() - Get refcount for memory error handling: + * __get_hwpoison_page() - Get refcount for memory error handling: * @page: raw error page (hit by memory error) * * Return: return 0 if failed to grab the refcount, otherwise true (some * non-zero value.) 
*/ -static int get_hwpoison_page(struct page *page) +static int __get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); @@ -983,13 +983,80 @@ static int get_hwpoison_page(struct page *page) } /* + * Safely get reference count of an arbitrary page. + * + * Returns 0 for a free page, 1 for an in-use page, + * -EIO for a page-type we cannot handle and -EBUSY if we raced with an + * allocation. + * We only incremented refcount in case the page was already in-use and it + * is a known type we can handle. + */ +static int get_any_page(struct page *p, unsigned long flags) +{ + int ret = 0, pass = 0; + bool count_increased = false; + + if (flags & MF_COUNT_INCREASED) + count_increased = true; + +try_again: + if (!count_increased && !__get_hwpoison_page(p)) { + if (page_count(p)) { + /* We raced with an allocation, retry. */ + if (pass++ < 3) + goto try_again; + ret = -EBUSY; + } else if (!PageHuge(p) && !is_free_buddy_page(p)) { + /* We raced with put_page, retry. */ + if (pass++ < 3) + goto try_again; + ret = -EIO; + } + } else { + if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) { + ret = 1; + } else { + /* + * A page we cannot handle. Check whether we can turn + * it into something we can handle. + */ + if (pass++ < 3) { + put_page(p); + shake_page(p, 1); + count_increased = false; + goto try_again; + } + put_page(p); + ret = -EIO; + } + } + + return ret; +} + +static int get_hwpoison_page(struct page *p, unsigned long flags, + enum mf_flags ctxt) +{ + int ret; + + zone_pcp_disable(page_zone(p)); + if (ctxt == MF_SOFT_OFFLINE) + ret = get_any_page(p, flags); + else + ret = __get_hwpoison_page(p); + zone_pcp_enable(page_zone(p)); + + return ret; +} + +/* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. 
*/ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, int flags, struct page **hpagep) { - enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_IGNORE_MLOCK; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success = true; @@ -1162,7 +1229,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) num_poisoned_pages_inc(); - if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) { /* * Check "filter hit" and "race with other subpage." */ @@ -1176,9 +1243,13 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) } } unlock_page(head); - dissolve_free_huge_page(p); - action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED); - return 0; + res = MF_FAILED; + if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; + } + action_result(pfn, MF_MSG_FREE_HUGE, res); + return res == MF_RECOVERED ? 0 : -EBUSY; } lock_page(head); @@ -1231,6 +1302,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, loff_t start; dax_entry_t cookie; + if (flags & MF_COUNT_INCREASED) + /* + * Drop the extra refcount in case we come from madvise(). 
+ */ + put_page(page); + /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by @@ -1319,6 +1396,7 @@ int memory_failure(unsigned long pfn, int flags) struct dev_pagemap *pgmap; int res; unsigned long page_flags; + bool retry = true; if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1336,6 +1414,7 @@ int memory_failure(unsigned long pfn, int flags) return -ENXIO; } +try_again: if (PageHuge(p)) return memory_failure_hugetlb(pfn, flags); if (TestSetPageHWPoison(p)) { @@ -1358,10 +1437,23 @@ int memory_failure(unsigned long pfn, int flags) * In fact it's dangerous to directly bump up page count from 0, * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. */ - if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) { if (is_free_buddy_page(p)) { - action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); - return 0; + if (take_page_off_buddy(p)) { + page_ref_inc(p); + res = MF_RECOVERED; + } else { + /* We lost the race, try again */ + if (retry) { + ClearPageHWPoison(p); + num_poisoned_pages_dec(); + retry = false; + goto try_again; + } + res = MF_FAILED; + } + action_result(pfn, MF_MSG_BUDDY, res); + return res == MF_RECOVERED ? 0 : -EBUSY; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); return -EBUSY; @@ -1385,14 +1477,6 @@ int memory_failure(unsigned long pfn, int flags) * walked by the page reclaim code, however that's not a big loss. */ shake_page(p, 0); - /* shake_page could have turned it free. 
*/ - if (!PageLRU(p) && is_free_buddy_page(p)) { - if (flags & MF_COUNT_INCREASED) - action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); - else - action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED); - return 0; - } lock_page(p); @@ -1596,6 +1680,7 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; + unsigned long flags = 0; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1640,7 +1725,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - if (!get_hwpoison_page(p)) { + if (!get_hwpoison_page(p, flags, 0)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", @@ -1671,75 +1756,6 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -/* - * Safely get reference count of an arbitrary page. - * Returns 0 for a free page, -EIO for a zero refcount page - * that is not free, and 1 for any other page type. - * For 1 the page is returned with increased page count, otherwise not. - */ -static int __get_any_page(struct page *p, unsigned long pfn, int flags) -{ - int ret; - - if (flags & MF_COUNT_INCREASED) - return 1; - - /* - * When the target page is a free hugepage, just remove it - * from free hugepage list. 
- */ - if (!get_hwpoison_page(p)) { - if (PageHuge(p)) { - pr_info("%s: %#lx free huge page\n", __func__, pfn); - ret = 0; - } else if (is_free_buddy_page(p)) { - pr_info("%s: %#lx free buddy page\n", __func__, pfn); - ret = 0; - } else if (page_count(p)) { - /* raced with allocation */ - ret = -EBUSY; - } else { - pr_info("%s: %#lx: unknown zero refcount page type %lx\n", - __func__, pfn, p->flags); - ret = -EIO; - } - } else { - /* Not a free page */ - ret = 1; - } - return ret; -} - -static int get_any_page(struct page *page, unsigned long pfn, int flags) -{ - int ret = __get_any_page(page, pfn, flags); - - if (ret == -EBUSY) - ret = __get_any_page(page, pfn, flags); - - if (ret == 1 && !PageHuge(page) && - !PageLRU(page) && !__PageMovable(page)) { - /* - * Try to free it. - */ - put_page(page); - shake_page(page, 1); - - /* - * Did it turn free? - */ - ret = __get_any_page(page, pfn, 0); - if (ret == 1 && !PageLRU(page)) { - /* Drop page reference which is from __get_any_page() */ - put_page(page); - pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n", - pfn, page->flags, &page->flags); - return -EIO; - } - } - return ret; -} - static bool isolate_page(struct page *page, struct list_head *pagelist) { bool isolated = false; @@ -1839,11 +1855,11 @@ static int __soft_offline_page(struct page *page) pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n", pfn, msg_page[huge], ret, page->flags, &page->flags); if (ret > 0) - ret = -EIO; + ret = -EBUSY; } } else { - pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n", - pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags); + pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n", + pfn, msg_page[huge], page_count(page), page->flags, &page->flags); ret = -EBUSY; } return ret; @@ -1905,7 +1921,7 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; if (PageHWPoison(page)) { - pr_info("soft 
offline: %#lx page already poisoned\n", pfn); + pr_info("%s: %#lx page already poisoned\n", __func__, pfn); if (flags & MF_COUNT_INCREASED) put_page(page); return 0; @@ -1913,16 +1929,20 @@ int soft_offline_page(unsigned long pfn, int flags) retry: get_online_mems(); - ret = get_any_page(page, pfn, flags); + ret = get_hwpoison_page(page, flags, MF_SOFT_OFFLINE); put_online_mems(); - if (ret > 0) + if (ret > 0) { ret = soft_offline_in_use_page(page); - else if (ret == 0) + } else if (ret == 0) { if (soft_offline_free_page(page) && try_again) { try_again = false; goto retry; } + } else if (ret == -EIO) { + pr_info("%s: %#lx: unknown page type: %lx (%pGP)\n", + __func__, pfn, page->flags, &page->flags); + } return ret; } diff --git a/mm/memory.c b/mm/memory.c index c48f8df6e502..7d608765932b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1171,6 +1171,15 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); + /* + * Disabling preemption is not needed for the write side, as + * the read side doesn't spin, but goes to the mmap_lock. + * + * Use the raw variant of the seqcount_t write API to avoid + * lockdep complaining about preemptibility. 
+ */ + mmap_assert_write_locked(src_mm); + raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; @@ -1187,8 +1196,10 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow) + if (is_cow) { + raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } return ret; } @@ -4696,9 +4707,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) +int follow_pte(struct mm_struct *mm, unsigned long address, + struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, + spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4763,32 +4774,6 @@ out: return -EINVAL; } -static inline int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) -{ - int res; - - /* (void) is needed to make gcc happy */ - (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, NULL, - ptepp, NULL, ptlp))); - return res; -} - -int follow_pte_pmd(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, - pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp) -{ - int res; - - /* (void) is needed to make gcc happy */ - (void) __cond_lock(*ptlp, - !(res = __follow_pte_pmd(mm, address, range, - ptepp, pmdpp, ptlp))); - return res; -} -EXPORT_SYMBOL(follow_pte_pmd); - /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping @@ -4809,7 +4794,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) return ret; - ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl); if (ret) return ret; *pfn = pte_pfn(*ptep); @@ -4830,7 +4815,7 @@ int 
follow_phys(struct vm_area_struct *vma, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl)) goto out; pte = *ptep; @@ -4874,11 +4859,10 @@ EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* - * Access another process' address space as given in mm. If non-NULL, use the - * given task for page fault accounting. + * Access another process' address space as given in mm. */ -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; void *old_buf = buf; @@ -4955,7 +4939,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -4973,7 +4957,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, if (!mm) return 0; - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2b6cc42ba0a3..af41fb990820 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -596,8 +596,7 @@ void generic_online_page(struct page *page, unsigned int order) * so we should map it first. This is better than introducing a special * case in page freeing fast path. 
*/ - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); __free_pages_core(page, order); totalram_pages_add(1UL << order); #ifdef CONFIG_HIGHMEM @@ -1304,7 +1303,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (WARN_ON(PageLRU(page))) isolate_lru_page(page); if (page_mapped(page)) - try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); + try_to_unmap(page, TTU_IGNORE_MLOCK); continue; } @@ -1492,13 +1491,19 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) } node = zone_to_nid(zone); + /* + * Disable pcplists so that page isolation cannot race with freeing + * in a way that pages from isolated pageblock are left on pcplists. + */ + zone_pcp_disable(zone); + /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, MEMORY_OFFLINE | REPORT_FAILURE); if (ret) { reason = "failure to isolate range"; - goto failed_removal; + goto failed_removal_pcplists_disabled; } arg.start_pfn = start_pfn; @@ -1550,26 +1555,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) goto failed_removal_isolated; } - /* - * per-cpu pages are drained in start_isolate_page_range, but if - * there are still pages that are not free, make sure that we - * drain again, because when we isolated range we might - * have raced with another thread that was adding pages to pcp - * list. - * - * Forward progress should be still guaranteed because - * pages on the pcp list can only belong to MOVABLE_ZONE - * because has_unmovable_pages explicitly checks for - * PageBuddy on freed pages on other zones. - */ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE); - if (ret) - drain_all_pages(zone); + } while (ret); /* Mark all sections offline and remove free pages from the buddy. 
*/ __offline_isolated_pages(start_pfn, end_pfn); - pr_info("Offlined Pages %ld\n", nr_pages); + pr_debug("Offlined Pages %ld\n", nr_pages); /* * The memory sections are marked offline, and the pageblock flags @@ -1580,6 +1572,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages; spin_unlock_irqrestore(&zone->lock, flags); + zone_pcp_enable(zone); + /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages); zone->present_pages -= nr_pages; @@ -1612,6 +1606,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) failed_removal_isolated: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); memory_notify(MEM_CANCEL_OFFLINE, &arg); +failed_removal_pcplists_disabled: + zone_pcp_enable(zone); failed_removal: pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", (unsigned long long) start_pfn << PAGE_SHIFT, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3ca4898f3f24..8cf96bd21341 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1114,9 +1114,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, int err; nodemask_t tmp; - err = migrate_prep(); - if (err) - return err; + migrate_prep(); mmap_read_lock(mm); @@ -1315,9 +1313,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - err = migrate_prep(); - if (err) - goto mpol_out; + migrate_prep(); } { NODEMASK_SCRATCH(scratch); diff --git a/mm/mempool.c b/mm/mempool.c index f473cdddaff0..624ed51b060f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element) static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_poison_kfree(element, _RET_IP_); + kasan_slab_free_mempool(element, _RET_IP_); else if (pool->alloc 
== mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } @@ -112,7 +112,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) static void kasan_unpoison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_unpoison_slab(element); + kasan_unpoison_range(element, __ksize(element)); else if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } diff --git a/mm/migrate.c b/mm/migrate.c index 5795cb82e27c..ee5e612b4cd8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -62,7 +62,7 @@ * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is * undesirable, use migrate_prep_local() */ -int migrate_prep(void) +void migrate_prep(void) { /* * Clear the LRU lists so pages can be isolated. @@ -71,16 +71,12 @@ int migrate_prep(void) * pages that may be busy. */ lru_add_drain_all(); - - return 0; } /* Do the necessary work of migrate_prep but not if it involves other CPUs */ -int migrate_prep_local(void) +void migrate_prep_local(void) { lru_add_drain(); - - return 0; } int isolate_movable_page(struct page *page, isolate_mode_t mode) @@ -1106,7 +1102,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * and treated as swapcache but it has no rmap yet. * Calling try_to_unmap() against a page->mapping==NULL page will * trigger a BUG. So handle it here. - * 2. An orphaned page (see truncate_complete_page) might have + * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory * offlining. Everywhere else except page reclaim, the page is * invisible to the vm, so the page can not be migrated. 
So try to @@ -1122,8 +1118,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, /* Establish migration ptes */ VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, page); - try_to_unmap(page, - TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK); page_was_mapped = 1; } @@ -1169,13 +1164,14 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *page, int force, enum migrate_mode mode, - enum migrate_reason reason) + enum migrate_reason reason, + struct list_head *ret) { int rc = MIGRATEPAGE_SUCCESS; struct page *newpage = NULL; if (!thp_migration_supported() && PageTransHuge(page)) - return -ENOMEM; + return -ENOSYS; if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ @@ -1206,7 +1202,14 @@ out: * migrated will have kept its references and be restored. */ list_del(&page->lru); + } + /* + * If migration is successful, releases reference grabbed during + * isolation. Otherwise, restore the page to right list unless + * we want to retry. + */ + if (rc == MIGRATEPAGE_SUCCESS) { /* * Compaction can migrate also non-LRU pages which are * not accounted to NR_ISOLATED_*. They can be recognized @@ -1215,35 +1218,16 @@ out: if (likely(!__PageMovable(page))) mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), -thp_nr_pages(page)); - } - /* - * If migration is successful, releases reference grabbed during - * isolation. Otherwise, restore the page to right list unless - * we want to retry. - */ - if (rc == MIGRATEPAGE_SUCCESS) { if (reason != MR_MEMORY_FAILURE) /* * We release the page in page_handle_poison. 
*/ put_page(page); } else { - if (rc != -EAGAIN) { - if (likely(!__PageMovable(page))) { - putback_lru_page(page); - goto put_new; - } + if (rc != -EAGAIN) + list_add_tail(&page->lru, ret); - lock_page(page); - if (PageMovable(page)) - putback_movable_page(page); - else - __ClearPageIsolated(page); - unlock_page(page); - put_page(page); - } -put_new: if (put_new_page) put_new_page(newpage, private); else @@ -1274,7 +1258,8 @@ put_new: static int unmap_and_move_huge_page(new_page_t get_new_page, free_page_t put_new_page, unsigned long private, struct page *hpage, int force, - enum migrate_mode mode, int reason) + enum migrate_mode mode, int reason, + struct list_head *ret) { int rc = -EAGAIN; int page_was_mapped = 0; @@ -1290,7 +1275,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * kicking migration. */ if (!hugepage_migration_supported(page_hstate(hpage))) { - putback_active_hugepage(hpage); + list_move_tail(&hpage->lru, ret); return -ENOSYS; } @@ -1329,8 +1314,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (page_mapped(hpage)) { bool mapping_locked = false; - enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK| - TTU_IGNORE_ACCESS; + enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK; if (!PageAnon(hpage)) { /* @@ -1376,8 +1360,10 @@ put_anon: out_unlock: unlock_page(hpage); out: - if (rc != -EAGAIN) + if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); + else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS) + list_move_tail(&hpage->lru, ret); /* * If migration was not successful and there's a freeing callback, use @@ -1392,6 +1378,20 @@ out: return rc; } +static inline int try_split_thp(struct page *page, struct page **page2, + struct list_head *from) +{ + int rc = 0; + + lock_page(page); + rc = split_huge_page_to_list(page, from); + unlock_page(page); + if (!rc) + list_safe_reset_next(page, *page2, lru); + + return rc; +} + /* * migrate_pages - migrate the pages specified in a list, to the free pages * supplied 
as the target for the page migration @@ -1408,8 +1408,8 @@ out: * * The function returns after 10 attempts or if no pages are movable any more * because the list has become empty or no retryable pages exist any more. - * The caller should call putback_movable_pages() to return pages to the LRU - * or free list only if ret != 0. + * It is caller's responsibility to call putback_movable_pages() to return pages + * to the LRU or free list only if ret != 0. * * Returns the number of pages that were not migrated, or an error code. */ @@ -1430,6 +1430,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, struct page *page2; int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; + LIST_HEAD(ret_pages); if (!swapwrite) current->flags |= PF_SWAPWRITE; @@ -1452,31 +1453,56 @@ retry: if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, put_new_page, private, page, - pass > 2, mode, reason); + pass > 2, mode, reason, + &ret_pages); else rc = unmap_and_move(get_new_page, put_new_page, private, page, pass > 2, mode, - reason); - + reason, &ret_pages); + /* + * The rules are: + * Success: non hugetlb page will be freed, hugetlb + * page will be put back + * -EAGAIN: stay on the from list + * -ENOMEM: stay on the from list + * Other errno: put on ret_pages list then splice to + * from list + */ switch(rc) { + /* + * THP migration might be unsupported or the + * allocation could've failed so we should + * retry on the same page with the THP split + * to base pages. + * + * Head page is retried immediately and tail + * pages are added to the tail of the list so + * we encounter them after the rest of the list + * is processed. 
+ */ + case -ENOSYS: + /* THP migration is unsupported */ + if (is_thp) { + if (!try_split_thp(page, &page2, from)) { + nr_thp_split++; + goto retry; + } + + nr_thp_failed++; + nr_failed += nr_subpages; + break; + } + + /* Hugetlb migration is unsupported */ + nr_failed++; + break; case -ENOMEM: /* - * THP migration might be unsupported or the - * allocation could've failed so we should - * retry on the same page with the THP split - * to base pages. - * - * Head page is retried immediately and tail - * pages are added to the tail of the list so - * we encounter them after the rest of the list - * is processed. + * When memory is low, don't bother to try to migrate + * other pages, just exit. */ if (is_thp) { - lock_page(page); - rc = split_huge_page_to_list(page, from); - unlock_page(page); - if (!rc) { - list_safe_reset_next(page, page2, lru); + if (!try_split_thp(page, &page2, from)) { nr_thp_split++; goto retry; } @@ -1504,7 +1530,7 @@ retry: break; default: /* - * Permanent failure (-EBUSY, -ENOSYS, etc.): + * Permanent failure (-EBUSY, etc.): * unlike -EAGAIN case, the failed page is * removed from migration page list and not * retried in the next outer loop. @@ -1523,6 +1549,12 @@ retry: nr_thp_failed += thp_retry; rc = nr_failed; out: + /* + * Put the permanent failure page back to migration list, they + * will be put back to the right list by the caller. + */ + list_splice(&ret_pages, from); + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); @@ -1698,7 +1730,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node, * Positive err means the number of failed * pages to migrate. Since we are going to * abort and return the number of non-migrated - * pages, so need to incude the rest of the + * pages, so need to include the rest of the * nr_pages that have not been attempted as * well. 
*/ @@ -2065,6 +2097,17 @@ bool pmd_trans_migrating(pmd_t pmd) return PageLocked(page); } +static inline bool is_shared_exec_page(struct vm_area_struct *vma, + struct page *page) +{ + if (page_mapcount(page) != 1 && + (page_is_file_lru(page) || vma_is_shmem(vma)) && + (vma->vm_flags & VM_EXEC)) + return true; + + return false; +} + /* * Attempt to migrate a misplaced page to the specified destination * node. Caller is expected to have an elevated reference count on @@ -2082,8 +2125,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (page_mapcount(page) != 1 && page_is_file_lru(page) && - (vma->vm_flags & VM_EXEC)) + if (is_shared_exec_page(vma, page)) goto out; /* @@ -2138,6 +2180,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; + if (is_shared_exec_page(vma, page)) + goto out; + new_page = alloc_pages_node(node, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE), HPAGE_PMD_ORDER); @@ -2249,6 +2294,7 @@ out_fail: out_unlock: unlock_page(page); +out: put_page(page); return 0; } @@ -2548,7 +2594,7 @@ static bool migrate_vma_check_page(struct page *page) * will bump the page reference count. Sadly there is no way to * differentiate a regular pin from migration wait. Hence to * avoid 2 racing thread trying to migrate back to CPU to enter - * infinite loop (one stoping migration because the other is + * infinite loop (one stopping migration because the other is * waiting on pte migration entry). We always return true here. 
* * FIXME proper solution is to rework migration_entry_wait() so @@ -2688,7 +2734,7 @@ static void migrate_vma_prepare(struct migrate_vma *migrate) */ static void migrate_vma_unmap(struct migrate_vma *migrate) { - int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK; const unsigned long npages = migrate->npages; const unsigned long start = migrate->start; unsigned long addr, i, restore = 0; @@ -2848,8 +2894,7 @@ EXPORT_SYMBOL(migrate_vma_setup); static void migrate_vma_insert_page(struct migrate_vma *migrate, unsigned long addr, struct page *page, - unsigned long *src, - unsigned long *dst) + unsigned long *src) { struct vm_area_struct *vma = migrate->vma; struct mm_struct *mm = vma->vm_mm; @@ -3003,16 +3048,14 @@ void migrate_vma_pages(struct migrate_vma *migrate) if (!notified) { notified = true; - mmu_notifier_range_init(&range, - MMU_NOTIFY_CLEAR, 0, - NULL, - migrate->vma->vm_mm, - addr, migrate->end); + mmu_notifier_range_init_migrate(&range, 0, + migrate->vma, migrate->vma->vm_mm, + addr, migrate->end, + migrate->pgmap_owner); mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i], - &migrate->dst[i]); + &migrate->src[i]); continue; } diff --git a/mm/mlock.c b/mm/mlock.c index 884b1216da6a..55b3b3672977 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -106,26 +106,6 @@ void mlock_vma_page(struct page *page) } /* - * Isolate a page from LRU with optional get_page() pin. - * Assumes lru_lock already held and page already pinned. - */ -static bool __munlock_isolate_lru_page(struct page *page, bool getpage) -{ - if (PageLRU(page)) { - struct lruvec *lruvec; - - lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page)); - if (getpage) - get_page(page); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); - return true; - } - - return false; -} - -/* * Finish munlock after successful page isolation * * Page must be locked. 
This is a wrapper for try_to_munlock() @@ -187,40 +167,24 @@ static void __munlock_isolation_failed(struct page *page) unsigned int munlock_vma_page(struct page *page) { int nr_pages; - pg_data_t *pgdat = page_pgdat(page); /* For try_to_munlock() and to serialize with page migration */ BUG_ON(!PageLocked(page)); - VM_BUG_ON_PAGE(PageTail(page), page); - /* - * Serialize with any parallel __split_huge_page_refcount() which - * might otherwise copy PageMlocked to part of the tail pages before - * we clear it in the head page. It also stabilizes thp_nr_pages(). - */ - spin_lock_irq(&pgdat->lru_lock); - if (!TestClearPageMlocked(page)) { /* Potentially, PTE-mapped THP: do not skip the rest PTEs */ - nr_pages = 1; - goto unlock_out; + return 0; } nr_pages = thp_nr_pages(page); - __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); + mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); - if (__munlock_isolate_lru_page(page, true)) { - spin_unlock_irq(&pgdat->lru_lock); + if (!isolate_lru_page(page)) __munlock_isolated_page(page); - goto out; - } - __munlock_isolation_failed(page); - -unlock_out: - spin_unlock_irq(&pgdat->lru_lock); + else + __munlock_isolation_failed(page); -out: return nr_pages - 1; } @@ -298,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) int nr = pagevec_count(pvec); int delta_munlocked = -nr; struct pagevec pvec_putback; + struct lruvec *lruvec = NULL; int pgrescued = 0; pagevec_init(&pvec_putback); /* Phase 1: page isolation */ - spin_lock_irq(&zone->zone_pgdat->lru_lock); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; @@ -312,9 +276,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) * We already have pin from follow_page_mask() * so we can spare the get_page() here. 
*/ - if (__munlock_isolate_lru_page(page, false)) + if (TestClearPageLRU(page)) { + lruvec = relock_page_lruvec_irq(page, lruvec); + del_page_from_lru_list(page, lruvec, + page_lru(page)); continue; - else + } else __munlock_isolation_failed(page); } else { delta_munlocked++; @@ -329,8 +296,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) pagevec_add(&pvec_putback, pvec->pages[i]); pvec->pages[i] = NULL; } - __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); - spin_unlock_irq(&zone->zone_pgdat->lru_lock); + if (lruvec) { + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + unlock_page_lruvec_irq(lruvec); + } else if (delta_munlocked) { + mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); + } /* Now we can release pins of pages that we are not munlocking */ pagevec_release(&pvec_putback); diff --git a/mm/mm_init.c b/mm/mm_init.c index b06a30fbedff..8e02e865cc65 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -173,6 +173,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, case MEM_ONLINE: case MEM_OFFLINE: mm_compute_batch(sysctl_overcommit_memory); + break; default: break; } diff --git a/mm/mmap.c b/mm/mmap.c index 5c8b4485860d..dc7206032387 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1897,8 +1897,8 @@ out: return addr; unmap_and_free_vma: + fput(vma->vm_file); vma->vm_file = NULL; - fput(file); /* Undo any partial mapping done by a device driver. 
*/ unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); @@ -2731,8 +2731,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - if (vma->vm_ops && vma->vm_ops->split) { - err = vma->vm_ops->split(vma, addr); + if (vma->vm_ops && vma->vm_ops->may_split) { + err = vma->vm_ops->may_split(vma, addr); if (err) return err; } @@ -3405,10 +3405,14 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } -static int special_mapping_mremap(struct vm_area_struct *new_vma) +static int special_mapping_mremap(struct vm_area_struct *new_vma, + unsigned long flags) { struct vm_special_mapping *sm = new_vma->vm_private_data; + if (flags & MREMAP_DONTUNMAP) + return -EINVAL; + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) return -EFAULT; @@ -3418,6 +3422,17 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma) return 0; } +static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Forbid splitting special mappings - kernel has expectations over + * the number of pages in mapping. Together with VM_DONTEXPAND + * the size of vma should stay the same over the special mapping's + * lifetime. 
+ */ + return -EINVAL; +} + static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, @@ -3425,6 +3440,7 @@ static const struct vm_operations_struct special_mapping_vmops = { .name = special_mapping_name, /* vDSO code relies that VVAR can't be accessed remotely */ .access = NULL, + .may_split = special_mapping_split, }; static const struct vm_operations_struct legacy_special_mapping_vmops = { diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c new file mode 100644 index 000000000000..dcdde4f722a4 --- /dev/null +++ b/mm/mmap_lock.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +#define CREATE_TRACE_POINTS +#include <trace/events/mmap_lock.h> + +#include <linux/mm.h> +#include <linux/cgroup.h> +#include <linux/memcontrol.h> +#include <linux/mmap_lock.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/smp.h> +#include <linux/trace_events.h> + +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); + +#ifdef CONFIG_MEMCG + +/* + * Our various events all share the same buffer (because we don't want or need + * to allocate a set of buffers *per event type*), so we need to protect against + * concurrent _reg() and _unreg() calls, and count how many _reg() calls have + * been made. + */ +static DEFINE_MUTEX(reg_lock); +static int reg_refcount; /* Protected by reg_lock. */ + +/* + * Size of the buffer for memcg path names. Ignoring stack trace support, + * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. + */ +#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL + +/* + * How many contexts our trace events might be called in: normal, softirq, irq, + * and NMI. 
+ */ +#define CONTEXT_COUNT 4 + +static DEFINE_PER_CPU(char __rcu *, memcg_path_buf); +static char **tmp_bufs; +static DEFINE_PER_CPU(int, memcg_path_buf_idx); + +/* Called with reg_lock held. */ +static void free_memcg_path_bufs(void) +{ + int cpu; + char **old = tmp_bufs; + + for_each_possible_cpu(cpu) { + *(old++) = rcu_dereference_protected( + per_cpu(memcg_path_buf, cpu), + lockdep_is_held(&reg_lock)); + rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL); + } + + /* Wait for inflight memcg_path_buf users to finish. */ + synchronize_rcu(); + + old = tmp_bufs; + for_each_possible_cpu(cpu) { + kfree(*(old++)); + } + + kfree(tmp_bufs); + tmp_bufs = NULL; +} + +int trace_mmap_lock_reg(void) +{ + int cpu; + char *new; + + mutex_lock(&reg_lock); + + /* If the refcount is going 0->1, proceed with allocating buffers. */ + if (reg_refcount++) + goto out; + + tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs), + GFP_KERNEL); + if (tmp_bufs == NULL) + goto out_fail; + + for_each_possible_cpu(cpu) { + new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); + if (new == NULL) + goto out_fail_free; + rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new); + /* Don't need to wait for inflights, they'd have gotten NULL. */ + } + +out: + mutex_unlock(&reg_lock); + return 0; + +out_fail_free: + free_memcg_path_bufs(); +out_fail: + /* Since we failed, undo the earlier ref increment. */ + --reg_refcount; + + mutex_unlock(&reg_lock); + return -ENOMEM; +} + +void trace_mmap_lock_unreg(void) +{ + mutex_lock(&reg_lock); + + /* If the refcount is going 1->0, proceed with freeing buffers. 
*/ + if (--reg_refcount) + goto out; + + free_memcg_path_bufs(); + +out: + mutex_unlock(&reg_lock); +} + +static inline char *get_memcg_path_buf(void) +{ + char *buf; + int idx; + + rcu_read_lock(); + buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf)); + if (buf == NULL) { + rcu_read_unlock(); + return NULL; + } + idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) - + MEMCG_PATH_BUF_SIZE; + return &buf[idx]; +} + +static inline void put_memcg_path_buf(void) +{ + this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE); + rcu_read_unlock(); +} + +/* + * Write the given mm_struct's memcg path to a percpu buffer, and return a + * pointer to it. If the path cannot be determined, or no buffer was available + * (because the trace event is being unregistered), NULL is returned. + * + * Note: buffers are allocated per-cpu to avoid locking, so preemption must be + * disabled by the caller before calling us, and re-enabled only after the + * caller is done with the pointer. + * + * The caller must call put_memcg_path_buf() once the buffer is no longer + * needed. This must be done while preemption is still disabled. + */ +static const char *get_mm_memcg_path(struct mm_struct *mm) +{ + char *buf = NULL; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + + if (memcg == NULL) + goto out; + if (unlikely(memcg->css.cgroup == NULL)) + goto out_put; + + buf = get_memcg_path_buf(); + if (buf == NULL) + goto out_put; + + cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE); + +out_put: + css_put(&memcg->css); +out: + return buf; +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + const char *memcg_path; \ + preempt_disable(); \ + memcg_path = get_mm_memcg_path(mm); \ + trace_mmap_lock_##type(mm, \ + memcg_path != NULL ? 
memcg_path : "", \ + ##__VA_ARGS__); \ + if (likely(memcg_path != NULL)) \ + put_memcg_path_buf(); \ + preempt_enable(); \ + } while (0) + +#else /* !CONFIG_MEMCG */ + +int trace_mmap_lock_reg(void) +{ + return 0; +} + +void trace_mmap_lock_unreg(void) +{ +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) + +#endif /* CONFIG_MEMCG */ + +/* + * Trace calls must be in a separate file, as otherwise there's a circular + * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. + */ + +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); + +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, + bool success) +{ + TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); + +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(released, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_released); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5654dd19addc..61ee40ed804e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -612,13 +612,6 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, mmap_assert_write_locked(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); - if (IS_ENABLED(CONFIG_LOCKDEP)) { - fs_reclaim_acquire(GFP_KERNEL); - lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); - lock_map_release(&__mmu_notifier_invalidate_range_start_map); - fs_reclaim_release(GFP_KERNEL); - } - if (!mm->notifier_subscriptions) { /* * kmalloc cannot be called under mm_take_all_locks(), but we diff --git a/mm/mmzone.c b/mm/mmzone.c index 4686fdc23bb9..eb89d6e018e2 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -72,25 +72,12 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z, return z; } -#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL 
-bool memmap_valid_within(unsigned long pfn, - struct page *page, struct zone *zone) -{ - if (page_to_pfn(page) != pfn) - return false; - - if (page_zone(page) != zone) - return false; - - return true; -} -#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - void lruvec_init(struct lruvec *lruvec) { enum lru_list lru; memset(lruvec, 0, sizeof(struct lruvec)); + spin_lock_init(&lruvec->lru_lock); for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); diff --git a/mm/mprotect.c b/mm/mprotect.c index 56c02beb6041..ab709023e9aa 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -616,9 +616,16 @@ static int do_mprotect_pkey(unsigned long start, size_t len, tmp = vma->vm_end; if (tmp > end) tmp = end; + + if (vma->vm_ops && vma->vm_ops->mprotect) + error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags); + if (error) + goto out; + error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; + nstart = tmp; if (nstart < prev->vm_end) diff --git a/mm/mremap.c b/mm/mremap.c index 138abbae4f75..c5590afe7165 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -30,12 +30,11 @@ #include "internal.h" -static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; - pmd_t *pmd; pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) @@ -49,6 +48,18 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) if (pud_none_or_clear_bad(pud)) return NULL; + return pud; +} + +static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = get_old_pud(mm, addr); + if (!pud) + return NULL; + pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; @@ -56,19 +67,27 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, +static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma, 
unsigned long addr) { pgd_t *pgd; p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; - pud = pud_alloc(mm, p4d, addr); + + return pud_alloc(mm, p4d, addr); +} + +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = alloc_new_pud(mm, vma, addr); if (!pud) return NULL; @@ -249,14 +268,148 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, return true; } +#else +static inline bool move_normal_pmd(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, + pmd_t *new_pmd) +{ + return false; +} #endif +#ifdef CONFIG_HAVE_MOVE_PUD +static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pud_t pud; + + /* + * The destination pud shouldn't be established, free_pgtables() + * should have released it. + */ + if (WARN_ON_ONCE(!pud_none(*new_pud))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_lock prevents deadlock. 
+ */ + old_ptl = pud_lock(vma->vm_mm, old_pud); + new_ptl = pud_lockptr(mm, new_pud); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pud */ + pud = *old_pud; + pud_clear(old_pud); + + VM_BUG_ON(!pud_none(*new_pud)); + + /* Set the new pud */ + set_pud_at(mm, new_addr, new_pud, pud); + flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#else +static inline bool move_normal_pud(struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, + pud_t *new_pud) +{ + return false; +} +#endif + +enum pgt_entry { + NORMAL_PMD, + HPAGE_PMD, + NORMAL_PUD, +}; + +/* + * Returns an extent of the corresponding size for the pgt_entry specified if + * valid. Else returns a smaller extent bounded by the end of the source and + * destination pgt_entry. + */ +static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr, + unsigned long old_end, unsigned long new_addr) +{ + unsigned long next, extent, mask, size; + + switch (entry) { + case HPAGE_PMD: + case NORMAL_PMD: + mask = PMD_MASK; + size = PMD_SIZE; + break; + case NORMAL_PUD: + mask = PUD_MASK; + size = PUD_SIZE; + break; + default: + BUILD_BUG(); + break; + } + + next = (old_addr + size) & mask; + /* even if next overflowed, extent below will be ok */ + extent = (next > old_end) ? old_end - old_addr : next - old_addr; + next = (new_addr + size) & mask; + if (extent > next - new_addr) + extent = next - new_addr; + return extent; +} + +/* + * Attempts to speedup the move by moving entry at the level corresponding to + * pgt_entry. Returns true if the move was successful, else false. 
+ */ +static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, + unsigned long old_addr, unsigned long new_addr, + void *old_entry, void *new_entry, bool need_rmap_locks) +{ + bool moved = false; + + /* See comment in move_ptes() */ + if (need_rmap_locks) + take_rmap_locks(vma); + + switch (entry) { + case NORMAL_PMD: + moved = move_normal_pmd(vma, old_addr, new_addr, old_entry, + new_entry); + break; + case NORMAL_PUD: + moved = move_normal_pud(vma, old_addr, new_addr, old_entry, + new_entry); + break; + case HPAGE_PMD: + moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + move_huge_pmd(vma, old_addr, new_addr, old_entry, + new_entry); + break; + default: + WARN_ON_ONCE(1); + break; + } + + if (need_rmap_locks) + drop_rmap_locks(vma); + + return moved; +} + unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks) { - unsigned long extent, next, old_end; + unsigned long extent, old_end; struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; @@ -269,53 +422,50 @@ unsigned long move_page_tables(struct vm_area_struct *vma, for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); - next = (old_addr + PMD_SIZE) & PMD_MASK; - /* even if next overflowed, extent below will be ok */ - extent = next - old_addr; - if (extent > old_end - old_addr) - extent = old_end - old_addr; - next = (new_addr + PMD_SIZE) & PMD_MASK; - if (extent > next - new_addr) - extent = next - new_addr; + /* + * If extent is PUD-sized try to speed up the move by moving at the + * PUD level if possible. 
+ */ + extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr); + if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { + pud_t *old_pud, *new_pud; + + old_pud = get_old_pud(vma->vm_mm, old_addr); + if (!old_pud) + continue; + new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr); + if (!new_pud) + break; + if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr, + old_pud, new_pud, need_rmap_locks)) + continue; + } + + extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr); old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { - if (extent == HPAGE_PMD_SIZE) { - bool moved; - /* See comment in move_ptes() */ - if (need_rmap_locks) - take_rmap_locks(vma); - moved = move_huge_pmd(vma, old_addr, new_addr, - old_pmd, new_pmd); - if (need_rmap_locks) - drop_rmap_locks(vma); - if (moved) - continue; - } + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || + pmd_devmap(*old_pmd)) { + if (extent == HPAGE_PMD_SIZE && + move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr, + old_pmd, new_pmd, need_rmap_locks)) + continue; split_huge_pmd(vma, old_pmd, old_addr); if (pmd_trans_unstable(old_pmd)) continue; - } else if (extent == PMD_SIZE) { -#ifdef CONFIG_HAVE_MOVE_PMD + } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && + extent == PMD_SIZE) { /* * If the extent is PMD-sized, try to speed the move by * moving at the PMD level if possible. 
*/ - bool moved; - - if (need_rmap_locks) - take_rmap_locks(vma); - moved = move_normal_pmd(vma, old_addr, new_addr, - old_pmd, new_pmd); - if (need_rmap_locks) - drop_rmap_locks(vma); - if (moved) + if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr, + old_pmd, new_pmd, need_rmap_locks)) continue; -#endif } if (pte_alloc(new_vma->vm_mm, new_pmd)) @@ -343,7 +493,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; - int err; + int err = 0; bool need_rmap_locks; /* @@ -353,6 +503,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + if (vma->vm_ops && vma->vm_ops->may_split) { + if (vma->vm_start != old_addr) + err = vma->vm_ops->may_split(vma, old_addr); + if (!err && vma->vm_end != old_addr + old_len) + err = vma->vm_ops->may_split(vma, old_addr + old_len); + if (err) + return err; + } + /* * Advise KSM to break any KSM pages in the area to be moved: * it would be confusing if they were to turn up at the new @@ -365,18 +524,26 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (err) return err; + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) { + if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT)) + return -ENOMEM; + } + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, &need_rmap_locks); - if (!new_vma) + if (!new_vma) { + if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) + vm_unacct_memory(new_len >> PAGE_SHIFT); return -ENOMEM; + } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, need_rmap_locks); if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { - err = vma->vm_ops->mremap(new_vma); + err = vma->vm_ops->mremap(new_vma, flags); } if (unlikely(err)) { @@ -398,7 +565,7 @@ static unsigned long move_vma(struct 
vm_area_struct *vma, } /* Conceal VM_ACCOUNT so old reservation is not undone */ - if (vm_flags & VM_ACCOUNT) { + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) { vma->vm_flags &= ~VM_ACCOUNT; excess = vma->vm_end - vma->vm_start - old_len; if (old_addr > vma->vm_start && @@ -423,34 +590,17 @@ static unsigned long move_vma(struct vm_area_struct *vma, untrack_pfn_moved(vma); if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) { - if (vm_flags & VM_ACCOUNT) { - /* Always put back VM_ACCOUNT since we won't unmap */ - vma->vm_flags |= VM_ACCOUNT; - - vm_acct_memory(new_len >> PAGE_SHIFT); - } - - /* - * VMAs can actually be merged back together in copy_vma - * calling merge_vma. This can happen with anonymous vmas - * which have not yet been faulted, so if we were to consider - * this VMA split we'll end up adding VM_ACCOUNT on the - * next VMA, which is completely unrelated if this VMA - * was re-merged. - */ - if (split && new_vma == vma) - split = 0; - /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; /* Because we won't unmap we don't need to touch locked_vm */ - goto out; + return new_addr; } if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) { /* OOM: unable to split vma, just get accounts right */ - vm_unacct_memory(excess >> PAGE_SHIFT); + if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) + vm_acct_memory(new_len >> PAGE_SHIFT); excess = 0; } @@ -458,7 +608,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, mm->locked_vm += new_len >> PAGE_SHIFT; *locked = true; } -out: + mm->hiwater_vm = hiwater_vm; /* Restore VM_ACCOUNT if one or two pieces of vma left */ diff --git a/mm/nommu.c b/mm/nommu.c index 0faf39b32cdb..870fea12823e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1675,8 +1675,8 @@ void filemap_map_pages(struct vm_fault *vmf, } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned 
int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; @@ -1722,7 +1722,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -1741,7 +1741,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + len = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return len; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8b84661a6410..04b19b7b5435 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -170,11 +170,13 @@ static bool oom_unkillable_task(struct task_struct *p) return false; } -/* - * Print out unreclaimble slabs info when unreclaimable slabs amount is greater - * than all user memory (LRU pages) - */ -static bool is_dump_unreclaim_slabs(void) +/** + * Check whether unreclaimable slab amount is greater than + * all user memory(LRU pages). + * dump_unreclaimable_slab() could help in the case that + * oom due to too much unreclaimable slab used by kernel. 
+*/ +static bool should_dump_unreclaim_slab(void) { unsigned long nr_lru; @@ -463,7 +465,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) mem_cgroup_print_oom_meminfo(oc->memcg); else { show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); - if (is_dump_unreclaim_slabs()) + if (should_dump_unreclaim_slab()) dump_unreclaimable_slab(); } if (sysctl_oom_dump_tasks) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eaa227a479e4..7a2c89b21115 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include <trace/events/oom.h> #include <linux/prefetch.h> #include <linux/mm_inline.h> +#include <linux/mmu_notifier.h> #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/sched/rt.h> @@ -70,6 +71,7 @@ #include <linux/psi.h> #include <linux/padata.h> #include <linux/khugepaged.h> +#include <linux/buffer_head.h> #include <asm/sections.h> #include <asm/tlbflush.h> @@ -165,53 +167,26 @@ unsigned long totalcma_pages __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; -#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_alloc); -#else DEFINE_STATIC_KEY_FALSE(init_on_alloc); -#endif EXPORT_SYMBOL(init_on_alloc); -#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON -DEFINE_STATIC_KEY_TRUE(init_on_free); -#else DEFINE_STATIC_KEY_FALSE(init_on_free); -#endif EXPORT_SYMBOL(init_on_free); +static bool _init_on_alloc_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); static int __init early_init_on_alloc(char *buf) { - int ret; - bool bool_result; - ret = kstrtobool(buf, &bool_result); - if (ret) - return ret; - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n"); - if (bool_result) - static_branch_enable(&init_on_alloc); - else - static_branch_disable(&init_on_alloc); - return 0; + return kstrtobool(buf, &_init_on_alloc_enabled_early); } early_param("init_on_alloc", 
early_init_on_alloc); +static bool _init_on_free_enabled_early __read_mostly + = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); static int __init early_init_on_free(char *buf) { - int ret; - bool bool_result; - - ret = kstrtobool(buf, &bool_result); - if (ret) - return ret; - if (bool_result && page_poisoning_enabled()) - pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n"); - if (bool_result) - static_branch_enable(&init_on_free); - else - static_branch_disable(&init_on_free); - return 0; + return kstrtobool(buf, &_init_on_free_enabled_early); } early_param("init_on_free", early_init_on_free); @@ -495,14 +470,6 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, unsigned long pfn, @@ -521,6 +488,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page, return (word >> bitidx) & mask; } +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long mask) { @@ -728,19 +703,6 @@ static int __init early_debug_pagealloc(char *buf) } early_param("debug_pagealloc", early_debug_pagealloc); -void init_debug_pagealloc(void) -{ - if (!debug_pagealloc_enabled()) - return; - - static_branch_enable(&_debug_pagealloc_enabled); - - if (!debug_guardpage_minorder()) 
- return; - - static_branch_enable(&_debug_guardpage_enabled); -} - static int __init debug_guardpage_minorder_setup(char *buf) { unsigned long res; @@ -792,6 +754,53 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, unsigned int order, int migratetype) {} #endif +/* + * Enable static keys related to various memory debugging and hardening options. + * Some override others, and depend on early params that are evaluated in the + * order of appearance. So we need to first gather the full picture of what was + * enabled, and then make decisions. + */ +void init_mem_debugging_and_hardening(void) +{ + if (_init_on_alloc_enabled_early) { + if (page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_alloc\n"); + else + static_branch_enable(&init_on_alloc); + } + if (_init_on_free_enabled_early) { + if (page_poisoning_enabled()) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_free\n"); + else + static_branch_enable(&init_on_free); + } + +#ifdef CONFIG_PAGE_POISONING + /* + * Page poisoning is debug page alloc for some arches. If + * either of those options are enabled, enable poisoning. 
+ */ + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) + static_branch_enable(&_page_poisoning_enabled); +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return; + + static_branch_enable(&_debug_pagealloc_enabled); + + if (!debug_guardpage_minorder()) + return; + + static_branch_enable(&_debug_guardpage_enabled); +#endif +} + static inline void set_buddy_order(struct page *page, unsigned int order) { set_page_private(page, order); @@ -994,7 +1003,7 @@ static inline void __free_one_page(struct page *page, struct page *buddy; bool to_tail; - max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); + max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); VM_BUG_ON(!zone_is_initialized(zone)); VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); @@ -1007,7 +1016,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON_PAGE(bad_range(zone, page), page); continue_merging: - while (order < max_order - 1) { + while (order < max_order) { if (compaction_capture(capc, page, order, migratetype)) { __mod_zone_freepage_state(zone, -(1 << order), migratetype); @@ -1033,7 +1042,7 @@ continue_merging: pfn = combined_pfn; order++; } - if (max_order < MAX_ORDER) { + if (order < MAX_ORDER - 1) { /* If we are here, it means order is >= pageblock_order. * We want to prevent merge between freepages on isolate * pageblock and normal pageblock. 
Without this, pageblock @@ -1054,7 +1063,7 @@ continue_merging: is_migrate_isolate(buddy_mt))) goto done_merging; } - max_order++; + max_order = order + 1; goto continue_merging; } @@ -1092,7 +1101,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page->mem_cgroup | + (unsigned long)page_memcg(page) | #endif (page->flags & check_flags))) return false; @@ -1117,7 +1126,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) + if (unlikely(page_memcg(page))) bad_reason = "page still charged to cgroup"; #endif return bad_reason; @@ -1195,8 +1204,10 @@ static void kernel_init_free_pages(struct page *page, int numpages) /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); - for (i = 0; i < numpages; i++) + for (i = 0; i < numpages; i++) { + page_kasan_tag_reset(page + i); clear_highpage(page + i); + } kasan_enable_current(); } @@ -1214,7 +1225,7 @@ static __always_inline bool free_pages_prepare(struct page *page, * Do not let hwpoison pages hit pcplists/buddy * Untie memcg state and reset page's owner */ - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); return false; @@ -1244,7 +1255,7 @@ static __always_inline bool free_pages_prepare(struct page *page, } if (PageMappingFlags(page)) page->mapping = NULL; - if (memcg_kmem_enabled() && PageKmemcg(page)) + if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); if (check_free) bad += check_free_page(page); @@ -1264,7 +1275,8 @@ static __always_inline bool free_pages_prepare(struct page *page, if (want_init_on_free()) kernel_init_free_pages(page, 1 << order); - kernel_poison_pages(page, 1 << 
order, 0); + kernel_poison_pages(page, 1 << order); + /* * arch_free_page() can make the page's contents inaccessible. s390 * does this. So nothing which can access the page's contents should @@ -1272,8 +1284,7 @@ static __always_inline bool free_pages_prepare(struct page *page, */ arch_free_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 0); + debug_pagealloc_unmap_pages(page, 1 << order); kasan_free_nondeferred_pages(page, order); @@ -1344,7 +1355,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - int prefetch_nr = 0; + int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; struct page *page, *tmp; LIST_HEAD(head); @@ -1395,8 +1406,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, * avoid excessive prefetching due to large count, only * prefetch buddy for the first pcp->batch nr of pages. */ - if (prefetch_nr++ < pcp->batch) + if (prefetch_nr) { prefetch_buddy(page); + prefetch_nr--; + } } while (--count && --batch_free && !list_empty(list)); } @@ -1558,14 +1571,23 @@ void __free_pages_core(struct page *page, unsigned int order) #ifdef CONFIG_NEED_MULTIPLE_NODES -static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; +/* + * During memory init memblocks map pfns to nids. The search is expensive and + * this caches recent lookups. The implementation of __early_pfn_to_nid + * treats start/end as pfns. + */ +struct mminit_pfnnid_cache { + unsigned long last_start; + unsigned long last_end; + int last_nid; +}; -#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID +static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. 
*/ -int __meminit __early_pfn_to_nid(unsigned long pfn, +static int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state) { unsigned long start_pfn, end_pfn; @@ -1583,7 +1605,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn, return nid; } -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ int __meminit early_pfn_to_nid(unsigned long pfn) { @@ -2103,6 +2124,8 @@ void __init page_alloc_init_late(void) files_maxfiles_init(); #endif + buffer_init(); + /* Discard memblock private memory */ memblock_discard(); @@ -2207,12 +2230,6 @@ static inline int check_new_page(struct page *page) return 1; } -static inline bool free_pages_prezeroed(void) -{ - return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && - page_poisoning_enabled()) || want_init_on_free(); -} - #ifdef CONFIG_DEBUG_VM /* * With DEBUG_VM enabled, order-0 pages are checked for expected state when @@ -2270,11 +2287,13 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - if (debug_pagealloc_enabled_static()) - kernel_map_pages(page, 1 << order, 1); + debug_pagealloc_map_pages(page, 1 << order); kasan_alloc_pages(page, order); - kernel_poison_pages(page, 1 << order, 1); + kernel_unpoison_pages(page, 1 << order); set_page_owner(page, order, gfp_flags); + + if (!want_init_on_free() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2282,9 +2301,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags { post_alloc_hook(page, order, gfp_flags); - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) - kernel_init_free_pages(page, 1 << order); - if (order && (gfp_flags & __GFP_COMP)) prep_compound_page(page, order); @@ -2470,12 +2486,12 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } -static inline void boost_watermark(struct zone 
*zone) +static inline bool boost_watermark(struct zone *zone) { unsigned long max_boost; if (!watermark_boost_factor) - return; + return false; /* * Don't bother in zones that are unlikely to produce results. * On small machines, including kdump capture kernels running @@ -2483,7 +2499,7 @@ static inline void boost_watermark(struct zone *zone) * memory situation immediately. */ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) - return; + return false; max_boost = mult_frac(zone->_watermark[WMARK_HIGH], watermark_boost_factor, 10000); @@ -2497,12 +2513,14 @@ static inline void boost_watermark(struct zone *zone) * boosted watermark resulting in a hang. */ if (!max_boost) - return; + return false; max_boost = max(pageblock_nr_pages, max_boost); zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, max_boost); + + return true; } /* @@ -2540,8 +2558,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, * likelihood of future fallbacks. Wake kswapd now as the node * may be balanced overall and kswapd will not wake naturally. */ - boost_watermark(zone); - if (alloc_flags & ALLOC_KSWAPD) + if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); /* We are not allowed to try stealing from the whole block */ @@ -3017,13 +3034,16 @@ static void drain_local_pages_wq(struct work_struct *work) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator. - * - * When zone parameter is non-NULL, spill just the single zone's pages. + * The implementation of drain_all_pages(), exposing an extra parameter to + * drain on all cpus. * - * Note that this can be extremely slow as the draining happens in a workqueue. + * drain_all_pages() is optimized to only execute on cpus where pcplists are + * not empty. The check for non-emptiness can however race with a free to + * pcplist that has not yet increased the pcp->count from 0 to 1. 
Callers + * that need the guarantee that every CPU has drained can disable the + * optimizing racy check. */ -void drain_all_pages(struct zone *zone) +static void __drain_all_pages(struct zone *zone, bool force_all_cpus) { int cpu; @@ -3062,7 +3082,13 @@ void drain_all_pages(struct zone *zone) struct zone *z; bool has_pcps = false; - if (zone) { + if (force_all_cpus) { + /* + * The pcp.count check is racy, some callers need a + * guarantee that no cpu is missed. + */ + has_pcps = true; + } else if (zone) { pcp = per_cpu_ptr(zone->pageset, cpu); if (pcp->pcp.count) has_pcps = true; @@ -3095,6 +3121,18 @@ void drain_all_pages(struct zone *zone) mutex_unlock(&pcpu_drain_mutex); } +/* + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * When zone parameter is non-NULL, spill just the single zone's pages. + * + * Note that this can be extremely slow as the draining happens in a workqueue. + */ +void drain_all_pages(struct zone *zone) +{ + __drain_all_pages(zone, false); +} + #ifdef CONFIG_HIBERNATION /* @@ -3190,10 +3228,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) pcp = &this_cpu_ptr(zone->pageset)->pcp; list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - if (pcp->count >= pcp->high) { - unsigned long batch = READ_ONCE(pcp->batch); - free_pcppages_bulk(zone, batch, pcp); - } + if (pcp->count >= READ_ONCE(pcp->high)) + free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); } /* @@ -3378,7 +3414,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, do { if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, + READ_ONCE(pcp->batch), list, migratetype, alloc_flags); if (unlikely(list_empty(list))) return NULL; @@ -4264,10 +4300,8 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla static struct lockdep_map __fs_reclaim_map = STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); -static bool __need_fs_reclaim(gfp_t 
gfp_mask) +static bool __need_reclaim(gfp_t gfp_mask) { - gfp_mask = current_gfp_context(gfp_mask); - /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return false; @@ -4276,10 +4310,6 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) if (current->flags & PF_MEMALLOC) return false; - /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) - return false; - if (gfp_mask & __GFP_NOLOCKDEP) return false; @@ -4298,15 +4328,29 @@ void __fs_reclaim_release(void) void fs_reclaim_acquire(gfp_t gfp_mask) { - if (__need_fs_reclaim(gfp_mask)) - __fs_reclaim_acquire(); + gfp_mask = current_gfp_context(gfp_mask); + + if (__need_reclaim(gfp_mask)) { + if (gfp_mask & __GFP_FS) + __fs_reclaim_acquire(); + +#ifdef CONFIG_MMU_NOTIFIER + lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); + lock_map_release(&__mmu_notifier_invalidate_range_start_map); +#endif + + } } EXPORT_SYMBOL_GPL(fs_reclaim_acquire); void fs_reclaim_release(gfp_t gfp_mask) { - if (__need_fs_reclaim(gfp_mask)) - __fs_reclaim_release(); + gfp_mask = current_gfp_context(gfp_mask); + + if (__need_reclaim(gfp_mask)) { + if (gfp_mask & __GFP_FS) + __fs_reclaim_release(); + } } EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif @@ -5007,6 +5051,26 @@ static inline void free_the_page(struct page *page, unsigned int order) __free_pages_ok(page, order, FPI_NONE); } +/** + * __free_pages - Free pages allocated with alloc_pages(). + * @page: The page pointer returned from alloc_pages(). + * @order: The order of the allocation. + * + * This function can free multi-page allocations that are not compound + * pages. It does not check that the @order passed in matches that of + * the allocation, so it is easy to leak memory. Freeing more memory + * than was allocated will probably emit a warning. + * + * If the last reference to this page is speculative, it will be released + * by put_page() which only frees the first page of a non-compound + * allocation. 
To prevent the remaining pages from being leaked, we free + * the subsequent pages here. If you want to use the page's reference + * count to decide when to free the allocation, you should allocate a + * compound page, and use put_page() instead of __free_pages(). + * + * Context: May be called in interrupt context or while holding a normal + * spinlock, but not in NMI context or while holding a raw spinlock. + */ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) @@ -5465,7 +5529,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B), global_node_page_state(NR_FILE_MAPPED), global_node_page_state(NR_SHMEM), - global_zone_page_state(NR_PAGETABLE), + global_node_page_state(NR_PAGETABLE), global_zone_page_state(NR_BOUNCE), global_zone_page_state(NR_FREE_PAGES), free_pcp, @@ -5497,6 +5561,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK " shadow_call_stack:%lukB" #endif + " pagetables:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5522,6 +5587,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #ifdef CONFIG_SHADOW_CALL_STACK node_page_state(pgdat, NR_KERNEL_SCS_KB), #endif + K(node_page_state(pgdat, NR_PAGETABLE)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? 
"yes" : "no"); } @@ -5553,7 +5619,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " present:%lukB" " managed:%lukB" " mlocked:%lukB" - " pagetables:%lukB" " bounce:%lukB" " free_pcp:%lukB" " local_pcp:%ukB" @@ -5574,7 +5639,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(zone->present_pages), K(zone_managed_pages(zone)), K(zone_page_state(zone, NR_MLOCK)), - K(zone_page_state(zone, NR_PAGETABLE)), K(zone_page_state(zone, NR_BOUNCE)), K(free_pcp), K(this_cpu_read(zone->pageset->pcp.count)), @@ -5904,7 +5968,10 @@ static void build_zonelists(pg_data_t *pgdat) * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static void pageset_init(struct per_cpu_pageset *p); +/* These effectively disable the pcplists in the boot pageset completely */ +#define BOOT_PAGESET_HIGH 0 +#define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); @@ -5972,7 +6039,7 @@ build_all_zonelists_init(void) * (a chicken-egg dilemma). */ for_each_possible_cpu(cpu) - setup_pageset(&per_cpu(boot_pageset, cpu), 0); + pageset_init(&per_cpu(boot_pageset, cpu)); mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); @@ -6255,13 +6322,16 @@ static int zone_batchsize(struct zone *zone) } /* - * pcp->high and pcp->batch values are related and dependent on one another: - * ->batch must never be higher then ->high. - * The following function updates them in a safe manner without read side - * locking. + * pcp->high and pcp->batch values are related and generally batch is lower + * than high. They are also related to pcp->count such that count is lower + * than high, and as soon as it reaches high, the pcplist is flushed. 
* - * Any new users of pcp->batch and pcp->high should ensure they can cope with - * those fields changing asynchronously (acording to the above rule). + * However, guaranteeing these relations at all times would require e.g. write + * barriers here but also careful usage of read barriers at the read side, and + * thus be prone to error and bad for performance. Thus the update only prevents + * store tearing. Any new users of pcp->batch and pcp->high should ensure they + * can cope with those fields changing asynchronously, and fully trust only the + * pcp->count field on the local CPU with interrupts disabled. * * mutex_is_locked(&pcp_batch_high_lock) required when calling this function * outside of boot time (or some other assurance that no concurrent updaters @@ -6270,21 +6340,8 @@ static int zone_batchsize(struct zone *zone) static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, unsigned long batch) { - /* start with a fail safe value for batch */ - pcp->batch = 1; - smp_wmb(); - - /* Update high, then batch, in order */ - pcp->high = high; - smp_wmb(); - - pcp->batch = batch; -} - -/* a companion to pageset_set_high() */ -static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) -{ - pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); + WRITE_ONCE(pcp->batch, batch); + WRITE_ONCE(pcp->high, high); } static void pageset_init(struct per_cpu_pageset *p) @@ -6297,53 +6354,70 @@ static void pageset_init(struct per_cpu_pageset *p) pcp = &p->pcp; for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); + + /* + * Set batch and high values safe for a boot pageset. A true percpu + * pageset's initialization will update them subsequently. Here we don't + * need to be as careful as pageset_update() as nobody can access the + * pageset yet. 
+ */ + pcp->high = BOOT_PAGESET_HIGH; + pcp->batch = BOOT_PAGESET_BATCH; } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high, + unsigned long batch) { - pageset_init(p); - pageset_set_batch(p, batch); + struct per_cpu_pageset *p; + int cpu; + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(zone->pageset, cpu); + pageset_update(&p->pcp, high, batch); + } } /* - * pageset_set_high() sets the high water mark for hot per_cpu_pagelist - * to the value high for the pageset p. + * Calculate and set new high and batch values for all per-cpu pagesets of a + * zone, based on the zone's size and the percpu_pagelist_fraction sysctl. */ -static void pageset_set_high(struct per_cpu_pageset *p, - unsigned long high) +static void zone_set_pageset_high_and_batch(struct zone *zone) { - unsigned long batch = max(1UL, high / 4); - if ((high / 4) > (PAGE_SHIFT * 8)) - batch = PAGE_SHIFT * 8; + unsigned long new_high, new_batch; - pageset_update(&p->pcp, high, batch); -} + if (percpu_pagelist_fraction) { + new_high = zone_managed_pages(zone) / percpu_pagelist_fraction; + new_batch = max(1UL, new_high / 4); + if ((new_high / 4) > (PAGE_SHIFT * 8)) + new_batch = PAGE_SHIFT * 8; + } else { + new_batch = zone_batchsize(zone); + new_high = 6 * new_batch; + new_batch = max(1UL, 1 * new_batch); + } -static void pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) -{ - if (percpu_pagelist_fraction) - pageset_set_high(pcp, - (zone_managed_pages(zone) / - percpu_pagelist_fraction)); - else - pageset_set_batch(pcp, zone_batchsize(zone)); -} + if (zone->pageset_high == new_high && + zone->pageset_batch == new_batch) + return; -static void __meminit zone_pageset_init(struct zone *zone, int cpu) -{ - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); + zone->pageset_high = new_high; + zone->pageset_batch = new_batch; - pageset_init(pcp); - 
pageset_set_high_and_batch(zone, pcp); + __zone_set_pageset_high_and_batch(zone, new_high, new_batch); } void __meminit setup_zone_pageset(struct zone *zone) { + struct per_cpu_pageset *p; int cpu; + zone->pageset = alloc_percpu(struct per_cpu_pageset); - for_each_possible_cpu(cpu) - zone_pageset_init(zone, cpu); + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(zone->pageset, cpu); + pageset_init(p); + } + + zone_set_pageset_high_and_batch(zone); } /* @@ -6386,6 +6460,8 @@ static __meminit void zone_pcp_init(struct zone *zone) * offset of a (static) per cpu variable into the per cpu area. */ zone->pageset = &boot_pageset; + zone->pageset_high = BOOT_PAGESET_HIGH; + zone->pageset_batch = BOOT_PAGESET_BATCH; if (populated_zone(zone)) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", @@ -6796,7 +6872,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat_page_ext_init(pgdat); - spin_lock_init(&pgdat->lru_lock); lruvec_init(&pgdat->__lruvec); } @@ -7598,6 +7673,11 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char * alias for the memset(). */ direct_map_addr = page_address(page); + /* + * Perform a kasan-unchecked memset() since this memory + * has not been initialized. 
+ */ + direct_map_addr = kasan_reset_tag(direct_map_addr); if ((unsigned int)poison <= 0xFF) memset(direct_map_addr, poison, PAGE_SIZE); @@ -7791,31 +7871,24 @@ static void calculate_totalreserve_pages(void) static void setup_per_zone_lowmem_reserve(void) { struct pglist_data *pgdat; - enum zone_type j, idx; + enum zone_type i, j; for_each_online_pgdat(pgdat) { - for (j = 0; j < MAX_NR_ZONES; j++) { - struct zone *zone = pgdat->node_zones + j; - unsigned long managed_pages = zone_managed_pages(zone); - - zone->lowmem_reserve[j] = 0; - - idx = j; - while (idx) { - struct zone *lower_zone; - - idx--; - lower_zone = pgdat->node_zones + idx; - - if (!sysctl_lowmem_reserve_ratio[idx] || - !zone_managed_pages(lower_zone)) { - lower_zone->lowmem_reserve[j] = 0; - continue; + for (i = 0; i < MAX_NR_ZONES - 1; i++) { + struct zone *zone = &pgdat->node_zones[i]; + int ratio = sysctl_lowmem_reserve_ratio[i]; + bool clear = !ratio || !zone_managed_pages(zone); + unsigned long managed_pages = 0; + + for (j = i + 1; j < MAX_NR_ZONES; j++) { + if (clear) { + zone->lowmem_reserve[j] = 0; } else { - lower_zone->lowmem_reserve[j] = - managed_pages / sysctl_lowmem_reserve_ratio[idx]; + struct zone *upper_zone = &pgdat->node_zones[j]; + + managed_pages += zone_managed_pages(upper_zone); + zone->lowmem_reserve[j] = managed_pages / ratio; } - managed_pages += zone_managed_pages(lower_zone); } } } @@ -8077,15 +8150,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, return 0; } -static void __zone_pcp_update(struct zone *zone) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) - pageset_set_high_and_batch(zone, - per_cpu_ptr(zone->pageset, cpu)); -} - /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each * cpu. 
It is the fraction of total pages in each zone that a hot per cpu @@ -8118,7 +8182,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, goto out; for_each_populated_zone(zone) - __zone_pcp_update(zone); + zone_set_pageset_high_and_batch(zone); out: mutex_unlock(&pcp_batch_high_lock); return ret; @@ -8517,6 +8581,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) return ret; + drain_all_pages(cc.zone); + /* * In case of -EBUSY, we'd like to know which page causes problem. * So, just fall through. test_pages_isolated() has a tracepoint @@ -8725,7 +8791,28 @@ EXPORT_SYMBOL(free_contig_range); void __meminit zone_pcp_update(struct zone *zone) { mutex_lock(&pcp_batch_high_lock); - __zone_pcp_update(zone); + zone_set_pageset_high_and_batch(zone); + mutex_unlock(&pcp_batch_high_lock); +} + +/* + * Effectively disable pcplists for the zone by setting the high limit to 0 + * and draining all cpus. A concurrent page freeing on another CPU that's about + * to put the page on pcplist will either finish before the drain and the page + * will be drained, or observe the new high limit and skip the pcplist. + * + * Must be paired with a call to zone_pcp_enable(). + */ +void zone_pcp_disable(struct zone *zone) +{ + mutex_lock(&pcp_batch_high_lock); + __zone_set_pageset_high_and_batch(zone, 0, 1); + __drain_all_pages(zone, true); +} + +void zone_pcp_enable(struct zone *zone) +{ + __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch); mutex_unlock(&pcp_batch_high_lock); } diff --git a/mm/page_counter.c b/mm/page_counter.c index b24a60b28bb0..c6860f51b6c6 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -183,14 +183,14 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) * the limit, so if it sees the old limit, we see the * modified counter and retry. 
*/ - usage = atomic_long_read(&counter->usage); + usage = page_counter_read(counter); if (usage > nr_pages) return -EBUSY; old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->usage) <= usage) + if (page_counter_read(counter) <= usage) return 0; counter->max = old; diff --git a/mm/page_ext.c b/mm/page_ext.c index a3616f7a0e9e..df6f74aac8e1 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -34,7 +34,7 @@ * * The need callback is used to decide whether extended memory allocation is * needed or not. Sometimes users want to deactivate some features in this - * boot and extra memory would be unneccessary. In this case, to avoid + * boot and extra memory would be unnecessary. In this case, to avoid * allocating huge chunk of memory, each clients represent their need of * extra memory through the need callback. If one of the need callbacks * returns true, it means that someone needs extra memory so that @@ -99,12 +99,19 @@ static void __init invoke_init_callbacks(void) } } +#ifndef CONFIG_SPARSEMEM +void __init page_ext_init_flatmem_late(void) +{ + invoke_init_callbacks(); +} +#endif + static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * index; } -#if !defined(CONFIG_SPARSEMEM) +#ifndef CONFIG_SPARSEMEM void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) @@ -177,7 +184,6 @@ void __init page_ext_init_flatmem(void) goto fail; } pr_info("allocated %ld bytes of page_ext\n", total_usage); - invoke_init_callbacks(); return; fail: diff --git a/mm/page_idle.c b/mm/page_idle.c index 057c61df12db..64e5344a992c 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -32,19 +32,15 @@ static struct page *page_idle_get_page(unsigned long pfn) { struct page *page = pfn_to_online_page(pfn); - pg_data_t *pgdat; if (!page || !PageLRU(page) || !get_page_unless_zero(page)) return NULL; - pgdat = page_pgdat(page); - spin_lock_irq(&pgdat->lru_lock); if (unlikely(!PageLRU(page))) { put_page(page); page = NULL; } - 
spin_unlock_irq(&pgdat->lru_lock); return page; } diff --git a/mm/page_io.c b/mm/page_io.c index 433df1263349..9bca17ecc4df 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -291,12 +291,14 @@ static inline void count_swpout_vm_event(struct page *page) static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; - if (!page->mem_cgroup) + memcg = page_memcg(page); + if (!memcg) return; rcu_read_lock(); - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index abbf42214485..bddf788f45bf 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -49,7 +49,6 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ __mod_zone_freepage_state(zone, -nr_pages, mt); spin_unlock_irqrestore(&zone->lock, flags); - drain_all_pages(zone); return 0; } @@ -89,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) */ if (PageBuddy(page)) { order = buddy_order(page); - if (order >= pageblock_order) { + if (order >= pageblock_order && order < MAX_ORDER - 1) { pfn = page_to_pfn(page); buddy_pfn = __find_buddy_pfn(pfn, order); buddy = page + (buddy_pfn - pfn); @@ -172,11 +171,12 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * * Please note that there is no strong synchronization with the page allocator * either. Pages might be freed while their page blocks are marked ISOLATED. - * In some cases pages might still end up on pcp lists and that would allow + * A call to drain_all_pages() after isolation can flush most of them. However + * in some cases pages might still end up on pcp lists and that would allow * for their allocation even when they are in fact isolated already. 
Depending - * on how strong of a guarantee the caller needs drain_all_pages might be needed - * (e.g. __offline_pages will need to call it after check for isolated range for - * a next retry). + * on how strong of a guarantee the caller needs, zone_pcp_disable/enable() + * might be used to flush and disable pcplist before isolation and enable after + * unisolation. * * Return: 0 on success and -EBUSY if any part of range cannot be isolated. */ diff --git a/mm/page_owner.c b/mm/page_owner.c index b735a8eafcdb..af464bb7fbe7 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,6 +10,7 @@ #include <linux/migrate.h> #include <linux/stackdepot.h> #include <linux/seq_file.h> +#include <linux/sched/clock.h> #include "internal.h" @@ -25,6 +26,8 @@ struct page_owner { gfp_t gfp_mask; depot_stack_handle_t handle; depot_stack_handle_t free_handle; + u64 ts_nsec; + pid_t pid; }; static bool page_owner_enabled = false; @@ -172,6 +175,8 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->order = order; page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; + page_owner->pid = current->pid; + page_owner->ts_nsec = local_clock(); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -236,6 +241,8 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) new_page_owner->last_migrate_reason = old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; + new_page_owner->pid = old_page_owner->pid; + new_page_owner->ts_nsec = old_page_owner->ts_nsec; /* * We don't clear the bit on the oldpage as it's going to be freed @@ -349,9 +356,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, return -ENOMEM; ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg)\n", + "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n", page_owner->order, page_owner->gfp_mask, - &page_owner->gfp_mask); + 
&page_owner->gfp_mask, page_owner->pid, + page_owner->ts_nsec); if (ret >= count) goto err; @@ -427,8 +435,9 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", - page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n", + page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, + page_owner->pid, page_owner->ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { diff --git a/mm/page_poison.c b/mm/page_poison.c index ae0482cded87..65cdf844c8ad 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -8,57 +8,29 @@ #include <linux/ratelimit.h> #include <linux/kasan.h> -static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning); +bool _page_poisoning_enabled_early; +EXPORT_SYMBOL(_page_poisoning_enabled_early); +DEFINE_STATIC_KEY_FALSE(_page_poisoning_enabled); +EXPORT_SYMBOL(_page_poisoning_enabled); static int __init early_page_poison_param(char *buf) { - int ret; - bool tmp; - - ret = strtobool(buf, &tmp); - if (ret) - return ret; - - if (tmp) - static_branch_enable(&want_page_poisoning); - else - static_branch_disable(&want_page_poisoning); - - return 0; + return kstrtobool(buf, &_page_poisoning_enabled_early); } early_param("page_poison", early_page_poison_param); -/** - * page_poisoning_enabled - check if page poisoning is enabled - * - * Return true if page poisoning is enabled, or false if not. - */ -bool page_poisoning_enabled(void) -{ - /* - * Assumes that debug_pagealloc_enabled is set before - * memblock_free_all. - * Page poisoning is debug page alloc for some arches. If - * either of those options are enabled, enable poisoning. 
- */ - return (static_branch_unlikely(&want_page_poisoning) || - (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && - debug_pagealloc_enabled())); -} -EXPORT_SYMBOL_GPL(page_poisoning_enabled); - static void poison_page(struct page *page) { void *addr = kmap_atomic(page); /* KASAN still think the page is in-use, so skip it. */ kasan_disable_current(); - memset(addr, PAGE_POISON, PAGE_SIZE); + memset(kasan_reset_tag(addr), PAGE_POISON, PAGE_SIZE); kasan_enable_current(); kunmap_atomic(addr); } -static void poison_pages(struct page *page, int n) +void __kernel_poison_pages(struct page *page, int n) { int i; @@ -79,9 +51,6 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) unsigned char *start; unsigned char *end; - if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY)) - return; - start = memchr_inv(mem, PAGE_POISON, bytes); if (!start) return; @@ -117,7 +86,7 @@ static void unpoison_page(struct page *page) kunmap_atomic(addr); } -static void unpoison_pages(struct page *page, int n) +void __kernel_unpoison_pages(struct page *page, int n) { int i; @@ -125,17 +94,6 @@ static void unpoison_pages(struct page *page, int n) unpoison_page(page + i); } -void kernel_poison_pages(struct page *page, int numpages, int enable) -{ - if (!page_poisoning_enabled()) - return; - - if (enable) - unpoison_pages(page, numpages); - else - poison_pages(page, numpages); -} - #ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 5e77b269c330..86e3a3688d59 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -66,18 +66,19 @@ static inline bool pfn_is_match(struct page *page, unsigned long pfn) /** * check_pte - check if @pvmw->page is mapped at the @pvmw->pte + * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking * * page_vma_mapped_walk() found a place where @pvmw->page is *potentially* * mapped. 
check_pte() has to validate this. * - * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary - * page. + * pvmw->pte may point to empty PTE, swap PTE or PTE pointing to + * arbitrary page. * * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration * entry that points to @pvmw->page or any subpage in case of THP. * - * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to - * @pvmw->page or any subpage in case of THP. + * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to + * pvmw->page or any subpage in case of THP. * * Otherwise, return false. * diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 702250f148e7..4bcc11958089 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -260,7 +260,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec iovstack_l[UIO_FASTIOV]; struct iovec iovstack_r[UIO_FASTIOV]; struct iovec *iov_l = iovstack_l; - struct iovec *iov_r = iovstack_r; + struct iovec *iov_r; struct iov_iter iter; ssize_t rc; int dir = vm_write ? WRITE : READ; diff --git a/mm/ptdump.c b/mm/ptdump.c index ba88ec43ff21..4354c1422d57 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -4,7 +4,7 @@ #include <linux/ptdump.h> #include <linux/kasan.h> -#ifdef CONFIG_KASAN +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) /* * This is an optimization for KASAN=y case. 
Since all kasan page tables * eventually point to the kasan_early_shadow_page we could call note_page() @@ -31,7 +31,8 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, struct ptdump_state *st = walk->private; pgd_t val = READ_ONCE(*pgd); -#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 4 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d))) return note_kasan_page_table(walk, addr); #endif @@ -51,7 +52,8 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, struct ptdump_state *st = walk->private; p4d_t val = READ_ONCE(*p4d); -#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 3 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud))) return note_kasan_page_table(walk, addr); #endif @@ -71,7 +73,8 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, struct ptdump_state *st = walk->private; pud_t val = READ_ONCE(*pud); -#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN) +#if CONFIG_PGTABLE_LEVELS > 2 && \ + (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd))) return note_kasan_page_table(walk, addr); #endif @@ -91,7 +94,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr, struct ptdump_state *st = walk->private; pmd_t val = READ_ONCE(*pmd); -#if defined(CONFIG_KASAN) +#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte))) return note_kasan_page_table(walk, addr); #endif diff --git a/mm/rmap.c b/mm/rmap.c index 31b29321adfe..08c56aaf72eb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -28,12 +28,12 @@ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock - * pgdat->lru_lock 
(in mark_page_accessed, isolate_lru_page) * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) - * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) + * lock_page_memcg move_lock (in __set_page_dirty_buffers) * i_pages lock (widely used) + * lruvec->lru_lock (in lock_page_lruvec_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) @@ -1054,8 +1054,14 @@ static void __page_set_anon_rmap(struct page *page, if (!exclusive) anon_vma = anon_vma->root; + /* + * page_idle does a lockless/optimistic rmap scan on page->mapping. + * Make sure the compiler doesn't split the stores of anon_vma and + * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code + * could mistake the mapping for a struct address_space and crash. + */ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - page->mapping = (struct address_space *) anon_vma; + WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); page->index = linear_page_index(vma, address); } @@ -1533,15 +1539,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, goto discard; } - if (!(flags & TTU_IGNORE_ACCESS)) { - if (ptep_clear_flush_young_notify(vma, address, - pvmw.pte)) { - ret = false; - page_vma_mapped_walk_done(&pvmw); - break; - } - } - /* Nuke the page table entry. 
*/ flush_cache_page(vma, address, pte_pfn(*pvmw.pte)); if (should_defer_flush(mm, flags)) { diff --git a/mm/shmem.c b/mm/shmem.c index 537c137698f8..7c6b6d8f6c39 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -246,7 +246,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) } static const struct super_operations shmem_ops; -static const struct address_space_operations shmem_aops; +const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -713,7 +713,7 @@ next: } if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); - __inc_node_page_state(page, NR_SHMEM_THPS); + __inc_lruvec_page_state(page, NR_SHMEM_THPS); } mapping->nrpages += nr; __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); @@ -1152,7 +1152,7 @@ static void shmem_evict_inode(struct inode *inode) struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (inode->i_mapping->a_ops == &shmem_aops) { + if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; shmem_truncate_range(inode, 0, (loff_t)-1); @@ -1858,7 +1858,7 @@ repeat: } /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) + if (!shmem_mapping(mapping)) goto alloc_nohuge; if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) goto alloc_nohuge; @@ -2352,11 +2352,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } -bool shmem_mapping(struct address_space *mapping) -{ - return mapping->a_ops == &shmem_aops; -} - static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -3865,7 +3860,7 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } -static const struct address_space_operations shmem_aops = { +const 
struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS @@ -3877,6 +3872,7 @@ static const struct address_space_operations shmem_aops = { #endif .error_remove_page = generic_error_remove_page, }; +EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, @@ -4024,7 +4020,7 @@ out2: #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, @@ -4034,16 +4030,19 @@ static ssize_t shmem_enabled_show(struct kobject *kobj, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; - int i, count; - - for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { - const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s "; + int len = 0; + int i; - count += sprintf(buf + count, fmt, - shmem_format_huge(values[i])); + for (i = 0; i < ARRAY_SIZE(values); i++) { + len += sysfs_emit_at(buf, len, + shmem_huge == values[i] ? "%s[%s]" : "%s%s", + i ? 
" " : "", + shmem_format_huge(values[i])); } - buf[count - 1] = '\n'; - return count; + + len += sysfs_emit_at(buf, len, "\n"); + + return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, @@ -4312,7 +4311,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, struct page *page; int error; - BUG_ON(mapping->a_ops != &shmem_aops); + BUG_ON(!shmem_mapping(mapping)); error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) diff --git a/mm/slab.c b/mm/slab.c index b1113561b98b..d7c8da9319c7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1399,7 +1399,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); page_mapcount_reset(page); - page->mapping = NULL; + /* In union with page->mapping where page allocator expects NULL */ + page->slab_cache = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; @@ -1434,7 +1435,7 @@ static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) if (!is_debug_pagealloc_cache(cachep)) return; - kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); + __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); } #else @@ -3416,6 +3417,9 @@ free_done: static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + if (unlikely(slab_want_init_on_free(cachep))) + memset(objp, 0, cachep->object_size); + /* Put the object into the quarantine, don't touch it for now. 
*/ if (kasan_slab_free(cachep, objp, _RET_IP_)) return; @@ -3434,8 +3438,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); - if (unlikely(slab_want_init_on_free(cachep))) - memset(objp, 0, cachep->object_size); kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); memcg_slab_free_hook(cachep, &objp, 1); diff --git a/mm/slab.h b/mm/slab.h index f9977d6613d6..1a756a359fa8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -204,7 +204,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); -static inline int cache_vmstat_idx(struct kmem_cache *s) +static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s) { return (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B; @@ -239,30 +239,13 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla } #ifdef CONFIG_MEMCG_KMEM -static inline struct obj_cgroup **page_obj_cgroups(struct page *page) -{ - /* - * page->mem_cgroup and page->obj_cgroups are sharing the same - * space. To distinguish between them in case we don't know for sure - * that the page is a slab page (e.g. page_cgroup_ino()), let's - * always set the lowest bit of obj_cgroups. 
- */ - return (struct obj_cgroup **) - ((unsigned long)page->obj_cgroups & ~0x1UL); -} - -static inline bool page_has_obj_cgroups(struct page *page) -{ - return ((unsigned long)page->obj_cgroups & 0x1UL); -} - int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, gfp_t gfp); static inline void memcg_free_page_obj_cgroups(struct page *page) { - kfree(page_obj_cgroups(page)); - page->obj_cgroups = NULL; + kfree(page_objcgs(page)); + page->memcg_data = 0; } static inline size_t obj_full_size(struct kmem_cache *s) @@ -304,7 +287,7 @@ static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, static inline void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - int idx, int nr) + enum node_stat_item idx, int nr) { struct mem_cgroup *memcg; struct lruvec *lruvec; @@ -333,7 +316,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, if (likely(p[i])) { page = virt_to_head_page(p[i]); - if (!page_has_obj_cgroups(page) && + if (!page_objcgs(page) && memcg_alloc_page_obj_cgroups(page, s, flags)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; @@ -341,7 +324,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, off = obj_to_index(s, page, p[i]); obj_cgroup_get(objcg); - page_obj_cgroups(page)[off] = objcg; + page_objcgs(page)[off] = objcg; mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), obj_full_size(s)); } else { @@ -355,6 +338,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, void **p, int objects) { struct kmem_cache *s; + struct obj_cgroup **objcgs; struct obj_cgroup *objcg; struct page *page; unsigned int off; @@ -368,7 +352,8 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, continue; page = virt_to_head_page(p[i]); - if (!page_has_obj_cgroups(page)) + objcgs = page_objcgs(page); + if (!objcgs) continue; if (!s_orig) @@ -377,11 +362,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, s = s_orig; off = 
obj_to_index(s, page, p[i]); - objcg = page_obj_cgroups(page)[off]; + objcg = objcgs[off]; if (!objcg) continue; - page_obj_cgroups(page)[off] = NULL; + objcgs[off] = NULL; obj_cgroup_uncharge(objcg, obj_full_size(s)); mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s), -obj_full_size(s)); @@ -390,11 +375,6 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig, } #else /* CONFIG_MEMCG_KMEM */ -static inline bool page_has_obj_cgroups(struct page *page) -{ - return false; -} - static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { return NULL; @@ -510,10 +490,7 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, { flags &= gfp_allowed_mask; - fs_reclaim_acquire(flags); - fs_reclaim_release(flags); - - might_sleep_if(gfpflags_allow_blocking(flags)); + might_alloc(flags); if (should_failslab(s, flags)) return NULL; diff --git a/mm/slab_common.c b/mm/slab_common.c index f9ccd5dc13f3..e981c80d216c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -18,6 +18,7 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/debugfs.h> +#include <linux/kasan.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/page.h> @@ -53,7 +54,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB | SLAB_KASAN) + SLAB_FAILSLAB | kasan_never_merge()) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ SLAB_CACHE_DMA32 | SLAB_ACCOUNT) @@ -978,7 +979,7 @@ static int slab_show(struct seq_file *m, void *p) void dump_unreclaimable_slab(void) { - struct kmem_cache *s, *s2; + struct kmem_cache *s; struct slabinfo sinfo; /* @@ -996,7 +997,7 @@ void dump_unreclaimable_slab(void) pr_info("Unreclaimable slab info:\n"); pr_info("Name Used Total\n"); - list_for_each_entry_safe(s, s2, &slab_caches, list) { + list_for_each_entry(s, 
&slab_caches, list) { if (s->flags & SLAB_RECLAIM_ACCOUNT) continue; @@ -1091,9 +1092,9 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size, * @flags: the type of memory to allocate. * * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. + * lesser of the new and old sizes (__GFP_ZERO flag is effectively ignored). + * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size + * is 0 and @p is not a %NULL pointer, the object pointed to is freed. * * Return: pointer to the allocated memory or %NULL in case of error */ @@ -1176,7 +1177,7 @@ size_t ksize(const void *objp) * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. */ - kasan_unpoison_shadow(objp, size); + kasan_unpoison_range(objp, size); return size; } EXPORT_SYMBOL(ksize); diff --git a/mm/slob.c b/mm/slob.c index 7cc9805c8091..8d4bfa46247f 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -474,8 +474,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) gfp &= gfp_allowed_mask; - fs_reclaim_acquire(gfp); - fs_reclaim_release(gfp); + might_alloc(gfp); if (size < PAGE_SIZE - minalign) { int align = minalign; @@ -597,8 +596,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags &= gfp_allowed_mask; - fs_reclaim_acquire(flags); - fs_reclaim_release(flags); + might_alloc(flags); if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node, 0); diff --git a/mm/slub.c b/mm/slub.c index 34dcc09e2ec9..0c8b43a5b3b0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -249,7 +249,7 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr, { #ifdef CONFIG_SLAB_FREELIST_HARDENED /* - * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged. 
+ * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged. * Normally, this doesn't cause any issues, as both set_freepointer() * and get_freepointer() are called with a pointer with the same tag. * However, there are some issues with CONFIG_SLUB_DEBUG code. For @@ -275,6 +275,7 @@ static inline void *freelist_dereference(const struct kmem_cache *s, static inline void *get_freepointer(struct kmem_cache *s, void *object) { + object = kasan_reset_tag(object); return freelist_dereference(s, object + s->offset); } @@ -304,6 +305,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) BUG_ON(object == fp); /* naive detection of double free or corruption */ #endif + freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr); *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); } @@ -538,8 +540,8 @@ static void print_section(char *level, char *text, u8 *addr, unsigned int length) { metadata_access_enable(); - print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, - length, 1); + print_hex_dump(level, kasan_reset_tag(text), DUMP_PREFIX_ADDRESS, + 16, 1, addr, length, 1); metadata_access_disable(); } @@ -570,7 +572,7 @@ static struct track *get_track(struct kmem_cache *s, void *object, p = object + get_info_end(s); - return p + alloc; + return kasan_reset_tag(p + alloc); } static void set_track(struct kmem_cache *s, void *object, @@ -583,7 +585,8 @@ static void set_track(struct kmem_cache *s, void *object, unsigned int nr_entries; metadata_access_enable(); - nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3); + nr_entries = stack_trace_save(kasan_reset_tag(p->addrs), + TRACK_ADDRS_COUNT, 3); metadata_access_disable(); if (nr_entries < TRACK_ADDRS_COUNT) @@ -747,7 +750,7 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, static void init_object(struct kmem_cache *s, void *object, u8 val) { - u8 *p = object; + u8 *p = kasan_reset_tag(object); if (s->flags & SLAB_RED_ZONE) 
memset(p - s->red_left_pad, val, s->red_left_pad); @@ -777,7 +780,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *addr = page_address(page); metadata_access_enable(); - fault = memchr_inv(start, value, bytes); + fault = memchr_inv(kasan_reset_tag(start), value, bytes); metadata_access_disable(); if (!fault) return 1; @@ -873,7 +876,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) pad = end - remainder; metadata_access_enable(); - fault = memchr_inv(pad, POISON_INUSE, remainder); + fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder); metadata_access_disable(); if (!fault) return 1; @@ -1118,7 +1121,7 @@ void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) return; metadata_access_enable(); - memset(addr, POISON_INUSE, page_size(page)); + memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page)); metadata_access_disable(); } @@ -1566,10 +1569,10 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, * Clear the object and the metadata, but don't touch * the redzone. */ - memset(object, 0, s->object_size); + memset(kasan_reset_tag(object), 0, s->object_size); rsize = (s->flags & SLAB_RED_ZONE) ? 
s->red_left_pad : 0; - memset((char *)object + s->inuse, 0, + memset((char *)kasan_reset_tag(object) + s->inuse, 0, s->size - s->inuse - rsize); } @@ -1836,8 +1839,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - - page->mapping = NULL; + /* In union with page->mapping where page allocator expects NULL */ + page->slab_cache = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; unaccount_slab_page(page, order, s); @@ -2245,8 +2248,7 @@ redo: } } else { m = M_FULL; -#ifdef CONFIG_SLUB_DEBUG - if ((s->flags & SLAB_STORE_USER) && !lock) { + if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) { lock = 1; /* * This also ensures that the scanning of full @@ -2255,7 +2257,6 @@ redo: */ spin_lock(&n->list_lock); } -#endif } if (l != m) { @@ -2883,10 +2884,10 @@ redo: stat(s, ALLOC_FASTPATH); } - maybe_wipe_obj_freeptr(s, object); + maybe_wipe_obj_freeptr(s, kasan_reset_tag(object)); if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) - memset(object, 0, s->object_size); + memset(kasan_reset_tag(object), 0, s->object_size); slab_post_alloc_hook(s, objcg, gfpflags, 1, &object); @@ -3433,7 +3434,7 @@ static inline int calculate_order(unsigned int size) */ min_objects = slub_min_objects; if (!min_objects) - min_objects = 4 * (fls(nr_cpu_ids) + 1); + min_objects = 4 * (fls(num_online_cpus()) + 1); max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, max_objects); @@ -4726,7 +4727,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, } static int list_locations(struct kmem_cache *s, char *buf, - enum track_item alloc) + enum track_item alloc) { int len = 0; unsigned long i; @@ -4736,7 +4737,7 @@ static int list_locations(struct kmem_cache *s, char *buf, if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) { - return sprintf(buf, "Out of memory\n"); + return sysfs_emit(buf, "Out of 
memory\n"); } /* Push back cpu slabs */ flush_all(s); @@ -4759,50 +4760,45 @@ static int list_locations(struct kmem_cache *s, char *buf, for (i = 0; i < t.count; i++) { struct location *l = &t.loc[i]; - if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100) - break; - len += sprintf(buf + len, "%7ld ", l->count); + len += sysfs_emit_at(buf, len, "%7ld ", l->count); if (l->addr) - len += sprintf(buf + len, "%pS", (void *)l->addr); + len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr); else - len += sprintf(buf + len, "<not-available>"); - - if (l->sum_time != l->min_time) { - len += sprintf(buf + len, " age=%ld/%ld/%ld", - l->min_time, - (long)div_u64(l->sum_time, l->count), - l->max_time); - } else - len += sprintf(buf + len, " age=%ld", - l->min_time); + len += sysfs_emit_at(buf, len, "<not-available>"); + + if (l->sum_time != l->min_time) + len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld", + l->min_time, + (long)div_u64(l->sum_time, + l->count), + l->max_time); + else + len += sysfs_emit_at(buf, len, " age=%ld", l->min_time); if (l->min_pid != l->max_pid) - len += sprintf(buf + len, " pid=%ld-%ld", - l->min_pid, l->max_pid); + len += sysfs_emit_at(buf, len, " pid=%ld-%ld", + l->min_pid, l->max_pid); else - len += sprintf(buf + len, " pid=%ld", - l->min_pid); + len += sysfs_emit_at(buf, len, " pid=%ld", + l->min_pid); if (num_online_cpus() > 1 && - !cpumask_empty(to_cpumask(l->cpus)) && - len < PAGE_SIZE - 60) - len += scnprintf(buf + len, PAGE_SIZE - len - 50, - " cpus=%*pbl", - cpumask_pr_args(to_cpumask(l->cpus))); - - if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && - len < PAGE_SIZE - 60) - len += scnprintf(buf + len, PAGE_SIZE - len - 50, - " nodes=%*pbl", - nodemask_pr_args(&l->nodes)); - - len += sprintf(buf + len, "\n"); + !cpumask_empty(to_cpumask(l->cpus))) + len += sysfs_emit_at(buf, len, " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); + + if (nr_online_nodes > 1 && !nodes_empty(l->nodes)) + len += sysfs_emit_at(buf, len, " nodes=%*pbl", + 
nodemask_pr_args(&l->nodes)); + + len += sysfs_emit_at(buf, len, "\n"); } free_loc_track(&t); if (!t.count) - len += sprintf(buf, "No data\n"); + len += sysfs_emit_at(buf, len, "No data\n"); + return len; } #endif /* CONFIG_SLUB_DEBUG */ @@ -4899,12 +4895,13 @@ __setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs); #endif static ssize_t show_slab_objects(struct kmem_cache *s, - char *buf, unsigned long flags) + char *buf, unsigned long flags) { unsigned long total = 0; int node; int x; unsigned long *nodes; + int len = 0; nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!nodes) @@ -4993,15 +4990,19 @@ static ssize_t show_slab_objects(struct kmem_cache *s, nodes[node] += x; } } - x = sprintf(buf, "%lu", total); + + len += sysfs_emit_at(buf, len, "%lu", total); #ifdef CONFIG_NUMA - for (node = 0; node < nr_node_ids; node++) + for (node = 0; node < nr_node_ids; node++) { if (nodes[node]) - x += sprintf(buf + x, " N%d=%lu", - node, nodes[node]); + len += sysfs_emit_at(buf, len, " N%d=%lu", + node, nodes[node]); + } #endif + len += sysfs_emit_at(buf, len, "\n"); kfree(nodes); - return x + sprintf(buf + x, "\n"); + + return len; } #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) @@ -5023,37 +5024,37 @@ struct slab_attribute { static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->size); + return sysfs_emit(buf, "%u\n", s->size); } SLAB_ATTR_RO(slab_size); static ssize_t align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->align); + return sysfs_emit(buf, "%u\n", s->align); } SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->object_size); + return sysfs_emit(buf, "%u\n", s->object_size); } SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", oo_objects(s->oo)); + return sysfs_emit(buf, "%u\n", oo_objects(s->oo)); } 
SLAB_ATTR_RO(objs_per_slab); static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", oo_order(s->oo)); + return sysfs_emit(buf, "%u\n", oo_order(s->oo)); } SLAB_ATTR_RO(order); static ssize_t min_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%lu\n", s->min_partial); + return sysfs_emit(buf, "%lu\n", s->min_partial); } static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, @@ -5073,7 +5074,7 @@ SLAB_ATTR(min_partial); static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", slub_cpu_partial(s)); + return sysfs_emit(buf, "%u\n", slub_cpu_partial(s)); } static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, @@ -5098,13 +5099,13 @@ static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (!s->ctor) return 0; - return sprintf(buf, "%pS\n", s->ctor); + return sysfs_emit(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); + return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 
0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); @@ -5137,7 +5138,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) int objects = 0; int pages = 0; int cpu; - int len; + int len = 0; for_each_online_cpu(cpu) { struct page *page; @@ -5150,52 +5151,53 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) } } - len = sprintf(buf, "%d(%d)", objects, pages); + len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages); #ifdef CONFIG_SMP for_each_online_cpu(cpu) { struct page *page; page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); - - if (page && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " C%d=%d(%d)", cpu, - page->pobjects, page->pages); + if (page) + len += sysfs_emit_at(buf, len, " C%d=%d(%d)", + cpu, page->pobjects, page->pages); } #endif - return len + sprintf(buf + len, "\n"); + len += sysfs_emit_at(buf, len, "\n"); + + return len; } SLAB_ATTR_RO(slabs_cpu_partial); static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); } SLAB_ATTR_RO(reclaim_account); static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); } SLAB_ATTR_RO(hwcache_align); #ifdef CONFIG_ZONE_DMA static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); } SLAB_ATTR_RO(cache_dma); #endif static ssize_t usersize_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->usersize); + return sysfs_emit(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); + return 
sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); } SLAB_ATTR_RO(destroy_by_rcu); @@ -5214,33 +5216,33 @@ SLAB_ATTR_RO(total_objects); static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS)); } SLAB_ATTR_RO(sanity_checks); static ssize_t trace_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE)); } SLAB_ATTR_RO(trace); static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); } SLAB_ATTR_RO(red_zone); static ssize_t poison_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON)); } SLAB_ATTR_RO(poison); static ssize_t store_user_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER)); } SLAB_ATTR_RO(store_user); @@ -5284,7 +5286,7 @@ SLAB_ATTR_RO(free_calls); #ifdef CONFIG_FAILSLAB static ssize_t failslab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); + return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } SLAB_ATTR_RO(failslab); #endif @@ -5308,7 +5310,7 @@ SLAB_ATTR(shrink); #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10); + return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10); } static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, @@ -5335,7 +5337,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) { unsigned long sum = 0; int 
cpu; - int len; + int len = 0; int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL); if (!data) @@ -5348,16 +5350,19 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) sum += x; } - len = sprintf(buf, "%lu", sum); + len += sysfs_emit_at(buf, len, "%lu", sum); #ifdef CONFIG_SMP for_each_online_cpu(cpu) { - if (data[cpu] && len < PAGE_SIZE - 20) - len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]); + if (data[cpu]) + len += sysfs_emit_at(buf, len, " C%d=%u", + cpu, data[cpu]); } #endif kfree(data); - return len + sprintf(buf + len, "\n"); + len += sysfs_emit_at(buf, len, "\n"); + + return len; } static void clear_stat(struct kmem_cache *s, enum stat_item si) diff --git a/mm/swap.c b/mm/swap.c index 47a47681c86b..2cca7141470c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = { static void __page_cache_release(struct page *page) { if (PageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; unsigned long flags; - spin_lock_irqsave(&pgdat->lru_lock, flags); - lruvec = mem_cgroup_page_lruvec(page, pgdat); + lruvec = lock_page_lruvec_irqsave(page, &flags); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); } @@ -204,63 +202,46 @@ int get_kernel_page(unsigned long start, int write, struct page **pages) EXPORT_SYMBOL_GPL(get_kernel_page); static void pagevec_lru_move_fn(struct pagevec *pvec, - void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), - void *arg) + void (*move_fn)(struct page *page, struct lruvec *lruvec)) { int i; - struct pglist_data *pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct pglist_data 
*pagepgdat = page_pgdat(page); - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); - pgdat = pagepgdat; - spin_lock_irqsave(&pgdat->lru_lock, flags); - } + /* block memcg migration during page moving between lru */ + if (!TestClearPageLRU(page)) + continue; - lruvec = mem_cgroup_page_lruvec(page, pgdat); - (*move_fn)(page, lruvec, arg); + lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); + (*move_fn)(page, lruvec); + + SetPageLRU(page); } - if (pgdat) - spin_unlock_irqrestore(&pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } -static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec, - void *arg) +static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) { - int *pgmoved = arg; - - if (PageLRU(page) && !PageUnevictable(page)) { + if (!PageUnevictable(page)) { del_page_from_lru_list(page, lruvec, page_lru(page)); ClearPageActive(page); add_page_to_lru_list_tail(page, lruvec, page_lru(page)); - (*pgmoved) += thp_nr_pages(page); + __count_vm_events(PGROTATED, thp_nr_pages(page)); } } /* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. - */ -static void pagevec_move_tail(struct pagevec *pvec) -{ - int pgmoved = 0; - - pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); - __count_vm_events(PGROTATED, pgmoved); -} - -/* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. + * + * rotate_reclaimable_page() must disable IRQs, to prevent nasty races. 
*/ void rotate_reclaimable_page(struct page *page) { @@ -273,7 +254,7 @@ void rotate_reclaimable_page(struct page *page) local_lock_irqsave(&lru_rotate.lock, flags); pvec = this_cpu_ptr(&lru_rotate.pvec); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_move_tail(pvec); + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } } @@ -283,6 +264,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) do { unsigned long lrusize; + /* + * Hold lruvec->lru_lock is safe here, since + * 1) The pinned lruvec in reclaim, or + * 2) From a pre-LRU page during refault (which also holds the + * rcu lock, so would be safe even if the page was on the LRU + * and could move simultaneously to a new lruvec). + */ + spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) lruvec->file_cost += nr_pages; @@ -306,6 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) lruvec->file_cost /= 2; lruvec->anon_cost /= 2; } + spin_unlock_irq(&lruvec->lru_lock); } while ((lruvec = parent_lruvec(lruvec))); } @@ -315,10 +305,9 @@ void lru_note_cost_page(struct page *page) page_is_file_lru(page), thp_nr_pages(page)); } -static void __activate_page(struct page *page, struct lruvec *lruvec, - void *arg) +static void __activate_page(struct page *page, struct lruvec *lruvec) { - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + if (!PageActive(page) && !PageUnevictable(page)) { int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); @@ -340,7 +329,7 @@ static void activate_page_drain(int cpu) struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, __activate_page, NULL); + pagevec_lru_move_fn(pvec, __activate_page); } static bool need_activate_page_drain(int cpu) @@ -358,7 +347,7 @@ static void activate_page(struct page *page) pvec = this_cpu_ptr(&lru_pvecs.activate_page); 
get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, __activate_page, NULL); + pagevec_lru_move_fn(pvec, __activate_page); local_unlock(&lru_pvecs.lock); } } @@ -370,12 +359,15 @@ static inline void activate_page_drain(int cpu) static void activate_page(struct page *page) { - pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; page = compound_head(page); - spin_lock_irq(&pgdat->lru_lock); - __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL); - spin_unlock_irq(&pgdat->lru_lock); + if (TestClearPageLRU(page)) { + lruvec = lock_page_lruvec_irq(page); + __activate_page(page, lruvec); + unlock_page_lruvec_irq(lruvec); + SetPageLRU(page); + } } #endif @@ -525,16 +517,12 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, * be write it out by flusher threads as this is much more effective * than the single-page writeout from reclaim. */ -static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, - void *arg) +static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) { int lru; bool active; int nr_pages = thp_nr_pages(page); - if (!PageLRU(page)) - return; - if (PageUnevictable(page)) return; @@ -573,10 +561,9 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, } } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, - void *arg) +static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { + if (PageActive(page) && !PageUnevictable(page)) { int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); @@ -591,10 +578,9 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, } } -static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, - void *arg) +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) { - if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + if 
(PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { bool active = PageActive(page); int nr_pages = thp_nr_pages(page); @@ -636,21 +622,21 @@ void lru_add_drain_cpu(int cpu) /* No harm done if a racing interrupt already did this */ local_lock_irqsave(&lru_rotate.lock, flags); - pagevec_move_tail(pvec); + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn); local_unlock_irqrestore(&lru_rotate.lock, flags); } pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_deactivate_fn); pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn); activate_page_drain(cpu); } @@ -679,7 +665,7 @@ void deactivate_file_page(struct page *page) pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); + pagevec_lru_move_fn(pvec, lru_deactivate_file_fn); local_unlock(&lru_pvecs.lock); } } @@ -701,7 +687,7 @@ void deactivate_page(struct page *page) pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_deactivate_fn); local_unlock(&lru_pvecs.lock); } } @@ -723,7 +709,7 @@ void mark_page_lazyfree(struct page *page) pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn); local_unlock(&lru_pvecs.lock); } } @@ -871,8 +857,7 @@ void release_pages(struct page 
**pages, int nr) { int i; LIST_HEAD(pages_to_free); - struct pglist_data *locked_pgdat = NULL; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags; unsigned int lock_batch; @@ -882,11 +867,11 @@ void release_pages(struct page **pages, int nr) /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the - * same pgdat. The lock is held only if pgdat != NULL. + * same lruvec. The lock is held only if lruvec != NULL. */ - if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } page = compound_head(page); @@ -894,10 +879,9 @@ void release_pages(struct page **pages, int nr) continue; if (is_zone_device_page(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, - flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } /* * ZONE_DEVICE pages that return 'false' from @@ -909,33 +893,31 @@ void release_pages(struct page **pages, int nr) put_devmap_managed_page(page); continue; } + if (put_page_testzero(page)) + put_dev_pagemap(page->pgmap); + continue; } if (!put_page_testzero(page)) continue; if (PageCompound(page)) { - if (locked_pgdat) { - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); - locked_pgdat = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } __put_compound_page(page); continue; } if (PageLRU(page)) { - struct pglist_data *pgdat = page_pgdat(page); + struct lruvec *prev_lruvec = lruvec; - if (pgdat != locked_pgdat) { - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, - flags); + lruvec = relock_page_lruvec_irqsave(page, lruvec, + &flags); + if (prev_lruvec != lruvec) lock_batch = 0; - locked_pgdat = pgdat; - 
spin_lock_irqsave(&locked_pgdat->lru_lock, flags); - } - lruvec = mem_cgroup_page_lruvec(page, locked_pgdat); VM_BUG_ON_PAGE(!PageLRU(page), page); __ClearPageLRU(page); del_page_from_lru_list(page, lruvec, page_off_lru(page)); @@ -945,8 +927,8 @@ void release_pages(struct page **pages, int nr) list_add(&page->lru, &pages_to_free); } - if (locked_pgdat) - spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); mem_cgroup_uncharge_list(&pages_to_free); free_unref_page_list(&pages_to_free); @@ -974,41 +956,7 @@ void __pagevec_release(struct pagevec *pvec) } EXPORT_SYMBOL(__pagevec_release); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* used by __split_huge_page_refcount() */ -void lru_add_page_tail(struct page *page, struct page *page_tail, - struct lruvec *lruvec, struct list_head *list) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - VM_BUG_ON_PAGE(PageCompound(page_tail), page); - VM_BUG_ON_PAGE(PageLRU(page_tail), page); - lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock); - - if (!list) - SetPageLRU(page_tail); - - if (likely(PageLRU(page))) - list_add_tail(&page_tail->lru, &page->lru); - else if (list) { - /* page reclaim is reclaiming a huge page */ - get_page(page_tail); - list_add_tail(&page_tail->lru, list); - } else { - /* - * Head page has not yet been counted, as an hpage, - * so we must account for each subpage individually. - * - * Put page_tail on the list at the correct position - * so they all end up in order. 
- */ - add_page_to_lru_list_tail(page_tail, lruvec, - page_lru(page_tail)); - } -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, - void *arg) +static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) { enum lru_list lru; int was_unevictable = TestClearPageUnevictable(page); @@ -1067,7 +1015,20 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, */ void __pagevec_lru_add(struct pagevec *pvec) { - pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); + int i; + struct lruvec *lruvec = NULL; + unsigned long flags = 0; + + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags); + __pagevec_lru_add_fn(page, lruvec); + } + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + release_pages(pvec->pages, pvec->nr); + pagevec_reinit(pvec); } /** @@ -1164,15 +1125,6 @@ unsigned pagevec_lookup_range_tag(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range_tag); -unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, pgoff_t end, - xa_mark_t tag, unsigned max_pages) -{ - pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); - return pagevec_count(pvec); -} -EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ diff --git a/mm/swap_state.c b/mm/swap_state.c index ee465827420e..751c1ef2fe0e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -839,7 +839,9 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, swp_entry_t entry; unsigned int i; bool page_allocated; - struct vma_swap_readahead ra_info = {0,}; + struct vma_swap_readahead ra_info = { + .win = 1, + }; swap_ra_info(vmf, &ra_info); if (ra_info.win == 1) @@ -900,7 +902,8 @@ struct page *swapin_readahead(swp_entry_t entry, 
gfp_t gfp_mask, static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false"); + return sysfs_emit(buf, "%s\n", + enable_vma_readahead ? "true" : "false"); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, diff --git a/mm/swapfile.c b/mm/swapfile.c index d58361109066..9fffc5af29d1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -975,8 +975,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) { unsigned long idx; struct swap_cluster_info *ci; - unsigned long offset, i; - unsigned char *map; + unsigned long offset; /* * Should not even be attempting cluster allocations when huge @@ -996,9 +995,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) alloc_cluster(si, idx); cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) - map[i] = SWAP_HAS_CACHE; + memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER); unlock_cluster(ci); swap_range_alloc(si, offset, SWAPFILE_CLUSTER); *slot = swp_entry(si->type, offset); @@ -1045,16 +1042,18 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) /* Only single cluster request supported */ WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); + spin_lock(&swap_avail_lock); + avail_pgs = atomic_long_read(&nr_swap_pages) / size; - if (avail_pgs <= 0) + if (avail_pgs <= 0) { + spin_unlock(&swap_avail_lock); goto noswap; + } n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs); atomic_long_sub(n_goal * size, &nr_swap_pages); - spin_lock(&swap_avail_lock); - start_over: node = numa_node_id(); plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { @@ -1128,14 +1127,13 @@ swp_entry_t get_swap_page_of_type(int type) spin_lock(&si->lock); if (si->flags & SWP_WRITEOK) { - 
atomic_long_dec(&nr_swap_pages); /* This is called for allocating swap entry, not cache */ offset = scan_swap_map(si, 1); if (offset) { + atomic_long_dec(&nr_swap_pages); spin_unlock(&si->lock); return swp_entry(type, offset); } - atomic_long_inc(&nr_swap_pages); } spin_unlock(&si->lock); fail: @@ -3445,11 +3443,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unsigned long offset; unsigned char count; unsigned char has_cache; - int err = -EINVAL; + int err; p = get_swap_device(entry); if (!p) - goto out; + return -EINVAL; offset = swp_offset(entry); ci = lock_cluster_or_swap_info(p, offset); @@ -3496,7 +3494,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); -out: if (p) put_swap_device(p); return err; @@ -3613,7 +3610,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) ci = lock_cluster(si, offset); - count = si->swap_map[offset] & ~SWAP_HAS_CACHE; + count = swap_count(si->swap_map[offset]); if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { /* diff --git a/mm/truncate.c b/mm/truncate.c index 960edf5803ca..8aa4907e06e0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -637,9 +637,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, EXPORT_SYMBOL(invalidate_mapping_pages); /** - * This helper is similar with the above one, except that it accounts for pages - * that are likely on a pagevec and count them in @nr_pagevec, which will used by - * the caller. 
+ * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * @nr_pagevec: invalidate failed page number for caller + * + * This helper is similar to invalidate_mapping_pages(), except that it accounts + * for pages that are likely on a pagevec and counts them in @nr_pagevec, which + * will be used by the caller. */ void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) diff --git a/mm/util.c b/mm/util.c index 4ddb6e186dd5..8c9b7d1e7c49 100644 --- a/mm/util.c +++ b/mm/util.c @@ -311,6 +311,18 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t)); } +/* + * Change backing file, only valid to use during initial VMA setup. + */ +void vma_set_file(struct vm_area_struct *vma, struct file *file) +{ + /* Changing an anonymous vma with this is illegal */ + get_file(file); + swap(vma->vm_file, file); + fput(file); +} +EXPORT_SYMBOL(vma_set_file); + #ifndef STACK_RND_MASK #define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ #endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6ae491a8b210..4d88fe5a277a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -413,10 +413,13 @@ static DEFINE_SPINLOCK(vmap_area_lock); static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; static bool vmap_initialized __read_mostly; +static struct rb_root purge_vmap_area_root = RB_ROOT; +static LIST_HEAD(purge_vmap_area_list); +static DEFINE_SPINLOCK(purge_vmap_area_lock); + /* * This kmem_cache is used for vmap_area objects. 
Instead of * allocating from slab we reuse an object from this cache to @@ -820,10 +823,17 @@ insert: if (!merged) link_va(va, root, parent, link, head); - /* - * Last step is to check and update the tree. - */ - augment_tree_propagate_from(va); + return va; +} + +static __always_inline struct vmap_area * +merge_or_add_vmap_area_augment(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + va = merge_or_add_vmap_area(va, root, head); + if (va) + augment_tree_propagate_from(va); + return va; } @@ -1138,7 +1148,7 @@ static void free_vmap_area(struct vmap_area *va) * Insert/Merge it back to the free tree/list. */ spin_lock(&free_vmap_area_lock); - merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); + merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list); spin_unlock(&free_vmap_area_lock); } @@ -1326,32 +1336,32 @@ void set_iounmap_nonlazy(void) static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; - struct llist_node *valist; - struct vmap_area *va; - struct vmap_area *n_va; + struct list_head local_pure_list; + struct vmap_area *va, *n_va; lockdep_assert_held(&vmap_purge_lock); - valist = llist_del_all(&vmap_purge_list); - if (unlikely(valist == NULL)) + spin_lock(&purge_vmap_area_lock); + purge_vmap_area_root = RB_ROOT; + list_replace_init(&purge_vmap_area_list, &local_pure_list); + spin_unlock(&purge_vmap_area_lock); + + if (unlikely(list_empty(&local_pure_list))) return false; - /* - * TODO: to calculate a flush range without looping. - * The list can be up to lazy_max_pages() elements. 
- */ - llist_for_each_entry(va, valist, purge_list) { - if (va->va_start < start) - start = va->va_start; - if (va->va_end > end) - end = va->va_end; - } + start = min(start, + list_first_entry(&local_pure_list, + struct vmap_area, list)->va_start); + + end = max(end, + list_last_entry(&local_pure_list, + struct vmap_area, list)->va_end); flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; spin_lock(&free_vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) { + list_for_each_entry_safe(va, n_va, &local_pure_list, list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; unsigned long orig_start = va->va_start; unsigned long orig_end = va->va_end; @@ -1361,8 +1371,8 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) * detached and there is no need to "unlink" it from * anything. */ - va = merge_or_add_vmap_area(va, &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root, + &free_vmap_area_list); if (!va) continue; @@ -1419,9 +1429,15 @@ static void free_vmap_area_noflush(struct vmap_area *va) nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - /* After this point, we may free va at any time */ - llist_add(&va->purge_list, &vmap_purge_list); + /* + * Merge or place it to the purge tree/list. 
+ */ + spin_lock(&purge_vmap_area_lock); + merge_or_add_vmap_area(va, + &purge_vmap_area_root, &purge_vmap_area_list); + spin_unlock(&purge_vmap_area_lock); + /* After this point, we may free va at any time */ if (unlikely(nr_lazy > lazy_max_pages())) try_purge_vmap_area_lazy(); } @@ -2256,7 +2272,7 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - kasan_poison_vmalloc(area->addr, area->size); + kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); vm_remove_mappings(area, deallocate_pages); @@ -2275,7 +2291,6 @@ static void __vunmap(const void *addr, int deallocate_pages) } kfree(area); - return; } static inline void __vfree_deferred(const void *addr) @@ -2461,9 +2476,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; - unsigned int array_size = nr_pages * sizeof(struct page *), i; + unsigned long array_size; + unsigned int i; struct page **pages; + array_size = (unsigned long)nr_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -2477,8 +2494,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } if (!pages) { - remove_vm_area(area->addr); - kfree(area); + free_vm_area(area); return NULL; } @@ -3134,6 +3150,7 @@ pvm_find_va_enclose_addr(unsigned long addr) * @va: * in - the VA we start the search(reverse order); * out - the VA with the highest aligned end address. 
+ * @align: alignment for required highest address * * Returns: determined end address within vmap_area */ @@ -3350,8 +3367,8 @@ recovery: while (area--) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; - va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, + &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); @@ -3400,8 +3417,8 @@ err_free_shadow: for (area = 0; area < nr_vms; area++) { orig_start = vas[area]->va_start; orig_end = vas[area]->va_end; - va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, - &free_vmap_area_list); + va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root, + &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, va->va_start, va->va_end); @@ -3448,11 +3465,11 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) - __releases(&vmap_purge_lock) __releases(&vmap_area_lock) + __releases(&vmap_purge_lock) { - mutex_unlock(&vmap_purge_lock); spin_unlock(&vmap_area_lock); + mutex_unlock(&vmap_purge_lock); } static void show_numa_info(struct seq_file *m, struct vm_struct *v) @@ -3481,18 +3498,15 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static void show_purge_info(struct seq_file *m) { - struct llist_node *head; struct vmap_area *va; - head = READ_ONCE(vmap_purge_list.first); - if (head == NULL) - return; - - llist_for_each_entry(va, head, purge_list) { + spin_lock(&purge_vmap_area_lock); + list_for_each_entry(va, &purge_vmap_area_list, list) { seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); } + spin_unlock(&purge_vmap_area_lock); } static int s_show(struct seq_file *m, void *p) @@ -3550,10 +3564,7 @@ static int s_show(struct seq_file *m, void *p) seq_putc(m, '\n'); /* - * 
As a final step, dump "unpurged" areas. Note, - * that entire "/proc/vmallocinfo" output will not - * be address sorted, because the purge list is not - * sorted. + * As a final step, dump "unpurged" areas. */ if (list_is_last(&va->list, &vmap_area_list)) show_purge_info(m); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7b4e31eac2cf..257cba79a96d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * linux/mm/vmscan.c - * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * * Swap reorganised 29.12.95, Stephen Tweedie. @@ -1072,7 +1070,6 @@ static void page_check_dirty_writeback(struct page *page, static unsigned int shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, - enum ttu_flags ttu_flags, struct reclaim_stat *stat, bool ignore_references) { @@ -1297,7 +1294,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page)) { - enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH; + enum ttu_flags flags = TTU_BATCH_FLUSH; bool was_swapbacked = PageSwapBacked(page); if (unlikely(PageTransHuge(page))) @@ -1372,6 +1369,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, if (PageDirty(page) || PageWriteback(page)) goto keep_locked; mapping = page_mapping(page); + fallthrough; case PAGE_CLEAN: ; /* try to free the page below */ } @@ -1393,7 +1391,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, * * Rarely, pages can have buffers and no ->mapping. These are * the pages which were not successfully invalidated in - * truncate_complete_page(). We try to drop those buffers here + * truncate_cleanup_page(). We try to drop those buffers here * and if that worked, and the page is no longer mapped into * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. 
@@ -1514,7 +1512,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, } nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_IGNORE_ACCESS, &stat, true); + &stat, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -(long)nr_reclaimed); @@ -1541,9 +1539,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, isolate_mode_t mode) +int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) { - int ret = -EINVAL; + int ret = -EBUSY; /* Only take pages on the LRU. */ if (!PageLRU(page)) @@ -1553,8 +1551,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) return ret; - ret = -EBUSY; - /* * To minimise LRU disruption, the caller can indicate that it only * wants to isolate pages it will be able to operate on without @@ -1595,20 +1591,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) return ret; - if (likely(get_page_unless_zero(page))) { - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - ClearPageLRU(page); - ret = 0; - } - - return ret; + return 0; } - /* * Update LRU sizes after isolating pages. The LRU size updates must * be complete before mem_cgroup_update_lru_size due to a sanity check. @@ -1628,14 +1613,16 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, } /** - * pgdat->lru_lock is heavily contended. Some of the functions that + * Isolating page from the lruvec to fill in @dst list by nr_to_scan times. + * + * lruvec->lru_lock is heavily contended. Some of the functions that * shrink the lists perform better by taking out a batch of pages * and working on them outside the LRU lock. 
* * For pagecache intensive workloads, this function is the hottest * spot in the kernel (apart from copy_*_user functions). * - * Appropriate locks must be held before calling this function. + * Lru_lock must be held before calling this function. * * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. @@ -1668,8 +1655,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - VM_BUG_ON_PAGE(!PageLRU(page), page); - nr_pages = compound_nr(page); total_scan += nr_pages; @@ -1690,20 +1675,34 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * only when the page is being freed somewhere else. */ scan += nr_pages; - switch (__isolate_lru_page(page, mode)) { + switch (__isolate_lru_page_prepare(page, mode)) { case 0: + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) + goto busy; + + if (!TestClearPageLRU(page)) { + /* + * This page may in other isolation path, + * but we still hold lru_lock. 
+ */ + put_page(page); + goto busy; + } + nr_taken += nr_pages; nr_zone_taken[page_zonenum(page)] += nr_pages; list_move(&page->lru, dst); break; - case -EBUSY: + default: +busy: /* else it is being freed elsewhere */ list_move(&page->lru, src); - continue; - - default: - BUG(); } } @@ -1766,21 +1765,16 @@ int isolate_lru_page(struct page *page) VM_BUG_ON_PAGE(!page_count(page), page); WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"); - if (PageLRU(page)) { - pg_data_t *pgdat = page_pgdat(page); + if (TestClearPageLRU(page)) { struct lruvec *lruvec; - spin_lock_irq(&pgdat->lru_lock); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - if (PageLRU(page)) { - int lru = page_lru(page); - get_page(page); - ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, lru); - ret = 0; - } - spin_unlock_irq(&pgdat->lru_lock); + get_page(page); + lruvec = lock_page_lruvec_irq(page); + del_page_from_lru_list(page, lruvec, page_lru(page)); + unlock_page_lruvec_irq(lruvec); + ret = 0; } + return ret; } @@ -1822,29 +1816,14 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, } /* - * This moves pages from @list to corresponding LRU list. - * - * We move them the other way if the page is referenced by one or more - * processes, from rmap. - * - * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold zone_lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop zone_lru_lock around each page. It's impossible to balance - * this, so instead we remove the pages from the LRU while processing them. - * It is safe to rely on PG_active against the non-LRU pages in here because - * nobody will play with that bit on a non-LRU page. - * - * The downside is that we have to touch page->_refcount against each page. - * But we had to alter page->flags anyway. + * move_pages_to_lru() moves pages from private @list to appropriate LRU list. 
+ * On return, @list is reused as a list of pages to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ - static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, struct list_head *list) { - struct pglist_data *pgdat = lruvec_pgdat(lruvec); int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; @@ -1853,38 +1832,54 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, while (!list_empty(list)) { page = lru_to_page(list); VM_BUG_ON_PAGE(PageLRU(page), page); + list_del(&page->lru); if (unlikely(!page_evictable(page))) { - list_del(&page->lru); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); putback_lru_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, pgdat); + /* + * The SetPageLRU needs to be kept here for list integrity. + * Otherwise: + * #0 move_pages_to_lru #1 release_pages + * if !put_page_testzero + * if (put_page_testzero()) + * !PageLRU //skip lru_lock + * SetPageLRU() + * list_add(&page->lru,) + * list_add(&page->lru,) + */ SetPageLRU(page); - lru = page_lru(page); - nr_pages = thp_nr_pages(page); - update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); - list_move(&page->lru, &lruvec->lists[lru]); - - if (put_page_testzero(page)) { + if (unlikely(put_page_testzero(page))) { __ClearPageLRU(page); __ClearPageActive(page); - del_page_from_lru_list(page, lruvec, lru); if (unlikely(PageCompound(page))) { - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); destroy_compound_page(page); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); } else list_add(&page->lru, &pages_to_free); - } else { - nr_moved += nr_pages; - if (PageActive(page)) - workingset_age_nonresident(lruvec, nr_pages); + + continue; } + + /* + * All pages were isolated from the same lruvec (and isolation + * inhibits memcg migration). 
+ */ + VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page); + lru = page_lru(page); + nr_pages = thp_nr_pages(page); + + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); + list_add(&page->lru, &lruvec->lists[lru]); + nr_moved += nr_pages; + if (PageActive(page)) + workingset_age_nonresident(lruvec, nr_pages); } /* @@ -1941,7 +1936,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, lru); @@ -1953,28 +1948,25 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, - &stat, false); - - spin_lock_irq(&pgdat->lru_lock); + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); + spin_lock_irq(&lruvec->lru_lock); move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - lru_note_cost(lruvec, file, stat.nr_pageout); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); + spin_unlock_irq(&lruvec->lru_lock); - spin_unlock_irq(&pgdat->lru_lock); - + lru_note_cost(lruvec, file, stat.nr_pageout); mem_cgroup_uncharge_list(&page_list); free_unref_page_list(&page_list); @@ -2006,6 +1998,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return nr_reclaimed; } +/* + * shrink_active_list() moves pages from the active LRU to the inactive LRU. 
+ * + * We move them the other way if the page is referenced by one or more + * processes. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()), so + * we should drop lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_refcount against each page. + * But we had to alter page->flags anyway. + */ static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, @@ -2025,7 +2034,7 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); @@ -2036,7 +2045,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_vm_events(PGREFILL, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); @@ -2082,7 +2091,7 @@ static void shrink_active_list(unsigned long nr_to_scan, /* * Move pages back to the lru list. 
*/ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&lruvec->lru_lock); nr_activate = move_pages_to_lru(lruvec, &l_active); nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); @@ -2093,7 +2102,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&lruvec->lru_lock); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); @@ -2131,8 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list) nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), - &sc, 0, - &dummy_stat, false); + &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); @@ -2145,8 +2153,7 @@ unsigned long reclaim_pages(struct list_head *page_list) if (!list_empty(&node_page_list)) { nr_reclaimed += shrink_page_list(&node_page_list, NODE_DATA(nid), - &sc, 0, - &dummy_stat, false); + &sc, &dummy_stat, false); while (!list_empty(&node_page_list)) { page = lru_to_page(&node_page_list); list_del(&page->lru); @@ -2683,10 +2690,10 @@ again: /* * Determine the scan balance between anon and file LRUs. 
*/ - spin_lock_irq(&pgdat->lru_lock); + spin_lock_irq(&target_lruvec->lru_lock); sc->anon_cost = target_lruvec->anon_cost; sc->file_cost = target_lruvec->file_cost; - spin_unlock_irq(&pgdat->lru_lock); + spin_unlock_irq(&target_lruvec->lru_lock); /* * Target desirable inactive:active list ratios for the anon @@ -3899,7 +3906,7 @@ kswapd_try_sleep: highest_zoneidx); /* Read the new order and highest_zoneidx */ - alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); + alloc_order = READ_ONCE(pgdat->kswapd_order); highest_zoneidx = kswapd_highest_zoneidx(pgdat, highest_zoneidx); WRITE_ONCE(pgdat->kswapd_order, 0); @@ -4262,15 +4269,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) */ void check_move_unevictable_pages(struct pagevec *pvec) { - struct lruvec *lruvec; - struct pglist_data *pgdat = NULL; + struct lruvec *lruvec = NULL; int pgscanned = 0; int pgrescued = 0; int i; for (i = 0; i < pvec->nr; i++) { struct page *page = pvec->pages[i]; - struct pglist_data *pagepgdat = page_pgdat(page); int nr_pages; if (PageTransTail(page)) @@ -4279,18 +4284,12 @@ void check_move_unevictable_pages(struct pagevec *pvec) nr_pages = thp_nr_pages(page); pgscanned += nr_pages; - if (pagepgdat != pgdat) { - if (pgdat) - spin_unlock_irq(&pgdat->lru_lock); - pgdat = pagepgdat; - spin_lock_irq(&pgdat->lru_lock); - } - lruvec = mem_cgroup_page_lruvec(page, pgdat); - - if (!PageLRU(page) || !PageUnevictable(page)) + /* block memcg migration during page moving between lru */ + if (!TestClearPageLRU(page)) continue; - if (page_evictable(page)) { + lruvec = relock_page_lruvec_irq(page, lruvec); + if (page_evictable(page) && PageUnevictable(page)) { enum lru_list lru = page_lru_base_type(page); VM_BUG_ON_PAGE(PageActive(page), page); @@ -4299,12 +4298,15 @@ void check_move_unevictable_pages(struct pagevec *pvec) add_page_to_lru_list(page, lruvec, lru); pgrescued += nr_pages; } + SetPageLRU(page); } - if (pgdat) { + if (lruvec) { 
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); - spin_unlock_irq(&pgdat->lru_lock); + unlock_page_lruvec_irq(lruvec); + } else if (pgscanned) { + count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned); } } EXPORT_SYMBOL_GPL(check_move_unevictable_pages); diff --git a/mm/vmstat.c b/mm/vmstat.c index 698bc0bc18d1..f8942160fc95 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1157,7 +1157,6 @@ const char * const vmstat_text[] = { "nr_zone_unevictable", "nr_zone_write_pending", "nr_mlock", - "nr_page_table_pages", "nr_bounce", #if IS_ENABLED(CONFIG_ZSMALLOC) "nr_zspages", @@ -1215,6 +1214,7 @@ const char * const vmstat_text[] = { #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) "nr_shadow_call_stack", #endif + "nr_page_table_pages", /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1503,10 +1503,6 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!page) continue; - /* Watch for unexpected holes punched in the memmap */ - if (!memmap_valid_within(pfn, page, zone)) - continue; - if (page_zone(page) != zone) continue; diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..10e96de945b3 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) struct lruvec *lruvec; int memcgid; - /* Page is fully exclusive and pins page->mem_cgroup */ + /* Page is fully exclusive and pins page's memory cgroup pointer */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -381,9 +381,7 @@ void workingset_refault(struct page *page, void *shadow) if (workingset) { SetPageWorkingset(page); /* XXX: Move to lru_cache_add() when it supports new vs putback */ - spin_lock_irq(&page_pgdat(page)->lru_lock); lru_note_cost_page(page); - spin_unlock_irq(&page_pgdat(page)->lru_lock); inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file); } out: @@ 
-445,12 +443,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_slab_state(node, WORKINGSET_NODES); + __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } } @@ -544,7 +542,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -559,7 +557,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); - __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); + __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); diff --git a/mm/z3fold.c b/mm/z3fold.c index 18feaa0bc537..dacb0d70fa61 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -90,7 +90,7 @@ struct z3fold_buddy_slots { * be enough slots to hold all possible variants */ unsigned long slot[BUDDY_MASK + 1]; - unsigned long pool; /* back link + flags */ + unsigned long pool; /* back link */ rwlock_t lock; }; #define HANDLE_FLAG_MASK (0x03) @@ -185,7 +185,7 @@ enum z3fold_page_flags { * handle flags, go under HANDLE_FLAG_MASK */ enum z3fold_handle_flags { - HANDLES_ORPHANED = 0, + HANDLES_NOFREE = 0, }; /* @@ -303,10 +303,9 @@ static inline void put_z3fold_header(struct z3fold_header *zhdr) z3fold_page_unlock(zhdr); } -static inline void free_handle(unsigned long handle) +static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr) { struct z3fold_buddy_slots *slots; - struct z3fold_header *zhdr; int i; bool 
is_free; @@ -316,22 +315,19 @@ static inline void free_handle(unsigned long handle) if (WARN_ON(*(unsigned long *)handle == 0)) return; - zhdr = handle_to_z3fold_header(handle); slots = handle_to_slots(handle); write_lock(&slots->lock); *(unsigned long *)handle = 0; - if (zhdr->slots == slots) { + + if (test_bit(HANDLES_NOFREE, &slots->pool)) { write_unlock(&slots->lock); return; /* simple case, nothing else to do */ } - /* we are freeing a foreign handle if we are here */ - zhdr->foreign_handles--; + if (zhdr->slots != slots) + zhdr->foreign_handles--; + is_free = true; - if (!test_bit(HANDLES_ORPHANED, &slots->pool)) { - write_unlock(&slots->lock); - return; - } for (i = 0; i <= BUDDY_MASK; i++) { if (slots->slot[i]) { is_free = false; @@ -343,6 +339,8 @@ static inline void free_handle(unsigned long handle) if (is_free) { struct z3fold_pool *pool = slots_to_pool(slots); + if (zhdr->slots == slots) + zhdr->slots = NULL; kmem_cache_free(pool->c_handle, slots); } } @@ -525,8 +523,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) { struct page *page = virt_to_page(zhdr); struct z3fold_pool *pool = zhdr_to_pool(zhdr); - bool is_free = true; - int i; WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); @@ -536,21 +532,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) list_del_init(&page->lru); spin_unlock(&pool->lock); - /* If there are no foreign handles, free the handles array */ - read_lock(&zhdr->slots->lock); - for (i = 0; i <= BUDDY_MASK; i++) { - if (zhdr->slots->slot[i]) { - is_free = false; - break; - } - } - if (!is_free) - set_bit(HANDLES_ORPHANED, &zhdr->slots->pool); - read_unlock(&zhdr->slots->lock); - - if (is_free) - kmem_cache_free(pool->c_handle, zhdr->slots); - if (locked) z3fold_page_unlock(zhdr); @@ -642,15 +623,39 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool, { if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || zhdr->middle_chunks == 0) { - 
struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); - + struct list_head *unbuddied; int freechunks = num_free_chunks(zhdr); + + migrate_disable(); + unbuddied = this_cpu_ptr(pool->unbuddied); spin_lock(&pool->lock); list_add(&zhdr->buddy, &unbuddied[freechunks]); spin_unlock(&pool->lock); zhdr->cpu = smp_processor_id(); - put_cpu_ptr(pool->unbuddied); + migrate_enable(); + } +} + +static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks) +{ + enum buddy bud = HEADLESS; + + if (zhdr->middle_chunks) { + if (!zhdr->first_chunks && + chunks <= zhdr->start_middle - ZHDR_CHUNKS) + bud = FIRST; + else if (!zhdr->last_chunks) + bud = LAST; + } else { + if (!zhdr->first_chunks) + bud = FIRST; + else if (!zhdr->last_chunks) + bud = LAST; + else + bud = MIDDLE; } + + return bud; } static inline void *mchunk_memmove(struct z3fold_header *zhdr, @@ -714,18 +719,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) if (WARN_ON(new_zhdr == zhdr)) goto out_fail; - if (new_zhdr->first_chunks == 0) { - if (new_zhdr->middle_chunks != 0 && - chunks >= new_zhdr->start_middle) { - new_bud = LAST; - } else { - new_bud = FIRST; - } - } else if (new_zhdr->last_chunks == 0) { - new_bud = LAST; - } else if (new_zhdr->middle_chunks == 0) { - new_bud = MIDDLE; - } + new_bud = get_free_buddy(new_zhdr, chunks); q = new_zhdr; switch (new_bud) { case FIRST: @@ -847,9 +841,8 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) return; } - if (unlikely(PageIsolated(page) || - test_bit(PAGE_CLAIMED, &page->private) || - test_bit(PAGE_STALE, &page->private))) { + if (test_bit(PAGE_STALE, &page->private) || + test_and_set_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); return; } @@ -858,13 +851,16 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) 
atomic64_dec(&pool->pages_nr); - else + else { + clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); + } return; } z3fold_compact_page(zhdr); add_to_unbuddied(pool, zhdr); + clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } @@ -886,8 +882,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, int chunks = size_to_chunks(size), i; lookup: + migrate_disable(); /* First, try to find an unbuddied z3fold page. */ - unbuddied = get_cpu_ptr(pool->unbuddied); + unbuddied = this_cpu_ptr(pool->unbuddied); for_each_unbuddied_list(i, chunks) { struct list_head *l = &unbuddied[i]; @@ -905,7 +902,7 @@ lookup: !z3fold_page_trylock(zhdr)) { spin_unlock(&pool->lock); zhdr = NULL; - put_cpu_ptr(pool->unbuddied); + migrate_enable(); if (can_sleep) cond_resched(); goto lookup; @@ -919,7 +916,7 @@ lookup: test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); zhdr = NULL; - put_cpu_ptr(pool->unbuddied); + migrate_enable(); if (can_sleep) cond_resched(); goto lookup; @@ -934,7 +931,7 @@ lookup: kref_get(&zhdr->refcount); break; } - put_cpu_ptr(pool->unbuddied); + migrate_enable(); if (!zhdr) { int cpu; @@ -973,6 +970,9 @@ lookup: } } + if (zhdr && !zhdr->slots) + zhdr->slots = alloc_slots(pool, + can_sleep ? 
GFP_NOIO : GFP_ATOMIC); return zhdr; } @@ -1109,17 +1109,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, retry: zhdr = __z3fold_alloc(pool, size, can_sleep); if (zhdr) { - if (zhdr->first_chunks == 0) { - if (zhdr->middle_chunks != 0 && - chunks >= zhdr->start_middle) - bud = LAST; - else - bud = FIRST; - } else if (zhdr->last_chunks == 0) - bud = LAST; - else if (zhdr->middle_chunks == 0) - bud = MIDDLE; - else { + bud = get_free_buddy(zhdr, chunks); + if (bud == HEADLESS) { if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) atomic64_dec(&pool->pages_nr); @@ -1265,12 +1256,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) pr_err("%s: unknown bud %d\n", __func__, bud); WARN_ON(1); put_z3fold_header(zhdr); - clear_bit(PAGE_CLAIMED, &page->private); return; } if (!page_claimed) - free_handle(handle); + free_handle(handle, zhdr); if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { atomic64_dec(&pool->pages_nr); return; @@ -1280,8 +1270,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) z3fold_page_unlock(zhdr); return; } - if (unlikely(PageIsolated(page)) || - test_and_set_bit(NEEDS_COMPACTING, &page->private)) { + if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { put_z3fold_header(zhdr); clear_bit(PAGE_CLAIMED, &page->private); return; @@ -1345,6 +1334,10 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) struct page *page = NULL; struct list_head *pos; unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; + struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN))); + + rwlock_init(&slots.lock); + slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE); spin_lock(&pool->lock); if (!pool->ops || !pool->ops->evict || retries == 0) { @@ -1359,35 +1352,36 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) list_for_each_prev(pos, &pool->lru) { page = list_entry(pos, 
struct page, lru); - /* this bit could have been set by free, in which case - * we pass over to the next page in the pool. - */ - if (test_and_set_bit(PAGE_CLAIMED, &page->private)) { - page = NULL; - continue; - } - - if (unlikely(PageIsolated(page))) { - clear_bit(PAGE_CLAIMED, &page->private); - page = NULL; - continue; - } zhdr = page_address(page); if (test_bit(PAGE_HEADLESS, &page->private)) break; + if (kref_get_unless_zero(&zhdr->refcount) == 0) { + zhdr = NULL; + break; + } if (!z3fold_page_trylock(zhdr)) { - clear_bit(PAGE_CLAIMED, &page->private); + if (kref_put(&zhdr->refcount, + release_z3fold_page)) + atomic64_dec(&pool->pages_nr); zhdr = NULL; continue; /* can't evict at this point */ } - if (zhdr->foreign_handles) { - clear_bit(PAGE_CLAIMED, &page->private); - z3fold_page_unlock(zhdr); + + /* test_and_set_bit is of course atomic, but we still + * need to do it under page lock, otherwise checking + * that bit in __z3fold_alloc wouldn't make sense + */ + if (zhdr->foreign_handles || + test_and_set_bit(PAGE_CLAIMED, &page->private)) { + if (kref_put(&zhdr->refcount, + release_z3fold_page)) + atomic64_dec(&pool->pages_nr); + else + z3fold_page_unlock(zhdr); zhdr = NULL; continue; /* can't evict such page */ } - kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); zhdr->cpu = -1; break; @@ -1409,12 +1403,16 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) first_handle = 0; last_handle = 0; middle_handle = 0; + memset(slots.slot, 0, sizeof(slots.slot)); if (zhdr->first_chunks) - first_handle = encode_handle(zhdr, FIRST); + first_handle = __encode_handle(zhdr, &slots, + FIRST); if (zhdr->middle_chunks) - middle_handle = encode_handle(zhdr, MIDDLE); + middle_handle = __encode_handle(zhdr, &slots, + MIDDLE); if (zhdr->last_chunks) - last_handle = encode_handle(zhdr, LAST); + last_handle = __encode_handle(zhdr, &slots, + LAST); /* * it's safe to unlock here because we hold a * reference to this page @@ -1429,19 +1427,16 @@ 
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) ret = pool->ops->evict(pool, middle_handle); if (ret) goto next; - free_handle(middle_handle); } if (first_handle) { ret = pool->ops->evict(pool, first_handle); if (ret) goto next; - free_handle(first_handle); } if (last_handle) { ret = pool->ops->evict(pool, last_handle); if (ret) goto next; - free_handle(last_handle); } next: if (test_bit(PAGE_HEADLESS, &page->private)) { @@ -1455,9 +1450,11 @@ next: spin_unlock(&pool->lock); clear_bit(PAGE_CLAIMED, &page->private); } else { + struct z3fold_buddy_slots *slots = zhdr->slots; z3fold_page_lock(zhdr); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + kmem_cache_free(pool->c_handle, slots); atomic64_dec(&pool->pages_nr); return 0; } @@ -1573,8 +1570,7 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(PageIsolated(page), page); - if (test_bit(PAGE_HEADLESS, &page->private) || - test_bit(PAGE_CLAIMED, &page->private)) + if (test_bit(PAGE_HEADLESS, &page->private)) return false; zhdr = page_address(page); @@ -1586,6 +1582,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) goto out; + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + goto out; pool = zhdr_to_pool(zhdr); spin_lock(&pool->lock); if (!list_empty(&zhdr->buddy)) @@ -1612,16 +1610,17 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa VM_BUG_ON_PAGE(!PageMovable(page), page); VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); zhdr = page_address(page); pool = zhdr_to_pool(zhdr); - if (!z3fold_page_trylock(zhdr)) { + if (!z3fold_page_trylock(zhdr)) return -EAGAIN; - } if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) { z3fold_page_unlock(zhdr); + 
clear_bit(PAGE_CLAIMED, &page->private); return -EBUSY; } if (work_pending(&zhdr->work)) { @@ -1663,6 +1662,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); page_mapcount_reset(page); + clear_bit(PAGE_CLAIMED, &page->private); put_page(page); return 0; } @@ -1686,6 +1686,7 @@ static void z3fold_page_putback(struct page *page) spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); spin_unlock(&pool->lock); + clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index cdfaaadea8ff..7289f502ffac 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -726,13 +726,10 @@ static void insert_zspage(struct size_class *class, * We want to see more ZS_FULL pages and less almost empty/full. * Put pages with higher ->inuse first. */ - if (head) { - if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) { - list_add(&zspage->list, &head->list); - return; - } - } - list_add(&zspage->list, &class->fullness_list[fullness]); + if (head && get_zspage_inuse(zspage) < get_zspage_inuse(head)) + list_add(&zspage->list, &head->list); + else + list_add(&zspage->list, &class->fullness_list[fullness]); } /* diff --git a/mm/zswap.c b/mm/zswap.c index fbb782924ccc..182f6ad5aa69 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -24,8 +24,10 @@ #include <linux/rbtree.h> #include <linux/swap.h> #include <linux/crypto.h> +#include <linux/scatterlist.h> #include <linux/mempool.h> #include <linux/zpool.h> +#include <crypto/acompress.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -81,7 +83,7 @@ static bool zswap_pool_reached_full; static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); -static struct kernel_param_ops zswap_enabled_param_ops = { +static const struct kernel_param_ops zswap_enabled_param_ops = { .set = zswap_enabled_param_set, .get = 
param_get_bool, }; @@ -91,7 +93,7 @@ module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); -static struct kernel_param_ops zswap_compressor_param_ops = { +static const struct kernel_param_ops zswap_compressor_param_ops = { .set = zswap_compressor_param_set, .get = param_get_charp, .free = param_free_charp, @@ -102,7 +104,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops, /* Compressed storage zpool to use */ static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); -static struct kernel_param_ops zswap_zpool_param_ops = { +static const struct kernel_param_ops zswap_zpool_param_ops = { .set = zswap_zpool_param_set, .get = param_get_charp, .free = param_free_charp, @@ -127,9 +129,17 @@ module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, * data structures **********************************/ +struct crypto_acomp_ctx { + struct crypto_acomp *acomp; + struct acomp_req *req; + struct crypto_wait wait; + u8 *dstmem; + struct mutex *mutex; +}; + struct zswap_pool { struct zpool *zpool; - struct crypto_comp * __percpu *tfm; + struct crypto_acomp_ctx __percpu *acomp_ctx; struct kref kref; struct list_head list; struct work_struct release_work; @@ -388,23 +398,43 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, * per-cpu code **********************************/ static DEFINE_PER_CPU(u8 *, zswap_dstmem); +/* + * If users dynamically change the zpool type and compressor at runtime, i.e. + * zswap is running, zswap can have more than one zpool on one cpu, but they + * are sharing dtsmem. So we need this mutex to be per-cpu. 
+ */ +static DEFINE_PER_CPU(struct mutex *, zswap_mutex); static int zswap_dstmem_prepare(unsigned int cpu) { + struct mutex *mutex; u8 *dst; dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) return -ENOMEM; + mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu)); + if (!mutex) { + kfree(dst); + return -ENOMEM; + } + + mutex_init(mutex); per_cpu(zswap_dstmem, cpu) = dst; + per_cpu(zswap_mutex, cpu) = mutex; return 0; } static int zswap_dstmem_dead(unsigned int cpu) { + struct mutex *mutex; u8 *dst; + mutex = per_cpu(zswap_mutex, cpu); + kfree(mutex); + per_cpu(zswap_mutex, cpu) = NULL; + dst = per_cpu(zswap_dstmem, cpu); kfree(dst); per_cpu(zswap_dstmem, cpu) = NULL; @@ -415,30 +445,54 @@ static int zswap_dstmem_dead(unsigned int cpu) static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_comp *tfm; - - if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu))) - return 0; + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + struct crypto_acomp *acomp; + struct acomp_req *req; + + acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu)); + if (IS_ERR(acomp)) { + pr_err("could not alloc crypto acomp %s : %ld\n", + pool->tfm_name, PTR_ERR(acomp)); + return PTR_ERR(acomp); + } + acomp_ctx->acomp = acomp; - tfm = crypto_alloc_comp(pool->tfm_name, 0, 0); - if (IS_ERR_OR_NULL(tfm)) { - pr_err("could not alloc crypto comp %s : %ld\n", - pool->tfm_name, PTR_ERR(tfm)); + req = acomp_request_alloc(acomp_ctx->acomp); + if (!req) { + pr_err("could not alloc crypto acomp_request %s\n", + pool->tfm_name); + crypto_free_acomp(acomp_ctx->acomp); return -ENOMEM; } - *per_cpu_ptr(pool->tfm, cpu) = tfm; + acomp_ctx->req = req; + + crypto_init_wait(&acomp_ctx->wait); + /* + * if the backend of acomp is async zip, crypto_req_done() will wakeup + * crypto_wait_req(); if the backend of acomp is scomp, the callback + * 
won't be called, crypto_wait_req() will return without blocking. + */ + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &acomp_ctx->wait); + + acomp_ctx->mutex = per_cpu(zswap_mutex, cpu); + acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu); + return 0; } static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) { struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node); - struct crypto_comp *tfm; + struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu); + + if (!IS_ERR_OR_NULL(acomp_ctx)) { + if (!IS_ERR_OR_NULL(acomp_ctx->req)) + acomp_request_free(acomp_ctx->req); + if (!IS_ERR_OR_NULL(acomp_ctx->acomp)) + crypto_free_acomp(acomp_ctx->acomp); + } - tfm = *per_cpu_ptr(pool->tfm, cpu); - if (!IS_ERR_OR_NULL(tfm)) - crypto_free_comp(tfm); - *per_cpu_ptr(pool->tfm, cpu) = NULL; return 0; } @@ -561,8 +615,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); - pool->tfm = alloc_percpu(struct crypto_comp *); - if (!pool->tfm) { + + pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx); + if (!pool->acomp_ctx) { pr_err("percpu alloc failed\n"); goto error; } @@ -585,7 +640,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) return pool; error: - free_percpu(pool->tfm); + if (pool->acomp_ctx) + free_percpu(pool->acomp_ctx); if (pool->zpool) zpool_destroy_pool(pool->zpool); kfree(pool); @@ -596,14 +652,14 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void) { bool has_comp, has_zpool; - has_comp = crypto_has_comp(zswap_compressor, 0, 0); + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); if (!has_comp && strcmp(zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); 
zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; - has_comp = crypto_has_comp(zswap_compressor, 0, 0); + has_comp = crypto_has_acomp(zswap_compressor, 0, 0); } if (!has_comp) { pr_err("default compressor %s not available\n", @@ -639,7 +695,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool) zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); - free_percpu(pool->tfm); + free_percpu(pool->acomp_ctx); zpool_destroy_pool(pool->zpool); kfree(pool); } @@ -723,7 +779,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, } type = s; } else if (!compressor) { - if (!crypto_has_comp(s, 0, 0)) { + if (!crypto_has_acomp(s, 0, 0)) { pr_err("compressor %s not available\n", s); return -ENOENT; } @@ -774,7 +830,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp, * failed, maybe both compressor and zpool params were bad. * Allow changing this param, so pool creation will succeed * when the other param is changed. We already verified this - * param is ok in the zpool_has_pool() or crypto_has_comp() + * param is ok in the zpool_has_pool() or crypto_has_acomp() * checks above. 
*/ ret = param_set_charp(s, kp); @@ -876,8 +932,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) pgoff_t offset; struct zswap_entry *entry; struct page *page; - struct crypto_comp *tfm; - u8 *src, *dst; + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; + + u8 *src; unsigned int dlen; int ret; struct writeback_control wbc = { @@ -916,14 +974,20 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + dlen = PAGE_SIZE; src = (u8 *)zhdr + sizeof(struct zswap_header); - dst = kmap_atomic(page); - tfm = *get_cpu_ptr(entry->pool->tfm); - ret = crypto_comp_decompress(tfm, src, entry->length, - dst, &dlen); - put_cpu_ptr(entry->pool->tfm); - kunmap_atomic(dst); + + mutex_lock(acomp_ctx->mutex); + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); + ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + mutex_unlock(acomp_ctx->mutex); + BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -1004,7 +1068,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; - struct crypto_comp *tfm; + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; int ret; unsigned int hlen, dlen = PAGE_SIZE; unsigned long handle, value; @@ -1074,12 +1139,32 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* compress */ - dst = get_cpu_var(zswap_dstmem); - tfm = *get_cpu_ptr(entry->pool->tfm); - src = kmap_atomic(page); - ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); - kunmap_atomic(src); - put_cpu_ptr(entry->pool->tfm); + acomp_ctx = 
raw_cpu_ptr(entry->pool->acomp_ctx); + + mutex_lock(acomp_ctx->mutex); + + dst = acomp_ctx->dstmem; + sg_init_table(&input, 1); + sg_set_page(&input, page, PAGE_SIZE, 0); + + /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */ + sg_init_one(&output, dst, PAGE_SIZE * 2); + acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen); + /* + * it maybe looks a little bit silly that we send an asynchronous request, + * then wait for its completion synchronously. This makes the process look + * synchronous in fact. + * Theoretically, acomp supports users send multiple acomp requests in one + * acomp instance, then get those requests done simultaneously. but in this + * case, frontswap actually does store and load page by page, there is no + * existing method to send the second page before the first page is done + * in one thread doing frontswap. + * but in different threads running on different cpu, we have different + * acomp instance, so multiple threads can do (de)compression in parallel. 
+ */ + ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + if (ret) { ret = -EINVAL; goto put_dstmem; @@ -1103,7 +1188,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, memcpy(buf, &zhdr, hlen); memcpy(buf + hlen, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); - put_cpu_var(zswap_dstmem); + mutex_unlock(acomp_ctx->mutex); /* populate entry */ entry->offset = offset; @@ -1131,7 +1216,7 @@ insert_entry: return 0; put_dstmem: - put_cpu_var(zswap_dstmem); + mutex_unlock(acomp_ctx->mutex); zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); @@ -1148,7 +1233,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; - struct crypto_comp *tfm; + struct scatterlist input, output; + struct crypto_acomp_ctx *acomp_ctx; u8 *src, *dst; unsigned int dlen; int ret; @@ -1175,11 +1261,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); if (zpool_evictable(entry->pool->zpool)) src += sizeof(struct zswap_header); - dst = kmap_atomic(page); - tfm = *get_cpu_ptr(entry->pool->tfm); - ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); - put_cpu_ptr(entry->pool->tfm); - kunmap_atomic(dst); + + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + mutex_lock(acomp_ctx->mutex); + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); + ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + mutex_unlock(acomp_ctx->mutex); + zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); |