diff options
Diffstat (limited to 'mm')
65 files changed, 1957 insertions, 1924 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index f730605b8dcf..24c045b24b95 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -804,9 +804,6 @@ config DEVICE_PRIVATE config VMAP_PFN bool -config FRAME_VECTOR - bool - config ARCH_USES_HIGH_VMA_FLAGS bool config ARCH_HAS_PKEYS diff --git a/mm/Makefile b/mm/Makefile index b6cd2fffa492..135bbb65511a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -110,7 +110,6 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o -obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e33797579338..eca555f658d9 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -32,6 +32,8 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; +#define K(x) ((x) << (PAGE_SHIFT - 10)) + #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -69,7 +71,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) global_dirty_limits(&background_thresh, &dirty_thresh); wb_thresh = wb_calc_thresh(wb, dirty_thresh); -#define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" @@ -98,7 +99,6 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_more_io, nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); -#undef K return 0; } @@ -146,8 +146,6 @@ static ssize_t read_ahead_kb_store(struct device *dev, return count; } -#define K(pages) ((pages) << (PAGE_SHIFT - 10)) - #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ diff --git a/mm/compaction.c b/mm/compaction.c index e5acb9714436..e04f4476e68e 100644 --- a/mm/compaction.c +++ 
b/mm/compaction.c @@ -137,7 +137,6 @@ EXPORT_SYMBOL(__SetPageMovable); void __ClearPageMovable(struct page *page) { - VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageMovable(page), page); /* * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE @@ -988,14 +987,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (unlikely(!get_page_unless_zero(page))) goto isolate_fail; - if (__isolate_lru_page_prepare(page, isolate_mode) != 0) + if (!__isolate_lru_page_prepare(page, isolate_mode)) goto isolate_fail_put; /* Try isolate the page */ if (!TestClearPageLRU(page)) goto isolate_fail_put; - rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); /* If we already hold the lock, we can skip some rechecking */ @@ -1005,7 +1003,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, compact_lock_irqsave(&lruvec->lru_lock, &flags, cc); locked = lruvec; - rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); @@ -1026,15 +1023,14 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, SetPageLRU(page); goto isolate_fail_put; } - } else - rcu_read_unlock(); + } /* The whole page is taken off the LRU; skip the tail pages. 
*/ if (PageCompound(page)) low_pfn += compound_nr(page) - 1; /* Successfully isolated */ - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page), thp_nr_pages(page)); @@ -1288,7 +1284,7 @@ static void fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) { unsigned long start_pfn, end_pfn; - struct page *page = pfn_to_page(pfn); + struct page *page; /* Do not search around if there are enough pages already */ if (cc->nr_freepages >= cc->nr_migratepages) @@ -1299,8 +1295,12 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long return; /* Pageblock boundaries */ - start_pfn = pageblock_start_pfn(pfn); - end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1; + start_pfn = max(pageblock_start_pfn(pfn), cc->zone->zone_start_pfn); + end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)); + + page = pageblock_pfn_to_page(start_pfn, end_pfn, cc->zone); + if (!page) + return; /* Scan before */ if (start_pfn != pfn) { @@ -1342,7 +1342,7 @@ fast_isolate_freepages(struct compact_control *cc) { unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1); unsigned int nr_scanned = 0; - unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0; + unsigned long low_pfn, min_pfn, highest = 0; unsigned long nr_isolated = 0; unsigned long distance; struct page *page = NULL; @@ -1387,6 +1387,7 @@ fast_isolate_freepages(struct compact_control *cc) struct page *freepage; unsigned long flags; unsigned int order_scanned = 0; + unsigned long high_pfn = 0; if (!area->nr_free) continue; @@ -1401,7 +1402,8 @@ fast_isolate_freepages(struct compact_control *cc) pfn = page_to_pfn(freepage); if (pfn >= highest) - highest = pageblock_start_pfn(pfn); + highest = max(pageblock_start_pfn(pfn), + cc->zone->zone_start_pfn); if (pfn >= low_pfn) { cc->fast_search_fail = 0; @@ -1471,7 +1473,8 @@ 
fast_isolate_freepages(struct compact_control *cc) } else { if (cc->direct_compaction && pfn_valid(min_pfn)) { page = pageblock_pfn_to_page(min_pfn, - pageblock_end_pfn(min_pfn), + min(pageblock_end_pfn(min_pfn), + zone_end_pfn(cc->zone)), cc->zone); cc->free_pfn = min_pfn; } @@ -1701,6 +1704,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) unsigned long pfn = cc->migrate_pfn; unsigned long high_pfn; int order; + bool found_block = false; /* Skip hints are relied on to avoid repeats on the fast search */ if (cc->ignore_skip_hint) @@ -1743,7 +1747,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance); for (order = cc->order - 1; - order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit; + order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit; order--) { struct free_area *area = &cc->zone->free_area[order]; struct list_head *freelist; @@ -1758,7 +1762,11 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) list_for_each_entry(freepage, freelist, lru) { unsigned long free_pfn; - nr_scanned++; + if (nr_scanned++ >= limit) { + move_freelist_tail(freelist, freepage); + break; + } + free_pfn = page_to_pfn(freepage); if (free_pfn < high_pfn) { /* @@ -1767,12 +1775,8 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) * the list assumes an entry is deleted, not * reordered. 
*/ - if (get_pageblock_skip(freepage)) { - if (list_is_last(freelist, &freepage->lru)) - break; - + if (get_pageblock_skip(freepage)) continue; - } /* Reorder to so a future search skips recent pages */ move_freelist_tail(freelist, freepage); @@ -1780,15 +1784,10 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) update_fast_start_pfn(cc, free_pfn); pfn = pageblock_start_pfn(free_pfn); cc->fast_search_fail = 0; + found_block = true; set_pageblock_skip(freepage); break; } - - if (nr_scanned >= limit) { - cc->fast_search_fail++; - move_freelist_tail(freelist, freepage); - break; - } } spin_unlock_irqrestore(&cc->zone->lock, flags); } @@ -1799,9 +1798,10 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) * If fast scanning failed then use a cached entry for a page block * that had free pages as the basis for starting a linear scan. */ - if (pfn == cc->migrate_pfn) + if (!found_block) { + cc->fast_search_fail++; pfn = reinit_migrate_pfn(cc); - + } return pfn; } @@ -1925,20 +1925,28 @@ static bool kswapd_is_running(pg_data_t *pgdat) /* * A zone's fragmentation score is the external fragmentation wrt to the - * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value - * in the range [0, 100]. + * COMPACTION_HPAGE_ORDER. It returns a value in the range [0, 100]. + */ +static unsigned int fragmentation_score_zone(struct zone *zone) +{ + return extfrag_for_order(zone, COMPACTION_HPAGE_ORDER); +} + +/* + * A weighted zone's fragmentation score is the external fragmentation + * wrt to the COMPACTION_HPAGE_ORDER scaled by the zone's size. It + * returns a value in the range [0, 100]. * * The scaling factor ensures that proactive compaction focuses on larger * zones like ZONE_NORMAL, rather than smaller, specialized zones like * ZONE_DMA32. For smaller zones, the score value remains close to zero, * and thus never exceeds the high threshold for proactive compaction. 
*/ -static unsigned int fragmentation_score_zone(struct zone *zone) +static unsigned int fragmentation_score_zone_weighted(struct zone *zone) { unsigned long score; - score = zone->present_pages * - extfrag_for_order(zone, COMPACTION_HPAGE_ORDER); + score = zone->present_pages * fragmentation_score_zone(zone); return div64_ul(score, zone->zone_pgdat->node_present_pages + 1); } @@ -1958,7 +1966,7 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) struct zone *zone; zone = &pgdat->node_zones[zoneid]; - score += fragmentation_score_zone(zone); + score += fragmentation_score_zone_weighted(zone); } return score; diff --git a/mm/debug.c b/mm/debug.c index 8a40b3fefbeb..0bdda8407f71 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -110,6 +110,11 @@ void __dump_page(struct page *page, const char *reason) head_compound_mapcount(head)); } } + +#ifdef CONFIG_MEMCG + if (head->memcg_data) + pr_warn("memcg:%lx\n", head->memcg_data); +#endif if (PageKsm(page)) type = "ksm "; else if (PageAnon(page)) @@ -180,11 +185,6 @@ hex_only: if (reason) pr_warn("page dumped because: %s\n", reason); - -#ifdef CONFIG_MEMCG - if (!page_poisoned && page->memcg_data) - pr_warn("pages's memcg:%lx\n", page->memcg_data); -#endif } void dump_page(struct page *page, const char *reason) diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index c05d9dcf7891..a9bd6ce1ba02 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -58,11 +58,23 @@ #define RANDOM_ORVALUE (GENMASK(BITS_PER_LONG - 1, 0) & ~ARCH_SKIP_MASK) #define RANDOM_NZVALUE GENMASK(7, 0) -static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pte_basic_tests(unsigned long pfn, int idx) { + pgprot_t prot = protection_map[idx]; pte_t pte = pfn_pte(pfn, prot); + unsigned long val = idx, *ptr = &val; + + pr_debug("Validating PTE basic (%pGv)\n", ptr); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pte() to make sure that 
protection_map[idx] + * does not have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. + */ + WARN_ON(pte_dirty(pte_wrprotect(pte))); - pr_debug("Validating PTE basic\n"); WARN_ON(!pte_same(pte, pte)); WARN_ON(!pte_young(pte_mkyoung(pte_mkold(pte)))); WARN_ON(!pte_dirty(pte_mkdirty(pte_mkclean(pte)))); @@ -70,6 +82,8 @@ static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(pte_young(pte_mkold(pte_mkyoung(pte)))); WARN_ON(pte_dirty(pte_mkclean(pte_mkdirty(pte)))); WARN_ON(pte_write(pte_wrprotect(pte_mkwrite(pte)))); + WARN_ON(pte_dirty(pte_wrprotect(pte_mkclean(pte)))); + WARN_ON(!pte_dirty(pte_wrprotect(pte_mkdirty(pte)))); } static void __init pte_advanced_tests(struct mm_struct *mm, @@ -129,14 +143,27 @@ static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pmd_basic_tests(unsigned long pfn, int idx) { + pgprot_t prot = protection_map[idx]; pmd_t pmd = pfn_pmd(pfn, prot); + unsigned long val = idx, *ptr = &val; if (!has_transparent_hugepage()) return; - pr_debug("Validating PMD basic\n"); + pr_debug("Validating PMD basic (%pGv)\n", ptr); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pmd() to make sure that protection_map[idx] + * does not have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. 
+ */ + WARN_ON(pmd_dirty(pmd_wrprotect(pmd))); + + WARN_ON(!pmd_same(pmd, pmd)); WARN_ON(!pmd_young(pmd_mkyoung(pmd_mkold(pmd)))); WARN_ON(!pmd_dirty(pmd_mkdirty(pmd_mkclean(pmd)))); @@ -144,6 +171,8 @@ static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) WARN_ON(pmd_young(pmd_mkold(pmd_mkyoung(pmd)))); WARN_ON(pmd_dirty(pmd_mkclean(pmd_mkdirty(pmd)))); WARN_ON(pmd_write(pmd_wrprotect(pmd_mkwrite(pmd)))); + WARN_ON(pmd_dirty(pmd_wrprotect(pmd_mkclean(pmd)))); + WARN_ON(!pmd_dirty(pmd_wrprotect(pmd_mkdirty(pmd)))); /* * A huge page does not point to next level page table * entry. Hence this must qualify as pmd_bad(). @@ -249,19 +278,35 @@ static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot) } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD -static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) +static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { + pgprot_t prot = protection_map[idx]; pud_t pud = pfn_pud(pfn, prot); + unsigned long val = idx, *ptr = &val; if (!has_transparent_hugepage()) return; - pr_debug("Validating PUD basic\n"); + pr_debug("Validating PUD basic (%pGv)\n", ptr); + + /* + * This test needs to be executed after the given page table entry + * is created with pfn_pud() to make sure that protection_map[idx] + * does not have the dirty bit enabled from the beginning. This is + * important for platforms like arm64 where (!PTE_RDONLY) indicate + * dirty bit being set. 
+ */ + WARN_ON(pud_dirty(pud_wrprotect(pud))); + WARN_ON(!pud_same(pud, pud)); WARN_ON(!pud_young(pud_mkyoung(pud_mkold(pud)))); + WARN_ON(!pud_dirty(pud_mkdirty(pud_mkclean(pud)))); + WARN_ON(pud_dirty(pud_mkclean(pud_mkdirty(pud)))); WARN_ON(!pud_write(pud_mkwrite(pud_wrprotect(pud)))); WARN_ON(pud_write(pud_wrprotect(pud_mkwrite(pud)))); WARN_ON(pud_young(pud_mkold(pud_mkyoung(pud)))); + WARN_ON(pud_dirty(pud_wrprotect(pud_mkclean(pud)))); + WARN_ON(!pud_dirty(pud_wrprotect(pud_mkdirty(pud)))); if (mm_pmd_folded(mm)) return; @@ -359,7 +404,7 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) #endif /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ #else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } static void __init pud_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pudp, unsigned long pfn, unsigned long vaddr, @@ -372,8 +417,8 @@ static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot) } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #else /* !CONFIG_TRANSPARENT_HUGEPAGE */ -static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot) { } -static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot) { } +static void __init pmd_basic_tests(unsigned long pfn, int idx) { } +static void __init pud_basic_tests(struct mm_struct *mm, unsigned long pfn, int idx) { } static void __init pmd_advanced_tests(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmdp, unsigned long pfn, unsigned long vaddr, @@ -899,6 +944,7 @@ static int __init debug_vm_pgtable(void) unsigned long vaddr, pte_aligned, pmd_aligned; unsigned long pud_aligned, p4d_aligned, pgd_aligned; spinlock_t *ptl = NULL; + int idx; pr_info("Validating architecture page table helpers\n"); prot = vm_get_page_prot(VMFLAGS); @@ -963,9 +1009,25 @@ static 
int __init debug_vm_pgtable(void) saved_pmdp = pmd_offset(pudp, 0UL); saved_ptep = pmd_pgtable(pmd); - pte_basic_tests(pte_aligned, prot); - pmd_basic_tests(pmd_aligned, prot); - pud_basic_tests(pud_aligned, prot); + /* + * Iterate over the protection_map[] to make sure that all + * the basic page table transformation validations just hold + * true irrespective of the starting protection value for a + * given page table entry. + */ + for (idx = 0; idx < ARRAY_SIZE(protection_map); idx++) { + pte_basic_tests(pte_aligned, idx); + pmd_basic_tests(pmd_aligned, idx); + pud_basic_tests(mm, pud_aligned, idx); + } + + /* + * Both P4D and PGD level tests are very basic which do not + * involve creating page table entries from the protection + * value and the given pfn. Hence just keep them out from + * the above iteration for now to save some test execution + * time. + */ p4d_basic_tests(p4d_aligned, prot); pgd_basic_tests(pgd_aligned, prot); diff --git a/mm/filemap.c b/mm/filemap.c index 5c9d564317a5..46a8b9e82434 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,6 +42,8 @@ #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -204,9 +206,9 @@ static void unaccount_page_cache_page(struct address_space *mapping, if (PageSwapBacked(page)) { __mod_lruvec_page_state(page, NR_SHMEM, -nr); if (PageTransHuge(page)) - __dec_lruvec_page_state(page, NR_SHMEM_THPS); + __mod_lruvec_page_state(page, NR_SHMEM_THPS, -nr); } else if (PageTransHuge(page)) { - __dec_lruvec_page_state(page, NR_FILE_THPS); + __mod_lruvec_page_state(page, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } @@ -775,7 +777,6 @@ EXPORT_SYMBOL(file_write_and_wait_range); * replace_page_cache_page - replace a pagecache page with a new one * @old: page to be replaced * @new: page to replace with - * @gfp_mask: allocation mode * * This function replaces a page in the pagecache with a new 
one. On * success it acquires the pagecache reference for the new page and @@ -784,10 +785,8 @@ EXPORT_SYMBOL(file_write_and_wait_range); * caller must do that. * * The remove + add is atomic. This function cannot fail. - * - * Return: %0 */ -int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) +void replace_page_cache_page(struct page *old, struct page *new) { struct address_space *mapping = old->mapping; void (*freepage)(struct page *) = mapping->a_ops->freepage; @@ -822,8 +821,6 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (freepage) freepage(old); put_page(old); - - return 0; } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -835,6 +832,7 @@ noinline int __add_to_page_cache_locked(struct page *page, XA_STATE(xas, &mapping->i_pages, offset); int huge = PageHuge(page); int error; + bool charged = false; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); @@ -848,6 +846,7 @@ noinline int __add_to_page_cache_locked(struct page *page, error = mem_cgroup_charge(page, current->mm, gfp); if (error) goto error; + charged = true; } gfp &= GFP_RECLAIM_MASK; @@ -896,6 +895,8 @@ unlock: if (xas_error(&xas)) { error = xas_error(&xas); + if (charged) + mem_cgroup_uncharge(page); goto error; } @@ -1342,61 +1343,26 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) } EXPORT_SYMBOL(wait_on_page_bit_killable); -static int __wait_on_page_locked_async(struct page *page, - struct wait_page_queue *wait, bool set) -{ - struct wait_queue_head *q = page_waitqueue(page); - int ret = 0; - - wait->page = page; - wait->bit_nr = PG_locked; - - spin_lock_irq(&q->lock); - __add_wait_queue_entry_tail(q, &wait->wait); - SetPageWaiters(page); - if (set) - ret = !trylock_page(page); - else - ret = PageLocked(page); - /* - * If we were successful now, we know we're still on the - * waitqueue as we're still under the lock. 
This means it's - * safe to remove and return success, we know the callback - * isn't going to trigger. - */ - if (!ret) - __remove_wait_queue(q, &wait->wait); - else - ret = -EIOCBQUEUED; - spin_unlock_irq(&q->lock); - return ret; -} - -static int wait_on_page_locked_async(struct page *page, - struct wait_page_queue *wait) -{ - if (!PageLocked(page)) - return 0; - return __wait_on_page_locked_async(compound_head(page), wait, false); -} - /** * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked * @page: The page to wait for. + * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc). * * The caller should hold a reference on @page. They expect the page to * become unlocked relatively soon, but do not wish to hold up migration * (for example) by holding the reference while waiting for the page to * come unlocked. After this function returns, the caller should not * dereference @page. + * + * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal. */ -void put_and_wait_on_page_locked(struct page *page) +int put_and_wait_on_page_locked(struct page *page, int state) { wait_queue_head_t *q; page = compound_head(page); q = page_waitqueue(page); - wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP); + return wait_on_page_bit_common(q, page, PG_locked, state, DROP); } /** @@ -1552,7 +1518,28 @@ EXPORT_SYMBOL_GPL(__lock_page_killable); int __lock_page_async(struct page *page, struct wait_page_queue *wait) { - return __wait_on_page_locked_async(page, wait, true); + struct wait_queue_head *q = page_waitqueue(page); + int ret = 0; + + wait->page = page; + wait->bit_nr = PG_locked; + + spin_lock_irq(&q->lock); + __add_wait_queue_entry_tail(q, &wait->wait); + SetPageWaiters(page); + ret = !trylock_page(page); + /* + * If we were successful now, we know we're still on the + * waitqueue as we're still under the lock. 
This means it's + * safe to remove and return success, we know the callback + * isn't going to trigger. + */ + if (!ret) + __remove_wait_queue(q, &wait->wait); + else + ret = -EIOCBQUEUED; + spin_unlock_irq(&q->lock); + return ret; } /* @@ -2167,287 +2154,267 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) ra->ra_pages /= 4; } -static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) +/* + * filemap_get_read_batch - Get a batch of pages for read + * + * Get a batch of pages which represent a contiguous range of bytes + * in the file. No tail pages will be returned. If @index is in the + * middle of a THP, the entire THP will be returned. The last page in + * the batch may have Readahead set or be not Uptodate so that the + * caller can take the appropriate action. + */ +static void filemap_get_read_batch(struct address_space *mapping, + pgoff_t index, pgoff_t max, struct pagevec *pvec) { - if (iocb->ki_flags & IOCB_WAITQ) - return lock_page_async(page, iocb->ki_waitq); - else if (iocb->ki_flags & IOCB_NOWAIT) - return trylock_page(page) ? 0 : -EAGAIN; - else - return lock_page_killable(page); + XA_STATE(xas, &mapping->i_pages, index); + struct page *head; + + rcu_read_lock(); + for (head = xas_load(&xas); head; head = xas_next(&xas)) { + if (xas_retry(&xas, head)) + continue; + if (xas.xa_index > max || xa_is_value(head)) + break; + if (!page_cache_get_speculative(head)) + goto retry; + + /* Has the page moved or been split? 
*/ + if (unlikely(head != xas_reload(&xas))) + goto put_page; + + if (!pagevec_add(pvec, head)) + break; + if (!PageUptodate(head)) + break; + if (PageReadahead(head)) + break; + xas.xa_index = head->index + thp_nr_pages(head) - 1; + xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK; + continue; +put_page: + put_page(head); +retry: + xas_reset(&xas); + } + rcu_read_unlock(); } -static struct page * -generic_file_buffered_read_readpage(struct kiocb *iocb, - struct file *filp, - struct address_space *mapping, - struct page *page) +static int filemap_read_page(struct file *file, struct address_space *mapping, + struct page *page) { - struct file_ra_state *ra = &filp->f_ra; int error; - if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EAGAIN); - } - /* - * A previous I/O error may have been due to temporary - * failures, eg. multipath errors. - * PG_error will be set again if readpage fails. + * A previous I/O error may have been due to temporary failures, + * eg. multipath errors. PG_error will be set again if readpage + * fails. */ ClearPageError(page); /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(filp, page); + error = mapping->a_ops->readpage(file, page); + if (error) + return error; - if (unlikely(error)) { - put_page(page); - return error != AOP_TRUNCATED_PAGE ? 
ERR_PTR(error) : NULL; - } + error = wait_on_page_locked_killable(page); + if (error) + return error; + if (PageUptodate(page)) + return 0; + if (!page->mapping) /* page truncated */ + return AOP_TRUNCATED_PAGE; + shrink_readahead_size_eio(&file->f_ra); + return -EIO; +} - if (!PageUptodate(page)) { - error = lock_page_for_iocb(iocb, page); - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); - } - if (!PageUptodate(page)) { - if (page->mapping == NULL) { - /* - * invalidate_mapping_pages got it - */ - unlock_page(page); - put_page(page); - return NULL; - } - unlock_page(page); - shrink_readahead_size_eio(ra); - put_page(page); - return ERR_PTR(-EIO); - } - unlock_page(page); +static bool filemap_range_uptodate(struct address_space *mapping, + loff_t pos, struct iov_iter *iter, struct page *page) +{ + int count; + + if (PageUptodate(page)) + return true; + /* pipes can't handle partially uptodate pages */ + if (iov_iter_is_pipe(iter)) + return false; + if (!mapping->a_ops->is_partially_uptodate) + return false; + if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page))) + return false; + + count = iter->count; + if (page_offset(page) > pos) { + count -= page_offset(page) - pos; + pos = 0; + } else { + pos -= page_offset(page); } - return page; + return mapping->a_ops->is_partially_uptodate(page, pos, count); } -static struct page * -generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, - struct file *filp, - struct iov_iter *iter, - struct page *page, - loff_t pos, loff_t count) +static int filemap_update_page(struct kiocb *iocb, + struct address_space *mapping, struct iov_iter *iter, + struct page *page) { - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; int error; - /* - * See comment in do_read_cache_page on why - * wait_on_page_locked is used to avoid unnecessarily - * serialisations and why it's safe. 
- */ - if (iocb->ki_flags & IOCB_WAITQ) { - error = wait_on_page_locked_async(page, - iocb->ki_waitq); - } else { - error = wait_on_page_locked_killable(page); - } - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); + if (!trylock_page(page)) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) + return -EAGAIN; + if (!(iocb->ki_flags & IOCB_WAITQ)) { + put_and_wait_on_page_locked(page, TASK_KILLABLE); + return AOP_TRUNCATED_PAGE; + } + error = __lock_page_async(page, iocb->ki_waitq); + if (error) + return error; } - if (PageUptodate(page)) - return page; - if (inode->i_blkbits == PAGE_SHIFT || - !mapping->a_ops->is_partially_uptodate) - goto page_not_up_to_date; - /* pipes can't handle partially uptodate pages */ - if (unlikely(iov_iter_is_pipe(iter))) - goto page_not_up_to_date; - if (!trylock_page(page)) - goto page_not_up_to_date; - /* Did it get truncated before we got the lock? */ if (!page->mapping) - goto page_not_up_to_date_locked; - if (!mapping->a_ops->is_partially_uptodate(page, - pos & ~PAGE_MASK, count)) - goto page_not_up_to_date_locked; - unlock_page(page); - return page; + goto truncated; -page_not_up_to_date: - /* Get exclusive access to the page ... */ - error = lock_page_for_iocb(iocb, page); - if (unlikely(error)) { - put_page(page); - return ERR_PTR(error); - } - -page_not_up_to_date_locked: - /* Did it get truncated before we got the lock? */ - if (!page->mapping) { - unlock_page(page); - put_page(page); - return NULL; - } + error = 0; + if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page)) + goto unlock; - /* Did somebody else fill it already? 
*/ - if (PageUptodate(page)) { - unlock_page(page); - return page; - } + error = -EAGAIN; + if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) + goto unlock; - return generic_file_buffered_read_readpage(iocb, filp, mapping, page); + error = filemap_read_page(iocb->ki_filp, mapping, page); + if (error == AOP_TRUNCATED_PAGE) + put_page(page); + return error; +truncated: + unlock_page(page); + put_page(page); + return AOP_TRUNCATED_PAGE; +unlock: + unlock_page(page); + return error; } -static struct page * -generic_file_buffered_read_no_cached_page(struct kiocb *iocb, - struct iov_iter *iter) +static int filemap_create_page(struct file *file, + struct address_space *mapping, pgoff_t index, + struct pagevec *pvec) { - struct file *filp = iocb->ki_filp; - struct address_space *mapping = filp->f_mapping; - pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; struct page *page; int error; - if (iocb->ki_flags & IOCB_NOIO) - return ERR_PTR(-EAGAIN); - - /* - * Ok, it wasn't cached, so we need to create a new - * page.. - */ page = page_cache_alloc(mapping); if (!page) - return ERR_PTR(-ENOMEM); + return -ENOMEM; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_constraint(mapping, GFP_KERNEL)); - if (error) { - put_page(page); - return error != -EEXIST ? 
ERR_PTR(error) : NULL; - } + mapping_gfp_constraint(mapping, GFP_KERNEL)); + if (error == -EEXIST) + error = AOP_TRUNCATED_PAGE; + if (error) + goto error; + + error = filemap_read_page(file, mapping, page); + if (error) + goto error; + + pagevec_add(pvec, page); + return 0; +error: + put_page(page); + return error; +} - return generic_file_buffered_read_readpage(iocb, filp, mapping, page); +static int filemap_readahead(struct kiocb *iocb, struct file *file, + struct address_space *mapping, struct page *page, + pgoff_t last_index) +{ + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + page_cache_async_readahead(mapping, &file->f_ra, file, page, + page->index, last_index - page->index); + return 0; } -static int generic_file_buffered_read_get_pages(struct kiocb *iocb, - struct iov_iter *iter, - struct page **pages, - unsigned int nr) +static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter, + struct pagevec *pvec) { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; - pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; - int i, j, nr_got, err = 0; + pgoff_t last_index; + struct page *page; + int err = 0; - nr = min_t(unsigned long, last_index - index, nr); -find_page: + last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE); +retry: if (fatal_signal_pending(current)) return -EINTR; - nr_got = find_get_pages_contig(mapping, index, nr, pages); - if (nr_got) - goto got_pages; - - if (iocb->ki_flags & IOCB_NOIO) - return -EAGAIN; - - page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); - - nr_got = find_get_pages_contig(mapping, index, nr, pages); - if (nr_got) - goto got_pages; - - pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); - err = PTR_ERR_OR_ZERO(pages[0]); - if (!IS_ERR_OR_NULL(pages[0])) - nr_got = 1; -got_pages: - for (i = 0; i < nr_got; i++) { - struct 
page *page = pages[i]; - pgoff_t pg_index = index + i; - loff_t pg_pos = max(iocb->ki_pos, - (loff_t) pg_index << PAGE_SHIFT); - loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; - - if (PageReadahead(page)) { - if (iocb->ki_flags & IOCB_NOIO) { - for (j = i; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; - err = -EAGAIN; - break; - } - page_cache_async_readahead(mapping, ra, filp, page, - pg_index, last_index - pg_index); - } - - if (!PageUptodate(page)) { - if ((iocb->ki_flags & IOCB_NOWAIT) || - ((iocb->ki_flags & IOCB_WAITQ) && i)) { - for (j = i; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; - err = -EAGAIN; - break; - } + filemap_get_read_batch(mapping, index, last_index, pvec); + if (!pagevec_count(pvec)) { + if (iocb->ki_flags & IOCB_NOIO) + return -EAGAIN; + page_cache_sync_readahead(mapping, ra, filp, index, + last_index - index); + filemap_get_read_batch(mapping, index, last_index, pvec); + } + if (!pagevec_count(pvec)) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + err = filemap_create_page(filp, mapping, + iocb->ki_pos >> PAGE_SHIFT, pvec); + if (err == AOP_TRUNCATED_PAGE) + goto retry; + return err; + } - page = generic_file_buffered_read_pagenotuptodate(iocb, - filp, iter, page, pg_pos, pg_count); - if (IS_ERR_OR_NULL(page)) { - for (j = i + 1; j < nr_got; j++) - put_page(pages[j]); - nr_got = i; - err = PTR_ERR_OR_ZERO(page); - break; - } - } + page = pvec->pages[pagevec_count(pvec) - 1]; + if (PageReadahead(page)) { + err = filemap_readahead(iocb, filp, mapping, page, last_index); + if (err) + goto err; + } + if (!PageUptodate(page)) { + if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1) + iocb->ki_flags |= IOCB_NOWAIT; + err = filemap_update_page(iocb, mapping, iter, page); + if (err) + goto err; } - if (likely(nr_got)) - return nr_got; - if (err) - return err; - /* - * No pages and no error means we raced and should retry: - */ - goto find_page; + return 0; +err: + if (err < 0) + 
put_page(page); + if (likely(--pvec->nr)) + return 0; + if (err == AOP_TRUNCATED_PAGE) + goto retry; + return err; } /** - * generic_file_buffered_read - generic file read routine - * @iocb: the iocb to read - * @iter: data destination - * @written: already copied - * - * This is a generic file read routine, and uses the - * mapping->a_ops->readpage() function for the actual low-level stuff. + * filemap_read - Read data from the page cache. + * @iocb: The iocb to read. + * @iter: Destination for the data. + * @already_read: Number of bytes already read by the caller. * - * This is really ugly. But the goto's actually try to clarify some - * of the logic when it comes to error handling etc. + * Copies data from the page cache. If the data is not currently present, + * uses the readahead and readpage address_space operations to fetch it. * - * Return: - * * total number of bytes copied, including those the were already @written - * * negative error code if nothing was copied + * Return: Total number of bytes copied, including those already read by + * the caller. If an error happens before any bytes are copied, returns + * a negative error number. 
*/ -ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *iter, ssize_t written) +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, + ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; - unsigned int nr_pages = min_t(unsigned int, 512, - ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - - (iocb->ki_pos >> PAGE_SHIFT)); - int i, pg_nr, error = 0; + struct pagevec pvec; + int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2457,14 +2424,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - - if (nr_pages > ARRAY_SIZE(pages_onstack)) - pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - - if (!pages) { - pages = pages_onstack; - nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); - } + pagevec_init(&pvec); do { cond_resched(); @@ -2474,16 +2434,12 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, * can no longer safely return -EIOCBQUEUED. Hence mark * an async read NOWAIT at that point. */ - if ((iocb->ki_flags & IOCB_WAITQ) && written) + if ((iocb->ki_flags & IOCB_WAITQ) && already_read) iocb->ki_flags |= IOCB_NOWAIT; - i = 0; - pg_nr = generic_file_buffered_read_get_pages(iocb, iter, - pages, nr_pages); - if (pg_nr < 0) { - error = pg_nr; + error = filemap_get_pages(iocb, iter, &pvec); + if (error < 0) break; - } /* * i_size must be checked after we know the pages are Uptodate. 
@@ -2496,13 +2452,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, isize = i_size_read(inode); if (unlikely(iocb->ki_pos >= isize)) goto put_pages; - end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > - (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) - put_page(pages[--pg_nr]); - /* * Once we start copying data, we don't want to be touching any * cachelines that might be contended: @@ -2515,27 +2466,35 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, */ if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT) - mark_page_accessed(pages[0]); - for (i = 1; i < pg_nr; i++) - mark_page_accessed(pages[i]); + mark_page_accessed(pvec.pages[0]); - for (i = 0; i < pg_nr; i++) { - unsigned int offset = iocb->ki_pos & ~PAGE_MASK; - unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, - PAGE_SIZE - offset); - unsigned int copied; + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + size_t page_size = thp_size(page); + size_t offset = iocb->ki_pos & (page_size - 1); + size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, + page_size - offset); + size_t copied; + if (end_offset < page_offset(page)) + break; + if (i > 0) + mark_page_accessed(page); /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. 
*/ - if (writably_mapped) - flush_dcache_page(pages[i]); + if (writably_mapped) { + int j; - copied = copy_page_to_iter(pages[i], offset, bytes, iter); + for (j = 0; j < thp_nr_pages(page); j++) + flush_dcache_page(page + j); + } + + copied = copy_page_to_iter(page, offset, bytes, iter); - written += copied; + already_read += copied; iocb->ki_pos += copied; ra->prev_pos = iocb->ki_pos; @@ -2545,18 +2504,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, } } put_pages: - for (i = 0; i < pg_nr; i++) - put_page(pages[i]); + for (i = 0; i < pagevec_count(&pvec); i++) + put_page(pvec.pages[i]); + pagevec_reinit(&pvec); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); - if (pages != pages_onstack) - kfree(pages); - - return written ? written : error; + return already_read ? already_read : error; } -EXPORT_SYMBOL_GPL(generic_file_buffered_read); +EXPORT_SYMBOL_GPL(filemap_read); /** * generic_file_read_iter - generic filesystem read routine @@ -2586,7 +2543,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t retval = 0; if (!count) - goto out; /* skip atime */ + return 0; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { struct file *file = iocb->ki_filp; @@ -2604,7 +2561,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos, iocb->ki_pos + count - 1); if (retval < 0) - goto out; + return retval; } file_accessed(file); @@ -2614,7 +2571,8 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos += retval; count -= retval; } - iov_iter_revert(iter, count - iov_iter_count(iter)); + if (retval != -EIOCBQUEUED) + iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter @@ -2627,12 +2585,10 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) */ if (retval < 0 || !count || iocb->ki_pos >= size || IS_DAX(inode)) - goto out; + return retval; } - retval = 
generic_file_buffered_read(iocb, iter, retval); -out: - return retval; + return filemap_read(iocb, iter, retval); } EXPORT_SYMBOL(generic_file_read_iter); @@ -2911,74 +2867,163 @@ out_retry: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_fault *vmf, - pgoff_t start_pgoff, pgoff_t end_pgoff) +static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page) { - struct file *file = vmf->vma->vm_file; + struct mm_struct *mm = vmf->vma->vm_mm; + + /* Huge page is mapped? No need to proceed. */ + if (pmd_trans_huge(*vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + if (pmd_none(*vmf->pmd) && PageTransHuge(page)) { + vm_fault_t ret = do_set_pmd(vmf, page); + if (!ret) { + /* The page is mapped successfully, reference consumed. */ + unlock_page(page); + return true; + } + } + + if (pmd_none(*vmf->pmd)) { + vmf->ptl = pmd_lock(mm, vmf->pmd); + if (likely(pmd_none(*vmf->pmd))) { + mm_inc_nr_ptes(mm); + pmd_populate(mm, vmf->pmd, vmf->prealloc_pte); + vmf->prealloc_pte = NULL; + } + spin_unlock(vmf->ptl); + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) { + unlock_page(page); + put_page(page); + return true; + } + + return false; +} + +static struct page *next_uptodate_page(struct page *page, + struct address_space *mapping, + struct xa_state *xas, pgoff_t end_pgoff) +{ + unsigned long max_idx; + + do { + if (!page) + return NULL; + if (xas_retry(xas, page)) + continue; + if (xa_is_value(page)) + continue; + if (PageLocked(page)) + continue; + if (!page_cache_get_speculative(page)) + continue; + /* Has the page moved or been split? 
*/ + if (unlikely(page != xas_reload(xas))) + goto skip; + if (!PageUptodate(page) || PageReadahead(page)) + goto skip; + if (PageHWPoison(page)) + goto skip; + if (!trylock_page(page)) + goto skip; + if (page->mapping != mapping) + goto unlock; + if (!PageUptodate(page)) + goto unlock; + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (xas->xa_index >= max_idx) + goto unlock; + return page; +unlock: + unlock_page(page); +skip: + put_page(page); + } while ((page = xas_next_entry(xas, end_pgoff)) != NULL); + + return NULL; +} + +static inline struct page *first_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_find(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +static inline struct page *next_map_page(struct address_space *mapping, + struct xa_state *xas, + pgoff_t end_pgoff) +{ + return next_uptodate_page(xas_next_entry(xas, end_pgoff), + mapping, xas, end_pgoff); +} + +vm_fault_t filemap_map_pages(struct vm_fault *vmf, + pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct vm_area_struct *vma = vmf->vma; + struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - unsigned long max_idx; + unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct page *head, *page; unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + vm_fault_t ret = 0; rcu_read_lock(); - xas_for_each(&xas, head, end_pgoff) { - if (xas_retry(&xas, head)) - continue; - if (xa_is_value(head)) - goto next; + head = first_map_page(mapping, &xas, end_pgoff); + if (!head) + goto out; - /* - * Check for a locked page first, as a speculative - * reference may adversely influence page migration. - */ - if (PageLocked(head)) - goto next; - if (!page_cache_get_speculative(head)) - goto next; + if (filemap_map_pmd(vmf, head)) { + ret = VM_FAULT_NOPAGE; + goto out; + } - /* Has the page moved or been split? 
*/ - if (unlikely(head != xas_reload(&xas))) - goto skip; + addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl); + do { page = find_subpage(head, xas.xa_index); - - if (!PageUptodate(head) || - PageReadahead(page) || - PageHWPoison(page)) - goto skip; - if (!trylock_page(head)) - goto skip; - - if (head->mapping != mapping || !PageUptodate(head)) - goto unlock; - - max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); - if (xas.xa_index >= max_idx) + if (PageHWPoison(page)) goto unlock; if (mmap_miss > 0) mmap_miss--; - vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT; - if (vmf->pte) - vmf->pte += xas.xa_index - last_pgoff; + addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; + vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; - if (alloc_set_pte(vmf, page)) + + if (!pte_none(*vmf->pte)) goto unlock; + + /* We're about to handle the fault */ + if (vmf->address == addr) + ret = VM_FAULT_NOPAGE; + + do_set_pte(vmf, page, addr); + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, addr, vmf->pte); unlock_page(head); - goto next; + continue; unlock: unlock_page(head); -skip: put_page(head); -next: - /* Huge page is mapped? No need to proceed. 
*/ - if (pmd_trans_huge(*vmf->pmd)) - break; - } + } while ((head = next_map_page(mapping, &xas, end_pgoff)) != NULL); + pte_unmap_unlock(vmf->pte, vmf->ptl); +out: rcu_read_unlock(); WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + return ret; } EXPORT_SYMBOL(filemap_map_pages); @@ -3336,7 +3381,8 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } iocb->ki_pos = pos; } - iov_iter_revert(from, write_len - iov_iter_count(from)); + if (written != -EIOCBQUEUED) + iov_iter_revert(from, write_len - iov_iter_count(from)); out: return written; } diff --git a/mm/frame_vector.c b/mm/frame_vector.c deleted file mode 100644 index 10f82d5643b6..000000000000 --- a/mm/frame_vector.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/err.h> -#include <linux/mm.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/pagemap.h> -#include <linux/sched.h> - -/** - * get_vaddr_frames() - map virtual addresses to pfns - * @start: starting user address - * @nr_frames: number of pages / pfns from start to map - * @gup_flags: flags modifying lookup behaviour - * @vec: structure which receives pages / pfns of the addresses mapped. - * It should have space for at least nr_frames entries. - * - * This function maps virtual addresses from @start and fills @vec structure - * with page frame numbers or page pointers to corresponding pages (choice - * depends on the type of the vma underlying the virtual address). If @start - * belongs to a normal vma, the function grabs reference to each of the pages - * to pin them in memory. If @start belongs to VM_IO | VM_PFNMAP vma, we don't - * touch page structures and the caller must make sure pfns aren't reused for - * anything else while he is using them. - * - * The function returns number of pages mapped which may be less than - * @nr_frames. 
In particular we stop mapping if there are more vmas of - * different type underlying the specified range of virtual addresses. - * When the function isn't able to map a single page, it returns error. - * - * This function takes care of grabbing mmap_lock as necessary. - */ -int get_vaddr_frames(unsigned long start, unsigned int nr_frames, - unsigned int gup_flags, struct frame_vector *vec) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - int ret = 0; - int err; - int locked; - - if (nr_frames == 0) - return 0; - - if (WARN_ON_ONCE(nr_frames > vec->nr_allocated)) - nr_frames = vec->nr_allocated; - - start = untagged_addr(start); - - mmap_read_lock(mm); - locked = 1; - vma = find_vma_intersection(mm, start, start + 1); - if (!vma) { - ret = -EFAULT; - goto out; - } - - /* - * While get_vaddr_frames() could be used for transient (kernel - * controlled lifetime) pinning of memory pages all current - * users establish long term (userspace controlled lifetime) - * page pinning. Treat get_vaddr_frames() like - * get_user_pages_longterm() and disallow it for filesystem-dax - * mappings. - */ - if (vma_is_fsdax(vma)) { - ret = -EOPNOTSUPP; - goto out; - } - - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { - vec->got_ref = true; - vec->is_pfns = false; - ret = pin_user_pages_locked(start, nr_frames, - gup_flags, (struct page **)(vec->ptrs), &locked); - goto out; - } - - vec->got_ref = false; - vec->is_pfns = true; - do { - unsigned long *nums = frame_vector_pfns(vec); - - while (ret < nr_frames && start + PAGE_SIZE <= vma->vm_end) { - err = follow_pfn(vma, start, &nums[ret]); - if (err) { - if (ret == 0) - ret = err; - goto out; - } - start += PAGE_SIZE; - ret++; - } - /* - * We stop if we have enough pages or if VMA doesn't completely - * cover the tail page. 
- */ - if (ret >= nr_frames || start < vma->vm_end) - break; - vma = find_vma_intersection(mm, start, start + 1); - } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP)); -out: - if (locked) - mmap_read_unlock(mm); - if (!ret) - ret = -EFAULT; - if (ret > 0) - vec->nr_frames = ret; - return ret; -} -EXPORT_SYMBOL(get_vaddr_frames); - -/** - * put_vaddr_frames() - drop references to pages if get_vaddr_frames() acquired - * them - * @vec: frame vector to put - * - * Drop references to pages if get_vaddr_frames() acquired them. We also - * invalidate the frame vector so that it is prepared for the next call into - * get_vaddr_frames(). - */ -void put_vaddr_frames(struct frame_vector *vec) -{ - struct page **pages; - - if (!vec->got_ref) - goto out; - pages = frame_vector_pages(vec); - /* - * frame_vector_pages() might needed to do a conversion when - * get_vaddr_frames() got pages but vec was later converted to pfns. - * But it shouldn't really fail to convert pfns back... - */ - if (WARN_ON(IS_ERR(pages))) - goto out; - - unpin_user_pages(pages, vec->nr_frames); - vec->got_ref = false; -out: - vec->nr_frames = 0; -} -EXPORT_SYMBOL(put_vaddr_frames); - -/** - * frame_vector_to_pages - convert frame vector to contain page pointers - * @vec: frame vector to convert - * - * Convert @vec to contain array of page pointers. If the conversion is - * successful, return 0. Otherwise return an error. Note that we do not grab - * page references for the page structures. 
- */ -int frame_vector_to_pages(struct frame_vector *vec) -{ - int i; - unsigned long *nums; - struct page **pages; - - if (!vec->is_pfns) - return 0; - nums = frame_vector_pfns(vec); - for (i = 0; i < vec->nr_frames; i++) - if (!pfn_valid(nums[i])) - return -EINVAL; - pages = (struct page **)nums; - for (i = 0; i < vec->nr_frames; i++) - pages[i] = pfn_to_page(nums[i]); - vec->is_pfns = false; - return 0; -} -EXPORT_SYMBOL(frame_vector_to_pages); - -/** - * frame_vector_to_pfns - convert frame vector to contain pfns - * @vec: frame vector to convert - * - * Convert @vec to contain array of pfns. - */ -void frame_vector_to_pfns(struct frame_vector *vec) -{ - int i; - unsigned long *nums; - struct page **pages; - - if (vec->is_pfns) - return; - pages = (struct page **)(vec->ptrs); - nums = (unsigned long *)pages; - for (i = 0; i < vec->nr_frames; i++) - nums[i] = page_to_pfn(pages[i]); - vec->is_pfns = true; -} -EXPORT_SYMBOL(frame_vector_to_pfns); - -/** - * frame_vector_create() - allocate & initialize structure for pinned pfns - * @nr_frames: number of pfns slots we should reserve - * - * Allocate and initialize struct pinned_pfns to be able to hold @nr_pfns - * pfns. - */ -struct frame_vector *frame_vector_create(unsigned int nr_frames) -{ - struct frame_vector *vec; - int size = sizeof(struct frame_vector) + sizeof(void *) * nr_frames; - - if (WARN_ON_ONCE(nr_frames == 0)) - return NULL; - /* - * This is absurdly high. It's here just to avoid strange effects when - * arithmetics overflows. - */ - if (WARN_ON_ONCE(nr_frames > INT_MAX / sizeof(void *) / 2)) - return NULL; - /* - * Avoid higher order allocations, use vmalloc instead. It should - * be rare anyway. 
- */ - vec = kvmalloc(size, GFP_KERNEL); - if (!vec) - return NULL; - vec->nr_allocated = nr_frames; - vec->nr_frames = 0; - return vec; -} -EXPORT_SYMBOL(frame_vector_create); - -/** - * frame_vector_destroy() - free memory allocated to carry frame vector - * @vec: Frame vector to free - * - * Free structure allocated by frame_vector_create() to carry frames. - */ -void frame_vector_destroy(struct frame_vector *vec) -{ - /* Make sure put_vaddr_frames() got called properly... */ - VM_BUG_ON(vec->nr_frames > 0); - kvfree(vec); -} -EXPORT_SYMBOL(frame_vector_destroy); @@ -78,9 +78,8 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * considered failure, and furthermore, a likely bug in the caller, so a warning * is also emitted. */ -static __maybe_unused struct page *try_grab_compound_head(struct page *page, - int refs, - unsigned int flags) +__maybe_unused struct page *try_grab_compound_head(struct page *page, + int refs, unsigned int flags) { if (flags & FOLL_GET) return try_get_compound_head(page, refs); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9237976abe72..d77605c30f2e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -386,7 +386,11 @@ static int __init hugepage_init(void) struct kobject *hugepage_kobj; if (!has_transparent_hugepage()) { - transparent_hugepage_flags = 0; + /* + * Hardware doesn't support hugepages, hence disable + * DAX PMD support. 
+ */ + transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX; return -EINVAL; } @@ -636,6 +640,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, lru_cache_add_inactive_or_unevictable(page, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); @@ -690,20 +695,19 @@ static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) } /* Caller must hold page table lock. */ -static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, +static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) { pmd_t entry; if (!pmd_none(*pmd)) - return false; + return; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); - return true; } vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) @@ -749,6 +753,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, haddr, vmf->pmd, zero_page); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); } } else { @@ -1439,7 +1444,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; } @@ -1475,7 +1480,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) if (!get_page_unless_zero(page)) goto out_unlock; spin_unlock(vmf->ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); goto out; } @@ -2176,7 
+2181,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, lock_page_memcg(page); if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { /* Last compound_mapcount is gone. */ - __dec_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, + -HPAGE_PMD_NR); if (TestClearPageDoubleMap(page)) { /* No need in mapcount reference anymore */ for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2202,7 +2208,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { spinlock_t *ptl; struct mmu_notifier_range range; - bool was_locked = false; + bool do_unlock_page = false; pmd_t _pmd; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, @@ -2218,7 +2224,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, VM_BUG_ON(freeze && !page); if (page) { VM_WARN_ON_ONCE(!PageLocked(page)); - was_locked = true; if (page != pmd_page(*pmd)) goto out; } @@ -2227,19 +2232,29 @@ repeat: if (pmd_trans_huge(*pmd)) { if (!page) { page = pmd_page(*pmd); - if (unlikely(!trylock_page(page))) { - get_page(page); - _pmd = *pmd; - spin_unlock(ptl); - lock_page(page); - spin_lock(ptl); - if (unlikely(!pmd_same(*pmd, _pmd))) { - unlock_page(page); + /* + * An anonymous page must be locked, to ensure that a + * concurrent reuse_swap_page() sees stable mapcount; + * but reuse_swap_page() is not used on shmem or file, + * and page lock must not be taken when zap_pmd_range() + * calls __split_huge_pmd() while i_mmap_lock is held. 
+ */ + if (PageAnon(page)) { + if (unlikely(!trylock_page(page))) { + get_page(page); + _pmd = *pmd; + spin_unlock(ptl); + lock_page(page); + spin_lock(ptl); + if (unlikely(!pmd_same(*pmd, _pmd))) { + unlock_page(page); + put_page(page); + page = NULL; + goto repeat; + } put_page(page); - page = NULL; - goto repeat; } - put_page(page); + do_unlock_page = true; } } if (PageMlocked(page)) @@ -2249,7 +2264,7 @@ repeat: __split_huge_pmd_locked(vma, pmd, range.start, freeze); out: spin_unlock(ptl); - if (!was_locked && page) + if (do_unlock_page) unlock_page(page); /* * No need to double call mmu_notifier->invalidate_range() callback. @@ -2742,10 +2757,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { + int nr = thp_nr_pages(head); + if (PageSwapBacked(head)) - __dec_lruvec_page_state(head, NR_SHMEM_THPS); + __mod_lruvec_page_state(head, NR_SHMEM_THPS, + -nr); else - __dec_lruvec_page_state(head, NR_FILE_THPS); + __mod_lruvec_page_state(head, NR_FILE_THPS, + -nr); } __split_huge_page(page, list, end); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 18f6ee317900..8fb42c6dd74b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -82,16 +82,26 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); -static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +static inline bool subpool_is_free(struct hugepage_subpool *spool) { - bool free = (spool->count == 0) && (spool->used_hpages == 0); + if (spool->count) + return false; + if (spool->max_hpages != -1) + return spool->used_hpages == 0; + if (spool->min_hpages != -1) + return spool->rsv_hpages == spool->min_hpages; + + return true; +} +static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) +{ spin_unlock(&spool->lock); /* If no pages are used, and no other handles to the subpool * remain, give up any 
reservations based on minimum size and * free the subpool */ - if (free) { + if (subpool_is_free(spool)) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, -spool->min_hpages); @@ -1028,6 +1038,7 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; + SetHPageFreed(page); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -1044,6 +1055,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); + ClearHPageFreed(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return page; @@ -1116,7 +1128,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { - SetPagePrivate(page); + SetHPageRestoreReserve(page); h->resv_huge_pages--; } @@ -1207,8 +1219,7 @@ static void destroy_compound_gigantic_page(struct page *page, struct page *p = page + 1; atomic_set(compound_mapcount_ptr(page), 0); - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { clear_compound_head(p); @@ -1295,14 +1306,16 @@ static inline void destroy_compound_gigantic_page(struct page *page, static void update_and_free_page(struct hstate *h, struct page *page) { int i; + struct page *subpage = page; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; - for (i = 0; i < pages_per_huge_page(h); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + for (i = 0; i < pages_per_huge_page(h); + i++, subpage = mem_map_next(subpage, page, 
i)) { + subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | 1 << PG_writeback); @@ -1336,53 +1349,6 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -/* - * Test to determine whether the hugepage is "active/in-use" (i.e. being linked - * to hstate->hugepage_activelist.) - * - * This function can be called for tail pages, but never returns true for them. - */ -bool page_huge_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHuge(page), page); - return PageHead(page) && PagePrivate(&page[1]); -} - -/* never called for tail page */ -static void set_page_huge_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHeadHuge(page), page); - SetPagePrivate(&page[1]); -} - -static void clear_page_huge_active(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHeadHuge(page), page); - ClearPagePrivate(&page[1]); -} - -/* - * Internal hugetlb specific page flag. Do not use outside of the hugetlb - * code - */ -static inline bool PageHugeTemporary(struct page *page) -{ - if (!PageHuge(page)) - return false; - - return (unsigned long)page[2].mapping == -1U; -} - -static inline void SetPageHugeTemporary(struct page *page) -{ - page[2].mapping = (void *)-1U; -} - -static inline void ClearPageHugeTemporary(struct page *page) -{ - page[2].mapping = NULL; -} - static void __free_huge_page(struct page *page) { /* @@ -1391,24 +1357,23 @@ static void __free_huge_page(struct page *page) */ struct hstate *h = page_hstate(page); int nid = page_to_nid(page); - struct hugepage_subpool *spool = - (struct hugepage_subpool *)page_private(page); + struct hugepage_subpool *spool = hugetlb_page_subpool(page); bool restore_reserve; VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(page_mapcount(page), page); - set_page_private(page, 0); + hugetlb_set_page_subpool(page, NULL); page->mapping = NULL; - restore_reserve = PagePrivate(page); - ClearPagePrivate(page); + restore_reserve = HPageRestoreReserve(page); + 
ClearHPageRestoreReserve(page); /* - * If PagePrivate() was set on page, page allocation consumed a + * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there * would have been a page reserved in the subpool before allocation * via hugepage_subpool_get_pages(). Since we are 'restoring' the - * reservtion, do not call hugepage_subpool_put_pages() as this will + * reservation, do not call hugepage_subpool_put_pages() as this will * remove the reserved page from the subpool. */ if (!restore_reserve) { @@ -1423,7 +1388,7 @@ static void __free_huge_page(struct page *page) } spin_lock(&hugetlb_lock); - clear_page_huge_active(page); + ClearHPageMigratable(page); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), @@ -1431,9 +1396,9 @@ static void __free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (PageHugeTemporary(page)) { + if (HPageTemporary(page)) { list_del(&page->lru); - ClearPageHugeTemporary(page); + ClearHPageTemporary(page); update_and_free_page(h, page); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ @@ -1500,11 +1465,13 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + hugetlb_set_page_subpool(page, NULL); set_hugetlb_cgroup(page, NULL); set_hugetlb_cgroup_rsvd(page, NULL); spin_lock(&hugetlb_lock); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; + ClearHPageFreed(page); spin_unlock(&hugetlb_lock); } @@ -1536,9 +1503,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); - - if (hpage_pincount_available(page)) - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); } /* @@ -1755,6 +1720,7 @@ 
int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; +retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!PageHuge(page)) return 0; @@ -1771,6 +1737,26 @@ int dissolve_free_huge_page(struct page *page) int nid = page_to_nid(head); if (h->free_huge_pages - h->resv_huge_pages == 0) goto out; + + /* + * We should make sure that the page is already on the free list + * when it is dissolved. + */ + if (unlikely(!HPageFreed(head))) { + spin_unlock(&hugetlb_lock); + cond_resched(); + + /* + * Theoretically, we should return -EBUSY when we + * encounter this race. In fact, we have a chance + * to successfully dissolve the page if we do a + * retry. Because the race window is quite small. + * If we seize this opportunity, it is an optimization + * for increasing the success rate of dissolving page. + */ + goto retry; + } + /* * Move PageHWPoison flag from head page to the raw error page, * which makes any subpages rather than the error page reusable. @@ -1847,7 +1833,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - SetPageHugeTemporary(page); + SetHPageTemporary(page); spin_unlock(&hugetlb_lock); put_page(page); return NULL; @@ -1878,7 +1864,7 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - SetPageHugeTemporary(page); + SetHPageTemporary(page); return page; } @@ -2009,13 +1995,16 @@ retry: /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + int zeroed; + if ((--needed) < 0) break; /* * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. 
*/ - VM_BUG_ON_PAGE(!put_page_testzero(page), page); + zeroed = put_page_testzero(page); + VM_BUG_ON_PAGE(!zeroed, page); enqueue_huge_page(h, page); } free: @@ -2213,24 +2202,24 @@ static long vma_add_reservation(struct hstate *h, * This routine is called to restore a reservation on error paths. In the * specific error paths, a huge page was allocated (via alloc_huge_page) * and is about to be freed. If a reservation for the page existed, - * alloc_huge_page would have consumed the reservation and set PagePrivate - * in the newly allocated page. When the page is freed via free_huge_page, - * the global reservation count will be incremented if PagePrivate is set. - * However, free_huge_page can not adjust the reserve map. Adjust the - * reserve map here to be consistent with global reserve count adjustments - * to be made by free_huge_page. + * alloc_huge_page would have consumed the reservation and set + * HPageRestoreReserve in the newly allocated page. When the page is freed + * via free_huge_page, the global reservation count will be incremented if + * HPageRestoreReserve is set. However, free_huge_page can not adjust the + * reserve map. Adjust the reserve map here to be consistent with global + * reserve count adjustments to be made by free_huge_page. */ static void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page) { - if (unlikely(PagePrivate(page))) { + if (unlikely(HPageRestoreReserve(page))) { long rc = vma_needs_reservation(h, vma, address); if (unlikely(rc < 0)) { /* * Rare out of memory condition in reserve map - * manipulation. Clear PagePrivate so that + * manipulation. Clear HPageRestoreReserve so that * global reserve count will not be incremented * by free_huge_page. 
This will make it appear * as though the reservation for this page was @@ -2239,7 +2228,7 @@ static void restore_reserve_on_error(struct hstate *h, * is better than inconsistent global huge page * accounting of reserve counts. */ - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); } else if (rc) { rc = vma_add_reservation(h, vma, address); if (unlikely(rc < 0)) @@ -2247,7 +2236,7 @@ static void restore_reserve_on_error(struct hstate *h, * See above comment about rare out of * memory condition. */ - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); } else vma_end_reservation(h, vma, address); } @@ -2328,7 +2317,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) goto out_uncharge_cgroup; if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { - SetPagePrivate(page); + SetHPageRestoreReserve(page); h->resv_huge_pages--; } spin_lock(&hugetlb_lock); @@ -2346,7 +2335,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); - set_page_private(page, (unsigned long)spool); + hugetlb_set_page_subpool(page, spool); map_commit = vma_commit_reservation(h, vma, addr); if (unlikely(map_chg > map_commit)) { @@ -2435,7 +2424,7 @@ static void __init gather_bootmem_prealloc(void) struct hstate *h = m->hstate; WARN_ON(page_count(page) != 1); - prep_compound_huge_page(page, h->order); + prep_compound_huge_page(page, huge_page_order(h)); WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* free it into the hugepage allocator */ @@ -2447,7 +2436,7 @@ static void __init gather_bootmem_prealloc(void) * side-effects, like CommitLimit going negative. 
*/ if (hstate_is_gigantic(h)) - adjust_managed_page_count(page, 1 << h->order); + adjust_managed_page_count(page, pages_per_huge_page(h)); cond_resched(); } } @@ -2479,7 +2468,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (hstate_is_gigantic(h)) { if (hugetlb_cma_size) { pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); - break; + goto free; } if (!alloc_bootmem_huge_page(h)) break; @@ -2497,7 +2486,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) h->max_huge_pages, buf, i); h->max_huge_pages = i; } - +free: kfree(node_alloc_noretry); } @@ -2947,8 +2936,10 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, return -ENOMEM; retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); - if (retval) + if (retval) { kobject_put(hstate_kobjs[hi]); + hstate_kobjs[hi] = NULL; + } return retval; } @@ -3118,6 +3109,9 @@ static int __init hugetlb_init(void) { int i; + BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE < + __NR_HPAGEFLAGS); + if (!hugepages_supported()) { if (hugetlb_max_hstate || default_hstate_max_huge_pages) pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n"); @@ -3198,7 +3192,7 @@ void __init hugetlb_add_hstate(unsigned int order) BUG_ON(order == 0); h = &hstates[hugetlb_max_hstate++]; h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + h->mask = ~(huge_page_size(h) - 1); for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); @@ -3367,8 +3361,7 @@ static unsigned int allowed_mems_nr(struct hstate *h) mpol_allowed = policy_nodemask_current(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mpol_allowed || - (mpol_allowed && node_isset(node, *mpol_allowed))) + if (!mpol_allowed || node_isset(node, *mpol_allowed)) nr += array[node]; } @@ -3474,7 +3467,7 @@ void hugetlb_report_meminfo(struct 
seq_file *m) for_each_hstate(h) { unsigned long count = h->nr_huge_pages; - total += (PAGE_SIZE << huge_page_order(h)) * count; + total += huge_page_size(h) * count; if (h == &default_hstate) seq_printf(m, @@ -3487,10 +3480,10 @@ void hugetlb_report_meminfo(struct seq_file *m) h->free_huge_pages, h->resv_huge_pages, h->surplus_huge_pages, - (PAGE_SIZE << huge_page_order(h)) / 1024); + huge_page_size(h) / SZ_1K); } - seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); + seq_printf(m, "Hugetlb: %8lu kB\n", total / SZ_1K); } int hugetlb_report_node_meminfo(char *buf, int len, int nid) @@ -3524,7 +3517,7 @@ void hugetlb_show_meminfo(void) h->nr_huge_pages_node[nid], h->free_huge_pages_node[nid], h->surplus_huge_pages_node[nid], - 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); + huge_page_size(h) / SZ_1K); } void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) @@ -3548,6 +3541,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta) { int ret = -ENOMEM; + if (!delta) + return 0; + spin_lock(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page @@ -3644,15 +3640,13 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) { - struct hstate *hstate = hstate_vma(vma); - - return 1UL << huge_page_shift(hstate); + return huge_page_size(hstate_vma(vma)); } /* * We cannot handle pagefaults against hugetlb pages at all. They cause * handle_mm_fault() to try to instantiate regular-sized pages in the - * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get + * hugepage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. 
*/ static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) @@ -3967,30 +3961,16 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - struct mm_struct *mm; struct mmu_gather tlb; - unsigned long tlb_start = start; - unsigned long tlb_end = end; - - /* - * If shared PMDs were possibly used within this vma range, adjust - * start/end for worst case tlb flushing. - * Note that we can not be sure if PMDs are shared until we try to - * unmap pages. However, we want to make sure TLB flushing covers - * the largest possible range. - */ - adjust_range_if_pmd_sharing_possible(vma, &tlb_start, &tlb_end); - - mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, tlb_start, tlb_end); + tlb_gather_mmu(&tlb, vma->vm_mm); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); - tlb_finish_mmu(&tlb, tlb_start, tlb_end); + tlb_finish_mmu(&tlb); } /* * This is called when the original mapper is failing to COW a MAP_PRIVATE - * mappping it owns the reserve page for. The intention is to unmap the page + * mapping it owns the reserve page for. The intention is to unmap the page * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. 
*/ @@ -4169,7 +4149,7 @@ retry_avoidcopy: spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { - ClearPagePrivate(new_page); + ClearHPageRestoreReserve(new_page); /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); @@ -4178,7 +4158,7 @@ retry_avoidcopy: make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); - set_page_huge_active(new_page); + SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; } @@ -4236,7 +4216,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, if (err) return err; - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); /* * set page dirty so that it will not be removed from cache/file @@ -4398,7 +4378,7 @@ retry: goto backout; if (anon_rmap) { - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_rmap(page, true); @@ -4415,12 +4395,12 @@ retry: spin_unlock(ptl); /* - * Only make newly allocated pages active. Existing pages found - * in the pagecache could be !page_huge_active() if they have been - * isolated for migration. + * Only set HPageMigratable in newly allocated pages. Existing pages + * found in the pagecache may not have HPageMigratable set if they have + * been isolated for migration. */ if (new_page) - set_page_huge_active(page); + SetHPageMigratable(page); unlock_page(page); out: @@ -4450,7 +4430,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) } #else /* - * For uniprocesor systems we always use a single mutex, so just + * For uniprocessor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead.
*/ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx) @@ -4712,7 +4692,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (vm_shared) { page_dup_rmap(page, true); } else { - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); } @@ -4731,7 +4711,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, update_mmu_cache(dst_vma, dst_addr, dst_pte); spin_unlock(ptl); - set_page_huge_active(page); + SetHPageMigratable(page); if (vm_shared) unlock_page(page); ret = 0; @@ -4746,6 +4726,20 @@ out_release_nounlock: goto out; } +static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, + int refs, struct page **pages, + struct vm_area_struct **vmas) +{ + int nr; + + for (nr = 0; nr < refs; nr++) { + if (likely(pages)) + pages[nr] = mem_map_offset(page, nr); + if (vmas) + vmas[nr] = vma; + } +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@ -4755,7 +4749,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr = *position; unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - int err = -EFAULT; + int err = -EFAULT, refs; while (vaddr < vma->vm_end && remainder) { pte_t *pte; @@ -4875,20 +4869,29 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, continue; } -same_page: + refs = min3(pages_per_huge_page(h) - pfn_offset, + (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder); + + if (pages || vmas) + record_subpages_vmas(mem_map_offset(page, pfn_offset), + vma, refs, + likely(pages) ? pages + i : NULL, + vmas ? 
vmas + i : NULL); + if (pages) { - pages[i] = mem_map_offset(page, pfn_offset); /* - * try_grab_page() should always succeed here, because: - * a) we hold the ptl lock, and b) we've just checked - * that the huge page is present in the page tables. If - * the huge page is present, then the tail pages must - * also be present. The ptl prevents the head page and - * tail pages from being rearranged in any way. So this - * page must be available at this point, unless the page - * refcount overflowed: + * try_grab_compound_head() should always succeed here, + * because: a) we hold the ptl lock, and b) we've just + * checked that the huge page is present in the page + * tables. If the huge page is present, then the tail + * pages must also be present. The ptl prevents the + * head page and tail pages from being rearranged in + * any way. So this page must be available at this + * point, unless the page refcount overflowed: */ - if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) { + if (WARN_ON_ONCE(!try_grab_compound_head(pages[i], + refs, + flags))) { spin_unlock(ptl); remainder = 0; err = -ENOMEM; @@ -4896,21 +4899,10 @@ same_page: } } - if (vmas) - vmas[i] = vma; - - vaddr += PAGE_SIZE; - ++pfn_offset; - --remainder; - ++i; - if (vaddr < vma->vm_end && remainder && - pfn_offset < pages_per_huge_page(h)) { - /* - * We use pfn_offset to avoid touching the pageframes - * of this compound page. - */ - goto same_page; - } + vaddr += (refs << PAGE_SHIFT); + remainder -= refs; + i += refs; + spin_unlock(ptl); } *nr_pages = remainder; @@ -5024,12 +5016,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, return pages << h->order; } -int hugetlb_reserve_pages(struct inode *inode, +/* Return true if reservation was successful, false otherwise. 
*/ +bool hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, vm_flags_t vm_flags) { - long ret, chg, add = -1; + long chg, add = -1; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; @@ -5039,7 +5032,7 @@ int hugetlb_reserve_pages(struct inode *inode, /* This should never happen */ if (from > to) { VM_WARN(1, "%s called with a negative range\n", __func__); - return -EINVAL; + return false; } /* @@ -5048,7 +5041,7 @@ int hugetlb_reserve_pages(struct inode *inode, * without using reserves */ if (vm_flags & VM_NORESERVE) - return 0; + return true; /* * Shared mappings base their reservation on the number of pages that @@ -5070,7 +5063,7 @@ int hugetlb_reserve_pages(struct inode *inode, /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) - return -ENOMEM; + return false; chg = to - from; @@ -5078,18 +5071,12 @@ int hugetlb_reserve_pages(struct inode *inode, set_vma_resv_flags(vma, HPAGE_RESV_OWNER); } - if (chg < 0) { - ret = chg; + if (chg < 0) goto out_err; - } - - ret = hugetlb_cgroup_charge_cgroup_rsvd( - hstate_index(h), chg * pages_per_huge_page(h), &h_cg); - if (ret < 0) { - ret = -ENOMEM; + if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h), + chg * pages_per_huge_page(h), &h_cg) < 0) goto out_err; - } if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) { /* For private mappings, the hugetlb_cgroup uncharge info hangs @@ -5104,19 +5091,15 @@ int hugetlb_reserve_pages(struct inode *inode, * reservations already in place (gbl_reserve). */ gbl_reserve = hugepage_subpool_get_pages(spool, chg); - if (gbl_reserve < 0) { - ret = -ENOSPC; + if (gbl_reserve < 0) goto out_uncharge_cgroup; - } /* * Check enough hugepages are available for the reservation. 
* Hand the pages back to the subpool if there are not */ - ret = hugetlb_acct_memory(h, gbl_reserve); - if (ret < 0) { + if (hugetlb_acct_memory(h, gbl_reserve) < 0) goto out_put_pages; - } /* * Account for the reservations made. Shared mappings record regions @@ -5134,7 +5117,6 @@ int hugetlb_reserve_pages(struct inode *inode, if (unlikely(add < 0)) { hugetlb_acct_memory(h, -gbl_reserve); - ret = add; goto out_put_pages; } else if (unlikely(chg > add)) { /* @@ -5155,7 +5137,8 @@ int hugetlb_reserve_pages(struct inode *inode, hugetlb_acct_memory(h, -rsv_adjust); } } - return 0; + return true; + out_put_pages: /* put back original number of pages, chg */ (void)hugepage_subpool_put_pages(spool, chg); @@ -5171,7 +5154,7 @@ out_err: region_abort(resv_map, from, to, regions_needed); if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_put(&resv_map->refs, resv_map_release); - return ret; + return false; } long hugetlb_unreserve_pages(struct inode *inode, long start, long end, @@ -5232,7 +5215,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, */ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || - sbase < svma->vm_start || svma->vm_end < s_end) + !range_in_vma(svma, sbase, s_end)) return 0; return saddr; @@ -5259,21 +5242,23 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, unsigned long *start, unsigned long *end) { - unsigned long a_start, a_end; + unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE), + v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); - if (!(vma->vm_flags & VM_MAYSHARE)) + /* + * vma need span at least one aligned PUD size and the start,end range + * must be at least partially within it.
+ */ + if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) || + (*end <= v_start) || (*start >= v_end)) return; /* Extend the range to be PUD aligned for a worst case scenario */ - a_start = ALIGN_DOWN(*start, PUD_SIZE); - a_end = ALIGN(*end, PUD_SIZE); + if (*start > v_start) + *start = ALIGN_DOWN(*start, PUD_SIZE); - /* - * Intersect the range with the vma range, since pmd sharing won't be - * across vma after all - */ - *start = max(vma->vm_start, a_start); - *end = min(vma->vm_end, a_end); + if (*end < v_end) + *end = ALIGN(*end, PUD_SIZE); } /* @@ -5555,13 +5540,14 @@ bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; - VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); - if (!page_huge_active(page) || !get_page_unless_zero(page)) { + if (!PageHeadHuge(page) || + !HPageMigratable(page) || + !get_page_unless_zero(page)) { ret = false; goto unlock; } - clear_page_huge_active(page); + ClearHPageMigratable(page); list_move_tail(&page->lru, list); unlock: spin_unlock(&hugetlb_lock); @@ -5570,9 +5556,8 @@ unlock: void putback_active_hugepage(struct page *page) { - VM_BUG_ON_PAGE(!PageHead(page), page); spin_lock(&hugetlb_lock); - set_page_huge_active(page); + SetHPageMigratable(page); list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); spin_unlock(&hugetlb_lock); put_page(page); @@ -5595,12 +5580,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) * here as well otherwise the global surplus count will not match * the per-node's. 
*/ - if (PageHugeTemporary(newpage)) { + if (HPageTemporary(newpage)) { int old_nid = page_to_nid(oldpage); int new_nid = page_to_nid(newpage); - SetPageHugeTemporary(oldpage); - ClearPageHugeTemporary(newpage); + SetHPageTemporary(oldpage); + ClearHPageTemporary(newpage); spin_lock(&hugetlb_lock); if (h->surplus_huge_pages_node[old_nid]) { diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9182848dda3e..f68b51fcda3d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -113,7 +113,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, rsvd_parent); limit = round_down(PAGE_COUNTER_MAX, - 1 << huge_page_order(&hstates[idx])); + pages_per_huge_page(&hstates[idx])); ret = page_counter_set_max( hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), @@ -460,7 +460,7 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) counter = &h_cg->hugepage[idx]; limit = round_down(PAGE_COUNTER_MAX, - 1 << huge_page_order(&hstates[idx])); + pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: @@ -507,7 +507,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret; idx = MEMFILE_IDX(of_cft(of)->private); - nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx])); + nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b25167664ead..b18189ef3a92 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -60,7 +60,7 @@ void kasan_disable_current(void) void __kasan_unpoison_range(const void *address, size_t size) { - unpoison_range(address, size); + kasan_unpoison(address, size); } #if CONFIG_KASAN_STACK @@ -69,7 +69,7 @@ void kasan_unpoison_task_stack(struct task_struct *task) { void *base = task_stack_page(task); - unpoison_range(base, THREAD_SIZE); + kasan_unpoison(base, THREAD_SIZE); } /* Unpoison the stack for the current 
task beyond a watermark sp value. */ @@ -82,7 +82,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - unpoison_range(base, watermark - base); + kasan_unpoison(base, watermark - base); } #endif /* CONFIG_KASAN_STACK */ @@ -105,18 +105,17 @@ void __kasan_alloc_pages(struct page *page, unsigned int order) if (unlikely(PageHighMem(page))) return; - tag = random_tag(); + tag = kasan_random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - unpoison_range(page_address(page), PAGE_SIZE << order); + kasan_unpoison(page_address(page), PAGE_SIZE << order); } void __kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) - poison_range(page_address(page), - PAGE_SIZE << order, - KASAN_FREE_PAGE); + kasan_poison(page_address(page), PAGE_SIZE << order, + KASAN_FREE_PAGE); } /* @@ -246,18 +245,18 @@ void __kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); - poison_range(page_address(page), page_size(page), + kasan_poison(page_address(page), page_size(page), KASAN_KMALLOC_REDZONE); } void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - unpoison_range(object, cache->object_size); + kasan_unpoison(object, cache->object_size); } void __kasan_poison_object_data(struct kmem_cache *cache, void *object) { - poison_range(object, cache->object_size, KASAN_KMALLOC_REDZONE); + kasan_poison(object, cache->object_size, KASAN_KMALLOC_REDZONE); } /* @@ -294,7 +293,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object, * set, assign a tag when the object is being allocated (init == false). */ if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU)) - return init ? KASAN_TAG_KERNEL : random_tag(); + return init ? 
KASAN_TAG_KERNEL : kasan_random_tag(); /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */ #ifdef CONFIG_SLAB @@ -305,7 +304,7 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object, * For SLUB assign a random tag during slab creation, otherwise reuse * the already assigned tag. */ - return init ? random_tag() : get_tag(object); + return init ? kasan_random_tag() : get_tag(object); #endif } @@ -346,12 +345,12 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; - if (check_invalid_free(tagged_object)) { + if (!kasan_byte_accessible(tagged_object)) { kasan_report_invalid_free(tagged_object, ip); return true; } - poison_range(object, cache->object_size, KASAN_KMALLOC_FREE); + kasan_poison(object, cache->object_size, KASAN_KMALLOC_FREE); if (!kasan_stack_collection_enabled()) return false; @@ -361,7 +360,7 @@ static bool ____kasan_slab_free(struct kmem_cache *cache, void *object, kasan_set_free_info(cache, object, tag); - return quarantine_put(cache, object); + return kasan_quarantine_put(cache, object); } bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip) @@ -386,7 +385,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - poison_range(ptr, page_size(page), KASAN_FREE_PAGE); + kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE); } else { ____kasan_slab_free(page->slab_cache, ptr, ip, false); } @@ -409,7 +408,7 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, u8 tag; if (gfpflags_allow_blocking(flags)) - quarantine_reduce(); + kasan_quarantine_reduce(); if (unlikely(object == NULL)) return NULL; @@ -421,9 +420,9 @@ static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object, tag = assign_tag(cache, object, false, keep_tag); /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */ - 
unpoison_range(set_tag(object, tag), size); - poison_range((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + kasan_unpoison(set_tag(object, tag), size); + kasan_poison((void *)redzone_start, redzone_end - redzone_start, + KASAN_KMALLOC_REDZONE); if (kasan_stack_collection_enabled()) set_alloc_info(cache, (void *)object, flags); @@ -452,7 +451,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, unsigned long redzone_end; if (gfpflags_allow_blocking(flags)) - quarantine_reduce(); + kasan_quarantine_reduce(); if (unlikely(ptr == NULL)) return NULL; @@ -462,8 +461,8 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size, KASAN_GRANULE_SIZE); redzone_end = (unsigned long)ptr + page_size(page); - unpoison_range(ptr, size); - poison_range((void *)redzone_start, redzone_end - redzone_start, + kasan_unpoison(ptr, size); + kasan_poison((void *)redzone_start, redzone_end - redzone_start, KASAN_PAGE_REDZONE); return (void *)ptr; @@ -491,3 +490,12 @@ void __kasan_kfree_large(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); /* The object will be poisoned by kasan_free_pages(). 
*/ } + +bool __kasan_check_byte(const void *address, unsigned long ip) +{ + if (!kasan_byte_accessible(address)) { + kasan_report((unsigned long)address, 1, false, ip); + return false; + } + return true; +} diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5106b84b07d4..3f17a1218055 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -158,7 +158,7 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } -static __always_inline bool check_memory_region_inline(unsigned long addr, +static __always_inline bool check_region_inline(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { @@ -179,37 +179,37 @@ static __always_inline bool check_memory_region_inline(unsigned long addr, return !kasan_report(addr, size, write, ret_ip); } -bool check_memory_region(unsigned long addr, size_t size, bool write, - unsigned long ret_ip) +bool kasan_check_range(unsigned long addr, size_t size, bool write, + unsigned long ret_ip) { - return check_memory_region_inline(addr, size, write, ret_ip); + return check_region_inline(addr, size, write, ret_ip); } -bool check_invalid_free(void *addr) +bool kasan_byte_accessible(const void *addr) { s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr)); - return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE; + return shadow_byte >= 0 && shadow_byte < KASAN_GRANULE_SIZE; } void kasan_cache_shrink(struct kmem_cache *cache) { - quarantine_remove_cache(cache); + kasan_quarantine_remove_cache(cache); } void kasan_cache_shutdown(struct kmem_cache *cache) { if (!__kmem_cache_empty(cache)) - quarantine_remove_cache(cache); + kasan_quarantine_remove_cache(cache); } static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE); - unpoison_range(global->beg, global->size); + kasan_unpoison(global->beg, global->size); - poison_range(global->beg + aligned_size, + kasan_poison(global->beg + 
aligned_size, global->size_with_redzone - aligned_size, KASAN_GLOBAL_REDZONE); } @@ -231,7 +231,7 @@ EXPORT_SYMBOL(__asan_unregister_globals); #define DEFINE_ASAN_LOAD_STORE(size) \ void __asan_load##size(unsigned long addr) \ { \ - check_memory_region_inline(addr, size, false, _RET_IP_);\ + check_region_inline(addr, size, false, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_load##size); \ __alias(__asan_load##size) \ @@ -239,7 +239,7 @@ EXPORT_SYMBOL(__asan_unregister_globals); EXPORT_SYMBOL(__asan_load##size##_noabort); \ void __asan_store##size(unsigned long addr) \ { \ - check_memory_region_inline(addr, size, true, _RET_IP_); \ + check_region_inline(addr, size, true, _RET_IP_); \ } \ EXPORT_SYMBOL(__asan_store##size); \ __alias(__asan_store##size) \ @@ -254,7 +254,7 @@ DEFINE_ASAN_LOAD_STORE(16); void __asan_loadN(unsigned long addr, size_t size) { - check_memory_region(addr, size, false, _RET_IP_); + kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_loadN); @@ -264,7 +264,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort); void __asan_storeN(unsigned long addr, size_t size) { - check_memory_region(addr, size, true, _RET_IP_); + kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__asan_storeN); @@ -290,11 +290,11 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); - unpoison_range((const void *)(addr + rounded_down_size), - size - rounded_down_size); - poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + kasan_unpoison((const void *)(addr + rounded_down_size), + size - rounded_down_size); + kasan_poison(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, KASAN_ALLOCA_LEFT); - poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, + kasan_poison(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, KASAN_ALLOCA_RIGHT); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -305,7 +305,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if 
(unlikely(!stack_top || stack_top > stack_bottom)) return; - unpoison_range(stack_top, stack_bottom - stack_top); + kasan_unpoison(stack_top, stack_bottom - stack_top); } EXPORT_SYMBOL(__asan_allocas_unpoison); diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index e529428e7a11..b31aeef505dd 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -134,12 +134,8 @@ void __init kasan_init_hw_tags(void) switch (kasan_arg_stacktrace) { case KASAN_ARG_STACKTRACE_DEFAULT: - /* - * Default to enabling stack trace collection for - * debug kernels. - */ - if (IS_ENABLED(CONFIG_DEBUG_KERNEL)) - static_branch_enable(&kasan_flag_stacktrace); + /* Default to enabling stack trace collection. */ + static_branch_enable(&kasan_flag_stacktrace); break; case KASAN_ARG_STACKTRACE_OFF: /* Do nothing, kasan_flag_stacktrace keeps its default value. */ @@ -189,3 +185,19 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, return &alloc_meta->free_track[0]; } + +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +void kasan_set_tagging_report_once(bool state) +{ + hw_set_tagging_report_once(state); +} +EXPORT_SYMBOL_GPL(kasan_set_tagging_report_once); + +void kasan_enable_tagging(void) +{ + hw_enable_tagging(); +} +EXPORT_SYMBOL_GPL(kasan_enable_tagging); + +#endif diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cc4d9e1d49b1..cc14b6e6c14c 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -36,6 +36,12 @@ extern bool kasan_flag_panic __ro_after_init; #define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */ #define KASAN_TAG_MAX 0xFD /* maximum value for random tags */ +#ifdef CONFIG_KASAN_HW_TAGS +#define KASAN_TAG_MIN 0xF0 /* minimum value for random tags */ +#else +#define KASAN_TAG_MIN 0x00 /* minimum value for random tags */ +#endif + #ifdef CONFIG_KASAN_GENERIC #define KASAN_FREE_PAGE 0xFF /* page was freed */ #define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */ @@ -195,39 +201,39 @@ static inline bool addr_has_metadata(const void
*addr) } /** - * check_memory_region - Check memory region, and report if invalid access. + * kasan_check_range - Check memory region, and report if invalid access. * @addr: the accessed address * @size: the accessed size * @write: true if access is a write access * @ret_ip: return address * @return: true if access was valid, false if invalid */ -bool check_memory_region(unsigned long addr, size_t size, bool write, +bool kasan_check_range(unsigned long addr, size_t size, bool write, unsigned long ret_ip); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ static inline bool addr_has_metadata(const void *addr) { - return true; + return (is_vmalloc_addr(addr) || virt_addr_valid(addr)); } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) -void print_tags(u8 addr_tag, const void *addr); +void kasan_print_tags(u8 addr_tag, const void *addr); #else -static inline void print_tags(u8 addr_tag, const void *addr) { } +static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif -void *find_first_bad_addr(void *addr, size_t size); -const char *get_bug_type(struct kasan_access_info *info); -void metadata_fetch_row(char *buffer, void *row); +void *kasan_find_first_bad_addr(void *addr, size_t size); +const char *kasan_get_bug_type(struct kasan_access_info *info); +void kasan_metadata_fetch_row(char *buffer, void *row); #if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK -void print_address_stack_frame(const void *addr); +void kasan_print_address_stack_frame(const void *addr); #else -static inline void print_address_stack_frame(const void *addr) { } +static inline void kasan_print_address_stack_frame(const void *addr) { } #endif bool kasan_report(unsigned long addr, size_t size, @@ -244,13 +250,13 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) -bool quarantine_put(struct 
kmem_cache *cache, void *object); -void quarantine_reduce(void); -void quarantine_remove_cache(struct kmem_cache *cache); +bool kasan_quarantine_put(struct kmem_cache *cache, void *object); +void kasan_quarantine_reduce(void); +void kasan_quarantine_remove_cache(struct kmem_cache *cache); #else -static inline bool quarantine_put(struct kmem_cache *cache, void *object) { return false; } -static inline void quarantine_reduce(void) { } -static inline void quarantine_remove_cache(struct kmem_cache *cache) { } +static inline bool kasan_quarantine_put(struct kmem_cache *cache, void *object) { return false; } +static inline void kasan_quarantine_reduce(void) { } +static inline void kasan_quarantine_remove_cache(struct kmem_cache *cache) { } #endif #ifndef arch_kasan_set_tag @@ -274,6 +280,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #ifndef arch_init_tags #define arch_init_tags(max_tag) #endif +#ifndef arch_set_tagging_report_once +#define arch_set_tagging_report_once(state) +#endif #ifndef arch_get_random_tag #define arch_get_random_tag() (0xFF) #endif @@ -286,48 +295,66 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) #define hw_enable_tagging() arch_enable_tagging() #define hw_init_tags(max_tag) arch_init_tags(max_tag) +#define hw_set_tagging_report_once(state) arch_set_tagging_report_once(state) #define hw_get_random_tag() arch_get_random_tag() #define hw_get_mem_tag(addr) arch_get_mem_tag(addr) #define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag)) +#else /* CONFIG_KASAN_HW_TAGS */ + +#define hw_enable_tagging() +#define hw_set_tagging_report_once(state) + #endif /* CONFIG_KASAN_HW_TAGS */ +#if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +void kasan_set_tagging_report_once(bool state); +void kasan_enable_tagging(void); + +#else /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ + +static inline void kasan_set_tagging_report_once(bool state) { } 
+static inline void kasan_enable_tagging(void) { } + +#endif /* CONFIG_KASAN_HW_TAGS || CONFIG_KASAN_KUNIT_TEST */ + #ifdef CONFIG_KASAN_SW_TAGS -u8 random_tag(void); +u8 kasan_random_tag(void); #elif defined(CONFIG_KASAN_HW_TAGS) -static inline u8 random_tag(void) { return hw_get_random_tag(); } +static inline u8 kasan_random_tag(void) { return hw_get_random_tag(); } #else -static inline u8 random_tag(void) { return 0; } +static inline u8 kasan_random_tag(void) { return 0; } #endif #ifdef CONFIG_KASAN_HW_TAGS -static inline void poison_range(const void *address, size_t size, u8 value) +static inline void kasan_poison(const void *address, size_t size, u8 value) { hw_set_mem_tag_range(kasan_reset_tag(address), round_up(size, KASAN_GRANULE_SIZE), value); } -static inline void unpoison_range(const void *address, size_t size) +static inline void kasan_unpoison(const void *address, size_t size) { hw_set_mem_tag_range(kasan_reset_tag(address), round_up(size, KASAN_GRANULE_SIZE), get_tag(address)); } -static inline bool check_invalid_free(void *addr) +static inline bool kasan_byte_accessible(const void *addr) { u8 ptr_tag = get_tag(addr); - u8 mem_tag = hw_get_mem_tag(addr); + u8 mem_tag = hw_get_mem_tag((void *)addr); - return (mem_tag == KASAN_TAG_INVALID) || - (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag); + return (mem_tag != KASAN_TAG_INVALID) && + (ptr_tag == KASAN_TAG_KERNEL || ptr_tag == mem_tag); } #else /* CONFIG_KASAN_HW_TAGS */ -void poison_range(const void *address, size_t size, u8 value); -void unpoison_range(const void *address, size_t size); -bool check_invalid_free(void *addr); +void kasan_poison(const void *address, size_t size, u8 value); +void kasan_unpoison(const void *address, size_t size); +bool kasan_byte_accessible(const void *addr); #endif /* CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index 55783125a767..728fb24c5683 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -168,7 +168,7 @@ 
static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) qlist_init(q); } -bool quarantine_put(struct kmem_cache *cache, void *object) +bool kasan_quarantine_put(struct kmem_cache *cache, void *object) { unsigned long flags; struct qlist_head *q; @@ -184,11 +184,11 @@ bool quarantine_put(struct kmem_cache *cache, void *object) /* * Note: irq must be disabled until after we move the batch to the - * global quarantine. Otherwise quarantine_remove_cache() can miss - * some objects belonging to the cache if they are in our local temp - * list. quarantine_remove_cache() executes on_each_cpu() at the - * beginning which ensures that it either sees the objects in per-cpu - * lists or in the global quarantine. + * global quarantine. Otherwise kasan_quarantine_remove_cache() can + * miss some objects belonging to the cache if they are in our local + * temp list. kasan_quarantine_remove_cache() executes on_each_cpu() + * at the beginning which ensures that it either sees the objects in + * per-cpu lists or in the global quarantine. */ local_irq_save(flags); @@ -222,7 +222,7 @@ bool quarantine_put(struct kmem_cache *cache, void *object) return true; } -void quarantine_reduce(void) +void kasan_quarantine_reduce(void) { size_t total_size, new_quarantine_size, percpu_quarantines; unsigned long flags; @@ -234,7 +234,7 @@ void quarantine_reduce(void) return; /* - * srcu critical section ensures that quarantine_remove_cache() + * srcu critical section ensures that kasan_quarantine_remove_cache() * will not miss objects belonging to the cache while they are in our * local to_free list. srcu is chosen because (1) it gives us private * grace period domain that does not interfere with anything else, @@ -309,15 +309,15 @@ static void per_cpu_remove_cache(void *arg) } /* Free all quarantined objects belonging to cache. 
*/ -void quarantine_remove_cache(struct kmem_cache *cache) +void kasan_quarantine_remove_cache(struct kmem_cache *cache) { unsigned long flags, i; struct qlist_head to_free = QLIST_INIT; /* * Must be careful to not miss any objects that are being moved from - * per-cpu list to the global quarantine in quarantine_put(), - * nor objects being freed in quarantine_reduce(). on_each_cpu() + * per-cpu list to the global quarantine in kasan_quarantine_put(), + * nor objects being freed in kasan_quarantine_reduce(). on_each_cpu() * achieves the first goal, while synchronize_srcu() achieves the * second. */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index c0fb21797550..234f35a84f19 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -61,7 +61,7 @@ __setup("kasan_multi_shot", kasan_set_multi_shot); static void print_error_description(struct kasan_access_info *info) { pr_err("BUG: KASAN: %s in %pS\n", - get_bug_type(info), (void *)info->ip); + kasan_get_bug_type(info), (void *)info->ip); if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -247,7 +247,7 @@ static void print_address_description(void *addr, u8 tag) dump_page(page, "kasan: bad access detected"); } - print_address_stack_frame(addr); + kasan_print_address_stack_frame(addr); } static bool meta_row_is_guilty(const void *row, const void *addr) @@ -293,7 +293,7 @@ static void print_memory_metadata(const void *addr) * function, because generic functions may try to * access kasan mapping for the passed address. 
*/ - metadata_fetch_row(&metadata[0], row); + kasan_metadata_fetch_row(&metadata[0], row); print_hex_dump(KERN_ERR, buffer, DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1, @@ -331,7 +331,7 @@ static void kasan_update_kunit_status(struct kunit *cur_test) } kasan_data = (struct kunit_kasan_expectation *)resource->data; - kasan_data->report_found = true; + WRITE_ONCE(kasan_data->report_found, true); kunit_put_resource(resource); } #endif /* IS_ENABLED(CONFIG_KUNIT) */ @@ -350,7 +350,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) start_report(&flags); pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip); - print_tags(tag, object); + kasan_print_tags(tag, object); pr_err("\n"); print_address_description(object, tag); pr_err("\n"); @@ -378,7 +378,8 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, info.access_addr = tagged_addr; if (addr_has_metadata(untagged_addr)) - info.first_bad_addr = find_first_bad_addr(tagged_addr, size); + info.first_bad_addr = + kasan_find_first_bad_addr(tagged_addr, size); else info.first_bad_addr = untagged_addr; info.access_size = size; @@ -389,7 +390,7 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write, print_error_description(&info); if (addr_has_metadata(untagged_addr)) - print_tags(get_tag(tagged_addr), info.first_bad_addr); + kasan_print_tags(get_tag(tagged_addr), info.first_bad_addr); pr_err("\n"); if (addr_has_metadata(untagged_addr)) { diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 8a9c889872da..41f374585144 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -30,7 +30,7 @@ #include "kasan.h" #include "../slab.h" -void *find_first_bad_addr(void *addr, size_t size) +void *kasan_find_first_bad_addr(void *addr, size_t size) { void *p = addr; @@ -105,7 +105,7 @@ static const char *get_wild_bug_type(struct kasan_access_info *info) return bug_type; } -const char *get_bug_type(struct kasan_access_info *info) 
+const char *kasan_get_bug_type(struct kasan_access_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -123,7 +123,7 @@ const char *get_bug_type(struct kasan_access_info *info) return get_wild_bug_type(info); } -void metadata_fetch_row(char *buffer, void *row) +void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } @@ -263,7 +263,7 @@ static bool __must_check get_address_stack_frame_info(const void *addr, return true; } -void print_address_stack_frame(const void *addr) +void kasan_print_address_stack_frame(const void *addr) { unsigned long offset; const char *frame_descr; diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c index 57114f0e14d1..42b2168755d6 100644 --- a/mm/kasan/report_hw_tags.c +++ b/mm/kasan/report_hw_tags.c @@ -15,17 +15,17 @@ #include "kasan.h" -const char *get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_access_info *info) { return "invalid-access"; } -void *find_first_bad_addr(void *addr, size_t size) +void *kasan_find_first_bad_addr(void *addr, size_t size) { return kasan_reset_tag(addr); } -void metadata_fetch_row(char *buffer, void *row) +void kasan_metadata_fetch_row(char *buffer, void *row) { int i; @@ -33,7 +33,7 @@ void metadata_fetch_row(char *buffer, void *row) buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE); } -void print_tags(u8 addr_tag, const void *addr) +void kasan_print_tags(u8 addr_tag, const void *addr) { u8 memory_tag = hw_get_mem_tag((void *)addr); diff --git a/mm/kasan/report_sw_tags.c b/mm/kasan/report_sw_tags.c index 1b026793ad57..3d20d3451d9e 100644 --- a/mm/kasan/report_sw_tags.c +++ b/mm/kasan/report_sw_tags.c @@ -29,7 +29,7 @@ #include "kasan.h" #include "../slab.h" -const char *get_bug_type(struct kasan_access_info *info) +const char *kasan_get_bug_type(struct kasan_access_info *info) { #ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY struct kasan_alloc_meta 
*alloc_meta; @@ -72,7 +72,7 @@ const char *get_bug_type(struct kasan_access_info *info) return "invalid-access"; } -void *find_first_bad_addr(void *addr, size_t size) +void *kasan_find_first_bad_addr(void *addr, size_t size) { u8 tag = get_tag(addr); void *p = kasan_reset_tag(addr); @@ -83,12 +83,12 @@ void *find_first_bad_addr(void *addr, size_t size) return p; } -void metadata_fetch_row(char *buffer, void *row) +void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } -void print_tags(u8 addr_tag, const void *addr) +void kasan_print_tags(u8 addr_tag, const void *addr) { u8 *shadow = (u8 *)kasan_mem_to_shadow(addr); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 7c2c08c55f32..80adc85d0393 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -27,20 +27,20 @@ bool __kasan_check_read(const volatile void *p, unsigned int size) { - return check_memory_region((unsigned long)p, size, false, _RET_IP_); + return kasan_check_range((unsigned long)p, size, false, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_read); bool __kasan_check_write(const volatile void *p, unsigned int size) { - return check_memory_region((unsigned long)p, size, true, _RET_IP_); + return kasan_check_range((unsigned long)p, size, true, _RET_IP_); } EXPORT_SYMBOL(__kasan_check_write); #undef memset void *memset(void *addr, int c, size_t len) { - if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_)) + if (!kasan_check_range((unsigned long)addr, len, true, _RET_IP_)) return NULL; return __memset(addr, c, len); @@ -50,8 +50,8 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || - !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || + !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) return 
NULL; return __memmove(dest, src, len); @@ -61,8 +61,8 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) || - !check_memory_region((unsigned long)dest, len, true, _RET_IP_)) + if (!kasan_check_range((unsigned long)src, len, false, _RET_IP_) || + !kasan_check_range((unsigned long)dest, len, true, _RET_IP_)) return NULL; return __memcpy(dest, src, len); @@ -72,7 +72,7 @@ void *memcpy(void *dest, const void *src, size_t len) * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_GRANULE_SIZE. */ -void poison_range(const void *address, size_t size, u8 value) +void kasan_poison(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -89,8 +89,9 @@ void poison_range(const void *address, size_t size, u8 value) __memset(shadow_start, value, shadow_end - shadow_start); } +EXPORT_SYMBOL(kasan_poison); -void unpoison_range(const void *address, size_t size) +void kasan_unpoison(const void *address, size_t size) { u8 tag = get_tag(address); @@ -101,7 +102,7 @@ void unpoison_range(const void *address, size_t size) */ address = kasan_reset_tag(address); - poison_range(address, size, tag); + kasan_poison(address, size, tag); if (size & KASAN_GRANULE_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); @@ -286,7 +287,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) * // vmalloc() allocates memory * // let a = area->addr * // we reach kasan_populate_vmalloc - * // and call unpoison_range: + * // and call kasan_unpoison: * STORE shadow(a), unpoison_val * ... 
* STORE shadow(a+99), unpoison_val x = LOAD p @@ -321,7 +322,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_GRANULE_SIZE); - poison_range(start, size, KASAN_VMALLOC_INVALID); + kasan_poison(start, size, KASAN_VMALLOC_INVALID); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -329,7 +330,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - unpoison_range(start, size); + kasan_unpoison(start, size); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 5dcd830805b2..94c2d33be333 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -57,7 +57,7 @@ void __init kasan_init_sw_tags(void) * sequence has in fact positive effect, since interrupts that randomly skew * PRNG at unpredictable points do only good. */ -u8 random_tag(void) +u8 kasan_random_tag(void) { u32 state = this_cpu_read(prng_state); @@ -67,7 +67,7 @@ u8 random_tag(void) return (u8)(state % (KASAN_TAG_MAX + 1)); } -bool check_memory_region(unsigned long addr, size_t size, bool write, +bool kasan_check_range(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { u8 tag; @@ -118,24 +118,24 @@ bool check_memory_region(unsigned long addr, size_t size, bool write, return true; } -bool check_invalid_free(void *addr) +bool kasan_byte_accessible(const void *addr) { u8 tag = get_tag(addr); u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr))); - return (shadow_byte == KASAN_TAG_INVALID) || - (tag != KASAN_TAG_KERNEL && tag != shadow_byte); + return (shadow_byte != KASAN_TAG_INVALID) && + (tag == KASAN_TAG_KERNEL || tag == shadow_byte); } #define DEFINE_HWASAN_LOAD_STORE(size) \ void __hwasan_load##size##_noabort(unsigned long addr) \ { \ - check_memory_region(addr, size, false, _RET_IP_); \ + kasan_check_range(addr, size, false, _RET_IP_); \ } \ 
EXPORT_SYMBOL(__hwasan_load##size##_noabort); \ void __hwasan_store##size##_noabort(unsigned long addr) \ { \ - check_memory_region(addr, size, true, _RET_IP_); \ + kasan_check_range(addr, size, true, _RET_IP_); \ } \ EXPORT_SYMBOL(__hwasan_store##size##_noabort) @@ -147,19 +147,19 @@ DEFINE_HWASAN_LOAD_STORE(16); void __hwasan_loadN_noabort(unsigned long addr, unsigned long size) { - check_memory_region(addr, size, false, _RET_IP_); + kasan_check_range(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__hwasan_loadN_noabort); void __hwasan_storeN_noabort(unsigned long addr, unsigned long size) { - check_memory_region(addr, size, true, _RET_IP_); + kasan_check_range(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - poison_range((void *)addr, size, tag); + kasan_poison((void *)addr, size, tag); } EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 67ab391a5373..75e246f680f4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -991,38 +991,41 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, static bool __collapse_huge_page_swapin(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, + unsigned long haddr, pmd_t *pmd, int referenced) { int swapped_in = 0; vm_fault_t ret = 0; - struct vm_fault vmf = { - .vma = vma, - .address = address, - .flags = FAULT_FLAG_ALLOW_RETRY, - .pmd = pmd, - .pgoff = linear_page_index(vma, address), - }; - - vmf.pte = pte_offset_map(pmd, address); - for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; - vmf.pte++, vmf.address += PAGE_SIZE) { + unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE); + + for (address = haddr; address < end; address += PAGE_SIZE) { + struct vm_fault vmf = { + .vma = vma, + .address = address, + .pgoff = linear_page_index(vma, haddr), + .flags = FAULT_FLAG_ALLOW_RETRY, + .pmd = pmd, + }; + + vmf.pte = 
pte_offset_map(pmd, address); vmf.orig_pte = *vmf.pte; - if (!is_swap_pte(vmf.orig_pte)) + if (!is_swap_pte(vmf.orig_pte)) { + pte_unmap(vmf.pte); continue; + } swapped_in++; ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */ if (ret & VM_FAULT_RETRY) { mmap_read_lock(mm); - if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { + if (hugepage_vma_revalidate(mm, haddr, &vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) { + if (mm_find_pmd(mm, haddr) != pmd) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } @@ -1031,11 +1034,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - /* pte is unmapped now, we need to map it */ - vmf.pte = pte_offset_map(pmd, vmf.address); } - vmf.pte--; - pte_unmap(vmf.pte); /* Drain LRU add pagevec to remove extra pin on the swapped in pages */ if (swapped_in) @@ -1644,6 +1643,7 @@ static void collapse_file(struct mm_struct *mm, XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); + int nr; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1855,11 +1855,12 @@ out_unlock: put_page(page); goto xa_unlocked; } + nr = thp_nr_pages(new_page); if (is_shmem) - __inc_lruvec_page_state(new_page, NR_SHMEM_THPS); + __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); else { - __inc_lruvec_page_state(new_page, NR_FILE_THPS); + __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); } diff --git a/mm/list_lru.c b/mm/list_lru.c index fe230081690b..6f067b6b935f 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -373,21 +373,13 @@ static void 
memcg_destroy_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; /* * This is called when shrinker has already been unregistered, - * and nobody can use it. So, there is no need to use kvfree_rcu_local(). + * and nobody can use it. So, there is no need to use kvfree_rcu(). */ memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); kvfree(memcg_lrus); } -static void kvfree_rcu_local(struct rcu_head *head) -{ - struct list_lru_memcg *mlru; - - mlru = container_of(head, struct list_lru_memcg, rcu); - kvfree(mlru); -} - static int memcg_update_list_lru_node(struct list_lru_node *nlru, int old_size, int new_size) { @@ -419,7 +411,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - call_rcu(&old->rcu, kvfree_rcu_local); + kvfree_rcu(old, rcu); return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index 6a660858784b..df692d2e35d4 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -506,9 +506,9 @@ static long madvise_cold(struct vm_area_struct *vma, return -EINVAL; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + tlb_gather_mmu(&tlb, mm); madvise_cold_page_range(&tlb, vma, start_addr, end_addr); - tlb_finish_mmu(&tlb, start_addr, end_addr); + tlb_finish_mmu(&tlb); return 0; } @@ -539,8 +539,9 @@ static inline bool can_do_pageout(struct vm_area_struct *vma) * otherwise we'd be including shared non-exclusive mappings, which * opens a side channel. 
*/ - return inode_owner_or_capable(file_inode(vma->vm_file)) || - inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; + return inode_owner_or_capable(&init_user_ns, + file_inode(vma->vm_file)) || + file_permission(vma->vm_file, MAY_WRITE) == 0; } static long madvise_pageout(struct vm_area_struct *vma, @@ -558,9 +559,9 @@ static long madvise_pageout(struct vm_area_struct *vma, return 0; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start_addr, end_addr); + tlb_gather_mmu(&tlb, mm); madvise_pageout_page_range(&tlb, vma, start_addr, end_addr); - tlb_finish_mmu(&tlb, start_addr, end_addr); + tlb_finish_mmu(&tlb); return 0; } @@ -723,7 +724,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.start, range.end); lru_add_drain(); - tlb_gather_mmu(&tlb, mm, range.start, range.end); + tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(&range); @@ -732,7 +733,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, &madvise_free_walk_ops, &tlb); tlb_end_vma(&tlb, vma); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb, range.start, range.end); + tlb_finish_mmu(&tlb); return 0; } diff --git a/mm/memblock.c b/mm/memblock.c index 1eaaec1e7687..afaefa8fc6ab 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -275,14 +275,6 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * * Find @size free area aligned to @align in the specified range and node. * - * When allocation direction is bottom-up, the @start should be greater - * than the end of the kernel image. Otherwise, it will be trimmed. The - * reason is that we want the bottom-up allocation just near the kernel - * image so it is highly likely that the allocated memory and the kernel - * will reside in the same node. - * - * If bottom-up allocation failed, will try to allocate memory top-down. - * * Return: * Found address on success, 0 on failure. 
*/ @@ -291,8 +283,6 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, phys_addr_t end, int nid, enum memblock_flags flags) { - phys_addr_t kernel_end, ret; - /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE || end == MEMBLOCK_ALLOC_KASAN) @@ -301,40 +291,13 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size, /* avoid allocating the first page */ start = max_t(phys_addr_t, start, PAGE_SIZE); end = max(start, end); - kernel_end = __pa_symbol(_end); - - /* - * try bottom-up allocation only when bottom-up mode - * is set and @end is above the kernel image. - */ - if (memblock_bottom_up() && end > kernel_end) { - phys_addr_t bottom_up_start; - - /* make sure we will allocate above the kernel */ - bottom_up_start = max(start, kernel_end); - - /* ok, try bottom-up allocation first */ - ret = __memblock_find_range_bottom_up(bottom_up_start, end, - size, align, nid, flags); - if (ret) - return ret; - /* - * we always limit bottom-up allocation above the kernel, - * but top-down allocation doesn't have the limit, so - * retrying top-down allocation may succeed when bottom-up - * allocation failed. - * - * bottom-up allocation is expected to be fail very rarely, - * so we use WARN_ONCE() here to see the stack trace if - * fail happens. - */ - WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE), - "memblock: bottom-up allocation failed, memory hotremove may be affected\n"); - } - - return __memblock_find_range_top_down(start, end, size, align, nid, - flags); + if (memblock_bottom_up()) + return __memblock_find_range_bottom_up(start, end, size, align, + nid, flags); + else + return __memblock_find_range_top_down(start, end, size, align, + nid, flags); } /** @@ -2087,10 +2050,8 @@ void __init reset_all_zones_managed_pages(void) /** * memblock_free_all - release free pages to the buddy allocator - * - * Return: the number of pages actually released. 
*/ -unsigned long __init memblock_free_all(void) +void __init memblock_free_all(void) { unsigned long pages; @@ -2099,8 +2060,6 @@ unsigned long __init memblock_free_all(void) pages = free_low_memory_core_early(); totalram_pages_add(pages); - - return pages; } #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2de77b5bcc2..845eec01ef9d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -255,6 +255,11 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) #ifdef CONFIG_MEMCG_KMEM extern spinlock_t css_set_lock; +static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, + unsigned int nr_pages); +static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, + unsigned int nr_pages); + static void obj_cgroup_release(struct percpu_ref *ref) { struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt); @@ -447,8 +452,7 @@ static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) for_each_node(nid) { pn = mem_cgroup_nodeinfo(memcg, nid); map = rcu_dereference_protected(pn->shrinker_map, true); - if (map) - kvfree(map); + kvfree(map); rcu_assign_pointer(pn->shrinker_map, NULL); } } @@ -1043,29 +1047,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) } EXPORT_SYMBOL(get_mem_cgroup_from_mm); -/** - * get_mem_cgroup_from_page: Obtain a reference on given page's memcg. - * @page: page from which memcg should be extracted. - * - * Obtain a reference on page->memcg and returns it if successful. Otherwise - * root_mem_cgroup is returned. - */ -struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) -{ - struct mem_cgroup *memcg = page_memcg(page); - - if (mem_cgroup_disabled()) - return NULL; - - rcu_read_lock(); - /* Page should not get uncharged and freed memcg under us. 
*/ - if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - rcu_read_unlock(); - return memcg; -} -EXPORT_SYMBOL(get_mem_cgroup_from_page); - static __always_inline struct mem_cgroup *active_memcg(void) { if (in_interrupt()) @@ -1080,13 +1061,9 @@ static __always_inline struct mem_cgroup *get_active_memcg(void) rcu_read_lock(); memcg = active_memcg(); - if (memcg) { - /* current->active_memcg must hold a ref. */ - if (WARN_ON_ONCE(!css_tryget(&memcg->css))) - memcg = root_mem_cgroup; - else - memcg = current->active_memcg; - } + /* remote memcg must hold a ref. */ + if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css))) + memcg = root_mem_cgroup; rcu_read_unlock(); return memcg; @@ -1346,20 +1323,19 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page) * lock_page_lruvec - lock and return lruvec for a given page. * @page: the page * - * This series functions should be used in either conditions: - * PageLRU is cleared or unset - * or page->_refcount is zero - * or page is locked. 
+ * These functions are safe to use under any of the following conditions: + * - page locked + * - PageLRU cleared + * - lock_page_memcg() + * - page->_refcount is zero */ struct lruvec *lock_page_lruvec(struct page *page) { struct lruvec *lruvec; struct pglist_data *pgdat = page_pgdat(page); - rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); spin_lock(&lruvec->lru_lock); - rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); @@ -1371,10 +1347,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page) struct lruvec *lruvec; struct pglist_data *pgdat = page_pgdat(page); - rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); spin_lock_irq(&lruvec->lru_lock); - rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); @@ -1386,10 +1360,8 @@ struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags) struct lruvec *lruvec; struct pglist_data *pgdat = page_pgdat(page); - rcu_read_lock(); lruvec = mem_cgroup_page_lruvec(page, pgdat); spin_lock_irqsave(&lruvec->lru_lock, *flags); - rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); @@ -1512,72 +1484,73 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) struct memory_stat { const char *name; - unsigned int ratio; unsigned int idx; }; -static struct memory_stat memory_stats[] = { - { "anon", PAGE_SIZE, NR_ANON_MAPPED }, - { "file", PAGE_SIZE, NR_FILE_PAGES }, - { "kernel_stack", 1024, NR_KERNEL_STACK_KB }, - { "pagetables", PAGE_SIZE, NR_PAGETABLE }, - { "percpu", 1, MEMCG_PERCPU_B }, - { "sock", PAGE_SIZE, MEMCG_SOCK }, - { "shmem", PAGE_SIZE, NR_SHMEM }, - { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED }, - { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY }, - { "file_writeback", PAGE_SIZE, NR_WRITEBACK }, +static const struct memory_stat memory_stats[] = { + { "anon", NR_ANON_MAPPED }, + { "file", NR_FILE_PAGES }, + { "kernel_stack", NR_KERNEL_STACK_KB }, + { "pagetables", NR_PAGETABLE }, + { "percpu", MEMCG_PERCPU_B }, + { "sock", MEMCG_SOCK }, + { "shmem", NR_SHMEM }, + { 
"file_mapped", NR_FILE_MAPPED }, + { "file_dirty", NR_FILE_DIRTY }, + { "file_writeback", NR_WRITEBACK }, +#ifdef CONFIG_SWAP + { "swapcached", NR_SWAPCACHE }, +#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE - /* - * The ratio will be initialized in memory_stats_init(). Because - * on some architectures, the macro of HPAGE_PMD_SIZE is not - * constant(e.g. powerpc). - */ - { "anon_thp", 0, NR_ANON_THPS }, - { "file_thp", 0, NR_FILE_THPS }, - { "shmem_thp", 0, NR_SHMEM_THPS }, + { "anon_thp", NR_ANON_THPS }, + { "file_thp", NR_FILE_THPS }, + { "shmem_thp", NR_SHMEM_THPS }, #endif - { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON }, - { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON }, - { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE }, - { "active_file", PAGE_SIZE, NR_ACTIVE_FILE }, - { "unevictable", PAGE_SIZE, NR_UNEVICTABLE }, - - /* - * Note: The slab_reclaimable and slab_unreclaimable must be - * together and slab_reclaimable must be in front. - */ - { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B }, - { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B }, + { "inactive_anon", NR_INACTIVE_ANON }, + { "active_anon", NR_ACTIVE_ANON }, + { "inactive_file", NR_INACTIVE_FILE }, + { "active_file", NR_ACTIVE_FILE }, + { "unevictable", NR_UNEVICTABLE }, + { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B }, + { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B }, /* The memory events */ - { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON }, - { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE }, - { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON }, - { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE }, - { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON }, - { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE }, - { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM }, + { "workingset_refault_anon", WORKINGSET_REFAULT_ANON }, + { "workingset_refault_file", WORKINGSET_REFAULT_FILE }, + { "workingset_activate_anon", WORKINGSET_ACTIVATE_ANON }, + { 
"workingset_activate_file", WORKINGSET_ACTIVATE_FILE }, + { "workingset_restore_anon", WORKINGSET_RESTORE_ANON }, + { "workingset_restore_file", WORKINGSET_RESTORE_FILE }, + { "workingset_nodereclaim", WORKINGSET_NODERECLAIM }, }; -static int __init memory_stats_init(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memory_stats[i].idx == NR_ANON_THPS || - memory_stats[i].idx == NR_FILE_THPS || - memory_stats[i].idx == NR_SHMEM_THPS) - memory_stats[i].ratio = HPAGE_PMD_SIZE; -#endif - VM_BUG_ON(!memory_stats[i].ratio); - VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT); +/* Translate stat items to the correct unit for memory.stat output */ +static int memcg_page_state_unit(int item) +{ + switch (item) { + case MEMCG_PERCPU_B: + case NR_SLAB_RECLAIMABLE_B: + case NR_SLAB_UNRECLAIMABLE_B: + case WORKINGSET_REFAULT_ANON: + case WORKINGSET_REFAULT_FILE: + case WORKINGSET_ACTIVATE_ANON: + case WORKINGSET_ACTIVATE_FILE: + case WORKINGSET_RESTORE_ANON: + case WORKINGSET_RESTORE_FILE: + case WORKINGSET_NODERECLAIM: + return 1; + case NR_KERNEL_STACK_KB: + return SZ_1K; + default: + return PAGE_SIZE; } +} - return 0; +static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, + int item) +{ + return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -pure_initcall(memory_stats_init); static char *memory_stat_format(struct mem_cgroup *memcg) { @@ -1602,13 +1575,12 @@ static char *memory_stat_format(struct mem_cgroup *memcg) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; - size = memcg_page_state(memcg, memory_stats[i].idx); - size *= memory_stats[i].ratio; + size = memcg_page_state_output(memcg, memory_stats[i].idx); seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size); if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) { - size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) + - memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B); + size += 
memcg_page_state_output(memcg, + NR_SLAB_RECLAIMABLE_B); seq_buf_printf(&s, "slab %llu\n", size); } } @@ -2935,9 +2907,10 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp) + gfp_t gfp, bool new_page) { unsigned int objects = objs_per_slab_page(s, page); + unsigned long memcg_data; void *vec; vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, @@ -2945,11 +2918,25 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (!set_page_objcgs(page, vec)) + memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; + if (new_page) { + /* + * If the slab page is brand new and nobody can yet access + * it's memcg_data, no synchronization is required and + * memcg_data can be simply assigned. + */ + page->memcg_data = memcg_data; + } else if (cmpxchg(&page->memcg_data, 0, memcg_data)) { + /* + * If the slab page is already in use, somebody can allocate + * and assign obj_cgroups in parallel. In this case the existing + * objcg vector should be reused. + */ kfree(vec); - else - kmemleak_not_leak(vec); + return 0; + } + kmemleak_not_leak(vec); return 0; } @@ -3077,8 +3064,8 @@ static void memcg_free_cache_id(int id) * * Returns 0 on success, an error code on failure. 
*/ -int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, - unsigned int nr_pages) +static int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, + unsigned int nr_pages) { struct page_counter *counter; int ret; @@ -3110,7 +3097,7 @@ int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, * @memcg: memcg to uncharge * @nr_pages: number of pages to uncharge */ -void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->kmem, nr_pages); @@ -4072,10 +4059,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memcg1_stats[i] == NR_ANON_THPS) - nr *= HPAGE_PMD_NR; -#endif seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); } @@ -4106,10 +4089,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (memcg1_stats[i] == NR_ANON_THPS) - nr *= HPAGE_PMD_NR; -#endif seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], (u64)nr * PAGE_SIZE); } @@ -4897,7 +4876,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? 
*/ - ret = inode_permission(file_inode(cfile.file), MAY_READ); + ret = file_permission(cfile.file, MAY_READ); if (ret < 0) goto out_put_cfile; @@ -5193,7 +5172,7 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) return 1; } - pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat, + pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat, GFP_KERNEL_ACCOUNT); if (!pn->lruvec_stat_cpu) { free_percpu(pn->lruvec_stat_local); @@ -5642,7 +5621,6 @@ static int mem_cgroup_move_account(struct page *page, __mod_lruvec_state(to_vec, NR_ANON_THPS, nr_pages); } - } } else { __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); @@ -6271,6 +6249,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, if (err) return err; + page_counter_set_high(&memcg->memory, high); + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); unsigned long reclaimed; @@ -6294,10 +6274,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, break; } - page_counter_set_high(&memcg->memory, high); - memcg_wb_domain_size_changed(memcg); - return nbytes; } @@ -6394,6 +6371,12 @@ static int memory_stat_show(struct seq_file *m, void *v) } #ifdef CONFIG_NUMA +static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec, + int item) +{ + return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item); +} + static int memory_numa_stat_show(struct seq_file *m, void *v) { int i; @@ -6411,8 +6394,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v) struct lruvec *lruvec; lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid)); - size = lruvec_page_state(lruvec, memory_stats[i].idx); - size *= memory_stats[i].ratio; + size = lruvec_page_state_output(lruvec, + memory_stats[i].idx); seq_printf(m, " N%d=%llu", nid, size); } seq_putc(m, '\n'); @@ -6761,7 +6744,19 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) memcg_check_events(memcg, page); local_irq_enable(); - if 
(PageSwapCache(page)) { + /* + * Cgroup1's unified memory+swap counter has been charged with the + * new swapcache page, finish the transfer by uncharging the swap + * slot. The swap slot would also get uncharged when it dies, but + * it can stick around indefinitely and we'd count the page twice + * the entire time. + * + * Cgroup2 has separate resource counters for memory and swap, + * so this is a non-issue here. Memory and swap charge lifetimes + * correspond 1:1 to page and swap slot lifetimes: we charge the + * page to memory here, and uncharge swap when the slot is freed. + */ + if (do_memsw_account() && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; /* * The swap entry might not get freed for a long time, @@ -6852,31 +6847,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) css_put(&ug->memcg->css); } -static void uncharge_list(struct list_head *page_list) -{ - struct uncharge_gather ug; - struct list_head *next; - - uncharge_gather_clear(&ug); - - /* - * Note that the list can be a single page->lru; hence the - * do-while loop instead of a simple list_for_each_entry(). 
- */ - next = page_list->next; - do { - struct page *page; - - page = list_entry(next, struct page, lru); - next = page->lru.next; - - uncharge_page(page, &ug); - } while (next != page_list); - - if (ug.memcg) - uncharge_batch(&ug); -} - /** * mem_cgroup_uncharge - uncharge a page * @page: page to uncharge @@ -6908,11 +6878,17 @@ void mem_cgroup_uncharge(struct page *page) */ void mem_cgroup_uncharge_list(struct list_head *page_list) { + struct uncharge_gather ug; + struct page *page; + if (mem_cgroup_disabled()) return; - if (!list_empty(page_list)) - uncharge_list(page_list); + uncharge_gather_clear(&ug); + list_for_each_entry(page, page_list, lru) + uncharge_page(page, &ug); + if (ug.memcg) + uncharge_batch(&ug); } /** @@ -7079,6 +7055,14 @@ static int __init mem_cgroup_init(void) { int cpu, node; + /* + * Currently s32 type (can refer to struct batched_lruvec_stat) is + * used for per-memcg-per-cpu caching of per-node statistics. In order + * to work fine, we should make sure that the overfill threshold can't + * exceed S32_MAX / PAGE_SIZE. + */ + BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE); + cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e9481632fcd1..55c671904aac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -243,9 +243,13 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) pfn, t->comm, t->pid); if (flags & MF_ACTION_REQUIRED) { - WARN_ON_ONCE(t != current); - ret = force_sig_mceerr(BUS_MCEERR_AR, + if (t == current) + ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr, addr_lsb); + else + /* Signal other processes sharing the page if they have PF_MCE_EARLY set. 
*/ + ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr, + addr_lsb, t); } else { /* * Don't use force here, it's convenient if the signal @@ -440,26 +444,26 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk) * Determine whether a given process is "early kill" process which expects * to be signaled when some page under the process is hwpoisoned. * Return task_struct of the dedicated thread (main thread unless explicitly - * specified) if the process is "early kill," and otherwise returns NULL. + * specified) if the process is "early kill" and otherwise returns NULL. * - * Note that the above is true for Action Optional case, but not for Action - * Required case where SIGBUS should sent only to the current thread. + * Note that the above is true for Action Optional case. For Action Required + * case, it's only meaningful to the current thread which need to be signaled + * with SIGBUS, this error is Action Optional for other non current + * processes sharing the same error page,if the process is "early kill", the + * task_struct of the dedicated thread will also be returned. */ static struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) { if (!tsk->mm) return NULL; - if (force_early) { - /* - * Comparing ->mm here because current task might represent - * a subthread, while tsk always points to the main thread. - */ - if (tsk->mm == current->mm) - return current; - else - return NULL; - } + /* + * Comparing ->mm here because current task might represent + * a subthread, while tsk always points to the main thread. 
+ */ + if (force_early && tsk->mm == current->mm) + return current; + return find_early_kill_thread(tsk); } diff --git a/mm/memory.c b/mm/memory.c index feff48e1465a..784249f3307b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -134,6 +134,18 @@ static inline bool arch_faults_on_old_pte(void) } #endif +#ifndef arch_wants_old_prefaulted_pte +static inline bool arch_wants_old_prefaulted_pte(void) +{ + /* + * Transitioning a PTE from 'old' to 'young' can be expensive on + * some architectures, even if it's performed in hardware. By + * default, "false" means prefaulted entries will be 'young'. + */ + return false; +} +#endif + static int __init disable_randmaps(char *s) { randomize_va_space = 0; @@ -1534,13 +1546,13 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, start, start + size); - tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); + tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) unmap_single_vma(&tlb, vma, start, range.end, NULL); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb, start, range.end); + tlb_finish_mmu(&tlb); } /** @@ -1561,12 +1573,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, address + size); - tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); + tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); unmap_single_vma(&tlb, vma, address, range.end, details); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb, address, range.end); + tlb_finish_mmu(&tlb); } /** @@ -2165,11 +2177,11 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long 
pfn, pgprot_t prot) { - pte_t *pte; + pte_t *pte, *mapped_pte; spinlock_t *ptl; int err = 0; - pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); + mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; arch_enter_lazy_mmu_mode(); @@ -2183,7 +2195,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - pte_unmap_unlock(pte - 1, ptl); + pte_unmap_unlock(mapped_pte, ptl); return err; } @@ -2382,18 +2394,18 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { - pte_t *pte; + pte_t *pte, *mapped_pte; int err = 0; spinlock_t *ptl; if (create) { - pte = (mm == &init_mm) ? + mapped_pte = pte = (mm == &init_mm) ? pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; } else { - pte = (mm == &init_mm) ? + mapped_pte = pte = (mm == &init_mm) ? pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); } @@ -2416,7 +2428,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, arch_leave_lazy_mmu_mode(); if (mm != &init_mm) - pte_unmap_unlock(pte-1, ptl); + pte_unmap_unlock(mapped_pte, ptl); return err; } @@ -3503,7 +3515,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; - /* See the comment in pte_alloc_one_map() */ + /* See comment in handle_pte_fault() */ if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; @@ -3643,66 +3655,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; } -/* - * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. - * If we check pmd_trans_unstable() first we will trip the bad_pmd() check - * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. 
- */ -static int pmd_devmap_trans_unstable(pmd_t *pmd) -{ - return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); -} - -static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) -{ - struct vm_area_struct *vma = vmf->vma; - - if (!pmd_none(*vmf->pmd)) - goto map_pte; - if (vmf->prealloc_pte) { - vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); - if (unlikely(!pmd_none(*vmf->pmd))) { - spin_unlock(vmf->ptl); - goto map_pte; - } - - mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); - spin_unlock(vmf->ptl); - vmf->prealloc_pte = NULL; - } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { - return VM_FAULT_OOM; - } -map_pte: - /* - * If a huge pmd materialized under us just retry later. Use - * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of - * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge - * under us and then back to pmd_none, as a result of MADV_DONTNEED - * running immediately after a huge pmd fault in a different thread of - * this mm, in turn leading to a misleading pmd_trans_huge() retval. - * All we have to ensure is that it is a regular pmd that we can walk - * with pte_offset_map() and we can do that through an atomic read in - * C, which is what pmd_trans_unstable() provides. - */ - if (pmd_devmap_trans_unstable(vmf->pmd)) - return VM_FAULT_NOPAGE; - - /* - * At this point we know that our vmf->pmd points to a page of ptes - * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() - * for the duration of the fault. If a racing MADV_DONTNEED runs and - * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still - * be valid and we will re-check to make sure the vmf->pte isn't - * pte_none() under vmf->ptl protection when we return to - * alloc_set_pte(). 
- */ - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); - return 0; -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { @@ -3717,7 +3669,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) vmf->prealloc_pte = NULL; } -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -3775,76 +3727,41 @@ out: return ret; } #else -static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) +vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { - BUILD_BUG(); - return 0; + return VM_FAULT_FALLBACK; } #endif -/** - * alloc_set_pte - setup new PTE entry for given page and add reverse page - * mapping. If needed, the function allocates page table or use pre-allocated. - * - * @vmf: fault environment - * @page: page to map - * - * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on - * return. - * - * Target users are page handler itself and implementations of - * vm_ops->map_pages. - * - * Return: %0 on success, %VM_FAULT_ code in case of error. 
- */ -vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) +void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool prefault = vmf->address != addr; pte_t entry; - vm_fault_t ret; - - if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { - ret = do_set_pmd(vmf, page); - if (ret != VM_FAULT_FALLBACK) - return ret; - } - - if (!vmf->pte) { - ret = pte_alloc_one_map(vmf); - if (ret) - return ret; - } - - /* Re-check under ptl */ - if (unlikely(!pte_none(*vmf->pte))) { - update_mmu_tlb(vma, vmf->address, vmf->pte); - return VM_FAULT_NOPAGE; - } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); - entry = pte_sw_mkyoung(entry); + + if (prefault && arch_wants_old_prefaulted_pte()) + entry = pte_mkold(entry); + else + entry = pte_sw_mkyoung(entry); + if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, vmf->address, false); + page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); - - return 0; + set_pte_at(vma->vm_mm, addr, vmf->pte, entry); } - /** * finish_fault - finish page fault once we have prepared the page to fault * @@ -3862,12 +3779,12 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) */ vm_fault_t finish_fault(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; struct page *page; - vm_fault_t ret = 0; + vm_fault_t ret; /* Did we COW the page? 
*/ - if ((vmf->flags & FAULT_FLAG_WRITE) && - !(vmf->vma->vm_flags & VM_SHARED)) + if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) page = vmf->cow_page; else page = vmf->page; @@ -3876,12 +3793,38 @@ vm_fault_t finish_fault(struct vm_fault *vmf) * check even for read faults because we might have lost our CoWed * page */ - if (!(vmf->vma->vm_flags & VM_SHARED)) - ret = check_stable_address_space(vmf->vma->vm_mm); - if (!ret) - ret = alloc_set_pte(vmf, page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + if (!(vma->vm_flags & VM_SHARED)) { + ret = check_stable_address_space(vma->vm_mm); + if (ret) + return ret; + } + + if (pmd_none(*vmf->pmd)) { + if (PageTransCompound(page)) { + ret = do_set_pmd(vmf, page); + if (ret != VM_FAULT_FALLBACK) + return ret; + } + + if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) + return VM_FAULT_OOM; + } + + /* See comment in handle_pte_fault() */ + if (pmd_devmap_trans_unstable(vmf->pmd)) + return 0; + + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + ret = 0; + /* Re-check under ptl */ + if (likely(pte_none(*vmf->pte))) + do_set_pte(vmf, page, vmf->address); + else + ret = VM_FAULT_NOPAGE; + + update_mmu_tlb(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } @@ -3951,13 +3894,12 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off; - vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - vmf->address = max(address & mask, vmf->vma->vm_start); - off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + address = max(address & mask, vmf->vma->vm_start); + off = ((vmf->address - address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3965,7 +3907,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) * the vma or nr_pages from start_pgoff, depending what is nearest. 
*/ end_pgoff = start_pgoff - - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); @@ -3973,31 +3915,11 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf) if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) - goto out; + return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); - - /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*vmf->pmd)) { - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!vmf->pte) - goto out; - - /* check if the page fault is solved */ - vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*vmf->pte)) - ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(vmf->pte, vmf->ptl); -out: - vmf->address = address; - vmf->pte = NULL; - return ret; + return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); } static vm_fault_t do_read_fault(struct vm_fault *vmf) @@ -4353,7 +4275,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) */ vmf->pte = NULL; } else { - /* See comment in pte_alloc_one_map() */ + /* + * If a huge pmd materialized under us just retry later. Use + * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead + * of pmd_trans_huge() to ensure the pmd didn't become + * pmd_trans_huge under us and then back to pmd_none, as a + * result of MADV_DONTNEED running immediately after a huge pmd + * fault in a different thread of this mm, in turn leading to a + * misleading pmd_trans_huge() retval. All we have to ensure is + * that it is a regular pmd that we can walk with + * pte_offset_map() and we can do that through an atomic read + * in C, which is what pmd_trans_unstable() provides. 
+ */ if (pmd_devmap_trans_unstable(vmf->pmd)) return 0; /* @@ -4709,9 +4642,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } #endif /* __PAGETABLE_PMD_FOLDED */ -int follow_pte(struct mm_struct *mm, unsigned long address, - struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, - spinlock_t **ptlp) +int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, + struct mmu_notifier_range *range, pte_t **ptepp, + pmd_t **pmdpp, spinlock_t **ptlp) { pgd_t *pgd; p4d_t *p4d; @@ -4777,6 +4710,34 @@ out: } /** + * follow_pte - look up PTE at a user virtual address + * @mm: the mm_struct of the target address space + * @address: user virtual address + * @ptepp: location to store found PTE + * @ptlp: location to store the lock for the PTE + * + * On a successful return, the pointer to the PTE is stored in @ptepp; + * the corresponding lock is taken and its location is stored in @ptlp. + * The contents of the PTE are only stable until @ptlp is released; + * any further use, if any, must be protected against invalidation + * with MMU notifiers. + * + * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore + * should be taken for read. + * + * KVM uses this function. While it is arguably less bad than ``follow_pfn``, + * it is not a good general-purpose API. + * + * Return: zero on success, -ve otherwise. + */ +int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp); +} +EXPORT_SYMBOL_GPL(follow_pte); + +/** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping * @address: user virtual address @@ -4784,6 +4745,9 @@ out: * * Only IO mappings and raw PFN mappings are allowed. * + * This function does not allow the caller to read the permissions + * of the PTE. Do not use it. + * * Return: zero and the pfn at @pfn on success, -ve otherwise. 
*/ int follow_pfn(struct vm_area_struct *vma, unsigned long address, @@ -4796,7 +4760,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) return ret; - ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl); + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); if (ret) return ret; *pfn = pte_pfn(*ptep); @@ -4817,7 +4781,7 @@ int follow_phys(struct vm_area_struct *vma, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) goto out; - if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl)) + if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) goto out; pte = *ptep; @@ -4834,28 +4798,68 @@ out: return ret; } +/** + * generic_access_phys - generic implementation for iomem mmap access + * @vma: the vma to access + * @addr: userspace addres, not relative offset within @vma + * @buf: buffer to read/write + * @len: length of transfer + * @write: set to FOLL_WRITE when writing, otherwise reading + * + * This is a generic implementation for &vm_operations_struct.access for an + * iomem mapping. This callback is used by access_process_vm() when the @vma is + * not page based. 
+ */ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; unsigned long prot = 0; void __iomem *maddr; - int offset = addr & (PAGE_SIZE-1); + pte_t *ptep, pte; + spinlock_t *ptl; + int offset = offset_in_page(addr); + int ret = -EINVAL; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + +retry: + if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + return -EINVAL; + pte = *ptep; + pte_unmap_unlock(ptep, ptl); + + prot = pgprot_val(pte_pgprot(pte)); + phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; - if (follow_phys(vma, addr, write, &prot, &phys_addr)) + if ((write & FOLL_WRITE) && !pte_write(pte)) return -EINVAL; maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); if (!maddr) return -ENOMEM; + if (follow_pte(vma->vm_mm, addr, &ptep, &ptl)) + goto out_unmap; + + if (!pte_same(pte, *ptep)) { + pte_unmap_unlock(ptep, ptl); + iounmap(maddr); + + goto retry; + } + if (write) memcpy_toio(maddr + offset, buf, len); else memcpy_fromio(buf, maddr + offset, len); + ret = len; + pte_unmap_unlock(ptep, ptl); +out_unmap: iounmap(maddr); - return len; + return ret; } EXPORT_SYMBOL_GPL(generic_access_phys); #endif @@ -5173,17 +5177,19 @@ long copy_huge_page_from_user(struct page *dst_page, void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; + struct page *subpage = dst_page; - for (i = 0; i < pages_per_huge_page; i++) { + for (i = 0; i < pages_per_huge_page; + i++, subpage = mem_map_next(subpage, dst_page, i)) { if (allow_pagefault) - page_kaddr = kmap(dst_page + i); + page_kaddr = kmap(subpage); else - page_kaddr = kmap_atomic(dst_page + i); + page_kaddr = kmap_atomic(subpage); rc = copy_from_user(page_kaddr, (const void __user *)(src + i * PAGE_SIZE), PAGE_SIZE); if (allow_pagefault) - kunmap(dst_page + i); + kunmap(subpage); else kunmap_atomic(page_kaddr); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 
f9d57b9be8c7..abe43c1ae920 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -713,7 +713,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0, + memmap_init_range(nr_pages, nid, zone_idx(zone), start_pfn, 0, MEMINIT_HOTPLUG, altmap, migratetype); set_zone_contiguous(zone); @@ -1260,7 +1260,14 @@ static int scan_movable_pages(unsigned long start, unsigned long end, if (!PageHuge(page)) continue; head = compound_head(page); - if (page_huge_active(head)) + /* + * This test is racy as we hold no reference or lock. The + * hugetlb page could have been free'ed and head is no longer + * a hugetlb page before the following check. In such unlikely + * cases false positives and negatives are possible. Calling + * code must deal with these scenarios. + */ + if (HPageMigratable(head)) goto found; skip = compound_nr(head) - (page - head); pfn += skip - 1; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 2c3a86502053..ab51132547b8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -677,7 +677,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long flags = qp->flags; /* range check first */ - VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma); + VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma); if (!qp->first) { qp->first = vma; @@ -875,6 +875,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, goto out; } + if (flags & MPOL_F_NUMA_BALANCING) { + if (new && new->mode == MPOL_BIND) { + new->flags |= (MPOL_F_MOF | MPOL_F_MORON); + } else { + ret = -EINVAL; + mpol_put(new); + goto out; + } + } + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { mpol_put(new); @@ -2486,6 +2496,12 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, 
unsigned long break; case MPOL_BIND: + /* Optimize placement among multiple nodes via NUMA balancing */ + if (pol->flags & MPOL_F_MORON) { + if (node_isset(thisnid, pol->v.nodes)) + break; + goto out; + } /* * allows binding to multiple nodes. diff --git a/mm/mempool.c b/mm/mempool.c index 624ed51b060f..79959fac27d7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element) static __always_inline void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_slab_free_mempool(element, _RET_IP_); + kasan_slab_free_mempool(element); else if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } diff --git a/mm/migrate.c b/mm/migrate.c index c0efe921bca5..62b81d5257aa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -331,7 +331,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, if (!get_page_unless_zero(page)) goto out; pte_unmap_unlock(ptep, ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); return; out: pte_unmap_unlock(ptep, ptl); @@ -365,7 +365,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd) if (!get_page_unless_zero(page)) goto unlock; spin_unlock(ptl); - put_and_wait_on_page_locked(page); + put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE); return; unlock: spin_unlock(ptl); @@ -500,6 +500,12 @@ int migrate_page_move_mapping(struct address_space *mapping, __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr); __mod_lruvec_state(new_lruvec, NR_SHMEM, nr); } +#ifdef CONFIG_SWAP + if (PageSwapCache(page)) { + __mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr); + __mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr); + } +#endif if (dirty && mapping_can_writeback(mapping)) { __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr); __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr); @@ -1280,6 
+1286,12 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOSYS; } + if (page_count(hpage) == 1) { + /* page was freed from under us. So we are done. */ + putback_active_hugepage(hpage); + return MIGRATEPAGE_SUCCESS; + } + new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; diff --git a/mm/mincore.c b/mm/mincore.c index 02db1a834021..9122676b54d6 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -166,8 +166,9 @@ static inline bool can_do_mincore(struct vm_area_struct *vma) * for writing; otherwise we'd be including shared non-exclusive * mappings, which opens a side channel. */ - return inode_owner_or_capable(file_inode(vma->vm_file)) || - inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; + return inode_owner_or_capable(&init_user_ns, + file_inode(vma->vm_file)) || + file_permission(vma->vm_file, MAY_WRITE) == 0; } static const struct mm_walk_ops mincore_walk_ops = { diff --git a/mm/mlock.c b/mm/mlock.c index 55b3b3672977..73960bb3464d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -278,8 +278,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) */ if (TestClearPageLRU(page)) { lruvec = relock_page_lruvec_irq(page, lruvec); - del_page_from_lru_list(page, lruvec, - page_lru(page)); + del_page_from_lru_list(page, lruvec); continue; } else __munlock_isolation_failed(page); diff --git a/mm/mmap.c b/mm/mmap.c index dc7206032387..3f287599a7a3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -189,7 +189,6 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long struct list_head *uf); SYSCALL_DEFINE1(brk, unsigned long, brk) { - unsigned long retval; unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; struct vm_area_struct *next; @@ -281,9 +280,8 @@ success: return brk; out: - retval = origbrk; mmap_write_unlock(mm); - return retval; + return origbrk; } static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) @@ -2671,12 +2669,12 @@ static 
void unmap_region(struct mm_struct *mm, struct mmu_gather tlb; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, start, end); + tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); - tlb_finish_mmu(&tlb, start, end); + tlb_finish_mmu(&tlb); } /* @@ -3214,12 +3212,12 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb_gather_mmu(&tlb, mm, 0, -1); + tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); - tlb_finish_mmu(&tlb, 0, -1); + tlb_finish_mmu(&tlb); /* * Walk the list again, actually closing and freeing it, diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 03c33c93a582..0dc7149b0c61 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -253,21 +253,17 @@ void tlb_flush_mmu(struct mmu_gather *tlb) * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize * @mm: the mm_struct of the target address space - * @start: start of the region that will be removed from the page-table - * @end: end of the region that will be removed from the page-table + * @fullmm: @mm is without users and we're going to destroy the full address + * space (exit/execve) * * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. The @start and @end are set to 0 and -1 - * respectively when @mm is without users and we're going to destroy - * the full address space (exit/execve). + * tear-down from @mm. 
*/ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) +static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, + bool fullmm) { tlb->mm = mm; - - /* Is it from 0 to ~0? */ - tlb->fullmm = !(start | (end+1)); + tlb->fullmm = fullmm; #ifndef CONFIG_MMU_GATHER_NO_GATHER tlb->need_flush_all = 0; @@ -287,17 +283,24 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, inc_tlb_flush_pending(tlb->mm); } +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm) +{ + __tlb_gather_mmu(tlb, mm, false); +} + +void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm) +{ + __tlb_gather_mmu(tlb, mm, true); +} + /** * tlb_finish_mmu - finish an mmu_gather structure * @tlb: the mmu_gather structure to finish - * @start: start of the region that will be removed from the page-table - * @end: end of the region that will be removed from the page-table * * Called at the end of the shootdown operation to free up any resources that * were required. 
*/ -void tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end) +void tlb_finish_mmu(struct mmu_gather *tlb) { /* * If there are parallel threads are doing PTE changes on same range diff --git a/mm/mprotect.c b/mm/mprotect.c index ab709023e9aa..94188df1ee55 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -617,10 +617,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (tmp > end) tmp = end; - if (vma->vm_ops && vma->vm_ops->mprotect) + if (vma->vm_ops && vma->vm_ops->mprotect) { error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags); - if (error) - goto out; + if (error) + goto out; + } error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) diff --git a/mm/mremap.c b/mm/mremap.c index f554320281cc..ec8f840399ed 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -22,7 +22,6 @@ #include <linux/syscalls.h> #include <linux/mmu_notifier.h> #include <linux/uaccess.h> -#include <linux/mm-arch-hooks.h> #include <linux/userfaultfd_k.h> #include <asm/cacheflush.h> @@ -336,8 +335,9 @@ enum pgt_entry { * valid. Else returns a smaller extent bounded by the end of the source and * destination pgt_entry. 
*/ -static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr, - unsigned long old_end, unsigned long new_addr) +static __always_inline unsigned long get_extent(enum pgt_entry entry, + unsigned long old_addr, unsigned long old_end, + unsigned long new_addr) { unsigned long next, extent, mask, size; @@ -562,8 +562,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, new_addr = err; } else { mremap_userfaultfd_prep(new_vma, uf); - arch_remap(mm, old_addr, old_addr + old_len, - new_addr, new_addr + new_len); } /* Conceal VM_ACCOUNT so old reservation is not undone */ @@ -595,6 +593,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + /* + * anon_vma links of the old vma is no longer needed after its page + * table has been moved. + */ + if (new_vma != vma && vma->vm_start == old_addr && + vma->vm_end == (old_addr + old_len)) + unlink_anon_vmas(vma); + /* Because we won't unmap we don't need to touch locked_vm */ return new_addr; } diff --git a/mm/nommu.c b/mm/nommu.c index 870fea12823e..5c9ab799c0e6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1668,10 +1668,11 @@ vm_fault_t filemap_fault(struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct vm_fault *vmf, +vm_fault_t filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); + return 0; } EXPORT_SYMBOL(filemap_map_pages); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 04b19b7b5435..9efaf430cfd3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -395,9 +395,8 @@ static int dump_task(struct task_struct *p, void *arg) task = find_lock_task_mm(p); if (!task) { /* - * This is a kthread or all of p's threads have already - * detached their mm's. There's no need to report - * them; they can't be oom killed anyway. + * All of p's threads have already detached their mm's. 
There's + * no need to report them; they can't be oom killed anyway. */ return 0; } @@ -546,15 +545,15 @@ bool __oom_reap_task_mm(struct mm_struct *mm) mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, vma->vm_start, vma->vm_end); - tlb_gather_mmu(&tlb, mm, range.start, range.end); + tlb_gather_mmu(&tlb, mm); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { - tlb_finish_mmu(&tlb, range.start, range.end); + tlb_finish_mmu(&tlb); ret = false; continue; } unmap_page_range(&tlb, vma, range.start, range.end, NULL); mmu_notifier_invalidate_range_end(&range); - tlb_finish_mmu(&tlb, range.start, range.end); + tlb_finish_mmu(&tlb); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 519a60d5b6f7..ddccc59f2f72 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5137,8 +5137,9 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) } EXPORT_SYMBOL(__page_frag_cache_drain); -void *page_frag_alloc(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void *page_frag_alloc_align(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask, + unsigned int align_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -5190,11 +5191,12 @@ refill: } nc->pagecnt_bias--; + offset &= align_mask; nc->offset = offset; return nc->va + offset; } -EXPORT_SYMBOL(page_frag_alloc); +EXPORT_SYMBOL(page_frag_alloc_align); /* * Frees a page fragment allocated out of either a compound or order 0 page. 
@@ -5582,10 +5584,9 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_WRITEBACK)), K(node_page_state(pgdat, NR_SHMEM)), #ifdef CONFIG_TRANSPARENT_HUGEPAGE - K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR), - K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) - * HPAGE_PMD_NR), - K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), + K(node_page_state(pgdat, NR_SHMEM_THPS)), + K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)), + K(node_page_state(pgdat, NR_ANON_THPS)), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), node_page_state(pgdat, NR_KERNEL_STACK_KB), @@ -6120,7 +6121,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related * zone stats (e.g., nr_isolate_pageblock) are touched. */ -void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, unsigned long zone_end_pfn, enum meminit_context context, struct vmem_altmap *altmap, int migratetype) @@ -6257,23 +6258,21 @@ static void __meminit zone_init_free_lists(struct zone *zone) } } -void __meminit __weak memmap_init(unsigned long size, int nid, - unsigned long zone, - unsigned long range_start_pfn) +void __meminit __weak memmap_init_zone(struct zone *zone) { + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long zone_end_pfn = zone_start_pfn + zone->spanned_pages; + int i, nid = zone_to_nid(zone), zone_id = zone_idx(zone); unsigned long start_pfn, end_pfn; - unsigned long range_end_pfn = range_start_pfn + size; - int i; for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { - start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); - end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + start_pfn = clamp(start_pfn, zone_start_pfn, zone_end_pfn); + end_pfn = clamp(end_pfn, zone_start_pfn, zone_end_pfn); - if (end_pfn > 
start_pfn) { - size = end_pfn - start_pfn; - memmap_init_zone(size, nid, zone, start_pfn, range_end_pfn, - MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); - } + if (end_pfn > start_pfn) + memmap_init_range(end_pfn - start_pfn, nid, + zone_id, start_pfn, zone_end_pfn, + MEMINIT_EARLY, NULL, MIGRATE_MOVABLE); } } @@ -6766,25 +6765,22 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l return usemapsize / 8; } -static void __ref setup_usemap(struct pglist_data *pgdat, - struct zone *zone, - unsigned long zone_start_pfn, - unsigned long zonesize) +static void __ref setup_usemap(struct zone *zone) { - unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize); + unsigned long usemapsize = usemap_size(zone->zone_start_pfn, + zone->spanned_pages); zone->pageblock_flags = NULL; if (usemapsize) { zone->pageblock_flags = memblock_alloc_node(usemapsize, SMP_CACHE_BYTES, - pgdat->node_id); + zone_to_nid(zone)); if (!zone->pageblock_flags) panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n", - usemapsize, zone->name, pgdat->node_id); + usemapsize, zone->name, zone_to_nid(zone)); } } #else -static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, - unsigned long zone_start_pfn, unsigned long zonesize) {} +static inline void setup_usemap(struct zone *zone) {} #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -6931,7 +6927,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, freesize, memmap_pages; - unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; freesize = zone->present_pages; @@ -6979,9 +6974,9 @@ static void __init free_area_init_core(struct pglist_data *pgdat) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); - memmap_init(size, nid, j, 
zone_start_pfn); + setup_usemap(zone); + init_currently_empty_zone(zone, zone->zone_start_pfn, size); + memmap_init_zone(zone); } } @@ -7696,17 +7691,6 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char return pages; } -#ifdef CONFIG_HIGHMEM -void free_highmem_page(struct page *page) -{ - __free_reserved_page(page); - totalram_pages_inc(); - atomic_long_inc(&page_zone(page)->managed_pages); - totalhigh_pages_inc(); -} -#endif - - void __init mem_init_print_info(const char *str) { unsigned long physpages, codesize, datasize, rosize, bss_size; diff --git a/mm/page_io.c b/mm/page_io.c index 9bca17ecc4df..485fa5cca4a2 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -26,25 +26,6 @@ #include <linux/uio.h> #include <linux/sched/task.h> -static struct bio *get_swap_bio(gfp_t gfp_flags, - struct page *page, bio_end_io_t end_io) -{ - struct bio *bio; - - bio = bio_alloc(gfp_flags, 1); - if (bio) { - struct block_device *bdev; - - bio->bi_iter.bi_sector = map_swap_page(page, &bdev); - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9; - bio->bi_end_io = end_io; - - bio_add_page(bio, page, thp_size(page), 0); - } - return bio; -} - void end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -60,9 +41,9 @@ void end_swap_bio_write(struct bio *bio) * Also clear PG_reclaim to avoid rotate_reclaimable_page() */ set_page_dirty(page); - pr_alert("Write-error on swap-device (%u:%u:%llu)\n", - MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); ClearPageReclaim(page); } end_page_writeback(page); @@ -125,9 +106,9 @@ static void end_swap_bio_read(struct bio *bio) if (bio->bi_status) { SetPageError(page); ClearPageUptodate(page); - pr_alert("Read-error on swap-device (%u:%u:%llu)\n", - MAJOR(bio_dev(bio)), 
MINOR(bio_dev(bio)), - (unsigned long long)bio->bi_iter.bi_sector); + pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", + MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), + (unsigned long long)bio->bi_iter.bi_sector); goto out; } @@ -361,13 +342,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return 0; } - bio = get_swap_bio(GFP_NOIO, page, end_write_func); - if (bio == NULL) { - set_page_dirty(page); - unlock_page(page); - return -ENOMEM; - } + bio = bio_alloc(GFP_NOIO, 1); + bio_set_dev(bio, sis->bdev); + bio->bi_iter.bi_sector = swap_page_sector(page); bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc); + bio->bi_end_io = end_write_func; + bio_add_page(bio, page, thp_size(page), 0); + bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); set_page_writeback(page); @@ -427,18 +408,18 @@ int swap_readpage(struct page *page, bool synchronous) } ret = 0; - bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); - if (bio == NULL) { - unlock_page(page); - ret = -ENOMEM; - goto out; - } - disk = bio->bi_disk; + bio = bio_alloc(GFP_KERNEL, 1); + bio_set_dev(bio, sis->bdev); + bio->bi_opf = REQ_OP_READ; + bio->bi_iter.bi_sector = swap_page_sector(page); + bio->bi_end_io = end_swap_bio_read; + bio_add_page(bio, page, thp_size(page), 0); + + disk = bio->bi_bdev->bd_disk; /* * Keep this task valid during swap readpage because the oom killer may * attempt to access it in the page fault retry time check. 
*/ - bio_set_op_attrs(bio, REQ_OP_READ, 0); if (synchronous) { bio->bi_opf |= REQ_HIPRI; get_task_struct(current); diff --git a/mm/page_owner.c b/mm/page_owner.c index af464bb7fbe7..d15c7c4994f5 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -263,8 +263,8 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, struct page *page; struct page_ext *page_ext; struct page_owner *page_owner; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long pfn, block_end_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long count[MIGRATE_TYPES] = { 0, }; int pageblock_mt, page_mt; int i; diff --git a/mm/page_reporting.c b/mm/page_reporting.c index cd8e13d41df4..c50d93ffa252 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -211,7 +211,7 @@ page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, } /* Rotate any leftover pages to the head of the freelist */ - if (&next->lru != list && !list_is_first(&next->lru, list)) + if (!list_entry_is_head(next, list, lru) && !list_is_first(&next->lru, list)) list_rotate_to_front(&next->lru, list); spin_unlock_irq(&zone->lock); diff --git a/mm/percpu.c b/mm/percpu.c index ad7a37ee74ef..6596a0a4286e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -69,6 +69,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> +#include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/lcm.h> @@ -2662,13 +2663,14 @@ early_param("percpu_alloc", percpu_alloc_setup); * On success, pointer to the new allocation_info is returned. On * failure, ERR_PTR value is returned. 
*/ -static struct pcpu_alloc_info * __init pcpu_build_alloc_info( +static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; + static struct cpumask mask __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; @@ -2681,6 +2683,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( /* this function may be called multiple times */ memset(group_map, 0, sizeof(group_map)); memset(group_cnt, 0, sizeof(group_cnt)); + cpumask_clear(&mask); /* calculate size_sum and ensure dyn_size is enough for early alloc */ size_sum = PFN_ALIGN(static_size + reserved_size + @@ -2702,24 +2705,27 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( upa--; max_upa = upa; + cpumask_copy(&mask, cpu_possible_mask); + /* group cpus according to their proximity */ - for_each_possible_cpu(cpu) { - group = 0; - next_group: - for_each_possible_cpu(tcpu) { - if (cpu == tcpu) - break; - if (group_map[tcpu] == group && cpu_distance_fn && - (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || - cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { - group++; - nr_groups = max(nr_groups, group + 1); - goto next_group; - } - } + for (group = 0; !cpumask_empty(&mask); group++) { + /* pop the group's first cpu */ + cpu = cpumask_first(&mask); group_map[cpu] = group; group_cnt[group]++; + cpumask_clear_cpu(cpu, &mask); + + for_each_cpu(tcpu, &mask) { + if (!cpu_distance_fn || + (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE && + cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) { + group_map[tcpu] = group; + group_cnt[group]++; + cpumask_clear_cpu(tcpu, &mask); + } + } } + nr_groups = group; /* * Wasted space is caused by a ratio imbalance of upa to group_cnt. 
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 9578db83e312..c2210e1cdb51 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -135,8 +135,9 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && - !pmd_devmap(*pmdp)) || !pmd_present(*pmdp)); + VM_BUG_ON(!pmd_present(*pmdp)); + /* Below assumes pmd_present() is true */ + VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; diff --git a/mm/rmap.c b/mm/rmap.c index 08c56aaf72eb..e26ae119a131 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -413,8 +413,15 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_del(&avc->same_vma); anon_vma_chain_free(avc); } - if (vma->anon_vma) + if (vma->anon_vma) { vma->anon_vma->degree--; + + /* + * vma would still be needed after unlink, and anon_vma will be prepared + * when handle fault. + */ + vma->anon_vma = NULL; + } unlock_anon_vma_root(root); /* @@ -1144,7 +1151,7 @@ void do_page_add_anon_rmap(struct page *page, * disabled. 
*/ if (compound) - __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, nr); __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); } @@ -1186,7 +1193,7 @@ void page_add_new_anon_rmap(struct page *page, if (hpage_pincount_available(page)) atomic_set(compound_pincount_ptr(page), 0); - __inc_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, nr); } else { /* Anon THP always mapped first with PMD */ VM_BUG_ON_PAGE(PageTransCompound(page), page); @@ -1211,16 +1218,20 @@ void page_add_file_rmap(struct page *page, bool compound) VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { + int nr_pages = thp_nr_pages(page); + + for (i = 0, nr = 0; i < nr_pages; i++) { if (atomic_inc_and_test(&page[i]._mapcount)) nr++; } if (!atomic_inc_and_test(compound_mapcount_ptr(page))) goto out; if (PageSwapBacked(page)) - __inc_node_page_state(page, NR_SHMEM_PMDMAPPED); + __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, + nr_pages); else - __inc_node_page_state(page, NR_FILE_PMDMAPPED); + __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, + nr_pages); } else { if (PageTransCompound(page) && page_mapping(page)) { VM_WARN_ON_ONCE(!PageLocked(page)); @@ -1252,16 +1263,20 @@ static void page_remove_file_rmap(struct page *page, bool compound) /* page still mapped by someone else? 
*/ if (compound && PageTransHuge(page)) { - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { + int nr_pages = thp_nr_pages(page); + + for (i = 0, nr = 0; i < nr_pages; i++) { if (atomic_add_negative(-1, &page[i]._mapcount)) nr++; } if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) return; if (PageSwapBacked(page)) - __dec_node_page_state(page, NR_SHMEM_PMDMAPPED); + __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, + -nr_pages); else - __dec_node_page_state(page, NR_FILE_PMDMAPPED); + __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, + -nr_pages); } else { if (!atomic_add_negative(-1, &page->_mapcount)) return; @@ -1292,7 +1307,7 @@ static void page_remove_anon_compound_rmap(struct page *page) if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return; - __dec_lruvec_page_state(page, NR_ANON_THPS); + __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page)); if (TestClearPageDoubleMap(page)) { /* diff --git a/mm/shmem.c b/mm/shmem.c index 7c6b6d8f6c39..ff741d229701 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -713,7 +713,7 @@ next: } if (PageTransHuge(page)) { count_vm_event(THP_FILE_ALLOC); - __inc_lruvec_page_state(page, NR_SHMEM_THPS); + __mod_lruvec_page_state(page, NR_SHMEM_THPS, nr); } mapping->nrpages += nr; __mod_lruvec_page_state(page, NR_FILE_PAGES, nr); @@ -1060,7 +1060,8 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) } EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_getattr(const struct path *path, struct kstat *stat, +static int shmem_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; @@ -1072,7 +1073,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (is_huge_enabled(sb_info)) stat->blksize = HPAGE_PMD_SIZE; 
@@ -1080,14 +1081,15 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, return 0; } -static int shmem_setattr(struct dentry *dentry, struct iattr *attr) +static int shmem_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -1141,9 +1143,9 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) } } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); return error; } @@ -1520,11 +1522,11 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, { struct vm_area_struct pvma; struct page *page; - struct vm_fault vmf; + struct vm_fault vmf = { + .vma = &pvma, + }; shmem_pseudo_vma_init(&pvma, info, index); - vmf.vma = &pvma; - vmf.address = 0; page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); @@ -2303,7 +2305,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode inode = new_inode(sb); if (inode) { inode->i_ino = ino; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_generation = prandom_u32(); @@ -2917,7 +2919,8 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) * File creation. Allocate an inode, and we're done.. 
*/ static int -shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; @@ -2946,7 +2949,8 @@ out_iput: } static int -shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; int error = -ENOSPC; @@ -2969,20 +2973,22 @@ out_iput: return error; } -static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; - if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) + if ((error = shmem_mknod(&init_user_ns, dir, dentry, + mode | S_IFDIR, 0))) return error; inc_nlink(dir); return 0; } -static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return shmem_mknod(dir, dentry, mode | S_IFREG, 0); + return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); } /* @@ -3062,7 +3068,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru return 0; } -static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) +static int shmem_whiteout(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; int error; @@ -3071,7 +3078,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) if (!whiteout) return -ENOMEM; - error = shmem_mknod(old_dir, whiteout, + error = shmem_mknod(&init_user_ns, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); if (error) @@ -3094,7 +3101,10 @@ static int 
shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) * it exists so that the VFS layer correctly free's it when it * gets overwritten. */ -static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) +static int shmem_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); @@ -3111,7 +3121,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc if (flags & RENAME_WHITEOUT) { int error; - error = shmem_whiteout(old_dir, old_dentry); + error = shmem_whiteout(&init_user_ns, old_dir, old_dentry); if (error) return error; } @@ -3135,7 +3145,8 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc return 0; } -static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int error; int len; @@ -3273,6 +3284,7 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, } static int shmem_xattr_handler_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/mm/slab.c b/mm/slab.c index d7c8da9319c7..35c68d99d460 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -272,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define STATS_DEC_ACTIVE(x) ((x)->num_active--) #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) #define STATS_INC_GROWN(x) ((x)->grown++) -#define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) +#define STATS_ADD_REAPED(x, y) ((x)->reaped += (y)) #define STATS_SET_HIGH(x) \ do { \ if 
((x)->num_active > (x)->high_mark) \ @@ -296,7 +296,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) #define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) +#define STATS_ADD_REAPED(x, y) do { (void)(y); } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -332,7 +332,7 @@ static int obj_offset(struct kmem_cache *cachep) static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) { BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); - return (unsigned long long*) (objp + obj_offset(cachep) - + return (unsigned long long *) (objp + obj_offset(cachep) - sizeof(unsigned long long)); } @@ -580,7 +580,7 @@ static int transfer_objects(struct array_cache *to, if (!nr) return 0; - memcpy(to->entry + to->avail, from->entry + from->avail -nr, + memcpy(to->entry + to->avail, from->entry + from->avail - nr, sizeof(void *) *nr); from->avail -= nr; @@ -1379,7 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, return NULL; } - account_slab_page(page, cachep->gfporder, cachep); + account_slab_page(page, cachep->gfporder, cachep, flags); __SetPageSlab(page); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(page)) @@ -1790,8 +1790,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { return flags; } @@ -2738,7 +2737,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, #else #define kfree_debugcheck(x) do { } while(0) -#define cache_free_debugcheck(x,objp,z) (objp) +#define cache_free_debugcheck(x, objp, z) (objp) 
#endif static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, @@ -3025,7 +3024,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; } #else -#define cache_alloc_debugcheck_after(a,b,objp,d) (objp) +#define cache_alloc_debugcheck_after(a, b, objp, d) (objp) #endif static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) @@ -3421,7 +3420,7 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, memset(objp, 0, cachep->object_size); /* Put the object into the quarantine, don't touch it for now. */ - if (kasan_slab_free(cachep, objp, _RET_IP_)) + if (kasan_slab_free(cachep, objp)) return; /* Use KCSAN to help debug racy use-after-free. */ @@ -3635,6 +3634,26 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, EXPORT_SYMBOL(__kmalloc_node_track_caller); #endif /* CONFIG_NUMA */ +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + struct kmem_cache *cachep; + unsigned int objnr; + void *objp; + + kpp->kp_ptr = object; + kpp->kp_page = page; + cachep = page->slab_cache; + kpp->kp_slab_cache = cachep; + objp = object - obj_offset(cachep); + kpp->kp_data_offset = obj_offset(cachep); + page = virt_to_head_page(objp); + objnr = obj_to_index(cachep, page, objp); + objp = index_to_obj(cachep, page, objnr); + kpp->kp_objp = objp; + if (DEBUG && cachep->flags & SLAB_STORE_USER) + kpp->kp_ret = *dbg_userword(cachep, objp); +} + /** * __do_kmalloc - allocate memory * @size: how many bytes of memory are required. 
@@ -3697,7 +3716,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) __cache_free(cachep, objp, _RET_IP_); local_irq_restore(flags); - trace_kmem_cache_free(_RET_IP_, objp); + trace_kmem_cache_free(_RET_IP_, objp, cachep->name); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slab.h b/mm/slab.h index 1a756a359fa8..076582f58f68 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -110,8 +110,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)); + slab_flags_t flags, const char *name); #else static inline struct kmem_cache * __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, @@ -119,8 +118,7 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, { return NULL; } static inline slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { return flags; } @@ -240,7 +238,7 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla #ifdef CONFIG_MEMCG_KMEM int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, - gfp_t gfp); + gfp_t gfp, bool new_page); static inline void memcg_free_page_obj_cgroups(struct page *page) { @@ -317,7 +315,8 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, page = virt_to_head_page(p[i]); if (!page_objcgs(page) && - memcg_alloc_page_obj_cgroups(page, s, flags)) { + memcg_alloc_page_obj_cgroups(page, s, flags, + false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } @@ -381,7 +380,8 @@ static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) } static inline int memcg_alloc_page_obj_cgroups(struct page *page, - struct kmem_cache *s, gfp_t gfp) + struct kmem_cache *s, gfp_t gfp, + bool new_page) { return 0; } @@ -422,8 +422,12 
@@ static inline struct kmem_cache *virt_to_cache(const void *obj) } static __always_inline void account_slab_page(struct page *page, int order, - struct kmem_cache *s) + struct kmem_cache *s, + gfp_t gfp) { + if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT)) + memcg_alloc_page_obj_cgroups(page, s, gfp, true); + mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s), PAGE_SIZE << order); } @@ -615,4 +619,16 @@ static inline bool slab_want_init_on_free(struct kmem_cache *c) return false; } +#define KS_ADDRS_COUNT 16 +struct kmem_obj_info { + void *kp_ptr; + struct page *kp_page; + void *kp_objp; + unsigned long kp_data_offset; + struct kmem_cache *kp_slab_cache; + void *kp_ret; + void *kp_stack[KS_ADDRS_COUNT]; +}; +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index e981c80d216c..7c8298c17145 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -197,7 +197,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, size = ALIGN(size, sizeof(void *)); align = calculate_alignment(flags, align, size); size = ALIGN(size, align); - flags = kmem_cache_flags(size, flags, name, NULL); + flags = kmem_cache_flags(size, flags, name); if (flags & SLAB_NEVER_MERGE) return NULL; @@ -309,9 +309,6 @@ kmem_cache_create_usercopy(const char *name, const char *cache_name; int err; - get_online_cpus(); - get_online_mems(); - mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); @@ -360,9 +357,6 @@ kmem_cache_create_usercopy(const char *name, out_unlock: mutex_unlock(&slab_mutex); - put_online_mems(); - put_online_cpus(); - if (err) { if (flags & SLAB_PANIC) panic("kmem_cache_create: Failed to create slab '%s'. 
Error %d\n", @@ -486,9 +480,6 @@ void kmem_cache_destroy(struct kmem_cache *s) if (unlikely(!s)) return; - get_online_cpus(); - get_online_mems(); - mutex_lock(&slab_mutex); s->refcount--; @@ -503,9 +494,6 @@ void kmem_cache_destroy(struct kmem_cache *s) } out_unlock: mutex_unlock(&slab_mutex); - - put_online_mems(); - put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -522,12 +510,10 @@ int kmem_cache_shrink(struct kmem_cache *cachep) { int ret; - get_online_cpus(); - get_online_mems(); + kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep); - put_online_mems(); - put_online_cpus(); + return ret; } EXPORT_SYMBOL(kmem_cache_shrink); @@ -537,6 +523,81 @@ bool slab_is_available(void) return slab_state >= UP; } +/** + * kmem_valid_obj - does the pointer reference a valid slab object? + * @object: pointer to query. + * + * Return: %true if the pointer is to a not-yet-freed object from + * kmalloc() or kmem_cache_alloc(), either %true or %false if the pointer + * is to an already-freed object, and %false otherwise. + */ +bool kmem_valid_obj(void *object) +{ + struct page *page; + + /* Some arches consider ZERO_SIZE_PTR to be a valid address. */ + if (object < (void *)PAGE_SIZE || !virt_addr_valid(object)) + return false; + page = virt_to_head_page(object); + return PageSlab(page); +} + +/** + * kmem_dump_obj - Print available slab provenance information + * @object: slab object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For a slab-cache object, the fact that it is a slab object is printed, + * and, if available, the slab name, return address, and stack trace from + * the allocation of that object. + * + * This function will splat if passed a pointer to a non-slab object. 
+ * If you are not sure what type of object you have, you should instead + * use mem_dump_obj(). + */ +void kmem_dump_obj(void *object) +{ + char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc"; + int i; + struct page *page; + unsigned long ptroffset; + struct kmem_obj_info kp = { }; + + if (WARN_ON_ONCE(!virt_addr_valid(object))) + return; + page = virt_to_head_page(object); + if (WARN_ON_ONCE(!PageSlab(page))) { + pr_cont(" non-slab memory.\n"); + return; + } + kmem_obj_info(&kp, object, page); + if (kp.kp_slab_cache) + pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name); + else + pr_cont(" slab%s", cp); + if (kp.kp_objp) + pr_cont(" start %px", kp.kp_objp); + if (kp.kp_data_offset) + pr_cont(" data offset %lu", kp.kp_data_offset); + if (kp.kp_objp) { + ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset; + pr_cont(" pointer offset %lu", ptroffset); + } + if (kp.kp_slab_cache && kp.kp_slab_cache->usersize) + pr_cont(" size %u", kp.kp_slab_cache->usersize); + if (kp.kp_ret) + pr_cont(" allocated at %pS\n", kp.kp_ret); + else + pr_cont("\n"); + for (i = 0; i < ARRAY_SIZE(kp.kp_stack); i++) { + if (!kp.kp_stack[i]) + break; + pr_info(" %pS\n", kp.kp_stack[i]); + } +} + #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, const char *name, @@ -837,8 +898,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) page = alloc_pages(flags, order); if (likely(page)) { ret = page_address(page); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); } ret = kasan_kmalloc_large(ret, size, flags); /* As ret might get tagged, call kmemleak hook after KASAN. */ @@ -1157,19 +1218,21 @@ size_t ksize(const void *objp) size_t size; /* - * We need to check that the pointed to object is valid, and only then - * unpoison the shadow memory below. 
We use __kasan_check_read(), to - * generate a more useful report at the time ksize() is called (rather - * than later where behaviour is undefined due to potential - * use-after-free or double-free). + * We need to first check that the pointer to the object is valid, and + * only then unpoison the memory. The report printed from ksize() is + * more useful, then when it's printed later when the behaviour could + * be undefined due to a potential use-after-free or double-free. + * + * We use kasan_check_byte(), which is supported for the hardware + * tag-based KASAN mode, unlike kasan_check_read/write(). * - * If the pointed to memory is invalid we return 0, to avoid users of + * If the pointed to memory is invalid, we return 0 to avoid users of * ksize() writing to and potentially corrupting the memory region. * * We want to perform the check before __ksize(), to avoid potentially * crashing in __ksize() due to accessing invalid metadata. */ - if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1)) + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; size = __ksize(objp); diff --git a/mm/slob.c b/mm/slob.c index 8d4bfa46247f..0578429b991b 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -461,6 +461,12 @@ out: spin_unlock_irqrestore(&slob_lock, flags); } +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + kpp->kp_ptr = object; + kpp->kp_page = page; +} + /* * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. 
*/ @@ -667,7 +673,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b) __kmem_cache_free(b, c->size); } - trace_kmem_cache_free(_RET_IP_, b); + trace_kmem_cache_free(_RET_IP_, b, c->name); } EXPORT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index 7ecbbbe5bc0c..b2833ce85c92 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -235,6 +235,14 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) #endif } +/* + * Tracks for which NUMA nodes we have kmem_cache_nodes allocated. + * Corresponds to node_state[N_NORMAL_MEMORY], but can temporarily + * differ during memory hotplug/hotremove operations. + * Protected by slab_mutex. + */ +static nodemask_t slab_nodes; + /******************************************************************** * Core slab cache functions *******************************************************************/ @@ -1400,7 +1408,6 @@ __setup("slub_debug", setup_slub_debug); * @object_size: the size of an object without meta data * @flags: flags to set * @name: name of the cache - * @ctor: constructor function * * Debug option(s) are applied to @flags. In addition to the debug * option(s), if a slab name (or multiple) is specified i.e. @@ -1408,13 +1415,21 @@ __setup("slub_debug", setup_slub_debug); * then only the select slabs will receive the debug option(s). */ slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { char *iter; size_t len; char *next_block; slab_flags_t block_flags; + slab_flags_t slub_debug_local = slub_debug; + + /* + * If the slab cache is for debugging (e.g. kmemleak) then + * don't store user (stack trace) information by default, + * but let the user enable it via the command line below. 
+ */ + if (flags & SLAB_NOLEAKTRACE) + slub_debug_local &= ~SLAB_STORE_USER; len = strlen(name); next_block = slub_debug_string; @@ -1449,7 +1464,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, } } - return flags | slub_debug; + return flags | slub_debug_local; } #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, @@ -1474,8 +1489,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} slab_flags_t kmem_cache_flags(unsigned int object_size, - slab_flags_t flags, const char *name, - void (*ctor)(void *)) + slab_flags_t flags, const char *name) { return flags; } @@ -1514,7 +1528,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) static __always_inline void kfree_hook(void *x) { kmemleak_free(x); - kasan_kfree_large(x, _RET_IP_); + kasan_kfree_large(x); } static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) @@ -1544,7 +1558,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT); /* KASAN might put x into memory quarantine, delaying its reuse */ - return kasan_slab_free(s, x, _RET_IP_); + return kasan_slab_free(s, x); } static inline bool slab_free_freelist_hook(struct kmem_cache *s, @@ -1771,7 +1785,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->objects = oo_objects(oo); - account_slab_page(page, oo_order(oo), s); + account_slab_page(page, oo_order(oo), s, flags); page->slab_cache = s; __SetPageSlab(page); @@ -2153,9 +2167,9 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - int lock = 0; + int lock = 0, free_delta = 0; enum slab_modes l = M_NONE, m = M_NONE; - void *nextfree; + void 
*nextfree, *freelist_iter, *freelist_tail; int tail = DEACTIVATE_TO_HEAD; struct page new; struct page old; @@ -2166,45 +2180,34 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, } /* - * Stage one: Free all available per cpu objects back - * to the page freelist while it is still frozen. Leave the - * last one. - * - * There is no need to take the list->lock because the page - * is still frozen. + * Stage one: Count the objects on cpu's freelist as free_delta and + * remember the last object in freelist_tail for later splicing. */ - while (freelist && (nextfree = get_freepointer(s, freelist))) { - void *prior; - unsigned long counters; + freelist_tail = NULL; + freelist_iter = freelist; + while (freelist_iter) { + nextfree = get_freepointer(s, freelist_iter); /* * If 'nextfree' is invalid, it is possible that the object at - * 'freelist' is already corrupted. So isolate all objects - * starting at 'freelist'. + * 'freelist_iter' is already corrupted. So isolate all objects + * starting at 'freelist_iter' by skipping them. */ - if (freelist_corrupted(s, page, &freelist, nextfree)) + if (freelist_corrupted(s, page, &freelist_iter, nextfree)) break; - do { - prior = page->freelist; - counters = page->counters; - set_freepointer(s, freelist, prior); - new.counters = counters; - new.inuse--; - VM_BUG_ON(!new.frozen); + freelist_tail = freelist_iter; + free_delta++; - } while (!__cmpxchg_double_slab(s, page, - prior, counters, - freelist, new.counters, - "drain percpu freelist")); - - freelist = nextfree; + freelist_iter = nextfree; } /* - * Stage two: Ensure that the page is unfrozen while the - * list presence reflects the actual number of objects - * during unfreeze. + * Stage two: Unfreeze the page while splicing the per-cpu + * freelist to the head of page's freelist. + * + * Ensure that the page is unfrozen while the list presence + * reflects the actual number of objects during unfreeze. 
* * We setup the list membership and then perform a cmpxchg * with the count. If there is a mismatch then the page @@ -2217,15 +2220,15 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, */ redo: - old.freelist = page->freelist; - old.counters = page->counters; + old.freelist = READ_ONCE(page->freelist); + old.counters = READ_ONCE(page->counters); VM_BUG_ON(!old.frozen); /* Determine target state of the slab */ new.counters = old.counters; - if (freelist) { - new.inuse--; - set_freepointer(s, freelist, old.freelist); + if (freelist_tail) { + new.inuse -= free_delta; + set_freepointer(s, freelist_tail, old.freelist); new.freelist = freelist; } else new.freelist = old.freelist; @@ -2672,7 +2675,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * ignore the node constraint */ if (unlikely(node != NUMA_NO_NODE && - !node_state(node, N_NORMAL_MEMORY))) + !node_isset(node, slab_nodes))) node = NUMA_NO_NODE; goto new_slab; } @@ -2683,7 +2686,7 @@ redo: * same as above but node_match() being false already * implies node != NUMA_NO_NODE */ - if (!node_state(node, N_NORMAL_MEMORY)) { + if (!node_isset(node, slab_nodes)) { node = NUMA_NO_NODE; goto redo; } else { @@ -3157,7 +3160,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x) if (!s) return; slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); - trace_kmem_cache_free(_RET_IP_, x); + trace_kmem_cache_free(_RET_IP_, x, s->name); } EXPORT_SYMBOL(kmem_cache_free); @@ -3266,7 +3269,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) if (!df.page) continue; - slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt, _RET_IP_); } while (likely(size)); } EXPORT_SYMBOL(kmem_cache_free_bulk); @@ -3423,6 +3426,7 @@ static inline int calculate_order(unsigned int size) unsigned int order; unsigned int min_objects; unsigned int max_objects; + unsigned int nr_cpus; /* * Attempt to find 
best configuration for a slab. This @@ -3433,8 +3437,21 @@ static inline int calculate_order(unsigned int size) * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; - if (!min_objects) - min_objects = 4 * (fls(num_online_cpus()) + 1); + if (!min_objects) { + /* + * Some architectures will only update present cpus when + * onlining them, so don't trust the number if it's just 1. But + * we also don't want to use nr_cpu_ids always, as on some other + * architectures, there can be many possible cpus, but never + * onlined. Here we compromise between trying to avoid too high + * order on systems that appear larger than they are, and too + * low order on systems that appear smaller than they are. + */ + nr_cpus = num_present_cpus(); + if (nr_cpus <= 1) + nr_cpus = nr_cpu_ids; + min_objects = 4 * (fls(nr_cpus) + 1); + } max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, max_objects); @@ -3572,7 +3589,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; - for_each_node_state(node, N_NORMAL_MEMORY) { + for_each_node_mask(node, slab_nodes) { struct kmem_cache_node *n; if (slab_state == DOWN) { @@ -3783,7 +3800,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { - s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); + s->flags = kmem_cache_flags(s->size, flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif @@ -3919,6 +3936,46 @@ int __kmem_cache_shutdown(struct kmem_cache *s) return 0; } +void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) +{ + void *base; + int __maybe_unused i; + unsigned int objnr; + void *objp; + void *objp0; + struct kmem_cache *s = page->slab_cache; + struct track __maybe_unused *trackp; + + kpp->kp_ptr = object; + kpp->kp_page = page; + kpp->kp_slab_cache = s; + base = page_address(page); + objp0 = 
kasan_reset_tag(object); +#ifdef CONFIG_SLUB_DEBUG + objp = restore_red_left(s, objp0); +#else + objp = objp0; +#endif + objnr = obj_to_index(s, page, objp); + kpp->kp_data_offset = (unsigned long)((char *)objp0 - (char *)objp); + objp = base + s->size * objnr; + kpp->kp_objp = objp; + if (WARN_ON_ONCE(objp < base || objp >= base + page->objects * s->size || (objp - base) % s->size) || + !(s->flags & SLAB_STORE_USER)) + return; +#ifdef CONFIG_SLUB_DEBUG + trackp = get_track(s, objp, TRACK_ALLOC); + kpp->kp_ret = (void *)trackp->addr; +#ifdef CONFIG_STACKTRACE + for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { + kpp->kp_stack[i] = (void *)trackp->addrs[i]; + if (!kpp->kp_stack[i]) + break; + } +#endif +#endif +} + /******************************************************************** * Kmalloc subsystem *******************************************************************/ @@ -3985,8 +4042,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) page = alloc_pages_node(node, flags, order); if (page) { ptr = page_address(page); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, - PAGE_SIZE << order); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + PAGE_SIZE << order); } return kmalloc_large_node_hook(ptr, size, flags); @@ -4117,8 +4174,8 @@ void kfree(const void *x) BUG_ON(!PageCompound(page)); kfree_hook(object); - mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B, - -(PAGE_SIZE << order)); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, + -(PAGE_SIZE << order)); __free_pages(page, order); return; } @@ -4213,8 +4270,6 @@ static int slab_mem_going_offline_callback(void *arg) static void slab_mem_offline_callback(void *arg) { - struct kmem_cache_node *n; - struct kmem_cache *s; struct memory_notify *marg = arg; int offline_node; @@ -4228,21 +4283,12 @@ static void slab_mem_offline_callback(void *arg) return; mutex_lock(&slab_mutex); - list_for_each_entry(s, &slab_caches, list) { - n = get_node(s, 
offline_node); - if (n) { - /* - * if n->nr_slabs > 0, slabs still exist on the node - * that is going down. We were unable to free them, - * and offline_pages() function shouldn't call this - * callback. So, we must fail. - */ - BUG_ON(slabs_node(s, offline_node)); - - s->node[offline_node] = NULL; - kmem_cache_free(kmem_cache_node, n); - } - } + node_clear(offline_node, slab_nodes); + /* + * We no longer free kmem_cache_node structures here, as it would be + * racy with all get_node() users, and infeasible to protect them with + * slab_mutex. + */ mutex_unlock(&slab_mutex); } @@ -4269,6 +4315,12 @@ static int slab_mem_going_online_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { /* + * The structure may already exist if the node was previously + * onlined and offlined. + */ + if (get_node(s, nid)) + continue; + /* * XXX: kmem_cache_alloc_node will fallback to other nodes * since memory is not yet available from the node that * is brought up. @@ -4281,6 +4333,11 @@ static int slab_mem_going_online_callback(void *arg) init_kmem_cache_node(n); s->node[nid] = n; } + /* + * Any cache created after this point will also have kmem_cache_node + * initialized for the new node. + */ + node_set(nid, slab_nodes); out: mutex_unlock(&slab_mutex); return ret; @@ -4361,6 +4418,7 @@ void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; + int node; if (debug_guardpage_minorder()) slub_max_order = 0; @@ -4368,6 +4426,13 @@ void __init kmem_cache_init(void) kmem_cache_node = &boot_kmem_cache_node; kmem_cache = &boot_kmem_cache; + /* + * Initialize the nodemask for which we will allocate per node + * structures. Here we don't need taking slab_mutex yet. 
+ */ + for_each_node_state(node, N_NORMAL_MEMORY) + node_set(node, slab_nodes); + create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); @@ -4878,22 +4943,6 @@ enum slab_stat_type { #define SO_OBJECTS (1 << SL_OBJECTS) #define SO_TOTAL (1 << SL_TOTAL) -#ifdef CONFIG_MEMCG -static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); - -static int __init setup_slub_memcg_sysfs(char *str) -{ - int v; - - if (get_option(&str, &v) > 0) - memcg_sysfs_enabled = v; - - return 1; -} - -__setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs); -#endif - static ssize_t show_slab_objects(struct kmem_cache *s, char *buf, unsigned long flags) { diff --git a/mm/swap.c b/mm/swap.c index 2cca7141470c..ab3258afcbeb 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -83,9 +83,8 @@ static void __page_cache_release(struct page *page) unsigned long flags; lruvec = lock_page_lruvec_irqsave(page, &flags); - VM_BUG_ON_PAGE(!PageLRU(page), page); - __ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_off_lru(page)); + del_page_from_lru_list(page, lruvec); + __clear_page_lru_flags(page); unlock_page_lruvec_irqrestore(lruvec, flags); } __ClearPageWaiters(page); @@ -229,9 +228,9 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec) { if (!PageUnevictable(page)) { - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); - add_page_to_lru_list_tail(page, lruvec, page_lru(page)); + add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, thp_nr_pages(page)); } } @@ -308,13 +307,11 @@ void lru_note_cost_page(struct page *page) static void __activate_page(struct page *page, struct lruvec *lruvec) { if (!PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, lru); + 
del_page_from_lru_list(page, lruvec); SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); trace_mm_lru_activate(page); __count_vm_events(PGACTIVATE, nr_pages); @@ -519,8 +516,7 @@ void lru_cache_add_inactive_or_unevictable(struct page *page, */ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) { - int lru; - bool active; + bool active = PageActive(page); int nr_pages = thp_nr_pages(page); if (PageUnevictable(page)) @@ -530,10 +526,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) if (page_mapped(page)) return; - active = PageActive(page); - lru = page_lru_base_type(page); - - del_page_from_lru_list(page, lruvec, lru + active); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); @@ -543,14 +536,14 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) * It can make readahead confusing. But race window * is _really_ small and it's non-critical problem. */ - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); SetPageReclaim(page); } else { /* * The page's writeback ends up during pagevec * We moves tha page into tail of inactive. 
*/ - add_page_to_lru_list_tail(page, lruvec, lru); + add_page_to_lru_list_tail(page, lruvec); __count_vm_events(PGROTATED, nr_pages); } @@ -564,13 +557,12 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec) static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec) { if (PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); __count_vm_events(PGDEACTIVATE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, @@ -582,11 +574,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) { if (PageAnon(page) && PageSwapBacked(page) && !PageSwapCache(page) && !PageUnevictable(page)) { - bool active = PageActive(page); int nr_pages = thp_nr_pages(page); - del_page_from_lru_list(page, lruvec, - LRU_INACTIVE_ANON + active); + del_page_from_lru_list(page, lruvec); ClearPageActive(page); ClearPageReferenced(page); /* @@ -595,7 +585,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec) * anonymous pages */ ClearPageSwapBacked(page); - add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); + add_page_to_lru_list(page, lruvec); __count_vm_events(PGLAZYFREE, nr_pages); __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, @@ -918,9 +908,8 @@ void release_pages(struct page **pages, int nr) if (prev_lruvec != lruvec) lock_batch = 0; - VM_BUG_ON_PAGE(!PageLRU(page), page); - __ClearPageLRU(page); - del_page_from_lru_list(page, lruvec, page_off_lru(page)); + del_page_from_lru_list(page, lruvec); + __clear_page_lru_flags(page); } __ClearPageWaiters(page); @@ -958,7 +947,6 @@ EXPORT_SYMBOL(__pagevec_release); static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) { - enum lru_list lru; 
int was_unevictable = TestClearPageUnevictable(page); int nr_pages = thp_nr_pages(page); @@ -994,19 +982,17 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec) smp_mb__after_atomic(); if (page_evictable(page)) { - lru = page_lru(page); if (was_unevictable) __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages); } else { - lru = LRU_UNEVICTABLE; ClearPageActive(page); SetPageUnevictable(page); if (!was_unevictable) __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages); } - add_page_to_lru_list(page, lruvec, lru); - trace_mm_lru_insertion(page, lru); + add_page_to_lru_list(page, lruvec); + trace_mm_lru_insertion(page); } /* diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 0357fbe70645..be9de6d5b516 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -193,8 +193,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, cache->slots_ret = NULL; } spin_unlock_irq(&cache->free_lock); - if (slots) - kvfree(slots); + kvfree(slots); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 751c1ef2fe0e..c1a648d9092b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -68,32 +68,6 @@ static struct { unsigned long find_total; } swap_cache_info; -unsigned long total_swapcache_pages(void) -{ - unsigned int i, j, nr; - unsigned long ret = 0; - struct address_space *spaces; - struct swap_info_struct *si; - - for (i = 0; i < MAX_SWAPFILES; i++) { - swp_entry_t entry = swp_entry(i, 1); - - /* Avoid get_swap_device() to warn for bad swap entry */ - if (!swp_swap_info(entry)) - continue; - /* Prevent swapoff to free swapper_spaces */ - si = get_swap_device(entry); - if (!si) - continue; - nr = nr_swapper_spaces[i]; - spaces = swapper_spaces[i]; - for (j = 0; j < nr; j++) - ret += spaces[j].nrpages; - put_swap_device(si); - } - return ret; -} - static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) @@ -163,6 +137,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, address_space->nrexceptional -= 
nr_shadows; address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); + __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); ADD_CACHE_INFO(add_total, nr); unlock: xas_unlock_irq(&xas); @@ -203,6 +178,7 @@ void __delete_from_swap_cache(struct page *page, address_space->nrexceptional += nr; address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); ADD_CACHE_INFO(del_total, nr); } @@ -537,7 +513,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, workingset_refault(page, shadow); /* Caller will initiate read into locked page */ - SetPageWorkingset(page); lru_cache_add(page); *new_page_allocated = true; return page; @@ -927,7 +902,7 @@ static struct attribute *swap_attrs[] = { NULL, }; -static struct attribute_group swap_attr_group = { +static const struct attribute_group swap_attr_group = { .attrs = swap_attrs, }; diff --git a/mm/swapfile.c b/mm/swapfile.c index 9fffc5af29d1..f039745989d2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,7 +47,6 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static sector_t map_swap_entry(swp_entry_t, struct block_device**); DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -1158,13 +1157,13 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry) return p; bad_offset: - pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val); + pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val); goto out; bad_device: - pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val); + pr_err("%s: %s%08lx\n", __func__, Unused_file, entry.val); goto out; bad_nofile: - pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val); + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); out: return NULL; } @@ -1181,7 +1180,7 @@ static struct swap_info_struct 
*_swap_info_get(swp_entry_t entry) return p; bad_free: - pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val); + pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val); out: return NULL; } @@ -1850,12 +1849,13 @@ int find_first_swap(dev_t *device) */ sector_t swapdev_block(int type, pgoff_t offset) { - struct block_device *bdev; struct swap_info_struct *si = swap_type_to_swap_info(type); + struct swap_extent *se; if (!si || !(si->flags & SWP_WRITEOK)) return 0; - return map_swap_entry(swp_entry(type, offset), &bdev); + se = offset_to_swap_extent(si, offset); + return se->start_block + (offset - se->start_page); } /* @@ -1951,8 +1951,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { - struct vm_fault vmf; - if (!is_swap_pte(*pte)) continue; @@ -1968,9 +1966,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, swap_map = &si->swap_map[offset]; page = lookup_swap_cache(entry, vma, addr); if (!page) { - vmf.vma = vma; - vmf.address = addr; - vmf.pmd = pmd; + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .pmd = pmd, + }; + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); } @@ -2282,36 +2283,6 @@ static void drain_mmlist(void) } /* - * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset for the specified swap entry. - * Note that the type of this function is sector_t, but it returns page offset - * into the bdev, not sector offset. - */ -static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) -{ - struct swap_info_struct *sis; - struct swap_extent *se; - pgoff_t offset; - - sis = swp_swap_info(entry); - *bdev = sis->bdev; - - offset = swp_offset(entry); - se = offset_to_swap_extent(sis, offset); - return se->start_block + (offset - se->start_page); -} - -/* - * Returns the page offset into bdev for the specified page's swap entry. 
- */ -sector_t map_swap_page(struct page *page, struct block_device **bdev) -{ - swp_entry_t entry; - entry.val = page_private(page); - return map_swap_entry(entry, bdev); -} - -/* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) diff --git a/mm/util.c b/mm/util.c index 8c9b7d1e7c49..54870226cea6 100644 --- a/mm/util.c +++ b/mm/util.c @@ -982,3 +982,34 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) kunmap_atomic(addr1); return ret; } + +/** + * mem_dump_obj - Print available provenance information + * @object: object for which to find provenance information. + * + * This function uses pr_cont(), so that the caller is expected to have + * printed out whatever preamble is appropriate. The provenance information + * depends on the type of object and on how much debugging is enabled. + * For example, for a slab-cache object, the slab name is printed, and, + * if available, the return address and stack trace from the allocation + * of that object. 
+ */ +void mem_dump_obj(void *object) +{ + if (kmem_valid_obj(object)) { + kmem_dump_obj(object); + return; + } + if (vmalloc_dump_obj(object)) + return; + if (!virt_addr_valid(object)) { + if (object == NULL) + pr_cont(" NULL pointer.\n"); + else if (object == ZERO_SIZE_PTR) + pr_cont(" zero-size pointer.\n"); + else + pr_cont(" non-paged memory.\n"); + return; + } + pr_cont(" non-slab/vmalloc memory.\n"); +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e6f352bf0498..4f5f8c907897 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3450,6 +3450,19 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) } #endif /* CONFIG_SMP */ +bool vmalloc_dump_obj(void *object) +{ + struct vm_struct *vm; + void *objp = (void *)PAGE_ALIGN((unsigned long)object); + + vm = find_vm_area(objp); + if (!vm) + return false; + pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n", + vm->nr_pages, (unsigned long)vm->addr, vm->caller); + return true; +} + #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_purge_lock) diff --git a/mm/vmscan.c b/mm/vmscan.c index b1b574ad199d..562e87cbd7a1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -310,7 +310,8 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list) */ -unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) +static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) { unsigned long size = 0; int zid; @@ -1539,19 +1540,17 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * page: page to consider * mode: one of the LRU isolation modes defined above * - * returns 0 on success, -ve errno on failure. + * returns true on success, false on failure. 
*/ -int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) +bool __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) { - int ret = -EBUSY; - /* Only take pages on the LRU. */ if (!PageLRU(page)) - return ret; + return false; /* Compaction should not handle unevictable pages but CMA can do so */ if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE)) - return ret; + return false; /* * To minimise LRU disruption, the caller can indicate that it only @@ -1564,7 +1563,7 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) if (mode & ISOLATE_ASYNC_MIGRATE) { /* All the caller can do on PageWriteback is block */ if (PageWriteback(page)) - return ret; + return false; if (PageDirty(page)) { struct address_space *mapping; @@ -1580,20 +1579,20 @@ int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode) * from the page cache. */ if (!trylock_page(page)) - return ret; + return false; mapping = page_mapping(page); migrate_dirty = !mapping || mapping->a_ops->migratepage; unlock_page(page); if (!migrate_dirty) - return ret; + return false; } } if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) - return ret; + return false; - return 0; + return true; } /* @@ -1677,35 +1676,31 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * only when the page is being freed somewhere else. */ scan += nr_pages; - switch (__isolate_lru_page_prepare(page, mode)) { - case 0: - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. - */ - if (unlikely(!get_page_unless_zero(page))) - goto busy; - - if (!TestClearPageLRU(page)) { - /* - * This page may in other isolation path, - * but we still hold lru_lock. 
- */ - put_page(page); - goto busy; - } - - nr_taken += nr_pages; - nr_zone_taken[page_zonenum(page)] += nr_pages; - list_move(&page->lru, dst); - break; + if (!__isolate_lru_page_prepare(page, mode)) { + /* It is being freed elsewhere */ + list_move(&page->lru, src); + continue; + } + /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) { + list_move(&page->lru, src); + continue; + } - default: -busy: - /* else it is being freed elsewhere */ + if (!TestClearPageLRU(page)) { + /* Another thread is already isolating this page */ + put_page(page); list_move(&page->lru, src); + continue; } + + nr_taken += nr_pages; + nr_zone_taken[page_zonenum(page)] += nr_pages; + list_move(&page->lru, dst); } /* @@ -1772,7 +1767,7 @@ int isolate_lru_page(struct page *page) get_page(page); lruvec = lock_page_lruvec_irq(page); - del_page_from_lru_list(page, lruvec, page_lru(page)); + del_page_from_lru_list(page, lruvec); unlock_page_lruvec_irq(lruvec); ret = 0; } @@ -1829,7 +1824,6 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); struct page *page; - enum lru_list lru; while (!list_empty(list)) { page = lru_to_page(list); @@ -1856,8 +1850,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); if (unlikely(put_page_testzero(page))) { - __ClearPageLRU(page); - __ClearPageActive(page); + __clear_page_lru_flags(page); if (unlikely(PageCompound(page))) { spin_unlock_irq(&lruvec->lru_lock); @@ -1874,11 +1867,8 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, * inhibits memcg migration). 
*/ VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page); - lru = page_lru(page); + add_page_to_lru_list(page, lruvec); nr_pages = thp_nr_pages(page); - - update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); - list_add(&page->lru, &lruvec->lists[lru]); nr_moved += nr_pages; if (PageActive(page)) workingset_age_nonresident(lruvec, nr_pages); @@ -4095,8 +4085,13 @@ module_init(kswapd_init) */ int node_reclaim_mode __read_mostly; -#define RECLAIM_WRITE (1<<0) /* Writeout pages during reclaim */ -#define RECLAIM_UNMAP (1<<1) /* Unmap pages during reclaim */ +/* + * These bit locations are exposed in the vm.zone_reclaim_mode sysctl + * ABI. New bits are OK, but existing bits can never change. + */ +#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for NODE_RECLAIM. This determines the fraction of pages @@ -4292,12 +4287,9 @@ void check_move_unevictable_pages(struct pagevec *pvec) lruvec = relock_page_lruvec_irq(page, lruvec); if (page_evictable(page) && PageUnevictable(page)) { - enum lru_list lru = page_lru_base_type(page); - - VM_BUG_ON_PAGE(PageActive(page), page); + del_page_from_lru_list(page, lruvec); ClearPageUnevictable(page); - del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); - add_page_to_lru_list(page, lruvec, lru); + add_page_to_lru_list(page, lruvec); pgrescued += nr_pages; } SetPageLRU(page); diff --git a/mm/vmstat.c b/mm/vmstat.c index f8942160fc95..a0e949542204 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1215,6 +1215,9 @@ const char * const vmstat_text[] = { "nr_shadow_call_stack", #endif "nr_page_table_pages", +#ifdef CONFIG_SWAP + "nr_swapcached", +#endif /* enum writeback_stat_item counters */ "nr_dirty_threshold", @@ -1619,8 +1622,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, if (is_zone_first_populated(pgdat, zone)) { seq_printf(m, 
"\n per-node stats"); for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { + unsigned long pages = node_page_state_pages(pgdat, i); + + if (vmstat_item_print_in_thp(i)) + pages /= HPAGE_PMD_NR; seq_printf(m, "\n %-12s %lu", node_stat_name(i), - node_page_state_pages(pgdat, i)); + pages); } } seq_printf(m, @@ -1740,8 +1747,11 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos) v += NR_VM_NUMA_STAT_ITEMS; #endif - for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) + for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) { v[i] = global_node_page_state_pages(i); + if (vmstat_item_print_in_thp(i)) + v[i] /= HPAGE_PMD_NR; + } v += NR_VM_NODE_STAT_ITEMS; global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, diff --git a/mm/workingset.c b/mm/workingset.c index 10e96de945b3..cd39902c1062 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -263,10 +263,10 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) VM_BUG_ON_PAGE(!PageLocked(page), page); lruvec = mem_cgroup_lruvec(target_memcg, pgdat); - workingset_age_nonresident(lruvec, thp_nr_pages(page)); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); + workingset_age_nonresident(lruvec, thp_nr_pages(page)); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -461,6 +461,8 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long pages; nodes = list_lru_shrink_count(&shadow_nodes, sc); + if (!nodes) + return SHRINK_EMPTY; /* * Approximate a reasonable limit for the nodes @@ -503,9 +505,6 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, max_nodes = pages >> (XA_CHUNK_SHIFT - 3); - if (!nodes) - return SHRINK_EMPTY; - if (nodes <= max_nodes) return 0; return nodes - max_nodes; diff --git a/mm/z3fold.c b/mm/z3fold.c index dacb0d70fa61..c1ccf6bb0ffb 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -413,16 +413,10 @@ static struct z3fold_header 
*init_z3fold_page(struct page *page, bool headless, if (!slots) return NULL; + memset(zhdr, 0, sizeof(*zhdr)); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); - zhdr->first_chunks = 0; - zhdr->middle_chunks = 0; - zhdr->last_chunks = 0; - zhdr->first_num = 0; - zhdr->start_middle = 0; zhdr->cpu = -1; - zhdr->foreign_handles = 0; - zhdr->mapped_count = 0; zhdr->slots = slots; zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); @@ -541,8 +535,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) spin_unlock(&pool->stale_lock); } -static void __attribute__((__unused__)) - release_z3fold_page(struct kref *ref) +static void release_z3fold_page(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); |