diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 82 | ||||
-rw-r--r-- | mm/Kconfig.debug | 1 | ||||
-rw-r--r-- | mm/Makefile | 7 | ||||
-rw-r--r-- | mm/cma.c | 23 | ||||
-rw-r--r-- | mm/cma_debug.c | 2 | ||||
-rw-r--r-- | mm/compaction.c | 41 | ||||
-rw-r--r-- | mm/debug.c | 6 | ||||
-rw-r--r-- | mm/filemap.c | 216 | ||||
-rw-r--r-- | mm/gup.c | 441 | ||||
-rw-r--r-- | mm/gup_benchmark.c | 5 | ||||
-rw-r--r-- | mm/hmm.c | 1086 | ||||
-rw-r--r-- | mm/huge_memory.c | 75 | ||||
-rw-r--r-- | mm/hugetlb.c | 195 | ||||
-rw-r--r-- | mm/kasan/Makefile | 9 | ||||
-rw-r--r-- | mm/kasan/common.c | 43 | ||||
-rw-r--r-- | mm/kasan/kasan.h | 5 | ||||
-rw-r--r-- | mm/kasan/report.c | 10 | ||||
-rw-r--r-- | mm/khugepaged.c | 7 | ||||
-rw-r--r-- | mm/kmemleak.c | 42 | ||||
-rw-r--r-- | mm/ksm.c | 6 | ||||
-rw-r--r-- | mm/madvise.c | 5 | ||||
-rw-r--r-- | mm/memblock.c | 82 | ||||
-rw-r--r-- | mm/memcontrol.c | 401 | ||||
-rw-r--r-- | mm/memfd.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 125 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 150 | ||||
-rw-r--r-- | mm/mempolicy.c | 40 | ||||
-rw-r--r-- | mm/migrate.c | 18 | ||||
-rw-r--r-- | mm/mincore.c | 23 | ||||
-rw-r--r-- | mm/mmap.c | 22 | ||||
-rw-r--r-- | mm/mmu_gather.c | 129 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 12 | ||||
-rw-r--r-- | mm/mprotect.c | 9 | ||||
-rw-r--r-- | mm/mremap.c | 3 | ||||
-rw-r--r-- | mm/nommu.c | 14 | ||||
-rw-r--r-- | mm/oom_kill.c | 3 | ||||
-rw-r--r-- | mm/page-writeback.c | 12 | ||||
-rw-r--r-- | mm/page_alloc.c | 418 | ||||
-rw-r--r-- | mm/page_isolation.c | 53 | ||||
-rw-r--r-- | mm/page_owner.c | 82 | ||||
-rw-r--r-- | mm/percpu-internal.h | 15 | ||||
-rw-r--r-- | mm/percpu-km.c | 2 | ||||
-rw-r--r-- | mm/percpu-stats.c | 5 | ||||
-rw-r--r-- | mm/percpu.c | 557 | ||||
-rw-r--r-- | mm/rmap.c | 10 | ||||
-rw-r--r-- | mm/shmem.c | 65 | ||||
-rw-r--r-- | mm/shuffle.c | 207 | ||||
-rw-r--r-- | mm/shuffle.h | 64 | ||||
-rw-r--r-- | mm/slab.c | 323 | ||||
-rw-r--r-- | mm/slab.h | 3 | ||||
-rw-r--r-- | mm/slab_common.c | 2 | ||||
-rw-r--r-- | mm/slob.c | 59 | ||||
-rw-r--r-- | mm/slub.c | 98 | ||||
-rw-r--r-- | mm/sparse.c | 18 | ||||
-rw-r--r-- | mm/swap.c | 2 | ||||
-rw-r--r-- | mm/swap_state.c | 4 | ||||
-rw-r--r-- | mm/swapfile.c | 32 | ||||
-rw-r--r-- | mm/userfaultfd.c | 3 | ||||
-rw-r--r-- | mm/util.c | 61 | ||||
-rw-r--r-- | mm/vmalloc.c | 1240 | ||||
-rw-r--r-- | mm/vmscan.c | 234 | ||||
-rw-r--r-- | mm/vmstat.c | 5 | ||||
-rw-r--r-- | mm/workingset.c | 10 | ||||
-rw-r--r-- | mm/z3fold.c | 638 |
64 files changed, 4923 insertions, 2639 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..ee8d1f311858 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -11,23 +11,24 @@ choice default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT default FLATMEM_MANUAL + help + This option allows you to change some of the ways that + Linux manages its memory internally. Most users will + only have one option here selected by the architecture + configuration. This is normal. config FLATMEM_MANUAL bool "Flat Memory" depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE help - This option allows you to change some of the ways that - Linux manages its memory internally. Most users will - only have one option here: FLATMEM. This is normal - and a correct option. - - Some users of more advanced features like NUMA and - memory hotplug may have different options here. - DISCONTIGMEM is a more mature, better tested system, - but is incompatible with memory hotplug and may suffer - decreased performance over SPARSEMEM. If unsure between - "Sparse Memory" and "Discontiguous Memory", choose - "Discontiguous Memory". + This option is best suited for non-NUMA systems with + flat address space. The FLATMEM is the most efficient + system in terms of performance and resource consumption + and it is the best option for smaller systems. + + For systems that have holes in their physical address + spaces and for features like NUMA and memory hotplug, + choose "Sparse Memory" If unsure, choose this option (Flat Memory) over any other. @@ -38,29 +39,26 @@ config DISCONTIGMEM_MANUAL This option provides enhanced support for discontiguous memory systems, over FLATMEM. These systems have holes in their physical address spaces, and this option provides - more efficient handling of these holes. However, the vast - majority of hardware has quite flat address spaces, and - can have degraded performance from the extra overhead that - this option imposes. + more efficient handling of these holes. - Many NUMA configurations will have this as the only option. + Although "Discontiguous Memory" is still used by several + architectures, it is considered deprecated in favor of + "Sparse Memory". - If unsure, choose "Flat Memory" over this option. + If unsure, choose "Sparse Memory" over this option. config SPARSEMEM_MANUAL bool "Sparse Memory" depends on ARCH_SPARSEMEM_ENABLE help This will be the only option for some systems, including - memory hotplug systems. This is normal. + memory hot-plug systems. This is normal. - For many other systems, this will be an alternative to - "Discontiguous Memory". This option provides some potential - performance benefits, along with decreased code complexity, - but it is newer, and more experimental. + This option provides efficient support for systems with + holes is their physical address space and allows memory + hot-plug and hot-remove. - If unsure, choose "Discontiguous Memory" or "Flat Memory" - over this option. + If unsure, choose "Flat Memory" over this option. endchoice @@ -136,7 +134,7 @@ config HAVE_MEMBLOCK_PHYS_MAP config HAVE_GENERIC_GUP bool -config ARCH_DISCARD_MEMBLOCK +config ARCH_KEEP_MEMBLOCK bool config MEMORY_ISOLATION @@ -161,7 +159,6 @@ config MEMORY_HOTPLUG_SPARSE config MEMORY_HOTPLUG_DEFAULT_ONLINE bool "Online the newly added memory blocks by default" - default n depends on MEMORY_HOTPLUG help This option sets the default policy setting for memory hotplug @@ -258,6 +255,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config CONTIG_ALLOC + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + config PHYS_ADDR_T_64BIT def_bool 64BIT @@ -436,7 +436,6 @@ config NEED_PER_CPU_KM config CLEANCACHE bool "Enable cleancache driver to cache clean pages if tmem is present" - default n help Cleancache can be thought of as a page-granularity victim cache for clean pages that the kernel's pageframe replacement algorithm @@ -460,7 +459,6 @@ config CLEANCACHE config FRONTSWAP bool "Enable frontswap to cache swap pages if tmem is present" depends on SWAP - default n help Frontswap is so named because it can be thought of as the opposite of a "backing" store for a swap device. The data is stored into @@ -532,7 +530,6 @@ config ZSWAP depends on FRONTSWAP && CRYPTO=y select CRYPTO_LZO select ZPOOL - default n help A lightweight compressed cache for swap pages. It takes pages that are in the process of being swapped out and attempts to @@ -549,14 +546,12 @@ config ZSWAP config ZPOOL tristate "Common API for compressed memory storage" - default n help Compressed memory storage API. This allows using either zbud or zsmalloc. config ZBUD tristate "Low (Up to 2x) density storage for compressed pages" - default n help A special purpose allocator for storing compressed pages. It is designed to store up to two compressed pages per physical @@ -567,7 +562,6 @@ config ZBUD config Z3FOLD tristate "Up to 3x density storage for compressed pages" depends on ZPOOL - default n help A special purpose allocator for storing compressed pages. It is designed to store up to three compressed pages per physical @@ -577,7 +571,6 @@ config Z3FOLD config ZSMALLOC tristate "Memory allocator for compressed pages" depends on MMU - default n help zsmalloc is a slab-based memory allocator designed to store compressed RAM pages. zsmalloc uses virtual memory mapping @@ -628,7 +621,6 @@ config MAX_STACK_SIZE_MB config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" - default n depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT @@ -676,6 +668,22 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM_MIRROR + bool + default y + depends on (X86_64 || PPC64) + depends on MMU && 64BIT + +config ARCH_HAS_HMM_DEVICE + bool + default y + depends on (X86_64 || PPC64) + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + depends on ARCH_HAS_ZONE_DEVICE + select XARRAY_MULTI + config ARCH_HAS_HMM bool default y @@ -694,12 +702,12 @@ config DEV_PAGEMAP_OPS config HMM bool + select MMU_NOTIFIER select MIGRATE_VMA_HELPER config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" depends on ARCH_HAS_HMM - select MMU_NOTIFIER select HMM help Select HMM_MIRROR if you want to mirror range of the CPU page table of a @@ -740,7 +748,6 @@ config ARCH_HAS_PKEYS config PERCPU_STATS bool "Collect percpu memory statistics" - default n help This feature collects and exposes statistics via debugfs. The information includes global and per chunk statistics, which can @@ -748,7 +755,6 @@ config PERCPU_STATS config GUP_BENCHMARK bool "Enable infrastructure for get_user_pages_fast() benchmarking" - default n help Provides /sys/kernel/debug/gup_benchmark that helps with testing performance of get_user_pages_fast(). diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index e3df921208c0..e980ceb775a4 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -33,7 +33,6 @@ config DEBUG_PAGEALLOC config DEBUG_PAGEALLOC_ENABLE_DEFAULT bool "Enable debug page memory allocations by default?" - default n depends on DEBUG_PAGEALLOC ---help--- Enable debug page memory allocations by default? This value diff --git a/mm/Makefile b/mm/Makefile index d210cc9d6f80..ac5e5ba78874 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,7 +33,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o endif obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o \ + maccess.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ @@ -41,6 +41,11 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ interval_tree.o list_lru.o workingset.o \ debug.o $(mmu-y) +# Give 'page_alloc' its own module-parameter namespace +page-alloc-y := page_alloc.o +page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o + +obj-y += page-alloc.o obj-y += init-mm.o obj-y += memblock.o @@ -106,8 +106,10 @@ static int __init cma_activate_area(struct cma *cma) cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!cma->bitmap) + if (!cma->bitmap) { + cma->count = 0; return -ENOMEM; + } WARN_ON_ONCE(!pfn_valid(pfn)); zone = page_zone(pfn_to_page(pfn)); @@ -367,23 +369,26 @@ err: #ifdef CONFIG_CMA_DEBUG static void cma_debug_show_areas(struct cma *cma) { - unsigned long next_zero_bit, next_set_bit; + unsigned long next_zero_bit, next_set_bit, nr_zero; unsigned long start = 0; - unsigned int nr_zero, nr_total = 0; + unsigned long nr_part, nr_total = 0; + unsigned long nbits = cma_bitmap_maxno(cma); mutex_lock(&cma->lock); pr_info("number of available pages: "); for (;;) { - next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start); - if (next_zero_bit >= cma->count) + next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); + if (next_zero_bit >= nbits) break; - next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit); + next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); nr_zero = next_set_bit - next_zero_bit; - pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit); - nr_total += nr_zero; + nr_part = nr_zero << cma->order_per_bit; + pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, + next_zero_bit); + nr_total += nr_part; start = next_zero_bit + nr_zero; } - pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count); + pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); mutex_unlock(&cma->lock); } #else diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 8d7b2fd52225..a7dd9e8e10d5 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -56,7 +56,7 @@ static int cma_maxchunk_get(void *data, u64 *val) mutex_lock(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); - if (start >= cma->count) + if (start >= bitmap_maxno) break; end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); diff --git a/mm/compaction.c b/mm/compaction.c index f171a83707ce..9febc8cc84e7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -242,6 +242,7 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, bool check_target) { struct page *page = pfn_to_online_page(pfn); + struct page *block_page; struct page *end_page; unsigned long block_pfn; @@ -267,20 +268,26 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, get_pageblock_migratetype(page) != MIGRATE_MOVABLE) return false; + /* Ensure the start of the pageblock or zone is online and valid */ + block_pfn = pageblock_start_pfn(pfn); + block_page = pfn_to_online_page(max(block_pfn, zone->zone_start_pfn)); + if (block_page) { + page = block_page; + pfn = block_pfn; + } + + /* Ensure the end of the pageblock or zone is online and valid */ + block_pfn += pageblock_nr_pages; + block_pfn = min(block_pfn, zone_end_pfn(zone) - 1); + end_page = pfn_to_online_page(block_pfn); + if (!end_page) + return false; + /* * Only clear the hint if a sample indicates there is either a * free page or an LRU page in the block. One or other condition * is necessary for the block to be a migration source/target. */ - block_pfn = pageblock_start_pfn(pfn); - pfn = max(block_pfn, zone->zone_start_pfn); - page = pfn_to_page(pfn); - if (zone != page_zone(page)) - return false; - pfn = block_pfn + pageblock_nr_pages; - pfn = min(pfn, zone_end_pfn(zone)); - end_page = pfn_to_page(pfn); - do { if (pfn_valid_within(pfn)) { if (check_source && PageLRU(page)) { @@ -309,7 +316,7 @@ __reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, static void __reset_isolation_suitable(struct zone *zone) { unsigned long migrate_pfn = zone->zone_start_pfn; - unsigned long free_pfn = zone_end_pfn(zone); + unsigned long free_pfn = zone_end_pfn(zone) - 1; unsigned long reset_migrate = free_pfn; unsigned long reset_free = migrate_pfn; bool source_set = false; @@ -1157,7 +1164,9 @@ static bool suitable_migration_target(struct compact_control *cc, static inline unsigned int freelist_scan_limit(struct compact_control *cc) { - return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; + unsigned short shift = BITS_PER_LONG - 1; + + return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1; } /* @@ -1221,7 +1230,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long /* Pageblock boundaries */ start_pfn = pageblock_start_pfn(pfn); - end_pfn = min(start_pfn + pageblock_nr_pages, zone_end_pfn(cc->zone)); + end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1; /* Scan before */ if (start_pfn != pfn) { @@ -1232,7 +1241,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long /* Scan after */ start_pfn = pfn + nr_isolated; - if (start_pfn != end_pfn) + if (start_pfn < end_pfn) isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ @@ -1363,7 +1372,7 @@ fast_isolate_freepages(struct compact_control *cc) count_compact_events(COMPACTISOLATED, nr_isolated); } else { /* If isolation fails, abort the search */ - order = -1; + order = cc->search_order + 1; page = NULL; } } @@ -1879,13 +1888,13 @@ static enum compact_result __compact_finished(struct compact_control *cc) bool can_steal; /* Job done if page is free of the right migratetype */ - if (!list_empty(&area->free_list[migratetype])) + if (!free_area_empty(area, migratetype)) return COMPACT_SUCCESS; #ifdef CONFIG_CMA /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ if (migratetype == MIGRATE_MOVABLE && - !list_empty(&area->free_list[MIGRATE_CMA])) + !free_area_empty(area, MIGRATE_CMA)) return COMPACT_SUCCESS; #endif /* diff --git a/mm/debug.c b/mm/debug.c index c0b31b6c3877..8345bb6e4769 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -67,7 +67,7 @@ void __dump_page(struct page *page, const char *reason) */ mapcount = PageSlab(page) ? 0 : page_mapcount(page); - pr_warn("page:%px count:%d mapcount:%d mapping:%px index:%#lx", + pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx", page, page_ref_count(page), mapcount, page->mapping, page_to_pgoff(page)); if (PageCompound(page)) @@ -79,7 +79,7 @@ void __dump_page(struct page *page, const char *reason) pr_warn("ksm "); else if (mapping) { pr_warn("%ps ", mapping->a_ops); - if (mapping->host->i_dentry.first) { + if (mapping->host && mapping->host->i_dentry.first) { struct dentry *dentry; dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias); pr_warn("name:\"%pd\" ", dentry); @@ -168,7 +168,7 @@ void dump_mm(const struct mm_struct *mm) mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, - atomic64_read(&mm->pinned_vm), + (u64)atomic64_read(&mm->pinned_vm), mm->data_vm, mm->exec_vm, mm->stack_vm, mm->start_code, mm->end_code, mm->start_data, mm->end_data, mm->start_brk, mm->brk, mm->start_stack, diff --git a/mm/filemap.c b/mm/filemap.c index d78f577baef2..c5af80c43d36 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -24,6 +24,7 @@ #include <linux/pagemap.h> #include <linux/file.h> #include <linux/uio.h> +#include <linux/error-injection.h> #include <linux/hash.h> #include <linux/writeback.h> #include <linux/backing-dev.h> @@ -279,11 +280,11 @@ EXPORT_SYMBOL(delete_from_page_cache); * @pvec: pagevec with pages to delete * * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index. + * from the mapping. The function expects @pvec to be sorted by page index + * and is optimised for it to be dense. * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the - * mapping as well. + * @pvec. * * The function expects the i_pages lock to be held. */ @@ -292,40 +293,44 @@ static void page_cache_delete_batch(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; - int i = 0, tail_pages = 0; + int i = 0; struct page *page; mapping_set_update(&xas, mapping); xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec) && !tail_pages) + if (i >= pagevec_count(pvec)) break; + + /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(page)) continue; - if (!tail_pages) { - /* - * Some page got inserted in our range? Skip it. We - * have our pages locked so they are protected from - * being removed. - */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > - pvec->pages[i]->index, page); - continue; - } - WARN_ON_ONCE(!PageLocked(page)); - if (PageTransHuge(page) && !PageHuge(page)) - tail_pages = HPAGE_PMD_NR - 1; + /* + * A page got inserted in our range? Skip it. We have our + * pages locked so they are protected from being removed. + * If we see a page whose index is higher than ours, it + * means our page has been removed, which shouldn't be + * possible because we're holding the PageLock. + */ + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, + page); + continue; + } + + WARN_ON_ONCE(!PageLocked(page)); + + if (page->index == xas.xa_index) page->mapping = NULL; - /* - * Leave page->index set: truncation lookup relies - * upon it - */ + /* Leave page->index set: truncation lookup relies on it */ + + /* + * Move to the next page in the vector if this is a regular + * page or the index is of the last sub-page of this compound + * page. + */ + if (page->index + (1UL << compound_order(page)) - 1 == + xas.xa_index) i++; - } else { - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages - != pvec->pages[i]->index, page); - tail_pages--; - } xas_store(&xas, NULL); total_pages++; } @@ -878,6 +883,7 @@ error: put_page(page); return xas_error(&xas); } +ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); /** * add_to_page_cache_locked - add a locked page to the pagecache @@ -1440,7 +1446,7 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_miss() - Find the next gap in the page cache. + * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. @@ -1491,7 +1497,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { XA_STATE(xas, &mapping->i_pages, offset); - struct page *head, *page; + struct page *page; rcu_read_lock(); repeat: @@ -1506,25 +1512,19 @@ repeat: if (!page || xa_is_value(page)) goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } - /* - * Has the page moved? + * Has the page moved or been split? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. */ if (unlikely(page != xas_reload(&xas))) { - put_page(head); + put_page(page); goto repeat; } + page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1706,7 +1706,6 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, ULONG_MAX) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1717,17 +1716,13 @@ unsigned find_get_entries(struct address_space *mapping, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -1736,7 +1731,7 @@ export: break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1778,33 +1773,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); xas_for_each(&xas, page, end) { - struct page *head; if (xas_retry(&xas, page)) continue; /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1849,7 +1838,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1859,24 +1847,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (xa_is_value(page)) break; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1912,7 +1895,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); xas_for_each_marked(&xas, page, end, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1923,26 +1905,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *index = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1964,72 +1941,6 @@ out: } EXPORT_SYMBOL(find_get_pages_range_tag); -/** - * find_get_entries_tag - find and return entries that match @tag - * @mapping: the address_space to search - * @start: the starting page cache index - * @tag: the tag index - * @nr_entries: the maximum number of entries - * @entries: where the resulting entries are placed - * @indices: the cache indices corresponding to the entries in @entries - * - * Like find_get_entries, except we only return entries which are tagged with - * @tag. - * - * Return: the number of entries which were found. - */ -unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - xa_mark_t tag, unsigned int nr_entries, - struct page **entries, pgoff_t *indices) -{ - XA_STATE(xas, &mapping->i_pages, start); - struct page *page; - unsigned int ret = 0; - - if (!nr_entries) - return 0; - - rcu_read_lock(); - xas_for_each_marked(&xas, page, ULONG_MAX, tag) { - struct page *head; - if (xas_retry(&xas, page)) - continue; - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - if (xa_is_value(page)) - goto export; - - head = compound_head(page); - if (!page_cache_get_speculative(head)) - goto retry; - - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - -export: - indices[ret] = xas.xa_index; - entries[ret] = page; - if (++ret == nr_entries) - break; - continue; -put_page: - put_page(head); -retry: - xas_reset(&xas); - } - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL(find_get_entries_tag); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: @@ -2691,7 +2602,7 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct page *page; rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2700,24 +2611,19 @@ void filemap_map_pages(struct vm_fault *vmf, if (xa_is_value(page)) goto next; - head = compound_head(page); - /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(head)) + if (PageLocked(page)) goto next; - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto next; - /* The page was split under us? */ - if (compound_head(page) != head) - goto skip; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto skip; + page = find_subpage(page, xas.xa_index); if (!PageUptodate(page) || PageReadahead(page) || @@ -28,6 +28,111 @@ struct follow_page_context { unsigned int page_mask; }; +typedef int (*set_dirty_func_t)(struct page *page); + +static void __put_user_pages_dirty(struct page **pages, + unsigned long npages, + set_dirty_func_t sdf) +{ + unsigned long index; + + for (index = 0; index < npages; index++) { + struct page *page = compound_head(pages[index]); + + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key cases: + * + * 1) This code sees the page as already dirty, so it skips + * the call to sdf(). That could happen because + * clear_page_dirty_for_io() called page_mkclean(), + * followed by set_page_dirty(). However, now the page is + * going to get written back, which meets the original + * intention of setting it dirty, so all is well: + * clear_page_dirty_for_io() goes on to call + * TestClearPageDirty(), and write the page back. + * + * 2) This code sees the page as clean, so it calls sdf(). + * The page stays dirty, despite being written back, so it + * gets written back again in the next writeback cycle. + * This is harmless. + */ + if (!PageDirty(page)) + sdf(page); + + put_user_page(page); + } +} + +/** + * put_user_pages_dirty() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * "gup-pinned page" refers to a page that has had one of the get_user_pages() + * variants called on that page. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * set_page_dirty(), which does not lock the page, is used here. + * Therefore, it is the caller's responsibility to ensure that this is + * safe. If not, then put_user_pages_dirty_lock() should be called instead. + * + */ +void put_user_pages_dirty(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty); +} +EXPORT_SYMBOL(put_user_pages_dirty); + +/** + * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * This is just like put_user_pages_dirty(), except that it invokes + * set_page_dirty_lock(), instead of set_page_dirty(). + * + */ +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty_lock); +} +EXPORT_SYMBOL(put_user_pages_dirty_lock); + +/** + * put_user_pages() - release an array of gup-pinned pages. + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, release the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + */ +void put_user_pages(struct page **pages, unsigned long npages) +{ + unsigned long index; + + /* + * TODO: this can be optimized for huge pages: if a series of pages is + * physically contiguous and part of the same compound page, then a + * single operation to the head page should suffice. + */ + for (index = 0; index < npages; index++) + put_user_page(pages[index]); +} +EXPORT_SYMBOL(put_user_pages); + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { @@ -160,8 +265,12 @@ retry: goto retry; } - if (flags & FOLL_GET) - get_page(page); + if (flags & FOLL_GET) { + if (unlikely(!try_get_page(page))) { + page = ERR_PTR(-ENOMEM); + goto out; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -298,7 +407,10 @@ retry_locked: if (pmd_trans_unstable(pmd)) ret = -EBUSY; } else { - get_page(page); + if (unlikely(!try_get_page(page))) { + spin_unlock(ptl); + return ERR_PTR(-ENOMEM); + } spin_unlock(ptl); lock_page(page); ret = split_huge_page(page); @@ -500,7 +612,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if (is_device_public_page(*page)) goto unmap; } - get_page(*page); + if (unlikely(!try_get_page(*page))) { + ret = -ENOMEM; + goto unmap; + } out: ret = 0; unmap: @@ -1008,6 +1123,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); @@ -1036,6 +1160,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + down_read(&mm->mmap_sem); ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); @@ -1106,32 +1239,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); -/* - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. - */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages); - #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) - -#ifdef CONFIG_FS_DAX static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { long i; @@ -1150,12 +1273,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } return false; } -#else -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - return false; -} -#endif #ifdef CONFIG_CMA static struct page *new_non_cma_page(struct page *page, unsigned long private) @@ -1209,10 +1326,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) return __alloc_pages_node(nid, gfp_mask, 0); } -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + unsigned int gup_flags) { long i; bool drain_allow = true; @@ -1268,10 +1388,14 @@ check_again: putback_movable_pages(&cma_page_list); } /* - * We did migrate all the pages, Try to get the page references again - * migrating any new CMA pages which we failed to isolate earlier. + * We did migrate all the pages, Try to get the page references + * again migrating any new CMA pages which we failed to isolate + * earlier. */ - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if ((nr_pages > 0) && migrate_allow) { drain_allow = true; goto check_again; @@ -1281,66 +1405,101 @@ check_again: return nr_pages; } #else -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, - struct page **pages, - struct vm_area_struct **vmas) +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } #endif /* - * This is the same as get_user_pages() in that it assumes we are - * operating on the current task's mm, but it goes further to validate - * that the vmas associated with the address range are suitable for - * longterm elevated page reference counts. For example, filesystem-dax - * mappings are subject to the lifetime enforced by the filesystem and - * we need guarantees that longterm users like RDMA and V4L2 only - * establish mappings that have a kernel enforced revocation mechanism. - * - * "longterm" == userspace controlled elevated page count lifetime. - * Contrast this to iov_iter_get_pages() usages which are transient. + * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which + * allows us to process the FOLL_LONGTERM flag. */ -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) +static long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { - struct vm_area_struct **vmas = vmas_arg; - unsigned long flags; + struct vm_area_struct **vmas_tmp = vmas; + unsigned long flags = 0; long rc, i; - if (!pages) - return -EINVAL; - - if (!vmas) { - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - return -ENOMEM; + if (gup_flags & FOLL_LONGTERM) { + if (!pages) + return -EINVAL; + + if (!vmas_tmp) { + vmas_tmp = kcalloc(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas_tmp) + return -ENOMEM; + } + flags = memalloc_nocma_save(); } - flags = memalloc_nocma_save(); - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - memalloc_nocma_restore(flags); - if (rc < 0) - goto out; + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + vmas_tmp, NULL, gup_flags); - if (check_dax_vmas(vmas, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; + if (gup_flags & FOLL_LONGTERM) { + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + vmas_tmp, gup_flags); } - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: - if (vmas != vmas_arg) - kfree(vmas); + if (vmas_tmp != vmas) + kfree(vmas_tmp); return rc; } -EXPORT_SYMBOL(get_user_pages_longterm); -#endif /* CONFIG_FS_DAX */ +#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, flags); +} +#endif /* CONFIG_FS_DAX || CONFIG_CMA */ + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); /** * populate_vma_page_range() - populate a range of pages in the vma. @@ -1545,9 +1704,23 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } +/* + * Return the compund head page with ref appropriately incremented, + * or NULL if that failed. + */ +static inline struct page *try_get_compound_head(struct page *page, int refs) +{ + struct page *head = compound_head(page); + if (WARN_ON_ONCE(page_ref_count(head) < 0)) + return NULL; + if (unlikely(!page_cache_add_speculative(head, refs))) + return NULL; + return head; +} + #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; @@ -1565,10 +1738,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (pte_protnone(pte)) goto pte_unmap; - if (!pte_access_permitted(pte, write)) + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { + if (unlikely(flags & FOLL_LONGTERM)) + goto pte_unmap; + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, pages); @@ -1579,9 +1755,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) + head = try_get_compound_head(page, 1); + if (!head) goto pte_unmap; if (unlikely(pte_val(pte) != pte_val(*ptep))) { @@ -1617,7 +1793,7 @@ pte_unmap: * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { return 0; } @@ -1700,16 +1876,19 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, #endif static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (!pmd_access_permitted(orig, write)) + if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pmd_devmap(orig)) + if (pmd_devmap(orig)) { + if (unlikely(flags & FOLL_LONGTERM)) + return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); + } refs = 0; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1720,8 +1899,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pmd_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pmd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1738,16 +1917,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (!pud_access_permitted(orig, write)) + if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pud_devmap(orig)) + if (pud_devmap(orig)) { + if (unlikely(flags & FOLL_LONGTERM)) + return 0; return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); + } refs = 0; page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); @@ -1758,8 +1940,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pud_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pud_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1776,13 +1958,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, int write, + unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *head, *page; - if (!pgd_access_permitted(orig, write)) + if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); @@ -1795,8 +1977,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs++; } while (addr += PAGE_SIZE, addr != end); - head = compound_head(pgd_page(orig)); - if (!page_cache_add_speculative(head, refs)) { + head = try_get_compound_head(pgd_page(orig), refs); + if (!head) { *nr -= refs; return 0; } @@ -1813,7 +1995,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, } static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; @@ -1836,7 +2018,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (pmd_protnone(pmd)) return 0; - if (!gup_huge_pmd(pmd, pmdp, addr, next, write, + if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; @@ -1846,9 +2028,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, - PMD_SHIFT, next, write, pages, nr)) + PMD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -1856,7 +2038,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, } static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; @@ -1869,14 +2051,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, if (pud_none(pud)) return 0; if (unlikely(pud_huge(pud))) { - if (!gup_huge_pud(pud, pudp, addr, next, write, + if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, - PUD_SHIFT, next, write, pages, nr)) + PUD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); @@ -1884,7 +2066,7 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, } static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; @@ -1899,9 +2081,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, - P4D_SHIFT, next, write, pages, nr)) + P4D_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -1909,7 +2091,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, } static void gup_pgd_range(unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; @@ -1922,14 +2104,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { - if (!gup_huge_pgd(pgd, pgdp, addr, next, write, + if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, - PGDIR_SHIFT, next, write, pages, nr)) + PGDIR_SHIFT, next, flags, pages, nr)) return; - } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr)) + } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -1983,18 +2165,41 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_save(flags); - gup_pgd_range(start, end, write, pages, &nr); + gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); } return nr; } +static int __gup_longterm_unlocked(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int ret; + + /* + * FIXME: FOLL_LONGTERM does not work with + * get_user_pages_unlocked() (see comments in that function) + */ + if (gup_flags & FOLL_LONGTERM) { + down_read(¤t->mm->mmap_sem); + ret = __gup_longterm_locked(current, current->mm, + start, nr_pages, + pages, NULL, gup_flags); + up_read(¤t->mm->mmap_sem); + } else { + ret = get_user_pages_unlocked(start, nr_pages, + pages, gup_flags); + } + + return ret; +} + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -2006,8 +2211,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; @@ -2025,7 +2230,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); - gup_pgd_range(addr, end, write, pages, &nr); + gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); ret = nr; } @@ -2035,8 +2240,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); + ret = __gup_longterm_unlocked(start, nr_pages - nr, + gup_flags, pages); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6c0279e70cc4..7dd602d7f8db 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, pages + i); break; case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages_longterm(addr, nr, gup->flags & 1, - pages + i, NULL); + nr = get_user_pages(addr, nr, + (gup->flags & 1) | FOLL_LONGTERM, + pages + i, NULL); break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, @@ -30,6 +30,7 @@ #include <linux/hugetlb.h> #include <linux/memremap.h> #include <linux/jump_label.h> +#include <linux/dma-mapping.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> @@ -38,54 +39,48 @@ #if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -/* - * struct hmm - HMM per mm struct - * - * @mm: mm struct this HMM struct is bound to - * @lock: lock protecting ranges list - * @ranges: list of range being snapshotted - * @mirrors: list of mirrors for this mm - * @mmu_notifier: mmu notifier to track updates to CPU page table - * @mirrors_sem: read/write semaphore protecting the mirrors list - */ -struct hmm { - struct mm_struct *mm; - spinlock_t lock; - struct list_head ranges; - struct list_head mirrors; - struct mmu_notifier mmu_notifier; - struct rw_semaphore mirrors_sem; -}; +static inline struct hmm *mm_get_hmm(struct mm_struct *mm) +{ + struct hmm *hmm = READ_ONCE(mm->hmm); -/* - * hmm_register - register HMM against an mm (HMM internal) + if (hmm && kref_get_unless_zero(&hmm->kref)) + return hmm; + + return NULL; +} + +/** + * hmm_get_or_create - register HMM against an mm (HMM internal) * * @mm: mm struct to attach to + * Returns: returns an HMM object, either by referencing the existing + * (per-process) object, or by creating a new one. * - * This is not intended to be used directly by device drivers. It allocates an - * HMM struct if mm does not have one, and initializes it. + * This is not intended to be used directly by device drivers. If mm already + * has an HMM struct then it get a reference on it and returns it. Otherwise + * it allocates an HMM struct, initializes it, associate it with the mm and + * returns it. */ -static struct hmm *hmm_register(struct mm_struct *mm) +static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = READ_ONCE(mm->hmm); + struct hmm *hmm = mm_get_hmm(mm); bool cleanup = false; - /* - * The hmm struct can only be freed once the mm_struct goes away, - * hence we should always have pre-allocated an new hmm struct - * above. - */ if (hmm) return hmm; hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) return NULL; + init_waitqueue_head(&hmm->wq); INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); - spin_lock_init(&hmm->lock); + mutex_init(&hmm->lock); + kref_init(&hmm->kref); + hmm->notifiers = 0; + hmm->dead = false; hmm->mm = mm; spin_lock(&mm->page_table_lock); @@ -106,7 +101,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) goto error_mm; - return mm->hmm; + return hmm; error_mm: spin_lock(&mm->page_table_lock); @@ -118,54 +113,60 @@ error: return NULL; } -void hmm_mm_destroy(struct mm_struct *mm) +static void hmm_free(struct kref *kref) { - kfree(mm->hmm); -} + struct hmm *hmm = container_of(kref, struct hmm, kref); + struct mm_struct *mm = hmm->mm; -static int hmm_invalidate_range(struct hmm *hmm, bool device, - const struct hmm_update *update) -{ - struct hmm_mirror *mirror; - struct hmm_range *range; - - spin_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) { - unsigned long addr, idx, npages; + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); - if (update->end < range->start || update->start >= range->end) - continue; + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); - range->valid = false; - addr = max(update->start, range->start); - idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; - memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); - } - spin_unlock(&hmm->lock); + kfree(hmm); +} - if (!device) - return 0; +static inline void hmm_put(struct hmm *hmm) +{ + kref_put(&hmm->kref, hmm_free); +} - down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) { - int ret; +void hmm_mm_destroy(struct mm_struct *mm) +{ + struct hmm *hmm; - ret = mirror->ops->sync_cpu_device_pagetables(mirror, update); - if (!update->blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); - return -EAGAIN; - } + spin_lock(&mm->page_table_lock); + hmm = mm_get_hmm(mm); + mm->hmm = NULL; + if (hmm) { + hmm->mm = NULL; + hmm->dead = true; + spin_unlock(&mm->page_table_lock); + hmm_put(hmm); + return; } - up_read(&hmm->mirrors_sem); - return 0; + spin_unlock(&mm->page_table_lock); } static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { + struct hmm *hmm = mm_get_hmm(mm); struct hmm_mirror *mirror; - struct hmm *hmm = mm->hmm; + struct hmm_range *range; + + /* Report this HMM as dying. */ + hmm->dead = true; + + /* Wake-up everyone waiting on any range. */ + mutex_lock(&hmm->lock); + list_for_each_entry(range, &hmm->ranges, list) { + range->valid = false; + } + wake_up_all(&hmm->wq); + mutex_unlock(&hmm->lock); down_write(&hmm->mirrors_sem); mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, @@ -186,36 +187,86 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) struct hmm_mirror, list); } up_write(&hmm->mirrors_sem); + + hmm_put(hmm); } static int hmm_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) + const struct mmu_notifier_range *nrange) { + struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm_mirror *mirror; struct hmm_update update; - struct hmm *hmm = range->mm->hmm; + struct hmm_range *range; + int ret = 0; VM_BUG_ON(!hmm); - update.start = range->start; - update.end = range->end; + update.start = nrange->start; + update.end = nrange->end; update.event = HMM_UPDATE_INVALIDATE; - update.blockable = range->blockable; - return hmm_invalidate_range(hmm, true, &update); + update.blockable = mmu_notifier_range_blockable(nrange); + + if (mmu_notifier_range_blockable(nrange)) + mutex_lock(&hmm->lock); + else if (!mutex_trylock(&hmm->lock)) { + ret = -EAGAIN; + goto out; + } + hmm->notifiers++; + list_for_each_entry(range, &hmm->ranges, list) { + if (update.end < range->start || update.start >= range->end) + continue; + + range->valid = false; + } + mutex_unlock(&hmm->lock); + + if (mmu_notifier_range_blockable(nrange)) + down_read(&hmm->mirrors_sem); + else if (!down_read_trylock(&hmm->mirrors_sem)) { + ret = -EAGAIN; + goto out; + } + list_for_each_entry(mirror, &hmm->mirrors, list) { + int ret; + + ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + if (!update.blockable && ret == -EAGAIN) { + up_read(&hmm->mirrors_sem); + ret = -EAGAIN; + goto out; + } + } + up_read(&hmm->mirrors_sem); + +out: + hmm_put(hmm); + return ret; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) + const struct mmu_notifier_range *nrange) { - struct hmm_update update; - struct hmm *hmm = range->mm->hmm; + struct hmm *hmm = mm_get_hmm(nrange->mm); VM_BUG_ON(!hmm); - update.start = range->start; - update.end = range->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = true; - hmm_invalidate_range(hmm, false, &update); + mutex_lock(&hmm->lock); + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; + + list_for_each_entry(range, &hmm->ranges, list) { + if (range->valid) + continue; + range->valid = true; + } + wake_up_all(&hmm->wq); + } + mutex_unlock(&hmm->lock); + + hmm_put(hmm); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { @@ -241,24 +292,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; -again: - mirror->hmm = hmm_register(mm); + mirror->hmm = hmm_get_or_create(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - if (mirror->hmm->mm == NULL) { - /* - * A racing hmm_mirror_unregister() is about to destroy the hmm - * struct. Try again to allocate a new one. - */ - up_write(&mirror->hmm->mirrors_sem); - mirror->hmm = NULL; - goto again; - } else { - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); - } + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); return 0; } @@ -273,38 +313,24 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - bool should_unregister = false; - struct mm_struct *mm; - struct hmm *hmm; + struct hmm *hmm = READ_ONCE(mirror->hmm); - if (mirror->hmm == NULL) + if (hmm == NULL) return; - hmm = mirror->hmm; down_write(&hmm->mirrors_sem); list_del_init(&mirror->list); - should_unregister = list_empty(&hmm->mirrors); + /* To protect us against double unregister ... */ mirror->hmm = NULL; - mm = hmm->mm; - hmm->mm = NULL; up_write(&hmm->mirrors_sem); - if (!should_unregister || mm == NULL) - return; - - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); - - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); - - kfree(hmm); + hmm_put(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); struct hmm_vma_walk { struct hmm_range *range; + struct dev_pagemap *pgmap; unsigned long last; bool fault; bool block; @@ -323,13 +349,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) - return -EBUSY; + return -EAGAIN; if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } - return -EAGAIN; + return -EBUSY; } static int hmm_pfns_bad(unsigned long addr, @@ -355,7 +381,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * Returns: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -367,23 +393,25 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i, page_size; hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { + page_size = hmm_range_page_size(range); + i = (addr - range->start) >> range->page_shift; + + for (; addr < end; addr += page_size, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; ret = hmm_vma_do_fault(walk, addr, write_fault, &pfns[i]); - if (ret != -EAGAIN) + if (ret != -EBUSY) return ret; } } - return (fault || write_fault) ? -EAGAIN : 0; + return (fault || write_fault) ? -EBUSY : 0; } static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, @@ -392,10 +420,21 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { struct hmm_range *range = hmm_vma_walk->range; - *fault = *write_fault = false; if (!hmm_vma_walk->fault) return; + /* + * So we not only consider the individual per page request we also + * consider the default flags requested for the range. The API can + * be use in 2 fashions. The first one where the HMM user coalesce + * multiple page fault into one request and set flags per pfns for + * of those faults. The second one where the HMM user want to pre- + * fault a range with specific flags. For the latter one it is a + * waste to have the user pre-fill the pfn arrays with a default + * flags value. + */ + pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; @@ -431,10 +470,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, return; } + *fault = *write_fault = false; for (i = 0; i < npages; ++i) { hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, fault, write_fault); - if ((*fault) || (*write_fault)) + if ((*write_fault)) return; } } @@ -465,12 +505,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return pud_write(pud) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, uint64_t *pfns, pmd_t pmd) { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; @@ -486,10 +536,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + if (pmd_devmap(pmd)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } hmm_vma_walk->last = end; return 0; +#else + /* If THP is not enabled then we should never reach that code ! */ + return -EINVAL; +#endif } static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) @@ -514,11 +579,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t orig_pfn = *pfn; *pfn = range->values[HMM_PFN_NONE]; - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); + fault = write_fault = false; if (pte_none(pte)) { + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, + &fault, &write_fault); if (fault || write_fault) goto fault; return 0; @@ -546,7 +611,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, &fault, &write_fault); if (fault || write_fault) goto fault; - *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn = hmm_device_entry_from_pfn(range, + swp_offset(entry)); *pfn |= cpu_flags; return 0; } @@ -557,7 +623,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm, pmdp, addr); - return -EAGAIN; + return -EBUSY; } return 0; } @@ -565,15 +631,33 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, /* Report error for everything else */ *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; + } else { + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); } if (fault || write_fault) goto fault; - *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; + if (pte_devmap(pte)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + + *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); @@ -615,7 +699,7 @@ again: if (fault || write_fault) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(vma->vm_mm, pmdp); - return -EAGAIN; + return -EBUSY; } return 0; } else if (!pmd_present(pmd)) @@ -661,12 +745,158 @@ again: return r; } } + if (hmm_vma_walk->pgmap) { + /* + * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() + * so that we can leverage get_dev_pagemap() optimization which + * will not re-take a reference on a pgmap if we already have + * one. + */ + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; return 0; } +static int hmm_vma_walk_pud(pud_t *pudp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long addr = start, next; + pmd_t *pmdp; + pud_t pud; + int ret; + +again: + pud = READ_ONCE(*pudp); + if (pud_none(pud)) + return hmm_vma_walk_hole(start, end, walk); + + if (pud_huge(pud) && pud_devmap(pud)) { + unsigned long i, npages, pfn; + uint64_t *pfns, cpu_flags; + bool fault, write_fault; + + if (!pud_present(pud)) + return hmm_vma_walk_hole(start, end, walk); + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + + cpu_flags = pud_to_hmm_pfn_flags(range, pud); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + cpu_flags, &fault, &write_fault); + if (fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, + write_fault, walk); + +#ifdef CONFIG_HUGETLB_PAGE + pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + for (i = 0; i < npages; ++i, ++pfn) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | + cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } + hmm_vma_walk->last = end; + return 0; +#else + return -EINVAL; +#endif + } + + split_huge_pud(walk->vma, pudp, addr); + if (pud_none(*pudp)) + goto again; + + pmdp = pmd_offset(pudp, addr); + do { + next = pmd_addr_end(addr, end); + ret = hmm_vma_walk_pmd(pmdp, addr, next, walk); + if (ret) + return ret; + } while (pmdp++, addr = next, addr != end); + + return 0; +} + +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned long addr = start, i, pfn, mask, size, pfn_inc; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + uint64_t orig_pfn, cpu_flags; + bool fault, write_fault; + spinlock_t *ptl; + pte_t entry; + int ret = 0; + + size = 1UL << huge_page_shift(h); + mask = size - 1; + if (range->page_shift != PAGE_SHIFT) { + /* Make sure we are looking at full page. */ + if (start & mask) + return -EINVAL; + if (end < (start + size)) + return -EINVAL; + pfn_inc = size >> PAGE_SHIFT; + } else { + pfn_inc = 1; + size = PAGE_SIZE; + } + + + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + entry = huge_ptep_get(pte); + + i = (start - range->start) >> range->page_shift; + orig_pfn = range->pfns[i]; + range->pfns[i] = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, entry); + fault = write_fault = false; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) { + ret = -ENOENT; + goto unlock; + } + + pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); + for (; addr < end; addr += size, i++, pfn += pfn_inc) + range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | + cpu_flags; + hmm_vma_walk->last = end; + +unlock: + spin_unlock(ptl); + + if (ret == -ENOENT) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + + return ret; +#else /* CONFIG_HUGETLB_PAGE */ + return -EINVAL; +#endif +} + static void hmm_pfns_clear(struct hmm_range *range, uint64_t *pfns, unsigned long addr, @@ -676,279 +906,437 @@ static void hmm_pfns_clear(struct hmm_range *range, *pfns = range->values[HMM_PFN_NONE]; } -static void hmm_pfns_special(struct hmm_range *range) -{ - unsigned long addr = range->start, i = 0; - - for (; addr < range->end; addr += PAGE_SIZE, i++) - range->pfns[i] = range->values[HMM_PFN_SPECIAL]; -} - /* - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @range: range being snapshotted - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * vma permission, 0 success - * - * This snapshots the CPU page table for a range of virtual addresses. Snapshot - * validity is tracked by range struct. See hmm_vma_range_done() for further - * information. - * - * The range struct is initialized here. It tracks the CPU page table, but only - * if the function returns success (0), in which case the caller must then call - * hmm_vma_range_done() to stop CPU page table update tracking on this range. + * hmm_range_register() - start tracking change to CPU page table over a range + * @range: range + * @mm: the mm struct for the range of virtual address + * @start: start virtual address (inclusive) + * @end: end virtual address (exclusive) + * @page_shift: expect page shift for the range + * Returns 0 on success, -EFAULT if the address space is no longer valid * - * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS - * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! + * Track updates to the CPU page table see include/linux/hmm.h */ -int hmm_vma_get_pfns(struct hmm_range *range) +int hmm_range_register(struct hmm_range *range, + struct mm_struct *mm, + unsigned long start, + unsigned long end, + unsigned page_shift) { - struct vm_area_struct *vma = range->vma; - struct hmm_vma_walk hmm_vma_walk; - struct mm_walk mm_walk; - struct hmm *hmm; + unsigned long mask = ((1UL << page_shift) - 1UL); + + range->valid = false; + range->hmm = NULL; - /* Sanity check, this really should not happen ! */ - if (range->start < vma->vm_start || range->start >= vma->vm_end) + if ((start & mask) || (end & mask)) return -EINVAL; - if (range->end < vma->vm_start || range->end > vma->vm_end) + if (start >= end) return -EINVAL; - hmm = hmm_register(vma->vm_mm); - if (!hmm) - return -ENOMEM; - /* Caller must have registered a mirror, via hmm_mirror_register() ! */ - if (!hmm->mmu_notifier.ops) - return -EINVAL; + range->page_shift = page_shift; + range->start = start; + range->end = end; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) { - hmm_pfns_special(range); - return -EINVAL; - } + range->hmm = hmm_get_or_create(mm); + if (!range->hmm) + return -EFAULT; - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it does - * not allow write access, either. Architecture that allow - * write without read access are not supported by HMM, because - * operations such has atomic access would not work. - */ - hmm_pfns_clear(range, range->pfns, range->start, range->end); - return -EPERM; + /* Check if hmm_mm_destroy() was call. */ + if (range->hmm->mm == NULL || range->hmm->dead) { + hmm_put(range->hmm); + return -EFAULT; } /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); - - hmm_vma_walk.fault = false; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - - walk_page_range(range->start, range->end, &mm_walk); + mutex_lock(&range->hmm->lock); + + list_add_rcu(&range->list, &range->hmm->ranges); + + /* + * If there are any concurrent notifiers we have to wait for them for + * the range to be valid (see hmm_range_wait_until_valid()). + */ + if (!range->hmm->notifiers) + range->valid = true; + mutex_unlock(&range->hmm->lock); + return 0; } -EXPORT_SYMBOL(hmm_vma_get_pfns); +EXPORT_SYMBOL(hmm_range_register); /* - * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @range: range being tracked - * Returns: false if range data has been invalidated, true otherwise + * hmm_range_unregister() - stop tracking change to CPU page table over a range + * @range: range * * Range struct is used to track updates to the CPU page table after a call to - * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done - * using the data, or wants to lock updates to the data it got from those - * functions, it must call the hmm_vma_range_done() function, which will then - * stop tracking CPU page table updates. - * - * Note that device driver must still implement general CPU page table update - * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using - * the mmu_notifier API directly. - * - * CPU page table update tracking done through hmm_range is only temporary and - * to be used while trying to duplicate CPU page table contents for a range of - * virtual addresses. - * - * There are two ways to use this : - * again: - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); - * trans = device_build_page_table_update_transaction(pfns); - * device_page_table_lock(); - * if (!hmm_vma_range_done(range)) { - * device_page_table_unlock(); - * goto again; - * } - * device_commit_transaction(trans); - * device_page_table_unlock(); + * hmm_range_register(). See include/linux/hmm.h for how to use it. + */ +void hmm_range_unregister(struct hmm_range *range) +{ + /* Sanity check this really should not happen. */ + if (range->hmm == NULL || range->end <= range->start) + return; + + mutex_lock(&range->hmm->lock); + list_del_rcu(&range->list); + mutex_unlock(&range->hmm->lock); + + /* Drop reference taken by hmm_range_register() */ + range->valid = false; + hmm_put(range->hmm); + range->hmm = NULL; +} +EXPORT_SYMBOL(hmm_range_unregister); + +/* + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * permission (for instance asking for write and range is read only), + * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid + * vma or it is illegal to access that range), number of valid pages + * in range->pfns[] (from range start address). * - * Or: - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); - * device_page_table_lock(); - * hmm_vma_range_done(range); - * device_update_page_table(range->pfns); - * device_page_table_unlock(); + * This snapshots the CPU page table for a range of virtual addresses. Snapshot + * validity is tracked by range struct. See in include/linux/hmm.h for example + * on how to use. */ -bool hmm_vma_range_done(struct hmm_range *range) +long hmm_range_snapshot(struct hmm_range *range) { - unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; - struct hmm *hmm; + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; + unsigned long start = range->start, end; + struct hmm_vma_walk hmm_vma_walk; + struct hmm *hmm = range->hmm; + struct vm_area_struct *vma; + struct mm_walk mm_walk; - if (range->end <= range->start) { - BUG(); - return false; - } + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL || hmm->dead) + return -EFAULT; - hmm = hmm_register(range->vma->vm_mm); - if (!hmm) { - memset(range->pfns, 0, sizeof(*range->pfns) * npages); - return false; - } + do { + /* If range is no longer valid force retry. */ + if (!range->valid) + return -EAGAIN; - spin_lock(&hmm->lock); - list_del_rcu(&range->list); - spin_unlock(&hmm->lock); + vma = find_vma(hmm->mm, start); + if (vma == NULL || (vma->vm_flags & device_vma)) + return -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + struct hstate *h = hstate_vma(vma); - return range->valid; + if (huge_page_shift(h) != range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it + * does not allow write access, either. HMM does not + * support architecture that allow write without read. + */ + hmm_pfns_clear(range, range->pfns, + range->start, range->end); + return -EPERM; + } + + range->vma = vma; + hmm_vma_walk.pgmap = NULL; + hmm_vma_walk.last = start; + hmm_vma_walk.fault = false; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + end = min(range->end, vma->vm_end); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pud_entry = hmm_vma_walk_pud; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; + + walk_page_range(start, end, &mm_walk); + start = end; + } while (start < range->end); + + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -EXPORT_SYMBOL(hmm_vma_range_done); +EXPORT_SYMBOL(hmm_range_snapshot); /* - * hmm_vma_fault() - try to fault some address in a virtual address range + * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) + * Returns: number of valid pages in range->pfns[] (from range start + * address). This may be zero. If the return value is negative, + * then one of the following values may be returned: + * + * -EINVAL invalid arguments or mm or virtual address are in an + * invalid vma (for instance device file vma). + * -ENOMEM: Out of memory. + * -EPERM: Invalid permission (for instance asking for write and + * range is read only). + * -EAGAIN: If you need to retry and mmap_sem was drop. This can only + * happens if block argument is false. + * -EBUSY: If the the range is being invalidated and you should wait + * for invalidation to finish. + * -EFAULT: Invalid (ie either no valid vma or it is illegal to access + * that range), number of valid pages in range->pfns[] (from + * range start address). * * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs. + * any memory migration if the memory being faulted is not accessible by CPUs + * and caller does not ask for migration. * * On error, for one virtual address in the range, the function will mark the * corresponding HMM pfn entry with an error flag. - * - * Expected use pattern: - * retry: - * down_read(&mm->mmap_sem); - * // Find vma and address device wants to fault, initialize hmm_pfn_t - * // array accordingly - * ret = hmm_vma_fault(range, write, block); - * switch (ret) { - * case -EAGAIN: - * hmm_vma_range_done(range); - * // You might want to rate limit or yield to play nicely, you may - * // also commit any valid pfn in the array assuming that you are - * // getting true from hmm_vma_range_monitor_end() - * goto retry; - * case 0: - * break; - * case -ENOMEM: - * case -EINVAL: - * case -EPERM: - * default: - * // Handle error ! - * up_read(&mm->mmap_sem) - * return; - * } - * // Take device driver lock that serialize device page table update - * driver_lock_device_page_table_update(); - * hmm_vma_range_done(range); - * // Commit pfns we got from hmm_vma_fault() - * driver_unlock_device_page_table_update(); - * up_read(&mm->mmap_sem) - * - * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) - * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! - * - * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct hmm_range *range, bool block) +long hmm_range_fault(struct hmm_range *range, bool block) { - struct vm_area_struct *vma = range->vma; - unsigned long start = range->start; + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; + unsigned long start = range->start, end; struct hmm_vma_walk hmm_vma_walk; + struct hmm *hmm = range->hmm; + struct vm_area_struct *vma; struct mm_walk mm_walk; - struct hmm *hmm; int ret; - /* Sanity check, this really should not happen ! */ - if (range->start < vma->vm_start || range->start >= vma->vm_end) - return -EINVAL; - if (range->end < vma->vm_start || range->end > vma->vm_end) - return -EINVAL; + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL || hmm->dead) + return -EFAULT; - hmm = hmm_register(vma->vm_mm); - if (!hmm) { - hmm_pfns_clear(range, range->pfns, range->start, range->end); - return -ENOMEM; - } - /* Caller must have registered a mirror using hmm_mirror_register() */ - if (!hmm->mmu_notifier.ops) - return -EINVAL; + do { + /* If range is no longer valid force retry. */ + if (!range->valid) { + up_read(&hmm->mm->mmap_sem); + return -EAGAIN; + } - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) { - hmm_pfns_special(range); - return -EINVAL; - } + vma = find_vma(hmm->mm, start); + if (vma == NULL || (vma->vm_flags & device_vma)) + return -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + if (huge_page_shift(hstate_vma(vma)) != + range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it + * does not allow write access, either. HMM does not + * support architecture that allow write without read. + */ + hmm_pfns_clear(range, range->pfns, + range->start, range->end); + return -EPERM; + } + + range->vma = vma; + hmm_vma_walk.pgmap = NULL; + hmm_vma_walk.last = start; + hmm_vma_walk.fault = true; + hmm_vma_walk.block = block; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + end = min(range->end, vma->vm_end); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pud_entry = hmm_vma_walk_pud; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; + + do { + ret = walk_page_range(start, end, &mm_walk); + start = hmm_vma_walk.last; + + /* Keep trying while the range is valid. */ + } while (ret == -EBUSY && range->valid); + + if (ret) { + unsigned long i; + + i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + hmm_pfns_clear(range, &range->pfns[i], + hmm_vma_walk.last, range->end); + return ret; + } + start = end; + + } while (start < range->end); + + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; +} +EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. + * @range: range being faulted + * @device: device against to dma map page to + * @daddrs: dma address of mapped pages + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * drop and you need to try again, some other error value otherwise + * + * Note same usage pattern as hmm_range_fault(). + */ +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block) +{ + unsigned long i, npages, mapped; + long ret; + + ret = hmm_range_fault(range, block); + if (ret <= 0) + return ret ? ret : -EBUSY; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0, mapped = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; - if (!(vma->vm_flags & VM_READ)) { /* - * If vma do not allow read access, then assume that it does - * not allow write access, either. Architecture that allow - * write without read access are not supported by HMM, because - * operations such has atomic access would not work. + * FIXME need to update DMA API to provide invalid DMA address + * value instead of a function to test dma address value. This + * would remove lot of dumb code duplicated accross many arch. + * + * For now setting it to 0 here is good enough as the pfns[] + * value is what is use to check what is valid and what isn't. */ - hmm_pfns_clear(range, range->pfns, range->start, range->end); - return -EPERM; + daddrs[i] = 0; + + page = hmm_device_entry_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* Check if range is being invalidated */ + if (!range->valid) { + ret = -EBUSY; + goto unmap; + } + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); + if (dma_mapping_error(device, daddrs[i])) { + ret = -EFAULT; + goto unmap; + } + + mapped++; } - /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); - - hmm_vma_walk.fault = true; - hmm_vma_walk.block = block; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - hmm_vma_walk.last = range->start; - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; + return mapped; - do { - ret = walk_page_range(start, range->end, &mm_walk); - start = hmm_vma_walk.last; - } while (ret == -EAGAIN); +unmap: + for (npages = i, i = 0; (i < npages) && mapped; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; - if (ret) { - unsigned long i; + page = hmm_device_entry_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + if (dma_mapping_error(device, daddrs[i])) + continue; - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, - range->end); - hmm_vma_range_done(range); + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + mapped--; } + return ret; } -EXPORT_SYMBOL(hmm_vma_fault); +EXPORT_SYMBOL(hmm_range_dma_map); + +/** + * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() + * @range: range being unmapped + * @vma: the vma against which the range (optional) + * @device: device against which dma map was done + * @daddrs: dma address of mapped pages + * @dirty: dirty page if it had the write flag set + * Returns: number of page unmapped on success, -EINVAL otherwise + * + * Note that caller MUST abide by mmu notifier or use HMM mirror and abide + * to the sync_cpu_device_pagetables() callback so that it is safe here to + * call set_page_dirty(). Caller must also take appropriate locks to avoid + * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. + */ +long hmm_range_dma_unmap(struct hmm_range *range, + struct vm_area_struct *vma, + struct device *device, + dma_addr_t *daddrs, + bool dirty) +{ + unsigned long i, npages; + long cpages = 0; + + /* Sanity check. */ + if (range->end <= range->start) + return -EINVAL; + if (!daddrs) + return -EINVAL; + if (!range->pfns) + return -EINVAL; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; + + page = hmm_device_entry_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { + dir = DMA_BIDIRECTIONAL; + + /* + * See comments in function description on why it is + * safe here to call set_page_dirty() + */ + if (dirty) + set_page_dirty(page); + } + + /* Unmap and clear pfns/dma address */ + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + range->pfns[i] = range->values[HMM_PFN_NONE]; + /* FIXME see comments in hmm_vma_dma_map() */ + daddrs[i] = 0; + cpages++; + } + + return cpages; +} +EXPORT_SYMBOL(hmm_range_dma_unmap); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 404acdcd0455..9f8bce9a6b32 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -509,7 +509,7 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, +static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, loff_t off, unsigned long flags, unsigned long size) { unsigned long addr; @@ -755,6 +755,21 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; ptl = pmd_lock(mm, pmd); + if (!pmd_none(*pmd)) { + if (write) { + if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_huge_zero_pmd(*pmd)); + goto out_unlock; + } + entry = pmd_mkyoung(*pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, addr, pmd, entry, 1)) + update_mmu_cache_pmd(vma, addr, pmd); + } + + goto out_unlock; + } + entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pmd_mkdevmap(entry); @@ -766,18 +781,25 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); mm_inc_nr_ptes(mm); + pgtable = NULL; } set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); + +out_unlock: spin_unlock(ptl); + if (pgtable) + pte_free(mm, pgtable); } -vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, bool write) +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { + unsigned long addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; pgtable_t pgtable = NULL; + /* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that @@ -800,7 +822,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, pfn); - insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable); + insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); @@ -821,6 +843,20 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, spinlock_t *ptl; ptl = pud_lock(mm, pud); + if (!pud_none(*pud)) { + if (write) { + if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) { + WARN_ON_ONCE(!is_huge_zero_pud(*pud)); + goto out_unlock; + } + entry = pud_mkyoung(*pud); + entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma); + if (pudp_set_access_flags(vma, addr, pud, entry, 1)) + update_mmu_cache_pud(vma, addr, pud); + } + goto out_unlock; + } + entry = pud_mkhuge(pfn_t_pud(pfn, prot)); if (pfn_t_devmap(pfn)) entry = pud_mkdevmap(entry); @@ -830,13 +866,17 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, } set_pud_at(mm, addr, pud, entry); update_mmu_cache_pud(vma, addr, pud); + +out_unlock: spin_unlock(ptl); } -vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, pfn_t pfn, bool write) +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { + unsigned long addr = vmf->address & PUD_MASK; + struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; + /* * If we had pud_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that @@ -853,7 +893,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, pfn); - insert_pfn_pud(vma, addr, pud, pfn, pgprot, write); + insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); @@ -1184,8 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(&range, vma->vm_mm, haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1348,8 +1388,8 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, vma->vm_mm, haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); spin_lock(vmf->ptl); @@ -1641,7 +1681,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; bool ret = false; - tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -1717,7 +1757,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; spinlock_t *ptl; - tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -2024,7 +2064,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); @@ -2242,7 +2283,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); @@ -2456,6 +2498,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 97b1e0290c66..81718c56b8f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -740,7 +740,15 @@ void resv_map_release(struct kref *ref) static inline struct resv_map *inode_resv_map(struct inode *inode) { - return inode->i_mapping->private_data; + /* + * At inode evict time, i_mapping may not point to the original + * address space within the inode. This original address space + * contains the pointer to the resv_map. So, always use the + * address space embedded within the inode. + * The VERY common case is inode->mapping == &inode->i_data but, + * this may not be true for device special inodes. + */ + return (struct resv_map *)(&inode->i_data)->private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) @@ -1059,6 +1067,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) free_contig_range(page_to_pfn(page), 1 << order); } +#ifdef CONFIG_CONTIG_ALLOC static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { @@ -1143,11 +1152,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); +#else /* !CONFIG_CONTIG_ALLOC */ +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + return NULL; +} +#endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static inline bool gigantic_page_supported(void) { return false; } static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) { return NULL; } + int nid, nodemask_t *nodemask) +{ + return NULL; +} static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } @@ -1157,7 +1175,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; h->nr_huge_pages--; @@ -1258,12 +1276,23 @@ void free_huge_page(struct page *page) ClearPagePrivate(page); /* - * A return code of zero implies that the subpool will be under its - * minimum size if the reservation is not restored after page is free. - * Therefore, force restore_reserve operation. + * If PagePrivate() was set on page, page allocation consumed a + * reservation. If the page was associated with a subpool, there + * would have been a page reserved in the subpool before allocation + * via hugepage_subpool_get_pages(). Since we are 'restoring' the + * reservtion, do not call hugepage_subpool_put_pages() as this will + * remove the reserved page from the subpool. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) - restore_reserve = true; + if (!restore_reserve) { + /* + * A return code of zero implies that the subpool will be + * under its minimum size if the reservation is not restored + * after page is free. Therefore, force restore_reserve + * operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + } spin_lock(&hugetlb_lock); clear_page_huge_active(page); @@ -1574,8 +1603,9 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetPageHugeTemporary(page); + spin_unlock(&hugetlb_lock); put_page(page); - page = NULL; + return NULL; } else { h->surplus_huge_pages++; h->surplus_huge_pages_node[page_to_nid(page)]++; @@ -2277,13 +2307,47 @@ found: } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, - nodemask_t *nodes_allowed) +static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) - return h->max_huge_pages; + spin_lock(&hugetlb_lock); + + /* + * Check for a node specific request. + * Changing node specific huge page count may require a corresponding + * change to the global count. In any case, the passed node mask + * (nodes_allowed) will restrict alloc/free to the specified node. + */ + if (nid != NUMA_NO_NODE) { + unsigned long old_count = count; + + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + /* + * User may have specified a large count value which caused the + * above calculation to overflow. In this case, they wanted + * to allocate as many huge pages as possible. Set count to + * largest possible value to align with their intention. + */ + if (count < old_count) + count = ULONG_MAX; + } + + /* + * Gigantic pages runtime allocation depend on the capability for large + * page range allocation. + * If the system does not provide this feature, return an error when + * the user tries to allocate gigantic pages but let the user free the + * boottime allocated gigantic pages. + */ + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { + if (count > persistent_huge_pages(h)) { + spin_unlock(&hugetlb_lock); + return -EINVAL; + } + /* Fall through to decrease pool */ + } /* * Increase the pool size @@ -2296,7 +2360,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. */ - spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; @@ -2351,9 +2414,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, break; } out: - ret = persistent_huge_pages(h); + h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); - return ret; + + return 0; } #define HSTATE_ATTR_RO(_name) \ @@ -2403,41 +2467,32 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, unsigned long count, size_t len) { int err; - NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); + nodemask_t nodes_allowed, *n_mask; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) { - err = -EINVAL; - goto out; - } + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return -EINVAL; if (nid == NUMA_NO_NODE) { /* * global hstate attribute */ if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - } else if (nodes_allowed) { + init_nodemask_of_mempolicy(&nodes_allowed))) + n_mask = &node_states[N_MEMORY]; + else + n_mask = &nodes_allowed; + } else { /* - * per node hstate attribute: adjust count to global, - * but restrict alloc/free to the specified node. + * Node specific request. count adjustment happens in + * set_max_huge_pages() after acquiring hugetlb_lock. */ - count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; - init_nodemask_of_node(nodes_allowed, nid); - } else - nodes_allowed = &node_states[N_MEMORY]; - - h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; + } - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); + err = set_max_huge_pages(h, count, nid, n_mask); - return len; -out: - NODEMASK_FREE(nodes_allowed); - return err; + return err ? err : len; } static ssize_t nr_hugepages_store_common(bool obey_mempolicy, @@ -3247,7 +3302,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(&range, src, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, + vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); } @@ -3353,13 +3409,14 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * This is a hugetlb vma, all the pte entries should point * to huge page. */ - tlb_remove_check_page_size_change(tlb, sz); + tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); /* * If sharing possible, alert mmu notifiers of worst case. */ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); address = start; @@ -3626,7 +3683,8 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, + haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* @@ -3777,8 +3835,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3886,21 +3943,14 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { unsigned long key[2]; u32 hash; - if (vma->vm_flags & VM_SHARED) { - key[0] = (unsigned long) mapping; - key[1] = idx; - } else { - key[0] = (unsigned long) mm; - key[1] = address >> huge_page_shift(h); - } + key[0] = (unsigned long) mapping; + key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); @@ -3911,9 +3961,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { return 0; @@ -3958,7 +4006,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -4299,6 +4347,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; page = pte_page(huge_ptep_get(pte)); + + /* + * Instead of doing 'try_get_page()' below in the same_page + * loop, just check the count once here. + */ + if (unlikely(page_count(page) <= 0)) { + if (pages) { + spin_unlock(ptl); + remainder = 0; + err = -ENOMEM; + break; + } + } same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); @@ -4358,7 +4419,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, + 0, vma, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); @@ -4464,6 +4526,11 @@ int hugetlb_reserve_pages(struct inode *inode, * called to make the mapping read-write. Assume !vma is a shm mapping */ if (!vma || vma->vm_flags & VM_MAYSHARE) { + /* + * resv_map can not be NULL as hugetlb_reserve_pages is only + * called for inodes for which resv_maps were created (see + * hugetlbfs_get_inode). + */ resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to); @@ -4555,6 +4622,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; + /* + * Since this routine can be called in the evict inode path for all + * hugetlbfs inodes, resv_map could be NULL. + */ if (resv_map) { chg = region_del(resv_map, start, end); /* diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 5d1065efbd47..08b43de2383b 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -2,18 +2,21 @@ KASAN_SANITIZE := n UBSAN_SANITIZE_common.o := n UBSAN_SANITIZE_generic.o := n +UBSAN_SANITIZE_generic_report.o := n UBSAN_SANITIZE_tags.o := n KCOV_INSTRUMENT := n -CFLAGS_REMOVE_common.o = -pg -CFLAGS_REMOVE_generic.o = -pg -CFLAGS_REMOVE_tags.o = -pg +CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE) # Function splitter causes unnecessary splits in __asan_load1/__asan_store1 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533 CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) +CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) obj-$(CONFIG_KASAN) := common.o init.o report.o diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 80bbe62b16cd..36afcf64e016 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -36,6 +36,7 @@ #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/bug.h> +#include <linux/uaccess.h> #include "kasan.h" #include "../slab.h" @@ -48,37 +49,28 @@ static inline int in_irqentry_text(unsigned long ptr) ptr < (unsigned long)&__softirqentry_text_end); } -static inline void filter_irq_stacks(struct stack_trace *trace) +static inline unsigned int filter_irq_stacks(unsigned long *entries, + unsigned int nr_entries) { - int i; + unsigned int i; - if (!trace->nr_entries) - return; - for (i = 0; i < trace->nr_entries; i++) - if (in_irqentry_text(trace->entries[i])) { + for (i = 0; i < nr_entries; i++) { + if (in_irqentry_text(entries[i])) { /* Include the irqentry function into the stack. */ - trace->nr_entries = i + 1; - break; + return i + 1; } + } + return nr_entries; } static inline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[KASAN_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = KASAN_STACK_DEPTH, - .skip = 0 - }; + unsigned int nr_entries; - save_stack_trace(&trace); - filter_irq_stacks(&trace); - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries-1] == ULONG_MAX) - trace.nr_entries--; - - return depot_save_stack(&trace, flags); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + nr_entries = filter_irq_stacks(entries, nr_entries); + return stack_depot_save(entries, nr_entries, flags); } static inline void set_track(struct kasan_track *track, gfp_t flags) @@ -614,6 +606,15 @@ void kasan_free_shadow(const struct vm_struct *vm) vfree(kasan_mem_to_shadow(vm->addr)); } +extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); + +void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) +{ + unsigned long flags = user_access_save(); + __kasan_report(addr, size, is_write, ip); + user_access_restore(flags); +} + #ifdef CONFIG_MEMORY_HOTPLUG static bool shadow_mapped(unsigned long addr) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 3e0c11f7d7a1..3ce956efa0cb 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -163,7 +163,10 @@ static inline u8 random_tag(void) #endif #ifndef arch_kasan_set_tag -#define arch_kasan_set_tag(addr, tag) ((void *)(addr)) +static inline const void *arch_kasan_set_tag(const void *addr, u8 tag) +{ + return addr; +} #endif #ifndef arch_kasan_reset_tag #define arch_kasan_reset_tag(addr) ((void *)(addr)) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ca9418fe9232..03a443579386 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -100,10 +100,11 @@ static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { - struct stack_trace trace; + unsigned long *entries; + unsigned int nr_entries; - depot_fetch_stack(track->stack, &trace); - print_stack_trace(&trace, 0); + nr_entries = stack_depot_fetch(track->stack, &entries); + stack_trace_print(entries, nr_entries, 0); } else { pr_err("(stack is not available)\n"); } @@ -281,8 +282,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip) end_report(&flags); } -void kasan_report(unsigned long addr, size_t size, - bool is_write, unsigned long ip) +void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip) { struct kasan_access_info info; void *tagged_addr; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 449044378782..a335f7c1fac4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, + address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* @@ -1374,7 +1375,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); nr_none++; continue; } @@ -1450,7 +1451,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); continue; out_unlock: unlock_page(page); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 707fa5579f66..e57bf810f798 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -410,11 +410,6 @@ static void print_unreferenced(struct seq_file *seq, */ static void dump_object_info(struct kmemleak_object *object) { - struct stack_trace trace; - - trace.nr_entries = object->trace_len; - trace.entries = object->trace; - pr_notice("Object 0x%08lx (size %zu):\n", object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", @@ -424,7 +419,7 @@ static void dump_object_info(struct kmemleak_object *object) pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); - print_stack_trace(&trace, 4); + stack_trace_print(object->trace, object->trace_len, 4); } /* @@ -553,15 +548,7 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali */ static int __save_stack_trace(unsigned long *trace) { - struct stack_trace stack_trace; - - stack_trace.max_entries = MAX_TRACE; - stack_trace.nr_entries = 0; - stack_trace.entries = trace; - stack_trace.skip = 2; - save_stack_trace(&stack_trace); - - return stack_trace.nr_entries; + return stack_trace_save(trace, MAX_TRACE, 2); } /* @@ -1401,6 +1388,7 @@ static void scan_block(void *_start, void *_end, /* * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency. */ +#ifdef CONFIG_SMP static void scan_large_block(void *start, void *end) { void *next; @@ -1412,6 +1400,7 @@ static void scan_large_block(void *start, void *end) cond_resched(); } } +#endif /* * Scan a memory block corresponding to a kmemleak_object. A condition is @@ -1529,11 +1518,6 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - /* data/bss scanning */ - scan_large_block(_sdata, _edata); - scan_large_block(__bss_start, __bss_stop); - scan_large_block(__start_ro_after_init, __end_ro_after_init); - #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) @@ -2024,13 +2008,8 @@ early_param("kmemleak", kmemleak_boot_config); static void __init print_log_trace(struct early_log *log) { - struct stack_trace trace; - - trace.nr_entries = log->trace_len; - trace.entries = log->trace; - pr_notice("Early log backtrace:\n"); - print_stack_trace(&trace, 2); + stack_trace_print(log->trace, log->trace_len, 2); } /* @@ -2071,6 +2050,17 @@ void __init kmemleak_init(void) } local_irq_restore(flags); + /* register the data/bss sections */ + create_object((unsigned long)_sdata, _edata - _sdata, + KMEMLEAK_GREY, GFP_ATOMIC); + create_object((unsigned long)__bss_start, __bss_stop - __bss_start, + KMEMLEAK_GREY, GFP_ATOMIC); + /* only register .data..ro_after_init if not within .data */ + if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata) + create_object((unsigned long)__start_ro_after_init, + __end_ro_after_init - __start_ro_after_init, + KMEMLEAK_GREY, GFP_ATOMIC); + /* * This is the point where tracking allocations is safe. Automatic * scanning is started during the late initcall. Add the early logged @@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, mm, pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, + addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); diff --git a/mm/madvise.c b/mm/madvise.c index 21a7881a2db4..628022e674a7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -328,7 +328,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); @@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, mm, range.start, range.end); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm, range.start, range.end); diff --git a/mm/memblock.c b/mm/memblock.c index e7665cf914b1..6bbad46f4d2c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ * :c:func:`mem_init` function frees all the memory to the buddy page * allocator. * - * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures will be discarded after the system * initialization compltes. */ @@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK /** * memblock_discard - discard memory and reserved arrays if they were allocated */ @@ -702,7 +702,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_add: [%pa-%pa] %pF\n", + memblock_dbg("memblock_add: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); @@ -821,7 +821,7 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg(" memblock_free: [%pa-%pa] %pF\n", + memblock_dbg(" memblock_free: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); @@ -832,7 +832,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { phys_addr_t end = base + size - 1; - memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n", + memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n", &base, &end, (void *)_RET_IP_); return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); @@ -1255,6 +1255,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return 0; } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/** + * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() + * + * @idx: pointer to u64 loop variable + * @zone: zone in which all of the memory blocks reside + * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL + * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL + * + * This function is meant to be a zone/pfn specific wrapper for the + * for_each_mem_range type iterators. Specifically they are used in the + * deferred memory init routines and as such we were duplicating much of + * this logic throughout the code. So instead of having it in multiple + * locations it seemed like it would make more sense to centralize this to + * one new iterator that does everything they need. + */ +void __init_memblock +__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, unsigned long *out_epfn) +{ + int zone_nid = zone_to_nid(zone); + phys_addr_t spa, epa; + int nid; + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + + while (*idx != U64_MAX) { + unsigned long epfn = PFN_DOWN(epa); + unsigned long spfn = PFN_UP(spa); + + /* + * Verify the end is at least past the start of the zone and + * that we have at least one PFN to initialize. + */ + if (zone->zone_start_pfn < epfn && spfn < epfn) { + /* if we went too far just stop searching */ + if (zone_end_pfn(zone) <= spfn) { + *idx = U64_MAX; + break; + } + + if (out_spfn) + *out_spfn = max(zone->zone_start_pfn, spfn); + if (out_epfn) + *out_epfn = min(zone_end_pfn(zone), epfn); + + return; + } + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + } + + /* signal end of iteration */ + if (out_spfn) + *out_spfn = ULONG_MAX; + if (out_epfn) + *out_epfn = 0; +} + +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** * memblock_alloc_range_nid - allocate boot memory block @@ -1447,7 +1511,7 @@ void * __init memblock_alloc_try_nid_raw( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); @@ -1483,7 +1547,7 @@ void * __init memblock_alloc_try_nid( { void *ptr; - memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n", + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n", __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr, (void *)_RET_IP_); ptr = memblock_alloc_internal(size, align, @@ -1508,7 +1572,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size) phys_addr_t cursor, end; end = base + size - 1; - memblock_dbg("%s: [%pa-%pa] %pF\n", + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, &base, &end, (void *)_RET_IP_); kmemleak_free_part_phys(base, size); cursor = PFN_UP(base); @@ -1923,7 +1987,7 @@ unsigned long __init memblock_free_all(void) return pages; } -#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 532e0e2a4817..e50a2db5b4ff 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -687,10 +687,119 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } -static unsigned long memcg_sum_events(struct mem_cgroup *memcg, - int event) +/** + * __mod_memcg_state - update cgroup memory statistics + * @memcg: the memory cgroup + * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item + * @val: delta to add to the counter, can be negative + */ +void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) +{ + long x; + + if (mem_cgroup_disabled()) + return; + + x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + struct mem_cgroup *mi; + + atomic_long_add(x, &memcg->vmstats_local[idx]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &mi->vmstats[idx]); + x = 0; + } + __this_cpu_write(memcg->vmstats_percpu->stat[idx], x); +} + +static struct mem_cgroup_per_node * +parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid) { - return atomic_long_read(&memcg->events[event]); + struct mem_cgroup *parent; + + parent = parent_mem_cgroup(pn->memcg); + if (!parent) + return NULL; + return mem_cgroup_nodeinfo(parent, nid); +} + +/** + * __mod_lruvec_state - update lruvec memory statistics + * @lruvec: the lruvec + * @idx: the stat item + * @val: delta to add to the counter, can be negative + * + * The lruvec is the intersection of the NUMA node and a cgroup. This + * function updates the all three counters that are affected by a + * change of state at this level: per-node, per-cgroup, per-lruvec. + */ +void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + int val) +{ + pg_data_t *pgdat = lruvec_pgdat(lruvec); + struct mem_cgroup_per_node *pn; + struct mem_cgroup *memcg; + long x; + + /* Update node */ + __mod_node_page_state(pgdat, idx, val); + + if (mem_cgroup_disabled()) + return; + + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + + /* Update memcg */ + __mod_memcg_state(memcg, idx, val); + + /* Update lruvec */ + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + struct mem_cgroup_per_node *pi; + + atomic_long_add(x, &pn->lruvec_stat_local[idx]); + for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) + atomic_long_add(x, &pi->lruvec_stat[idx]); + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); +} + +/** + * __count_memcg_events - account VM events in a cgroup + * @memcg: the memory cgroup + * @idx: the event item + * @count: the number of events that occured + */ +void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count) +{ + unsigned long x; + + if (mem_cgroup_disabled()) + return; + + x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); + if (unlikely(x > MEMCG_CHARGE_BATCH)) { + struct mem_cgroup *mi; + + atomic_long_add(x, &memcg->vmevents_local[idx]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &mi->vmevents[idx]); + x = 0; + } + __this_cpu_write(memcg->vmstats_percpu->events[idx], x); +} + +static unsigned long memcg_events(struct mem_cgroup *memcg, int event) +{ + return atomic_long_read(&memcg->vmevents[event]); +} + +static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) +{ + return atomic_long_read(&memcg->vmevents_local[event]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -722,35 +831,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); -} - -unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); - unsigned long nr = 0; - enum lru_list lru; - - VM_BUG_ON((unsigned)nid >= nr_node_ids); - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - nr += mem_cgroup_get_lru_size(lruvec, lru); - } - return nr; -} - -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) -{ - unsigned long nr = 0; - int nid; - - for_each_node_state(nid, N_MEMORY) - nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); - return nr; + __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages); } static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -758,8 +839,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat_cpu->nr_page_events); - next = __this_cpu_read(memcg->stat_cpu->targets[target]); + val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events); + next = __this_cpu_read(memcg->vmstats_percpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { @@ -775,7 +856,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, default: break; } - __this_cpu_write(memcg->stat_cpu->targets[target], next); + __this_cpu_write(memcg->vmstats_percpu->targets[target], next); return true; } return false; @@ -1353,12 +1434,14 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) continue; pr_cont(" %s:%luKB", memcg1_stat_names[i], - K(memcg_page_state(iter, memcg1_stats[i]))); + K(memcg_page_state_local(iter, + memcg1_stats[i]))); } for (i = 0; i < NR_LRU_LISTS; i++) pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); + K(memcg_page_state_local(iter, + NR_LRU_BASE + i))); pr_cont("\n"); } @@ -1422,11 +1505,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, int nid, bool noswap) { - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + + if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || + lruvec_page_state(lruvec, NR_ACTIVE_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) + if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || + lruvec_page_state(lruvec, NR_ACTIVE_ANON)) return true; return false; @@ -2100,7 +2187,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *mi; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); @@ -2112,9 +2199,12 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) int nid; long x; - x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); - if (x) - atomic_long_add(x, &memcg->stat[i]); + x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0); + if (x) { + atomic_long_add(x, &memcg->vmstats_local[i]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &memcg->vmstats[i]); + } if (i >= NR_VM_NODE_STAT_ITEMS) continue; @@ -2124,17 +2214,24 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) pn = mem_cgroup_nodeinfo(memcg, nid); x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); - if (x) - atomic_long_add(x, &pn->lruvec_stat[i]); + if (x) { + atomic_long_add(x, &pn->lruvec_stat_local[i]); + do { + atomic_long_add(x, &pn->lruvec_stat[i]); + } while ((pn = parent_nodeinfo(pn, nid))); + } } } for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { long x; - x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); - if (x) - atomic_long_add(x, &memcg->events[i]); + x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0); + if (x) { + atomic_long_add(x, &memcg->vmevents_local[i]); + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + atomic_long_add(x, &memcg->vmevents[i]); + } } } @@ -2964,50 +3061,15 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -struct accumulated_stats { - unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - unsigned long lru_pages[NR_LRU_LISTS]; - const unsigned int *stats_array; - const unsigned int *events_array; - int stats_size; - int events_size; -}; - -static void accumulate_memcg_tree(struct mem_cgroup *memcg, - struct accumulated_stats *acc) -{ - struct mem_cgroup *mi; - int i; - - for_each_mem_cgroup_tree(mi, memcg) { - for (i = 0; i < acc->stats_size; i++) - acc->stat[i] += memcg_page_state(mi, - acc->stats_array ? acc->stats_array[i] : i); - - for (i = 0; i < acc->events_size; i++) - acc->events[i] += memcg_sum_events(mi, - acc->events_array ? acc->events_array[i] : i); - - for (i = 0; i < NR_LRU_LISTS; i++) - acc->lru_pages[i] += - mem_cgroup_nr_lru_pages(mi, BIT(i)); - } -} - static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { - unsigned long val = 0; + unsigned long val; if (mem_cgroup_is_root(memcg)) { - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) { - val += memcg_page_state(iter, MEMCG_CACHE); - val += memcg_page_state(iter, MEMCG_RSS); - if (swap) - val += memcg_page_state(iter, MEMCG_SWAP); - } + val = memcg_page_state(memcg, MEMCG_CACHE) + + memcg_page_state(memcg, MEMCG_RSS); + if (swap) + val += memcg_page_state(memcg, MEMCG_SWAP); } else { if (!swap) val = page_counter_read(&memcg->memory); @@ -3331,6 +3393,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif #ifdef CONFIG_NUMA + +#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) +#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) +#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + unsigned long nr = 0; + enum lru_list lru; + + VM_BUG_ON((unsigned)nid >= nr_node_ids); + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); + } + return nr; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, + unsigned int lru_mask) +{ + unsigned long nr = 0; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru); + } + return nr; +} + static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { @@ -3402,7 +3500,6 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; - struct accumulated_stats acc; BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3411,17 +3508,18 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "%s %lu\n", memcg1_stat_names[i], - memcg_page_state(memcg, memcg1_stats[i]) * + memcg_page_state_local(memcg, memcg1_stats[i]) * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "%s %lu\n", memcg1_event_names[i], - memcg_sum_events(memcg, memcg1_events[i])); + memcg_events_local(memcg, memcg1_events[i])); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], - mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); + memcg_page_state_local(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; @@ -3435,27 +3533,21 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - memset(&acc, 0, sizeof(acc)); - acc.stats_size = ARRAY_SIZE(memcg1_stats); - acc.stats_array = memcg1_stats; - acc.events_size = ARRAY_SIZE(memcg1_events); - acc.events_array = memcg1_events; - accumulate_memcg_tree(memcg, &acc); - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)acc.stat[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, i) * PAGE_SIZE); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], - (u64)acc.events[i]); + (u64)memcg_events(memcg, i)); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], - (u64)acc.lru_pages[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -3882,6 +3974,22 @@ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) return &memcg->cgwb_domain; } +/* + * idx can be of type enum memcg_stat_item or node_stat_item. + * Keep in sync with memcg_exact_page(). + */ +static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = atomic_long_read(&memcg->vmstats[idx]); + int cpu; + + for_each_online_cpu(cpu) + x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx]; + if (x < 0) + x = 0; + return x; +} + /** * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg * @wb: bdi_writeback in question @@ -3907,12 +4015,12 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); struct mem_cgroup *parent; - *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY); + *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); /* this should eventually include NR_UNSTABLE_NFS */ - *pwriteback = memcg_page_state(memcg, NR_WRITEBACK); - *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | - (1 << LRU_ACTIVE_FILE)); + *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); + *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { @@ -4416,7 +4524,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); - free_percpu(memcg->stat_cpu); + free_percpu(memcg->vmstats_percpu); kfree(memcg); } @@ -4445,8 +4553,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg->id.id < 0) goto fail; - memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat_cpu) + memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu); + if (!memcg->vmstats_percpu) goto fail; for_each_node(node) @@ -5532,7 +5640,6 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - struct accumulated_stats acc; int i; /* @@ -5546,31 +5653,27 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ - memset(&acc, 0, sizeof(acc)); - acc.stats_size = MEMCG_NR_STAT; - acc.events_size = NR_VM_EVENT_ITEMS; - accumulate_memcg_tree(memcg, &acc); - seq_printf(m, "anon %llu\n", - (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_RSS) * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_CACHE) * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) * 1024); seq_printf(m, "slab %llu\n", - (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + - acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) + + memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) * + PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_SOCK) * PAGE_SIZE); seq_printf(m, "shmem %llu\n", - (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SHMEM) * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_FILE_MAPPED) * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_FILE_DIRTY) * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_WRITEBACK) * PAGE_SIZE); /* * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter @@ -5579,43 +5682,47 @@ static int memory_stat_show(struct seq_file *m, void *v) * where the page->mem_cgroup is set up and stable. */ seq_printf(m, "anon_thp %llu\n", - (u64)acc.stat[MEMCG_RSS_HUGE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) * PAGE_SIZE); for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], - (u64)acc.lru_pages[i] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); seq_printf(m, "slab_reclaimable %llu\n", - (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) * + PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) * + PAGE_SIZE); /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", memcg_events(memcg, PGFAULT)); + seq_printf(m, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT)); seq_printf(m, "workingset_refault %lu\n", - acc.stat[WORKINGSET_REFAULT]); + memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_printf(m, "workingset_activate %lu\n", - acc.stat[WORKINGSET_ACTIVATE]); + memcg_page_state(memcg, WORKINGSET_ACTIVATE)); seq_printf(m, "workingset_nodereclaim %lu\n", - acc.stat[WORKINGSET_NODERECLAIM]); - - seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); - seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + - acc.events[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + - acc.events[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); + memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); + + seq_printf(m, "pgrefill %lu\n", memcg_events(memcg, PGREFILL)); + seq_printf(m, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + + memcg_events(memcg, PGSCAN_DIRECT)); + seq_printf(m, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + + memcg_events(memcg, PGSTEAL_DIRECT)); + seq_printf(m, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE)); + seq_printf(m, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE)); + seq_printf(m, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE)); + seq_printf(m, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED)); #ifdef CONFIG_TRANSPARENT_HUGEPAGE - seq_printf(m, "thp_fault_alloc %lu\n", acc.events[THP_FAULT_ALLOC]); + seq_printf(m, "thp_fault_alloc %lu\n", + memcg_events(memcg, THP_FAULT_ALLOC)); seq_printf(m, "thp_collapse_alloc %lu\n", - acc.events[THP_COLLAPSE_ALLOC]); + memcg_events(memcg, THP_COLLAPSE_ALLOC)); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return 0; @@ -6051,7 +6158,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); diff --git a/mm/memfd.c b/mm/memfd.c index 650e65a46b9c..2647c898990c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; + page = find_subpage(page, xas->xa_index); if (page_count(page) - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); @@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; + page = find_subpage(page, xas.xa_index); if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags diff --git a/mm/memory.c b/mm/memory.c index 47fe250307c7..96f1d473c89a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -356,7 +356,7 @@ void free_pgd_range(struct mmu_gather *tlb, * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -519,7 +519,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", + pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, @@ -1010,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow = is_cow_mapping(vma->vm_flags); if (is_cow) { - mmu_notifier_range_init(&range, src_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1046,7 +1047,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *pte; swp_entry_t entry; - tlb_remove_check_page_size_change(tlb, PAGE_SIZE); + tlb_change_page_size(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); @@ -1155,7 +1156,7 @@ again: */ if (force_flush) { force_flush = 0; - tlb_flush_mmu_free(tlb); + tlb_flush_mmu(tlb); if (addr != end) goto again; } @@ -1334,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb, { struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); @@ -1356,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + start, start + size); tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -1382,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address, address + size); tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -1523,6 +1527,87 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +/* + * __vm_map_pages - maps range of kernel pages into user vma + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * @offset: user's requested vm_pgoff + * + * This allows drivers to map range of kernel pages into a user vma. + * + * Return: 0 on success and error code otherwise. + */ +static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num, unsigned long offset) +{ + unsigned long count = vma_pages(vma); + unsigned long uaddr = vma->vm_start; + int ret, i; + + /* Fail if the user requested offset is beyond the end of the object */ + if (offset > num) + return -ENXIO; + + /* Fail if the user requested size exceeds available object size */ + if (count > num - offset) + return -ENXIO; + + for (i = 0; i < count; i++) { + ret = vm_insert_page(vma, uaddr, pages[offset + i]); + if (ret < 0) + return ret; + uaddr += PAGE_SIZE; + } + + return 0; +} + +/** + * vm_map_pages - maps range of kernel pages starts with non zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Maps an object consisting of @num pages, catering for the user's + * requested vm_pgoff + * + * If we fail to insert any page into the vma, the function will return + * immediately leaving any previously inserted pages present. Callers + * from the mmap handler may immediately return the error as their caller + * will destroy the vma, removing any successfully inserted pages. Other + * callers should make their own arrangements for calling unmap_region(). + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, vma->vm_pgoff); +} +EXPORT_SYMBOL(vm_map_pages); + +/** + * vm_map_pages_zero - map range of kernel pages starts with zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Similar to vm_map_pages(), except that it explicitly sets the offset + * to 0. This function is intended for the drivers that did not consider + * vm_pgoff. + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, 0); +} +EXPORT_SYMBOL(vm_map_pages_zero); + static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { @@ -1549,10 +1634,12 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; } - entry = *pte; - goto out_mkwrite; - } else - goto out_unlock; + entry = pte_mkyoung(*pte); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (ptep_set_access_flags(vma, addr, pte, entry, 1)) + update_mmu_cache(vma, addr, pte); + } + goto out_unlock; } /* Ok, finally just insert the thing.. */ @@ -1561,7 +1648,6 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); -out_mkwrite: if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2278,7 +2364,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, + vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -4103,8 +4190,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PMD_MASK, - (address & PMD_MASK) + PMD_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, + NULL, mm, address & PMD_MASK, + (address & PMD_MASK) + PMD_SIZE); mmu_notifier_invalidate_range_start(range); } *ptlp = pmd_lock(mm, pmd); @@ -4121,8 +4209,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PAGE_MASK, - (address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm, + address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f767582af4f8..328878b6799d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -39,6 +39,7 @@ #include <asm/tlbflush.h> #include "internal.h" +#include "shuffle.h" /* * online_page_callback contains pointer to current page onlining function. @@ -273,12 +274,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, * add the new pages. */ int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap = restrictions->altmap; /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); @@ -299,7 +300,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, section_nr_to_pfn(i), altmap, - want_memblock); + restrictions->flags & MHP_MEMBLOCK_API); /* * EEXIST is finally dealt with by ioresource collision @@ -516,26 +517,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, struct vmem_altmap *altmap) +static void __remove_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset, + struct vmem_altmap *altmap) { unsigned long start_pfn; int scn_nr; - int ret = -EINVAL; - if (!valid_section(ms)) - return ret; + if (WARN_ON_ONCE(!valid_section(ms))) + return; - ret = unregister_memory_section(ms); - if (ret) - return ret; + unregister_memory_section(ms); scn_nr = __section_nr(ms); start_pfn = section_nr_to_pfn((unsigned long)scn_nr); __remove_zone(zone, start_pfn); sparse_remove_one_section(zone, ms, map_offset, altmap); - return 0; } /** @@ -550,31 +548,17 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) +void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; unsigned long map_offset = 0; - int sections_to_remove, ret = 0; + int sections_to_remove; /* In the ZONE_DEVICE case device driver owns the memory region */ if (is_dev_zone(zone)) { if (altmap) map_offset = vmem_altmap_offset(altmap); - } else { - resource_size_t start, size; - - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - ret = release_mem_region_adjustable(&iomem_resource, start, - size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } } clear_zone_contiguous(zone); @@ -590,16 +574,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; cond_resched(); - ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, - altmap); + __remove_section(zone, __pfn_to_section(pfn), map_offset, + altmap); map_offset = 0; - if (ret) - break; } set_zone_contiguous(zone); - - return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -714,7 +694,7 @@ static void node_states_check_changes_online(unsigned long nr_pages, if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; #ifdef CONFIG_HIGHMEM - if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) + if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY)) arg->status_change_nid_high = nid; #endif } @@ -874,6 +854,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ mem = find_memory_block(__pfn_to_section(pfn)); nid = mem->nid; + put_device(&mem->dev); /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); @@ -911,6 +892,8 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone->zone_pgdat->node_present_pages += onlined_pages; pgdat_resize_unlock(zone->zone_pgdat, &flags); + shuffle_zone(zone); + if (onlined_pages) { node_states_set_node(nid, &arg); if (need_zonelists_rebuild) @@ -1096,6 +1079,9 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { + struct mhp_restrictions restrictions = { + .flags = MHP_MEMBLOCK_API, + }; u64 start, size; bool new_node = false; int ret; @@ -1123,7 +1109,7 @@ int __ref add_memory_resource(int nid, struct resource *res) new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, NULL, true); + ret = arch_add_memory(nid, start, size, &restrictions); if (ret < 0) goto error; @@ -1340,8 +1326,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) if (!PageHuge(page)) continue; head = compound_head(page); - if (hugepage_migration_supported(page_hstate(head)) && - page_huge_active(head)) + if (page_huge_active(head)) return pfn; skip = (1 << compound_order(head)) - (page - head); pfn += skip - 1; @@ -1381,10 +1366,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { struct page *head = compound_head(page); - if (compound_order(head) > PFN_SECTION_SHIFT) { - ret = -EBUSY; - break; - } pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1; isolate_huge_page(head, &source); continue; @@ -1453,15 +1434,10 @@ static int offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, void *data) { - __offline_isolated_pages(start, start + nr_pages); - return 0; -} + unsigned long *offlined_pages = (unsigned long *)data; -static void -offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) -{ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, - offline_isolated_pages_cb); + *offlined_pages += __offline_isolated_pages(start, start + nr_pages); + return 0; } /* @@ -1471,26 +1447,7 @@ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { - int ret; - long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); - offlined = nr_pages; - if (!ret) - *(long *)data += offlined; - return ret; -} - -static long -check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) -{ - long offlined = 0; - int ret; - - ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, - check_pages_isolated_cb); - if (ret < 0) - offlined = (long)ret; - return offlined; + return test_pages_isolated(start_pfn, start_pfn + nr_pages, true); } static int __init cmdline_parse_movable_node(char *p) @@ -1575,8 +1532,8 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, nr_pages; - long offlined_pages; - int ret, node; + unsigned long offlined_pages = 0; + int ret, node, nr_isolate_pageblock; unsigned long flags; unsigned long valid_start, valid_end; struct zone *zone; @@ -1602,10 +1559,11 @@ static int __ref __offline_pages(unsigned long start_pfn, ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, SKIP_HWPOISON | REPORT_FAILURE); - if (ret) { + if (ret < 0) { reason = "failure to isolate range"; goto failed_removal; } + nr_isolate_pageblock = ret; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -1650,15 +1608,24 @@ static int __ref __offline_pages(unsigned long start_pfn, goto failed_removal_isolated; } /* check again */ - offlined_pages = check_pages_isolated(start_pfn, end_pfn); - } while (offlined_pages < 0); + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + NULL, check_pages_isolated_cb); + } while (ret); - pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ - offline_isolated_pages(start_pfn, end_pfn); - /* reset pagetype flags and makes migrate type to be MOVABLE */ - undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &offlined_pages, offline_isolated_pages_cb); + pr_info("Offlined Pages %ld\n", offlined_pages); + /* + * Onlining will reset pagetype flags and makes migrate type + * MOVABLE, so just need to decrease the number of isolated + * pageblocks zone counter here. + */ + spin_lock_irqsave(&zone->lock, flags); + zone->nr_isolate_pageblock -= nr_isolate_pageblock; + spin_unlock_irqrestore(&zone->lock, flags); + /* removal success */ adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; @@ -1690,12 +1657,12 @@ static int __ref __offline_pages(unsigned long start_pfn, failed_removal_isolated: undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + memory_notify(MEM_CANCEL_OFFLINE, &arg); failed_removal: pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n", (unsigned long long) start_pfn << PAGE_SHIFT, ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, reason); - memory_notify(MEM_CANCEL_OFFLINE, &arg); /* pushback to free area */ mem_hotplug_done(); return ret; @@ -1833,6 +1800,26 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); +static void __release_memory_resource(resource_size_t start, + resource_size_t size) +{ + int ret; + + /* + * When removing memory in the same granularity as it was added, + * this function never fails. It might only fail if resources + * have to be adjusted or split. We'll ignore the error, as + * removing of memory cannot fail. + */ + ret = release_mem_region_adjustable(&iomem_resource, start, size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } +} + /** * remove_memory * @nid: the node ID @@ -1867,6 +1854,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size) memblock_remove(start, size); arch_remove_memory(nid, start, size, NULL); + __release_memory_resource(start, size); try_offline_node(nid); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index af171ccb56a2..2219e747df49 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -428,6 +428,13 @@ static inline bool queue_pages_required(struct page *page, return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } +/* + * queue_pages_pmd() has three possible return values: + * 1 - pages are placed on the right node or queued successfully. + * 0 - THP was split. + * -EIO - is migration entry or MPOL_MF_STRICT was specified and an existing + * page was already on a node that does not follow the policy. + */ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -437,7 +444,7 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long flags; if (unlikely(is_pmd_migration_entry(*pmd))) { - ret = 1; + ret = -EIO; goto unlock; } page = pmd_page(*pmd); @@ -454,8 +461,15 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, ret = 1; flags = qp->flags; /* go to thp migration */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + if (!vma_migratable(walk->vma)) { + ret = -EIO; + goto unlock; + } + migrate_page_add(page, qp->pagelist, flags); + } else + ret = -EIO; unlock: spin_unlock(ptl); out: @@ -480,8 +494,10 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { ret = queue_pages_pmd(pmd, ptl, addr, end, walk); - if (ret) + if (ret > 0) return 0; + else if (ret < 0) + return ret; } if (pmd_trans_unstable(pmd)) @@ -502,11 +518,16 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, continue; if (!queue_pages_required(page, qp)) continue; - migrate_page_add(page, qp->pagelist, flags); + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + if (!vma_migratable(vma)) + break; + migrate_page_add(page, qp->pagelist, flags); + } else + break; } pte_unmap_unlock(pte - 1, ptl); cond_resched(); - return 0; + return addr != end ? -EIO : 0; } static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, @@ -576,7 +597,12 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; - if (!vma_migratable(vma)) + /* + * Need check MPOL_MF_STRICT to return -EIO if possible + * regardless of vma_migratable + */ + if (!vma_migratable(vma) && + !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) @@ -603,7 +629,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, } /* queue pages from current vma */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & MPOL_MF_VALID) return 0; return 1; } diff --git a/mm/migrate.c b/mm/migrate.c index ac6f4939bb59..f2ecc2855a12 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,10 +248,8 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, pte = swp_entry_to_pte(entry); } else if (is_device_public_page(new)) { pte = pte_mkdevmap(pte); - flush_dcache_page(new); } - } else - flush_dcache_page(new); + } #ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) { @@ -465,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping, for (i = 1; i < HPAGE_PMD_NR; i++) { xas_next(&xas); - xas_store(&xas, newpage + i); + xas_store(&xas, newpage); } } @@ -995,6 +993,13 @@ static int move_to_new_page(struct page *newpage, struct page *page, */ if (!PageMappingFlags(page)) page->mapping = NULL; + + if (unlikely(is_zone_device_page(newpage))) { + if (is_device_public_page(newpage)) + flush_dcache_page(newpage); + } else + flush_dcache_page(newpage); + } out: return rc; @@ -2351,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, + migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->start, migrate->end, &mm_walk); @@ -2759,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) notified = true; mmu_notifier_range_init(&range, + MMU_NOTIFY_CLEAR, 0, + NULL, migrate->vma->vm_mm, addr, migrate->end); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mincore.c b/mm/mincore.c index 218099b5ed31..c3f058bd0faf 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -169,6 +169,22 @@ out: return 0; } +static inline bool can_do_mincore(struct vm_area_struct *vma) +{ + if (vma_is_anonymous(vma)) + return true; + if (!vma->vm_file) + return false; + /* + * Reveal pagecache information only for non-anonymous mappings that + * correspond to the files the calling process could (if tried) open + * for writing; otherwise we'd be including shared non-exclusive + * mappings, which opens a side channel. + */ + return inode_owner_or_capable(file_inode(vma->vm_file)) || + inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0; +} + /* * Do a chunk of "sys_mincore()". We've already checked * all the arguments, we hold the mmap semaphore: we should @@ -189,8 +205,13 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v vma = find_vma(current->mm, addr); if (!vma || addr < vma->vm_start) return -ENOMEM; - mincore_walk.mm = vma->vm_mm; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + if (!can_do_mincore(vma)) { + unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE); + memset(vec, 1, pages); + return pages; + } + mincore_walk.mm = vma->vm_mm; err = walk_page_range(addr, end, &mincore_walk); if (err < 0) return err; diff --git a/mm/mmap.c b/mm/mmap.c index 41eb48d9b527..2d6a6662edb9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -45,6 +45,7 @@ #include <linux/moduleparam.h> #include <linux/pkeys.h> #include <linux/oom.h> +#include <linux/sched/mm.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -2525,7 +2526,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + /* don't alter vm_end if the coredump is running */ + if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2551,6 +2553,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; + /* don't alter vm_start if the coredump is running */ + if (!mmget_still_valid(mm)) + return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; @@ -2730,9 +2735,17 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return -EINVAL; len = PAGE_ALIGN(len); + end = start + len; if (len == 0) return -EINVAL; + /* + * arch_unmap() might do unmaps itself. It must be called + * and finish any rbtree manipulation before this code + * runs and also starts to manipulate the rbtree. + */ + arch_unmap(mm, start, end); + /* Find the first overlapping VMA */ vma = find_vma(mm, start); if (!vma) @@ -2741,7 +2754,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, /* we have start < vma->vm_end */ /* if it doesn't overlap, we have nothing.. */ - end = start + len; if (vma->vm_start >= end) return 0; @@ -2811,12 +2823,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, /* Detach vmas from rbtree */ detach_vmas_to_be_unmapped(mm, vma, prev, end); - /* - * mpx unmap needs to be called with mmap_sem held for write. - * It is safe to call it before unmap_region(). - */ - arch_unmap(mm, vma, start, end); - if (downgrade) downgrade_write(&mm->mmap_sem); diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index f2f03c655807..99740e1dd273 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -11,7 +11,7 @@ #include <asm/pgalloc.h> #include <asm/tlb.h> -#ifdef HAVE_GENERIC_MMU_GATHER +#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER static bool tlb_next_batch(struct mmu_gather *tlb) { @@ -41,35 +41,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } -void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - - /* Is it from 0 to ~0? */ - tlb->fullmm = !(start | (end+1)); - tlb->need_flush_all = 0; - tlb->local.next = NULL; - tlb->local.nr = 0; - tlb->local.max = ARRAY_SIZE(tlb->__pages); - tlb->active = &tlb->local; - tlb->batch_count = 0; - -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb->batch = NULL; -#endif - tlb->page_size = 0; - - __tlb_reset_range(tlb); -} - -void tlb_flush_mmu_free(struct mmu_gather *tlb) +static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb_table_flush(tlb); -#endif for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; @@ -77,31 +52,10 @@ void tlb_flush_mmu_free(struct mmu_gather *tlb) tlb->active = &tlb->local; } -void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -/* tlb_finish_mmu - * Called at the end of the shootdown operation to free up any resources - * that were required. - */ -void arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) +static void tlb_batch_list_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch, *next; - if (force) { - __tlb_reset_range(tlb); - __tlb_adjust_range(tlb, start, end - start); - } - - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - for (batch = tlb->local.next; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); @@ -109,19 +63,15 @@ void arch_tlb_finish_mmu(struct mmu_gather *tlb, tlb->local.next = NULL; } -/* __tlb_remove_page - * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while - * handling the additional races in SMP caused by other CPUs caching valid - * mappings in their TLBs. Returns the number of free page slots left. - * When out of page slots we must call tlb_flush_mmu(). - *returns true if the caller should flush. - */ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); + +#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE VM_WARN_ON(tlb->page_size != page_size); +#endif batch = tlb->active; /* @@ -139,7 +89,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ return false; } -#endif /* HAVE_GENERIC_MMU_GATHER */ +#endif /* HAVE_MMU_GATHER_NO_GATHER */ #ifdef CONFIG_HAVE_RCU_TABLE_FREE @@ -152,7 +102,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ */ static inline void tlb_table_invalidate(struct mmu_gather *tlb) { -#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE +#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE /* * Invalidate page-table caches used by hardware walkers. Then we still * need to RCU-sched wait while freeing the pages because software @@ -193,7 +143,7 @@ static void tlb_remove_table_rcu(struct rcu_head *head) free_page((unsigned long)batch); } -void tlb_table_flush(struct mmu_gather *tlb) +static void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; @@ -225,6 +175,22 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ +static void tlb_flush_mmu_free(struct mmu_gather *tlb) +{ +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb_table_flush(tlb); +#endif +#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER + tlb_batch_pages_flush(tlb); +#endif +} + +void tlb_flush_mmu(struct mmu_gather *tlb) +{ + tlb_flush_mmu_tlbonly(tlb); + tlb_flush_mmu_free(tlb); +} + /** * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down * @tlb: the mmu_gather structure to initialize @@ -240,10 +206,40 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { - arch_tlb_gather_mmu(tlb, mm, start, end); + tlb->mm = mm; + + /* Is it from 0 to ~0? */ + tlb->fullmm = !(start | (end+1)); + +#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER + tlb->need_flush_all = 0; + tlb->local.next = NULL; + tlb->local.nr = 0; + tlb->local.max = ARRAY_SIZE(tlb->__pages); + tlb->active = &tlb->local; + tlb->batch_count = 0; +#endif + +#ifdef CONFIG_HAVE_RCU_TABLE_FREE + tlb->batch = NULL; +#endif +#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE + tlb->page_size = 0; +#endif + + __tlb_reset_range(tlb); inc_tlb_flush_pending(tlb->mm); } +/** + * tlb_finish_mmu - finish an mmu_gather structure + * @tlb: the mmu_gather structure to finish + * @start: start of the region that will be removed from the page-table + * @end: end of the region that will be removed from the page-table + * + * Called at the end of the shootdown operation to free up any resources that + * were required. + */ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { @@ -254,8 +250,17 @@ void tlb_finish_mmu(struct mmu_gather *tlb, * the TLB by observing pte_none|!pte_dirty, for example so flush TLB * forcefully if we detect parallel PTE batching threads. */ - bool force = mm_tlb_flush_nested(tlb->mm); + if (mm_tlb_flush_nested(tlb->mm)) { + __tlb_reset_range(tlb); + __tlb_adjust_range(tlb, start, end - start); + } - arch_tlb_finish_mmu(tlb, start, end, force); + tlb_flush_mmu(tlb); + + /* keep the page table cache within bounds */ + check_pgt_cache(); +#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER + tlb_batch_list_free(tlb); +#endif dec_tlb_flush_pending(tlb->mm); } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9c884abc7850..ee36068077b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -180,7 +180,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) if (_ret) { pr_info("%pS callback failed with %d in %sblockable context.\n", mn->ops->invalidate_range_start, _ret, - !range->blockable ? "non-" : ""); + !mmu_notifier_range_blockable(range) ? "non-" : ""); ret = _ret; } } @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); diff --git a/mm/mprotect.c b/mm/mprotect.c index 028c724dcb1a..bf38dfbbb4b4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -39,7 +39,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) { - struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; @@ -136,7 +135,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - set_pte_at(mm, addr, pte, newpte); + set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } @@ -150,7 +149,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, */ make_device_private_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_pte_at(mm, addr, pte, newpte); + set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } @@ -185,7 +184,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* invoke the mmu notifier if the pmd is populated */ if (!range.start) { - mmu_notifier_range_init(&range, vma->vm_mm, addr, end); + mmu_notifier_range_init(&range, + MMU_NOTIFY_PROTECTION_VMA, 0, + vma, vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mremap.c b/mm/mremap.c index e3edef6b7a12..fc241d23cd97 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + old_addr, old_end); mmu_notifier_invalidate_range_start(&range); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { diff --git a/mm/nommu.c b/mm/nommu.c index 749276beb109..b492fd1fcf9f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -473,6 +473,20 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages); + +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages_zero); + /* * sys_brk() for the most part doesn't need the global kernel * lock, except when an application is doing something nasty diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3a2484884cfd..539c91d0b26a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -531,7 +531,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_notifier_range range; struct mmu_gather tlb; - mmu_notifier_range_init(&range, mm, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm, range.start, range.end); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f61dfec6a1f..07656485c0e6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2808,6 +2808,18 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } EXPORT_SYMBOL(__test_set_page_writeback); +/* + * Wait for a page to complete writeback + */ +void wait_on_page_writeback(struct page *page) +{ + if (PageWriteback(page)) { + trace_wait_on_page_writeback(page, page_mapping(page)); + wait_on_page_bit(page, PG_writeback); + } +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback); + /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03fcf73d47da..3b13d3914176 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@ #include <linux/mempolicy.h> #include <linux/memremap.h> #include <linux/stop_machine.h> +#include <linux/random.h> #include <linux/sort.h> #include <linux/pfn.h> #include <linux/backing-dev.h> @@ -72,6 +73,7 @@ #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" +#include "shuffle.h" /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); @@ -266,7 +268,20 @@ compound_page_dtor * const compound_page_dtors[] = { int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +#ifdef CONFIG_DISCONTIGMEM +/* + * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges + * are not on separate NUMA nodes. Functionally this works but with + * watermark_boost_factor, it can reclaim prematurely as the ranges can be + * quite small. By default, do not boost watermarks on discontigmem as in + * many cases very high-order allocations like THP are likely to be + * unsupported and the premature reclaim offsets the advantage of long-term + * fragmentation avoidance. + */ +int watermark_boost_factor __read_mostly; +#else int watermark_boost_factor __read_mostly = 15000; +#endif int watermark_scale_factor = 10; static unsigned long nr_kernel_pages __initdata; @@ -742,12 +757,6 @@ static inline void set_page_order(struct page *page, unsigned int order) __SetPageBuddy(page); } -static inline void rmv_page_order(struct page *page) -{ - __ClearPageBuddy(page); - set_page_private(page, 0); -} - /* * This function checks whether a page is free && is the buddy * we can coalesce a page and its buddy if @@ -905,13 +914,10 @@ continue_merging: * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, * merge with it and move up one order. */ - if (page_is_guard(buddy)) { + if (page_is_guard(buddy)) clear_page_guard(zone, buddy, order, migratetype); - } else { - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); - } + else + del_page_from_free_area(buddy, &zone->free_area[order]); combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -953,7 +959,8 @@ done_merging: * so it's less likely to be used soon and more likely to be merged * as a higher order page */ - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) { + if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn) + && !is_shuffle_order(order)) { struct page *higher_page, *higher_buddy; combined_pfn = buddy_pfn & pfn; higher_page = page + (combined_pfn - pfn); @@ -961,15 +968,18 @@ done_merging: higher_buddy = higher_page + (buddy_pfn - combined_pfn); if (pfn_valid_within(buddy_pfn) && page_is_buddy(higher_page, higher_buddy, order + 1)) { - list_add_tail(&page->lru, - &zone->free_area[order].free_list[migratetype]); - goto out; + add_to_free_area_tail(page, &zone->free_area[order], + migratetype); + return; } } - list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); -out: - zone->free_area[order].nr_free++; + if (is_shuffle_order(order)) + add_to_free_area_random(page, &zone->free_area[order], + migratetype); + else + add_to_free_area(page, &zone->free_area[order], migratetype); + } /* @@ -1131,7 +1141,9 @@ static __always_inline bool free_pages_prepare(struct page *page, } arch_free_page(page, order); kernel_poison_pages(page, 1 << order, 0); - kernel_map_pages(page, 1 << order, 0); + if (debug_pagealloc_enabled()) + kernel_map_pages(page, 1 << order, 0); + kasan_free_nondeferred_pages(page, order); return true; @@ -1401,36 +1413,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { int nid; - nid = __early_pfn_to_nid(pfn, state); + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid >= 0 && nid != node) return false; return true; } -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); -} - #else - static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) -{ - return true; -} #endif @@ -1559,21 +1557,13 @@ static inline void __init pgdat_init_report_one_done(void) * * Then, we check if a current large page is valid by only checking the validity * of the head pfn. - * - * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not belong - * to this memory node. */ -static inline bool __init -deferred_pfn_valid(int nid, unsigned long pfn, - struct mminit_pfnnid_cache *nid_init_state) +static inline bool __init deferred_pfn_valid(unsigned long pfn) { if (!pfn_valid_within(pfn)) return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; - if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) - return false; return true; } @@ -1581,15 +1571,14 @@ deferred_pfn_valid(int nid, unsigned long pfn, * Free pages to buddy allocator. Try to free aligned pages in * pageblock_nr_pages sizes. */ -static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, +static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; } else if (!(pfn & nr_pgmask)) { @@ -1609,17 +1598,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, * by performing it only once every pageblock_nr_pages. * Return number of pages initialized. */ -static unsigned long __init deferred_init_pages(int nid, int zid, +static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; + int nid = zone_to_nid(zone); unsigned long nr_pages = 0; + int zid = zone_idx(zone); struct page *page = NULL; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { page = NULL; continue; } else if (!page || !(pfn & nr_pgmask)) { @@ -1634,18 +1624,100 @@ static unsigned long __init deferred_init_pages(int nid, int zid, return (nr_pages); } +/* + * This function is meant to pre-load the iterator for the zone init. + * Specifically it walks through the ranges until we are caught up to the + * first_init_pfn value and exits there. If we never encounter the value we + * return false indicating there are no valid ranges left. + */ +static bool __init +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, + unsigned long *spfn, unsigned long *epfn, + unsigned long first_init_pfn) +{ + u64 j; + + /* + * Start out by walking through the ranges in this zone that have + * already been initialized. We don't need to do anything with them + * so we just need to flush them out of the system. + */ + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { + if (*epfn <= first_init_pfn) + continue; + if (*spfn < first_init_pfn) + *spfn = first_init_pfn; + *i = j; + return true; + } + + return false; +} + +/* + * Initialize and free pages. We do it in two loops: first we initialize + * struct page, then free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + * + * In order to try and keep some memory in the cache we have the loop + * broken along max page order boundaries. This way we will not cause + * any issues with the buddy page computation. + */ +static unsigned long __init +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); + unsigned long spfn = *start_pfn, epfn = *end_pfn; + unsigned long nr_pages = 0; + u64 j = *i; + + /* First we loop through and initialize the page values */ + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { + unsigned long t; + + if (mo_pfn <= *start_pfn) + break; + + t = min(mo_pfn, *end_pfn); + nr_pages += deferred_init_pages(zone, *start_pfn, t); + + if (mo_pfn < *end_pfn) { + *start_pfn = mo_pfn; + break; + } + } + + /* Reset values and now loop through freeing pages as needed */ + swap(j, *i); + + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { + unsigned long t; + + if (mo_pfn <= spfn) + break; + + t = min(mo_pfn, epfn); + deferred_free_pages(spfn, t); + + if (mo_pfn <= epfn) + break; + } + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; - int nid = pgdat->node_id; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long first_init_pfn, flags; unsigned long start = jiffies; - unsigned long nr_pages = 0; - unsigned long spfn, epfn, first_init_pfn, flags; - phys_addr_t spa, epa; - int zid; struct zone *zone; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + int zid; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1671,31 +1743,27 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_init_pfn)) + goto zone_empty; /* - * Initialize and free pages. We do it in two loops: first we initialize - * struct page, than free to buddy allocator, because while we are - * freeing pages we can access pages that are ahead (computing buddy - * page in __free_one_page()). + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. */ - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_pages(nid, zid, spfn, epfn); - } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); - } + while (spfn < epfn) + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); +zone_empty: pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, - jiffies_to_msecs(jiffies - start)); + pr_info("node %d initialised, %lu pages in %ums\n", + pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1719,14 +1787,11 @@ static int __init deferred_init_memmap(void *data) static noinline bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - int zid = zone_idx(zone); - int nid = zone_to_nid(zone); - pg_data_t *pgdat = NODE_DATA(nid); unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); - unsigned long nr_pages = 0; - unsigned long first_init_pfn, spfn, epfn, t, flags; + pg_data_t *pgdat = zone->zone_pgdat; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; - phys_addr_t spa, epa; + unsigned long spfn, epfn, flags; + unsigned long nr_pages = 0; u64 i; /* Only the last zone may have deferred pages */ @@ -1755,38 +1820,35 @@ deferred_grow_zone(struct zone *zone, unsigned int order) return true; } - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); - - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_deferred_pfn)) { + pgdat->first_deferred_pfn = ULONG_MAX; pgdat_resize_unlock(pgdat, &flags); - return false; + return true; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + /* + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. + */ + while (spfn < epfn) { + /* update our first deferred PFN for this section */ + first_deferred_pfn = spfn; - while (spfn < epfn && nr_pages < nr_pages_needed) { - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); - first_deferred_pfn = min(t, epfn); - nr_pages += deferred_init_pages(nid, zid, spfn, - first_deferred_pfn); - spfn = first_deferred_pfn; - } + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); + + /* We should only stop along section boundaries */ + if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) + continue; + /* If our quota has been met we can stop here */ if (nr_pages >= nr_pages_needed) break; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); - - if (first_deferred_pfn == epfn) - break; - } - pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat->first_deferred_pfn = spfn; pgdat_resize_unlock(pgdat, &flags); return nr_pages > 0; @@ -1809,9 +1871,9 @@ _deferred_grow_zone(struct zone *zone, unsigned int order) void __init page_alloc_init_late(void) { struct zone *zone; + int nid; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - int nid; /* There will be num_node_state(N_MEMORY) threads */ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); @@ -1831,10 +1893,12 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ memblock_discard(); -#endif + + for_each_node_state(nid, N_MEMORY) + shuffle_free_memory(NODE_DATA(nid)); for_each_populated_zone(zone) set_zone_contiguous(zone); @@ -1906,8 +1970,7 @@ static inline void expand(struct zone *zone, struct page *page, if (set_page_guard(zone, &page[size], high, migratetype)) continue; - list_add(&page[size].lru, &area->free_list[migratetype]); - area->nr_free++; + add_to_free_area(&page[size], area, migratetype); set_page_order(&page[size], high); } } @@ -1922,7 +1985,7 @@ static void check_new_page_bad(struct page *page) if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; bad_flags = __PG_HWPOISON; @@ -2001,7 +2064,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_refcounted(page); arch_alloc_page(page, order); - kernel_map_pages(page, 1 << order, 1); + if (debug_pagealloc_enabled()) + kernel_map_pages(page, 1 << order, 1); kasan_alloc_pages(page, order); kernel_poison_pages(page, 1 << order, 1); set_page_owner(page, order, gfp_flags); @@ -2048,13 +2112,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, /* Find a page of the appropriate size in the preferred list */ for (current_order = order; current_order < MAX_ORDER; ++current_order) { area = &(zone->free_area[current_order]); - page = list_first_entry_or_null(&area->free_list[migratetype], - struct page, lru); + page = get_page_from_free_area(area, migratetype); if (!page) continue; - list_del(&page->lru); - rmv_page_order(page); - area->nr_free--; + del_page_from_free_area(page, area); expand(zone, page, order, current_order, area, migratetype); set_pcppage_migratetype(page, migratetype); return page; @@ -2140,8 +2201,7 @@ static int move_freepages(struct zone *zone, } order = page_order(page); - list_move(&page->lru, - &zone->free_area[order].free_list[migratetype]); + move_to_free_area(page, &zone->free_area[order], migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -2329,7 +2389,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, single_page: area = &zone->free_area[current_order]; - list_move(&page->lru, &area->free_list[start_type]); + move_to_free_area(page, area, start_type); } /* @@ -2353,7 +2413,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, if (fallback_mt == MIGRATE_TYPES) break; - if (list_empty(&area->free_list[fallback_mt])) + if (free_area_empty(area, fallback_mt)) continue; if (can_steal_fallback(order, migratetype)) @@ -2440,9 +2500,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, for (order = 0; order < MAX_ORDER; order++) { struct free_area *area = &(zone->free_area[order]); - page = list_first_entry_or_null( - &area->free_list[MIGRATE_HIGHATOMIC], - struct page, lru); + page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); if (!page) continue; @@ -2565,8 +2623,7 @@ find_smallest: VM_BUG_ON(current_order == MAX_ORDER); do_steal: - page = list_first_entry(&area->free_list[fallback_mt], - struct page, lru); + page = get_page_from_free_area(area, fallback_mt); steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, can_steal); @@ -3003,6 +3060,7 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { + struct free_area *area = &page_zone(page)->free_area[order]; unsigned long watermark; struct zone *zone; int mt; @@ -3027,9 +3085,8 @@ int __isolate_free_page(struct page *page, unsigned int order) } /* Remove page from free list */ - list_del(&page->lru); - zone->free_area[order].nr_free--; - rmv_page_order(page); + + del_page_from_free_area(page, area); /* * Set the pageblock if the isolated page is at least half of a @@ -3104,9 +3161,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, - struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype, - unsigned int alloc_flags) + struct zone *zone, gfp_t gfp_flags, + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3118,7 +3174,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } local_irq_restore(flags); @@ -3138,8 +3194,8 @@ struct page *rmqueue(struct zone *preferred_zone, struct page *page; if (likely(order == 0)) { - page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, + migratetype, alloc_flags); goto out; } @@ -3327,13 +3383,13 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, continue; for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { - if (!list_empty(&area->free_list[mt])) + if (!free_area_empty(area, mt)) return true; } #ifdef CONFIG_CMA if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + !free_area_empty(area, MIGRATE_CMA)) { return true; } #endif @@ -3419,8 +3475,11 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) alloc_flags |= ALLOC_KSWAPD; #ifdef CONFIG_ZONE_DMA32 + if (!zone) + return alloc_flags; + if (zone_idx(zone) != ZONE_NORMAL) - goto out; + return alloc_flags; /* * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and @@ -3429,9 +3488,9 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) */ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); if (nr_online_nodes > 1 && !populated_zone(--zone)) - goto out; + return alloc_flags; -out: + alloc_flags |= ALLOC_NOFRAGMENT; #endif /* CONFIG_ZONE_DMA32 */ return alloc_flags; } @@ -3773,11 +3832,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); - if (*compact_result <= COMPACT_INACTIVE) { - WARN_ON_ONCE(page); - return NULL; - } - /* * At least in one zone compaction wasn't deferred or skipped, so let's * count a compaction stall @@ -4807,7 +4861,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * This function is similar to alloc_pages(), except that it allocates the * minimum number of pages to satisfy the request. alloc_pages() can only @@ -4824,6 +4878,9 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); unsigned long addr; + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) + gfp_mask &= ~__GFP_COMP; + addr = __get_free_pages(gfp_mask, order); return make_alloc_exact(addr, order, size); } @@ -4834,7 +4891,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * pages on a node. * @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. @@ -4844,7 +4901,12 @@ EXPORT_SYMBOL(alloc_pages_exact); void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); - struct page *p = alloc_pages_node(nid, gfp_mask, order); + struct page *p; + + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) + gfp_mask &= ~__GFP_COMP; + + p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); @@ -5254,7 +5316,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) types[order] = 0; for (type = 0; type < MIGRATE_TYPES; type++) { - if (!list_empty(&area->free_list[type])) + if (!free_area_empty(area, type)) types[order] |= 1 << type; } } @@ -6233,13 +6295,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long *zone_end_pfn, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, zone_start_pfn, zone_end_pfn); @@ -8005,7 +8069,10 @@ void *__init alloc_large_system_hash(const char *tablename, bool has_unmovable_pages(struct zone *zone, struct page *page, int count, int migratetype, int flags) { - unsigned long pfn, iter, found; + unsigned long found; + unsigned long iter = 0; + unsigned long pfn = page_to_pfn(page); + const char *reason = "unmovable page"; /* * TODO we could make this much more efficient by not checking every @@ -8015,17 +8082,20 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * can still lead to having bootmem allocations in zone_movable. */ - /* - * CMA allocations (alloc_contig_range) really need to mark isolate - * CMA pageblocks even when they are not movable in fact so consider - * them movable here. - */ - if (is_migrate_cma(migratetype) && - is_migrate_cma(get_pageblock_migratetype(page))) - return false; + if (is_migrate_cma_page(page)) { + /* + * CMA allocations (alloc_contig_range) really need to mark + * isolate CMA pageblocks even when they are not movable in fact + * so consider them movable here. + */ + if (is_migrate_cma(migratetype)) + return false; + + reason = "CMA page"; + goto unmovable; + } - pfn = page_to_pfn(page); - for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + for (found = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; if (!pfn_valid_within(check)) @@ -8105,12 +8175,11 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, unmovable: WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); if (flags & REPORT_FAILURE) - dump_page(pfn_to_page(pfn+iter), "unmovable page"); + dump_page(pfn_to_page(pfn + iter), reason); return true; } -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) - +#ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, @@ -8233,7 +8302,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = start_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype, 0); - if (ret) + if (ret < 0) return ret; /* @@ -8319,8 +8388,9 @@ done: pfn_max_align_up(end), migratetype); return ret; } +#endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned nr_pages) +void free_contig_range(unsigned long pfn, unsigned int nr_pages) { unsigned int count = 0; @@ -8332,7 +8402,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } WARN(count != 0, "%d pages are still in use!\n", count); } -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* @@ -8374,7 +8443,7 @@ void zone_pcp_reset(struct zone *zone) * All pages in the range must be in a single zone and isolated * before calling this. */ -void +unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; @@ -8382,12 +8451,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned int order, i; unsigned long pfn; unsigned long flags; + unsigned long offlined_pages = 0; + /* find the first valid pfn */ for (pfn = start_pfn; pfn < end_pfn; pfn++) if (pfn_valid(pfn)) break; if (pfn == end_pfn) - return; + return offlined_pages; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); @@ -8405,24 +8477,26 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; SetPageReserved(page); + offlined_pages++; continue; } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); order = page_order(page); + offlined_pages += 1 << order; #ifdef CONFIG_DEBUG_VM pr_info("remove from free list %lx %d %lx\n", pfn, 1 << order, end_pfn); #endif - list_del(&page->lru); - rmv_page_order(page); - zone->free_area[order].nr_free--; + del_page_from_free_area(page, &zone->free_area[order]); for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); + + return offlined_pages; } #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index ce323e56b34d..e3638a5bafff 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -59,7 +59,8 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, flags)) + if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, + isol_flags)) ret = 0; /* @@ -150,8 +151,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) for (i = 0; i < nr_pages; i++) { struct page *page; - if (!pfn_valid_within(pfn + i)) - continue; page = pfn_to_online_page(pfn + i); if (!page) continue; @@ -160,27 +159,36 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) return NULL; } -/* - * start_isolate_page_range() -- make page-allocation-type of range of pages - * to be MIGRATE_ISOLATE. - * @start_pfn: The lower PFN of the range to be isolated. - * @end_pfn: The upper PFN of the range to be isolated. - * @migratetype: migrate type to set in error recovery. +/** + * start_isolate_page_range() - make page-allocation-type of range of pages to + * be MIGRATE_ISOLATE. + * @start_pfn: The lower PFN of the range to be isolated. + * @end_pfn: The upper PFN of the range to be isolated. + * start_pfn/end_pfn must be aligned to pageblock_order. + * @migratetype: Migrate type to set in error recovery. + * @flags: The following flags are allowed (they can be combined in + * a bit mask) + * SKIP_HWPOISON - ignore hwpoison pages + * REPORT_FAILURE - report details about the failure to + * isolate the range * * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in * the range will never be allocated. Any free pages and pages freed in the - * future will not be allocated again. - * - * start_pfn/end_pfn must be aligned to pageblock_order. - * Return 0 on success and -EBUSY if any part of range cannot be isolated. + * future will not be allocated again. If specified range includes migrate types + * other than MOVABLE or CMA, this will fail with -EBUSY. For isolating all + * pages in the range finally, the caller have to free all pages in the range. + * test_page_isolated() can be used for test it. * * There is no high level synchronization mechanism that prevents two threads - * from trying to isolate overlapping ranges. If this happens, one thread + * from trying to isolate overlapping ranges. If this happens, one thread * will notice pageblocks in the overlapping range already set to isolate. * This happens in set_migratetype_isolate, and set_migratetype_isolate - * returns an error. We then clean up by restoring the migration type on - * pageblocks we may have modified and return -EBUSY to caller. This + * returns an error. We then clean up by restoring the migration type on + * pageblocks we may have modified and return -EBUSY to caller. This * prevents two threads from simultaneously working on overlapping ranges. + * + * Return: the number of isolated pageblocks on success and -EBUSY if any part + * of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, int flags) @@ -188,6 +196,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; unsigned long undo_pfn; struct page *page; + int nr_isolate_pageblock = 0; BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); @@ -196,13 +205,15 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && - set_migratetype_isolate(page, migratetype, flags)) { - undo_pfn = pfn; - goto undo; + if (page) { + if (set_migratetype_isolate(page, migratetype, flags)) { + undo_pfn = pfn; + goto undo; + } + nr_isolate_pageblock++; } } - return 0; + return nr_isolate_pageblock; undo: for (pfn = start_pfn; pfn < undo_pfn; diff --git a/mm/page_owner.c b/mm/page_owner.c index 925b6f44a444..addcbb2ae4e4 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -58,15 +58,10 @@ static bool need_page_owner(void) static __always_inline depot_stack_handle_t create_dummy_stack(void) { unsigned long entries[4]; - struct stack_trace dummy; + unsigned int nr_entries; - dummy.nr_entries = 0; - dummy.max_entries = ARRAY_SIZE(entries); - dummy.entries = &entries[0]; - dummy.skip = 0; - - save_stack_trace(&dummy); - return depot_save_stack(&dummy, GFP_KERNEL); + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); + return stack_depot_save(entries, nr_entries, GFP_KERNEL); } static noinline void register_dummy_stack(void) @@ -120,49 +115,39 @@ void __reset_page_owner(struct page *page, unsigned int order) } } -static inline bool check_recursive_alloc(struct stack_trace *trace, - unsigned long ip) +static inline bool check_recursive_alloc(unsigned long *entries, + unsigned int nr_entries, + unsigned long ip) { - int i; - - if (!trace->nr_entries) - return false; + unsigned int i; - for (i = 0; i < trace->nr_entries; i++) { - if (trace->entries[i] == ip) + for (i = 0; i < nr_entries; i++) { + if (entries[i] == ip) return true; } - return false; } static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = PAGE_OWNER_STACK_DEPTH, - .skip = 2 - }; depot_stack_handle_t handle; + unsigned int nr_entries; - save_stack_trace(&trace); - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries-1] == ULONG_MAX) - trace.nr_entries--; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); /* - * We need to check recursion here because our request to stackdepot - * could trigger memory allocation to save new entry. New memory - * allocation would reach here and call depot_save_stack() again - * if we don't catch it. There is still not enough memory in stackdepot - * so it would try to allocate memory again and loop forever. + * We need to check recursion here because our request to + * stackdepot could trigger memory allocation to save new + * entry. New memory allocation would reach here and call + * stack_depot_save_entries() again if we don't catch it. There is + * still not enough memory in stackdepot so it would try to + * allocate memory again and loop forever. */ - if (check_recursive_alloc(&trace, _RET_IP_)) + if (check_recursive_alloc(entries, nr_entries, _RET_IP_)) return dummy_handle; - handle = depot_save_stack(&trace, flags); + handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; @@ -340,16 +325,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, depot_stack_handle_t handle) { - int ret; - int pageblock_mt, page_mt; + int ret, pageblock_mt, page_mt; + unsigned long *entries; + unsigned int nr_entries; char *kbuf; - unsigned long entries[PAGE_OWNER_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = PAGE_OWNER_STACK_DEPTH, - .skip = 0 - }; count = min_t(size_t, count, PAGE_SIZE); kbuf = kmalloc(count, GFP_KERNEL); @@ -378,8 +357,8 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (ret >= count) goto err; - depot_fetch_stack(handle, &trace); - ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0); + nr_entries = stack_depot_fetch(handle, &entries); + ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0); if (ret >= count) goto err; @@ -410,14 +389,9 @@ void __dump_page_owner(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); struct page_owner *page_owner; - unsigned long entries[PAGE_OWNER_STACK_DEPTH]; - struct stack_trace trace = { - .nr_entries = 0, - .entries = entries, - .max_entries = PAGE_OWNER_STACK_DEPTH, - .skip = 0 - }; depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; gfp_t gfp_mask; int mt; @@ -441,10 +415,10 @@ void __dump_page_owner(struct page *page) return; } - depot_fetch_stack(handle, &trace); + nr_entries = stack_depot_fetch(handle, &entries); pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask); - print_stack_trace(&trace, 0); + stack_trace_print(entries, nr_entries, 0); if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index b1739dc06b73..0468ba500bd4 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -9,8 +9,17 @@ * pcpu_block_md is the metadata block struct. * Each chunk's bitmap is split into a number of full blocks. * All units are in terms of bits. + * + * The scan hint is the largest known contiguous area before the contig hint. + * It is not necessarily the actual largest contig hint though. There is an + * invariant that the scan_hint_start > contig_hint_start iff + * scan_hint == contig_hint. This is necessary because when scanning forward, + * we don't know if a new contig hint would be better than the current one. */ struct pcpu_block_md { + int scan_hint; /* scan hint for block */ + int scan_hint_start; /* block relative starting + position of the scan hint */ int contig_hint; /* contig hint for block */ int contig_hint_start; /* block relative starting position of the contig hint */ @@ -19,6 +28,7 @@ struct pcpu_block_md { int right_free; /* size of free space along the right side of the block */ int first_free; /* block position of first free */ + int nr_bits; /* total bits responsible for */ }; struct pcpu_chunk { @@ -29,9 +39,7 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_bytes; /* free bytes in the chunk */ - int contig_bits; /* max contiguous size hint */ - int contig_bits_start; /* contig_bits starting - offset */ + struct pcpu_block_md chunk_md; void *base_addr; /* base address of this chunk */ unsigned long *alloc_map; /* allocation map */ @@ -39,7 +47,6 @@ struct pcpu_chunk { struct pcpu_block_md *md_blocks; /* metadata blocks */ void *data; /* chunk data */ - int first_bit; /* no free below this */ bool immutable; /* no [de]population allowed */ int start_offset; /* the overlap with the previous region to have a page aligned diff --git a/mm/percpu-km.c b/mm/percpu-km.c index b68d5df14731..3a2ff5c9192c 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -70,7 +70,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) chunk->base_addr = page_address(pages); spin_lock_irqsave(&pcpu_lock, flags); - pcpu_chunk_populated(chunk, 0, nr_pages, false); + pcpu_chunk_populated(chunk, 0, nr_pages); spin_unlock_irqrestore(&pcpu_lock, flags); pcpu_stats_chunk_alloc(); diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index b5fdd43b60c9..ef5034a0464e 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -53,6 +53,7 @@ static int find_max_nr_alloc(void) static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, int *buffer) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; int i, last_alloc, as_len, start, end; int *alloc_sizes, *p; /* statistics */ @@ -121,9 +122,9 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk, P("nr_alloc", chunk->nr_alloc); P("max_alloc_size", chunk->max_alloc_size); P("empty_pop_pages", chunk->nr_empty_pop_pages); - P("first_bit", chunk->first_bit); + P("first_bit", chunk_md->first_free); P("free_bytes", chunk->free_bytes); - P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE); + P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); P("sum_frag", sum_frag); P("max_frag", max_frag); P("cur_min_alloc", cur_min_alloc); diff --git a/mm/percpu.c b/mm/percpu.c index 2e6fc8d552c9..2df0ee680ea6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,6 +94,8 @@ /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */ #define PCPU_SLOT_BASE_SHIFT 5 +/* chunks in slots below this are subject to being sidelined on failed alloc */ +#define PCPU_SLOT_FAIL_THRESHOLD 3 #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 @@ -231,10 +233,13 @@ static int pcpu_size_to_slot(int size) static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { - if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0) + const struct pcpu_block_md *chunk_md = &chunk->chunk_md; + + if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || + chunk_md->contig_hint == 0) return 0; - return pcpu_size_to_slot(chunk->free_bytes); + return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ @@ -318,6 +323,34 @@ static unsigned long pcpu_block_off_to_off(int index, int off) return index * PCPU_BITMAP_BLOCK_BITS + off; } +/* + * pcpu_next_hint - determine which hint to use + * @block: block of interest + * @alloc_bits: size of allocation + * + * This determines if we should scan based on the scan_hint or first_free. + * In general, we want to scan from first_free to fulfill allocations by + * first fit. However, if we know a scan_hint at position scan_hint_start + * cannot fulfill an allocation, we can begin scanning from there knowing + * the contig_hint will be our fallback. + */ +static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) +{ + /* + * The three conditions below determine if we can skip past the + * scan_hint. First, does the scan hint exist. Second, is the + * contig_hint after the scan_hint (possibly not true iff + * contig_hint == scan_hint). Third, is the allocation request + * larger than the scan_hint. + */ + if (block->scan_hint && + block->contig_hint_start > block->scan_hint_start && + alloc_bits > block->scan_hint) + return block->scan_hint_start + block->scan_hint; + + return block->first_free; +} + /** * pcpu_next_md_free_region - finds the next hint free area * @chunk: chunk of interest @@ -413,9 +446,11 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, if (block->contig_hint && block->contig_hint_start >= block_off && block->contig_hint >= *bits + alloc_bits) { + int start = pcpu_next_hint(block, alloc_bits); + *bits += alloc_bits + block->contig_hint_start - - block->first_free; - *bit_off = pcpu_block_off_to_off(i, block->first_free); + start; + *bit_off = pcpu_block_off_to_off(i, start); return; } /* reset to satisfy the second predicate above */ @@ -488,6 +523,22 @@ static void pcpu_mem_free(void *ptr) kvfree(ptr); } +static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, + bool move_front) +{ + if (chunk != pcpu_reserved_chunk) { + if (move_front) + list_move(&chunk->list, &pcpu_slot[slot]); + else + list_move_tail(&chunk->list, &pcpu_slot[slot]); + } +} + +static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) +{ + __pcpu_chunk_move(chunk, slot, true); +} + /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest @@ -505,110 +556,39 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); - if (chunk != pcpu_reserved_chunk && oslot != nslot) { - if (oslot < nslot) - list_move(&chunk->list, &pcpu_slot[nslot]); - else - list_move_tail(&chunk->list, &pcpu_slot[nslot]); - } + if (oslot != nslot) + __pcpu_chunk_move(chunk, nslot, oslot < nslot); } -/** - * pcpu_cnt_pop_pages- counts populated backing pages in range +/* + * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest - * @bit_off: start offset - * @bits: size of area to check - * - * Calculates the number of populated pages in the region - * [page_start, page_end). This keeps track of how many empty populated - * pages are available and decide if async work should be scheduled. + * @nr: nr of empty pages * - * RETURNS: - * The nr of populated pages. + * This is used to keep track of the empty pages now based on the premise + * a md_block covers a page. The hint update functions recognize if a block + * is made full or broken to calculate deltas for keeping track of free pages. */ -static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off, - int bits) +static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { - int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE); - int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); - - if (page_start >= page_end) - return 0; - - /* - * bitmap_weight counts the number of bits set in a bitmap up to - * the specified number of bits. This is counting the populated - * pages up to page_end and then subtracting the populated pages - * up to page_start to count the populated pages in - * [page_start, page_end). - */ - return bitmap_weight(chunk->populated, page_end) - - bitmap_weight(chunk->populated, page_start); -} - -/** - * pcpu_chunk_update - updates the chunk metadata given a free area - * @chunk: chunk of interest - * @bit_off: chunk offset - * @bits: size of free area - * - * This updates the chunk's contig hint and starting offset given a free area. - * Choose the best starting offset if the contig hint is equal. - */ -static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits) -{ - if (bits > chunk->contig_bits) { - chunk->contig_bits_start = bit_off; - chunk->contig_bits = bits; - } else if (bits == chunk->contig_bits && chunk->contig_bits_start && - (!bit_off || - __ffs(bit_off) > __ffs(chunk->contig_bits_start))) { - /* use the start with the best alignment */ - chunk->contig_bits_start = bit_off; - } + chunk->nr_empty_pop_pages += nr; + if (chunk != pcpu_reserved_chunk) + pcpu_nr_empty_pop_pages += nr; } -/** - * pcpu_chunk_refresh_hint - updates metadata about a chunk - * @chunk: chunk of interest - * - * Iterates over the metadata blocks to find the largest contig area. - * It also counts the populated pages and uses the delta to update the - * global count. - * - * Updates: - * chunk->contig_bits - * chunk->contig_bits_start - * nr_empty_pop_pages (chunk and global) +/* + * pcpu_region_overlap - determines if two regions overlap + * @a: start of first region, inclusive + * @b: end of first region, exclusive + * @x: start of second region, inclusive + * @y: end of second region, exclusive + * + * This is used to determine if the hint region [a, b) overlaps with the + * allocated region [x, y). */ -static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk) +static inline bool pcpu_region_overlap(int a, int b, int x, int y) { - int bit_off, bits, nr_empty_pop_pages; - - /* clear metadata */ - chunk->contig_bits = 0; - - bit_off = chunk->first_bit; - bits = nr_empty_pop_pages = 0; - pcpu_for_each_md_free_region(chunk, bit_off, bits) { - pcpu_chunk_update(chunk, bit_off, bits); - - nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits); - } - - /* - * Keep track of nr_empty_pop_pages. - * - * The chunk maintains the previous number of free pages it held, - * so the delta is used to update the global counter. The reserved - * chunk is not part of the free page count as they are populated - * at init and are special to serving reserved allocations. - */ - if (chunk != pcpu_reserved_chunk) - pcpu_nr_empty_pop_pages += - (nr_empty_pop_pages - chunk->nr_empty_pop_pages); - - chunk->nr_empty_pop_pages = nr_empty_pop_pages; + return (a < y) && (x < b); } /** @@ -629,16 +609,132 @@ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) if (start == 0) block->left_free = contig; - if (end == PCPU_BITMAP_BLOCK_BITS) + if (end == block->nr_bits) block->right_free = contig; if (contig > block->contig_hint) { + /* promote the old contig_hint to be the new scan_hint */ + if (start > block->contig_hint_start) { + if (block->contig_hint > block->scan_hint) { + block->scan_hint_start = + block->contig_hint_start; + block->scan_hint = block->contig_hint; + } else if (start < block->scan_hint_start) { + /* + * The old contig_hint == scan_hint. But, the + * new contig is larger so hold the invariant + * scan_hint_start < contig_hint_start. + */ + block->scan_hint = 0; + } + } else { + block->scan_hint = 0; + } block->contig_hint_start = start; block->contig_hint = contig; - } else if (block->contig_hint_start && contig == block->contig_hint && - (!start || __ffs(start) > __ffs(block->contig_hint_start))) { - /* use the start with the best alignment */ - block->contig_hint_start = start; + } else if (contig == block->contig_hint) { + if (block->contig_hint_start && + (!start || + __ffs(start) > __ffs(block->contig_hint_start))) { + /* start has a better alignment so use it */ + block->contig_hint_start = start; + if (start < block->scan_hint_start && + block->contig_hint > block->scan_hint) + block->scan_hint = 0; + } else if (start > block->scan_hint_start || + block->contig_hint > block->scan_hint) { + /* + * Knowing contig == contig_hint, update the scan_hint + * if it is farther than or larger than the current + * scan_hint. + */ + block->scan_hint_start = start; + block->scan_hint = contig; + } + } else { + /* + * The region is smaller than the contig_hint. So only update + * the scan_hint if it is larger than or equal and farther than + * the current scan_hint. + */ + if ((start < block->contig_hint_start && + (contig > block->scan_hint || + (contig == block->scan_hint && + start > block->scan_hint_start)))) { + block->scan_hint_start = start; + block->scan_hint = contig; + } + } +} + +/* + * pcpu_block_update_scan - update a block given a free area from a scan + * @chunk: chunk of interest + * @bit_off: chunk offset + * @bits: size of free area + * + * Finding the final allocation spot first goes through pcpu_find_block_fit() + * to find a block that can hold the allocation and then pcpu_alloc_area() + * where a scan is used. When allocations require specific alignments, + * we can inadvertently create holes which will not be seen in the alloc + * or free paths. + * + * This takes a given free area hole and updates a block as it may change the + * scan_hint. We need to scan backwards to ensure we don't miss free bits + * from alignment. + */ +static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, + int bits) +{ + int s_off = pcpu_off_to_block_off(bit_off); + int e_off = s_off + bits; + int s_index, l_bit; + struct pcpu_block_md *block; + + if (e_off > PCPU_BITMAP_BLOCK_BITS) + return; + + s_index = pcpu_off_to_block_index(bit_off); + block = chunk->md_blocks + s_index; + + /* scan backwards in case of alignment skipping free bits */ + l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off); + s_off = (s_off == l_bit) ? 0 : l_bit + 1; + + pcpu_block_update(block, s_off, e_off); +} + +/** + * pcpu_chunk_refresh_hint - updates metadata about a chunk + * @chunk: chunk of interest + * @full_scan: if we should scan from the beginning + * + * Iterates over the metadata blocks to find the largest contig area. + * A full scan can be avoided on the allocation path as this is triggered + * if we broke the contig_hint. In doing so, the scan_hint will be before + * the contig_hint or after if the scan_hint == contig_hint. This cannot + * be prevented on freeing as we want to find the largest area possibly + * spanning blocks. + */ +static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) +{ + struct pcpu_block_md *chunk_md = &chunk->chunk_md; + int bit_off, bits; + + /* promote scan_hint to contig_hint */ + if (!full_scan && chunk_md->scan_hint) { + bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; + chunk_md->contig_hint_start = chunk_md->scan_hint_start; + chunk_md->contig_hint = chunk_md->scan_hint; + chunk_md->scan_hint = 0; + } else { + bit_off = chunk_md->first_free; + chunk_md->contig_hint = 0; + } + + bits = 0; + pcpu_for_each_md_free_region(chunk, bit_off, bits) { + pcpu_block_update(chunk_md, bit_off, bit_off + bits); } } @@ -654,14 +750,23 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); - int rs, re; /* region start, region end */ + int rs, re, start; /* region start, region end */ + + /* promote scan_hint to contig_hint */ + if (block->scan_hint) { + start = block->scan_hint_start + block->scan_hint; + block->contig_hint_start = block->scan_hint_start; + block->contig_hint = block->scan_hint; + block->scan_hint = 0; + } else { + start = block->first_free; + block->contig_hint = 0; + } - /* clear hints */ - block->contig_hint = 0; - block->left_free = block->right_free = 0; + block->right_free = 0; /* iterate over free areas and update the contig hints */ - pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free, + pcpu_for_each_unpop_region(alloc_map, rs, re, start, PCPU_BITMAP_BLOCK_BITS) { pcpu_block_update(block, rs, re); } @@ -680,6 +785,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; + int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ @@ -704,15 +811,29 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * If the allocation breaks the contig_hint, a scan is required to * restore this hint. */ + if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) + nr_empty_pages++; + if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), PCPU_BITMAP_BLOCK_BITS, s_off + bits); - if (s_off >= s_block->contig_hint_start && - s_off < s_block->contig_hint_start + s_block->contig_hint) { + if (pcpu_region_overlap(s_block->scan_hint_start, + s_block->scan_hint_start + s_block->scan_hint, + s_off, + s_off + bits)) + s_block->scan_hint = 0; + + if (pcpu_region_overlap(s_block->contig_hint_start, + s_block->contig_hint_start + + s_block->contig_hint, + s_off, + s_off + bits)) { /* block contig hint is broken - scan to fix it */ + if (!s_off) + s_block->left_free = 0; pcpu_block_refresh_hint(chunk, s_index); } else { /* update left and right contig manually */ @@ -728,6 +849,9 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * Update e_block. */ if (s_index != e_index) { + if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) + nr_empty_pages++; + /* * When the allocation is across blocks, the end is along * the left part of the e_block. @@ -740,11 +864,14 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, /* reset the block */ e_block++; } else { + if (e_off > e_block->scan_hint_start) + e_block->scan_hint = 0; + + e_block->left_free = 0; if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, e_index); } else { - e_block->left_free = 0; e_block->right_free = min_t(int, e_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); @@ -752,21 +879,36 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, } /* update in-between md_blocks */ + nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { + block->scan_hint = 0; block->contig_hint = 0; block->left_free = 0; block->right_free = 0; } } + if (nr_empty_pages) + pcpu_update_empty_pages(chunk, -nr_empty_pages); + + if (pcpu_region_overlap(chunk_md->scan_hint_start, + chunk_md->scan_hint_start + + chunk_md->scan_hint, + bit_off, + bit_off + bits)) + chunk_md->scan_hint = 0; + /* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct. */ - if (bit_off >= chunk->contig_bits_start && - bit_off < chunk->contig_bits_start + chunk->contig_bits) - pcpu_chunk_refresh_hint(chunk); + if (pcpu_region_overlap(chunk_md->contig_hint_start, + chunk_md->contig_hint_start + + chunk_md->contig_hint, + bit_off, + bit_off + bits)) + pcpu_chunk_refresh_hint(chunk, false); } /** @@ -782,13 +924,15 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, * * A chunk update is triggered if a page becomes free, a block becomes free, * or the free spans across blocks. This tradeoff is to minimize iterating - * over the block metadata to update chunk->contig_bits. chunk->contig_bits - * may be off by up to a page, but it will never be more than the available - * space. If the contig hint is contained in one block, it will be accurate. + * over the block metadata to update chunk_md->contig_hint. + * chunk_md->contig_hint may be off by up to a page, but it will never be more + * than the available space. If the contig hint is contained in one block, it + * will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) { + int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ @@ -842,16 +986,22 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, /* update s_block */ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; + if (!start && e_off == PCPU_BITMAP_BLOCK_BITS) + nr_empty_pages++; pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ + if (end == PCPU_BITMAP_BLOCK_BITS) + nr_empty_pages++; pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ + nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->first_free = 0; + block->scan_hint = 0; block->contig_hint_start = 0; block->contig_hint = PCPU_BITMAP_BLOCK_BITS; block->left_free = PCPU_BITMAP_BLOCK_BITS; @@ -859,19 +1009,21 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, } } + if (nr_empty_pages) + pcpu_update_empty_pages(chunk, nr_empty_pages); + /* - * Refresh chunk metadata when the free makes a page free, a block - * free, or spans across blocks. The contig hint may be off by up to - * a page, but if the hint is contained in a block, it will be accurate - * with the else condition below. + * Refresh chunk metadata when the free makes a block free or spans + * across blocks. The contig_hint may be off by up to a page, but if + * the contig_hint is contained in a block, it will be accurate with + * the else condition below. */ - if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) > - ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) || - s_index != e_index) - pcpu_chunk_refresh_hint(chunk); + if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) + pcpu_chunk_refresh_hint(chunk, true); else - pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start), - s_block->contig_hint); + pcpu_block_update(&chunk->chunk_md, + pcpu_block_off_to_off(s_index, start), + end); } /** @@ -926,6 +1078,7 @@ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, size_t align, bool pop_only) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, next_off; /* @@ -934,12 +1087,12 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, * cannot fit in the global hint, there is memory pressure and creating * a new chunk would happen soon. */ - bit_off = ALIGN(chunk->contig_bits_start, align) - - chunk->contig_bits_start; - if (bit_off + alloc_bits > chunk->contig_bits) + bit_off = ALIGN(chunk_md->contig_hint_start, align) - + chunk_md->contig_hint_start; + if (bit_off + alloc_bits > chunk_md->contig_hint) return -1; - bit_off = chunk->first_bit; + bit_off = pcpu_next_hint(chunk_md, alloc_bits); bits = 0; pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, @@ -956,6 +1109,62 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, return bit_off; } +/* + * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() + * @map: the address to base the search on + * @size: the bitmap size in bits + * @start: the bitnumber to start searching at + * @nr: the number of zeroed bits we're looking for + * @align_mask: alignment mask for zero area + * @largest_off: offset of the largest area skipped + * @largest_bits: size of the largest area skipped + * + * The @align_mask should be one less than a power of 2. + * + * This is a modified version of bitmap_find_next_zero_area_off() to remember + * the largest area that was skipped. This is imperfect, but in general is + * good enough. The largest remembered region is the largest failed region + * seen. This does not include anything we possibly skipped due to alignment. + * pcpu_block_update_scan() does scan backwards to try and recover what was + * lost to alignment. While this can cause scanning to miss earlier possible + * free areas, smaller allocations will eventually fill those holes. + */ +static unsigned long pcpu_find_zero_area(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned long nr, + unsigned long align_mask, + unsigned long *largest_off, + unsigned long *largest_bits) +{ + unsigned long index, end, i, area_off, area_bits; +again: + index = find_next_zero_bit(map, size, start); + + /* Align allocation */ + index = __ALIGN_MASK(index, align_mask); + area_off = index; + + end = index + nr; + if (end > size) + return end; + i = find_next_bit(map, end, index); + if (i < end) { + area_bits = i - area_off; + /* remember largest unused area with best alignment */ + if (area_bits > *largest_bits || + (area_bits == *largest_bits && *largest_off && + (!area_off || __ffs(area_off) > __ffs(*largest_off)))) { + *largest_off = area_off; + *largest_bits = area_bits; + } + + start = i + 1; + goto again; + } + return index; +} + /** * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest @@ -978,7 +1187,9 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, size_t align, int start) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; size_t align_mask = (align) ? (align - 1) : 0; + unsigned long area_off = 0, area_bits = 0; int bit_off, end, oslot; lockdep_assert_held(&pcpu_lock); @@ -988,12 +1199,16 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, /* * Search to find a fit. */ - end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS; - bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start, - alloc_bits, align_mask); + end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, + pcpu_chunk_map_bits(chunk)); + bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits, + align_mask, &area_off, &area_bits); if (bit_off >= end) return -1; + if (area_bits) + pcpu_block_update_scan(chunk, area_off, area_bits); + /* update alloc map */ bitmap_set(chunk->alloc_map, bit_off, alloc_bits); @@ -1005,8 +1220,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ - if (bit_off == chunk->first_bit) - chunk->first_bit = find_next_zero_bit( + if (bit_off == chunk_md->first_free) + chunk_md->first_free = find_next_zero_bit( chunk->alloc_map, pcpu_chunk_map_bits(chunk), bit_off + alloc_bits); @@ -1028,6 +1243,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, */ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) { + struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, end, oslot; lockdep_assert_held(&pcpu_lock); @@ -1047,24 +1263,34 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off) chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ - chunk->first_bit = min(chunk->first_bit, bit_off); + chunk_md->first_free = min(chunk_md->first_free, bit_off); pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); } +static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) +{ + block->scan_hint = 0; + block->contig_hint = nr_bits; + block->left_free = nr_bits; + block->right_free = nr_bits; + block->first_free = 0; + block->nr_bits = nr_bits; +} + static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) { struct pcpu_block_md *md_block; + /* init the chunk's block */ + pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); + for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); - md_block++) { - md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS; - md_block->left_free = PCPU_BITMAP_BLOCK_BITS; - md_block->right_free = PCPU_BITMAP_BLOCK_BITS; - } + md_block++) + pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); } /** @@ -1143,11 +1369,8 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; - chunk->nr_empty_pop_pages = - pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE, - map_size / PCPU_MIN_ALLOC_SIZE); + chunk->nr_empty_pop_pages = chunk->nr_pages; - chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE; chunk->free_bytes = map_size; if (chunk->start_offset) { @@ -1157,7 +1380,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); - chunk->first_bit = offset_bits; + chunk->chunk_md.first_free = offset_bits; pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } @@ -1210,7 +1433,6 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) pcpu_init_md_blocks(chunk); /* init metadata */ - chunk->contig_bits = region_bits; chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; @@ -1240,7 +1462,6 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page - * @for_alloc: if this is to populate for allocation * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each @@ -1250,7 +1471,7 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) * is to serve an allocation in that area. */ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, - int page_end, bool for_alloc) + int page_end) { int nr = page_end - page_start; @@ -1260,10 +1481,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, chunk->nr_populated += nr; pcpu_nr_populated += nr; - if (!for_alloc) { - chunk->nr_empty_pop_pages += nr; - pcpu_nr_empty_pop_pages += nr; - } + pcpu_update_empty_pages(chunk, nr); } /** @@ -1285,9 +1503,9 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; - chunk->nr_empty_pop_pages -= nr; - pcpu_nr_empty_pop_pages -= nr; pcpu_nr_populated -= nr; + + pcpu_update_empty_pages(chunk, -nr); } /* @@ -1374,7 +1592,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; bool do_warn = !(gfp & __GFP_NOWARN); static int warn_limit = 10; - struct pcpu_chunk *chunk; + struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; unsigned long flags; @@ -1436,11 +1654,14 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { - list_for_each_entry(chunk, &pcpu_slot[slot], list) { + list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) { off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); - if (off < 0) + if (off < 0) { + if (slot < PCPU_SLOT_FAIL_THRESHOLD) + pcpu_chunk_move(chunk, 0); continue; + } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) @@ -1499,7 +1720,7 @@ area_found: err = "failed to populate"; goto fail_unlock; } - pcpu_chunk_populated(chunk, rs, re, true); + pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } @@ -1698,7 +1919,7 @@ retry_pop: if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); - pcpu_chunk_populated(chunk, rs, rs + nr, false); + pcpu_chunk_populated(chunk, rs, rs + nr); spin_unlock_irq(&pcpu_lock); } else { nr_to_pop = 0; @@ -1738,6 +1959,7 @@ void free_percpu(void __percpu *ptr) struct pcpu_chunk *chunk; unsigned long flags; int off; + bool need_balance = false; if (!ptr) return; @@ -1759,7 +1981,7 @@ void free_percpu(void __percpu *ptr) list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { - pcpu_schedule_balance_work(); + need_balance = true; break; } } @@ -1767,6 +1989,9 @@ void free_percpu(void __percpu *ptr) trace_percpu_free_percpu(chunk->base_addr, off, ptr); spin_unlock_irqrestore(&pcpu_lock, flags); + + if (need_balance) + pcpu_schedule_balance_work(); } EXPORT_SYMBOL_GPL(free_percpu); @@ -2567,8 +2792,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, ai->groups[group].base_offset = areas[group] - base; } - pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); @@ -2692,8 +2917,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, ai->static_size, + pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", + unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); rc = pcpu_setup_first_chunk(ai, vm.addr); diff --git a/mm/rmap.c b/mm/rmap.c index b30c7c71d1d9..e5dfe2ae6b0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -850,7 +850,7 @@ int page_referenced(struct page *page, }; *vm_flags = 0; - if (!page_mapped(page)) + if (!pra.mapcount) return 0; if (!page_rmapping(page)) @@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, vma->vm_mm, address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); mmu_notifier_invalidate_range_start(&range); @@ -928,7 +929,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, continue; flush_cache_page(vma, address, page_to_pfn(page)); - entry = pmdp_huge_clear_flush(vma, address, pmd); + entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); @@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { diff --git a/mm/shmem.c b/mm/shmem.c index b3db3779a30a..1bb3b8dc8bb2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page, if (xas_error(&xas)) goto unlock; next: - xas_store(&xas, page + i); + xas_store(&xas, page); if (++i < nr) { xas_next(&xas); goto next; @@ -1081,9 +1081,14 @@ static void shmem_evict_inode(struct inode *inode) } spin_unlock(&sbinfo->shrinklist_lock); } - if (!list_empty(&info->swaplist)) { + while (!list_empty(&info->swaplist)) { + /* Wait while shmem_unuse() is scanning this inode... */ + wait_var_event(&info->stop_eviction, + !atomic_read(&info->stop_eviction)); mutex_lock(&shmem_swaplist_mutex); - list_del_init(&info->swaplist); + /* ...but beware of the race if we peeked too early */ + if (!atomic_read(&info->stop_eviction)) + list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } } @@ -1099,10 +1104,11 @@ extern struct swap_info_struct *swap_info[]; static int shmem_find_swap_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices, - bool frontswap) + unsigned int type, bool frontswap) { XA_STATE(xas, &mapping->i_pages, start); struct page *page; + swp_entry_t entry; unsigned int ret = 0; if (!nr_entries) @@ -1116,13 +1122,12 @@ static int shmem_find_swap_entries(struct address_space *mapping, if (!xa_is_value(page)) continue; - if (frontswap) { - swp_entry_t entry = radix_to_swp_entry(page); - - if (!frontswap_test(swap_info[swp_type(entry)], - swp_offset(entry))) - continue; - } + entry = radix_to_swp_entry(page); + if (swp_type(entry) != type) + continue; + if (frontswap && + !frontswap_test(swap_info[type], swp_offset(entry))) + continue; indices[ret] = xas.xa_index; entries[ret] = page; @@ -1194,7 +1199,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, pvec.pages, indices, - frontswap); + type, frontswap); if (pvec.nr == 0) { ret = 0; break; @@ -1227,36 +1232,27 @@ int shmem_unuse(unsigned int type, bool frontswap, unsigned long *fs_pages_to_unuse) { struct shmem_inode_info *info, *next; - struct inode *inode; - struct inode *prev_inode = NULL; int error = 0; if (list_empty(&shmem_swaplist)) return 0; mutex_lock(&shmem_swaplist_mutex); - - /* - * The extra refcount on the inode is necessary to safely dereference - * p->next after re-acquiring the lock. New shmem inodes with swap - * get added to the end of the list and we will scan them all. - */ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) { if (!info->swapped) { list_del_init(&info->swaplist); continue; } - - inode = igrab(&info->vfs_inode); - if (!inode) - continue; - + /* + * Drop the swaplist mutex while searching the inode for swap; + * but before doing so, make sure shmem_evict_inode() will not + * remove placeholder inode from swaplist, nor let it be freed + * (igrab() would protect from unlink, but not from unmount). + */ + atomic_inc(&info->stop_eviction); mutex_unlock(&shmem_swaplist_mutex); - if (prev_inode) - iput(prev_inode); - prev_inode = inode; - error = shmem_unuse_inode(inode, type, frontswap, + error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, fs_pages_to_unuse); cond_resched(); @@ -1264,14 +1260,13 @@ int shmem_unuse(unsigned int type, bool frontswap, next = list_next_entry(info, swaplist); if (!info->swapped) list_del_init(&info->swaplist); + if (atomic_dec_and_test(&info->stop_eviction)) + wake_up_var(&info->stop_eviction); if (error) break; } mutex_unlock(&shmem_swaplist_mutex); - if (prev_inode) - iput(prev_inode); - return error; } @@ -2238,6 +2233,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); + atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->shrinklist); @@ -3635,9 +3631,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) return &info->vfs_inode; } -static void shmem_destroy_callback(struct rcu_head *head) +static void shmem_free_in_core_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); @@ -3647,7 +3642,6 @@ static void shmem_destroy_inode(struct inode *inode) { if (S_ISREG(inode->i_mode)) mpol_free_shared_policy(&SHMEM_I(inode)->policy); - call_rcu(&inode->i_rcu, shmem_destroy_callback); } static void shmem_init_inode(void *foo) @@ -3738,6 +3732,7 @@ static const struct inode_operations shmem_special_inode_operations = { static const struct super_operations shmem_ops = { .alloc_inode = shmem_alloc_inode, + .free_inode = shmem_free_in_core_inode, .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, diff --git a/mm/shuffle.c b/mm/shuffle.c new file mode 100644 index 000000000000..3ce12481b1dc --- /dev/null +++ b/mm/shuffle.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. All rights reserved. + +#include <linux/mm.h> +#include <linux/init.h> +#include <linux/mmzone.h> +#include <linux/random.h> +#include <linux/moduleparam.h> +#include "internal.h" +#include "shuffle.h" + +DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +static unsigned long shuffle_state __ro_after_init; + +/* + * Depending on the architecture, module parameter parsing may run + * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents, + * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE + * attempts to turn on the implementation, but aborts if it finds + * SHUFFLE_FORCE_DISABLE already set. + */ +__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl) +{ + if (ctl == SHUFFLE_FORCE_DISABLE) + set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state); + + if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) { + if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state)) + static_branch_disable(&page_alloc_shuffle_key); + } else if (ctl == SHUFFLE_ENABLE + && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state)) + static_branch_enable(&page_alloc_shuffle_key); +} + +static bool shuffle_param; +extern int shuffle_show(char *buffer, const struct kernel_param *kp) +{ + return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state) + ? 'Y' : 'N'); +} + +static __meminit int shuffle_store(const char *val, + const struct kernel_param *kp) +{ + int rc = param_set_bool(val, kp); + + if (rc < 0) + return rc; + if (shuffle_param) + page_alloc_shuffle(SHUFFLE_ENABLE); + else + page_alloc_shuffle(SHUFFLE_FORCE_DISABLE); + return 0; +} +module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); + +/* + * For two pages to be swapped in the shuffle, they must be free (on a + * 'free_area' lru), have the same order, and have the same migratetype. + */ +static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order) +{ + struct page *page; + + /* + * Given we're dealing with randomly selected pfns in a zone we + * need to ask questions like... + */ + + /* ...is the pfn even in the memmap? */ + if (!pfn_valid_within(pfn)) + return NULL; + + /* ...is the pfn in a present section or a hole? */ + if (!pfn_present(pfn)) + return NULL; + + /* ...is the page free and currently on a free_area list? */ + page = pfn_to_page(pfn); + if (!PageBuddy(page)) + return NULL; + + /* + * ...is the page on the same list as the page we will + * shuffle it with? + */ + if (page_order(page) != order) + return NULL; + + return page; +} + +/* + * Fisher-Yates shuffle the freelist which prescribes iterating through an + * array, pfns in this case, and randomly swapping each entry with another in + * the span, end_pfn - start_pfn. + * + * To keep the implementation simple it does not attempt to correct for sources + * of bias in the distribution, like modulo bias or pseudo-random number + * generator bias. I.e. the expectation is that this shuffling raises the bar + * for attacks that exploit the predictability of page allocations, but need not + * be a perfect shuffle. + */ +#define SHUFFLE_RETRY 10 +void __meminit __shuffle_zone(struct zone *z) +{ + unsigned long i, flags; + unsigned long start_pfn = z->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(z); + const int order = SHUFFLE_ORDER; + const int order_pages = 1 << order; + + spin_lock_irqsave(&z->lock, flags); + start_pfn = ALIGN(start_pfn, order_pages); + for (i = start_pfn; i < end_pfn; i += order_pages) { + unsigned long j; + int migratetype, retry; + struct page *page_i, *page_j; + + /* + * We expect page_i, in the sub-range of a zone being added + * (@start_pfn to @end_pfn), to more likely be valid compared to + * page_j randomly selected in the span @zone_start_pfn to + * @spanned_pages. + */ + page_i = shuffle_valid_page(i, order); + if (!page_i) + continue; + + for (retry = 0; retry < SHUFFLE_RETRY; retry++) { + /* + * Pick a random order aligned page in the zone span as + * a swap target. If the selected pfn is a hole, retry + * up to SHUFFLE_RETRY attempts find a random valid pfn + * in the zone. + */ + j = z->zone_start_pfn + + ALIGN_DOWN(get_random_long() % z->spanned_pages, + order_pages); + page_j = shuffle_valid_page(j, order); + if (page_j && page_j != page_i) + break; + } + if (retry >= SHUFFLE_RETRY) { + pr_debug("%s: failed to swap %#lx\n", __func__, i); + continue; + } + + /* + * Each migratetype corresponds to its own list, make sure the + * types match otherwise we're moving pages to lists where they + * do not belong. + */ + migratetype = get_pageblock_migratetype(page_i); + if (get_pageblock_migratetype(page_j) != migratetype) { + pr_debug("%s: migratetype mismatch %#lx\n", __func__, i); + continue; + } + + list_swap(&page_i->lru, &page_j->lru); + + pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j); + + /* take it easy on the zone lock */ + if ((i % (100 * order_pages)) == 0) { + spin_unlock_irqrestore(&z->lock, flags); + cond_resched(); + spin_lock_irqsave(&z->lock, flags); + } + } + spin_unlock_irqrestore(&z->lock, flags); +} + +/** + * shuffle_free_memory - reduce the predictability of the page allocator + * @pgdat: node page data + */ +void __meminit __shuffle_free_memory(pg_data_t *pgdat) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + shuffle_zone(z); +} + +void add_to_free_area_random(struct page *page, struct free_area *area, + int migratetype) +{ + static u64 rand; + static u8 rand_bits; + + /* + * The lack of locking is deliberate. If 2 threads race to + * update the rand state it just adds to the entropy. + */ + if (rand_bits == 0) { + rand_bits = 64; + rand = get_random_u64(); + } + + if (rand & 1) + add_to_free_area(page, area, migratetype); + else + add_to_free_area_tail(page, area, migratetype); + rand_bits--; + rand >>= 1; +} diff --git a/mm/shuffle.h b/mm/shuffle.h new file mode 100644 index 000000000000..777a257a0d2f --- /dev/null +++ b/mm/shuffle.h @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright(c) 2018 Intel Corporation. All rights reserved. +#ifndef _MM_SHUFFLE_H +#define _MM_SHUFFLE_H +#include <linux/jump_label.h> + +/* + * SHUFFLE_ENABLE is called from the command line enabling path, or by + * platform-firmware enabling that indicates the presence of a + * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from + * the command line path and overrides any previous or future + * SHUFFLE_ENABLE. + */ +enum mm_shuffle_ctl { + SHUFFLE_ENABLE, + SHUFFLE_FORCE_DISABLE, +}; + +#define SHUFFLE_ORDER (MAX_ORDER-1) + +#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR +DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); +extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl); +extern void __shuffle_free_memory(pg_data_t *pgdat); +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_free_memory(pgdat); +} + +extern void __shuffle_zone(struct zone *z); +static inline void shuffle_zone(struct zone *z) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return; + __shuffle_zone(z); +} + +static inline bool is_shuffle_order(int order) +{ + if (!static_branch_unlikely(&page_alloc_shuffle_key)) + return false; + return order >= SHUFFLE_ORDER; +} +#else +static inline void shuffle_free_memory(pg_data_t *pgdat) +{ +} + +static inline void shuffle_zone(struct zone *z) +{ +} + +static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl) +{ +} + +static inline bool is_shuffle_order(int order) +{ + return false; +} +#endif +#endif /* _MM_SHUFFLE_H */ diff --git a/mm/slab.c b/mm/slab.c index 28652e4218e0..f7117ad9b3a3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -362,29 +362,6 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#ifdef CONFIG_DEBUG_SLAB_LEAK - -static inline bool is_store_user_clean(struct kmem_cache *cachep) -{ - return atomic_read(&cachep->store_user_clean) == 1; -} - -static inline void set_store_user_clean(struct kmem_cache *cachep) -{ - atomic_set(&cachep->store_user_clean, 1); -} - -static inline void set_store_user_dirty(struct kmem_cache *cachep) -{ - if (is_store_user_clean(cachep)) - atomic_set(&cachep->store_user_clean, 0); -} - -#else -static inline void set_store_user_dirty(struct kmem_cache *cachep) {} - -#endif - /* * Do not go above this order unless 0 objects fit into the slab or * overridden on the command line. @@ -990,10 +967,8 @@ static void cpuup_canceled(long cpu) /* cpu is dead; no one can alloc from it. */ nc = per_cpu_ptr(cachep->cpu_cache, cpu); - if (nc) { - free_block(cachep, nc->entry, nc->avail, node, &list); - nc->avail = 0; - } + free_block(cachep, nc->entry, nc->avail, node, &list); + nc->avail = 0; if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); @@ -1467,53 +1442,17 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep) } #ifdef CONFIG_DEBUG_PAGEALLOC -static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, - unsigned long caller) -{ - int size = cachep->object_size; - - addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; - - if (size < 5 * sizeof(unsigned long)) - return; - - *addr++ = 0x12345678; - *addr++ = caller; - *addr++ = smp_processor_id(); - size -= 3 * sizeof(unsigned long); - { - unsigned long *sptr = &caller; - unsigned long svalue; - - while (!kstack_end(sptr)) { - svalue = *sptr++; - if (kernel_text_address(svalue)) { - *addr++ = svalue; - size -= sizeof(unsigned long); - if (size <= sizeof(unsigned long)) - break; - } - } - - } - *addr++ = 0x87654321; -} - -static void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map, unsigned long caller) +static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map) { if (!is_debug_pagealloc_cache(cachep)) return; - if (caller) - store_stackinfo(cachep, objp, caller); - kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); } #else static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, - int map, unsigned long caller) {} + int map) {} #endif @@ -1661,7 +1600,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1, 0); + slab_kernel_map(cachep, objp, 1); } if (cachep->flags & SLAB_RED_ZONE) { if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) @@ -1710,8 +1649,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) { struct page *page, *n; - list_for_each_entry_safe(page, n, list, lru) { - list_del(&page->lru); + list_for_each_entry_safe(page, n, list, slab_list) { + list_del(&page->slab_list); slab_destroy(cachep, page); } } @@ -2115,6 +2054,8 @@ done: cachep->allocflags = __GFP_COMP; if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; + if (flags & SLAB_CACHE_DMA32) + cachep->allocflags |= GFP_DMA32; if (flags & SLAB_RECLAIM_ACCOUNT) cachep->allocflags |= __GFP_RECLAIMABLE; cachep->size = size; @@ -2265,8 +2206,8 @@ static int drain_freelist(struct kmem_cache *cache, goto out; } - page = list_entry(p, struct page, lru); - list_del(&page->lru); + page = list_entry(p, struct page, slab_list); + list_del(&page->slab_list); n->free_slabs--; n->total_slabs--; /* @@ -2372,7 +2313,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep, /* Slab management obj is off-slab. */ freelist = kmem_cache_alloc_node(cachep->freelist_cache, local_flags, nodeid); - freelist = kasan_reset_tag(freelist); if (!freelist) return NULL; } else { @@ -2432,7 +2372,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) /* need to poison the objs? */ if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0, 0); + slab_kernel_map(cachep, objp, 0); } } #endif @@ -2589,11 +2529,6 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) objp = index_to_obj(cachep, page, get_free_obj(page, page->active)); page->active++; -#if DEBUG - if (cachep->flags & SLAB_STORE_USER) - set_store_user_dirty(cachep); -#endif - return objp; } @@ -2726,13 +2661,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) if (!page) return; - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&page->slab_list); n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); n->total_slabs++; if (!page->active) { - list_add_tail(&page->lru, &(n->slabs_free)); + list_add_tail(&page->slab_list, &n->slabs_free); n->free_slabs++; } else fixup_slab_list(cachep, n, page, &list); @@ -2799,10 +2734,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, *dbg_redzone1(cachep, objp) = RED_INACTIVE; *dbg_redzone2(cachep, objp) = RED_INACTIVE; } - if (cachep->flags & SLAB_STORE_USER) { - set_store_user_dirty(cachep); + if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = (void *)caller; - } objnr = obj_to_index(cachep, page, objp); @@ -2811,7 +2744,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_POISON) { poison_obj(cachep, objp, POISON_FREE); - slab_kernel_map(cachep, objp, 0, caller); + slab_kernel_map(cachep, objp, 0); } return objp; } @@ -2841,9 +2774,9 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, void **list) { /* move slabp to correct slabp list: */ - list_del(&page->lru); + list_del(&page->slab_list); if (page->active == cachep->num) { - list_add(&page->lru, &n->slabs_full); + list_add(&page->slab_list, &n->slabs_full); if (OBJFREELIST_SLAB(cachep)) { #if DEBUG /* Poisoning will be done without holding the lock */ @@ -2857,7 +2790,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, page->freelist = NULL; } } else - list_add(&page->lru, &n->slabs_partial); + list_add(&page->slab_list, &n->slabs_partial); } /* Try to find non-pfmemalloc slab if needed */ @@ -2880,20 +2813,20 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, } /* Move pfmemalloc slab to the end of list to speed up next search */ - list_del(&page->lru); + list_del(&page->slab_list); if (!page->active) { - list_add_tail(&page->lru, &n->slabs_free); + list_add_tail(&page->slab_list, &n->slabs_free); n->free_slabs++; } else - list_add_tail(&page->lru, &n->slabs_partial); + list_add_tail(&page->slab_list, &n->slabs_partial); - list_for_each_entry(page, &n->slabs_partial, lru) { + list_for_each_entry(page, &n->slabs_partial, slab_list) { if (!PageSlabPfmemalloc(page)) return page; } n->free_touched = 1; - list_for_each_entry(page, &n->slabs_free, lru) { + list_for_each_entry(page, &n->slabs_free, slab_list) { if (!PageSlabPfmemalloc(page)) { n->free_slabs--; return page; @@ -2908,11 +2841,12 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) struct page *page; assert_spin_locked(&n->list_lock); - page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); + page = list_first_entry_or_null(&n->slabs_partial, struct page, + slab_list); if (!page) { n->free_touched = 1; page = list_first_entry_or_null(&n->slabs_free, struct page, - lru); + slab_list); if (page) n->free_slabs--; } @@ -3075,7 +3009,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, return objp; if (cachep->flags & SLAB_POISON) { check_poison_obj(cachep, objp); - slab_kernel_map(cachep, objp, 1, 0); + slab_kernel_map(cachep, objp, 1); poison_obj(cachep, objp, POISON_INUSE); } if (cachep->flags & SLAB_STORE_USER) @@ -3413,29 +3347,29 @@ static void free_block(struct kmem_cache *cachep, void **objpp, objp = objpp[i]; page = virt_to_head_page(objp); - list_del(&page->lru); + list_del(&page->slab_list); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ if (page->active == 0) { - list_add(&page->lru, &n->slabs_free); + list_add(&page->slab_list, &n->slabs_free); n->free_slabs++; } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. */ - list_add_tail(&page->lru, &n->slabs_partial); + list_add_tail(&page->slab_list, &n->slabs_partial); } } while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { n->free_objects -= cachep->num; - page = list_last_entry(&n->slabs_free, struct page, lru); - list_move(&page->lru, list); + page = list_last_entry(&n->slabs_free, struct page, slab_list); + list_move(&page->slab_list, list); n->free_slabs--; n->total_slabs--; } @@ -3473,7 +3407,7 @@ free_done: int i = 0; struct page *page; - list_for_each_entry(page, &n->slabs_free, lru) { + list_for_each_entry(page, &n->slabs_free, slab_list) { BUG_ON(page->active); i++; @@ -4220,195 +4154,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, return res; } -#ifdef CONFIG_DEBUG_SLAB_LEAK - -static inline int add_caller(unsigned long *n, unsigned long v) -{ - unsigned long *p; - int l; - if (!v) - return 1; - l = n[1]; - p = n + 2; - while (l) { - int i = l/2; - unsigned long *q = p + 2 * i; - if (*q == v) { - q[1]++; - return 1; - } - if (*q > v) { - l = i; - } else { - p = q + 2; - l -= i + 1; - } - } - if (++n[1] == n[0]) - return 0; - memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n)); - p[0] = v; - p[1] = 1; - return 1; -} - -static void handle_slab(unsigned long *n, struct kmem_cache *c, - struct page *page) -{ - void *p; - int i, j; - unsigned long v; - - if (n[0] == n[1]) - return; - for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - bool active = true; - - for (j = page->active; j < c->num; j++) { - if (get_free_obj(page, j) == i) { - active = false; - break; - } - } - - if (!active) - continue; - - /* - * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table - * mapping is established when actual object allocation and - * we could mistakenly access the unmapped object in the cpu - * cache. - */ - if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v))) - continue; - - if (!add_caller(n, v)) - return; - } -} - -static void show_symbol(struct seq_file *m, unsigned long address) -{ -#ifdef CONFIG_KALLSYMS - unsigned long offset, size; - char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN]; - - if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { - seq_printf(m, "%s+%#lx/%#lx", name, offset, size); - if (modname[0]) - seq_printf(m, " [%s]", modname); - return; - } -#endif - seq_printf(m, "%px", (void *)address); -} - -static int leaks_show(struct seq_file *m, void *p) -{ - struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); - struct page *page; - struct kmem_cache_node *n; - const char *name; - unsigned long *x = m->private; - int node; - int i; - - if (!(cachep->flags & SLAB_STORE_USER)) - return 0; - if (!(cachep->flags & SLAB_RED_ZONE)) - return 0; - - /* - * Set store_user_clean and start to grab stored user information - * for all objects on this cache. If some alloc/free requests comes - * during the processing, information would be wrong so restart - * whole processing. - */ - do { - set_store_user_clean(cachep); - drain_cpu_caches(cachep); - - x[1] = 0; - - for_each_kmem_cache_node(cachep, node, n) { - - check_irq_on(); - spin_lock_irq(&n->list_lock); - - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) - handle_slab(x, cachep, page); - spin_unlock_irq(&n->list_lock); - } - } while (!is_store_user_clean(cachep)); - - name = cachep->name; - if (x[0] == x[1]) { - /* Increase the buffer size */ - mutex_unlock(&slab_mutex); - m->private = kcalloc(x[0] * 4, sizeof(unsigned long), - GFP_KERNEL); - if (!m->private) { - /* Too bad, we are really out */ - m->private = x; - mutex_lock(&slab_mutex); - return -ENOMEM; - } - *(unsigned long *)m->private = x[0] * 2; - kfree(x); - mutex_lock(&slab_mutex); - /* Now make sure this entry will be retried */ - m->count = m->size; - return 0; - } - for (i = 0; i < x[1]; i++) { - seq_printf(m, "%s: %lu ", name, x[2*i+3]); - show_symbol(m, x[2*i+2]); - seq_putc(m, '\n'); - } - - return 0; -} - -static const struct seq_operations slabstats_op = { - .start = slab_start, - .next = slab_next, - .stop = slab_stop, - .show = leaks_show, -}; - -static int slabstats_open(struct inode *inode, struct file *file) -{ - unsigned long *n; - - n = __seq_open_private(file, &slabstats_op, PAGE_SIZE); - if (!n) - return -ENOMEM; - - *n = PAGE_SIZE / (2 * sizeof(unsigned long)); - - return 0; -} - -static const struct file_operations proc_slabstats_operations = { - .open = slabstats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; -#endif - -static int __init slab_proc_init(void) -{ -#ifdef CONFIG_DEBUG_SLAB_LEAK - proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); -#endif - return 0; -} -module_init(slab_proc_init); - #ifdef CONFIG_HARDENED_USERCOPY /* * Rejects incorrectly sized objects and objects that are to be copied diff --git a/mm/slab.h b/mm/slab.h index e5e6658eeacc..43ac818b8592 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -127,7 +127,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, /* Legal flag mask for kmem_cache_create(), for various configurations */ -#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ +#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBUG_SLAB) diff --git a/mm/slab_common.c b/mm/slab_common.c index 03eeb8b7b4b1..58251ba63e4a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -53,7 +53,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ - SLAB_ACCOUNT) + SLAB_CACHE_DMA32 | SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. diff --git a/mm/slob.c b/mm/slob.c index 307c2c9feb44..84aefd9b91ee 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -112,13 +112,13 @@ static inline int slob_page_free(struct page *sp) static void set_slob_page_free(struct page *sp, struct list_head *list) { - list_add(&sp->lru, list); + list_add(&sp->slab_list, list); __SetPageSlobFree(sp); } static inline void clear_slob_page_free(struct page *sp) { - list_del(&sp->lru); + list_del(&sp->slab_list); __ClearPageSlobFree(sp); } @@ -213,13 +213,26 @@ static void slob_free_pages(void *b, int order) } /* - * Allocate a slob block within a given slob_page sp. + * slob_page_alloc() - Allocate a slob block within a given slob_page sp. + * @sp: Page to look in. + * @size: Size of the allocation. + * @align: Allocation alignment. + * @page_removed_from_list: Return parameter. + * + * Tries to find a chunk of memory at least @size bytes big within @page. + * + * Return: Pointer to memory if allocated, %NULL otherwise. If the + * allocation fills up @page then the page is removed from the + * freelist, in this case @page_removed_from_list will be set to + * true (set to false otherwise). */ -static void *slob_page_alloc(struct page *sp, size_t size, int align) +static void *slob_page_alloc(struct page *sp, size_t size, int align, + bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); + *page_removed_from_list = false; for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); @@ -254,8 +267,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align) } sp->units -= units; - if (!sp->units) + if (!sp->units) { clear_slob_page_free(sp); + *page_removed_from_list = true; + } return cur; } if (slob_last(cur)) @@ -269,10 +284,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align) static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) { struct page *sp; - struct list_head *prev; struct list_head *slob_list; slob_t *b = NULL; unsigned long flags; + bool _unused; if (size < SLOB_BREAK1) slob_list = &free_slob_small; @@ -283,7 +298,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, lru) { + list_for_each_entry(sp, slob_list, slab_list) { + bool page_removed_from_list = false; #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -296,18 +312,25 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (sp->units < SLOB_UNITS(size)) continue; - /* Attempt to alloc */ - prev = sp->lru.prev; - b = slob_page_alloc(sp, size, align); + b = slob_page_alloc(sp, size, align, &page_removed_from_list); if (!b) continue; - /* Improve fragment distribution and reduce our average - * search time by starting our next search here. (see - * Knuth vol 1, sec 2.5, pg 449) */ - if (prev != slob_list->prev && - slob_list->next != prev->next) - list_move_tail(slob_list, prev->next); + /* + * If slob_page_alloc() removed sp from the list then we + * cannot call list functions on sp. If so allocation + * did not fragment the page anyway so optimisation is + * unnecessary. + */ + if (!page_removed_from_list) { + /* + * Improve fragment distribution and reduce our average + * search time by starting our next search here. (see + * Knuth vol 1, sec 2.5, pg 449) + */ + if (!list_is_first(&sp->slab_list, slob_list)) + list_rotate_to_front(&sp->slab_list, slob_list); + } break; } spin_unlock_irqrestore(&slob_lock, flags); @@ -323,10 +346,10 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); sp->freelist = b; - INIT_LIST_HEAD(&sp->lru); + INIT_LIST_HEAD(&sp->slab_list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align); + b = slob_page_alloc(sp, size, align, &_unused); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } diff --git a/mm/slub.c b/mm/slub.c index 1b08fbcb7e61..cd04dbd2b5d0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -58,10 +58,11 @@ * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not - * on any list. The processor that froze the slab is the one who can - * perform list operations on the page. Other processors may put objects - * onto the freelist but the processor that froze the slab is the only - * one that can retrieve the objects from the page's freelist. + * on any list except per cpu partial list. The processor that froze the + * slab is the one who can perform list operations on the page. Other + * processors may put objects onto the freelist but the processor that + * froze the slab is the only one that can retrieve the objects from the + * page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -552,31 +553,22 @@ static void set_track(struct kmem_cache *s, void *object, if (addr) { #ifdef CONFIG_STACKTRACE - struct stack_trace trace; - int i; + unsigned int nr_entries; - trace.nr_entries = 0; - trace.max_entries = TRACK_ADDRS_COUNT; - trace.entries = p->addrs; - trace.skip = 3; metadata_access_enable(); - save_stack_trace(&trace); + nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3); metadata_access_disable(); - /* See rant in lockdep.c */ - if (trace.nr_entries != 0 && - trace.entries[trace.nr_entries - 1] == ULONG_MAX) - trace.nr_entries--; - - for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) - p->addrs[i] = 0; + if (nr_entries < TRACK_ADDRS_COUNT) + p->addrs[nr_entries] = 0; #endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; p->when = jiffies; - } else + } else { memset(p, 0, sizeof(struct track)); + } } static void init_tracking(struct kmem_cache *s, void *object) @@ -1023,7 +1015,7 @@ static void add_full(struct kmem_cache *s, return; lockdep_assert_held(&n->list_lock); - list_add(&page->lru, &n->full); + list_add(&page->slab_list, &n->full); } static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) @@ -1032,7 +1024,7 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct return; lockdep_assert_held(&n->list_lock); - list_del(&page->lru); + list_del(&page->slab_list); } /* Tracking of the number of slabs for debugging purposes */ @@ -1773,9 +1765,9 @@ __add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) - list_add_tail(&page->lru, &n->partial); + list_add_tail(&page->slab_list, &n->partial); else - list_add(&page->lru, &n->partial); + list_add(&page->slab_list, &n->partial); } static inline void add_partial(struct kmem_cache_node *n, @@ -1789,7 +1781,7 @@ static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { lockdep_assert_held(&n->list_lock); - list_del(&page->lru); + list_del(&page->slab_list); n->nr_partial--; } @@ -1863,7 +1855,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, return NULL; spin_lock(&n->list_lock); - list_for_each_entry_safe(page, page2, &n->partial, lru) { + list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; if (!pfmemalloc_match(page, flags)) @@ -1951,7 +1943,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, } } } while (read_mems_allowed_retry(cpuset_mems_cookie)); -#endif +#endif /* CONFIG_NUMA */ return NULL; } @@ -2249,7 +2241,7 @@ static void unfreeze_partials(struct kmem_cache *s, discard_slab(s, page); stat(s, FREE_SLAB); } -#endif +#endif /* CONFIG_SLUB_CPU_PARTIAL */ } /* @@ -2308,7 +2300,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) local_irq_restore(flags); } preempt_enable(); -#endif +#endif /* CONFIG_SLUB_CPU_PARTIAL */ } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -2407,7 +2399,7 @@ static unsigned long count_partial(struct kmem_cache_node *n, struct page *page; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) + list_for_each_entry(page, &n->partial, slab_list) x += get_count(page); spin_unlock_irqrestore(&n->list_lock, flags); return x; @@ -2813,7 +2805,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif -#endif +#endif /* CONFIG_NUMA */ /* * Slow path handling. This may still be called frequently since objects @@ -2912,8 +2904,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * then add it. */ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - if (kmem_cache_debug(s)) - remove_full(s, n, page); + remove_full(s, n, page); add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -3589,6 +3580,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (s->flags & SLAB_CACHE_DMA) s->allocflags |= GFP_DMA; + if (s->flags & SLAB_CACHE_DMA32) + s->allocflags |= GFP_DMA32; + if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; @@ -3702,10 +3696,10 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &n->partial, lru) { + list_for_each_entry_safe(page, h, &n->partial, slab_list) { if (!page->inuse) { remove_partial(n, page); - list_add(&page->lru, &discard); + list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, "Objects remaining in %s on __kmem_cache_shutdown()"); @@ -3713,7 +3707,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) } spin_unlock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &discard, lru) + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } @@ -3845,7 +3839,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return ret; } EXPORT_SYMBOL(__kmalloc_node); -#endif +#endif /* CONFIG_NUMA */ #ifdef CONFIG_HARDENED_USERCOPY /* @@ -3993,7 +3987,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) * Note that concurrent frees may occur while we hold the * list_lock. page->inuse here is the upper limit. */ - list_for_each_entry_safe(page, t, &n->partial, lru) { + list_for_each_entry_safe(page, t, &n->partial, slab_list) { int free = page->objects - page->inuse; /* Do not reread page->inuse */ @@ -4003,10 +3997,10 @@ int __kmem_cache_shrink(struct kmem_cache *s) BUG_ON(free <= 0); if (free == page->objects) { - list_move(&page->lru, &discard); + list_move(&page->slab_list, &discard); n->nr_partial--; } else if (free <= SHRINK_PROMOTE_MAX) - list_move(&page->lru, promote + free - 1); + list_move(&page->slab_list, promote + free - 1); } /* @@ -4019,7 +4013,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, &discard, lru) + list_for_each_entry_safe(page, t, &discard, slab_list) discard_slab(s, page); if (slabs_node(s, node)) @@ -4063,7 +4057,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) */ slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); } -#endif +#endif /* CONFIG_MEMCG */ static int slab_mem_going_offline_callback(void *arg) { @@ -4211,11 +4205,11 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) for_each_kmem_cache_node(s, node, n) { struct page *p; - list_for_each_entry(p, &n->partial, lru) + list_for_each_entry(p, &n->partial, slab_list) p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) + list_for_each_entry(p, &n->full, slab_list) p->slab_cache = s; #endif } @@ -4432,7 +4426,7 @@ static int validate_slab_node(struct kmem_cache *s, spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) { + list_for_each_entry(page, &n->partial, slab_list) { validate_slab_slab(s, page, map); count++; } @@ -4443,7 +4437,7 @@ static int validate_slab_node(struct kmem_cache *s, if (!(s->flags & SLAB_STORE_USER)) goto out; - list_for_each_entry(page, &n->full, lru) { + list_for_each_entry(page, &n->full, slab_list) { validate_slab_slab(s, page, map); count++; } @@ -4639,9 +4633,9 @@ static int list_locations(struct kmem_cache *s, char *buf, continue; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) + list_for_each_entry(page, &n->partial, slab_list) process_slab(&t, s, page, alloc, map); - list_for_each_entry(page, &n->full, lru) + list_for_each_entry(page, &n->full, slab_list) process_slab(&t, s, page, alloc, map); spin_unlock_irqrestore(&n->list_lock, flags); } @@ -4696,7 +4690,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf, "No data\n"); return len; } -#endif +#endif /* CONFIG_SLUB_DEBUG */ #ifdef SLUB_RESILIENCY_TEST static void __init resiliency_test(void) @@ -4756,7 +4750,7 @@ static void __init resiliency_test(void) #ifdef CONFIG_SYSFS static void resiliency_test(void) {}; #endif -#endif +#endif /* SLUB_RESILIENCY_TEST */ #ifdef CONFIG_SYSFS enum slab_stat_type { @@ -5413,7 +5407,7 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); -#endif +#endif /* CONFIG_SLUB_STATS */ static struct attribute *slab_attrs[] = { &slab_size_attr.attr, @@ -5614,7 +5608,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) if (buffer) free_page((unsigned long)buffer); -#endif +#endif /* CONFIG_MEMCG */ } static void kmem_cache_release(struct kobject *k) @@ -5679,6 +5673,8 @@ static char *create_unique_id(struct kmem_cache *s) */ if (s->flags & SLAB_CACHE_DMA) *p++ = 'd'; + if (s->flags & SLAB_CACHE_DMA32) + *p++ = 'D'; if (s->flags & SLAB_RECLAIM_ACCOUNT) *p++ = 'a'; if (s->flags & SLAB_CONSISTENCY_CHECKS) diff --git a/mm/sparse.c b/mm/sparse.c index 69904aa6165b..fd13166949b5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -567,7 +567,7 @@ void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn) } #ifdef CONFIG_MEMORY_HOTREMOVE -/* Mark all memory sections within the pfn range as online */ +/* Mark all memory sections within the pfn range as offline */ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; @@ -684,10 +684,18 @@ static void free_map_bootmem(struct page *memmap) #endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -/* - * returns the number of sections whose mem_maps were properly - * set. If this is <=0, then that means that the passed-in - * map was not consumed and must be freed. +/** + * sparse_add_one_section - add a memory section + * @nid: The node to add section on + * @start_pfn: start pfn of the memory range + * @altmap: device page map + * + * This is only intended for hotplug. + * + * Return: + * * 0 - On success. + * * -EEXIST - Section has been present. + * * -ENOMEM - Out of memory. */ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, struct vmem_altmap *altmap) diff --git a/mm/swap.c b/mm/swap.c index 301ed4e04320..3a75722e68a9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -867,7 +867,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); /* * Page becomes evictable in two ways: - * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()]. + * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. * 2) Before acquiring LRU lock to put the page to correct LRU and then * a) do PageLRU check with lock [check_move_unevictable_pages] * b) do PageLRU check before lock [clear_page_mlock] diff --git a/mm/swap_state.c b/mm/swap_state.c index 85245fdec8d9..eb714165afd2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); set_page_private(page + i, entry.val + i); - xas_store(&xas, page + i); + xas_store(&xas, page); xas_next(&xas); } address_space->nrpages += nr; @@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, NULL); - VM_BUG_ON_PAGE(entry != page + i, entry); + VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 2b8d9c3fbb47..cf63b5f01adf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2023,7 +2023,6 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * If the boolean frontswap is true, only unuse pages_to_unuse pages; * pages_to_unuse==0 means all pages; ignored if frontswap is false */ -#define SWAP_UNUSE_MAX_TRIES 3 int try_to_unuse(unsigned int type, bool frontswap, unsigned long pages_to_unuse) { @@ -2035,7 +2034,6 @@ int try_to_unuse(unsigned int type, bool frontswap, struct page *page; swp_entry_t entry; unsigned int i; - int retries = 0; if (!si->inuse_pages) return 0; @@ -2053,11 +2051,9 @@ retry: spin_lock(&mmlist_lock); p = &init_mm.mmlist; - while ((p = p->next) != &init_mm.mmlist) { - if (signal_pending(current)) { - retval = -EINTR; - break; - } + while (si->inuse_pages && + !signal_pending(current) && + (p = p->next) != &init_mm.mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!mmget_not_zero(mm)) @@ -2084,7 +2080,9 @@ retry: mmput(prev_mm); i = 0; - while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { + while (si->inuse_pages && + !signal_pending(current) && + (i = find_next_to_unuse(si, i, frontswap)) != 0) { entry = swp_entry(type, i); page = find_get_page(swap_address_space(entry), i); @@ -2117,14 +2115,18 @@ retry: * If yes, we would need to do retry the unuse logic again. * Under global memory pressure, swap entries can be reinserted back * into process space after the mmlist loop above passes over them. - * Its not worth continuosuly retrying to unuse the swap in this case. - * So we try SWAP_UNUSE_MAX_TRIES times. + * + * Limit the number of retries? No: when mmget_not_zero() above fails, + * that mm is likely to be freeing swap from exit_mmap(), which proceeds + * at its own independent pace; and even shmem_writepage() could have + * been preempted after get_swap_page(), temporarily hiding that swap. + * It's easy and robust (though cpu-intensive) just to keep retrying. */ - if (++retries >= SWAP_UNUSE_MAX_TRIES) - retval = -EBUSY; - else if (si->inuse_pages) - goto retry; - + if (si->inuse_pages) { + if (!signal_pending(current)) + goto retry; + retval = -EINTR; + } out: return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval; } diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index d59b5a73dfb3..9932d5755e4c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -271,8 +271,7 @@ retry: */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; - hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, - idx, dst_addr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; diff --git a/mm/util.c b/mm/util.c index d559bde497a9..e2e4f8c3fa12 100644 --- a/mm/util.c +++ b/mm/util.c @@ -204,7 +204,7 @@ EXPORT_SYMBOL(vmemdup_user); * @s: The string to duplicate * @n: Maximum number of bytes to copy, including the trailing NUL. * - * Return: newly allocated copy of @s or %NULL in case of error + * Return: newly allocated copy of @s or an ERR_PTR() in case of error */ char *strndup_user(const char __user *s, long n) { @@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) + int nr_pages, unsigned int gup_flags, + struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, pages, - write ? FOLL_WRITE : 0); + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL_GPL(get_user_pages_fast); @@ -652,7 +652,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - long free, allowed, reserve; + long allowed; VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < -(s64)vm_committed_as_batch * num_online_cpus(), @@ -667,51 +667,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_zone_page_state(NR_FREE_PAGES); - free += global_node_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_node_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_node_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Part of the kernel memory, which can be released - * under memory pressure. - */ - free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) + if (pages > totalram_pages() + total_swap_pages) goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; + return 0; } allowed = vm_commit_limit(); @@ -725,7 +683,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Don't let a single process grow so big a user can't recover */ if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e86ba6e74b50..c42872ed82ac 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/set_memory.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/list.h> @@ -31,6 +32,7 @@ #include <linux/compiler.h> #include <linux/llist.h> #include <linux/bitops.h> +#include <linux/rbtree_augmented.h> #include <linux/uaccess.h> #include <asm/tlbflush.h> @@ -323,6 +325,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ +#define DEBUG_AUGMENT_PROPAGATE_CHECK 0 +#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0 + #define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 @@ -331,14 +336,67 @@ static DEFINE_SPINLOCK(vmap_area_lock); LIST_HEAD(vmap_area_list); static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; +static bool vmap_initialized __read_mostly; + +/* + * This kmem_cache is used for vmap_area objects. Instead of + * allocating from slab we reuse an object from this cache to + * make things faster. Especially in "no edge" splitting of + * free block. + */ +static struct kmem_cache *vmap_area_cachep; + +/* + * This linked list is used in pair with free_vmap_area_root. + * It gives O(1) access to prev/next to perform fast coalescing. + */ +static LIST_HEAD(free_vmap_area_list); -/* The vmap cache globals are protected by vmap_area_lock */ -static struct rb_node *free_vmap_cache; -static unsigned long cached_hole_size; -static unsigned long cached_vstart; -static unsigned long cached_align; +/* + * This augment red-black tree represents the free vmap space. + * All vmap_area objects in this tree are sorted by va->va_start + * address. It is used for allocation and merging when a vmap + * object is released. + * + * Each vmap_area node contains a maximum available free block + * of its sub-tree, right or left. Therefore it is possible to + * find a lowest match of free area. + */ +static struct rb_root free_vmap_area_root = RB_ROOT; -static unsigned long vmap_area_pcpu_hole; +static __always_inline unsigned long +va_size(struct vmap_area *va) +{ + return (va->va_end - va->va_start); +} + +static __always_inline unsigned long +get_subtree_max_size(struct rb_node *node) +{ + struct vmap_area *va; + + va = rb_entry_safe(node, struct vmap_area, rb_node); + return va ? va->subtree_max_size : 0; +} + +/* + * Gets called when remove the node and rotate. + */ +static __always_inline unsigned long +compute_subtree_max_size(struct vmap_area *va) +{ + return max3(va_size(va), + get_subtree_max_size(va->rb_node.rb_left), + get_subtree_max_size(va->rb_node.rb_right)); +} + +RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb, + struct vmap_area, rb_node, unsigned long, subtree_max_size, + compute_subtree_max_size) + +static void purge_vmap_area_lazy(void); +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); +static unsigned long lazy_max_pages(void); static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -359,41 +417,610 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) return NULL; } -static void __insert_vmap_area(struct vmap_area *va) -{ - struct rb_node **p = &vmap_area_root.rb_node; - struct rb_node *parent = NULL; - struct rb_node *tmp; +/* + * This function returns back addresses of parent node + * and its left or right link for further processing. + */ +static __always_inline struct rb_node ** +find_va_links(struct vmap_area *va, + struct rb_root *root, struct rb_node *from, + struct rb_node **parent) +{ + struct vmap_area *tmp_va; + struct rb_node **link; + + if (root) { + link = &root->rb_node; + if (unlikely(!*link)) { + *parent = NULL; + return link; + } + } else { + link = &from; + } - while (*p) { - struct vmap_area *tmp_va; + /* + * Go to the bottom of the tree. When we hit the last point + * we end up with parent rb_node and correct direction, i name + * it link, where the new va->rb_node will be attached to. + */ + do { + tmp_va = rb_entry(*link, struct vmap_area, rb_node); - parent = *p; - tmp_va = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp_va->va_end) - p = &(*p)->rb_left; - else if (va->va_end > tmp_va->va_start) - p = &(*p)->rb_right; + /* + * During the traversal we also do some sanity check. + * Trigger the BUG() if there are sides(left/right) + * or full overlaps. + */ + if (va->va_start < tmp_va->va_end && + va->va_end <= tmp_va->va_start) + link = &(*link)->rb_left; + else if (va->va_end > tmp_va->va_start && + va->va_start >= tmp_va->va_end) + link = &(*link)->rb_right; else BUG(); + } while (*link); + + *parent = &tmp_va->rb_node; + return link; +} + +static __always_inline struct list_head * +get_va_next_sibling(struct rb_node *parent, struct rb_node **link) +{ + struct list_head *list; + + if (unlikely(!parent)) + /* + * The red-black tree where we try to find VA neighbors + * before merging or inserting is empty, i.e. it means + * there is no free vmap space. Normally it does not + * happen but we handle this case anyway. + */ + return NULL; + + list = &rb_entry(parent, struct vmap_area, rb_node)->list; + return (&parent->rb_right == link ? list->next : list); +} + +static __always_inline void +link_va(struct vmap_area *va, struct rb_root *root, + struct rb_node *parent, struct rb_node **link, struct list_head *head) +{ + /* + * VA is still not in the list, but we can + * identify its future previous list_head node. + */ + if (likely(parent)) { + head = &rb_entry(parent, struct vmap_area, rb_node)->list; + if (&parent->rb_right != link) + head = head->prev; + } + + /* Insert to the rb-tree */ + rb_link_node(&va->rb_node, parent, link); + if (root == &free_vmap_area_root) { + /* + * Some explanation here. Just perform simple insertion + * to the tree. We do not set va->subtree_max_size to + * its current size before calling rb_insert_augmented(). + * It is because of we populate the tree from the bottom + * to parent levels when the node _is_ in the tree. + * + * Therefore we set subtree_max_size to zero after insertion, + * to let __augment_tree_propagate_from() puts everything to + * the correct order later on. + */ + rb_insert_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + va->subtree_max_size = 0; + } else { + rb_insert_color(&va->rb_node, root); } - rb_link_node(&va->rb_node, parent, p); - rb_insert_color(&va->rb_node, &vmap_area_root); + /* Address-sort this list */ + list_add(&va->list, head); +} - /* address-sort this list */ - tmp = rb_prev(&va->rb_node); - if (tmp) { - struct vmap_area *prev; - prev = rb_entry(tmp, struct vmap_area, rb_node); - list_add_rcu(&va->list, &prev->list); - } else - list_add_rcu(&va->list, &vmap_area_list); +static __always_inline void +unlink_va(struct vmap_area *va, struct rb_root *root) +{ + /* + * During merging a VA node can be empty, therefore + * not linked with the tree nor list. Just check it. + */ + if (!RB_EMPTY_NODE(&va->rb_node)) { + if (root == &free_vmap_area_root) + rb_erase_augmented(&va->rb_node, + root, &free_vmap_area_rb_augment_cb); + else + rb_erase(&va->rb_node, root); + + list_del(&va->list); + RB_CLEAR_NODE(&va->rb_node); + } } -static void purge_vmap_area_lazy(void); +#if DEBUG_AUGMENT_PROPAGATE_CHECK +static void +augment_tree_propagate_check(struct rb_node *n) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long size; + bool found = false; -static BLOCKING_NOTIFIER_HEAD(vmap_notify_list); + if (n == NULL) + return; + + va = rb_entry(n, struct vmap_area, rb_node); + size = va->subtree_max_size; + node = n; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) == size) { + node = node->rb_left; + } else { + if (va_size(va) == size) { + found = true; + break; + } + + node = node->rb_right; + } + } + + if (!found) { + va = rb_entry(n, struct vmap_area, rb_node); + pr_emerg("tree is corrupted: %lu, %lu\n", + va_size(va), va->subtree_max_size); + } + + augment_tree_propagate_check(n->rb_left); + augment_tree_propagate_check(n->rb_right); +} +#endif + +/* + * This function populates subtree_max_size from bottom to upper + * levels starting from VA point. The propagation must be done + * when VA size is modified by changing its va_start/va_end. Or + * in case of newly inserting of VA to the tree. + * + * It means that __augment_tree_propagate_from() must be called: + * - After VA has been inserted to the tree(free path); + * - After VA has been shrunk(allocation path); + * - After VA has been increased(merging path). + * + * Please note that, it does not mean that upper parent nodes + * and their subtree_max_size are recalculated all the time up + * to the root node. + * + * 4--8 + * /\ + * / \ + * / \ + * 2--2 8--8 + * + * For example if we modify the node 4, shrinking it to 2, then + * no any modification is required. If we shrink the node 2 to 1 + * its subtree_max_size is updated only, and set to 1. If we shrink + * the node 8 to 6, then its subtree_max_size is set to 6 and parent + * node becomes 4--6. + */ +static __always_inline void +augment_tree_propagate_from(struct vmap_area *va) +{ + struct rb_node *node = &va->rb_node; + unsigned long new_va_sub_max_size; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + new_va_sub_max_size = compute_subtree_max_size(va); + + /* + * If the newly calculated maximum available size of the + * subtree is equal to the current one, then it means that + * the tree is propagated correctly. So we have to stop at + * this point to save cycles. + */ + if (va->subtree_max_size == new_va_sub_max_size) + break; + + va->subtree_max_size = new_va_sub_max_size; + node = rb_parent(&va->rb_node); + } + +#if DEBUG_AUGMENT_PROPAGATE_CHECK + augment_tree_propagate_check(free_vmap_area_root.rb_node); +#endif +} + +static void +insert_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + link = find_va_links(va, root, NULL, &parent); + link_va(va, root, parent, link, head); +} + +static void +insert_vmap_area_augment(struct vmap_area *va, + struct rb_node *from, struct rb_root *root, + struct list_head *head) +{ + struct rb_node **link; + struct rb_node *parent; + + if (from) + link = find_va_links(va, NULL, from, &parent); + else + link = find_va_links(va, root, NULL, &parent); + + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); +} + +/* + * Merge de-allocated chunk of VA memory with previous + * and next free blocks. If coalesce is not done a new + * free area is inserted. If VA has been merged, it is + * freed. + */ +static __always_inline void +merge_or_add_vmap_area(struct vmap_area *va, + struct rb_root *root, struct list_head *head) +{ + struct vmap_area *sibling; + struct list_head *next; + struct rb_node **link; + struct rb_node *parent; + bool merged = false; + + /* + * Find a place in the tree where VA potentially will be + * inserted, unless it is merged with its sibling/siblings. + */ + link = find_va_links(va, root, NULL, &parent); + + /* + * Get next node of VA to check if merging can be done. + */ + next = get_va_next_sibling(parent, link); + if (unlikely(next == NULL)) + goto insert; + + /* + * start end + * | | + * |<------VA------>|<-----Next----->| + * | | + * start end + */ + if (next != head) { + sibling = list_entry(next, struct vmap_area, list); + if (sibling->va_start == va->va_end) { + sibling->va_start = va->va_start; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + /* Point to the new merged area. */ + va = sibling; + merged = true; + } + } + + /* + * start end + * | | + * |<-----Prev----->|<------VA------>| + * | | + * start end + */ + if (next->prev != head) { + sibling = list_entry(next->prev, struct vmap_area, list); + if (sibling->va_end == va->va_start) { + sibling->va_end = va->va_end; + + /* Check and update the tree if needed. */ + augment_tree_propagate_from(sibling); + + /* Remove this VA, it has been merged. */ + unlink_va(va, root); + + /* Free vmap_area object. */ + kmem_cache_free(vmap_area_cachep, va); + + return; + } + } + +insert: + if (!merged) { + link_va(va, root, parent, link, head); + augment_tree_propagate_from(va); + } +} + +static __always_inline bool +is_within_this_va(struct vmap_area *va, unsigned long size, + unsigned long align, unsigned long vstart) +{ + unsigned long nva_start_addr; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Can be overflowed due to big size or alignment. */ + if (nva_start_addr + size < nva_start_addr || + nva_start_addr < vstart) + return false; + + return (nva_start_addr + size <= va->va_end); +} + +/* + * Find the first free block(lowest start address) in the tree, + * that will accomplish the request corresponding to passing + * parameters. + */ +static __always_inline struct vmap_area * +find_vmap_lowest_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + struct rb_node *node; + unsigned long length; + + /* Start from the root. */ + node = free_vmap_area_root.rb_node; + + /* Adjust the search size for alignment overhead. */ + length = size + align - 1; + + while (node) { + va = rb_entry(node, struct vmap_area, rb_node); + + if (get_subtree_max_size(node->rb_left) >= length && + vstart < va->va_start) { + node = node->rb_left; + } else { + if (is_within_this_va(va, size, align, vstart)) + return va; + + /* + * Does not make sense to go deeper towards the right + * sub-tree if it does not have a free block that is + * equal or bigger to the requested search length. + */ + if (get_subtree_max_size(node->rb_right) >= length) { + node = node->rb_right; + continue; + } + + /* + * OK. We roll back and find the fist right sub-tree, + * that will satisfy the search criteria. It can happen + * only once due to "vstart" restriction. + */ + while ((node = rb_parent(node))) { + va = rb_entry(node, struct vmap_area, rb_node); + if (is_within_this_va(va, size, align, vstart)) + return va; + + if (get_subtree_max_size(node->rb_right) >= length && + vstart <= va->va_start) { + node = node->rb_right; + break; + } + } + } + } + + return NULL; +} + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK +#include <linux/random.h> + +static struct vmap_area * +find_vmap_lowest_linear_match(unsigned long size, + unsigned long align, unsigned long vstart) +{ + struct vmap_area *va; + + list_for_each_entry(va, &free_vmap_area_list, list) { + if (!is_within_this_va(va, size, align, vstart)) + continue; + + return va; + } + + return NULL; +} + +static void +find_vmap_lowest_match_check(unsigned long size) +{ + struct vmap_area *va_1, *va_2; + unsigned long vstart; + unsigned int rnd; + + get_random_bytes(&rnd, sizeof(rnd)); + vstart = VMALLOC_START + rnd; + + va_1 = find_vmap_lowest_match(size, 1, vstart); + va_2 = find_vmap_lowest_linear_match(size, 1, vstart); + + if (va_1 != va_2) + pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n", + va_1, va_2, vstart); +} +#endif + +enum fit_type { + NOTHING_FIT = 0, + FL_FIT_TYPE = 1, /* full fit */ + LE_FIT_TYPE = 2, /* left edge fit */ + RE_FIT_TYPE = 3, /* right edge fit */ + NE_FIT_TYPE = 4 /* no edge fit */ +}; + +static __always_inline enum fit_type +classify_va_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size) +{ + enum fit_type type; + + /* Check if it is within VA. */ + if (nva_start_addr < va->va_start || + nva_start_addr + size > va->va_end) + return NOTHING_FIT; + + /* Now classify. */ + if (va->va_start == nva_start_addr) { + if (va->va_end == nva_start_addr + size) + type = FL_FIT_TYPE; + else + type = LE_FIT_TYPE; + } else if (va->va_end == nva_start_addr + size) { + type = RE_FIT_TYPE; + } else { + type = NE_FIT_TYPE; + } + + return type; +} + +static __always_inline int +adjust_va_to_fit_type(struct vmap_area *va, + unsigned long nva_start_addr, unsigned long size, + enum fit_type type) +{ + struct vmap_area *lva; + + if (type == FL_FIT_TYPE) { + /* + * No need to split VA, it fully fits. + * + * | | + * V NVA V + * |---------------| + */ + unlink_va(va, &free_vmap_area_root); + kmem_cache_free(vmap_area_cachep, va); + } else if (type == LE_FIT_TYPE) { + /* + * Split left edge of fit VA. + * + * | | + * V NVA V R + * |-------|-------| + */ + va->va_start += size; + } else if (type == RE_FIT_TYPE) { + /* + * Split right edge of fit VA. + * + * | | + * L V NVA V + * |-------|-------| + */ + va->va_end = nva_start_addr; + } else if (type == NE_FIT_TYPE) { + /* + * Split no edge of fit VA. + * + * | | + * L V NVA V R + * |---|-------|---| + */ + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); + if (unlikely(!lva)) + return -1; + + /* + * Build the remainder. + */ + lva->va_start = va->va_start; + lva->va_end = nva_start_addr; + + /* + * Shrink this VA to remaining size. + */ + va->va_start = nva_start_addr + size; + } else { + return -1; + } + + if (type != FL_FIT_TYPE) { + augment_tree_propagate_from(va); + + if (type == NE_FIT_TYPE) + insert_vmap_area_augment(lva, &va->rb_node, + &free_vmap_area_root, &free_vmap_area_list); + } + + return 0; +} + +/* + * Returns a start address of the newly allocated area, if success. + * Otherwise a vend is returned that indicates failure. + */ +static __always_inline unsigned long +__alloc_vmap_area(unsigned long size, unsigned long align, + unsigned long vstart, unsigned long vend, int node) +{ + unsigned long nva_start_addr; + struct vmap_area *va; + enum fit_type type; + int ret; + + va = find_vmap_lowest_match(size, align, vstart); + if (unlikely(!va)) + return vend; + + if (va->va_start > vstart) + nva_start_addr = ALIGN(va->va_start, align); + else + nva_start_addr = ALIGN(vstart, align); + + /* Check the "vend" restriction. */ + if (nva_start_addr + size > vend) + return vend; + + /* Classify what we have found. */ + type = classify_va_fit_type(va, nva_start_addr, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + return vend; + + /* Update the free vmap_area. */ + ret = adjust_va_to_fit_type(va, nva_start_addr, size, type); + if (ret) + return vend; + +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK + find_vmap_lowest_match_check(size); +#endif + + return nva_start_addr; +} /* * Allocate a region of KVA of the specified size and alignment, within the @@ -405,18 +1032,19 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int node, gfp_t gfp_mask) { struct vmap_area *va; - struct rb_node *n; unsigned long addr; int purged = 0; - struct vmap_area *first; BUG_ON(!size); BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); + if (unlikely(!vmap_initialized)) + return ERR_PTR(-EBUSY); + might_sleep(); - va = kmalloc_node(sizeof(struct vmap_area), + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -429,87 +1057,20 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, retry: spin_lock(&vmap_area_lock); - /* - * Invalidate cache if we have more permissive parameters. - * cached_hole_size notes the largest hole noticed _below_ - * the vmap_area cached in free_vmap_cache: if size fits - * into that hole, we want to scan from vstart to reuse - * the hole instead of allocating above free_vmap_cache. - * Note that __free_vmap_area may update free_vmap_cache - * without updating cached_hole_size or cached_align. - */ - if (!free_vmap_cache || - size < cached_hole_size || - vstart < cached_vstart || - align < cached_align) { -nocache: - cached_hole_size = 0; - free_vmap_cache = NULL; - } - /* record if we encounter less permissive parameters */ - cached_vstart = vstart; - cached_align = align; - - /* find starting point for our search */ - if (free_vmap_cache) { - first = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - addr = ALIGN(first->va_end, align); - if (addr < vstart) - goto nocache; - if (addr + size < addr) - goto overflow; - - } else { - addr = ALIGN(vstart, align); - if (addr + size < addr) - goto overflow; - - n = vmap_area_root.rb_node; - first = NULL; - - while (n) { - struct vmap_area *tmp; - tmp = rb_entry(n, struct vmap_area, rb_node); - if (tmp->va_end >= addr) { - first = tmp; - if (tmp->va_start <= addr) - break; - n = n->rb_left; - } else - n = n->rb_right; - } - - if (!first) - goto found; - } - /* from the starting point, walk areas until a suitable hole is found */ - while (addr + size > first->va_start && addr + size <= vend) { - if (addr + cached_hole_size < first->va_start) - cached_hole_size = first->va_start - addr; - addr = ALIGN(first->va_end, align); - if (addr + size < addr) - goto overflow; - - if (list_is_last(&first->list, &vmap_area_list)) - goto found; - - first = list_next_entry(first, list); - } - -found: /* - * Check also calculated address against the vstart, - * because it can be 0 because of big align request. + * If an allocation fails, the "vend" address is + * returned. Therefore trigger the overflow path. */ - if (addr + size > vend || addr < vstart) + addr = __alloc_vmap_area(size, align, vstart, vend, node); + if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->flags = 0; - __insert_vmap_area(va); - free_vmap_cache = &va->rb_node; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); @@ -538,7 +1099,8 @@ overflow: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n", size); - kfree(va); + + kmem_cache_free(vmap_area_cachep, va); return ERR_PTR(-EBUSY); } @@ -558,35 +1120,16 @@ static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); - if (free_vmap_cache) { - if (va->va_end < cached_vstart) { - free_vmap_cache = NULL; - } else { - struct vmap_area *cache; - cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node); - if (va->va_start <= cache->va_start) { - free_vmap_cache = rb_prev(&va->rb_node); - /* - * We don't try to update cached_hole_size or - * cached_align, but it won't go very wrong. - */ - } - } - } - rb_erase(&va->rb_node, &vmap_area_root); - RB_CLEAR_NODE(&va->rb_node); - list_del_rcu(&va->list); - /* - * Track the highest possible candidate for pcpu area - * allocation. Areas outside of vmalloc area can be returned - * here too, consider only end addresses which fall inside - * vmalloc area proper. + * Remove from the busy tree/list. */ - if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) - vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + unlink_va(va, &vmap_area_root); - kfree_rcu(va, rcu_head); + /* + * Merge VA with its neighbors, otherwise just add it. + */ + merge_or_add_vmap_area(va, + &free_vmap_area_root, &free_vmap_area_list); } /* @@ -632,7 +1175,7 @@ static unsigned long lazy_max_pages(void) return log * (32UL * 1024 * 1024 / PAGE_SIZE); } -static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0); /* * Serialize vmap purging. There is no actual criticial section protected @@ -650,7 +1193,7 @@ static void purge_fragmented_blocks_allcpus(void); */ void set_iounmap_nonlazy(void) { - atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); + atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1); } /* @@ -658,34 +1201,40 @@ void set_iounmap_nonlazy(void) */ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { + unsigned long resched_threshold; struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - bool do_free = false; lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); + if (unlikely(valist == NULL)) + return false; + + /* + * TODO: to calculate a flush range without looping. + * The list can be up to lazy_max_pages() elements. + */ llist_for_each_entry(va, valist, purge_list) { if (va->va_start < start) start = va->va_start; if (va->va_end > end) end = va->va_end; - do_free = true; } - if (!do_free) - return false; - flush_tlb_kernel_range(start, end); + resched_threshold = lazy_max_pages() << 1; spin_lock(&vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { - int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; __free_vmap_area(va); - atomic_sub(nr, &vmap_lazy_nr); - cond_resched_lock(&vmap_area_lock); + atomic_long_sub(nr, &vmap_lazy_nr); + + if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) + cond_resched_lock(&vmap_area_lock); } spin_unlock(&vmap_area_lock); return true; @@ -721,10 +1270,10 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - int nr_lazy; + unsigned long nr_lazy; - nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, - &vmap_lazy_nr); + nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> + PAGE_SHIFT, &vmap_lazy_nr); /* After this point, we may free va at any time */ llist_add(&va->purge_list, &vmap_purge_list); @@ -787,8 +1336,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) -static bool vmap_initialized __read_mostly = false; - struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1059,24 +1606,9 @@ static void vb_free(const void *addr, unsigned long size) spin_unlock(&vb->lock); } -/** - * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer - * - * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily - * to amortize TLB flushing overheads. What this means is that any page you - * have now, may, in a former life, have been mapped into kernel virtual - * address by the vmap layer and so there might be some CPUs with TLB entries - * still referencing that page (additional to the regular 1:1 kernel mapping). - * - * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can - * be sure that none of the pages we have control over will have any aliases - * from the vmap layer. - */ -void vm_unmap_aliases(void) +static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush) { - unsigned long start = ULONG_MAX, end = 0; int cpu; - int flush = 0; if (unlikely(!vmap_initialized)) return; @@ -1113,6 +1645,27 @@ void vm_unmap_aliases(void) flush_tlb_kernel_range(start, end); mutex_unlock(&vmap_purge_lock); } + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int flush = 0; + + _vm_unmap_aliases(start, end, flush); +} EXPORT_SYMBOL_GPL(vm_unmap_aliases); /** @@ -1243,12 +1796,58 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align) vm_area_add_early(vm); } +static void vmap_init_free_space(void) +{ + unsigned long vmap_start = 1; + const unsigned long vmap_end = ULONG_MAX; + struct vmap_area *busy, *free; + + /* + * B F B B B F + * -|-----|.....|-----|-----|-----|.....|- + * | The KVA space | + * |<--------------------------------->| + */ + list_for_each_entry(busy, &vmap_area_list, list) { + if (busy->va_start - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = busy->va_start; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } + + vmap_start = busy->va_end; + } + + if (vmap_end - vmap_start > 0) { + free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (!WARN_ON_ONCE(!free)) { + free->va_start = vmap_start; + free->va_end = vmap_end; + + insert_vmap_area_augment(free, NULL, + &free_vmap_area_root, + &free_vmap_area_list); + } + } +} + void __init vmalloc_init(void) { struct vmap_area *va; struct vm_struct *tmp; int i; + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + for_each_possible_cpu(i) { struct vmap_block_queue *vbq; struct vfree_deferred *p; @@ -1263,16 +1862,21 @@ void __init vmalloc_init(void) /* Import existing vmlist entries. */ for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + va->flags = VM_VM_AREA; va->va_start = (unsigned long)tmp->addr; va->va_end = va->va_start + tmp->size; va->vm = tmp; - __insert_vmap_area(va); + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - vmap_area_pcpu_hole = VMALLOC_END; - + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); vmap_initialized = true; } @@ -1505,6 +2109,72 @@ struct vm_struct *remove_vm_area(const void *addr) return NULL; } +static inline void set_area_direct_map(const struct vm_struct *area, + int (*set_direct_map)(struct page *page)) +{ + int i; + + for (i = 0; i < area->nr_pages; i++) + if (page_address(area->pages[i])) + set_direct_map(area->pages[i]); +} + +/* Handle removing and resetting vm mappings related to the vm_struct. */ +static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long start = ULONG_MAX, end = 0; + int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; + int i; + + /* + * The below block can be removed when all architectures that have + * direct map permissions also have set_direct_map_() implementations. + * This is concerned with resetting the direct map any an vm alias with + * execute permissions, without leaving a RW+X window. + */ + if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + set_memory_nx(addr, area->nr_pages); + set_memory_rw(addr, area->nr_pages); + } + + remove_vm_area(area->addr); + + /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ + if (!flush_reset) + return; + + /* + * If not deallocating pages, just do the flush of the VM area and + * return. + */ + if (!deallocate_pages) { + vm_unmap_aliases(); + return; + } + + /* + * If execution gets here, flush the vm mapping and reset the direct + * map. Find the start and end range of the direct mappings to make sure + * the vm_unmap_aliases() flush includes the direct map. + */ + for (i = 0; i < area->nr_pages; i++) { + if (page_address(area->pages[i])) { + start = min(addr, start); + end = max(addr, end); + } + } + + /* + * Set direct map to something invalid so that it won't be cached if + * there are any accesses after the TLB flush, then flush the TLB and + * reset the direct map permissions to the default. + */ + set_area_direct_map(area, set_direct_map_invalid_noflush); + _vm_unmap_aliases(start, end, 1); + set_area_direct_map(area, set_direct_map_default_noflush); +} + static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; @@ -1526,7 +2196,8 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - remove_vm_area(addr); + vm_remove_mappings(area, deallocate_pages); + if (deallocate_pages) { int i; @@ -1961,8 +2632,9 @@ EXPORT_SYMBOL(vzalloc_node); */ void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, - NUMA_NO_NODE, __builtin_return_address(0)); + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, + NUMA_NO_NODE, __builtin_return_address(0)); } #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) @@ -2396,81 +3068,64 @@ static struct vmap_area *node_to_va(struct rb_node *n) } /** - * pvm_find_next_prev - find the next and prev vmap_area surrounding @end - * @end: target address - * @pnext: out arg for the next vmap_area - * @pprev: out arg for the previous vmap_area + * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to + * @addr: target address * - * Returns: %true if either or both of next and prev are found, - * %false if no vmap_area exists - * - * Find vmap_areas end addresses of which enclose @end. ie. if not - * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + * Returns: vmap_area if it is found. If there is no such area + * the first highest(reverse order) vmap_area is returned + * i.e. va->va_start < addr && va->va_end < addr or NULL + * if there are no any areas before @addr. */ -static bool pvm_find_next_prev(unsigned long end, - struct vmap_area **pnext, - struct vmap_area **pprev) +static struct vmap_area * +pvm_find_va_enclose_addr(unsigned long addr) { - struct rb_node *n = vmap_area_root.rb_node; - struct vmap_area *va = NULL; + struct vmap_area *va, *tmp; + struct rb_node *n; + + n = free_vmap_area_root.rb_node; + va = NULL; while (n) { - va = rb_entry(n, struct vmap_area, rb_node); - if (end < va->va_end) - n = n->rb_left; - else if (end > va->va_end) + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_start <= addr) { + va = tmp; + if (tmp->va_end >= addr) + break; + n = n->rb_right; - else - break; + } else { + n = n->rb_left; + } } - if (!va) - return false; - - if (va->va_end > end) { - *pnext = va; - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); - } else { - *pprev = va; - *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); - } - return true; + return va; } /** - * pvm_determine_end - find the highest aligned address between two vmap_areas - * @pnext: in/out arg for the next vmap_area - * @pprev: in/out arg for the previous vmap_area - * @align: alignment - * - * Returns: determined end address - * - * Find the highest aligned address between *@pnext and *@pprev below - * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned - * down address is between the end addresses of the two vmap_areas. + * pvm_determine_end_from_reverse - find the highest aligned address + * of free block below VMALLOC_END + * @va: + * in - the VA we start the search(reverse order); + * out - the VA with the highest aligned end address. * - * Please note that the address returned by this function may fall - * inside *@pnext vmap_area. The caller is responsible for checking - * that. + * Returns: determined end address within vmap_area */ -static unsigned long pvm_determine_end(struct vmap_area **pnext, - struct vmap_area **pprev, - unsigned long align) +static unsigned long +pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align) { - const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); unsigned long addr; - if (*pnext) - addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); - else - addr = vmalloc_end; - - while (*pprev && (*pprev)->va_end > addr) { - *pnext = *pprev; - *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + if (likely(*va)) { + list_for_each_entry_from_reverse((*va), + &free_vmap_area_list, list) { + addr = min((*va)->va_end & ~(align - 1), vmalloc_end); + if ((*va)->va_start < addr) + return addr; + } } - return addr; + return 0; } /** @@ -2490,12 +3145,12 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * to gigabytes. To avoid interacting with regular vmallocs, these * areas are allocated from top. * - * Despite its complicated look, this allocator is rather simple. It - * does everything top-down and scans areas from the end looking for - * matching slot. While scanning, if any of the areas overlaps with - * existing vmap_area, the base address is pulled down to fit the - * area. Scanning is repeated till all the areas fit and then all - * necessary data structures are inserted and the result is returned. + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans free blocks from the end looking + * for matching base. While scanning, if any of the areas do not fit the + * base address is pulled down to fit the area. Scanning is repeated till + * all the areas fit and then all necessary data structures are inserted + * and the result is returned. */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, @@ -2503,11 +3158,12 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); - struct vmap_area **vas, *prev, *next; + struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; - unsigned long base, start, end, last_end; + unsigned long base, start, size, end, last_end; bool purged = false; + enum fit_type type; /* verify parameters and allocate data structures */ BUG_ON(offset_in_page(align) || !is_power_of_2(align)); @@ -2543,7 +3199,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, goto err_free2; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL); vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; @@ -2556,49 +3212,29 @@ retry: start = offsets[area]; end = start + sizes[area]; - if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { - base = vmalloc_end - last_end; - goto found; - } - base = pvm_determine_end(&next, &prev, align) - end; + va = pvm_find_va_enclose_addr(vmalloc_end); + base = pvm_determine_end_from_reverse(&va, align) - end; while (true) { - BUG_ON(next && next->va_end <= base + end); - BUG_ON(prev && prev->va_end > base + end); - /* * base might have underflowed, add last_end before * comparing. */ - if (base + last_end < vmalloc_start + last_end) { - spin_unlock(&vmap_area_lock); - if (!purged) { - purge_vmap_area_lazy(); - purged = true; - goto retry; - } - goto err_free; - } + if (base + last_end < vmalloc_start + last_end) + goto overflow; /* - * If next overlaps, move base downwards so that it's - * right below next and then recheck. + * Fitting base has not been found. */ - if (next && next->va_start < base + end) { - base = pvm_determine_end(&next, &prev, align) - end; - term_area = area; - continue; - } + if (va == NULL) + goto overflow; /* - * If prev overlaps, shift down next and prev and move - * base so that it's right below new next and then - * recheck. + * If this VA does not fit, move base downwards and recheck. */ - if (prev && prev->va_end > base + start) { - next = prev; - prev = node_to_va(rb_prev(&next->rb_node)); - base = pvm_determine_end(&next, &prev, align) - end; + if (base + start < va->va_start || base + end > va->va_end) { + va = node_to_va(rb_prev(&va->rb_node)); + base = pvm_determine_end_from_reverse(&va, align) - end; term_area = area; continue; } @@ -2610,21 +3246,40 @@ retry: area = (area + nr_vms - 1) % nr_vms; if (area == term_area) break; + start = offsets[area]; end = start + sizes[area]; - pvm_find_next_prev(base + end, &next, &prev); + va = pvm_find_va_enclose_addr(base + end); } -found: + /* we've found a fitting base, insert all va's */ for (area = 0; area < nr_vms; area++) { - struct vmap_area *va = vas[area]; + int ret; - va->va_start = base + offsets[area]; - va->va_end = va->va_start + sizes[area]; - __insert_vmap_area(va); - } + start = base + offsets[area]; + size = sizes[area]; - vmap_area_pcpu_hole = base + offsets[last_area]; + va = pvm_find_va_enclose_addr(start); + if (WARN_ON_ONCE(va == NULL)) + /* It is a BUG(), but trigger recovery instead. */ + goto recovery; + + type = classify_va_fit_type(va, start, size); + if (WARN_ON_ONCE(type == NOTHING_FIT)) + /* It is a BUG(), but trigger recovery instead. */ + goto recovery; + + ret = adjust_va_to_fit_type(va, start, size, type); + if (unlikely(ret)) + goto recovery; + + /* Allocated area. */ + va = vas[area]; + va->va_start = start; + va->va_end = start + size; + + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + } spin_unlock(&vmap_area_lock); @@ -2636,9 +3291,38 @@ found: kfree(vas); return vms; +recovery: + /* Remove previously inserted areas. */ + while (area--) { + __free_vmap_area(vas[area]); + vas[area] = NULL; + } + +overflow: + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + + /* Before "retry", check if we recover. */ + for (area = 0; area < nr_vms; area++) { + if (vas[area]) + continue; + + vas[area] = kmem_cache_zalloc( + vmap_area_cachep, GFP_KERNEL); + if (!vas[area]) + goto err_free; + } + + goto retry; + } + err_free: for (area = 0; area < nr_vms; area++) { - kfree(vas[area]); + if (vas[area]) + kmem_cache_free(vmap_area_cachep, vas[area]); + kfree(vms[area]); } err_free2: diff --git a/mm/vmscan.c b/mm/vmscan.c index a5ad0b35ab8e..7acd0afdfc2a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone int zid; if (!mem_cgroup_disabled()) - lru_size = mem_cgroup_get_lru_size(lruvec, lru); + lru_size = lruvec_page_state_local(lruvec, NR_LRU_BASE + lru); else lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); @@ -493,7 +493,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan += delta; if (total_scan < 0) { - pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", + pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; next_deferred = nr; @@ -1107,6 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, LIST_HEAD(ret_pages); LIST_HEAD(free_pages); unsigned nr_reclaimed = 0; + unsigned pgactivate = 0; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1466,8 +1467,10 @@ activate_locked: try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { + int type = page_is_file_cache(page); SetPageActive(page); - stat->nr_activate++; + pgactivate++; + stat->nr_activate[type] += hpage_nr_pages(page); count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1482,7 +1485,7 @@ keep: free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); - count_vm_events(PGACTIVATE, stat->nr_activate); + count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -1804,40 +1807,54 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, return isolated > inactive; } -static noinline_for_stack void -putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) +/* + * This moves pages from @list to corresponding LRU list. + * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold zone_lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop zone_lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_refcount against each page. + * But we had to alter page->flags anyway. + * + * Returns the number of pages moved to the given lruvec. + */ + +static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, + struct list_head *list) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; struct pglist_data *pgdat = lruvec_pgdat(lruvec); + int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); + struct page *page; + enum lru_list lru; - /* - * Put back any unfreeable pages. - */ - while (!list_empty(page_list)) { - struct page *page = lru_to_page(page_list); - int lru; - + while (!list_empty(list)) { + page = lru_to_page(list); VM_BUG_ON_PAGE(PageLRU(page), page); - list_del(&page->lru); if (unlikely(!page_evictable(page))) { + list_del(&page->lru); spin_unlock_irq(&pgdat->lru_lock); putback_lru_page(page); spin_lock_irq(&pgdat->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, pgdat); SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(page, lruvec, lru); - if (is_active_lru(lru)) { - int file = is_file_lru(lru); - int numpages = hpage_nr_pages(page); - reclaim_stat->recent_rotated[file] += numpages; - } + nr_pages = hpage_nr_pages(page); + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); + list_move(&page->lru, &lruvec->lists[lru]); + if (put_page_testzero(page)) { __ClearPageLRU(page); __ClearPageActive(page); @@ -1850,13 +1867,17 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); + } else { + nr_moved += nr_pages; } } /* * To save our caller's stack, now use input list for pages to free. */ - list_splice(&pages_to_free, page_list); + list_splice(&pages_to_free, list); + + return nr_moved; } /* @@ -1886,6 +1907,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_taken; struct reclaim_stat stat; int file = is_file_lru(lru); + enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; bool stalled = false; @@ -1913,17 +1935,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (current_is_kswapd()) { - if (global_reclaim(sc)) - __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, - nr_scanned); - } else { - if (global_reclaim(sc)) - __count_vm_events(PGSCAN_DIRECT, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, - nr_scanned); - } + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (global_reclaim(sc)) + __count_vm_events(item, nr_scanned); + __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) @@ -1934,19 +1949,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (current_is_kswapd()) { - if (global_reclaim(sc)) - __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, - nr_reclaimed); - } else { - if (global_reclaim(sc)) - __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, - nr_reclaimed); - } + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (global_reclaim(sc)) + __count_vm_events(item, nr_reclaimed); + __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + reclaim_stat->recent_rotated[0] = stat.nr_activate[0]; + reclaim_stat->recent_rotated[1] = stat.nr_activate[1]; - putback_inactive_pages(lruvec, &page_list); + move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); @@ -1983,73 +1993,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return nr_reclaimed; } -/* - * This moves pages from the active list to the inactive list. - * - * We move them the other way if the page is referenced by one or more - * processes, from rmap. - * - * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold pgdat->lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop pgdat->lru_lock around each page. It's impossible to balance - * this, so instead we remove the pages from the LRU while processing them. - * It is safe to rely on PG_active against the non-LRU pages in here because - * nobody will play with that bit on a non-LRU page. - * - * The downside is that we have to touch page->_refcount against each page. - * But we had to alter page->flags anyway. - * - * Returns the number of pages moved to the given lru. - */ - -static unsigned move_active_pages_to_lru(struct lruvec *lruvec, - struct list_head *list, - struct list_head *pages_to_free, - enum lru_list lru) -{ - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - struct page *page; - int nr_pages; - int nr_moved = 0; - - while (!list_empty(list)) { - page = lru_to_page(list); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - - nr_pages = hpage_nr_pages(page); - update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); - list_move(&page->lru, &lruvec->lists[lru]); - - if (put_page_testzero(page)) { - __ClearPageLRU(page); - __ClearPageActive(page); - del_page_from_lru_list(page, lruvec, lru); - - if (unlikely(PageCompound(page))) { - spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge(page); - (*get_compound_page_dtor(page))(page); - spin_lock_irq(&pgdat->lru_lock); - } else - list_add(&page->lru, pages_to_free); - } else { - nr_moved += nr_pages; - } - } - - if (!is_active_lru(lru)) { - __count_vm_events(PGDEACTIVATE, nr_moved); - count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, - nr_moved); - } - - return nr_moved; -} - static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, @@ -2079,7 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2136,13 +2079,19 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); - nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + nr_activate = move_pages_to_lru(lruvec, &l_active); + nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); + /* Keep all free pages in l_active list */ + list_splice(&l_inactive, &l_active); + + __count_vm_events(PGDEACTIVATE, nr_deactivate); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge_list(&l_hold); - free_unref_page_list(&l_hold); + mem_cgroup_uncharge_list(&l_active); + free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } @@ -2176,7 +2125,6 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct mem_cgroup *memcg, struct scan_control *sc, bool actual_reclaim) { enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; @@ -2197,16 +2145,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - if (memcg) - refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); - else - refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); - /* * When refaults are being observed, it means a new workingset * is being established. Disable active list protection to get * rid of the stale workingset quickly. */ + refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); if (file && actual_reclaim && lruvec->refaults != refaults) { inactive_ratio = 0; } else { @@ -2227,12 +2171,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct mem_cgroup *memcg, - struct scan_control *sc) + struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), - memcg, sc, true)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2332,7 +2274,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * anonymous pages on the LRU in eligible zones. * Otherwise, the small LRU gets thrashed. */ - if (!inactive_list_is_low(lruvec, false, memcg, sc, false) && + if (!inactive_list_is_low(lruvec, false, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_ANON; @@ -2350,7 +2292,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && + if (!inactive_list_is_low(lruvec, true, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2503,7 +2445,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, memcg, sc); + lruvec, sc); } } @@ -2570,7 +2512,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, memcg, sc, true)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2969,12 +2911,8 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) unsigned long refaults; struct lruvec *lruvec; - if (memcg) - refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); - else - refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); - lruvec = mem_cgroup_lruvec(pgdat, memcg); + refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE); lruvec->refaults = refaults; } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); } @@ -3223,10 +3161,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; - trace_mm_vmscan_direct_reclaim_begin(order, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3257,9 +3192,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + sc.gfp_mask); /* * NOTE: Although we can get the priority field, using it @@ -3308,10 +3241,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; - trace_mm_vmscan_memcg_reclaim_begin(0, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); @@ -3339,7 +3269,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, memcg, sc, true)) + if (inactive_list_is_low(lruvec, false, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -4160,6 +4090,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in .reclaim_idx = gfp_zone(gfp_mask), }; + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, + sc.gfp_mask); + cond_resched(); fs_reclaim_acquire(sc.gfp_mask); /* @@ -4186,6 +4119,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); + + trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); + return sc.nr_reclaimed >= nr_pages; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 36b56f858f0f..a7d493366a65 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1274,13 +1274,8 @@ const char * const vmstat_text[] = { #endif #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH -#ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#else - "", /* nr_tlb_remote_flush */ - "", /* nr_tlb_remote_flush_received */ -#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ diff --git a/mm/workingset.c b/mm/workingset.c index 0bedf67502d5..e0b4edcb88c8 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -426,12 +426,14 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; + int i; - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL); lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); - pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); - pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) + pages += lruvec_page_state_local(lruvec, + NR_LRU_BASE + i); + pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE); + pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE); } else #endif pages = node_present_pages(sc->nid); diff --git a/mm/z3fold.c b/mm/z3fold.c index aee9b0b8d907..1ffecd6333e5 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -24,16 +24,47 @@ #include <linux/atomic.h> #include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/dcache.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/page-flags.h> +#include <linux/migrate.h> +#include <linux/node.h> +#include <linux/compaction.h> #include <linux/percpu.h> +#include <linux/mount.h> +#include <linux/fs.h> #include <linux/preempt.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zpool.h> +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks + * in the beginning of an allocated page are occupied by z3fold header, so + * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), + * which shows the max number of free chunks in z3fold page, also there will + * be 63, or 62, respectively, freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) +#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) +#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +#define BUDDY_MASK (0x3) +#define BUDDY_SHIFT 2 +#define SLOTS_ALIGN (0x40) + /***************** * Structures *****************/ @@ -47,8 +78,18 @@ enum buddy { FIRST, MIDDLE, LAST, - BUDDIES_MAX + BUDDIES_MAX = LAST +}; + +struct z3fold_buddy_slots { + /* + * we are using BUDDY_MASK in handle_to_buddy etc. so there should + * be enough slots to hold all possible variants + */ + unsigned long slot[BUDDY_MASK + 1]; + unsigned long pool; /* back link + flags */ }; +#define HANDLE_FLAG_MASK (0x03) /* * struct z3fold_header - z3fold page metadata occupying first chunks of each @@ -58,49 +99,29 @@ enum buddy { * @page_lock: per-page lock * @refcount: reference count for the z3fold page * @work: work_struct for page layout optimization - * @pool: pointer to the pool which this page belongs to + * @slots: pointer to the structure holding buddy slots * @cpu: CPU which this page "belongs" to * @first_chunks: the size of the first buddy in chunks, 0 if free * @middle_chunks: the size of the middle buddy in chunks, 0 if free * @last_chunks: the size of the last buddy in chunks, 0 if free * @first_num: the starting number (for the first handle) + * @mapped_count: the number of objects currently mapped */ struct z3fold_header { struct list_head buddy; spinlock_t page_lock; struct kref refcount; struct work_struct work; - struct z3fold_pool *pool; + struct z3fold_buddy_slots *slots; short cpu; unsigned short first_chunks; unsigned short middle_chunks; unsigned short last_chunks; unsigned short start_middle; unsigned short first_num:2; + unsigned short mapped_count:2; }; -/* - * NCHUNKS_ORDER determines the internal allocation granularity, effectively - * adjusting internal fragmentation. It also determines the number of - * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks - * in the beginning of an allocated page are occupied by z3fold header, so - * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), - * which shows the max number of free chunks in z3fold page, also there will - * be 63, or 62, respectively, freelists per pool. - */ -#define NCHUNKS_ORDER 6 - -#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) -#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) -#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) -#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) - -#define BUDDY_MASK (0x3) -#define BUDDY_SHIFT 2 - /** * struct z3fold_pool - stores metadata for each z3fold pool * @name: pool name @@ -113,11 +134,13 @@ struct z3fold_header { * added buddy. * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. + * @c_handle: cache for z3fold_buddy_slots allocation * @ops: pointer to a structure of user defined operations specified at * pool creation time. * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release + * @inode: inode for z3fold pseudo filesystem * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -130,12 +153,14 @@ struct z3fold_pool { struct list_head lru; struct list_head stale; atomic64_t pages_nr; + struct kmem_cache *c_handle; const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; + struct inode *inode; }; /* @@ -164,11 +189,118 @@ static int size_to_chunks(size_t size) static void compact_page_work(struct work_struct *w); +static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool) +{ + struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle, + GFP_KERNEL); + + if (slots) { + memset(slots->slot, 0, sizeof(slots->slot)); + slots->pool = (unsigned long)pool; + } + + return slots; +} + +static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s) +{ + return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK); +} + +static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) +{ + return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); +} + +static inline void free_handle(unsigned long handle) +{ + struct z3fold_buddy_slots *slots; + int i; + bool is_free; + + if (handle & (1 << PAGE_HEADLESS)) + return; + + WARN_ON(*(unsigned long *)handle == 0); + *(unsigned long *)handle = 0; + slots = handle_to_slots(handle); + is_free = true; + for (i = 0; i <= BUDDY_MASK; i++) { + if (slots->slot[i]) { + is_free = false; + break; + } + } + + if (is_free) { + struct z3fold_pool *pool = slots_to_pool(slots); + + kmem_cache_free(pool->c_handle, slots); + } +} + +static struct dentry *z3fold_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + + return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33); +} + +static struct file_system_type z3fold_fs = { + .name = "z3fold", + .mount = z3fold_do_mount, + .kill_sb = kill_anon_super, +}; + +static struct vfsmount *z3fold_mnt; +static int z3fold_mount(void) +{ + int ret = 0; + + z3fold_mnt = kern_mount(&z3fold_fs); + if (IS_ERR(z3fold_mnt)) + ret = PTR_ERR(z3fold_mnt); + + return ret; +} + +static void z3fold_unmount(void) +{ + kern_unmount(z3fold_mnt); +} + +static const struct address_space_operations z3fold_aops; +static int z3fold_register_migration(struct z3fold_pool *pool) +{ + pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb); + if (IS_ERR(pool->inode)) { + pool->inode = NULL; + return 1; + } + + pool->inode->i_mapping->private_data = pool; + pool->inode->i_mapping->a_ops = &z3fold_aops; + return 0; +} + +static void z3fold_unregister_migration(struct z3fold_pool *pool) +{ + if (pool->inode) + iput(pool->inode); + } + /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, struct z3fold_pool *pool) { struct z3fold_header *zhdr = page_address(page); + struct z3fold_buddy_slots *slots = alloc_slots(pool); + + if (!slots) + return NULL; INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); @@ -185,15 +317,21 @@ static struct z3fold_header *init_z3fold_page(struct page *page, zhdr->first_num = 0; zhdr->start_middle = 0; zhdr->cpu = -1; - zhdr->pool = pool; + zhdr->slots = slots; INIT_LIST_HEAD(&zhdr->buddy); INIT_WORK(&zhdr->work, compact_page_work); return zhdr; } /* Resets the struct page fields and frees the page */ -static void free_z3fold_page(struct page *page) +static void free_z3fold_page(struct page *page, bool headless) { + if (!headless) { + lock_page(page); + __ClearPageMovable(page); + unlock_page(page); + } + ClearPagePrivate(page); __free_page(page); } @@ -215,33 +353,57 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr) spin_unlock(&zhdr->page_lock); } +/* Helper function to build the index */ +static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) +{ + return (bud + zhdr->first_num) & BUDDY_MASK; +} + /* * Encodes the handle of a particular buddy within a z3fold page * Pool lock should be held as this function accesses first_num */ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) { - unsigned long handle; + struct z3fold_buddy_slots *slots; + unsigned long h = (unsigned long)zhdr; + int idx = 0; - handle = (unsigned long)zhdr; - if (bud != HEADLESS) { - handle |= (bud + zhdr->first_num) & BUDDY_MASK; - if (bud == LAST) - handle |= (zhdr->last_chunks << BUDDY_SHIFT); - } - return handle; + /* + * For a headless page, its handle is its pointer with the extra + * PAGE_HEADLESS bit set + */ + if (bud == HEADLESS) + return h | (1 << PAGE_HEADLESS); + + /* otherwise, return pointer to encoded handle */ + idx = __idx(zhdr, bud); + h += idx; + if (bud == LAST) + h |= (zhdr->last_chunks << BUDDY_SHIFT); + + slots = zhdr->slots; + slots->slot[idx] = h; + return (unsigned long)&slots->slot[idx]; } /* Returns the z3fold page where a given handle is stored */ -static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) +static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) { - return (struct z3fold_header *)(handle & PAGE_MASK); + unsigned long addr = h; + + if (!(addr & (1 << PAGE_HEADLESS))) + addr = *(unsigned long *)h; + + return (struct z3fold_header *)(addr & PAGE_MASK); } /* only for LAST bud, returns zero otherwise */ static unsigned short handle_to_chunks(unsigned long handle) { - return (handle & ~PAGE_MASK) >> BUDDY_SHIFT; + unsigned long addr = *(unsigned long *)handle; + + return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; } /* @@ -251,21 +413,31 @@ static unsigned short handle_to_chunks(unsigned long handle) */ static enum buddy handle_to_buddy(unsigned long handle) { - struct z3fold_header *zhdr = handle_to_z3fold_header(handle); - return (handle - zhdr->first_num) & BUDDY_MASK; + struct z3fold_header *zhdr; + unsigned long addr; + + WARN_ON(handle & (1 << PAGE_HEADLESS)); + addr = *(unsigned long *)handle; + zhdr = (struct z3fold_header *)(addr & PAGE_MASK); + return (addr - zhdr->first_num) & BUDDY_MASK; +} + +static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) +{ + return slots_to_pool(zhdr->slots); } static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) { struct page *page = virt_to_page(zhdr); - struct z3fold_pool *pool = zhdr->pool; + struct z3fold_pool *pool = zhdr_to_pool(zhdr); WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); spin_lock(&pool->lock); if (!list_empty(&page->lru)) - list_del(&page->lru); + list_del_init(&page->lru); spin_unlock(&pool->lock); if (locked) z3fold_page_unlock(zhdr); @@ -295,9 +467,10 @@ static void release_z3fold_page_locked_list(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); - spin_lock(&zhdr->pool->lock); + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + spin_lock(&pool->lock); list_del_init(&zhdr->buddy); - spin_unlock(&zhdr->pool->lock); + spin_unlock(&pool->lock); WARN_ON(z3fold_page_trylock(zhdr)); __release_z3fold_page(zhdr, true); @@ -318,7 +491,7 @@ static void free_pages_work(struct work_struct *w) continue; spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); - free_z3fold_page(page); + free_z3fold_page(page, false); cond_resched(); spin_lock(&pool->stale_lock); } @@ -349,6 +522,23 @@ static int num_free_chunks(struct z3fold_header *zhdr) return nfree; } +/* Add to the appropriate unbuddied list */ +static inline void add_to_unbuddied(struct z3fold_pool *pool, + struct z3fold_header *zhdr) +{ + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || + zhdr->middle_chunks == 0) { + struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); + + int freechunks = num_free_chunks(zhdr); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + put_cpu_ptr(pool->unbuddied); + } +} + static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { @@ -367,6 +557,9 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) return 0; /* can't move middle chunk, it's used */ + if (unlikely(PageIsolated(page))) + return 0; + if (zhdr->middle_chunks == 0) return 0; /* nothing to compact */ @@ -406,10 +599,8 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) static void do_compact_page(struct z3fold_header *zhdr, bool locked) { - struct z3fold_pool *pool = zhdr->pool; + struct z3fold_pool *pool = zhdr_to_pool(zhdr); struct page *page; - struct list_head *unbuddied; - int fchunks; page = virt_to_page(zhdr); if (locked) @@ -429,19 +620,14 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) return; } - z3fold_compact_page(zhdr); - unbuddied = get_cpu_ptr(pool->unbuddied); - fchunks = num_free_chunks(zhdr); - if (fchunks < NCHUNKS && - (!zhdr->first_chunks || !zhdr->middle_chunks || - !zhdr->last_chunks)) { - /* the page's not completely free and it's unbuddied */ - spin_lock(&pool->lock); - list_add(&zhdr->buddy, &unbuddied[fchunks]); - spin_unlock(&pool->lock); - zhdr->cpu = smp_processor_id(); + if (unlikely(PageIsolated(page) || + test_bit(PAGE_STALE, &page->private))) { + z3fold_page_unlock(zhdr); + return; } - put_cpu_ptr(pool->unbuddied); + + z3fold_compact_page(zhdr); + add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); } @@ -453,6 +639,103 @@ static void compact_page_work(struct work_struct *w) do_compact_page(zhdr, false); } +/* returns _locked_ z3fold page header or NULL */ +static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + size_t size, bool can_sleep) +{ + struct z3fold_header *zhdr = NULL; + struct page *page; + struct list_head *unbuddied; + int chunks = size_to_chunks(size), i; + +lookup: + /* First, try to find an unbuddied z3fold page. */ + unbuddied = get_cpu_ptr(pool->unbuddied); + for_each_unbuddied_list(i, chunks) { + struct list_head *l = &unbuddied[i]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), + struct z3fold_header, buddy); + + if (!zhdr) + continue; + + /* Re-check under lock. */ + spin_lock(&pool->lock); + l = &unbuddied[i]; + if (unlikely(zhdr != list_first_entry(READ_ONCE(l), + struct z3fold_header, buddy)) || + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + + /* + * this page could not be removed from its unbuddied + * list while pool lock was held, and then we've taken + * page lock so kref_put could not be called before + * we got here, so it's safe to just call kref_get() + */ + kref_get(&zhdr->refcount); + break; + } + put_cpu_ptr(pool->unbuddied); + + if (!zhdr) { + int cpu; + + /* look for _exact_ match on other cpus' lists */ + for_each_online_cpu(cpu) { + struct list_head *l; + + unbuddied = per_cpu_ptr(pool->unbuddied, cpu); + spin_lock(&pool->lock); + l = &unbuddied[chunks]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), + struct z3fold_header, buddy); + + if (!zhdr || !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; + continue; + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + if (can_sleep) + cond_resched(); + continue; + } + kref_get(&zhdr->refcount); + break; + } + } + + return zhdr; +} /* * API Functions @@ -476,6 +759,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool = kzalloc(sizeof(struct z3fold_pool), gfp); if (!pool) goto out; + pool->c_handle = kmem_cache_create("z3fold_handle", + sizeof(struct z3fold_buddy_slots), + SLOTS_ALIGN, 0, NULL); + if (!pool->c_handle) + goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); @@ -497,15 +785,21 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; + if (z3fold_register_migration(pool)) + goto out_rwq; INIT_WORK(&pool->work, free_pages_work); pool->ops = ops; return pool; +out_rwq: + destroy_workqueue(pool->release_wq); out_wq: destroy_workqueue(pool->compact_wq); out_unbuddied: free_percpu(pool->unbuddied); out_pool: + kmem_cache_destroy(pool->c_handle); +out_c: kfree(pool); out: return NULL; @@ -519,6 +813,8 @@ out: */ static void z3fold_destroy_pool(struct z3fold_pool *pool) { + kmem_cache_destroy(pool->c_handle); + z3fold_unregister_migration(pool); destroy_workqueue(pool->release_wq); destroy_workqueue(pool->compact_wq); kfree(pool); @@ -546,7 +842,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { - int chunks = 0, i, freechunks; + int chunks = size_to_chunks(size); struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; @@ -561,56 +857,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { - struct list_head *unbuddied; - chunks = size_to_chunks(size); - -lookup: - /* First, try to find an unbuddied z3fold page. */ - unbuddied = get_cpu_ptr(pool->unbuddied); - for_each_unbuddied_list(i, chunks) { - struct list_head *l = &unbuddied[i]; - - zhdr = list_first_entry_or_null(READ_ONCE(l), - struct z3fold_header, buddy); - - if (!zhdr) - continue; - - /* Re-check under lock. */ - spin_lock(&pool->lock); - l = &unbuddied[i]; - if (unlikely(zhdr != list_first_entry(READ_ONCE(l), - struct z3fold_header, buddy)) || - !z3fold_page_trylock(zhdr)) { - spin_unlock(&pool->lock); - put_cpu_ptr(pool->unbuddied); - goto lookup; - } - list_del_init(&zhdr->buddy); - zhdr->cpu = -1; - spin_unlock(&pool->lock); - - page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private)) { - z3fold_page_unlock(zhdr); - zhdr = NULL; - put_cpu_ptr(pool->unbuddied); - if (can_sleep) - cond_resched(); - goto lookup; - } - - /* - * this page could not be removed from its unbuddied - * list while pool lock was held, and then we've taken - * page lock so kref_put could not be called before - * we got here, so it's safe to just call kref_get() - */ - kref_get(&zhdr->refcount); - break; - } - put_cpu_ptr(pool->unbuddied); - +retry: + zhdr = __z3fold_alloc(pool, size, can_sleep); if (zhdr) { if (zhdr->first_chunks == 0) { if (zhdr->middle_chunks != 0 && @@ -630,8 +878,9 @@ lookup: z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); - goto lookup; + goto retry; } + page = virt_to_page(zhdr); goto found; } bud = FIRST; @@ -662,13 +911,18 @@ lookup: if (!page) return -ENOMEM; - atomic64_inc(&pool->pages_nr); zhdr = init_z3fold_page(page, pool); + if (!zhdr) { + __free_page(page); + return -ENOMEM; + } + atomic64_inc(&pool->pages_nr); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); goto headless; } + __SetPageMovable(page, pool->inode->i_mapping); z3fold_page_lock(zhdr); found: @@ -680,19 +934,7 @@ found: zhdr->middle_chunks = chunks; zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } - - if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || - zhdr->middle_chunks == 0) { - struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); - - /* Add to unbuddied list */ - freechunks = num_free_chunks(zhdr); - spin_lock(&pool->lock); - list_add(&zhdr->buddy, &unbuddied[freechunks]); - spin_unlock(&pool->lock); - zhdr->cpu = smp_processor_id(); - put_cpu_ptr(pool->unbuddied); - } + add_to_unbuddied(pool, zhdr); headless: spin_lock(&pool->lock); @@ -739,7 +981,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_lock(&pool->lock); list_del(&page->lru); spin_unlock(&pool->lock); - free_z3fold_page(page); + free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); } return; @@ -766,6 +1008,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) return; } + free_handle(handle); if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { atomic64_dec(&pool->pages_nr); return; @@ -774,7 +1017,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) z3fold_page_unlock(zhdr); return; } - if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { + if (unlikely(PageIsolated(page)) || + test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); return; } @@ -855,10 +1099,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) if (test_and_set_bit(PAGE_CLAIMED, &page->private)) continue; - zhdr = page_address(page); + if (unlikely(PageIsolated(page))) + continue; if (test_bit(PAGE_HEADLESS, &page->private)) break; + zhdr = page_address(page); if (!z3fold_page_trylock(zhdr)) { zhdr = NULL; continue; /* can't evict at this point */ @@ -919,7 +1165,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) next: if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { - free_z3fold_page(page); + free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); return 0; } @@ -996,6 +1242,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) break; } + if (addr) + zhdr->mapped_count++; z3fold_page_unlock(zhdr); out: return addr; @@ -1022,6 +1270,7 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) buddy = handle_to_buddy(handle); if (buddy == MIDDLE) clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + zhdr->mapped_count--; z3fold_page_unlock(zhdr); } @@ -1036,6 +1285,128 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) return atomic64_read(&pool->pages_nr); } +static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) +{ + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + if (test_bit(PAGE_HEADLESS, &page->private)) + return false; + + zhdr = page_address(page); + z3fold_page_lock(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_STALE, &page->private)) + goto out; + + pool = zhdr_to_pool(zhdr); + + if (zhdr->mapped_count == 0) { + kref_get(&zhdr->refcount); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + spin_lock(&pool->lock); + if (!list_empty(&page->lru)) + list_del(&page->lru); + spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); + return true; + } +out: + z3fold_page_unlock(zhdr); + return false; +} + +static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct z3fold_header *zhdr, *new_zhdr; + struct z3fold_pool *pool; + struct address_space *new_mapping; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zhdr = page_address(page); + pool = zhdr_to_pool(zhdr); + + if (!trylock_page(page)) + return -EAGAIN; + + if (!z3fold_page_trylock(zhdr)) { + unlock_page(page); + return -EAGAIN; + } + if (zhdr->mapped_count != 0) { + z3fold_page_unlock(zhdr); + unlock_page(page); + return -EBUSY; + } + new_zhdr = page_address(newpage); + memcpy(new_zhdr, zhdr, PAGE_SIZE); + newpage->private = page->private; + page->private = 0; + z3fold_page_unlock(zhdr); + spin_lock_init(&new_zhdr->page_lock); + new_mapping = page_mapping(page); + __ClearPageMovable(page); + ClearPagePrivate(page); + + get_page(newpage); + z3fold_page_lock(new_zhdr); + if (new_zhdr->first_chunks) + encode_handle(new_zhdr, FIRST); + if (new_zhdr->last_chunks) + encode_handle(new_zhdr, LAST); + if (new_zhdr->middle_chunks) + encode_handle(new_zhdr, MIDDLE); + set_bit(NEEDS_COMPACTING, &newpage->private); + new_zhdr->cpu = smp_processor_id(); + spin_lock(&pool->lock); + list_add(&newpage->lru, &pool->lru); + spin_unlock(&pool->lock); + __SetPageMovable(newpage, new_mapping); + z3fold_page_unlock(new_zhdr); + + queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); + + page_mapcount_reset(page); + unlock_page(page); + put_page(page); + return 0; +} + +static void z3fold_page_putback(struct page *page) +{ + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + + zhdr = page_address(page); + pool = zhdr_to_pool(zhdr); + + z3fold_page_lock(zhdr); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + INIT_LIST_HEAD(&page->lru); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); + return; + } + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); +} + +static const struct address_space_operations z3fold_aops = { + .isolate_page = z3fold_page_isolate, + .migratepage = z3fold_page_migrate, + .putback_page = z3fold_page_putback, +}; + /***************** * zpool ****************/ @@ -1133,8 +1504,14 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { + int ret; + /* Make sure the z3fold header is not larger than the page size */ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE); + ret = z3fold_mount(); + if (ret) + return ret; + zpool_register_driver(&z3fold_zpool_driver); return 0; @@ -1142,6 +1519,7 @@ static int __init init_z3fold(void) static void __exit exit_z3fold(void) { + z3fold_unmount(); zpool_unregister_driver(&z3fold_zpool_driver); } |