author     Jiri Kosina <jkosina@suse.cz>   2018-06-08 10:20:42 +0200
committer  Jiri Kosina <jkosina@suse.cz>   2018-06-08 10:20:42 +0200
commit     c1144d29f405ce1f4e6ede6482beb3d0d09750c6 (patch)
tree       0f9fe36a50005bae6ffe28a4f978e71273f5b1d1 /mm
parent     HID: core: fix hid_hw_open() comment (diff)
parent     HID: alps: Fix some style in 't4_read_write_register()' (diff)
Merge branch 'for-4.18/alps' into for-linus
hid-alps driver cleanups wrt. t4_read_write_register() handling
from Christophe Jaillet
Diffstat (limited to 'mm')
55 files changed, 2032 insertions(+), 1411 deletions(-)
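Most of the filemap.c, huge_memory.c, and khugepaged.c hunks below are one mechanical conversion: the page cache's dedicated mapping->tree_lock is replaced by the lock embedded in mapping->i_pages, taken through the xa_lock_*() helpers. A minimal sketch of the pattern, adapted from the delete_from_page_cache() hunk (illustrative only, not verbatim from the commit):

	/* Old: the page-cache radix tree was guarded by tree_lock. */
	spin_lock_irqsave(&mapping->tree_lock, flags);
	__delete_from_page_cache(page, NULL);
	spin_unlock_irqrestore(&mapping->tree_lock, flags);

	/*
	 * New: take the lock embedded in mapping->i_pages instead.
	 * xa_lock_irqsave() expands to spin_lock_irqsave() on the
	 * xa_lock spinlock inside i_pages, so the locking behaviour
	 * is unchanged; only the lock's home moves.
	 */
	xa_lock_irqsave(&mapping->i_pages, flags);
	__delete_from_page_cache(page, NULL);
	xa_unlock_irqrestore(&mapping->i_pages, flags);

Lookups performed under the lock switch correspondingly from passing &mapping->tree_lock to radix_tree_deref_slot_protected() to passing &mapping->i_pages.xa_lock, as the page_cache_tree_insert() hunk shows.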
diff --git a/mm/Makefile b/mm/Makefile
index e669f02c5a54..b4e54a9ae9c5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@
 obj-y := filemap.o mempool.o oom_kill.o \
 			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
-			   compaction.o vmacache.o swap_slots.o \
+			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
 			   debug.o $(mmu-y)
@@ -55,7 +55,7 @@ ifdef CONFIG_MMU
 endif
 obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
-obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o swap_slots.o
 obj-$(CONFIG_FRONTSWAP)	+= frontswap.o
 obj-$(CONFIG_ZSWAP)	+= zswap.o
 obj-$(CONFIG_HAS_DMA)	+= dmapool.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b5f940ce0143..023190c69dce 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -100,18 +100,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	return 0;
 }
-
-static int bdi_debug_stats_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, bdi_debug_stats_show, inode->i_private);
-}
-
-static const struct file_operations bdi_debug_stats_fops = {
-	.open		= bdi_debug_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
 
 static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
 {
@@ -745,7 +734,6 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
  */
 void wb_memcg_offline(struct mem_cgroup *memcg)
 {
-	LIST_HEAD(to_destroy);
 	struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
 	struct bdi_writeback *wb, *next;
 
@@ -764,7 +752,6 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
  */
 void wb_blkcg_offline(struct blkcg *blkcg)
 {
-	LIST_HEAD(to_destroy);
 	struct bdi_writeback *wb, *next;
 
 	spin_lock_irq(&cgwb_lock);
@@ -1033,23 +1020,18 @@ EXPORT_SYMBOL(congestion_wait);
 
 /**
  * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
- * @pgdat: A pgdat to check if it is heavily congested
  * @sync: SYNC or ASYNC IO
  * @timeout: timeout in jiffies
  *
- * In the event of a congested backing_dev (any backing_dev) and the given
- * @pgdat has experienced recent congestion, this waits for up to @timeout
- * jiffies for either a BDI to exit congestion of the given @sync queue
- * or a write to complete.
- *
- * In the absence of pgdat congestion, cond_resched() is called to yield
- * the processor if necessary but otherwise does not sleep.
+ * In the event of a congested backing_dev (any backing_dev) this waits
+ * for up to @timeout jiffies for either a BDI to exit congestion of the
+ * given @sync queue or a write to complete.
  *
  * The return value is 0 if the sleep is for the full timeout. Otherwise,
  * it is the number of jiffies that were still remaining when the function
  * returned. return_value == timeout implies the function did not sleep.
  */
-long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
+long wait_iff_congested(int sync, long timeout)
 {
 	long ret;
 	unsigned long start = jiffies;
@@ -1057,12 +1039,10 @@ long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout)
 	wait_queue_head_t *wqh = &congestion_wqh[sync];
 
 	/*
-	 * If there is no congestion, or heavy congestion is not being
-	 * encountered in the current pgdat, yield if necessary instead
+	 * If there is no congestion, yield if necessary instead
 	 * of sleeping on the congestion queue
 	 */
-	if (atomic_read(&nr_wb_congested[sync]) == 0 ||
-	    !test_bit(PGDAT_CONGESTED, &pgdat->flags)) {
+	if (atomic_read(&nr_wb_congested[sync]) == 0) {
 		cond_resched();
 
 		/* In case we scheduled, work out time remaining */
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,9 +35,11 @@
 #include <linux/cma.h>
 #include <linux/highmem.h>
 #include <linux/io.h>
+#include <linux/kmemleak.h>
 #include <trace/events/cma.h>
 
 #include "cma.h"
+#include "internal.h"
 
 struct cma cma_areas[MAX_CMA_AREAS];
 unsigned cma_area_count;
@@ -108,23 +110,25 @@ static int __init cma_activate_area(struct cma *cma)
 	if (!cma->bitmap)
 		return -ENOMEM;
 
-	WARN_ON_ONCE(!pfn_valid(pfn));
-	zone = page_zone(pfn_to_page(pfn));
-
 	do {
 		unsigned j;
 
 		base_pfn = pfn;
+		if (!pfn_valid(base_pfn))
+			goto err;
+
+		zone = page_zone(pfn_to_page(base_pfn));
 		for (j = pageblock_nr_pages; j; --j, pfn++) {
-			WARN_ON_ONCE(!pfn_valid(pfn));
+			if (!pfn_valid(pfn))
+				goto err;
+
 			/*
-			 * alloc_contig_range requires the pfn range
-			 * specified to be in the same zone. Make this
-			 * simple by forcing the entire CMA resv range
-			 * to be in the same zone.
+			 * In init_cma_reserved_pageblock(), present_pages
+			 * is adjusted with assumption that all pages in
+			 * the pageblock come from a single zone.
			 */
 			if (page_zone(pfn_to_page(pfn)) != zone)
-				goto not_in_zone;
+				goto err;
 		}
 		init_cma_reserved_pageblock(pfn_to_page(base_pfn));
 	} while (--i);
@@ -138,7 +142,7 @@ static int __init cma_activate_area(struct cma *cma)
 
 	return 0;
 
-not_in_zone:
+err:
 	pr_err("CMA area %s could not be activated\n", cma->name);
 	kfree(cma->bitmap);
 	cma->count = 0;
@@ -148,6 +152,41 @@ not_in_zone:
 static int __init cma_init_reserved_areas(void)
 {
 	int i;
+	struct zone *zone;
+	pg_data_t *pgdat;
+
+	if (!cma_area_count)
+		return 0;
+
+	for_each_online_pgdat(pgdat) {
+		unsigned long start_pfn = UINT_MAX, end_pfn = 0;
+
+		zone = &pgdat->node_zones[ZONE_MOVABLE];
+
+		/*
+		 * In this case, we cannot adjust the zone range
+		 * since it is now maximum node span and we don't
+		 * know original zone range.
+		 */
+		if (populated_zone(zone))
+			continue;
+
+		for (i = 0; i < cma_area_count; i++) {
+			if (pfn_to_nid(cma_areas[i].base_pfn) !=
+			    pgdat->node_id)
+				continue;
+
+			start_pfn = min(start_pfn, cma_areas[i].base_pfn);
+			end_pfn = max(end_pfn, cma_areas[i].base_pfn +
+					       cma_areas[i].count);
+		}
+
+		if (!end_pfn)
+			continue;
+
+		zone->zone_start_pfn = start_pfn;
+		zone->spanned_pages = end_pfn - start_pfn;
+	}
 
 	for (i = 0; i < cma_area_count; i++) {
 		int ret = cma_activate_area(&cma_areas[i]);
@@ -156,15 +195,41 @@ static int __init cma_init_reserved_areas(void)
 			return ret;
 	}
 
+	/*
+	 * Reserved pages for ZONE_MOVABLE are now activated and
+	 * this would change ZONE_MOVABLE's managed page counter and
+	 * the other zones' present counter. We need to re-calculate
+	 * various zone information that depends on this initialization.
+ */ + build_all_zonelists(NULL); + for_each_populated_zone(zone) { + if (zone_idx(zone) == ZONE_MOVABLE) { + zone_pcp_reset(zone); + setup_zone_pageset(zone); + } else + zone_pcp_update(zone); + + set_zone_contiguous(zone); + } + + /* + * We need to re-init per zone wmark by calling + * init_per_zone_wmark_min() but doesn't call here because it is + * registered on core_initcall and it will be called later than us. + */ + return 0; } -core_initcall(cma_init_reserved_areas); +pure_initcall(cma_init_reserved_areas); /** * cma_init_reserved_mem() - create custom contiguous area from reserved memory * @base: Base address of the reserved area * @size: Size of the reserved area (in bytes), * @order_per_bit: Order of pages represented by one bit on bitmap. + * @name: The name of the area. If this parameter is NULL, the name of + * the area will be set to "cmaN", where N is a running counter of + * used areas. * @res_cma: Pointer to store the created cma region. * * This function creates custom contiguous area from already reserved memory. @@ -227,6 +292,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * @alignment: Alignment for the CMA area, should be power of 2 or zero * @order_per_bit: Order of pages represented by one bit on bitmap. * @fixed: hint about where to place the reserved area + * @name: The name of the area. See function cma_init_reserved_mem() * @res_cma: Pointer to store the created cma region. * * This function reserves memory from early allocator. It should be @@ -390,6 +456,7 @@ static inline void cma_debug_show_areas(struct cma *cma) { } * @cma: Contiguous memory region for which the allocation is performed. * @count: Requested number of pages. * @align: Requested alignment of pages (in PAGE_SIZE order). + * @gfp_mask: GFP mask to use during compaction * * This function allocates part of contiguous memory on specific * contiguous memory area. diff --git a/mm/compaction.c b/mm/compaction.c index 2c8999d027ab..028b7210a669 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -576,6 +576,7 @@ isolate_fail: /** * isolate_freepages_range() - isolate free pages. + * @cc: Compaction control structure. * @start_pfn: The first PFN to start isolating. * @end_pfn: The one-past-last PFN. * @@ -1165,8 +1166,7 @@ static void isolate_freepages(struct compact_control *cc) * from the isolated freelists in the block we are migrating to. */ static struct page *compaction_alloc(struct page *migratepage, - unsigned long data, - int **result) + unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct page *freepage; @@ -1450,14 +1450,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * if compaction succeeds. * For costly orders, we require low watermark instead of min for * compaction to proceed to increase its chances. - * ALLOC_CMA is used, as pages in CMA pageblocks are considered - * suitable migration targets */ watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ? 
low_wmark_pages(zone) : min_wmark_pages(zone); watermark += compact_gap(order); if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, - ALLOC_CMA, wmark_target)) + 0, wmark_target)) return COMPACT_SKIPPED; return COMPACT_CONTINUE; @@ -1988,6 +1986,14 @@ static void kcompactd_do_work(pg_data_t *pgdat) compaction_defer_reset(zone, cc.order, false); } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* + * Buddy pages may become stranded on pcps that could + * otherwise coalesce on the zone's free area for + * order >= cc.order. This is ratelimited by the + * upcoming deferral. + */ + drain_all_pages(zone); + + /* * We use sync migration mode here, so we defer like * sync direct compaction does. */ diff --git a/mm/failslab.c b/mm/failslab.c index 8087d976a809..1f2f248e3601 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -14,7 +14,7 @@ static struct { .cache_filter = false, }; -bool should_failslab(struct kmem_cache *s, gfp_t gfpflags) +bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) { /* No fault-injection for bootstrap cache */ if (unlikely(s == kmem_cache)) diff --git a/mm/filemap.c b/mm/filemap.c index 693f62212a59..9276bdb2343c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -66,7 +66,7 @@ * ->i_mmap_rwsem (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) - * ->mapping->tree_lock + * ->i_pages lock * * ->i_mutex * ->i_mmap_rwsem (truncate->unmap_mapping_range) @@ -74,7 +74,7 @@ * ->mmap_sem * ->i_mmap_rwsem * ->page_table_lock or pte_lock (various, mainly in memory.c) - * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) + * ->i_pages lock (arch-dependent flush_dcache_mmap_lock) * * ->mmap_sem * ->lock_page (access_process_vm) @@ -84,7 +84,7 @@ * * bdi->wb.list_lock * sb_lock (fs/fs-writeback.c) - * ->mapping->tree_lock (__sync_single_inode) + * ->i_pages lock (__sync_single_inode) * * ->i_mmap_rwsem * ->anon_vma.lock (vma_adjust) @@ -95,11 +95,11 @@ * ->page_table_lock or pte_lock * ->swap_lock (try_to_unmap_one) * ->private_lock (try_to_unmap_one) - * ->tree_lock (try_to_unmap_one) + * ->i_pages lock (try_to_unmap_one) * ->zone_lru_lock(zone) (follow_page->mark_page_accessed) * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page) * ->private_lock (page_remove_rmap->set_page_dirty) - * ->tree_lock (page_remove_rmap->set_page_dirty) + * ->i_pages lock (page_remove_rmap->set_page_dirty) * bdi.wb->list_lock (page_remove_rmap->set_page_dirty) * ->inode->i_lock (page_remove_rmap->set_page_dirty) * ->memcg->move_lock (page_remove_rmap->lock_page_memcg) @@ -118,14 +118,15 @@ static int page_cache_tree_insert(struct address_space *mapping, void **slot; int error; - error = __radix_tree_create(&mapping->page_tree, page->index, 0, + error = __radix_tree_create(&mapping->i_pages, page->index, 0, &node, &slot); if (error) return error; if (*slot) { void *p; - p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + p = radix_tree_deref_slot_protected(slot, + &mapping->i_pages.xa_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; @@ -133,7 +134,7 @@ static int page_cache_tree_insert(struct address_space *mapping, if (shadowp) *shadowp = p; } - __radix_tree_replace(&mapping->page_tree, node, slot, page, + __radix_tree_replace(&mapping->i_pages, node, slot, page, workingset_lookup_update(mapping)); mapping->nrpages++; return 0; @@ -155,13 +156,13 @@ static void page_cache_tree_delete(struct address_space *mapping, struct radix_tree_node *node; void 
**slot; - __radix_tree_lookup(&mapping->page_tree, page->index + i, + __radix_tree_lookup(&mapping->i_pages, page->index + i, &node, &slot); VM_BUG_ON_PAGE(!node && nr != 1, page); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + radix_tree_clear_tags(&mapping->i_pages, node, slot); + __radix_tree_replace(&mapping->i_pages, node, slot, shadow, workingset_lookup_update(mapping)); } @@ -253,7 +254,7 @@ static void unaccount_page_cache_page(struct address_space *mapping, /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. + * is safe. The caller must hold the i_pages lock. */ void __delete_from_page_cache(struct page *page, void *shadow) { @@ -296,9 +297,9 @@ void delete_from_page_cache(struct page *page) unsigned long flags; BUG_ON(!PageLocked(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); page_cache_free_page(mapping, page); } @@ -309,14 +310,14 @@ EXPORT_SYMBOL(delete_from_page_cache); * @mapping: the mapping to which pages belong * @pvec: pagevec with pages to delete * - * The function walks over mapping->page_tree and removes pages passed in @pvec - * from the radix tree. The function expects @pvec to be sorted by page index. - * It tolerates holes in @pvec (radix tree entries at those indices are not + * The function walks over mapping->i_pages and removes pages passed in @pvec + * from the mapping. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the radix - * tree as well. + * @pvec and takes care to delete all corresponding tail pages from the + * mapping as well. * - * The function expects mapping->tree_lock to be held. + * The function expects the i_pages lock to be held. 
*/ static void page_cache_tree_delete_batch(struct address_space *mapping, @@ -330,11 +331,11 @@ page_cache_tree_delete_batch(struct address_space *mapping, pgoff_t start; start = pvec->pages[0]->index; - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (i >= pagevec_count(pvec) && !tail_pages) break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page)) continue; if (!tail_pages) { @@ -357,8 +358,8 @@ page_cache_tree_delete_batch(struct address_space *mapping, } else { tail_pages--; } - radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); - __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + radix_tree_clear_tags(&mapping->i_pages, iter.node, slot); + __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL, workingset_lookup_update(mapping)); total_pages++; } @@ -374,14 +375,14 @@ void delete_from_page_cache_batch(struct address_space *mapping, if (!pagevec_count(pvec)) return; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) { trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); unaccount_page_cache_page(mapping, pvec->pages[i]); } page_cache_tree_delete_batch(mapping, pvec); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); for (i = 0; i < pagevec_count(pvec); i++) page_cache_free_page(mapping, pvec->pages[i]); @@ -798,7 +799,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->mapping = mapping; new->index = offset; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); __delete_from_page_cache(old, NULL); error = page_cache_tree_insert(mapping, new, NULL); BUG_ON(error); @@ -810,7 +811,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) __inc_node_page_state(new, NR_FILE_PAGES); if (PageSwapBacked(new)) __inc_node_page_state(new, NR_SHMEM); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); mem_cgroup_migrate(old, new); radix_tree_preload_end(); if (freepage) @@ -852,7 +853,7 @@ static int __add_to_page_cache_locked(struct page *page, page->mapping = mapping; page->index = offset; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) @@ -861,7 +862,7 @@ static int __add_to_page_cache_locked(struct page *page, /* hugetlb pages do not participate in page cache accounting. 
*/ if (!huge) __inc_node_page_state(page, NR_FILE_PAGES); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_commit_charge(page, memcg, false, false); trace_mm_filemap_add_to_page_cache(page); @@ -869,7 +870,7 @@ static int __add_to_page_cache_locked(struct page *page, err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (!huge) mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -1353,7 +1354,7 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index++; @@ -1394,7 +1395,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, for (i = 0; i < max_scan; i++) { struct page *page; - page = radix_tree_lookup(&mapping->page_tree, index); + page = radix_tree_lookup(&mapping->i_pages, index); if (!page || radix_tree_exceptional_entry(page)) break; index--; @@ -1427,7 +1428,7 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) rcu_read_lock(); repeat: page = NULL; - pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); + pagep = radix_tree_lookup_slot(&mapping->i_pages, offset); if (pagep) { page = radix_tree_deref_slot(pagep); if (unlikely(!page)) @@ -1633,7 +1634,7 @@ unsigned find_get_entries(struct address_space *mapping, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1710,7 +1711,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, return 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, *start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) { struct page *head, *page; if (iter.index > end) @@ -1795,7 +1796,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, return 0; rcu_read_lock(); - radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { + radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -1875,8 +1876,7 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, *index, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) { struct page *head, *page; if (iter.index > end) @@ -1969,8 +1969,7 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, return 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, - &iter, start, tag) { + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) { struct page *head, *page; repeat: page = radix_tree_deref_slot(slot); @@ -2624,8 +2623,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct page *head, *page; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start_pgoff) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) { if (iter.index > end_pgoff) break; repeat: @@ -2721,7 +2719,6 @@ out: sb_end_pagefault(inode->i_sb); return ret; } 
-EXPORT_SYMBOL(filemap_page_mkwrite); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, @@ -2752,6 +2749,10 @@ int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } #else +int filemap_page_mkwrite(struct vm_fault *vmf) +{ + return -ENOSYS; +} int generic_file_mmap(struct file * file, struct vm_area_struct * vma) { return -ENOSYS; @@ -2762,6 +2763,7 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) } #endif /* CONFIG_MMU */ +EXPORT_SYMBOL(filemap_page_mkwrite); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); @@ -531,7 +531,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, * reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags |= FOLL_COW; + *flags |= FOLL_COW; return 0; } @@ -1638,7 +1638,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, PMD_SHIFT, next, write, pages, nr)) return 0; } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; + return 0; } while (pmdp++, addr = next, addr != end); return 1; @@ -1740,7 +1740,9 @@ bool gup_fast_permitted(unsigned long start, int nr_pages, int write) /* * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to - * the regular GUP. It will only return non-negative values. + * the regular GUP. + * Note a difference with get_user_pages_fast: this always returns the + * number of pages pinned, 0 if no pages were pinned. */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -1806,9 +1808,12 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, len = (unsigned long) nr_pages << PAGE_SHIFT; end = start + len; + if (nr_pages <= 0) + return 0; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, (void __user *)start, len))) - return 0; + return -EFAULT; if (gup_fast_permitted(start, nr_pages, write)) { local_irq_disable(); diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 5c8e2abeaa15..0f44759486e2 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -23,7 +23,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd, struct page **pages; nr_pages = gup->size / PAGE_SIZE; - pages = kvmalloc(sizeof(void *) * nr_pages, GFP_KERNEL); + pages = kvzalloc(sizeof(void *) * nr_pages, GFP_KERNEL); if (!pages) return -ENOMEM; @@ -41,6 +41,8 @@ static int __gup_benchmark_ioctl(unsigned int cmd, } nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i); + if (nr <= 0) + break; i += nr; } end_time = ktime_get(); @@ -160,6 +160,32 @@ static void hmm_invalidate_range(struct hmm *hmm, up_read(&hmm->mirrors_sem); } +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct hmm_mirror *mirror; + struct hmm *hmm = mm->hmm; + + down_write(&hmm->mirrors_sem); + mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, + list); + while (mirror) { + list_del_init(&mirror->list); + if (mirror->ops->release) { + /* + * Drop mirrors_sem so callback can wait on any pending + * work that might itself trigger mmu_notifier callback + * and thus would deadlock with us. 
+ */ + up_write(&hmm->mirrors_sem); + mirror->ops->release(mirror); + down_write(&hmm->mirrors_sem); + } + mirror = list_first_entry_or_null(&hmm->mirrors, + struct hmm_mirror, list); + } + up_write(&hmm->mirrors_sem); +} + static void hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, @@ -185,6 +211,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { + .release = hmm_release, .invalidate_range_start = hmm_invalidate_range_start, .invalidate_range_end = hmm_invalidate_range_end, }; @@ -206,13 +233,24 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; +again: mirror->hmm = hmm_register(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); + if (mirror->hmm->mm == NULL) { + /* + * A racing hmm_mirror_unregister() is about to destroy the hmm + * struct. Try again to allocate a new one. + */ + up_write(&mirror->hmm->mirrors_sem); + mirror->hmm = NULL; + goto again; + } else { + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); + } return 0; } @@ -227,11 +265,32 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = mirror->hmm; + bool should_unregister = false; + struct mm_struct *mm; + struct hmm *hmm; + if (mirror->hmm == NULL) + return; + + hmm = mirror->hmm; down_write(&hmm->mirrors_sem); - list_del(&mirror->list); + list_del_init(&mirror->list); + should_unregister = list_empty(&hmm->mirrors); + mirror->hmm = NULL; + mm = hmm->mm; + hmm->mm = NULL; up_write(&hmm->mirrors_sem); + + if (!should_unregister || mm == NULL) + return; + + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); + + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + kfree(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -240,110 +299,275 @@ struct hmm_vma_walk { unsigned long last; bool fault; bool block; - bool write; }; -static int hmm_vma_do_fault(struct mm_walk *walk, - unsigned long addr, - hmm_pfn_t *pfn) +static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, + bool write_fault, uint64_t *pfn) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; int r; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0; + flags |= write_fault ? 
FAULT_FLAG_WRITE : 0; r = handle_mm_fault(vma, addr, flags); if (r & VM_FAULT_RETRY) return -EBUSY; if (r & VM_FAULT_ERROR) { - *pfn = HMM_PFN_ERROR; + *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } return -EAGAIN; } -static void hmm_pfns_special(hmm_pfn_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = HMM_PFN_SPECIAL; -} - static int hmm_pfns_bad(unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct hmm_range *range = walk->private; - hmm_pfn_t *pfns = range->pfns; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = HMM_PFN_ERROR; + pfns[i] = range->values[HMM_PFN_ERROR]; return 0; } -static void hmm_pfns_clear(hmm_pfn_t *pfns, - unsigned long addr, - unsigned long end) -{ - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = 0; -} - -static int hmm_vma_walk_hole(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +/* + * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) + * @start: range virtual start address (inclusive) + * @end: range virtual end address (exclusive) + * @fault: should we fault or not ? + * @write_fault: write fault ? + * @walk: mm_walk structure + * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * + * This function will be called whenever pmd_none() or pte_none() returns true, + * or whenever there is no page directory covering the virtual address range. + */ +static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, + bool fault, bool write_fault, + struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long i; hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) { + pfns[i] = range->values[HMM_PFN_NONE]; + if (fault || write_fault) { int ret; - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + ret = hmm_vma_do_fault(walk, addr, write_fault, + &pfns[i]); if (ret != -EAGAIN) return ret; } } - return hmm_vma_walk->fault ? -EAGAIN : 0; + return (fault || write_fault) ? -EAGAIN : 0; } -static int hmm_vma_walk_clear(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags, + bool *fault, bool *write_fault) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - hmm_pfn_t *pfns = range->pfns; + + *fault = *write_fault = false; + if (!hmm_vma_walk->fault) + return; + + /* We aren't ask to do anything ... */ + if (!(pfns & range->flags[HMM_PFN_VALID])) + return; + /* If this is device memory than only fault if explicitly requested */ + if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { + /* Do we fault on device memory ? */ + if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { + *write_fault = pfns & range->flags[HMM_PFN_WRITE]; + *fault = true; + } + return; + } + + /* If CPU page table is not valid then we need to fault */ + *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); + /* Need to write fault ? 
*/ + if ((pfns & range->flags[HMM_PFN_WRITE]) && + !(cpu_flags & range->flags[HMM_PFN_WRITE])) { + *write_fault = true; + *fault = true; + } +} + +static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags, bool *fault, + bool *write_fault) +{ unsigned long i; - hmm_vma_walk->last = addr; + if (!hmm_vma_walk->fault) { + *fault = *write_fault = false; + return; + } + + for (i = 0; i < npages; ++i) { + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, + fault, write_fault); + if ((*fault) || (*write_fault)) + return; + } +} + +static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + bool fault, write_fault; + unsigned long i, npages; + uint64_t *pfns; + i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { - pfns[i] = 0; - if (hmm_vma_walk->fault) { - int ret; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); +} - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); - if (ret != -EAGAIN) - return ret; +static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) +{ + if (pmd_protnone(pmd)) + return 0; + return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_handle_pmd(struct mm_walk *walk, + unsigned long addr, + unsigned long end, + uint64_t *pfns, + pmd_t pmd) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long pfn, npages, i; + bool fault, write_fault; + uint64_t cpu_flags; + + npages = (end - addr) >> PAGE_SHIFT; + cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, + &fault, &write_fault); + + if (pmd_protnone(pmd) || fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + + pfn = pmd_pfn(pmd) + pte_index(addr); + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + hmm_vma_walk->last = end; + return 0; +} + +static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) +{ + if (pte_none(pte) || !pte_present(pte)) + return 0; + return pte_write(pte) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + +static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, + unsigned long end, pmd_t *pmdp, pte_t *ptep, + uint64_t *pfn) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + bool fault, write_fault; + uint64_t cpu_flags; + pte_t pte = *ptep; + uint64_t orig_pfn = *pfn; + + *pfn = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + + if (pte_none(pte)) { + if (fault || write_fault) + goto fault; + return 0; + } + + if (!pte_present(pte)) { + swp_entry_t entry = pte_to_swp_entry(pte); + + if (!non_swap_entry(entry)) { + if (fault || write_fault) + goto fault; + return 0; } + + /* + * This is a special swap entry, ignore migration, use + * device and report anything else as error. 
+ */ + if (is_device_private_entry(entry)) { + cpu_flags = range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_DEVICE_PRIVATE]; + cpu_flags |= is_write_device_private_entry(entry) ? + range->flags[HMM_PFN_WRITE] : 0; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) + goto fault; + *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn |= cpu_flags; + return 0; + } + + if (is_migration_entry(entry)) { + if (fault || write_fault) { + pte_unmap(ptep); + hmm_vma_walk->last = addr; + migration_entry_wait(vma->vm_mm, + pmdp, addr); + return -EAGAIN; + } + return 0; + } + + /* Report error for everything else */ + *pfn = range->values[HMM_PFN_ERROR]; + return -EFAULT; } - return hmm_vma_walk->fault ? -EAGAIN : 0; + if (fault || write_fault) + goto fault; + + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; + return 0; + +fault: + pte_unmap(ptep); + /* Fault any virtual address we were asked to fault */ + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -353,26 +577,20 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; - hmm_pfn_t *pfns = range->pfns; + uint64_t *pfns = range->pfns; unsigned long addr = start, i; - bool write_fault; - hmm_pfn_t flag; pte_t *ptep; i = (addr - range->start) >> PAGE_SHIFT; - flag = vma->vm_flags & VM_READ ? HMM_PFN_READ : 0; - write_fault = hmm_vma_walk->fault & hmm_vma_walk->write; again: if (pmd_none(*pmdp)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(*pmdp) && vma->vm_flags & VM_HUGETLB) + if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) return hmm_pfns_bad(start, end, walk); if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { - unsigned long pfn; pmd_t pmd; /* @@ -388,17 +606,8 @@ again: barrier(); if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; - if (pmd_protnone(pmd)) - return hmm_vma_walk_clear(start, end, walk); - if (write_fault && !pmd_write(pmd)) - return hmm_vma_walk_clear(start, end, walk); - - pfn = pmd_pfn(pmd) + pte_index(addr); - flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - pfns[i] = hmm_pfn_t_from_pfn(pfn) | flag; - return 0; + return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); } if (pmd_bad(*pmdp)) @@ -406,79 +615,43 @@ again: ptep = pte_offset_map(pmdp, addr); for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { - pte_t pte = *ptep; - - pfns[i] = 0; + int r; - if (pte_none(pte)) { - pfns[i] = HMM_PFN_EMPTY; - if (hmm_vma_walk->fault) - goto fault; - continue; + r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); + if (r) { + /* hmm_vma_handle_pte() did unmap pte directory */ + hmm_vma_walk->last = addr; + return r; } - - if (!pte_present(pte)) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (!non_swap_entry(entry)) { - if (hmm_vma_walk->fault) - goto fault; - continue; - } - - /* - * This is a special swap entry, ignore migration, use - * device and report anything else as error. 
- */ - if (is_device_private_entry(entry)) { - pfns[i] = hmm_pfn_t_from_pfn(swp_offset(entry)); - if (is_write_device_private_entry(entry)) { - pfns[i] |= HMM_PFN_WRITE; - } else if (write_fault) - goto fault; - pfns[i] |= HMM_PFN_DEVICE_UNADDRESSABLE; - pfns[i] |= flag; - } else if (is_migration_entry(entry)) { - if (hmm_vma_walk->fault) { - pte_unmap(ptep); - hmm_vma_walk->last = addr; - migration_entry_wait(vma->vm_mm, - pmdp, addr); - return -EAGAIN; - } - continue; - } else { - /* Report error for everything else */ - pfns[i] = HMM_PFN_ERROR; - } - continue; - } - - if (write_fault && !pte_write(pte)) - goto fault; - - pfns[i] = hmm_pfn_t_from_pfn(pte_pfn(pte)) | flag; - pfns[i] |= pte_write(pte) ? HMM_PFN_WRITE : 0; - continue; - -fault: - pte_unmap(ptep); - /* Fault all pages in range */ - return hmm_vma_walk_clear(start, end, walk); } pte_unmap(ptep - 1); + hmm_vma_walk->last = addr; return 0; } +static void hmm_pfns_clear(struct hmm_range *range, + uint64_t *pfns, + unsigned long addr, + unsigned long end) +{ + for (; addr < end; addr += PAGE_SIZE, pfns++) + *pfns = range->values[HMM_PFN_NONE]; +} + +static void hmm_pfns_special(struct hmm_range *range) +{ + unsigned long addr = range->start, i = 0; + + for (; addr < range->end; addr += PAGE_SIZE, i++) + range->pfns[i] = range->values[HMM_PFN_SPECIAL]; +} + /* * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @vma: virtual memory area containing the virtual address range - * @range: used to track snapshot validity - * @start: range virtual start address (inclusive) - * @end: range virtual end address (exclusive) - * @entries: array of hmm_pfn_t: provided by the caller, filled in by function - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, 0 success + * @range: range being snapshotted + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * vma permission, 0 success * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further @@ -491,26 +664,17 @@ fault: * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! */ -int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns) +int hmm_vma_get_pfns(struct hmm_range *range) { + struct vm_area_struct *vma = range->vma; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); - return -EINVAL; - } - /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); @@ -520,10 +684,24 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it does + * not allow write access, either. 
Architecture that allow + * write without read access are not supported by HMM, because + * operations such has atomic access would not work. + */ + hmm_pfns_clear(range, range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); @@ -541,14 +719,13 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma, mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; - walk_page_range(start, end, &mm_walk); + walk_page_range(range->start, range->end, &mm_walk); return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); /* * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @vma: virtual memory area containing the virtual address range * @range: range being tracked * Returns: false if range data has been invalidated, true otherwise * @@ -568,10 +745,10 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * * There are two ways to use this : * again: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * trans = device_build_page_table_update_transaction(pfns); * device_page_table_lock(); - * if (!hmm_vma_range_done(vma, range)) { + * if (!hmm_vma_range_done(range)) { * device_page_table_unlock(); * goto again; * } @@ -579,13 +756,13 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); * device_page_table_unlock(); * * Or: - * hmm_vma_get_pfns(vma, range, start, end, pfns); or hmm_vma_fault(...); + * hmm_vma_get_pfns(range); or hmm_vma_fault(...); * device_page_table_lock(); - * hmm_vma_range_done(vma, range); - * device_update_page_table(pfns); + * hmm_vma_range_done(range); + * device_update_page_table(range->pfns); * device_page_table_unlock(); */ -bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) +bool hmm_vma_range_done(struct hmm_range *range) { unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; struct hmm *hmm; @@ -595,7 +772,7 @@ bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range) return false; } - hmm = hmm_register(vma->vm_mm); + hmm = hmm_register(range->vma->vm_mm); if (!hmm) { memset(range->pfns, 0, sizeof(*range->pfns) * npages); return false; @@ -611,36 +788,34 @@ EXPORT_SYMBOL(hmm_vma_range_done); /* * hmm_vma_fault() - try to fault some address in a virtual address range - * @vma: virtual memory area containing the virtual address range - * @range: use to track pfns array content validity - * @start: fault range virtual start address (inclusive) - * @end: fault range virtual end address (exclusive) - * @pfns: array of hmm_pfn_t, only entry with fault flag set will be faulted - * @write: is it a write fault + * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) * * This is similar to a regular CPU page fault except that it will not trigger * any memory migration if the memory being faulted is not accessible by CPUs. * - * On error, for one virtual address in the range, the function will set the - * hmm_pfn_t error flag for the corresponding pfn entry. + * On error, for one virtual address in the range, the function will mark the + * corresponding HMM pfn entry with an error flag. 
* * Expected use pattern: * retry: * down_read(&mm->mmap_sem); * // Find vma and address device wants to fault, initialize hmm_pfn_t * // array accordingly - * ret = hmm_vma_fault(vma, start, end, pfns, allow_retry); + * ret = hmm_vma_fault(range, write, block); * switch (ret) { * case -EAGAIN: - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // You might want to rate limit or yield to play nicely, you may * // also commit any valid pfn in the array assuming that you are * // getting true from hmm_vma_range_monitor_end() * goto retry; * case 0: * break; + * case -ENOMEM: + * case -EINVAL: + * case -EPERM: * default: * // Handle error ! * up_read(&mm->mmap_sem) @@ -648,7 +823,7 @@ EXPORT_SYMBOL(hmm_vma_range_done); * } * // Take device driver lock that serialize device page table update * driver_lock_device_page_table_update(); - * hmm_vma_range_done(vma, range); + * hmm_vma_range_done(range); * // Commit pfns we got from hmm_vma_fault() * driver_unlock_device_page_table_update(); * up_read(&mm->mmap_sem) @@ -658,51 +833,54 @@ EXPORT_SYMBOL(hmm_vma_range_done); * * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block) +int hmm_vma_fault(struct hmm_range *range, bool block) { + struct vm_area_struct *vma = range->vma; + unsigned long start = range->start; struct hmm_vma_walk hmm_vma_walk; struct mm_walk mm_walk; struct hmm *hmm; int ret; /* Sanity check, this really should not happen ! */ - if (start < vma->vm_start || start >= vma->vm_end) + if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; - if (end < vma->vm_start || end > vma->vm_end) + if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; hmm = hmm_register(vma->vm_mm); if (!hmm) { - hmm_pfns_clear(pfns, start, end); + hmm_pfns_clear(range, range->pfns, range->start, range->end); return -ENOMEM; } /* Caller must have registered a mirror using hmm_mirror_register() */ if (!hmm->mmu_notifier.ops) return -EINVAL; + /* FIXME support hugetlb fs */ + if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it does + * not allow write access, either. Architecture that allow + * write without read access are not supported by HMM, because + * operations such has atomic access would not work. 
+ */ + hmm_pfns_clear(range, range->pfns, range->start, range->end); + return -EPERM; + } + /* Initialize range to track CPU page table update */ - range->start = start; - range->pfns = pfns; - range->end = end; spin_lock(&hmm->lock); range->valid = true; list_add_rcu(&range->list, &hmm->ranges); spin_unlock(&hmm->lock); - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { - hmm_pfns_special(pfns, start, end); - return 0; - } - hmm_vma_walk.fault = true; - hmm_vma_walk.write = write; hmm_vma_walk.block = block; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; @@ -717,7 +895,7 @@ int hmm_vma_fault(struct vm_area_struct *vma, mm_walk.pte_hole = hmm_vma_walk_hole; do { - ret = walk_page_range(start, end, &mm_walk); + ret = walk_page_range(start, range->end, &mm_walk); start = hmm_vma_walk.last; } while (ret == -EAGAIN); @@ -725,8 +903,9 @@ int hmm_vma_fault(struct vm_area_struct *vma, unsigned long i; i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(&pfns[i], hmm_vma_walk.last, end); - hmm_vma_range_done(vma, range); + hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, + range->end); + hmm_vma_range_done(range); } return ret; } @@ -845,13 +1024,6 @@ static void hmm_devmem_release(struct device *dev, void *data) hmm_devmem_radix_release(resource); } -static struct hmm_devmem *hmm_devmem_find(resource_size_t phys) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT); -} - static int hmm_devmem_pages_create(struct hmm_devmem *devmem) { resource_size_t key, align_start, align_size, align_end; @@ -892,9 +1064,8 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { struct hmm_devmem *dup; - rcu_read_lock(); - dup = hmm_devmem_find(key); - rcu_read_unlock(); + dup = radix_tree_lookup(&hmm_devmem_radix, + key >> PA_SECTION_SHIFT); if (dup) { dev_err(device, "%s: collides with mapping for %s\n", __func__, dev_name(dup->device)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5a68730eebd6..14ed6ee5e02f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -555,8 +555,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg, - true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1317,7 +1316,7 @@ alloc: } if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, - huge_gfp | __GFP_NORETRY, &memcg, true))) { + huge_gfp, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) @@ -2356,26 +2355,13 @@ static void __split_huge_page_tail(struct page *head, int tail, struct page *page_tail = head + tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); - VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_refcount is zero and not changing from under us. But - * get_page_unless_zero() may be running from under us on the - * tail_page. If we used atomic_set() below instead of atomic_inc() or - * atomic_add(), we would then run atomic_set() concurrently with - * get_page_unless_zero(), and atomic_set() is implemented in C not - * using locked ops. 
spin_unlock on x86 sometime uses locked ops - * because of PPro errata 66, 92, so unless somebody can guarantee - * atomic_set() here would be safe on all archs (and not only on x86), - * it's safer to use atomic_inc()/atomic_add(). + * Clone page flags before unfreezing refcount. + * + * After successful get_page_unless_zero() might follow flags change, + * for exmaple lock_page() which set PG_waiters. */ - if (PageAnon(head) && !PageSwapCache(head)) { - page_ref_inc(page_tail); - } else { - /* Additional pin to radix tree */ - page_ref_add(page_tail, 2); - } - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; page_tail->flags |= (head->flags & ((1L << PG_referenced) | @@ -2388,14 +2374,21 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_unevictable) | (1L << PG_dirty))); - /* - * After clearing PageTail the gup refcount can be released. - * Page flags also must be visible before we make the page non-compound. - */ + /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); + /* + * Clear PageTail before unfreezing page refcount. + * + * After successful get_page_unless_zero() might follow put_page() + * which needs correct compound_head(). + */ clear_compound_head(page_tail); + /* Finally unfreeze refcount. Additional reference from page cache. */ + page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) || + PageSwapCache(head))); + if (page_is_young(head)) set_page_young(page_tail); if (page_is_idle(head)) @@ -2408,6 +2401,12 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); + + /* + * always add to the tail because some iterators expect new + * pages to show after the currently processed elements - e.g. + * migrate_pages + */ lru_add_page_tail(head, page_tail, lruvec, list); } @@ -2451,7 +2450,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, } else { /* Additional pin to radix tree */ page_ref_add(head, 2); - spin_unlock(&head->mapping->tree_lock); + xa_unlock(&head->mapping->i_pages); } spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); @@ -2659,15 +2658,15 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mapping) { void **pslot; - spin_lock(&mapping->tree_lock); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + xa_lock(&mapping->i_pages); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(head)); /* * Check if the head page is present in radix tree. * We assume all tail are present too, if head is there. 
*/ if (radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock) != head) + &mapping->i_pages.xa_lock) != head) goto fail; } @@ -2701,7 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } spin_unlock(&pgdata->split_queue_lock); fail: if (mapping) - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags); unfreeze_page(head); ret = -EBUSY; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 976bbc5646fe..218679138255 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -637,29 +637,22 @@ EXPORT_SYMBOL_GPL(linear_hugepage_index); */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { - struct hstate *hstate; - - if (!is_vm_hugetlb_page(vma)) - return PAGE_SIZE; - - hstate = hstate_vma(vma); - - return 1UL << huge_page_shift(hstate); + if (vma->vm_ops && vma->vm_ops->pagesize) + return vma->vm_ops->pagesize(vma); + return PAGE_SIZE; } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority * of cases, the page size used by the kernel matches the MMU size. On - * architectures where it differs, an architecture-specific version of this - * function is required. + * architectures where it differs, an architecture-specific 'strong' + * version of this symbol is required. */ -#ifndef vma_mmu_pagesize -unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) { return vma_kernel_pagesize(vma); } -#endif /* * Flags for MAP_PRIVATE reservations. These are stored in the bottom @@ -3153,6 +3146,13 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) return 0; } +static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) +{ + struct hstate *hstate = hstate_vma(vma); + + return 1UL << huge_page_shift(hstate); +} + /* * We cannot handle pagefaults against hugetlb pages at all. 
They cause * handle_mm_fault() to try to instantiate regular-sized pages in the @@ -3170,6 +3170,7 @@ const struct vm_operations_struct hugetlb_vm_ops = { .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, .split = hugetlb_vm_op_split, + .pagesize = hugetlb_vm_op_pagesize, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, diff --git a/mm/internal.h b/mm/internal.h index e6bd35182dae..62d8c34e63d5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -168,6 +168,9 @@ extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; +extern void set_zone_contiguous(struct zone *zone); +extern void clear_zone_contiguous(struct zone *zone); + #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* @@ -495,7 +498,6 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ enum ttu_flags; struct tlbflush_unmap_batch; @@ -538,4 +540,5 @@ static inline bool is_migrate_highatomic_page(struct page *page) } void setup_zone_pageset(struct zone *zone); +extern struct page *alloc_new_node_page(struct page *page, unsigned long node); #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index e13d911251e7..bc0e68f7dc75 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -323,9 +323,9 @@ void kasan_free_pages(struct page *page, unsigned int order) * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. */ -static size_t optimal_redzone(size_t object_size) +static unsigned int optimal_redzone(unsigned int object_size) { - int rz = + return object_size <= 64 - 16 ? 16 : object_size <= 128 - 32 ? 32 : object_size <= 512 - 64 ? 64 : @@ -333,14 +333,13 @@ static size_t optimal_redzone(size_t object_size) object_size <= (1 << 14) - 256 ? 256 : object_size <= (1 << 15) - 512 ? 512 : object_size <= (1 << 16) - 1024 ? 1024 : 2048; - return rz; } -void kasan_cache_create(struct kmem_cache *cache, size_t *size, +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) { + unsigned int orig_size = *size; int redzone_adjust; - int orig_size = *size; /* Add alloc meta. 
*/ cache->kasan_info.alloc_meta_offset = *size; @@ -358,7 +357,8 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, if (redzone_adjust > 0) *size += redzone_adjust; - *size = min(KMALLOC_MAX_SIZE, max(*size, cache->object_size + + *size = min_t(unsigned int, KMALLOC_MAX_SIZE, + max(*size, cache->object_size + optimal_redzone(cache->object_size))); /* @@ -382,7 +382,8 @@ void kasan_cache_shrink(struct kmem_cache *cache) void kasan_cache_shutdown(struct kmem_cache *cache) { - quarantine_remove_cache(cache); + if (!__kmem_cache_empty(cache)) + quarantine_remove_cache(cache); } size_t kasan_metadata_size(struct kmem_cache *cache) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index e42568284e06..d7b2a4bf8671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -965,9 +965,7 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1326,9 +1324,7 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } - /* Do not oom kill for khugepaged charges */ - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, - &memcg, true))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out; } @@ -1348,8 +1344,8 @@ static void collapse_shmem(struct mm_struct *mm, */ index = start; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { int n = min(iter.index, end) - index; /* @@ -1362,7 +1358,7 @@ static void collapse_shmem(struct mm_struct *mm, } nr_none += n; for (; index < min(iter.index, end); index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } @@ -1371,16 +1367,16 @@ static void collapse_shmem(struct mm_struct *mm, break; page = radix_tree_deref_slot_protected(slot, - &mapping->tree_lock); + &mapping->i_pages.xa_lock); if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* swap in or instantiate fallocated page */ if (shmem_getpage(mapping->host, index, &page, SGP_NOHUGE)) { result = SCAN_FAIL; goto tree_unlocked; } - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } else if (trylock_page(page)) { get_page(page); } else { @@ -1389,7 +1385,7 @@ static void collapse_shmem(struct mm_struct *mm, } /* - * The page must be locked, so we can drop the tree_lock + * The page must be locked, so we can drop the i_pages lock * without racing with truncate. 
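The collapse_shmem() conversion above is part of the series that renames mapping->page_tree to mapping->i_pages and hides its lock behind the xa_lock_*() helpers. As best I recall the xarray header of this era (a paraphrase from memory, not quoted from this patch), those helpers are thin macros over the spinlock embedded in the structure:

/* Approximate shape of include/linux/xarray.h at this point in time;
 * the lock lives inside the xarray, so these expand to plain spinlock
 * calls on &(xa)->xa_lock. */
#define xa_lock_irq(xa)         spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa)       spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
                                spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
                                spin_unlock_irqrestore(&(xa)->xa_lock, flags)

If that recollection is right, xa_lock_irq(&mapping->i_pages) is behaviourally identical to the old spin_lock_irq(&mapping->tree_lock), and the whole conversion in these hunks is mechanical renaming.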
*/ VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -1400,7 +1396,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_TRUNCATED; goto out_unlock; } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); if (isolate_lru_page(page)) { result = SCAN_DEL_PAGE_LRU; @@ -1410,11 +1406,11 @@ static void collapse_shmem(struct mm_struct *mm, if (page_mapped(page)) unmap_mapping_pages(mapping, index, 1, false); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - slot = radix_tree_lookup_slot(&mapping->page_tree, index); + slot = radix_tree_lookup_slot(&mapping->i_pages, index); VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, - &mapping->tree_lock), page); + &mapping->i_pages.xa_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1435,14 +1431,14 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(&mapping->page_tree, slot, + radix_tree_replace_slot(&mapping->i_pages, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); out_isolate_failed: unlock_page(page); @@ -1468,14 +1464,14 @@ out_unlock: } for (; index < end; index++) { - radix_tree_insert(&mapping->page_tree, index, + radix_tree_insert(&mapping->i_pages, index, new_page + (index % HPAGE_PMD_NR)); } nr_none += n; } tree_locked: - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); tree_unlocked: if (result == SCAN_SUCCEED) { @@ -1524,9 +1520,8 @@ tree_unlocked: } else { /* Something went wrong: rollback changes to the radix-tree */ shmem_uncharge(mapping->host, nr_none); - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, - start) { + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= end) break; page = list_first_entry_or_null(&pagelist, @@ -1536,8 +1531,7 @@ tree_unlocked: break; nr_none--; /* Put holes back where they were */ - radix_tree_delete(&mapping->page_tree, - iter.index); + radix_tree_delete(&mapping->i_pages, iter.index); continue; } @@ -1546,16 +1540,15 @@ tree_unlocked: /* Unfreeze the page. 
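In the set_recommended_min_free_kbytes() hunk a little further below, zone_idx(zone) > gfp_zone(GFP_USER) filters out zones that ordinary user allocations cannot fall back to, ZONE_MOVABLE in particular, so they no longer inflate the reserve. A worked sketch of the surviving arithmetic, assuming x86-64 defaults of 4 KiB pages and 2 MiB pageblocks and three counted zones (all assumptions, not values from the patch):

#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = (2UL << 20) / 4096;  /* 512 */
        int nr_zones = 3;   /* e.g. DMA + DMA32 + NORMAL survive the filter */
        unsigned long recommended_min = pageblock_nr_pages * nr_zones * 2;

        /* 512 * 3 * 2 = 3072 pages, i.e. two free pageblocks per counted
         * zone, or 12 MiB kept free for fragmentation avoidance. */
        printf("%lu pages (%lu KiB)\n", recommended_min,
               (recommended_min * 4096) / 1024);
        return 0;
}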
*/ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(&mapping->page_tree, - slot, page); + radix_tree_replace_slot(&mapping->i_pages, slot, page); slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); putback_lru_page(page); unlock_page(page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } VM_BUG_ON(nr_none); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); /* Unfreeze new_page, caller would take care about freeing it */ page_ref_unfreeze(new_page, 1); @@ -1583,7 +1576,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, swap = 0; memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= start + HPAGE_PMD_NR) break; @@ -1883,8 +1876,16 @@ static void set_recommended_min_free_kbytes(void) int nr_zones = 0; unsigned long recommended_min; - for_each_populated_zone(zone) + for_each_populated_zone(zone) { + /* + * We don't need to worry about fragmentation of + * ZONE_MOVABLE since it only has movable pages. + */ + if (zone_idx(zone) > gfp_zone(GFP_USER)) + continue; + nr_zones++; + } /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ recommended_min = pageblock_nr_pages * nr_zones * 2; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 46c2290a08f1..9a085d525bbc 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1187,6 +1187,11 @@ EXPORT_SYMBOL(kmemleak_no_scan); /** * kmemleak_alloc_phys - similar to kmemleak_alloc but taking a physical * address argument + * @phys: physical address of the object + * @size: size of the object + * @min_count: minimum number of references to this object. + * See kmemleak_alloc() + * @gfp: kmalloc() flags used for kmemleak internal memory allocations */ void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count, gfp_t gfp) @@ -1199,6 +1204,9 @@ EXPORT_SYMBOL(kmemleak_alloc_phys); /** * kmemleak_free_part_phys - similar to kmemleak_free_part but taking a * physical address argument + * @phys: physical address if the beginning or inside an object. This + * also represents the start of the range to be freed + * @size: size to be unregistered */ void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size) { @@ -1210,6 +1218,7 @@ EXPORT_SYMBOL(kmemleak_free_part_phys); /** * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical * address argument + * @phys: physical address of the object */ void __ref kmemleak_not_leak_phys(phys_addr_t phys) { @@ -1221,6 +1230,7 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys); /** * kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical * address argument + * @phys: physical address of the object */ void __ref kmemleak_ignore_phys(phys_addr_t phys) { @@ -1963,7 +1973,7 @@ static void kmemleak_disable(void) /* * Allow boot-time kmemleak disabling (enabled by default). */ -static int kmemleak_boot_config(char *str) +static int __init kmemleak_boot_config(char *str) { if (!str) return -EINVAL; diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1131,6 +1131,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } else { newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)); + /* + * We're replacing an anonymous page with a zero page, which is + * not anonymous. 
We need to do proper accounting otherwise we + * will get wrong values in /proc, and a BUG message in dmesg + * when tearing down the mm. + */ + dec_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); @@ -1318,10 +1325,10 @@ bool is_page_sharing_candidate(struct stable_node *stable_node) return __is_page_sharing_candidate(stable_node, 0); } -struct page *stable_node_dup(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, - struct rb_root *root, - bool prune_stale_stable_nodes) +static struct page *stable_node_dup(struct stable_node **_stable_node_dup, + struct stable_node **_stable_node, + struct rb_root *root, + bool prune_stale_stable_nodes) { struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; @@ -2082,8 +2089,22 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { + bool split; + kpage = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); + /* + * If both pages we tried to merge belong to the same compound + * page, then we actually ended up increasing the reference + * count of the same compound page twice, and split_huge_page + * failed. + * Here we set a flag if that happened, and we use it later to + * try split_huge_page again. Since we call put_page right + * afterwards, the reference count will be correct and + * split_huge_page should succeed. + */ + split = PageTransCompound(page) + && compound_head(page) == compound_head(tree_page); put_page(tree_page); if (kpage) { /* @@ -2110,6 +2131,20 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) break_cow(tree_rmap_item); break_cow(rmap_item); } + } else if (split) { + /* + * We are here if we tried to merge two pages and + * failed because they both belonged to the same + * compound page. We will split the page now, but no + * merging will take place. + * We do not want to add the cost of a full lock; if + * the page is locked, it is better to skip it and + * perhaps try again later. + */ + if (!trylock_page(page)) + return; + split_huge_page(page); + unlock_page(page); } } } diff --git a/mm/list_lru.c b/mm/list_lru.c index fd41e969ede5..fcfb6c89ed47 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -52,14 +52,15 @@ static inline bool list_lru_memcg_aware(struct list_lru *lru) static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) { + struct list_lru_memcg *memcg_lrus; /* - * The lock protects the array of per cgroup lists from relocation - * (see memcg_update_list_lru_node). + * Either lock or RCU protects the array of per cgroup lists + * from relocation (see memcg_update_list_lru_node). 
*/ - lockdep_assert_held(&nlru->lock); - if (nlru->memcg_lrus && idx >= 0) - return nlru->memcg_lrus->lru[idx]; - + memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, + lockdep_is_held(&nlru->lock)); + if (memcg_lrus && idx >= 0) + return memcg_lrus->lru[idx]; return &nlru->lru; } @@ -168,10 +169,10 @@ static unsigned long __list_lru_count_one(struct list_lru *lru, struct list_lru_one *l; unsigned long count; - spin_lock(&nlru->lock); + rcu_read_lock(); l = list_lru_from_memcg_idx(nlru, memcg_idx); count = l->nr_items; - spin_unlock(&nlru->lock); + rcu_read_unlock(); return count; } @@ -324,24 +325,41 @@ fail: static int memcg_init_list_lru_node(struct list_lru_node *nlru) { + struct list_lru_memcg *memcg_lrus; int size = memcg_nr_cache_ids; - nlru->memcg_lrus = kvmalloc(size * sizeof(void *), GFP_KERNEL); - if (!nlru->memcg_lrus) + memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + + size * sizeof(void *), GFP_KERNEL); + if (!memcg_lrus) return -ENOMEM; - if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { - kvfree(nlru->memcg_lrus); + if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) { + kvfree(memcg_lrus); return -ENOMEM; } + RCU_INIT_POINTER(nlru->memcg_lrus, memcg_lrus); return 0; } static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) { - __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); - kvfree(nlru->memcg_lrus); + struct list_lru_memcg *memcg_lrus; + /* + * This is called when shrinker has already been unregistered, + * and nobody can use it. So, there is no need to use kvfree_rcu(). + */ + memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true); + __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); + kvfree(memcg_lrus); +} + +static void kvfree_rcu(struct rcu_head *head) +{ + struct list_lru_memcg *mlru; + + mlru = container_of(head, struct list_lru_memcg, rcu); + kvfree(mlru); } static int memcg_update_list_lru_node(struct list_lru_node *nlru, @@ -351,8 +369,9 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, BUG_ON(old_size > new_size); - old = nlru->memcg_lrus; - new = kvmalloc(new_size * sizeof(void *), GFP_KERNEL); + old = rcu_dereference_protected(nlru->memcg_lrus, + lockdep_is_held(&list_lrus_mutex)); + new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; @@ -361,29 +380,33 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, return -ENOMEM; } - memcpy(new, old, old_size * sizeof(void *)); + memcpy(&new->lru, &old->lru, old_size * sizeof(void *)); /* - * The lock guarantees that we won't race with a reader - * (see list_lru_from_memcg_idx). + * The locking below allows readers that hold nlru->lock avoid taking + * rcu_read_lock (see list_lru_from_memcg_idx). * * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. 
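The list_lru changes above and just below convert memcg_lrus to an RCU-managed pointer: readers may now walk the array under rcu_read_lock() alone, while a resize publishes the new array and frees the old one only after a grace period. Distilled to its skeleton (a sketch of the pattern, not a compilable excerpt):

/* Reader side: either RCU or nlru->lock pins the array. */
rcu_read_lock();
memcg_lrus = rcu_dereference(nlru->memcg_lrus);
l = memcg_lrus->lru[idx];
rcu_read_unlock();

/* Updater side: publish first, reclaim only after all pre-existing
 * readers have left their critical sections. */
spin_lock_irq(&nlru->lock);
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
call_rcu(&old->rcu, kvfree_rcu);        /* deferred kvfree() of the old array */

The immediate payoff is visible in __list_lru_count_one() above, which trades a spin_lock/spin_unlock pair for rcu_read_lock().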
*/ spin_lock_irq(&nlru->lock); - nlru->memcg_lrus = new; + rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - kvfree(old); + call_rcu(&old->rcu, kvfree_rcu); return 0; } static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, int old_size, int new_size) { + struct list_lru_memcg *memcg_lrus; + + memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, + lockdep_is_held(&list_lrus_mutex)); /* do not bother shrinking the array back to the old size, because we * cannot handle allocation failures here */ - __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); + __memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size); } static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) diff --git a/mm/memblock.c b/mm/memblock.c index 48376bd33274..5108356ad8aa 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -17,6 +17,7 @@ #include <linux/poison.h> #include <linux/pfn.h> #include <linux/debugfs.h> +#include <linux/kmemleak.h> #include <linux/seq_file.h> #include <linux/memblock.h> @@ -924,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : ULLONG_MAX; + r->base : (phys_addr_t)ULLONG_MAX; /* * if idx_b advanced past idx_a, @@ -1040,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : ULLONG_MAX; + r->base : (phys_addr_t)ULLONG_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a @@ -1162,7 +1163,7 @@ phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, flags); } -static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, +phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr, int nid, ulong flags) { @@ -1345,7 +1346,7 @@ void * __init memblock_virt_alloc_try_nid_raw( min_addr, max_addr, nid); #ifdef CONFIG_DEBUG_VM if (ptr && size > 0) - memset(ptr, 0xff, size); + memset(ptr, PAGE_POISON_PATTERN, size); #endif return ptr; } @@ -1750,29 +1751,6 @@ static void __init_memblock memblock_dump(struct memblock_type *type) } } -extern unsigned long __init_memblock -memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) -{ - struct memblock_region *rgn; - unsigned long size = 0; - int idx; - - for_each_memblock_type(idx, (&memblock.reserved), rgn) { - phys_addr_t start, end; - - if (rgn->base + rgn->size < start_addr) - continue; - if (rgn->base > end_addr) - continue; - - start = rgn->base; - end = start + rgn->size; - size += end - start; - } - - return size; -} - void __init_memblock __memblock_dump_all(void) { pr_info("MEMBLOCK configuration:\n"); @@ -1818,18 +1796,7 @@ static int memblock_debug_show(struct seq_file *m, void *private) } return 0; } - -static int memblock_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, memblock_debug_show, inode->i_private); -} - -static const struct file_operations memblock_debug_fops = { - .open = memblock_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(memblock_debug); static int __init memblock_init_debugfs(void) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9ec024b862ac..e074f7c637aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1485,7 +1485,7 @@ static void 
memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom) + if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER) return; /* * We are in the middle of the charge context here, so we @@ -1839,7 +1839,7 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) } } - for (i = 0; i < MEMCG_NR_EVENTS; i++) { + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { long x; x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); @@ -1858,7 +1858,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_event(memcg, MEMCG_HIGH); + memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = parent_mem_cgroup(memcg))); } @@ -1949,7 +1949,7 @@ retry: if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_event(mem_over_limit, MEMCG_MAX); + memcg_memory_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1992,7 +1992,7 @@ retry: if (fatal_signal_pending(current)) goto force; - mem_cgroup_event(mem_over_limit, MEMCG_OOM); + memcg_memory_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -2688,10 +2688,10 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events) struct mem_cgroup *iter; int i; - memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS); + memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_EVENTS; i++) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) events[i] += memcg_sum_events(iter, i); } } @@ -4108,6 +4108,9 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + if (!pn) + return; + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } @@ -5178,7 +5181,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, continue; } - mem_cgroup_event(memcg, MEMCG_OOM); + memcg_memory_event(memcg, MEMCG_OOM); if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0)) break; } @@ -5191,10 +5194,14 @@ static int memory_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW)); - seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH)); - seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX)); - seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM)); + seq_printf(m, "low %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_LOW])); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_HIGH])); + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_MAX])); + seq_printf(m, "oom %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_OOM])); seq_printf(m, "oom_kill %lu\n", memcg_sum_events(memcg, OOM_KILL)); return 0; @@ -5204,7 +5211,7 @@ static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[MEMCG_NR_EVENTS]; + unsigned long events[NR_VM_EVENT_ITEMS]; int i; /* @@ -5967,9 +5974,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) /* * Interrupts should be disabled here because the caller holds the - * mapping->tree_lock lock which is taken with interrupts-off. 
It is + * i_pages lock which is taken with interrupts-off. It is * important here to have the interrupts disabled because it is the - * only synchronisation we have for udpating the per-CPU variables. + * only synchronisation we have for updating the per-CPU variables. */ VM_BUG_ON(!irqs_disabled()); mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page), diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8291b75f42c8..9d142b9b86dc 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -502,6 +502,7 @@ static const char * const action_page_types[] = { [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", + [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", [MF_MSG_UNMAP_FAILED] = "unmapping failed page", [MF_MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", [MF_MSG_CLEAN_SWAPCACHE] = "clean swapcache page", @@ -1084,6 +1085,21 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) return 0; } + /* + * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so + * simply disable it. In order to make it work properly, we need + * make sure that: + * - conversion of a pud that maps an error hugetlb into hwpoison + * entry properly works, and + * - other mm code walking over page table is aware of pud-aligned + * hwpoison entries. + */ + if (huge_page_size(page_hstate(head)) > PMD_SIZE) { + action_result(pfn, MF_MSG_NON_PMD_HUGE, MF_IGNORED); + res = -EBUSY; + goto out; + } + if (!hwpoison_user_mappings(p, pfn, flags, &head)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; @@ -1471,7 +1487,7 @@ int unpoison_memory(unsigned long pfn) } EXPORT_SYMBOL(unpoison_memory); -static struct page *new_page(struct page *p, unsigned long private, int **x) +static struct page *new_page(struct page *p, unsigned long private) { int nid = page_to_nid(p); diff --git a/mm/memory.c b/mm/memory.c index aed37325d94e..01f5464e0fd2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2883,26 +2883,16 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *swapcache = NULL; + struct page *page = NULL, *swapcache; struct mem_cgroup *memcg; - struct vma_swap_readahead swap_ra; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; int ret = 0; - bool vma_readahead = swap_use_vma_readahead(); - if (vma_readahead) { - page = swap_readahead_detect(vmf, &swap_ra); - swapcache = page; - } - - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) { - if (page) - put_page(page); + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - } entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { @@ -2928,11 +2918,8 @@ int do_swap_page(struct vm_fault *vmf) delayacct_set_flag(DELAYACCT_PF_SWAPIN); - if (!page) { - page = lookup_swap_cache(entry, vma_readahead ? 
vma : NULL, - vmf->address); - swapcache = page; - } + page = lookup_swap_cache(entry, vma, vmf->address); + swapcache = page; if (!page) { struct swap_info_struct *si = swp_swap_info(entry); @@ -2940,7 +2927,8 @@ int do_swap_page(struct vm_fault *vmf) if (si->flags & SWP_SYNCHRONOUS_IO && __swap_count(si, entry) == 1) { /* skip swapcache */ - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); @@ -2949,12 +2937,8 @@ int do_swap_page(struct vm_fault *vmf) swap_readpage(page, true); } } else { - if (vma_readahead) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, + vmf); swapcache = page; } @@ -2982,7 +2966,6 @@ int do_swap_page(struct vm_fault *vmf) */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - swapcache = page; goto out_release; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b2bd52ff7605..f74826cdceea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -250,7 +250,6 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, struct vmem_altmap *altmap, bool want_memblock) { int ret; - int i; if (pfn_valid(phys_start_pfn)) return -EEXIST; @@ -259,27 +258,10 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, if (ret < 0) return ret; - /* - * Make all the pages reserved so that nobody will stumble over half - * initialized state. - * FIXME: We also have to associate it with a node because page_to_nid - * relies on having page with the proper node. - */ - for (i = 0; i < PAGES_PER_SECTION; i++) { - unsigned long pfn = phys_start_pfn + i; - struct page *page; - if (!pfn_valid(pfn)) - continue; - - page = pfn_to_page(pfn); - set_page_node(page, nid); - SetPageReserved(page); - } - if (!want_memblock) return 0; - return register_new_memory(nid, __pfn_to_section(phys_start_pfn)); + return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn)); } /* @@ -559,6 +541,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, * @zone: zone from which pages need to be removed * @phys_start_pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) + * @altmap: alternative device page map or %NULL if default memmap is used * * Generic helper function to remove section mappings and sysfs entries * for the section of the memory we are removing. Caller needs to make @@ -908,8 +891,15 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int nid; int ret; struct memory_notify arg; + struct memory_block *mem; + + /* + * We can't use pfn_to_nid() because nid might be stored in struct page + * which is not yet initialized. Instead, we find nid from memory block. + */ + mem = find_memory_block(__pfn_to_section(pfn)); + nid = mem->nid; - nid = pfn_to_nid(pfn); /* associate pfn range with the zone */ zone = move_pfn_range(online_type, nid, pfn, nr_pages); @@ -1055,6 +1045,7 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) /** * try_online_node - online a node if offlined + * @nid: the node ID * * called by cpu_up() to online a node without onlined memory. 
*/ @@ -1083,15 +1074,16 @@ out: static int check_hotplug_memory_range(u64 start, u64 size) { - u64 start_pfn = PFN_DOWN(start); + unsigned long block_sz = memory_block_size_bytes(); + u64 block_nr_pages = block_sz >> PAGE_SHIFT; u64 nr_pages = size >> PAGE_SHIFT; + u64 start_pfn = PFN_DOWN(start); - /* Memory range must be aligned with section */ - if ((start_pfn & ~PAGE_SECTION_MASK) || - (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { - pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", - (unsigned long long)start, - (unsigned long long)size); + /* memory range must be block size aligned */ + if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) || + !IS_ALIGNED(nr_pages, block_nr_pages)) { + pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", + block_sz, start, size); return -EINVAL; } @@ -1337,8 +1329,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) return 0; } -static struct page *new_node_page(struct page *page, unsigned long private, - int **result) +static struct page *new_node_page(struct page *page, unsigned long private) { int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; @@ -1381,7 +1372,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (isolate_huge_page(page, &source)) move_pages -= 1 << compound_order(head); continue; - } else if (thp_migration_supported() && PageTransHuge(page)) + } else if (PageTransHuge(page)) pfn = page_to_pfn(compound_head(page)) + hpage_nr_pages(page) - 1; @@ -1814,6 +1805,7 @@ static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) /** * try_offline_node + * @nid: the node ID * * Offline a node if all memory sections and cpus of the node are removed. * @@ -1857,6 +1849,9 @@ EXPORT_SYMBOL(try_offline_node); /** * remove_memory + * @nid: the node ID + * @start: physical address of the region to remove + * @size: size of the region to remove * * NOTE: The caller must call lock_device_hotplug() to serialize hotplug * and online/offline operations before this call, as required by diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01cbb7078d6c..9ac49ef17b4e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -446,15 +446,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, __split_huge_pmd(walk->vma, pmd, addr, false, NULL); goto out; } - if (!thp_migration_supported()) { - get_page(page); - spin_unlock(ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - goto out; - } if (!queue_pages_required(page, qp)) { ret = 1; goto unlock; @@ -495,7 +486,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; -retry: + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) @@ -511,22 +502,6 @@ retry: continue; if (!queue_pages_required(page, qp)) continue; - if (PageTransCompound(page) && !thp_migration_supported()) { - get_page(page); - pte_unmap_unlock(pte, ptl); - lock_page(page); - ret = split_huge_page(page); - unlock_page(page); - put_page(page); - /* Failed to split -- skip. 
*/ - if (ret) { - pte = pte_offset_map_lock(walk->mm, pmd, - addr, &ptl); - continue; - } - goto retry; - } - migrate_page_add(page, qp->pagelist, flags); } pte_unmap_unlock(pte - 1, ptl); @@ -942,12 +917,13 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, } } -static struct page *new_node_page(struct page *page, unsigned long node, int **x) +/* page allocation callback for NUMA node migration */ +struct page *alloc_new_node_page(struct page *page, unsigned long node) { if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), node); - else if (thp_migration_supported() && PageTransHuge(page)) { + else if (PageTransHuge(page)) { struct page *thp; thp = alloc_pages_node(node, @@ -986,7 +962,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, NULL, dest, + err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1107,7 +1083,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; unsigned long uninitialized_var(address); @@ -1123,7 +1099,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) if (PageHuge(page)) { return alloc_huge_page_vma(page_hstate(compound_head(page)), vma, address); - } else if (thp_migration_supported() && PageTransHuge(page)) { + } else if (PageTransHuge(page)) { struct page *thp; thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, @@ -1152,7 +1128,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, return -ENOSYS; } -static struct page *new_page(struct page *page, unsigned long start, int **x) +static struct page *new_page(struct page *page, unsigned long start) { return NULL; } diff --git a/mm/migrate.c b/mm/migrate.c index 003886606a22..f65dd69e1fd1 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -467,20 +467,21 @@ int migrate_page_move_mapping(struct address_space *mapping, oldzone = page_zone(page); newzone = page_zone(newpage); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count += 1 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, + &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -494,7 +495,7 @@ int migrate_page_move_mapping(struct address_space *mapping, if (mode == MIGRATE_ASYNC && head && !buffer_migrate_lock_buffers(head, mode)) { page_ref_unfreeze(page, expected_count); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -522,7 +523,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); 
+ radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -531,7 +532,7 @@ int migrate_page_move_mapping(struct address_space *mapping, */ page_ref_unfreeze(page, expected_count - 1); - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); /* Leave irq disabled to prevent preemption while updating stats */ /* @@ -574,20 +575,19 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; void **pslot; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); - pslot = radix_tree_lookup_slot(&mapping->page_tree, - page_index(page)); + pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page)); expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { - spin_unlock_irq(&mapping->tree_lock); + radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) { + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } if (!page_ref_freeze(page, expected_count)) { - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return -EAGAIN; } @@ -596,11 +596,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); + radix_tree_replace_slot(&mapping->i_pages, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); return MIGRATEPAGE_SUCCESS; } @@ -1137,10 +1137,12 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason) { int rc = MIGRATEPAGE_SUCCESS; - int *result = NULL; struct page *newpage; - newpage = get_new_page(page, private, &result); + if (!thp_migration_supported() && PageTransHuge(page)) + return -ENOMEM; + + newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; @@ -1161,14 +1163,6 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, goto out; } - if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) { - lock_page(page); - rc = split_huge_page(page); - unlock_page(page); - if (rc) - goto out; - } - rc = __unmap_and_move(page, newpage, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); @@ -1231,12 +1225,6 @@ put_new: put_page(newpage); } - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(newpage); - } return rc; } @@ -1264,7 +1252,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, enum migrate_mode mode, int reason) { int rc = -EAGAIN; - int *result = NULL; int page_was_mapped = 0; struct page *new_hpage; struct anon_vma *anon_vma = NULL; @@ -1281,7 +1268,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOSYS; } - new_hpage = get_new_page(hpage, private, &result); + new_hpage = get_new_page(hpage, private); if (!new_hpage) return -ENOMEM; @@ -1345,12 +1332,6 @@ out: else putback_active_hugepage(new_hpage); - if (result) { - if (rc) - *result = rc; - else - *result = page_to_nid(new_hpage); - } return rc; } @@ -1395,6 +1376,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, retry = 0; list_for_each_entry_safe(page, page2, from, lru) { +retry: cond_resched(); if (PageHuge(page)) @@ -1408,6 +1390,26 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, switch(rc) { case -ENOMEM: + /* + * THP migration might be unsupported or the + * allocation could've 
failed so we should + * retry on the same page with the THP split + * to base pages. + * + * Head page is retried immediately and tail + * pages are added to the tail of the list so + * we encounter them after the rest of the list + * is processed. + */ + if (PageTransHuge(page)) { + lock_page(page); + rc = split_huge_page_to_list(page, from); + unlock_page(page); + if (!rc) { + list_safe_reset_next(page, page2, lru); + goto retry; + } + } nr_failed++; goto out; case -EAGAIN: @@ -1444,141 +1446,101 @@ out: } #ifdef CONFIG_NUMA -/* - * Move a list of individual pages - */ -struct page_to_node { - unsigned long addr; - struct page *page; - int node; - int status; -}; -static struct page *new_page_node(struct page *p, unsigned long private, - int **result) +static int store_status(int __user *status, int start, int value, int nr) { - struct page_to_node *pm = (struct page_to_node *)private; - - while (pm->node != MAX_NUMNODES && pm->page != p) - pm++; + while (nr-- > 0) { + if (put_user(value, status + start)) + return -EFAULT; + start++; + } - if (pm->node == MAX_NUMNODES) - return NULL; + return 0; +} - *result = &pm->status; +static int do_move_pages_to_node(struct mm_struct *mm, + struct list_head *pagelist, int node) +{ + int err; - if (PageHuge(p)) - return alloc_huge_page_node(page_hstate(compound_head(p)), - pm->node); - else if (thp_migration_supported() && PageTransHuge(p)) { - struct page *thp; + if (list_empty(pagelist)) + return 0; - thp = alloc_pages_node(pm->node, - (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM, - HPAGE_PMD_ORDER); - if (!thp) - return NULL; - prep_transhuge_page(thp); - return thp; - } else - return __alloc_pages_node(pm->node, - GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); + err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, + MIGRATE_SYNC, MR_SYSCALL); + if (err) + putback_movable_pages(pagelist); + return err; } /* - * Move a set of pages as indicated in the pm array. The addr - * field must be set to the virtual address of the page to be moved - * and the node number must contain a valid target node. - * The pm array ends with node = MAX_NUMNODES. + * Resolves the given address to a struct page, isolates it from the LRU and + * puts it to the given pagelist. 
+ * Returns -errno if the page cannot be found/isolated or 0 when it has been + * queued or the page doesn't need to be migrated because it is already on + * the target node */ -static int do_move_page_to_node_array(struct mm_struct *mm, - struct page_to_node *pm, - int migrate_all) +static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, + int node, struct list_head *pagelist, bool migrate_all) { + struct vm_area_struct *vma; + struct page *page; + unsigned int follflags; int err; - struct page_to_node *pp; - LIST_HEAD(pagelist); down_read(&mm->mmap_sem); + err = -EFAULT; + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || !vma_migratable(vma)) + goto out; - /* - * Build a list of pages to migrate - */ - for (pp = pm; pp->node != MAX_NUMNODES; pp++) { - struct vm_area_struct *vma; - struct page *page; - struct page *head; - unsigned int follflags; - - err = -EFAULT; - vma = find_vma(mm, pp->addr); - if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) - goto set_status; - - /* FOLL_DUMP to ignore special (like zero) pages */ - follflags = FOLL_GET | FOLL_DUMP; - if (!thp_migration_supported()) - follflags |= FOLL_SPLIT; - page = follow_page(vma, pp->addr, follflags); + /* FOLL_DUMP to ignore special (like zero) pages */ + follflags = FOLL_GET | FOLL_DUMP; + page = follow_page(vma, addr, follflags); - err = PTR_ERR(page); - if (IS_ERR(page)) - goto set_status; + err = PTR_ERR(page); + if (IS_ERR(page)) + goto out; - err = -ENOENT; - if (!page) - goto set_status; + err = -ENOENT; + if (!page) + goto out; - err = page_to_nid(page); + err = 0; + if (page_to_nid(page) == node) + goto out_putpage; - if (err == pp->node) - /* - * Node already in the right place - */ - goto put_and_set; + err = -EACCES; + if (page_mapcount(page) > 1 && !migrate_all) + goto out_putpage; - err = -EACCES; - if (page_mapcount(page) > 1 && - !migrate_all) - goto put_and_set; - - if (PageHuge(page)) { - if (PageHead(page)) { - isolate_huge_page(page, &pagelist); - err = 0; - pp->page = page; - } - goto put_and_set; + if (PageHuge(page)) { + if (PageHead(page)) { + isolate_huge_page(page, pagelist); + err = 0; } + } else { + struct page *head; - pp->page = compound_head(page); head = compound_head(page); err = isolate_lru_page(head); - if (!err) { - list_add_tail(&head->lru, &pagelist); - mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), - hpage_nr_pages(head)); - } -put_and_set: - /* - * Either remove the duplicate refcount from - * isolate_lru_page() or drop the page ref if it was - * not isolated. - */ - put_page(page); -set_status: - pp->status = err; - } - - err = 0; - if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, NULL, - (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_movable_pages(&pagelist); - } + goto out_putpage; + err = 0; + list_add_tail(&head->lru, pagelist); + mod_node_page_state(page_pgdat(head), + NR_ISOLATED_ANON + page_is_file_cache(head), + hpage_nr_pages(head)); + } +out_putpage: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. 
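add_page_for_migration() above and do_pages_move() below service the move_pages(2) syscall; the rework batches consecutive requests for the same target node into a single migrate_pages() call instead of staging them in a fixed page_to_node array. The user-visible interface is unchanged; a minimal caller through libnuma's wrapper (assumed installed; link with -lnuma):

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        void *pages[2];
        int nodes[2] = { 0, 0 };        /* desired node for each page */
        int status[2];

        pages[0] = aligned_alloc(psz, psz);
        pages[1] = aligned_alloc(psz, psz);
        *(char *)pages[0] = 1;          /* fault the pages in first */
        *(char *)pages[1] = 1;

        /* pid 0 means the calling process; on return each status slot
         * holds the page's node or a negative errno, matching the
         * store_status() reporting in the hunks here. */
        if (move_pages(0, 2, pages, nodes, status, MPOL_MF_MOVE) == 0)
                printf("status: %d %d\n", status[0], status[1]);
        else
                perror("move_pages");
        return 0;
}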
+ */ + put_page(page); +out: up_read(&mm->mmap_sem); return err; } @@ -1593,79 +1555,79 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, const int __user *nodes, int __user *status, int flags) { - struct page_to_node *pm; - unsigned long chunk_nr_pages; - unsigned long chunk_start; - int err; - - err = -ENOMEM; - pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); - if (!pm) - goto out; + int current_node = NUMA_NO_NODE; + LIST_HEAD(pagelist); + int start, i; + int err = 0, err1; migrate_prep(); - /* - * Store a chunk of page_to_node array in a page, - * but keep the last one as a marker - */ - chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; - - for (chunk_start = 0; - chunk_start < nr_pages; - chunk_start += chunk_nr_pages) { - int j; + for (i = start = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; + int node; - if (chunk_start + chunk_nr_pages > nr_pages) - chunk_nr_pages = nr_pages - chunk_start; - - /* fill the chunk pm with addrs and nodes from user-space */ - for (j = 0; j < chunk_nr_pages; j++) { - const void __user *p; - int node; - - err = -EFAULT; - if (get_user(p, pages + j + chunk_start)) - goto out_pm; - pm[j].addr = (unsigned long) p; - - if (get_user(node, nodes + j + chunk_start)) - goto out_pm; - - err = -ENODEV; - if (node < 0 || node >= MAX_NUMNODES) - goto out_pm; - - if (!node_state(node, N_MEMORY)) - goto out_pm; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out_pm; + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_flush; + if (get_user(node, nodes + i)) + goto out_flush; + addr = (unsigned long)p; + + err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_flush; + if (!node_state(node, N_MEMORY)) + goto out_flush; - pm[j].node = node; + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_flush; + + if (current_node == NUMA_NO_NODE) { + current_node = node; + start = i; + } else if (node != current_node) { + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + start = i; + current_node = node; } - /* End marker for this chunk */ - pm[chunk_nr_pages].node = MAX_NUMNODES; - - /* Migrate this chunk */ - err = do_move_page_to_node_array(mm, pm, - flags & MPOL_MF_MOVE_ALL); - if (err < 0) - goto out_pm; + /* + * Errors in the page lookup or isolation are not fatal and we simply + * report them via status + */ + err = add_page_for_migration(mm, addr, current_node, + &pagelist, flags & MPOL_MF_MOVE_ALL); + if (!err) + continue; - /* Return status information */ - for (j = 0; j < chunk_nr_pages; j++) - if (put_user(pm[j].status, status + j + chunk_start)) { - err = -EFAULT; - goto out_pm; - } - } - err = 0; + err = store_status(status, i, err, 1); + if (err) + goto out_flush; -out_pm: - free_page((unsigned long)pm); + err = do_move_pages_to_node(mm, &pagelist, current_node); + if (err) + goto out; + if (i > start) { + err = store_status(status, start, current_node, i - start); + if (err) + goto out; + } + current_node = NUMA_NO_NODE; + } +out_flush: + /* Make sure we do not overwrite the existing error */ + err1 = do_move_pages_to_node(mm, &pagelist, current_node); + if (!err1) + err1 = store_status(status, start, current_node, i - start); + if (!err) + err = err1; out: return err; } @@ -1866,8 +1828,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, } static struct page *alloc_misplaced_dst_page(struct page *page, - unsigned long 
data, - int **result) + unsigned long data) { int nid = (int) data; struct page *newpage; @@ -1987,6 +1948,13 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, goto out; /* + * Also do not migrate dirty pages as not all filesystems can move + * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. + */ + if (page_is_file_cache(page) && PageDirty(page)) + goto out; + + /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! @@ -2339,7 +2307,8 @@ again: ptep_get_and_clear(mm, addr, ptep); /* Setup special migration page table entry */ - entry = make_migration_entry(page, pte_write(pte)); + entry = make_migration_entry(page, mpfn & + MIGRATE_PFN_WRITE); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); diff --git a/mm/mmap.c b/mm/mmap.c index aa0dc8231c0d..188f195883b9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1342,6 +1342,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(file && path_noexec(&file->f_path))) prot |= PROT_EXEC; + /* force arch specific MAP_FIXED handling in get_unmapped_area */ + if (flags & MAP_FIXED_NOREPLACE) + flags |= MAP_FIXED; + if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); @@ -1365,6 +1369,13 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (offset_in_page(addr)) return addr; + if (flags & MAP_FIXED_NOREPLACE) { + struct vm_area_struct *vma = find_vma(mm, addr); + + if (vma && vma->vm_start <= addr) + return -EEXIST; + } + if (prot == PROT_EXEC) { pkey = execute_only_pkey(mm); if (pkey < 0) @@ -3191,13 +3202,15 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (rlimit(RLIMIT_DATA) == 0 && mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) return true; - if (!ignore_rlimit_data) { - pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n", - current->comm, current->pid, - (mm->data_vm + npages) << PAGE_SHIFT, - rlimit(RLIMIT_DATA)); + + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n", + current->comm, current->pid, + (mm->data_vm + npages) << PAGE_SHIFT, + rlimit(RLIMIT_DATA), + ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data"); + + if (!ignore_rlimit_data) return false; - } } return true; diff --git a/mm/mprotect.c b/mm/mprotect.c index c1d6af7455da..625608bc8962 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -27,6 +27,7 @@ #include <linux/pkeys.h> #include <linux/ksm.h> #include <linux/uaccess.h> +#include <linux/mm_inline.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> @@ -89,6 +90,14 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page_mapcount(page) != 1) continue; + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. 
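The mm/mmap.c hunks above wire up MAP_FIXED_NOREPLACE: the request is placed exactly like MAP_FIXED, but if any VMA already intersects the range the call fails with EEXIST instead of silently unmapping it. Hypothetical userspace usage (the flag is new in this series, so the fallback #define below is an assumption about the uapi value rather than something guaranteed by your libc headers):

#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0x100000    /* assumed uapi value for this era */
#endif

int main(void)
{
        void *hint = (void *)0x10000000;        /* must be page aligned */
        void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE,
                       -1, 0);

        if (p == MAP_FAILED && errno == EEXIST)
                puts("range already occupied; existing mapping preserved");
        else if (p != MAP_FAILED)
                printf("mapped exactly at %p\n", p);
        return 0;
}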
+ */ + if (page_is_file_cache(page) && PageDirty(page)) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; diff --git a/mm/nommu.c b/mm/nommu.c index 4f8720243ae7..13723736d38f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -457,18 +457,6 @@ void __weak vmalloc_sync_all(void) { } -/** - * alloc_vm_area - allocate a range of kernel address space - * @size: size of the area - * - * Returns: NULL on failure, vm_struct on success - * - * This function reserves a range of kernel address space, and - * allocates pagetables to map that range. No actual mappings - * are created. If the kernel address space is not shared - * between processes, it syncs the pagetable across all - * processes. - */ struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes) { BUG(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f2e7dfb81eee..ff992fa8760a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -185,6 +185,8 @@ static bool is_dump_unreclaim_slabs(void) * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate * @totalpages: total present RAM allowed for page allocation + * @memcg: task's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms * * The heuristic for determining which task to kill is made to be as simple and * predictable as possible. The goal is to return the highest value for the @@ -224,13 +226,6 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); - /* - * Root processes get 3% bonus, just like the __vm_enough_memory() - * implementation used by LSMs. - */ - if (has_capability_noaudit(p, CAP_SYS_ADMIN)) - points -= (points * 3) / 100; - /* Normalize to oom_score_adj units */ adj *= totalpages / 1000; points += adj; @@ -595,7 +590,8 @@ static void oom_reap_task(struct task_struct *tsk) while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task_mm(tsk, mm)) schedule_timeout_idle(HZ/10); - if (attempts <= MAX_OOM_REAP_RETRIES) + if (attempts <= MAX_OOM_REAP_RETRIES || + test_bit(MMF_OOM_SKIP, &mm->flags)) goto done; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 586f31261c83..5c1a3279e63f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2099,7 +2099,8 @@ void __init page_writeback_init(void) * so that it can tag pages faster than a dirtying process can create them). */ /* - * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock + * latency. 
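With the 3% root bonus deleted from oom_badness() above, the score reduces to memory footprint plus the normalized oom_score_adj, where each adj unit is worth a thousandth of total memory. A worked sketch of that normalization (the sample numbers are assumptions):

#include <stdio.h>

int main(void)
{
        unsigned long totalpages = (4UL << 30) / 4096;  /* 4 GiB of 4 KiB pages */
        long oom_score_adj = 300;                       /* assumed tunable value */
        unsigned long points = 50000;                   /* rss+swap+pagetables, assumed */

        /* adj *= totalpages / 1000, as in the surviving code above:
         * 300 * (1048576 / 1000) = 300 * 1048 = 314400 extra points. */
        points += oom_score_adj * (long)(totalpages / 1000);

        printf("badness = %lu\n", points);              /* 364400 */
        return 0;
}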
*/ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) @@ -2109,22 +2110,22 @@ void tag_pages_for_writeback(struct address_space *mapping, struct radix_tree_iter iter; void **slot; - spin_lock_irq(&mapping->tree_lock); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + xa_lock_irq(&mapping->i_pages); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, PAGECACHE_TAG_DIRTY) { if (iter.index > end) break; - radix_tree_iter_tag_set(&mapping->page_tree, &iter, + radix_tree_iter_tag_set(&mapping->i_pages, &iter, PAGECACHE_TAG_TOWRITE); tagged++; if ((tagged % WRITEBACK_TAG_BATCH) != 0) continue; slot = radix_tree_iter_resume(slot, &iter); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); cond_resched(); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); } - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } EXPORT_SYMBOL(tag_pages_for_writeback); @@ -2467,13 +2468,13 @@ int __set_page_dirty_nobuffers(struct page *page) return 1; } - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); BUG_ON(page_mapping(page) != mapping); WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); account_page_dirtied(page, mapping); - radix_tree_tag_set(&mapping->page_tree, page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); unlock_page_memcg(page); if (mapping->host) { @@ -2718,11 +2719,10 @@ int test_clear_page_writeback(struct page *page) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestClearPageWriteback(page); if (ret) { - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); @@ -2736,7 +2736,7 @@ int test_clear_page_writeback(struct page *page) PAGECACHE_TAG_WRITEBACK)) sb_clear_inode_writeback(mapping->host); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestClearPageWriteback(page); } @@ -2766,7 +2766,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) struct backing_dev_info *bdi = inode_to_bdi(inode); unsigned long flags; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); ret = TestSetPageWriteback(page); if (!ret) { bool on_wblist; @@ -2774,8 +2774,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); - radix_tree_tag_set(&mapping->page_tree, - page_index(page), + radix_tree_tag_set(&mapping->i_pages, page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); @@ -2789,14 +2788,12 @@ int __test_set_page_writeback(struct page *page, bool keep_write) sb_mark_inode_writeback(mapping->host); } if (!PageDirty(page)) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), PAGECACHE_TAG_DIRTY); if (!keep_write) - radix_tree_tag_clear(&mapping->page_tree, - page_index(page), + radix_tree_tag_clear(&mapping->i_pages, page_index(page), 
PAGECACHE_TAG_TOWRITE); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); } else { ret = TestSetPageWriteback(page); } @@ -2816,7 +2813,7 @@ EXPORT_SYMBOL(__test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - return radix_tree_tagged(&mapping->page_tree, tag); + return radix_tree_tagged(&mapping->i_pages, tag); } EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ea018263210..905db9d7962f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -46,7 +46,6 @@ #include <linux/stop_machine.h> #include <linux/sort.h> #include <linux/pfn.h> -#include <xen/xen.h> #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> @@ -205,17 +204,18 @@ static void __free_pages_ok(struct page *page, unsigned int order); * TBD: should special case ZONE_DMA32 machines here - in those we normally * don't need any ZONE_NORMAL reservation */ -int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { +int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { #ifdef CONFIG_ZONE_DMA - 256, + [ZONE_DMA] = 256, #endif #ifdef CONFIG_ZONE_DMA32 - 256, + [ZONE_DMA32] = 256, #endif + [ZONE_NORMAL] = 32, #ifdef CONFIG_HIGHMEM - 32, + [ZONE_HIGHMEM] = 0, #endif - 32, + [ZONE_MOVABLE] = 0, }; EXPORT_SYMBOL(totalram_pages); @@ -265,17 +265,19 @@ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; int watermark_scale_factor = 10; -static unsigned long __meminitdata nr_kernel_pages; -static unsigned long __meminitdata nr_all_pages; -static unsigned long __meminitdata dma_reserve; +static unsigned long nr_kernel_pages __meminitdata; +static unsigned long nr_all_pages __meminitdata; +static unsigned long dma_reserve __meminitdata; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP -static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -static unsigned long __initdata required_kernelcore; -static unsigned long __initdata required_movablecore; -static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; -static bool mirrored_kernelcore; +static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata; +static unsigned long required_kernelcore __initdata; +static unsigned long required_kernelcore_percent __initdata; +static unsigned long required_movablecore __initdata; +static unsigned long required_movablecore_percent __initdata; +static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata; +static bool mirrored_kernelcore __meminitdata; /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ int movable_zone; @@ -292,40 +294,6 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - -/* - * Determine how many pages need to be initialized during early boot - * (non-deferred initialization). - * The value of first_deferred_pfn will be set later, once non-deferred pages - * are initialized, but for now set it ULONG_MAX. - */ -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ - phys_addr_t start_addr, end_addr; - unsigned long max_pgcnt; - unsigned long reserved; - - /* - * Initialise at least 2G of a node but also take into account that - * two large system hashes that can take up 1GB for 0.25TB/node. 
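The sysctl_lowmem_reserve_ratio rewrite above trades positional initializers, whose meaning shifted with every CONFIG_ZONE_* ifdef, for designated initializers keyed by zone index. The benefit in miniature, with a toy enum standing in for the kernel's zone list:

#include <stdio.h>

enum zone { DMA, DMA32, NORMAL, HIGHMEM, MOVABLE, MAX_ZONES };

/* Entries may appear in any order, unnamed slots default to 0, and
 * removing a zone from the enum no longer silently shifts the values
 * that follow it. */
static const int ratio[MAX_ZONES] = {
        [DMA]    = 256,
        [DMA32]  = 256,
        [NORMAL] = 32,
};

int main(void)
{
        for (int z = 0; z < MAX_ZONES; z++)
                printf("zone %d ratio %d\n", z, ratio[z]);
        return 0;
}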
- */ - max_pgcnt = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); - - /* - * Compensate the all the memblock reservations (e.g. crash kernel) - * from the initial estimation to make sure we will initialize enough - * memory to boot. - */ - start_addr = PFN_PHYS(pgdat->node_start_pfn); - end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); - reserved = memblock_reserved_memory_within(start_addr, end_addr); - max_pgcnt += PHYS_PFN(reserved); - - pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -} - /* Returns true if the struct page for the pfn is uninitialised */ static inline bool __meminit early_page_uninitialised(unsigned long pfn) { @@ -348,9 +316,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; - /* Xen PV domains need page structures early */ - if (xen_pv_domain()) - return true; (*nr_initialised)++; if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { @@ -361,10 +326,6 @@ static inline bool update_defer_init(pg_data_t *pgdat, return true; } #else -static inline void reset_deferred_meminit(pg_data_t *pgdat) -{ -} - static inline bool early_page_uninitialised(unsigned long pfn) { return false; @@ -1099,6 +1060,15 @@ static bool bulkfree_pcp_prepare(struct page *page) } #endif /* CONFIG_DEBUG_VM */ +static inline void prefetch_buddy(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); + struct page *buddy = page + (buddy_pfn - pfn); + + prefetch(buddy); +} + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. @@ -1115,13 +1085,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; + int prefetch_nr = 0; bool isolated_pageblocks; - - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); + struct page *page, *tmp; + LIST_HEAD(head); while (count) { - struct page *page; struct list_head *list; /* @@ -1143,26 +1112,48 @@ static void free_pcppages_bulk(struct zone *zone, int count, batch_free = count; do { - int mt; /* migratetype of the to-be-freed page */ - page = list_last_entry(list, struct page, lru); - /* must delete as __free_one_page list manipulates */ + /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - - mt = get_pcppage_migratetype(page); - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); + pcp->count--; if (bulkfree_pcp_prepare(page)) continue; - __free_one_page(page, page_to_pfn(page), zone, 0, mt); - trace_mm_page_pcpu_drain(page, 0, mt); + list_add_tail(&page->lru, &head); + + /* + * We are going to put the page back to the global + * pool, prefetch its buddy to speed up later access + * under zone->lock. It is believed the overhead of + * an additional test and calculating buddy_pfn here + * can be offset by reduced memory latency later. To + * avoid excessive prefetching due to large count, only + * prefetch buddy for the first pcp->batch nr of pages. 
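A minimal standalone sketch (not from this patch) of the buddy-PFN arithmetic that prefetch_buddy() relies on: the buddy of the block starting at pfn for a given order differs from it only in bit `order`, which is what the kernel's __find_buddy_pfn() computes.

	#include <stdio.h>

	/* Mirrors the kernel's __find_buddy_pfn(): flip bit `order` of the PFN. */
	static unsigned long find_buddy_pfn(unsigned long pfn, unsigned int order)
	{
		return pfn ^ (1UL << order);
	}

	int main(void)
	{
		printf("%lu\n", find_buddy_pfn(40, 0));	/* 41: order-0 buddy of page 40 */
		printf("%lu\n", find_buddy_pfn(40, 2));	/* 44: block 40-43 pairs with 44-47 */
		return 0;
	}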
+ */ + if (prefetch_nr++ < pcp->batch) + prefetch_buddy(page); } while (--count && --batch_free && !list_empty(list)); } + + spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); + + /* + * Use safe version since after __free_one_page(), + * page->lru.next will not point to original list. + */ + list_for_each_entry_safe(page, tmp, &head, lru) { + int mt = get_pcppage_migratetype(page); + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); + + __free_one_page(page, page_to_pfn(page), zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + } spin_unlock(&zone->lock); } @@ -1181,10 +1172,9 @@ static void free_one_page(struct zone *zone, } static void __meminit __init_single_page(struct page *page, unsigned long pfn, - unsigned long zone, int nid, bool zero) + unsigned long zone, int nid) { - if (zero) - mm_zero_struct_page(page); + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1198,12 +1188,6 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, #endif } -static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, - int nid, bool zero) -{ - return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero); -} - #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT static void __meminit init_reserved_page(unsigned long pfn) { @@ -1222,7 +1206,7 @@ static void __meminit init_reserved_page(unsigned long pfn) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) break; } - __init_single_pfn(pfn, zid, nid, true); + __init_single_page(pfn_to_page(pfn), pfn, zid, nid); } #else static inline void init_reserved_page(unsigned long pfn) @@ -1506,7 +1490,7 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, } else if (!(pfn & nr_pgmask)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; - cond_resched(); + touch_nmi_watchdog(); } else { nr_free++; } @@ -1535,11 +1519,11 @@ static unsigned long __init deferred_init_pages(int nid, int zid, continue; } else if (!page || !(pfn & nr_pgmask)) { page = pfn_to_page(pfn); - cond_resched(); + touch_nmi_watchdog(); } else { page++; } - __init_single_page(page, pfn, zid, nid, true); + __init_single_page(page, pfn, zid, nid); nr_pages++; } return (nr_pages); @@ -1552,23 +1536,25 @@ static int __init deferred_init_memmap(void *data) int nid = pgdat->node_id; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long spfn, epfn; + unsigned long spfn, epfn, first_init_pfn, flags; phys_addr_t spa, epa; int zid; struct zone *zone; - unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); u64 i; + /* Bind memory initialisation thread to a local node if possible */ + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(current, cpumask); + + pgdat_resize_lock(pgdat, &flags); + first_init_pfn = pgdat->first_deferred_pfn; if (first_init_pfn == ULONG_MAX) { + pgdat_resize_unlock(pgdat, &flags); pgdat_init_report_one_done(); return 0; } - /* Bind memory initialisation thread to a local node if possible */ - if (!cpumask_empty(cpumask)) - set_cpus_allowed_ptr(current, cpumask); - /* Sanity check boundaries */ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); @@ -1598,6 +1584,7 @@ static 
int __init deferred_init_memmap(void *data) epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); deferred_free_pages(nid, zid, spfn, epfn); } + pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); @@ -1608,6 +1595,117 @@ static int __init deferred_init_memmap(void *data) pgdat_init_report_one_done(); return 0; } + +/* + * During boot we initialize deferred pages on-demand, as needed, but once + * page_alloc_init_late() has finished, the deferred pages are all initialized, + * and we can permanently disable that path. + */ +static DEFINE_STATIC_KEY_TRUE(deferred_pages); + +/* + * If this zone has deferred pages, try to grow it by initializing enough + * deferred pages to satisfy the allocation specified by order, rounded up to + * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments + * of SECTION_SIZE bytes by initializing struct pages in increments of + * PAGES_PER_SECTION * sizeof(struct page) bytes. + * + * Return true when the zone was grown, otherwise return false. We return true + * even when we grow less than requested, to let the caller decide if there are + * enough pages to satisfy the allocation. + * + * Note: We use noinline because this function is needed only during boot, and + * it is called from a __ref function _deferred_grow_zone. This way we are + * making sure that it is not inlined into the permanent text section. + */ +static noinline bool __init +deferred_grow_zone(struct zone *zone, unsigned int order) +{ + int zid = zone_idx(zone); + int nid = zone_to_nid(zone); + pg_data_t *pgdat = NODE_DATA(nid); + unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + unsigned long nr_pages = 0; + unsigned long first_init_pfn, spfn, epfn, t, flags; + unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; + phys_addr_t spa, epa; + u64 i; + + /* Only the last zone may have deferred pages */ + if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) + return false; + + pgdat_resize_lock(pgdat, &flags); + + /* + * If deferred pages have been initialized while we were waiting for + * the lock, return true, as the zone was grown. The caller will retry + * this zone. We won't return to this function since the caller also + * has this static branch. + */ + if (!static_branch_unlikely(&deferred_pages)) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + /* + * If someone grew this zone while we were waiting for the spinlock, + * return true, as there might be enough pages already.
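The control flow around the deferred_pages static key reduces to a simple pattern: a boot-time-true fast-path check, on-demand growth while it holds, and a one-time permanent disable once everything is initialized. A rough userspace sketch under assumed names (a plain bool stands in for the static key; grow_zone_on_demand is a made-up analogue of _deferred_grow_zone):

	#include <stdbool.h>
	#include <stdio.h>

	static bool deferred_pages_enabled = true;	/* stand-in for the static key */

	/* Hypothetical grower: each call initializes one more section of
	 * struct pages; the path is disabled for good when none remain. */
	static bool grow_zone_on_demand(unsigned int *sections_left)
	{
		if (*sections_left == 0)
			return false;
		if (--*sections_left == 0)
			deferred_pages_enabled = false;
		return true;	/* caller retries the allocation */
	}

	int main(void)
	{
		unsigned int sections_left = 2;

		/* Allocation slow path: retry while the zone can still be grown. */
		while (deferred_pages_enabled && grow_zone_on_demand(&sections_left))
			printf("grew zone, %u deferred sections left\n", sections_left);
		return 0;
	}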
+ */ + if (first_deferred_pfn != pgdat->first_deferred_pfn) { + pgdat_resize_unlock(pgdat, &flags); + return true; + } + + first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); + + if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + pgdat_resize_unlock(pgdat, &flags); + return false; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + + while (spfn < epfn && nr_pages < nr_pages_needed) { + t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); + first_deferred_pfn = min(t, epfn); + nr_pages += deferred_init_pages(nid, zid, spfn, + first_deferred_pfn); + spfn = first_deferred_pfn; + } + + if (nr_pages >= nr_pages_needed) + break; + } + + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); + deferred_free_pages(nid, zid, spfn, epfn); + + if (first_deferred_pfn == epfn) + break; + } + pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat_resize_unlock(pgdat, &flags); + + return nr_pages > 0; +} + +/* + * deferred_grow_zone() is __init, but it is called from + * get_page_from_freelist() during early boot until deferred_pages permanently + * disables this call. This is why we have refdata wrapper to avoid warning, + * and to ensure that the function body gets unloaded. + */ +static bool __ref +_deferred_grow_zone(struct zone *zone, unsigned int order) +{ + return deferred_grow_zone(zone, order); +} + #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ void __init page_alloc_init_late(void) @@ -1626,6 +1724,12 @@ void __init page_alloc_init_late(void) /* Block until all are initialised */ wait_for_completion(&pgdat_init_all_done_comp); + /* + * We initialized the rest of the deferred pages. Permanently disable + * on-demand struct page initialization. + */ + static_branch_disable(&deferred_pages); + /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif @@ -1639,16 +1743,38 @@ void __init page_alloc_init_late(void) } #ifdef CONFIG_CMA +static void __init adjust_present_page_count(struct page *page, long count) +{ + struct zone *zone = page_zone(page); + + /* We don't need to hold a lock since it is boot-up process */ + zone->present_pages += count; +} + /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ void __init init_cma_reserved_pageblock(struct page *page) { unsigned i = pageblock_nr_pages; + unsigned long pfn = page_to_pfn(page); struct page *p = page; + int nid = page_to_nid(page); + + /* + * ZONE_MOVABLE will steal present pages from other zones by + * changing page links so page_zone() is changed. Before that, + * we need to adjust previous zone's page count first. 
+ */ + adjust_present_page_count(page, -pageblock_nr_pages); do { __ClearPageReserved(p); set_page_count(p, 0); - } while (++p, --i); + + /* Steal pages from other zones */ + set_page_links(p, ZONE_MOVABLE, nid, pfn); + } while (++p, ++pfn, --i); + + adjust_present_page_count(page, pageblock_nr_pages); set_pageblock_migratetype(page, MIGRATE_CMA); @@ -2418,10 +2544,8 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) local_irq_save(flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); - if (to_drain > 0) { + if (to_drain > 0) free_pcppages_bulk(zone, to_drain, pcp); - pcp->count -= to_drain; - } local_irq_restore(flags); } #endif @@ -2443,10 +2567,8 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) { + if (pcp->count) free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; - } local_irq_restore(flags); } @@ -2670,7 +2792,6 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); free_pcppages_bulk(zone, batch, pcp); - pcp->count -= batch; } } @@ -2768,7 +2889,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * exists. */ watermark = min_wmark_pages(zone) + (1UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return 0; __mod_zone_freepage_state(zone, -(1UL << order), mt); @@ -3044,12 +3165,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES); -#endif - /* * Check watermarks for an order-0 allocation request. If these * are not met, then a high-order request also cannot go ahead @@ -3076,10 +3191,8 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, } #ifdef CONFIG_CMA - if ((alloc_flags & ALLOC_CMA) && - !list_empty(&area->free_list[MIGRATE_CMA])) { + if (!list_empty(&area->free_list[MIGRATE_CMA])) return true; - } #endif if (alloc_harder && !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) @@ -3099,13 +3212,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); - long cma_pages = 0; - -#ifdef CONFIG_CMA - /* If allocation can't use CMA areas don't use free CMA pages */ - if (!(alloc_flags & ALLOC_CMA)) - cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); -#endif /* * Fast check for order-0 only. If this fails then the reserves @@ -3114,7 +3220,7 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, * the caller is !atomic then it'll uselessly search the free * list. That corner case is then slower but it is harmless. */ - if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + if (!order && free_pages > mark + z->lowmem_reserve[classzone_idx]) return true; return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, @@ -3205,6 +3311,16 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, ac_classzone_idx(ac), alloc_flags)) { int ret; +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * Watermark failed for this zone, but see if we can + * grow this zone if it contains deferred pages. 
+ */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif /* Checked here to keep the fast path fast */ BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) @@ -3246,6 +3362,14 @@ try_this_zone: reserve_highatomic_pageblock(page, zone, order); return page; + } else { +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* Try again if zone has deferred pages */ + if (static_branch_unlikely(&deferred_pages)) { + if (_deferred_grow_zone(zone, order)) + goto try_this_zone; + } +#endif } } @@ -3685,16 +3809,18 @@ retry: return page; } -static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) +static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, + const struct alloc_context *ac) { struct zoneref *z; struct zone *zone; pg_data_t *last_pgdat = NULL; + enum zone_type high_zoneidx = ac->high_zoneidx; - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, - ac->high_zoneidx, ac->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx, + ac->nodemask) { if (last_pgdat != zone->zone_pgdat) - wakeup_kswapd(zone, order, ac->high_zoneidx); + wakeup_kswapd(zone, gfp_mask, order, high_zoneidx); last_pgdat = zone->zone_pgdat; } } @@ -3730,10 +3856,6 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; -#ifdef CONFIG_CMA - if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) - alloc_flags |= ALLOC_CMA; -#endif return alloc_flags; } @@ -3973,7 +4095,7 @@ retry_cpuset: goto nopage; if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); /* * The adjusted alloc_flags might result in immediate success, so try @@ -4031,7 +4153,7 @@ retry_cpuset: retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (gfp_mask & __GFP_KSWAPD_RECLAIM) - wake_all_kswapds(order, ac); + wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) @@ -4200,9 +4322,6 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return false; - if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE) - *alloc_flags |= ALLOC_CMA; - return true; } @@ -4612,6 +4731,13 @@ long si_mem_available(void) min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low); + /* + * Part of the kernel memory, which can be released under memory + * pressure. + */ + available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >> + PAGE_SHIFT; + if (available < 0) available = 0; return available; @@ -5334,6 +5460,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; unsigned long nr_initialised = 0; + struct page *page; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP struct memblock_region *r = NULL, *tmp; #endif @@ -5386,6 +5513,11 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, #endif not_early: + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zone, nid); + if (context == MEMMAP_HOTPLUG) + SetPageReserved(page); + /* * Mark the block movable so that blocks are reserved for * movable at startup. 
This will force kernel allocations @@ -5402,15 +5534,8 @@ not_early: * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { - struct page *page = pfn_to_page(pfn); - - __init_single_page(page, pfn, zone, nid, - context != MEMMAP_HOTPLUG); set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); - } else { - __init_single_pfn(pfn, zone, nid, - context != MEMMAP_HOTPLUG); } } } @@ -6079,6 +6204,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; + unsigned long node_end_pfn = 0; pgdat_resize_init(pgdat); #ifdef CONFIG_NUMA_BALANCING @@ -6106,9 +6232,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long movable_size = 0; size = zone->spanned_pages; realsize = freesize = zone->present_pages; + if (zone_end_pfn(zone) > node_end_pfn) + node_end_pfn = zone_end_pfn(zone); + /* * Adjust freesize so that it accounts for how much memory @@ -6157,12 +6287,30 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) zone_seqlock_init(zone); zone_pcp_init(zone); - if (!size) + /* + * The size of the CMA area is unknown now so we need to + * prepare the memory for the usemap at maximum. + */ + if (IS_ENABLED(CONFIG_CMA) && j == ZONE_MOVABLE && + pgdat->node_spanned_pages) { + movable_size = node_end_pfn - pgdat->node_start_pfn; + } + + if (!size && !movable_size) continue; set_pageblock_order(); - setup_usemap(pgdat, zone, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + if (movable_size) { + zone->zone_start_pfn = pgdat->node_start_pfn; + zone->spanned_pages = movable_size; + setup_usemap(pgdat, zone, + pgdat->node_start_pfn, movable_size); + init_currently_empty_zone(zone, + pgdat->node_start_pfn, movable_size); + } else { + setup_usemap(pgdat, zone, zone_start_pfn, size); + init_currently_empty_zone(zone, zone_start_pfn, size); + } memmap_init(size, nid, j, zone_start_pfn); } } @@ -6241,7 +6389,15 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, alloc_node_mem_map(pgdat); - reset_deferred_meminit(pgdat); +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. + */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +#endif free_area_init_core(pgdat); } @@ -6471,7 +6627,18 @@ static void __init find_zone_movable_pfns_for_nodes(void) } /* - * If movablecore=nn[KMG] was specified, calculate what size of + * If kernelcore=nn% or movablecore=nn% was specified, calculate the + * amount of necessary memory. + */ + if (required_kernelcore_percent) + required_kernelcore = (totalpages * 100 * required_kernelcore_percent) / + 10000UL; + if (required_movablecore_percent) + required_movablecore = (totalpages * 100 * required_movablecore_percent) / + 10000UL; + + /* + * If movablecore= was specified, calculate what size of * kernelcore that corresponds so that memory usable for * any allocation type is evenly spread. 
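A worked instance of the nn% branch above (standalone sketch with illustrative numbers): the expression (totalpages * 100 * percent) / 10000 reduces to totalpages * percent / 100.

	#include <stdio.h>

	int main(void)
	{
		unsigned long totalpages = 1UL << 20;	/* 4 GiB worth of 4 KiB pages */
		unsigned long percent = 25;		/* kernelcore=25% */

		/* Same computation as required_kernelcore above. */
		unsigned long required = (totalpages * 100 * percent) / 10000UL;

		printf("%lu pages (%lu MiB)\n", required, required >> 8);
		/* prints: 262144 pages (1024 MiB) */
		return 0;
	}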
If both kernelcore * and movablecore are specified, then the value of kernelcore @@ -6711,18 +6878,30 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) zero_resv_unavail(); } -static int __init cmdline_parse_core(char *p, unsigned long *core) +static int __init cmdline_parse_core(char *p, unsigned long *core, + unsigned long *percent) { unsigned long long coremem; + char *endptr; + if (!p) return -EINVAL; - coremem = memparse(p, &p); - *core = coremem >> PAGE_SHIFT; + /* Value may be a percentage of total memory, otherwise bytes */ + coremem = simple_strtoull(p, &endptr, 0); + if (*endptr == '%') { + /* Paranoid check for percent values greater than 100 */ + WARN_ON(coremem > 100); - /* Paranoid check that UL is enough for the coremem value */ - WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *percent = coremem; + } else { + coremem = memparse(p, &p); + /* Paranoid check that UL is enough for the coremem value */ + WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX); + *core = coremem >> PAGE_SHIFT; + *percent = 0UL; + } return 0; } @@ -6738,7 +6917,8 @@ static int __init cmdline_parse_kernelcore(char *p) return 0; } - return cmdline_parse_core(p, &required_kernelcore); + return cmdline_parse_core(p, &required_kernelcore, + &required_kernelcore_percent); } /* @@ -6747,7 +6927,8 @@ static int __init cmdline_parse_kernelcore(char *p) */ static int __init cmdline_parse_movablecore(char *p) { - return cmdline_parse_core(p, &required_movablecore); + return cmdline_parse_core(p, &required_movablecore, + &required_movablecore_percent); } early_param("kernelcore", cmdline_parse_kernelcore); @@ -6971,13 +7152,15 @@ static void setup_per_zone_lowmem_reserve(void) struct zone *lower_zone; idx--; - - if (sysctl_lowmem_reserve_ratio[idx] < 1) - sysctl_lowmem_reserve_ratio[idx] = 1; - lower_zone = pgdat->node_zones + idx; - lower_zone->lowmem_reserve[j] = managed_pages / - sysctl_lowmem_reserve_ratio[idx]; + + if (sysctl_lowmem_reserve_ratio[idx] < 1) { + sysctl_lowmem_reserve_ratio[idx] = 0; + lower_zone->lowmem_reserve[j] = 0; + } else { + lower_zone->lowmem_reserve[j] = + managed_pages / sysctl_lowmem_reserve_ratio[idx]; + } managed_pages += lower_zone->managed_pages; } } @@ -7591,7 +7774,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, cc->mode, MR_CMA); + NULL, 0, cc->mode, MR_CONTIG_RANGE); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -7611,11 +7794,11 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, * @gfp_mask: GFP mask to use during compaction * * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES - * aligned, however it's the caller's responsibility to guarantee that - * we are the only thread that changes migrate type of pageblocks the - * pages fall in. + * aligned. The PFN range must belong to a single zone. * - * The PFN range must belong to a single zone. + * The first thing this routine does is attempt to MIGRATE_ISOLATE all + * pageblocks in the range. Once isolated, the pageblocks should not + * be modified by others. * * Returns zero on success or negative error code. 
On success all * pages whose PFN is in [start, end) are allocated for the caller and @@ -7768,7 +7951,7 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTPLUG +#if defined CONFIG_MEMORY_HOTPLUG || defined CONFIG_CMA /* * The zone indicated has a new number of managed_pages; batch sizes and percpu * page high values need to be recalculated. diff --git a/mm/page_idle.c b/mm/page_idle.c index 0a49374e6931..e412a63b2b74 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -65,11 +65,15 @@ static bool page_idle_clear_pte_refs_one(struct page *page, while (page_vma_mapped_walk(&pvmw)) { addr = pvmw.address; if (pvmw.pte) { - referenced = ptep_clear_young_notify(vma, addr, - pvmw.pte); + /* + * For PTE-mapped THP, if one sub page is referenced, + * the whole THP is considered referenced. + */ + if (ptep_clear_young_notify(vma, addr, pvmw.pte)) + referenced = true; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - referenced = pmdp_clear_young_notify(vma, addr, - pvmw.pmd); + if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) + referenced = true; } else { /* unexpected pmd-mapped page? */ WARN_ON_ONCE(1); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 165ed8117bd1..43e085608846 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -28,6 +28,14 @@ static int set_migratetype_isolate(struct page *page, int migratetype, spin_lock_irqsave(&zone->lock, flags); + /* + * We assume the caller intended to SET the migrate type to isolate. + * If it is already set, then someone else must have raced and + * set it before us. Return -EBUSY. + */ + if (is_migrate_isolate_page(page)) + goto out; + pfn = page_to_pfn(page); arg.start_pfn = pfn; arg.nr_pages = pageblock_nr_pages; @@ -166,7 +174,15 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * future will not be allocated again. * * start_pfn/end_pfn must be aligned to pageblock_order. - * Returns 0 on success and -EBUSY if any part of range cannot be isolated. + * Return 0 on success and -EBUSY if any part of the range cannot be isolated. + * + * There is no high-level synchronization mechanism that prevents two threads + * from trying to isolate overlapping ranges. If this happens, one thread + * will notice pageblocks in the overlapping range already set to isolate. + * This happens in set_migratetype_isolate, which then + * returns an error. We then clean up by restoring the migration type on + * pageblocks we may have modified and return -EBUSY to the caller. This + * prevents two threads from simultaneously working on overlapping ranges. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype, bool skip_hwpoisoned_pages) @@ -293,8 +309,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, return pfn < end_pfn ?
-EBUSY : 0; } -struct page *alloc_migrate_target(struct page *page, unsigned long private, - int **resultp) +struct page *alloc_migrate_target(struct page *page, unsigned long private) { return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } diff --git a/mm/page_owner.c b/mm/page_owner.c index 7172e0a80e13..75d21a2259b3 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -35,7 +35,7 @@ static depot_stack_handle_t early_handle; static void init_early_allocated_pages(void); -static int early_page_owner_param(char *buf) +static int __init early_page_owner_param(char *buf) { if (!buf) return -EINVAL; diff --git a/mm/page_poison.c b/mm/page_poison.c index e83fd44867de..aa2b3d34e8ea 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -9,7 +9,7 @@ static bool want_page_poisoning __read_mostly; -static int early_page_poison_param(char *buf) +static int __init early_page_poison_param(char *buf) { if (!buf) return -EINVAL; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 8d2da5dec1e0..c3084ff2569d 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -258,6 +258,9 @@ static int __walk_page_range(unsigned long start, unsigned long end, /** * walk_page_range - walk page table with caller specific callbacks + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @walk: mm_walk structure defining the callbacks and the target address space * * Recursively walk the page table tree of the process represented by @walk->mm * within the virtual address range [@start, @end). During walking, we can do diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index 7a58460bfd27..063ff60ecd90 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -223,18 +223,7 @@ alloc_buffer: return 0; } - -static int percpu_stats_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, percpu_stats_show, NULL); -} - -static const struct file_operations percpu_stats_fops = { - .open = percpu_stats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(percpu_stats); static int __init init_percpu_stats_debugfs(void) { diff --git a/mm/readahead.c b/mm/readahead.c index 4d57b4644f98..539bbb6c1fad 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -175,7 +175,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, break; rcu_read_lock(); - page = radix_tree_lookup(&mapping->page_tree, page_offset); + page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); if (page && !radix_tree_exceptional_entry(page)) continue; diff --git a/mm/rmap.c b/mm/rmap.c index 144c66e688a9..f0dd4e4565bc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,11 +32,11 @@ * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in __set_page_dirty_buffers) * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) - * mapping->tree_lock (widely used) + * i_pages lock (widely used) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) * sb_lock (within inode_lock in fs/fs-writeback.c) - * mapping->tree_lock (widely used, in set_page_dirty, + * i_pages lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within bdi.wb->list_lock in __sync_single_inode) * @@ -1171,6 +1171,7 @@ void page_add_new_anon_rmap(struct page *page, /** * page_add_file_rmap - add pte mapping to a file page * @page: the page to add the mapping to + * @compound: charge the page as compound or small page * 
* The caller needs to hold the pte lock. */ diff --git a/mm/shmem.c b/mm/shmem.c index b85919243399..9d6c7e595415 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -332,12 +332,12 @@ static int shmem_radix_tree_replace(struct address_space *mapping, VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot); if (!item) return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, + __radix_tree_replace(&mapping->i_pages, node, pslot, replacement, NULL); return 0; } @@ -355,7 +355,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, void *item; rcu_read_lock(); - item = radix_tree_lookup(&mapping->page_tree, index); + item = radix_tree_lookup(&mapping->i_pages, index); rcu_read_unlock(); return item == swp_to_radix_entry(swap); } @@ -590,14 +590,14 @@ static int shmem_add_to_page_cache(struct page *page, page->mapping = mapping; page->index = index; - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); if (PageTransHuge(page)) { void __rcu **results; pgoff_t idx; int i; error = 0; - if (radix_tree_gang_lookup_slot(&mapping->page_tree, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, index, 1) && idx < index + HPAGE_PMD_NR) { error = -EEXIST; @@ -605,14 +605,14 @@ static int shmem_add_to_page_cache(struct page *page, if (!error) { for (i = 0; i < HPAGE_PMD_NR; i++) { - error = radix_tree_insert(&mapping->page_tree, + error = radix_tree_insert(&mapping->i_pages, index + i, page + i); VM_BUG_ON(error); } count_vm_event(THP_FILE_ALLOC); } } else if (!expected) { - error = radix_tree_insert(&mapping->page_tree, index, page); + error = radix_tree_insert(&mapping->i_pages, index, page); } else { error = shmem_radix_tree_replace(mapping, index, expected, page); @@ -624,10 +624,10 @@ static int shmem_add_to_page_cache(struct page *page, __inc_node_page_state(page, NR_SHMEM_THPS); __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } else { page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); page_ref_sub(page, nr); } return error; @@ -643,13 +643,13 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) VM_BUG_ON_PAGE(PageCompound(page), page); - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); error = shmem_radix_tree_replace(mapping, page->index, page, radswap); page->mapping = NULL; mapping->nrpages--; __dec_node_page_state(page, NR_FILE_PAGES); __dec_node_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); put_page(page); BUG_ON(error); } @@ -662,9 +662,9 @@ static int shmem_free_swap(struct address_space *mapping, { void *old; - spin_lock_irq(&mapping->tree_lock); - old = radix_tree_delete_item(&mapping->page_tree, index, radswap); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + old = radix_tree_delete_item(&mapping->i_pages, index, radswap); + xa_unlock_irq(&mapping->i_pages); if (old != radswap) return -ENOENT; free_swap_and_cache(radix_to_swp_entry(radswap)); @@ -675,7 +675,7 @@ static int shmem_free_swap(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. 
* - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, @@ -688,7 +688,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { if (iter.index >= end) break; @@ -717,7 +717,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * - * This is safe to call without i_mutex or mapping->tree_lock thanks to RCU, + * This is safe to call without i_mutex or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) @@ -1132,7 +1132,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = find_swap_entry(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->i_pages, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ @@ -1422,9 +1422,12 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, { struct vm_area_struct pvma; struct page *page; + struct vm_fault vmf; shmem_pseudo_vma_init(&pvma, info, index); - page = swapin_readahead(swap, gfp, &pvma, 0); + vmf.vma = &pvma; + vmf.address = 0; + page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); return page; @@ -1445,7 +1448,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, hindex = round_down(index, HPAGE_PMD_NR); rcu_read_lock(); - if (radix_tree_gang_lookup_slot(&mapping->page_tree, &results, &idx, + if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx, hindex, 1) && idx < hindex + HPAGE_PMD_NR) { rcu_read_unlock(); return NULL; @@ -1558,14 +1561,14 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. 
*/ - spin_lock_irq(&swap_mapping->tree_lock); + xa_lock_irq(&swap_mapping->i_pages); error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, newpage); if (!error) { __inc_node_page_state(newpage, NR_FILE_PAGES); __dec_node_page_state(oldpage, NR_FILE_PAGES); } - spin_unlock_irq(&swap_mapping->tree_lock); + xa_unlock_irq(&swap_mapping->i_pages); if (unlikely(error)) { /* @@ -2631,7 +2634,7 @@ static void shmem_tag_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { page = radix_tree_deref_slot(slot); if (!page || radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -2639,10 +2642,10 @@ static void shmem_tag_pins(struct address_space *mapping) continue; } } else if (page_count(page) - page_mapcount(page) > 1) { - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_set(&mapping->page_tree, iter.index, + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } if (need_resched()) { @@ -2674,7 +2677,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED)) + if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) break; if (!scan) @@ -2684,7 +2687,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, SHMEM_TAG_PINNED) { page = radix_tree_deref_slot(slot); @@ -2710,10 +2713,10 @@ static int shmem_wait_for_pins(struct address_space *mapping) error = -EBUSY; } - spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(&mapping->page_tree, + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, iter.index, SHMEM_TAG_PINNED); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { slot = radix_tree_iter_resume(slot, &iter); diff --git a/mm/slab.c b/mm/slab.c index 9095c3945425..2f308253c3d7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1869,7 +1869,7 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) return 0; } -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -1877,7 +1877,7 @@ slab_flags_t kmem_cache_flags(unsigned long object_size, } struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *cachep; @@ -1994,7 +1994,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) size_t ralign = BYTES_PER_WORD; gfp_t gfp; int err; - size_t size = cachep->size; + unsigned int size = cachep->size; #if DEBUG #if FORCED_DEBUG @@ -2291,6 +2291,18 @@ out: return nr_freed; } +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (!list_empty(&n->slabs_full) || + !list_empty(&n->slabs_partial)) + return false; + return true; +} + int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; @@ -4074,7 +4086,8 @@ next: 
next_reap_node(); out: /* Set up the next iteration */ - schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); + schedule_delayed_work_on(smp_processor_id(), work, + round_jiffies_relative(REAPTIMEOUT_AC)); } void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) diff --git a/mm/slab.h b/mm/slab.h index 51813236e773..68bdf498da3b 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -22,8 +22,8 @@ struct kmem_cache { unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ slab_flags_t flags; /* Active flags on the slab */ - size_t useroffset; /* Usercopy region offset */ - size_t usersize; /* Usercopy region size */ + unsigned int useroffset;/* Usercopy region offset */ + unsigned int usersize; /* Usercopy region size */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ @@ -77,7 +77,7 @@ extern struct kmem_cache *kmem_cache; /* A table of kmalloc cache names and sizes */ extern const struct kmalloc_info_struct { const char *name; - unsigned long size; + unsigned int size; } kmalloc_info[]; #ifndef CONFIG_SLOB @@ -93,31 +93,31 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); /* Functions provided by the slab allocators */ int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); -extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, - slab_flags_t flags, size_t useroffset, - size_t usersize); +struct kmem_cache *create_kmalloc_cache(const char *name, unsigned int size, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize); extern void create_boot_cache(struct kmem_cache *, const char *name, - size_t size, slab_flags_t flags, size_t useroffset, - size_t usersize); + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize); int slab_unmergeable(struct kmem_cache *s); -struct kmem_cache *find_mergeable(size_t size, size_t align, +struct kmem_cache *find_mergeable(unsigned size, unsigned align, slab_flags_t flags, const char *name, void (*ctor)(void *)); #ifndef CONFIG_SLOB struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)); #else static inline struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { return NULL; } -static inline slab_flags_t kmem_cache_flags(unsigned long object_size, +static inline slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -166,6 +166,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size, SLAB_TEMPORARY | \ SLAB_ACCOUNT) +bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 10f127b2de7c..98dcdc352062 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -10,6 +10,7 @@ #include <linux/poison.h> #include <linux/interrupt.h> #include <linux/memory.h> +#include <linux/cache.h> #include 
<linux/compiler.h> #include <linux/module.h> #include <linux/cpu.h> @@ -81,38 +82,19 @@ unsigned int kmem_cache_size(struct kmem_cache *s) EXPORT_SYMBOL(kmem_cache_size); #ifdef CONFIG_DEBUG_VM -static int kmem_cache_sanity_check(const char *name, size_t size) +static int kmem_cache_sanity_check(const char *name, unsigned int size) { - struct kmem_cache *s = NULL; - if (!name || in_interrupt() || size < sizeof(void *) || size > KMALLOC_MAX_SIZE) { pr_err("kmem_cache_create(%s) integrity check failed\n", name); return -EINVAL; } - list_for_each_entry(s, &slab_caches, list) { - char tmp; - int res; - - /* - * This happens when the module gets unloaded and doesn't - * destroy its slab cache and no-one else reuses the vmalloc - * area of the module. Print a warning. - */ - res = probe_kernel_address(s->name, tmp); - if (res) { - pr_err("Slab cache with size %d has lost its name\n", - s->object_size); - continue; - } - } - WARN_ON(strchr(name, ' ')); /* It confuses parsers */ return 0; } #else -static inline int kmem_cache_sanity_check(const char *name, size_t size) +static inline int kmem_cache_sanity_check(const char *name, unsigned int size) { return 0; } @@ -279,8 +261,8 @@ static inline void memcg_unlink_cache(struct kmem_cache *s) * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) +static unsigned int calculate_alignment(slab_flags_t flags, + unsigned int align, unsigned int size) { /* * If the user wants hardware cache aligned objects then follow that @@ -290,7 +272,7 @@ static unsigned long calculate_alignment(unsigned long flags, * alignment though. If that is greater then use it. */ if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign; + unsigned int ralign; ralign = cache_line_size(); while (size <= ralign / 2) @@ -330,7 +312,7 @@ int slab_unmergeable(struct kmem_cache *s) return 0; } -struct kmem_cache *find_mergeable(size_t size, size_t align, +struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, slab_flags_t flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -378,9 +360,9 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, } static struct kmem_cache *create_cache(const char *name, - size_t object_size, size_t size, size_t align, - slab_flags_t flags, size_t useroffset, - size_t usersize, void (*ctor)(void *), + unsigned int object_size, unsigned int align, + slab_flags_t flags, unsigned int useroffset, + unsigned int usersize, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; @@ -395,8 +377,7 @@ static struct kmem_cache *create_cache(const char *name, goto out; s->name = name; - s->object_size = object_size; - s->size = size; + s->size = s->object_size = object_size; s->align = align; s->ctor = ctor; s->useroffset = useroffset; @@ -451,8 +432,10 @@ out_free_cache: * as davem. 
*/ struct kmem_cache * -kmem_cache_create_usercopy(const char *name, size_t size, size_t align, - slab_flags_t flags, size_t useroffset, size_t usersize, +kmem_cache_create_usercopy(const char *name, + unsigned int size, unsigned int align, + slab_flags_t flags, + unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)) { struct kmem_cache *s = NULL; @@ -500,7 +483,7 @@ kmem_cache_create_usercopy(const char *name, size_t size, size_t align, goto out_unlock; } - s = create_cache(cache_name, size, size, + s = create_cache(cache_name, size, calculate_alignment(flags, align, size), flags, useroffset, usersize, ctor, NULL, NULL); if (IS_ERR(s)) { @@ -531,7 +514,7 @@ out_unlock: EXPORT_SYMBOL(kmem_cache_create_usercopy); struct kmem_cache * -kmem_cache_create(const char *name, size_t size, size_t align, +kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { return kmem_cache_create_usercopy(name, size, align, flags, 0, 0, @@ -647,7 +630,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg, goto out_unlock; s = create_cache(cache_name, root_cache->object_size, - root_cache->size, root_cache->align, + root_cache->align, root_cache->flags & CACHE_CREATE_MASK, root_cache->useroffset, root_cache->usersize, root_cache->ctor, memcg, root_cache); @@ -916,8 +899,9 @@ bool slab_is_available(void) #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ -void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, - slab_flags_t flags, size_t useroffset, size_t usersize) +void __init create_boot_cache(struct kmem_cache *s, const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize) { int err; @@ -932,15 +916,15 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz err = __kmem_cache_create(s, flags); if (err) - panic("Creation of kmalloc slab %s size=%zu failed. Reason %d\n", + panic("Creation of kmalloc slab %s size=%u failed. Reason %d\n", name, size, err); s->refcount = -1; /* Exempt from merging for now */ } -struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, - slab_flags_t flags, size_t useroffset, - size_t usersize) +struct kmem_cache *__init create_kmalloc_cache(const char *name, + unsigned int size, slab_flags_t flags, + unsigned int useroffset, unsigned int usersize) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); @@ -954,11 +938,11 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, return s; } -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; +struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init; EXPORT_SYMBOL(kmalloc_dma_caches); #endif @@ -968,7 +952,7 @@ EXPORT_SYMBOL(kmalloc_dma_caches); * of two cache sizes there. The size of larger slabs can be determined using * fls. 
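The two lookup regimes that comment describes can be sketched in a few lines (standalone illustration; fls_() is a local helper, since userspace lacks the kernel's fls()): requests up to 192 bytes map to a table slot via (bytes - 1) / 8, while larger ones derive the cache index from the highest set bit of size - 1.

	#include <stdio.h>

	/* Local helper: position of the highest set bit, 1-based (kernel fls()). */
	static int fls_(unsigned int x)
	{
		int r = 0;

		while (x) {
			x >>= 1;
			r++;
		}
		return r;
	}

	/* Mirrors size_index_elem() above. */
	static unsigned int size_index_elem(unsigned int bytes)
	{
		return (bytes - 1) / 8;
	}

	int main(void)
	{
		printf("%u\n", size_index_elem(24));	/* slot 2 */
		printf("%u\n", size_index_elem(192));	/* slot 23, the last entry */
		printf("%d\n", fls_(1000 - 1));		/* index 10 -> the 1024-byte cache */
		return 0;
	}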
*/ -static s8 size_index[24] = { +static u8 size_index[24] __ro_after_init = { 3, /* 8 */ 4, /* 16 */ 5, /* 24 */ @@ -995,7 +979,7 @@ static s8 size_index[24] = { 2 /* 192 */ }; -static inline int size_index_elem(size_t bytes) +static inline unsigned int size_index_elem(unsigned int bytes) { return (bytes - 1) / 8; } @@ -1006,7 +990,7 @@ static inline int size_index_elem(size_t bytes) */ struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) { - int index; + unsigned int index; if (unlikely(size > KMALLOC_MAX_SIZE)) { WARN_ON_ONCE(!(flags & __GFP_NOWARN)); @@ -1064,13 +1048,13 @@ const struct kmalloc_info_struct kmalloc_info[] __initconst = { */ void __init setup_kmalloc_cache_index_table(void) { - int i; + unsigned int i; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { - int elem = size_index_elem(i); + unsigned int elem = size_index_elem(i); if (elem >= ARRAY_SIZE(size_index)) break; @@ -1137,9 +1121,9 @@ void __init create_kmalloc_caches(slab_flags_t flags) struct kmem_cache *s = kmalloc_caches[i]; if (s) { - int size = kmalloc_size(i); + unsigned int size = kmalloc_size(i); char *n = kasprintf(GFP_NOWAIT, - "dma-kmalloc-%d", size); + "dma-kmalloc-%u", size); BUG_ON(!n); kmalloc_dma_caches[i] = create_kmalloc_cache(n, @@ -1182,10 +1166,10 @@ EXPORT_SYMBOL(kmalloc_order_trace); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ static void freelist_randomize(struct rnd_state *state, unsigned int *list, - size_t count) + unsigned int count) { - size_t i; unsigned int rand; + unsigned int i; for (i = 0; i < count; i++) list[i] = i; @@ -1532,3 +1516,11 @@ EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); + +int should_failslab(struct kmem_cache *s, gfp_t gfpflags) +{ + if (__should_failslab(s, gfpflags)) + return -ENOMEM; + return 0; +} +ALLOW_ERROR_INJECTION(should_failslab, ERRNO); diff --git a/mm/slub.c b/mm/slub.c index e381728a3751..44aa7847324a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -311,18 +311,18 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) __p += (__s)->size, __idx++) /* Determine object index from a given position */ -static inline int slab_index(void *p, struct kmem_cache *s, void *addr) +static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) { return (p - addr) / s->size; } -static inline int order_objects(int order, unsigned long size, int reserved) +static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved) { - return ((PAGE_SIZE << order) - reserved) / size; + return (((unsigned int)PAGE_SIZE << order) - reserved) / size; } -static inline struct kmem_cache_order_objects oo_make(int order, - unsigned long size, int reserved) +static inline struct kmem_cache_order_objects oo_make(unsigned int order, + unsigned int size, unsigned int reserved) { struct kmem_cache_order_objects x = { (order << OO_SHIFT) + order_objects(order, size, reserved) @@ -331,12 +331,12 @@ static inline struct kmem_cache_order_objects oo_make(int order, return x; } -static inline int oo_order(struct kmem_cache_order_objects x) +static inline unsigned int oo_order(struct kmem_cache_order_objects x) { return x.x >> OO_SHIFT; } -static inline int oo_objects(struct kmem_cache_order_objects x) +static inline unsigned int oo_objects(struct kmem_cache_order_objects x) { return x.x & 
OO_MASK; } @@ -466,7 +466,7 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) set_bit(slab_index(p, s, addr), map); } -static inline int size_from_object(struct kmem_cache *s) +static inline unsigned int size_from_object(struct kmem_cache *s) { if (s->flags & SLAB_RED_ZONE) return s->size - s->red_left_pad; @@ -598,13 +598,13 @@ static void init_tracking(struct kmem_cache *s, void *object) set_track(s, object, TRACK_ALLOC, 0UL); } -static void print_track(const char *s, struct track *t) +static void print_track(const char *s, struct track *t, unsigned long pr_time) { if (!t->addr) return; pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { int i; @@ -619,11 +619,12 @@ static void print_track(const char *s, struct track *t) static void print_tracking(struct kmem_cache *s, void *object) { + unsigned long pr_time = jiffies; if (!(s->flags & SLAB_STORE_USER)) return; - print_track("Allocated", get_track(s, object, TRACK_ALLOC)); - print_track("Freed", get_track(s, object, TRACK_FREE)); + print_track("Allocated", get_track(s, object, TRACK_ALLOC), pr_time); + print_track("Freed", get_track(s, object, TRACK_FREE), pr_time); } static void print_page_info(struct page *page) @@ -680,7 +681,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); print_section(KERN_ERR, "Object ", p, - min_t(unsigned long, s->object_size, PAGE_SIZE)); + min_t(unsigned int, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); @@ -1292,7 +1293,7 @@ out: __setup("slub_debug", setup_slub_debug); -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -1325,7 +1326,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -slab_flags_t kmem_cache_flags(unsigned long object_size, +slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t flags, const char *name, void (*ctor)(void *)) { @@ -1362,10 +1363,8 @@ static __always_inline void kfree_hook(void *x) kasan_kfree_large(x, _RET_IP_); } -static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) +static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x) { - void *freeptr; - kmemleak_free_recursive(x, s->flags); /* @@ -1385,17 +1384,12 @@ static __always_inline void *slab_free_hook(struct kmem_cache *s, void *x) if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(x, s->object_size); - freeptr = get_freepointer(s, x); - /* - * kasan_slab_free() may put x into memory quarantine, delaying its - * reuse. In this case the object's freelist pointer is changed. 
- */ - kasan_slab_free(s, x, _RET_IP_); - return freeptr; + /* KASAN might put x into memory quarantine, delaying its reuse */ + return kasan_slab_free(s, x, _RET_IP_); } -static inline void slab_free_freelist_hook(struct kmem_cache *s, - void *head, void *tail) +static inline bool slab_free_freelist_hook(struct kmem_cache *s, + void **head, void **tail) { /* * Compiler cannot detect this function can be removed if slab_free_hook() @@ -1406,13 +1400,33 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) - void *object = head; - void *tail_obj = tail ? : head; - void *freeptr; + void *object; + void *next = *head; + void *old_tail = *tail ? *tail : *head; + + /* Head and tail of the reconstructed freelist */ + *head = NULL; + *tail = NULL; do { - freeptr = slab_free_hook(s, object); - } while ((object != tail_obj) && (object = freeptr)); + object = next; + next = get_freepointer(s, object); + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ + set_freepointer(s, object, *head); + *head = object; + if (!*tail) + *tail = object; + } + } while (object != old_tail); + + if (*head == *tail) + *tail = NULL; + + return *head != NULL; +#else + return true; #endif } @@ -1435,7 +1449,7 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, gfp_t flags, int node, struct kmem_cache_order_objects oo) { struct page *page; - int order = oo_order(oo); + unsigned int order = oo_order(oo); if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); @@ -1454,8 +1468,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, /* Pre-initialize the random sequence cache */ static int init_cache_random_seq(struct kmem_cache *s) { + unsigned int count = oo_objects(s->oo); int err; - unsigned long i, count = oo_objects(s->oo); /* Bailout if already initialised */ if (s->random_seq) @@ -1470,6 +1484,8 @@ static int init_cache_random_seq(struct kmem_cache *s) /* Transform to an offset on the set of pages */ if (s->random_seq) { + unsigned int i; + for (i = 0; i < count; i++) s->random_seq[i] *= s->size; } @@ -1811,7 +1827,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, { struct page *page, *page2; void *object = NULL; - int available = 0; + unsigned int available = 0; int objects; /* @@ -2398,7 +2414,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn("SLUB: Unable to allocate memory on node %d, gfp=%#x(%pGg)\n", nid, gfpflags, &gfpflags); - pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + pr_warn(" cache: %s, object size: %u, buffer size: %u, default order: %u, min order: %u\n", s->name, s->object_size, s->size, oo_order(s->oo), oo_order(s->min)); @@ -2965,14 +2981,12 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, void *head, void *tail, int cnt, unsigned long addr) { - slab_free_freelist_hook(s, head, tail); /* - * slab_free_freelist_hook() could have put the items into quarantine. - * If so, no need to free them. + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed. 
*/ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - return; - do_slab_free(s, page, head, tail, cnt, addr); + if (slab_free_freelist_hook(s, &head, &tail)) + do_slab_free(s, page, head, tail, cnt, addr); } #ifdef CONFIG_KASAN @@ -3181,9 +3195,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * and increases the number of allocations possible without having to * take the list_lock. */ -static int slub_min_order; -static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; -static int slub_min_objects; +static unsigned int slub_min_order; +static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_min_objects; /* * Calculate the order of allocation given a slab object size. @@ -3210,20 +3224,21 @@ static int slub_min_objects; * requested a higher minimum order, then we start with that one instead of * the smallest order which will fit the object. */ -static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover, int reserved) +static inline unsigned int slab_order(unsigned int size, + unsigned int min_objects, unsigned int max_order, + unsigned int fract_leftover, unsigned int reserved) { - int order; - int rem; - int min_order = slub_min_order; + unsigned int min_order = slub_min_order; + unsigned int order; if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved)); order <= max_order; order++) { - unsigned long slab_size = PAGE_SIZE << order; + unsigned int slab_size = (unsigned int)PAGE_SIZE << order; + unsigned int rem; rem = (slab_size - reserved) % size; @@ -3234,12 +3249,11 @@ static inline int slab_order(int size, int min_objects, return order; } -static inline int calculate_order(int size, int reserved) +static inline int calculate_order(unsigned int size, unsigned int reserved) { - int order; - int min_objects; - int fraction; - int max_objects; + unsigned int order; + unsigned int min_objects; + unsigned int max_objects; /* * Attempt to find the best configuration for a slab. This @@ -3256,6 +3270,8 @@ static inline int calculate_order(int size, int reserved) min_objects = min(min_objects, max_objects); while (min_objects > 1) { + unsigned int fraction; + fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, @@ -3457,8 +3473,8 @@ static void set_cpu_partial(struct kmem_cache *s) static int calculate_sizes(struct kmem_cache *s, int forced_order) { slab_flags_t flags = s->flags; - size_t size = s->object_size; - int order; + unsigned int size = s->object_size; + unsigned int order; /* * Round up object size to the next word boundary.
We can only @@ -3548,7 +3564,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) else order = calculate_order(size, s->reserved); - if (order < 0) + if ((int)order < 0) return 0; s->allocflags = 0; @@ -3632,8 +3648,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, + panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n", + s->name, s->size, s->size, oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } @@ -3691,6 +3707,17 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) discard_slab(s, page); } +bool __kmem_cache_empty(struct kmem_cache *s) +{ + int node; + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) + if (n->nr_partial || slabs_node(s, node)) + return false; + return true; +} + /* * Release all resources used by a slab cache. */ @@ -3716,7 +3743,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) static int __init setup_slub_min_order(char *str) { - get_option(&str, &slub_min_order); + get_option(&str, (int *)&slub_min_order); return 1; } @@ -3725,8 +3752,8 @@ __setup("slub_min_order=", setup_slub_min_order); static int __init setup_slub_max_order(char *str) { - get_option(&str, &slub_max_order); - slub_max_order = min(slub_max_order, MAX_ORDER - 1); + get_option(&str, (int *)&slub_max_order); + slub_max_order = min(slub_max_order, (unsigned int)MAX_ORDER - 1); return 1; } @@ -3735,7 +3762,7 @@ __setup("slub_max_order=", setup_slub_max_order); static int __init setup_slub_min_objects(char *str) { - get_option(&str, &slub_min_objects); + get_option(&str, (int *)&slub_min_objects); return 1; } @@ -3824,7 +3851,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, bool to_user) { struct kmem_cache *s; - unsigned long offset; + unsigned int offset; size_t object_size; /* Find object and usable object size. */ @@ -4230,7 +4257,7 @@ void __init kmem_cache_init(void) cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL, slub_cpu_dead); - pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%u, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -4241,7 +4268,7 @@ void __init kmem_cache_init_late(void) } struct kmem_cache * -__kmem_cache_alias(const char *name, size_t size, size_t align, +__kmem_cache_alias(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s, *c; @@ -4254,13 +4281,12 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, * Adjust the object sizes so that we clear * the complete object on kzalloc. 
*/ - s->object_size = max(s->object_size, (int)size); - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + s->object_size = max(s->object_size, size); + s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); for_each_memcg_cache(c, s) { c->object_size = s->object_size; - c->inuse = max_t(int, c->inuse, - ALIGN(size, sizeof(void *))); + c->inuse = max(c->inuse, ALIGN(size, sizeof(void *))); } if (sysfs_slab_alias(s, name)) { @@ -4889,35 +4915,35 @@ struct slab_attribute { static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->size); + return sprintf(buf, "%u\n", s->size); } SLAB_ATTR_RO(slab_size); static ssize_t align_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->align); + return sprintf(buf, "%u\n", s->align); } SLAB_ATTR_RO(align); static ssize_t object_size_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->object_size); + return sprintf(buf, "%u\n", s->object_size); } SLAB_ATTR_RO(object_size); static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", oo_objects(s->oo)); + return sprintf(buf, "%u\n", oo_objects(s->oo)); } SLAB_ATTR_RO(objs_per_slab); static ssize_t order_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long order; + unsigned int order; int err; - err = kstrtoul(buf, 10, &order); + err = kstrtouint(buf, 10, &order); if (err) return err; @@ -4930,7 +4956,7 @@ static ssize_t order_store(struct kmem_cache *s, static ssize_t order_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", oo_order(s->oo)); + return sprintf(buf, "%u\n", oo_order(s->oo)); } SLAB_ATTR(order); @@ -4962,10 +4988,10 @@ static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long objects; + unsigned int objects; int err; - err = kstrtoul(buf, 10, &objects); + err = kstrtouint(buf, 10, &objects); if (err) return err; if (objects && !kmem_cache_has_cpu_partial(s)) @@ -5081,7 +5107,7 @@ SLAB_ATTR_RO(cache_dma); static ssize_t usersize_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%zu\n", s->usersize); + return sprintf(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); @@ -5093,7 +5119,7 @@ SLAB_ATTR_RO(destroy_by_rcu); static ssize_t reserved_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->reserved); + return sprintf(buf, "%u\n", s->reserved); } SLAB_ATTR_RO(reserved); @@ -5288,21 +5314,22 @@ SLAB_ATTR(shrink); #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->remote_node_defrag_ratio / 10); + return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10); } static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, const char *buf, size_t length) { - unsigned long ratio; + unsigned int ratio; int err; - err = kstrtoul(buf, 10, &ratio); + err = kstrtouint(buf, 10, &ratio); if (err) return err; + if (ratio > 100) + return -ERANGE; - if (ratio <= 100) - s->remote_node_defrag_ratio = ratio * 10; + s->remote_node_defrag_ratio = ratio * 10; return length; } @@ -5663,7 +5690,7 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'A'; if (p != name + 1) *p++ = '-'; - p += sprintf(p, "%07d", s->size); + p += sprintf(p, "%07u", s->size); BUG_ON(p > name + ID_STR_LENGTH - 1); return name; diff --git a/mm/sparse.c b/mm/sparse.c index 58cab483e81b..62eef264a7bd 100644 --- a/mm/sparse.c 
+++ b/mm/sparse.c @@ -779,7 +779,13 @@ int __meminit sparse_add_one_section(struct pglist_data *pgdat, goto out; } - memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); +#ifdef CONFIG_DEBUG_VM + /* + * Poison uninitialized struct pages in order to catch invalid flags + * combinations. + */ + memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION); +#endif section_mark_present(ms); diff --git a/mm/swap.c b/mm/swap.c index 0f17330dd0e5..3dd518832096 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -707,7 +707,6 @@ void lru_add_drain_all(void) * release_pages - batched put_page() * @pages: array of pages to release * @nr: number of pages - * @cold: whether the pages are cache cold * * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. diff --git a/mm/swap_slots.c b/mm/swap_slots.c index bebc19292018..f2641894f440 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -34,8 +34,6 @@ #include <linux/mutex.h> #include <linux/mm.h> -#ifdef CONFIG_SWAP - static DEFINE_PER_CPU(struct swap_slots_cache, swp_slots); static bool swap_slot_cache_active; bool swap_slot_cache_enabled; @@ -356,5 +354,3 @@ repeat: return entry; } - -#endif /* CONFIG_SWAP */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 39ae7cfad90f..07f9aa2340c3 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -38,7 +38,7 @@ static const struct address_space_operations swap_aops = { struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; -bool swap_vma_readahead __read_mostly = true; +static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) @@ -124,10 +124,10 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) SetPageSwapCache(page); address_space = swap_address_space(entry); - spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); for (i = 0; i < nr; i++) { set_page_private(page + i, entry.val + i); - error = radix_tree_insert(&address_space->page_tree, + error = radix_tree_insert(&address_space->i_pages, idx + i, page + i); if (unlikely(error)) break; @@ -145,13 +145,13 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry) VM_BUG_ON(error == -EEXIST); set_page_private(page + i, 0UL); while (i--) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0UL); } ClearPageSwapCache(page); page_ref_sub(page, nr); } - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); return error; } @@ -188,7 +188,7 @@ void __delete_from_swap_cache(struct page *page) address_space = swap_address_space(entry); idx = swp_offset(entry); for (i = 0; i < nr; i++) { - radix_tree_delete(&address_space->page_tree, idx + i); + radix_tree_delete(&address_space->i_pages, idx + i); set_page_private(page + i, 0); } ClearPageSwapCache(page); @@ -272,9 +272,9 @@ void delete_from_swap_cache(struct page *page) entry.val = page_private(page); address_space = swap_address_space(entry); - spin_lock_irq(&address_space->tree_lock); + xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(page); - spin_unlock_irq(&address_space->tree_lock); + xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); page_ref_sub(page, hpage_nr_pages(page)); @@ -322,6 +322,11 @@ void free_pages_and_swap_cache(struct page **pages, int nr) 
release_pages(pagep, nr); } +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); +} + /* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel @@ -332,32 +337,43 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; - unsigned long ra_info; - int win, hits, readahead; page = find_get_page(swap_address_space(entry), swp_offset(entry)); INC_CACHE_INFO(find_total); if (page) { + bool vma_ra = swap_use_vma_readahead(); + bool readahead; + INC_CACHE_INFO(find_success); + /* + * At the moment, we don't support PG_readahead for anon THP + * so let's bail out rather than confusing the readahead stat. + */ if (unlikely(PageTransCompound(page))) return page; + readahead = TestClearPageReadahead(page); - if (vma) { - ra_info = GET_SWAP_RA_VAL(vma); - win = SWAP_RA_WIN(ra_info); - hits = SWAP_RA_HITS(ra_info); + if (vma && vma_ra) { + unsigned long ra_val; + int win, hits; + + ra_val = GET_SWAP_RA_VAL(vma); + win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); if (readahead) hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(addr, win, hits)); } + if (readahead) { count_vm_event(SWAP_RA_HIT); - if (!vma) + if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } } + return page; } @@ -533,11 +549,10 @@ static unsigned long swapin_nr_pages(unsigned long offset) } /** - * swapin_readahead - swap in pages in hope we need them soon + * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vma: user vma this address belongs to - * @addr: target address for mempolicy + * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. * @@ -549,10 +564,10 @@ static unsigned long swapin_nr_pages(unsigned long offset) * This has been extended to use the NUMA policies from the mm triggering * the readahead. * - * Caller must hold down_read on the vma->vm_mm if vma is not NULL. + * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL. 
*/ -struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) +struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_fault *vmf) { struct page *page; unsigned long entry_offset = swp_offset(entry); @@ -562,6 +577,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; bool do_poll = true, page_allocated; + struct vm_area_struct *vma = vmf->vma; + unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -586,8 +603,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, continue; if (page_allocated) { swap_readpage(page, false); - if (offset != entry_offset && - likely(!PageTransCompound(page))) { + if (offset != entry_offset) { SetPageReadahead(page); count_vm_event(SWAP_RA); } @@ -612,12 +628,11 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) return -ENOMEM; for (i = 0; i < nr; i++) { space = spaces + i; - INIT_RADIX_TREE(&space->page_tree, GFP_ATOMIC|__GFP_NOWARN); + INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN); atomic_set(&space->i_mmap_writable, 0); space->a_ops = &swap_aops; /* swap cache doesn't use writeback related tags */ mapping_set_no_writeback_tags(space); - spin_lock_init(&space->tree_lock); } nr_swapper_spaces[type] = nr; rcu_assign_pointer(swapper_spaces[type], spaces); @@ -649,16 +664,15 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma, PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE)); } -struct page *swap_readahead_detect(struct vm_fault *vmf, - struct vma_swap_readahead *swap_ra) +static void swap_ra_info(struct vm_fault *vmf, + struct vma_swap_readahead *ra_info) { struct vm_area_struct *vma = vmf->vma; - unsigned long swap_ra_info; - struct page *page; + unsigned long ra_val; swp_entry_t entry; unsigned long faddr, pfn, fpfn; unsigned long start, end; - pte_t *pte; + pte_t *pte, *orig_pte; unsigned int max_win, hits, prev_win, win, left; #ifndef CONFIG_64BIT pte_t *tpte; @@ -667,30 +681,32 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING); if (max_win == 1) { - swap_ra->win = 1; - return NULL; + ra_info->win = 1; + return; } faddr = vmf->address; - entry = pte_to_swp_entry(vmf->orig_pte); - if ((unlikely(non_swap_entry(entry)))) - return NULL; - page = lookup_swap_cache(entry, vma, faddr); - if (page) - return page; + orig_pte = pte = pte_offset_map(vmf->pmd, faddr); + entry = pte_to_swp_entry(*pte); + if ((unlikely(non_swap_entry(entry)))) { + pte_unmap(orig_pte); + return; + } fpfn = PFN_DOWN(faddr); - swap_ra_info = GET_SWAP_RA_VAL(vma); - pfn = PFN_DOWN(SWAP_RA_ADDR(swap_ra_info)); - prev_win = SWAP_RA_WIN(swap_ra_info); - hits = SWAP_RA_HITS(swap_ra_info); - swap_ra->win = win = __swapin_nr_pages(pfn, fpfn, hits, + ra_val = GET_SWAP_RA_VAL(vma); + pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val)); + prev_win = SWAP_RA_WIN(ra_val); + hits = SWAP_RA_HITS(ra_val); + ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits, max_win, prev_win); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0)); - if (win == 1) - return NULL; + if (win == 1) { + pte_unmap(orig_pte); + return; + } /* Copy the PTEs because the page table may be unmapped */ if (fpfn == pfn + 1) @@ -703,23 +719,21 @@ struct page *swap_readahead_detect(struct vm_fault *vmf, swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left, &start, &end); } 
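/* Illustrative aside, not part of the patch: the three swap_ra_clamp_pfn() calls above bias the readahead window by fault direction -- a forward scan reads ahead of the faulting pfn, a backward scan reads behind it, and otherwise the window is roughly centred. For win == 8, the centred case computes left = (8 - 1) / 2 = 3 and requests [fpfn - 3, fpfn + 5) before clamping to the VMA and PMD bounds. */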
- swap_ra->nr_pte = end - start; - swap_ra->offset = fpfn - start; - pte = vmf->pte - swap_ra->offset; + ra_info->nr_pte = end - start; + ra_info->offset = fpfn - start; + pte -= ra_info->offset; #ifdef CONFIG_64BIT - swap_ra->ptes = pte; + ra_info->ptes = pte; #else - tpte = swap_ra->ptes; + tpte = ra_info->ptes; for (pfn = start; pfn != end; pfn++) *tpte++ = *pte++; #endif - - return NULL; + pte_unmap(orig_pte); } -struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, - struct vm_fault *vmf, - struct vma_swap_readahead *swap_ra) +static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, + struct vm_fault *vmf) { struct blk_plug plug; struct vm_area_struct *vma = vmf->vma; @@ -728,12 +742,14 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, swp_entry_t entry; unsigned int i; bool page_allocated; + struct vma_swap_readahead ra_info = {0,}; - if (swap_ra->win == 1) + swap_ra_info(vmf, &ra_info); + if (ra_info.win == 1) goto skip; blk_start_plug(&plug); - for (i = 0, pte = swap_ra->ptes; i < swap_ra->nr_pte; + for (i = 0, pte = ra_info.ptes; i < ra_info.nr_pte; i++, pte++) { pentry = *pte; if (pte_none(pentry)) @@ -749,8 +765,7 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; if (page_allocated) { swap_readpage(page, false); - if (i != swap_ra->offset && - likely(!PageTransCompound(page))) { + if (i != ra_info.offset) { SetPageReadahead(page); count_vm_event(SWAP_RA); } @@ -761,23 +776,43 @@ struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, lru_add_drain(); skip: return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - swap_ra->win == 1); + ra_info.win == 1); +} + +/** + * swapin_readahead - swap in pages in hope we need them soon + * @entry: swap entry of this memory + * @gfp_mask: memory allocation flags + * @vmf: fault information + * + * Returns the struct page for entry and addr, after queueing swapin. + * + * It's the main entry point for swap readahead. Depending on the + * configuration, it reads ahead blocks with either cluster-based (i.e., + * physical disk based) or vma-based (i.e., virtual address based on the + * faulting address) readahead. + */ +struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, + struct vm_fault *vmf) +{ + return swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, vmf) : + swap_cluster_readahead(entry, gfp_mask, vmf); } #ifdef CONFIG_SYSFS static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", swap_vma_readahead ? "true" : "false"); + return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false"); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1)) - swap_vma_readahead = true; + enable_vma_readahead = true; else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1)) - swap_vma_readahead = false; + enable_vma_readahead = false; else return -EINVAL; diff --git a/mm/swapfile.c b/mm/swapfile.c index c7a33717d079..cc2cf04d9018 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock.
*/ -struct plist_head *swap_avail_heads; +static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -2961,6 +2961,10 @@ static unsigned long read_swap_header(struct swap_info_struct *p, maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; last_page = swap_header->info.last_page; + if (!last_page) { + pr_warn("Empty swap-file\n"); + return 0; + } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", maxpages << (PAGE_SHIFT - 10), diff --git a/mm/truncate.c b/mm/truncate.c index c34e2fd4f583..1d2fb2dca96f 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -36,11 +36,11 @@ static inline void __clear_shadow_entry(struct address_space *mapping, struct radix_tree_node *node; void **slot; - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot)) return; if (*slot != entry) return; - __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + __radix_tree_replace(&mapping->i_pages, node, slot, NULL, workingset_update_node); mapping->nrexceptional--; } @@ -48,9 +48,9 @@ static inline void __clear_shadow_entry(struct address_space *mapping, static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, void *entry) { - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); __clear_shadow_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); } /* @@ -79,7 +79,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, dax = dax_mapping(mapping); lock = !dax && indices[j] < end; if (lock) - spin_lock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); for (i = j; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -102,7 +102,7 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping, } if (lock) - spin_unlock_irq(&mapping->tree_lock); + xa_unlock_irq(&mapping->i_pages); pvec->nr = j; } @@ -518,8 +518,8 @@ void truncate_inode_pages_final(struct address_space *mapping) * modification that does not see AS_EXITING is * completed before starting the final truncate. 
*/ - spin_lock_irq(&mapping->tree_lock); - spin_unlock_irq(&mapping->tree_lock); + xa_lock_irq(&mapping->i_pages); + xa_unlock_irq(&mapping->i_pages); truncate_inode_pages(mapping, 0); } @@ -627,13 +627,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) return 0; - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); if (PageDirty(page)) goto failed; BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -641,7 +641,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) put_page(page); /* pagecache ref */ return 1; failed: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } diff --git a/mm/util.c b/mm/util.c index c1250501364f..45fc3169e7b0 100644 --- a/mm/util.c +++ b/mm/util.c @@ -287,7 +287,7 @@ int vma_is_stack_for_current(struct vm_area_struct *vma) } #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) -void arch_pick_mmap_layout(struct mm_struct *mm) +void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; @@ -297,8 +297,10 @@ void arch_pick_mmap_layout(struct mm_struct *mm) /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. - * If the architecture not support this function, simply return with no - * page pinned + * Note a difference with get_user_pages_fast: this always returns the + * number of pages pinned, 0 if no pages were pinned. + * If the architecture does not support this function, simply return with no + * pages pinned. */ int __weak __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) @@ -515,6 +517,16 @@ struct address_space *page_mapping(struct page *page) } EXPORT_SYMBOL(page_mapping); +/* + * For file cache pages, return the address_space, otherwise return NULL + */ +struct address_space *page_mapping_file(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return NULL; + return page_mapping(page); +} + /* Slow path of page_mapcount() for compound pages */ int __page_mapcount(struct page *page) { @@ -658,6 +670,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free += global_node_page_state(NR_SLAB_RECLAIMABLE); /* + * Part of the kernel memory, which can be released + * under memory pressure. + */ + free += global_node_page_state( + NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT; + + /* * Leave reserved pages. The pages are not for anonymous pages. 
*/ if (free <= totalreserve_pages) diff --git a/mm/vmscan.c b/mm/vmscan.c index cd5dc3faaa57..8b920ce3ae02 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -116,6 +116,16 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; }; #ifdef ARCH_HAS_PREFETCH @@ -190,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc) #endif return false; } + +static void set_memcg_congestion(pg_data_t *pgdat, + struct mem_cgroup *memcg, + bool congested) +{ + struct mem_cgroup_per_node *mn; + + if (!memcg) + return; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + WRITE_ONCE(mn->congested, congested); +} + +static bool memcg_congested(pg_data_t *pgdat, + struct mem_cgroup *memcg) +{ + struct mem_cgroup_per_node *mn; + + mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id); + return READ_ONCE(mn->congested); + +} #else static bool global_reclaim(struct scan_control *sc) { @@ -200,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc) { return true; } + +static inline void set_memcg_congestion(struct pglist_data *pgdat, + struct mem_cgroup *memcg, bool congested) +{ +} + +static inline bool memcg_congested(struct pglist_data *pgdat, + struct mem_cgroup *memcg) +{ + return false; + +} #endif /* @@ -442,16 +487,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; - if (!down_read_trylock(&shrinker_rwsem)) { - /* - * If we would return 0, our callers would understand that we - * have nothing else to shrink and give up trying. By returning - * 1 we keep it going and assume we'll be able to shrink next - * time. - */ - freed = 1; + if (!down_read_trylock(&shrinker_rwsem)) goto out; - } list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { @@ -656,7 +693,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irqsave(&mapping->tree_lock, flags); + xa_lock_irqsave(&mapping->i_pages, flags); /* * The non racy check for a busy page. * @@ -680,7 +717,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, - * and thus under tree_lock, then this ordering is not required. + * and thus under the i_pages lock, then this ordering is not required. */ if (unlikely(PageTransHuge(page)) && PageSwapCache(page)) refcount = 1 + HPAGE_PMD_NR; @@ -698,7 +735,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); put_swap_page(page, swap); } else { void (*freepage)(struct page *); @@ -719,13 +756,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * only page cache pages found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the - * same page_tree. + * same address_space. 
*/ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); if (freepage != NULL) freepage(page); @@ -734,7 +771,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irqrestore(&mapping->tree_lock, flags); + xa_unlock_irqrestore(&mapping->i_pages, flags); return 0; } @@ -865,17 +902,6 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } -struct reclaim_stat { - unsigned nr_dirty; - unsigned nr_unqueued_dirty; - unsigned nr_congested; - unsigned nr_writeback; - unsigned nr_immediate; - unsigned nr_activate; - unsigned nr_ref_keep; - unsigned nr_unmap_fail; -}; - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -934,7 +960,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* - * The number of dirty pages determines if a zone is marked + * The number of dirty pages determines if a node is marked * reclaim_congested which affects wait_iff_congested. kswapd * will stall and start writing pages if the tail of the LRU * is all dirty unqueued pages. @@ -1763,23 +1789,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, free_unref_page_list(&page_list); /* - * If reclaim is isolating dirty pages under writeback, it implies - * that the long-lived page allocation rate is exceeding the page - * laundering rate. Either the global limits are not being effective - * at throttling processes due to the page distribution throughout - * zones or there is heavy usage of a slow backing device. The - * only option is to throttle from reclaim context which is not ideal - * as there is no guarantee the dirtying process is throttled in the - * same way balance_dirty_pages() manages. - * - * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number - * of pages under pages flagged for immediate reclaim and stall if any - * are encountered in the nr_immediate check below. - */ - if (stat.nr_writeback && stat.nr_writeback == nr_taken) - set_bit(PGDAT_WRITEBACK, &pgdat->flags); - - /* * If dirty pages are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can * happen when memory pressure pushes dirty pages to the end of @@ -1793,48 +1802,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_unqueued_dirty == nr_taken) wakeup_flusher_threads(WB_REASON_VMSCAN); - /* - * Legacy memcg will stall in page writeback so avoid forcibly - * stalling here. - */ - if (sane_reclaim(sc)) { - /* - * Tag a zone as congested if all the dirty pages scanned were - * backed by a congested BDI and wait_iff_congested will stall. - */ - if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) - set_bit(PGDAT_CONGESTED, &pgdat->flags); - - /* Allow kswapd to start writing pages during reclaim. */ - if (stat.nr_unqueued_dirty == nr_taken) - set_bit(PGDAT_DIRTY, &pgdat->flags); - - /* - * If kswapd scans pages marked marked for immediate - * reclaim and under writeback (nr_immediate), it implies - * that pages are cycling through the LRU faster than - * they are written so also forcibly stall. 
- */ - if (stat.nr_immediate && current_may_throttle()) - congestion_wait(BLK_RW_ASYNC, HZ/10); - } - - /* - * Stall direct reclaim for IO completions if underlying BDIs or zone - * is congested. Allow kswapd to continue until it starts encountering - * unqueued dirty pages or cycling through the LRU too quickly. - */ - if (!sc->hibernation_mode && !current_is_kswapd() && - current_may_throttle()) - wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10); + sc->nr.dirty += stat.nr_dirty; + sc->nr.congested += stat.nr_congested; + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; + sc->nr.writeback += stat.nr_writeback; + sc->nr.immediate += stat.nr_immediate; + sc->nr.taken += nr_taken; + if (file) + sc->nr.file_taken += nr_taken; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, - nr_scanned, nr_reclaimed, - stat.nr_dirty, stat.nr_writeback, - stat.nr_congested, stat.nr_immediate, - stat.nr_activate, stat.nr_ref_keep, - stat.nr_unmap_fail, - sc->priority, file); + nr_scanned, nr_reclaimed, &stat, sc->priority, file); return nr_reclaimed; } @@ -2515,6 +2493,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return true; } +static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg) +{ + return test_bit(PGDAT_CONGESTED, &pgdat->flags) || + (memcg && memcg_congested(pgdat, memcg)); +} + static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) { struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2530,6 +2514,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long node_lru_pages = 0; struct mem_cgroup *memcg; + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; nr_scanned = sc->nr_scanned; @@ -2544,7 +2530,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) sc->memcg_low_skipped = 1; continue; } - mem_cgroup_event(memcg, MEMCG_LOW); + memcg_memory_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; @@ -2595,6 +2581,67 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (sc->nr_reclaimed - nr_reclaimed) reclaimable = true; + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under writeback flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* + * Tag a node as congested if all the dirty pages + * scanned were backed by a congested BDI and + * wait_iff_congested will stall. + */
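/* Illustrative aside, not part of the patch: folding the per-batch reclaim_stat into the cumulative sc->nr counters lets shrink_node() evaluate whole-pass conditions once per node instead of once per shrink_inactive_list() call; "every isolated page was under writeback", for instance, becomes sc->nr.writeback == sc->nr.taken over the full pass. */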
+ if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(PGDAT_CONGESTED, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim. */ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in wait_iff_congested(). + */ + if (!global_reclaim(sc) && sane_reclaim(sc) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_memcg_congestion(pgdat, root, true); + + /* + * Stall direct reclaim for IO completions if the underlying BDIs + * and the node are congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle() && pgdat_memcg_congested(pgdat, root)) + wait_iff_congested(BLK_RW_ASYNC, HZ/10); + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); @@ -2810,6 +2857,7 @@ retry: continue; last_pgdat = zone->zone_pgdat; snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false); } delayacct_freepages_end(); @@ -3547,16 +3595,21 @@ kswapd_try_sleep: } /* - * A zone is low on free memory, so wake its kswapd task to service it. + * A zone is low on free memory or too fragmented for high-order memory. If + * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's + * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim + * has failed or is not needed, still wake up kcompactd if only compaction is + * needed. */ -void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) +void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, + enum zone_type classzone_idx) { pg_data_t *pgdat; if (!managed_zone(zone)) return; - if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) + if (!cpuset_zone_allowed(zone, gfp_flags)) return; pgdat = zone->zone_pgdat; pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, @@ -3565,14 +3618,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!waitqueue_active(&pgdat->kswapd_wait)) return; - /* Hopeless node, leave it to direct reclaim */ - if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) - return; - - if (pgdat_balanced(pgdat, order, classzone_idx)) + /* Hopeless node, leave it to direct reclaim if possible */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || + pgdat_balanced(pgdat, order, classzone_idx)) { + /* + * There may be plenty of free memory available, but it's too + * fragmented for high-order allocations. Wake up kcompactd + * and rely on compaction_suitable() to determine if it's + * needed. If it fails, it will defer subsequent attempts to + * ratelimit its work. + */
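/* Illustrative aside, not part of the patch: a gfp_mask without __GFP_DIRECT_RECLAIM (GFP_NOWAIT, for example) marks a caller that cannot reclaim or compact on its own, so waking kcompactd here is the only remaining way to service its high-order request from a balanced but fragmented node. */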
+ if (!(gfp_flags & __GFP_DIRECT_RECLAIM)) + wakeup_kcompactd(pgdat, order, classzone_idx); return; + } - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order, + gfp_flags); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3802,7 +3864,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* - * Free memory by calling shrink zone with increasing + * Free memory by calling shrink node with increasing * priorities until we have enough memory freed. */ do { @@ -3877,7 +3939,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) */ int page_evictable(struct page *page) { - return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + int ret; + + /* Prevent address_space of inode and swap cache from being freed */ + rcu_read_lock(); + ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page); + rcu_read_unlock(); + return ret; } #ifdef CONFIG_SHMEM diff --git a/mm/vmstat.c b/mm/vmstat.c index 33581be705f0..536332e988b8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1161,6 +1161,7 @@ const char * const vmstat_text[] = { "nr_vmscan_immediate_reclaim", "nr_dirtied", "nr_written", + "nr_indirectly_reclaimable", /* enum writeback_stat_item counters */ "nr_dirty_threshold", diff --git a/mm/workingset.c b/mm/workingset.c index b7d616a3bbbe..40ee02c83978 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -202,7 +202,7 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat, * @mapping: address space the page was backing * @page: the page being evicted * - * Returns a shadow entry to be stored in @mapping->page_tree in place + * Returns a shadow entry to be stored in @mapping->i_pages in place * of the evicted @page so that a later refault can be detected. */ void *workingset_eviction(struct address_space *mapping, struct page *page) @@ -348,7 +348,7 @@ void workingset_update_node(struct radix_tree_node *node) * * Avoid acquiring the list_lru lock when the nodes are * already where they should be. The list_empty() test is safe - * as node->private_list is protected by &mapping->tree_lock. + * as node->private_list is protected by the i_pages lock. */ if (node->count && node->count == node->exceptional) { if (list_empty(&node->private_list)) @@ -366,7 +366,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, unsigned long nodes; unsigned long cache; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); @@ -419,21 +419,21 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* * Page cache insertions and deletions synchronously maintain - * the shadow node LRU under the mapping->tree_lock and the + * the shadow node LRU under the i_pages lock and the * lru_lock. Because the page cache tree is emptied before * the inode can be destroyed, holding the lru_lock pins any * address_space that has radix tree nodes on the LRU. * - * We can then safely transition to the mapping->tree_lock to + * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want * to reclaim, take the node off-LRU, and drop the lru_lock.
*/ node = container_of(item, struct radix_tree_node, private_list); - mapping = container_of(node->root, struct address_space, page_tree); + mapping = container_of(node->root, struct address_space, i_pages); /* Coming from the list, invert the lock order */ - if (!spin_trylock(&mapping->tree_lock)) { + if (!xa_trylock(&mapping->i_pages)) { spin_unlock(lru_lock); ret = LRU_RETRY; goto out; @@ -468,11 +468,11 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); - __radix_tree_delete_node(&mapping->page_tree, node, + __radix_tree_delete_node(&mapping->i_pages, node, workingset_lookup_update(mapping)); out_invalid: - spin_unlock(&mapping->tree_lock); + xa_unlock(&mapping->i_pages); ret = LRU_REMOVED_RETRY; out: local_irq_enable(); @@ -487,7 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, { unsigned long ret; - /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + /* list_lru lock nests inside the IRQ-safe i_pages lock */ local_irq_disable(); ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); @@ -503,7 +503,7 @@ static struct shrinker workingset_shadow_shrinker = { /* * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe - * mapping->tree_lock. + * i_pages lock. */ static struct lock_class_key shadow_nodes_key; diff --git a/mm/z3fold.c b/mm/z3fold.c index d589d318727f..c0bca6153b95 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -467,6 +467,8 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); + if (!pool->unbuddied) + goto out_pool; for_each_possible_cpu(cpu) { struct list_head *unbuddied = per_cpu_ptr(pool->unbuddied, cpu); @@ -479,7 +481,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->name = name; pool->compact_wq = create_singlethread_workqueue(pool->name); if (!pool->compact_wq) - goto out; + goto out_unbuddied; pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; @@ -489,8 +491,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, out_wq: destroy_workqueue(pool->compact_wq); -out: +out_unbuddied: + free_percpu(pool->unbuddied); +out_pool: kfree(pool); +out: return NULL; } @@ -533,7 +538,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; - bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM; + bool can_sleep = gfpflags_allow_blocking(gfp); if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; @@ -620,24 +625,27 @@ lookup: bud = FIRST; } - spin_lock(&pool->stale_lock); - zhdr = list_first_entry_or_null(&pool->stale, - struct z3fold_header, buddy); - /* - * Before allocating a page, let's see if we can take one from the - * stale pages list. cancel_work_sync() can sleep so we must make - * sure it won't be called in case we're in atomic context. 
- */ - if (zhdr && (can_sleep || !work_pending(&zhdr->work))) { - list_del(&zhdr->buddy); - spin_unlock(&pool->stale_lock); - if (can_sleep) + page = NULL; + if (can_sleep) { + spin_lock(&pool->stale_lock); + zhdr = list_first_entry_or_null(&pool->stale, + struct z3fold_header, buddy); + /* + * Before allocating a page, let's see if we can take one from + * the stale pages list. cancel_work_sync() can sleep so we + * limit this case to the contexts where we can sleep + */ + if (zhdr) { + list_del(&zhdr->buddy); + spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); - page = virt_to_page(zhdr); - } else { - spin_unlock(&pool->stale_lock); - page = alloc_page(gfp); + page = virt_to_page(zhdr); + } else { + spin_unlock(&pool->stale_lock); + } } + if (!page) + page = alloc_page(gfp); if (!page) return -ENOMEM; diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index b7f61cd1c709..61cb05dc950c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -193,6 +193,7 @@ static struct vfsmount *zsmalloc_mnt; * (see: fix_fullness_group()) */ static const int fullness_threshold_frac = 4; +static size_t huge_class_size; struct size_class { spinlock_t lock; @@ -642,18 +643,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) return 0; } - -static int zs_stats_size_open(struct inode *inode, struct file *file) -{ - return single_open(file, zs_stats_size_show, inode->i_private); -} - -static const struct file_operations zs_stat_size_ops = { - .open = zs_stats_size_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(zs_stats_size); static void zs_pool_stat_create(struct zs_pool *pool, const char *name) { @@ -672,7 +662,7 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name) pool->stat_dentry = entry; entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, - pool->stat_dentry, pool, &zs_stat_size_ops); + pool->stat_dentry, pool, &zs_stats_size_fops); if (!entry) { pr_warn("%s: debugfs file entry <%s> creation failed\n", name, "classes"); @@ -861,6 +851,7 @@ static struct page *get_next_page(struct page *page) /** * obj_to_location - get (<page>, <obj_idx>) from encoded object value + * @obj: the encoded object value * @page: page object resides in zspage * @obj_idx: object index */ @@ -1311,6 +1302,7 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages); * zs_map_object - get address of allocated object from handle. * @pool: pool from which the object was allocated * @handle: handle returned from zs_malloc + * @mm: mapping mode to use * * Before using an object allocated from zs_malloc, it must be mapped using * this function. When done with the object, it must be unmapped using @@ -1418,6 +1410,25 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); +/** + * zs_huge_class_size() - Returns the size (in bytes) of the first huge + * zsmalloc &size_class. + * @pool: zsmalloc pool to use + * + * The function returns the size of the first huge class - any object of equal + * or bigger size will be stored in a zspage consisting of a single physical + * page. + * + * Context: Any context. + * + * Return: the size (in bytes) of the first huge zsmalloc &size_class. + */
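/* Illustrative aside, not part of the patch: because zs_malloc() adds ZS_HANDLE_SIZE to a request before class lookup, an object up to ZS_HANDLE_SIZE - 1 bytes smaller than the first huge class still lands in it; the ZS_HANDLE_SIZE - 1 subtraction in zs_create_pool() below makes this function report that effective cutoff. */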
+size_t zs_huge_class_size(struct zs_pool *pool) +{ + return huge_class_size; +} +EXPORT_SYMBOL_GPL(zs_huge_class_size); + static unsigned long obj_malloc(struct size_class *class, struct zspage *zspage, unsigned long handle) { @@ -2375,6 +2386,27 @@ struct zs_pool *zs_create_pool(const char *name) objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* + * We iterate from biggest down to smallest classes, + * so huge_class_size holds the size of the first huge + * class. Any object bigger than or equal to that will + * end up in the huge class. + */ + if (pages_per_zspage != 1 && objs_per_zspage != 1 && + !huge_class_size) { + huge_class_size = size; + /* + * The object uses ZS_HANDLE_SIZE bytes to store the + * handle. We need to subtract it, because zs_malloc() + * unconditionally adds handle size before it performs + * size class search - so object may be smaller than + * huge class size, yet it still can end up in the huge + * class because it grows by ZS_HANDLE_SIZE extra bytes + * right before class lookup. + */ + huge_class_size -= (ZS_HANDLE_SIZE - 1); + } + + /* * size_class is used for normal zsmalloc operation such * as alloc/free for that size. Although it is natural that we * have one size_class for each size, there is a chance that we