diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 4 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 16 | ||||
-rw-r--r-- | mm/compaction.c | 32 | ||||
-rw-r--r-- | mm/filemap.c | 490 | ||||
-rw-r--r-- | mm/fremap.c | 28 | ||||
-rw-r--r-- | mm/huge_memory.c | 90 | ||||
-rw-r--r-- | mm/hugetlb.c | 294 | ||||
-rw-r--r-- | mm/hugetlb_cgroup.c | 11 | ||||
-rw-r--r-- | mm/kmemleak.c | 140 | ||||
-rw-r--r-- | mm/ksm.c | 2 | ||||
-rw-r--r-- | mm/list_lru.c | 16 | ||||
-rw-r--r-- | mm/memblock.c | 5 | ||||
-rw-r--r-- | mm/memcontrol.c | 130 | ||||
-rw-r--r-- | mm/memory-failure.c | 10 | ||||
-rw-r--r-- | mm/memory.c | 481 | ||||
-rw-r--r-- | mm/mempolicy.c | 104 | ||||
-rw-r--r-- | mm/migrate.c | 43 | ||||
-rw-r--r-- | mm/mincore.c | 20 | ||||
-rw-r--r-- | mm/mmap.c | 24 | ||||
-rw-r--r-- | mm/mmu_context.c | 3 | ||||
-rw-r--r-- | mm/nobootmem.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 38 | ||||
-rw-r--r-- | mm/page_cgroup.c | 12 | ||||
-rw-r--r-- | mm/percpu.c | 208 | ||||
-rw-r--r-- | mm/process_vm_access.c | 28 | ||||
-rw-r--r-- | mm/readahead.c | 10 | ||||
-rw-r--r-- | mm/rmap.c | 15 | ||||
-rw-r--r-- | mm/shmem.c | 124 | ||||
-rw-r--r-- | mm/slab.c | 4 | ||||
-rw-r--r-- | mm/slub.c | 24 | ||||
-rw-r--r-- | mm/sparse.c | 2 | ||||
-rw-r--r-- | mm/swap.c | 57 | ||||
-rw-r--r-- | mm/truncate.c | 148 | ||||
-rw-r--r-- | mm/vmscan.c | 122 | ||||
-rw-r--r-- | mm/vmstat.c | 6 | ||||
-rw-r--r-- | mm/workingset.c | 414 |
38 files changed, 2045 insertions, 1116 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 2d9f1504d75e..2888024e0b0a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -575,5 +575,5 @@ config PGTABLE_MAPPING then you should select this. This causes zsmalloc to use page table mapping rather than copying for object mapping. - You can check speed with zsmalloc benchmark[1]. - [1] https://github.com/spartacus06/zsmalloc + You can check speed with zsmalloc benchmark: + https://github.com/spartacus06/zsmapbench diff --git a/mm/Makefile b/mm/Makefile index 310c90a09264..cdd741519ee0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o balloon_compaction.o \ - interval_tree.o list_lru.o $(mmu-y) + interval_tree.o list_lru.o workingset.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ce682f7a4f29..09d9591b7708 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -288,13 +288,19 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. + * + * We have to be careful not to postpone flush work if it is scheduled for + * earlier. Thus we use queue_delayed_work(). */ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_lock_bh(&bdi->wb_lock); + if (test_bit(BDI_registered, &bdi->state)) + queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); + spin_unlock_bh(&bdi->wb_lock); } /* @@ -307,9 +313,6 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); - - /* bdi_list is now unused, clear it to mark @bdi dying */ - INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -360,6 +363,11 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) */ bdi_remove_from_list(bdi); + /* Make sure nobody queues further work */ + spin_lock_bh(&bdi->wb_lock); + clear_bit(BDI_registered, &bdi->state); + spin_unlock_bh(&bdi->wb_lock); + /* * Drain work list and shutdown the delayed_work. At this point, * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi diff --git a/mm/compaction.c b/mm/compaction.c index b48c5259ea33..b6ab77160068 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -251,7 +251,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, { int nr_scanned = 0, total_isolated = 0; struct page *cursor, *valid_page = NULL; - unsigned long nr_strict_required = end_pfn - blockpfn; unsigned long flags; bool locked = false; @@ -264,11 +263,12 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, nr_scanned++; if (!pfn_valid_within(blockpfn)) - continue; + goto isolate_fail; + if (!valid_page) valid_page = page; if (!PageBuddy(page)) - continue; + goto isolate_fail; /* * The zone lock must be held to isolate freepages. @@ -289,12 +289,10 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Recheck this is a buddy page under lock */ if (!PageBuddy(page)) - continue; + goto isolate_fail; /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); - if (!isolated && strict) - break; total_isolated += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); @@ -305,7 +303,15 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (isolated) { blockpfn += isolated - 1; cursor += isolated - 1; + continue; } + +isolate_fail: + if (strict) + break; + else + continue; + } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); @@ -315,7 +321,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, * pages requested were isolated. If there were any failures, 0 is * returned and CMA will fail. */ - if (strict && nr_strict_required > total_isolated) + if (strict && blockpfn < end_pfn) total_isolated = 0; if (locked) @@ -578,6 +584,15 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } + /* + * Migration will fail if an anonymous page is pinned in memory, + * so avoid taking lru_lock and isolating it unnecessarily in an + * admittedly racy check. + */ + if (!page_mapping(page) && + page_count(page) > page_mapcount(page)) + continue; + /* Check if it is ok to still hold the lock */ locked = compact_checklock_irqsave(&zone->lru_lock, &flags, locked, cc); @@ -1180,6 +1195,7 @@ static void compact_node(int nid) struct compact_control cc = { .order = -1, .sync = true, + .ignore_skip_hint = true, }; __compact_pgdat(NODE_DATA(nid), &cc); @@ -1219,7 +1235,7 @@ int sysctl_extfrag_handler(struct ctl_table *table, int write, } #if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) -ssize_t sysfs_compact_node(struct device *dev, +static ssize_t sysfs_compact_node(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { diff --git a/mm/filemap.c b/mm/filemap.c index 7a13f6ac5421..21781f1fe52b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -107,12 +107,75 @@ * ->tasklist_lock (memory_failure, collect_procs_ao) */ +static void page_cache_tree_delete(struct address_space *mapping, + struct page *page, void *shadow) +{ + struct radix_tree_node *node; + unsigned long index; + unsigned int offset; + unsigned int tag; + void **slot; + + VM_BUG_ON(!PageLocked(page)); + + __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + + if (shadow) { + mapping->nrshadows++; + /* + * Make sure the nrshadows update is committed before + * the nrpages update so that final truncate racing + * with reclaim does not see both counters 0 at the + * same time and miss a shadow entry. + */ + smp_wmb(); + } + mapping->nrpages--; + + if (!node) { + /* Clear direct pointer tags in root node */ + mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; + radix_tree_replace_slot(slot, shadow); + return; + } + + /* Clear tree tags for the removed page */ + index = page->index; + offset = index & RADIX_TREE_MAP_MASK; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { + if (test_bit(offset, node->tags[tag])) + radix_tree_tag_clear(&mapping->page_tree, index, tag); + } + + /* Delete page, swap shadow entry */ + radix_tree_replace_slot(slot, shadow); + workingset_node_pages_dec(node); + if (shadow) + workingset_node_shadows_inc(node); + else + if (__radix_tree_delete_node(&mapping->page_tree, node)) + return; + + /* + * Track node that only contains shadow entries. + * + * Avoid acquiring the list_lru lock if already tracked. The + * list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_pages(node) && + list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&workingset_shadow_nodes, &node->private_list); + } +} + /* * Delete a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage * is safe. The caller must hold the mapping's tree_lock. */ -void __delete_from_page_cache(struct page *page) +void __delete_from_page_cache(struct page *page, void *shadow) { struct address_space *mapping = page->mapping; @@ -127,10 +190,11 @@ void __delete_from_page_cache(struct page *page) else cleancache_invalidate_page(mapping, page); - radix_tree_delete(&mapping->page_tree, page->index); + page_cache_tree_delete(mapping, page, shadow); + page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) __dec_zone_page_state(page, NR_SHMEM); @@ -166,7 +230,7 @@ void delete_from_page_cache(struct page *page) freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -426,7 +490,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) new->index = offset; spin_lock_irq(&mapping->tree_lock); - __delete_from_page_cache(old); + __delete_from_page_cache(old, NULL); error = radix_tree_insert(&mapping->page_tree, offset, new); BUG_ON(error); mapping->nrpages++; @@ -446,18 +510,52 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) } EXPORT_SYMBOL_GPL(replace_page_cache_page); -/** - * add_to_page_cache_locked - add a locked page to the pagecache - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * - * This function is used to add a page to the pagecache. It must be locked. - * This function does not add the page to the LRU. The caller must do that. - */ -int add_to_page_cache_locked(struct page *page, struct address_space *mapping, - pgoff_t offset, gfp_t gfp_mask) +static int page_cache_tree_insert(struct address_space *mapping, + struct page *page, void **shadowp) +{ + struct radix_tree_node *node; + void **slot; + int error; + + error = __radix_tree_create(&mapping->page_tree, page->index, + &node, &slot); + if (error) + return error; + if (*slot) { + void *p; + + p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + if (!radix_tree_exceptional_entry(p)) + return -EEXIST; + if (shadowp) + *shadowp = p; + mapping->nrshadows--; + if (node) + workingset_node_shadows_dec(node); + } + radix_tree_replace_slot(slot, page); + mapping->nrpages++; + if (node) { + workingset_node_pages_inc(node); + /* + * Don't track node that contains actual pages. + * + * Avoid acquiring the list_lru lock if already + * untracked. The list_empty() test is safe as + * node->private_list is protected by + * mapping->tree_lock. + */ + if (!list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + } + return 0; +} + +static int __add_to_page_cache_locked(struct page *page, + struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask, + void **shadowp) { int error; @@ -480,11 +578,10 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, page->index = offset; spin_lock_irq(&mapping->tree_lock); - error = radix_tree_insert(&mapping->page_tree, offset, page); + error = page_cache_tree_insert(mapping, page, shadowp); radix_tree_preload_end(); if (unlikely(error)) goto err_insert; - mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); trace_mm_filemap_add_to_page_cache(page); @@ -497,16 +594,49 @@ err_insert: page_cache_release(page); return error; } + +/** + * add_to_page_cache_locked - add a locked page to the pagecache + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * + * This function is used to add a page to the pagecache. It must be locked. + * This function does not add the page to the LRU. The caller must do that. + */ +int add_to_page_cache_locked(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + return __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, NULL); +} EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { + void *shadow = NULL; int ret; - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add_file(page); + __set_page_locked(page); + ret = __add_to_page_cache_locked(page, mapping, offset, + gfp_mask, &shadow); + if (unlikely(ret)) + __clear_page_locked(page); + else { + /* + * The page might have been evicted from cache only + * recently, in which case it should be activated like + * any other repeatedly accessed page. + */ + if (shadow && workingset_refault(shadow)) { + SetPageActive(page); + workingset_activation(page); + } else + ClearPageActive(page); + lru_cache_add(page); + } return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); @@ -520,10 +650,10 @@ struct page *__page_cache_alloc(gfp_t gfp) if (cpuset_do_page_mem_spread()) { unsigned int cpuset_mems_cookie; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); - } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + } while (!page && read_mems_allowed_retry(cpuset_mems_cookie)); return page; } @@ -686,14 +816,101 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, } /** - * find_get_page - find and get a page reference + * page_cache_next_hole - find the next hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the + * lowest indexed hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'return - index >= + * max_scan' will be true). In rare cases of index wrap-around, 0 will + * be returned. + * + * page_cache_next_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 5, then subsequently a hole is created at + * index 10, page_cache_next_hole covering both indexes may return 10 + * if called under rcu_read_lock. + */ +pgoff_t page_cache_next_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index++; + if (index == 0) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_next_hole); + +/** + * page_cache_prev_hole - find the prev hole (not-present entry) + * @mapping: mapping + * @index: index + * @max_scan: maximum range to search + * + * Search backwards in the range [max(index-max_scan+1, 0), index] for + * the first hole. + * + * Returns: the index of the hole if found, otherwise returns an index + * outside of the set specified (in which case 'index - return >= + * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX + * will be returned. + * + * page_cache_prev_hole may be called under rcu_read_lock. However, + * like radix_tree_gang_lookup, this will not atomically search a + * snapshot of the tree at a single point in time. For example, if a + * hole is created at index 10, then subsequently a hole is created at + * index 5, page_cache_prev_hole covering both indexes may return 5 if + * called under rcu_read_lock. + */ +pgoff_t page_cache_prev_hole(struct address_space *mapping, + pgoff_t index, unsigned long max_scan) +{ + unsigned long i; + + for (i = 0; i < max_scan; i++) { + struct page *page; + + page = radix_tree_lookup(&mapping->page_tree, index); + if (!page || radix_tree_exceptional_entry(page)) + break; + index--; + if (index == ULONG_MAX) + break; + } + + return index; +} +EXPORT_SYMBOL(page_cache_prev_hole); + +/** + * find_get_entry - find and get a page cache entry * @mapping: the address_space to search - * @offset: the page index + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. + * + * If the slot holds a shadow entry of a previously evicted page, it + * is returned. * - * Is there a pagecache struct page at the given (mapping, offset) tuple? - * If yes, increment its refcount and return it; if no, return NULL. + * Otherwise, %NULL is returned. */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { void **pagep; struct page *page; @@ -734,24 +951,50 @@ out: return page; } -EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_get_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * - * Locates the desired pagecache page, locks it, increments its reference - * count and returns its address. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. * - * Returns zero if the page was not present. find_lock_page() may sleep. + * Otherwise, %NULL is returned. */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +struct page *find_get_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page = find_get_entry(mapping, offset); + + if (radix_tree_exceptional_entry(page)) + page = NULL; + return page; +} +EXPORT_SYMBOL(find_get_page); + +/** + * find_lock_entry - locate, pin and lock a page cache entry + * @mapping: the address_space to search + * @offset: the page cache index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * If the slot holds a shadow entry of a previously evicted page, it + * is returned. + * + * Otherwise, %NULL is returned. + * + * find_lock_entry() may sleep. + */ +struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset) { struct page *page; repeat: - page = find_get_page(mapping, offset); + page = find_get_entry(mapping, offset); if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ @@ -764,6 +1007,29 @@ repeat: } return page; } +EXPORT_SYMBOL(find_lock_entry); + +/** + * find_lock_page - locate, pin and lock a pagecache page + * @mapping: the address_space to search + * @offset: the page index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * Otherwise, %NULL is returned. + * + * find_lock_page() may sleep. + */ +struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) +{ + struct page *page = find_lock_entry(mapping, offset); + + if (radix_tree_exceptional_entry(page)) + page = NULL; + return page; +} EXPORT_SYMBOL(find_lock_page); /** @@ -772,16 +1038,18 @@ EXPORT_SYMBOL(find_lock_page); * @index: the page's index into the mapping * @gfp_mask: page allocation mode * - * Locates a page in the pagecache. If the page is not present, a new page - * is allocated using @gfp_mask and is added to the pagecache and to the VM's - * LRU list. The returned page is locked and has its reference count - * incremented. + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an atomic - * allocation! + * If the page is not present, a new page is allocated using @gfp_mask + * and added to the page cache and the VM's LRU list. The page is + * returned locked and with an increased refcount. * - * find_or_create_page() returns the desired page's address, or zero on - * memory exhaustion. + * On memory exhaustion, %NULL is returned. + * + * find_or_create_page() may sleep, even if @gfp_flags specifies an + * atomic allocation! */ struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) @@ -814,6 +1082,76 @@ repeat: EXPORT_SYMBOL(find_or_create_page); /** + * find_get_entries - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page cache index + * @nr_entries: The maximum number of entries + * @entries: Where the resulting entries are placed + * @indices: The cache indices corresponding to the entries in @entries + * + * find_get_entries() will search for and return a group of up to + * @nr_entries entries in the mapping. The entries are placed at + * @entries. find_get_entries() takes a reference against any actual + * pages it returns. + * + * The search returns a group of mapping-contiguous page cache entries + * with ascending indexes. There may be holes in the indices due to + * not-present pages. + * + * Any shadow entries of evicted pages are included in the returned + * array. + * + * find_get_entries() returns the number of pages and shadow entries + * which were found. + */ +unsigned find_get_entries(struct address_space *mapping, + pgoff_t start, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * Otherwise, we must be storing a swap entry + * here as an exceptional entry: so return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} + +/** * find_get_pages - gang pagecache lookup * @mapping: The address_space to search * @start: The starting page index @@ -1795,6 +2133,18 @@ int generic_file_readonly_mmap(struct file * file, struct vm_area_struct * vma) EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), @@ -1821,6 +2171,8 @@ repeat: if (err < 0) { page_cache_release(page); page = ERR_PTR(err); + } else { + page = wait_on_page_read(page); } } return page; @@ -1857,6 +2209,10 @@ retry: if (err < 0) { page_cache_release(page); return ERR_PTR(err); + } else { + page = wait_on_page_read(page); + if (IS_ERR(page)) + return page; } out: mark_page_accessed(page); @@ -1864,40 +2220,25 @@ out: } /** - * read_cache_page_async - read into page cache, fill it if needed + * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read * @data: first arg to filler(data, page) function, often left as NULL * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. + * not set, try to fill the page and wait for it to become unlocked. * * If the page does not get brought uptodate, return -EIO. */ -struct page *read_cache_page_async(struct address_space *mapping, +struct page *read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); } -EXPORT_SYMBOL(read_cache_page_async); - -static struct page *wait_on_page_read(struct page *page) -{ - if (!IS_ERR(page)) { - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - } - return page; -} +EXPORT_SYMBOL(read_cache_page); /** * read_cache_page_gfp - read into page cache, using specified page allocation flags. @@ -1916,31 +2257,10 @@ struct page *read_cache_page_gfp(struct address_space *mapping, { filler_t *filler = (filler_t *)mapping->a_ops->readpage; - return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); + return do_read_cache_page(mapping, index, filler, NULL, gfp); } EXPORT_SYMBOL(read_cache_page_gfp); -/** - * read_cache_page - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: first arg to filler(data, page) function, often left as NULL - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page then wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, - int (*filler)(void *, struct page *), - void *data) -{ - return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); -} -EXPORT_SYMBOL(read_cache_page); - static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { diff --git a/mm/fremap.c b/mm/fremap.c index bbc4d660221a..34feba60a17e 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -23,28 +23,44 @@ #include "internal.h" +static int mm_counter(struct page *page) +{ + return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES; +} + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte = *ptep; + struct page *page; + swp_entry_t entry; if (pte_present(pte)) { - struct page *page; - flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); page = vm_normal_page(vma, addr, pte); if (page) { if (pte_dirty(pte)) set_page_dirty(page); + update_hiwater_rss(mm); + dec_mm_counter(mm, mm_counter(page)); page_remove_rmap(page); page_cache_release(page); + } + } else { /* zap_pte() is not called when pte_none() */ + if (!pte_file(pte)) { update_hiwater_rss(mm); - dec_mm_counter(mm, MM_FILEPAGES); + entry = pte_to_swp_entry(pte); + if (non_swap_entry(entry)) { + if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); + dec_mm_counter(mm, mm_counter(page)); + } + } else { + free_swap_and_cache(entry); + dec_mm_counter(mm, MM_SWAPENTS); + } } - } else { - if (!pte_file(pte)) - free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear_not_present_full(mm, addr, ptep, 0); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index da23eb96779f..6ac89e9f82ef 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -941,81 +941,6 @@ unlock: spin_unlock(ptl); } -static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) -{ - spinlock_t *ptl; - pgtable_t pgtable; - pmd_t _pmd; - struct page *page; - int i, ret = 0; - unsigned long mmun_start; /* For mmu_notifiers */ - unsigned long mmun_end; /* For mmu_notifiers */ - - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!page) { - ret |= VM_FAULT_OOM; - goto out; - } - - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - put_page(page); - ret |= VM_FAULT_OOM; - goto out; - } - - clear_user_highpage(page, address); - __SetPageUptodate(page); - - mmun_start = haddr; - mmun_end = haddr + HPAGE_PMD_SIZE; - mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - - ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, orig_pmd))) - goto out_free_page; - - pmdp_clear_flush(vma, haddr, pmd); - /* leave pmd empty until pte is filled */ - - pgtable = pgtable_trans_huge_withdraw(mm, pmd); - pmd_populate(mm, &_pmd, pgtable); - - for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { - pte_t *pte, entry; - if (haddr == (address & PAGE_MASK)) { - entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - page_add_new_anon_rmap(page, vma, haddr); - } else { - entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); - entry = pte_mkspecial(entry); - } - pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*pte)); - set_pte_at(mm, haddr, pte, entry); - pte_unmap(pte); - } - smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); - spin_unlock(ptl); - put_huge_zero_page(); - inc_mm_counter(mm, MM_ANONPAGES); - - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - - ret |= VM_FAULT_WRITE; -out: - return ret; -out_free_page: - spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_page(page); - put_page(page); - goto out; -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1161,13 +1086,15 @@ alloc: if (unlikely(!new_page)) { if (!page) { - ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, - address, pmd, orig_pmd, haddr); + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; } else { ret = do_huge_pmd_wp_page_fallback(mm, vma, address, pmd, orig_pmd, page, haddr); - if (ret & VM_FAULT_OOM) + if (ret & VM_FAULT_OOM) { split_huge_page(page); + ret |= VM_FAULT_FALLBACK; + } put_page(page); } count_vm_event(THP_FAULT_FALLBACK); @@ -1179,9 +1106,10 @@ alloc: if (page) { split_huge_page(page); put_page(page); - } + } else + split_huge_page_pmd(vma, address, pmd); + ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); - ret |= VM_FAULT_OOM; goto out; } @@ -1958,7 +1886,7 @@ out: return ret; } -#define VM_NO_THP (VM_SPECIAL|VM_MIXEDMAP|VM_HUGETLB|VM_SHARED|VM_MAYSHARE) +#define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c01cb9fedb18..7c02b9dadfb0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,6 +22,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/page-isolation.h> +#include <linux/jhash.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -53,6 +54,13 @@ static unsigned long __initdata default_hstate_size; */ DEFINE_SPINLOCK(hugetlb_lock); +/* + * Serializes faults on the same logical page. This is used to + * prevent spurious OOMs when the hugepage pool is fully utilized. + */ +static int num_fault_mutexes; +static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; + static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { bool free = (spool->count == 0) && (spool->used_hpages == 0); @@ -135,15 +143,8 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * Region tracking -- allows tracking of reservations and instantiated pages * across the pages in a mapping. * - * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantiation_mutex. To access or modify a region the caller - * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation_mutex: - * - * down_write(&mm->mmap_sem); - * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * The region data structures are embedded into a resv_map and + * protected by a resv_map's lock */ struct file_region { struct list_head link; @@ -151,10 +152,12 @@ struct file_region { long to; }; -static long region_add(struct list_head *head, long f, long t) +static long region_add(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg, *nrg, *trg; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -184,14 +187,18 @@ static long region_add(struct list_head *head, long f, long t) } nrg->from = f; nrg->to = t; + spin_unlock(&resv->lock); return 0; } -static long region_chg(struct list_head *head, long f, long t) +static long region_chg(struct resv_map *resv, long f, long t) { - struct file_region *rg, *nrg; + struct list_head *head = &resv->regions; + struct file_region *rg, *nrg = NULL; long chg = 0; +retry: + spin_lock(&resv->lock); /* Locate the region we are before or in. */ list_for_each_entry(rg, head, link) if (f <= rg->to) @@ -201,15 +208,21 @@ static long region_chg(struct list_head *head, long f, long t) * Subtle, allocate a new region at the position but make it zero * size such that we can guarantee to record the reservation. */ if (&rg->link == head || t < rg->from) { - nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); - if (!nrg) - return -ENOMEM; - nrg->from = f; - nrg->to = f; - INIT_LIST_HEAD(&nrg->link); - list_add(&nrg->link, rg->link.prev); + if (!nrg) { + spin_unlock(&resv->lock); + nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); + if (!nrg) + return -ENOMEM; + + nrg->from = f; + nrg->to = f; + INIT_LIST_HEAD(&nrg->link); + goto retry; + } - return t - f; + list_add(&nrg->link, rg->link.prev); + chg = t - f; + goto out_nrg; } /* Round our left edge to the current segment if it encloses us. */ @@ -222,7 +235,7 @@ static long region_chg(struct list_head *head, long f, long t) if (&rg->link == head) break; if (rg->from > t) - return chg; + goto out; /* We overlap with this area, if it extends further than * us then we must extend ourselves. Account for its @@ -233,20 +246,30 @@ static long region_chg(struct list_head *head, long f, long t) } chg -= rg->to - rg->from; } + +out: + spin_unlock(&resv->lock); + /* We already know we raced and no longer need the new region */ + kfree(nrg); + return chg; +out_nrg: + spin_unlock(&resv->lock); return chg; } -static long region_truncate(struct list_head *head, long end) +static long region_truncate(struct resv_map *resv, long end) { + struct list_head *head = &resv->regions; struct file_region *rg, *trg; long chg = 0; + spin_lock(&resv->lock); /* Locate the region we are either in or before. */ list_for_each_entry(rg, head, link) if (end <= rg->to) break; if (&rg->link == head) - return 0; + goto out; /* If we are in the middle of a region then adjust it. */ if (end > rg->from) { @@ -263,14 +286,19 @@ static long region_truncate(struct list_head *head, long end) list_del(&rg->link); kfree(rg); } + +out: + spin_unlock(&resv->lock); return chg; } -static long region_count(struct list_head *head, long f, long t) +static long region_count(struct resv_map *resv, long f, long t) { + struct list_head *head = &resv->regions; struct file_region *rg; long chg = 0; + spin_lock(&resv->lock); /* Locate each segment we overlap with, and count that overlap. */ list_for_each_entry(rg, head, link) { long seg_from; @@ -286,6 +314,7 @@ static long region_count(struct list_head *head, long f, long t) chg += seg_to - seg_from; } + spin_unlock(&resv->lock); return chg; } @@ -376,39 +405,46 @@ static void set_vma_private_data(struct vm_area_struct *vma, vma->vm_private_data = (void *)value; } -struct resv_map { - struct kref refs; - struct list_head regions; -}; - -static struct resv_map *resv_map_alloc(void) +struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); if (!resv_map) return NULL; kref_init(&resv_map->refs); + spin_lock_init(&resv_map->lock); INIT_LIST_HEAD(&resv_map->regions); return resv_map; } -static void resv_map_release(struct kref *ref) +void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); /* Clear out any active regions before we release the map. */ - region_truncate(&resv_map->regions, 0); + region_truncate(resv_map, 0); kfree(resv_map); } +static inline struct resv_map *inode_resv_map(struct inode *inode) +{ + return inode->i_mapping->private_data; +} + static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_MAYSHARE)) + if (vma->vm_flags & VM_MAYSHARE) { + struct address_space *mapping = vma->vm_file->f_mapping; + struct inode *inode = mapping->host; + + return inode_resv_map(inode); + + } else { return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); - return NULL; + } } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) @@ -540,7 +576,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, goto err; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask(h), &mpol, &nodemask); @@ -562,7 +598,7 @@ retry_cpuset: } mpol_cond_put(mpol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -653,7 +689,8 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void prep_compound_gigantic_page(struct page *page, unsigned long order) +static void __init prep_compound_gigantic_page(struct page *page, + unsigned long order) { int i; int nr_pages = 1 << order; @@ -1150,45 +1187,34 @@ static void return_unused_surplus_pages(struct hstate *h, static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - return region_chg(&inode->i_mapping->private_list, - idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; + long chg; - } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + resv = vma_resv_map(vma); + if (!resv) return 1; - } else { - long err; - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = vma_resv_map(vma); + idx = vma_hugecache_offset(h, vma, addr); + chg = region_chg(resv, idx, idx + 1); - err = region_chg(&resv->regions, idx, idx + 1); - if (err < 0) - return err; - return 0; - } + if (vma->vm_flags & VM_MAYSHARE) + return chg; + else + return chg < 0 ? chg : 0; } static void vma_commit_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - struct address_space *mapping = vma->vm_file->f_mapping; - struct inode *inode = mapping->host; - - if (vma->vm_flags & VM_MAYSHARE) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - region_add(&inode->i_mapping->private_list, idx, idx + 1); + struct resv_map *resv; + pgoff_t idx; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *resv = vma_resv_map(vma); + resv = vma_resv_map(vma); + if (!resv) + return; - /* Mark this page used in the map. */ - region_add(&resv->regions, idx, idx + 1); - } + idx = vma_hugecache_offset(h, vma, addr); + region_add(resv, idx, idx + 1); } static struct page *alloc_huge_page(struct vm_area_struct *vma, @@ -1294,7 +1320,7 @@ found: return 1; } -static void prep_compound_huge_page(struct page *page, int order) +static void __init prep_compound_huge_page(struct page *page, int order) { if (unlikely(order > (MAX_ORDER - 1))) prep_compound_gigantic_page(page, order); @@ -1944,11 +1970,14 @@ static void __exit hugetlb_exit(void) } kobject_put(hugepages_kobj); + kfree(htlb_fault_mutex_table); } module_exit(hugetlb_exit); static int __init hugetlb_init(void) { + int i; + /* Some platform decide whether they support huge pages at boot * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when * there is no such support @@ -1973,6 +2002,17 @@ static int __init hugetlb_init(void) hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); +#ifdef CONFIG_SMP + num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); +#else + num_fault_mutexes = 1; +#endif + htlb_fault_mutex_table = + kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL); + BUG_ON(!htlb_fault_mutex_table); + + for (i = 0; i < num_fault_mutexes; i++) + mutex_init(&htlb_fault_mutex_table[i]); return 0; } module_init(hugetlb_init); @@ -2251,41 +2291,30 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking. */ - if (resv) + if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) kref_get(&resv->refs); } -static void resv_map_put(struct vm_area_struct *vma) -{ - struct resv_map *resv = vma_resv_map(vma); - - if (!resv) - return; - kref_put(&resv->refs, resv_map_release); -} - static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); - unsigned long reserve; - unsigned long start; - unsigned long end; + unsigned long reserve, start, end; - if (resv) { - start = vma_hugecache_offset(h, vma, vma->vm_start); - end = vma_hugecache_offset(h, vma, vma->vm_end); + if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return; - reserve = (end - start) - - region_count(&resv->regions, start, end); + start = vma_hugecache_offset(h, vma, vma->vm_start); + end = vma_hugecache_offset(h, vma, vma->vm_end); - resv_map_put(vma); + reserve = (end - start) - region_count(resv, start, end); - if (reserve) { - hugetlb_acct_memory(h, -reserve); - hugepage_subpool_put_pages(spool, reserve); - } + kref_put(&resv->refs, resv_map_release); + + if (reserve) { + hugetlb_acct_memory(h, -reserve); + hugepage_subpool_put_pages(spool, reserve); } } @@ -2761,15 +2790,14 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, } static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int flags) + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; int anon_rmap = 0; - pgoff_t idx; unsigned long size; struct page *page; - struct address_space *mapping; pte_t new_pte; spinlock_t *ptl; @@ -2784,9 +2812,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, address); - /* * Use page lock to guard against racing truncation * before we get page_table_lock. @@ -2896,17 +2921,53 @@ backout_unlocked: goto out; } +#ifdef CONFIG_SMP +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + unsigned long key[2]; + u32 hash; + + if (vma->vm_flags & VM_SHARED) { + key[0] = (unsigned long) mapping; + key[1] = idx; + } else { + key[0] = (unsigned long) mm; + key[1] = address >> huge_page_shift(h); + } + + hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); + + return hash & (num_fault_mutexes - 1); +} +#else +/* + * For uniprocesor systems we always use a single mutex, so just + * return 0 and avoid the hashing overhead. + */ +static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, + pgoff_t idx, unsigned long address) +{ + return 0; +} +#endif + int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - pte_t *ptep; - pte_t entry; + pte_t *ptep, entry; spinlock_t *ptl; int ret; + u32 hash; + pgoff_t idx; struct page *page = NULL; struct page *pagecache_page = NULL; - static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + struct address_space *mapping; address &= huge_page_mask(h); @@ -2925,15 +2986,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!ptep) return VM_FAULT_OOM; + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - mutex_lock(&hugetlb_instantiation_mutex); + hash = fault_mutex_hash(h, mm, vma, mapping, idx, address); + mutex_lock(&htlb_fault_mutex_table[hash]); + entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, flags); + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); goto out_mutex; } @@ -3002,8 +3068,7 @@ out_ptl: put_page(page); out_mutex: - mutex_unlock(&hugetlb_instantiation_mutex); - + mutex_unlock(&htlb_fault_mutex_table[hash]); return ret; } @@ -3161,6 +3226,7 @@ int hugetlb_reserve_pages(struct inode *inode, long ret, chg; struct hstate *h = hstate_inode(inode); struct hugepage_subpool *spool = subpool_inode(inode); + struct resv_map *resv_map; /* * Only apply hugepage reservation if asked. At fault time, an @@ -3176,10 +3242,13 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_MAYSHARE) - chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); + if (!vma || vma->vm_flags & VM_MAYSHARE) { + resv_map = inode_resv_map(inode); + + chg = region_chg(resv_map, from, to); + + } else { + resv_map = resv_map_alloc(); if (!resv_map) return -ENOMEM; @@ -3222,20 +3291,23 @@ int hugetlb_reserve_pages(struct inode *inode, * else has to be done for private mappings here */ if (!vma || vma->vm_flags & VM_MAYSHARE) - region_add(&inode->i_mapping->private_list, from, to); + region_add(resv_map, from, to); return 0; out_err: - if (vma) - resv_map_put(vma); + if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + kref_put(&resv_map->refs, resv_map_release); return ret; } void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) { struct hstate *h = hstate_inode(inode); - long chg = region_truncate(&inode->i_mapping->private_list, offset); + struct resv_map *resv_map = inode_resv_map(inode); + long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); + if (resv_map) + chg = region_truncate(resv_map, offset); spin_lock(&inode->i_lock); inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index cb00829bb466..595d7fd795e1 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -30,7 +30,6 @@ struct hugetlb_cgroup { #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -struct cgroup_subsys hugetlb_subsys __read_mostly; static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline @@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { - return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) @@ -255,7 +254,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, } static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { int idx, name, ret; unsigned long long val; @@ -358,7 +357,7 @@ static void __init __hugetlb_cgroup_file_init(int idx) cft = &h->cgroup_files[4]; memset(cft, 0, sizeof(*cft)); - WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); + WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files)); return; } @@ -402,10 +401,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) return; } -struct cgroup_subsys hugetlb_subsys = { - .name = "hugetlb", +struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, - .subsys_id = hugetlb_subsys_id, }; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 31f01c5011e5..91d67eaee050 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -192,15 +192,15 @@ static struct kmem_cache *object_cache; static struct kmem_cache *scan_area_cache; /* set if tracing memory operations is enabled */ -static atomic_t kmemleak_enabled = ATOMIC_INIT(0); +static int kmemleak_enabled; /* set in the late_initcall if there were no errors */ -static atomic_t kmemleak_initialized = ATOMIC_INIT(0); +static int kmemleak_initialized; /* enables or disables early logging of the memory operations */ -static atomic_t kmemleak_early_log = ATOMIC_INIT(1); +static int kmemleak_early_log = 1; /* set if a kmemleak warning was issued */ -static atomic_t kmemleak_warning = ATOMIC_INIT(0); +static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ -static atomic_t kmemleak_error = ATOMIC_INIT(0); +static int kmemleak_error; /* minimum and maximum address that may be valid pointers */ static unsigned long min_addr = ULONG_MAX; @@ -218,7 +218,8 @@ static int kmemleak_stack_scan = 1; static DEFINE_MUTEX(scan_mutex); /* setting kmemleak=on, will set this var, skipping the disable */ static int kmemleak_skip_disable; - +/* If there are leaks that can be reported */ +static bool kmemleak_found_leaks; /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -267,7 +268,7 @@ static void kmemleak_disable(void); #define kmemleak_warn(x...) do { \ pr_warning(x); \ dump_stack(); \ - atomic_set(&kmemleak_warning, 1); \ + kmemleak_warning = 1; \ } while (0) /* @@ -805,7 +806,7 @@ static void __init log_early(int op_type, const void *ptr, size_t size, unsigned long flags; struct early_log *log; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* kmemleak stopped recording, just count the requests */ crt_early_log++; return; @@ -840,7 +841,7 @@ static void early_alloc(struct early_log *log) unsigned long flags; int i; - if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr)) return; /* @@ -893,9 +894,9 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); @@ -919,11 +920,11 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) * Percpu allocations are only scanned and not reported as leaks * (min_count is set to 0). */ - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) create_object((unsigned long)per_cpu_ptr(ptr, cpu), size, 0, GFP_KERNEL); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu); @@ -939,9 +940,9 @@ void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_full((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); @@ -959,9 +960,9 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) delete_object_part((unsigned long)ptr, size); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PART, ptr, size, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_part); @@ -979,11 +980,11 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) for_each_possible_cpu(cpu) delete_object_full((unsigned long)per_cpu_ptr(ptr, cpu)); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free_percpu); @@ -999,9 +1000,9 @@ void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); @@ -1019,9 +1020,9 @@ void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); @@ -1041,9 +1042,9 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && size && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && size && !IS_ERR(ptr)) add_scan_area((unsigned long)ptr, size, gfp); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); @@ -1061,9 +1062,9 @@ void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); - if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); - else if (atomic_read(&kmemleak_early_log)) + else if (kmemleak_early_log) log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); @@ -1088,7 +1089,7 @@ static bool update_checksum(struct kmemleak_object *object) */ static int scan_should_stop(void) { - if (!atomic_read(&kmemleak_enabled)) + if (!kmemleak_enabled) return 1; /* @@ -1382,9 +1383,12 @@ static void kmemleak_scan(void) } rcu_read_unlock(); - if (new_leaks) + if (new_leaks) { + kmemleak_found_leaks = true; + pr_info("%d new suspected memory leaks (see " "/sys/kernel/debug/kmemleak)\n", new_leaks); + } } @@ -1545,11 +1549,6 @@ static int kmemleak_open(struct inode *inode, struct file *file) return seq_open(file, &kmemleak_seq_ops); } -static int kmemleak_release(struct inode *inode, struct file *file) -{ - return seq_release(inode, file); -} - static int dump_str_object_info(const char *str) { unsigned long flags; @@ -1592,8 +1591,12 @@ static void kmemleak_clear(void) spin_unlock_irqrestore(&object->lock, flags); } rcu_read_unlock(); + + kmemleak_found_leaks = false; } +static void __kmemleak_do_cleanup(void); + /* * File write operation to configure kmemleak at run-time. The following * commands can be written to the /sys/kernel/debug/kmemleak file: @@ -1606,7 +1609,8 @@ static void kmemleak_clear(void) * disable it) * scan - trigger a memory scan * clear - mark all current reported unreferenced kmemleak objects as - * grey to ignore printing them + * grey to ignore printing them, or free all kmemleak objects + * if kmemleak has been disabled. * dump=... - dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, @@ -1616,9 +1620,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, int buf_size; int ret; - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; - buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; @@ -1628,6 +1629,19 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, if (ret < 0) return ret; + if (strncmp(buf, "clear", 5) == 0) { + if (kmemleak_enabled) + kmemleak_clear(); + else + __kmemleak_do_cleanup(); + goto out; + } + + if (!kmemleak_enabled) { + ret = -EBUSY; + goto out; + } + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1651,8 +1665,6 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, } } else if (strncmp(buf, "scan", 4) == 0) kmemleak_scan(); - else if (strncmp(buf, "clear", 5) == 0) - kmemleak_clear(); else if (strncmp(buf, "dump=", 5) == 0) ret = dump_str_object_info(buf + 5); else @@ -1674,9 +1686,19 @@ static const struct file_operations kmemleak_fops = { .read = seq_read, .write = kmemleak_write, .llseek = seq_lseek, - .release = kmemleak_release, + .release = seq_release, }; +static void __kmemleak_do_cleanup(void) +{ + struct kmemleak_object *object; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) + delete_object_full(object->pointer); + rcu_read_unlock(); +} + /* * Stop the memory scanning thread and free the kmemleak internal objects if * no previous scan thread (otherwise, kmemleak may still have some useful @@ -1684,18 +1706,14 @@ static const struct file_operations kmemleak_fops = { */ static void kmemleak_do_cleanup(struct work_struct *work) { - struct kmemleak_object *object; - bool cleanup = scan_thread == NULL; - mutex_lock(&scan_mutex); stop_scan_thread(); - if (cleanup) { - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) - delete_object_full(object->pointer); - rcu_read_unlock(); - } + if (!kmemleak_found_leaks) + __kmemleak_do_cleanup(); + else + pr_info("Kmemleak disabled without freeing internal data. " + "Reclaim the memory with \"echo clear > /sys/kernel/debug/kmemleak\"\n"); mutex_unlock(&scan_mutex); } @@ -1708,14 +1726,14 @@ static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); static void kmemleak_disable(void) { /* atomically check whether it was already invoked */ - if (atomic_cmpxchg(&kmemleak_error, 0, 1)) + if (cmpxchg(&kmemleak_error, 0, 1)) return; /* stop any memory operation tracing */ - atomic_set(&kmemleak_enabled, 0); + kmemleak_enabled = 0; /* check whether it is too early for a kernel thread */ - if (atomic_read(&kmemleak_initialized)) + if (kmemleak_initialized) schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); @@ -1757,9 +1775,10 @@ void __init kmemleak_init(void) int i; unsigned long flags; + kmemleak_early_log = 0; + #ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF if (!kmemleak_skip_disable) { - atomic_set(&kmemleak_early_log, 0); kmemleak_disable(); return; } @@ -1777,12 +1796,11 @@ void __init kmemleak_init(void) /* the kernel is still in UP mode, so disabling the IRQs is enough */ local_irq_save(flags); - atomic_set(&kmemleak_early_log, 0); - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { local_irq_restore(flags); return; } else - atomic_set(&kmemleak_enabled, 1); + kmemleak_enabled = 1; local_irq_restore(flags); /* @@ -1826,9 +1844,9 @@ void __init kmemleak_init(void) log->op_type); } - if (atomic_read(&kmemleak_warning)) { + if (kmemleak_warning) { print_log_trace(log); - atomic_set(&kmemleak_warning, 0); + kmemleak_warning = 0; } } } @@ -1840,9 +1858,9 @@ static int __init kmemleak_late_init(void) { struct dentry *dentry; - atomic_set(&kmemleak_initialized, 1); + kmemleak_initialized = 1; - if (atomic_read(&kmemleak_error)) { + if (kmemleak_error) { /* * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately @@ -444,7 +444,7 @@ static void break_cow(struct rmap_item *rmap_item) static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head = compound_trans_head(page); + struct page *head = compound_head(page); /* * head may actually be splitted and freed from under * us but it's ok here. diff --git a/mm/list_lru.c b/mm/list_lru.c index 72f9decb0104..f1a0db194173 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -87,11 +87,20 @@ restart: ret = isolate(item, &nlru->lock, cb_arg); switch (ret) { + case LRU_REMOVED_RETRY: + assert_spin_locked(&nlru->lock); case LRU_REMOVED: if (--nlru->nr_items == 0) node_clear(nid, lru->active_nodes); WARN_ON_ONCE(nlru->nr_items < 0); isolated++; + /* + * If the lru lock has been dropped, our list + * traversal is now invalid and so we have to + * restart from scratch. + */ + if (ret == LRU_REMOVED_RETRY) + goto restart; break; case LRU_ROTATE: list_move_tail(item, &nlru->list); @@ -103,6 +112,7 @@ restart: * The lru lock has been dropped, our list traversal is * now invalid and so we have to restart from scratch. */ + assert_spin_locked(&nlru->lock); goto restart; default: BUG(); @@ -114,7 +124,7 @@ restart: } EXPORT_SYMBOL_GPL(list_lru_walk_node); -int list_lru_init(struct list_lru *lru) +int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) { int i; size_t size = sizeof(*lru->node) * nr_node_ids; @@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru) nodes_clear(lru->active_nodes); for (i = 0; i < nr_node_ids; i++) { spin_lock_init(&lru->node[i].lock); + if (key) + lockdep_set_class(&lru->node[i].lock, key); INIT_LIST_HEAD(&lru->node[i].list); lru->node[i].nr_items = 0; } return 0; } -EXPORT_SYMBOL_GPL(list_lru_init); +EXPORT_SYMBOL_GPL(list_lru_init_key); void list_lru_destroy(struct list_lru *lru) { diff --git a/mm/memblock.c b/mm/memblock.c index 39a31e7f0045..7fe5354e7552 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1407,6 +1407,11 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) memblock.current_limit = limit; } +phys_addr_t __init_memblock memblock_get_current_limit(void) +{ + return memblock.current_limit; +} + static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53385cd4e6f0..dcc8153a1681 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,8 +66,8 @@ #include <trace/events/vmscan.h> -struct cgroup_subsys mem_cgroup_subsys __read_mostly; -EXPORT_SYMBOL(mem_cgroup_subsys); +struct cgroup_subsys memory_cgrp_subsys __read_mostly; +EXPORT_SYMBOL(memory_cgrp_subsys); #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; @@ -538,7 +538,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { struct cgroup_subsys_state *css; - css = css_from_id(id - 1, &mem_cgroup_subsys); + css = css_from_id(id - 1, &memory_cgrp_subsys); return mem_cgroup_from_css(css); } @@ -1072,7 +1072,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, memory_cgrp_id)); } struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) @@ -1127,8 +1127,8 @@ skip_node: * skipping css reference should be safe. */ if (next_css) { - if ((next_css->flags & CSS_ONLINE) && - (next_css == &root->css || css_tryget(next_css))) + if ((next_css == &root->css) || + ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) return mem_cgroup_from_css(next_css); prev_css = next_css; @@ -1683,54 +1683,25 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, */ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { - /* - * protects memcg_name and makes sure that parallel ooms do not - * interleave - */ - static DEFINE_SPINLOCK(oom_info_lock); - struct cgroup *task_cgrp; - struct cgroup *mem_cgrp; - static char memcg_name[PATH_MAX]; - int ret; + /* oom_info_lock ensures that parallel ooms do not interleave */ + static DEFINE_MUTEX(oom_info_lock); struct mem_cgroup *iter; unsigned int i; if (!p) return; - spin_lock(&oom_info_lock); + mutex_lock(&oom_info_lock); rcu_read_lock(); - mem_cgrp = memcg->css.cgroup; - task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); - - ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - /* - * Unfortunately, we are unable to convert to a useful name - * But we'll still print out the usage information - */ - rcu_read_unlock(); - goto done; - } - rcu_read_unlock(); - - pr_info("Task in %s killed", memcg_name); + pr_info("Task in "); + pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); + pr_info(" killed as a result of limit of "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_info("\n"); - rcu_read_lock(); - ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); - if (ret < 0) { - rcu_read_unlock(); - goto done; - } rcu_read_unlock(); - /* - * Continues from above, so we don't need an KERN_ level - */ - pr_cont(" as a result of limit of %s\n", memcg_name); -done: - pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, @@ -1745,13 +1716,8 @@ done: res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); for_each_mem_cgroup_tree(iter, memcg) { - pr_info("Memory cgroup stats"); - - rcu_read_lock(); - ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); - if (!ret) - pr_cont(" for %s", memcg_name); - rcu_read_unlock(); + pr_info("Memory cgroup stats for "); + pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { @@ -1767,7 +1733,7 @@ done: pr_cont("\n"); } - spin_unlock(&oom_info_lock); + mutex_unlock(&oom_info_lock); } /* @@ -3401,7 +3367,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *s) { struct kmem_cache *new = NULL; - static char *tmp_name = NULL; + static char *tmp_path = NULL, *tmp_name = NULL; static DEFINE_MUTEX(mutex); /* protects tmp_name */ BUG_ON(!memcg_can_account_kmem(memcg)); @@ -3413,18 +3379,20 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, * This static temporary buffer is used to prevent from * pointless shortliving allocation. */ - if (!tmp_name) { - tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!tmp_path || !tmp_name) { + if (!tmp_path) + tmp_path = kmalloc(PATH_MAX, GFP_KERNEL); if (!tmp_name) + tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmp_path || !tmp_name) goto out; } - rcu_read_lock(); - snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); - rcu_read_unlock(); + cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1); + snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), tmp_name); - new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, + new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor, s); if (new) new->allocflags |= __GFP_KMEMCG; @@ -4990,7 +4958,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) struct cgroup *cgrp = memcg->css.cgroup; /* returns EBUSY if there is a task or if we come here twice. */ - if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) + if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children)) return -EBUSY; /* we call try-to-free pages for make this cgroup empty */ @@ -5172,7 +5140,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * of course permitted. */ mutex_lock(&memcg_create_mutex); - if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg)) + if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) err = -EBUSY; mutex_unlock(&memcg_create_mutex); if (err) @@ -5274,7 +5242,7 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg, * RES_LIMIT. */ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, - const char *buffer) + char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); enum res_type type; @@ -6095,7 +6063,7 @@ static void memcg_event_ptable_queue_proc(struct file *file, * Interpretation of args is defined by control file implementation. */ static int memcg_write_event_control(struct cgroup_subsys_state *css, - struct cftype *cft, const char *buffer) + struct cftype *cft, char *buffer) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event; @@ -6183,17 +6151,15 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. */ - rcu_read_lock(); - + cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, + &memory_cgrp_subsys); ret = -EINVAL; - cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, - &mem_cgroup_subsys); - if (cfile_css == css && css_tryget(css)) - ret = 0; - - rcu_read_unlock(); - if (ret) + if (IS_ERR(cfile_css)) + goto out_put_cfile; + if (cfile_css != css) { + css_put(cfile_css); goto out_put_cfile; + } ret = event->register_event(memcg, event->eventfd, buffer); if (ret) @@ -6566,11 +6532,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * unfortunate state in our controller. */ if (parent != root_mem_cgroup) - mem_cgroup_subsys.broken_hierarchy = true; + memory_cgrp_subsys.broken_hierarchy = true; } mutex_unlock(&memcg_create_mutex); - return memcg_init_kmem(memcg, &mem_cgroup_subsys); + return memcg_init_kmem(memcg, &memory_cgrp_subsys); } /* @@ -6595,6 +6561,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; + struct cgroup_subsys_state *iter; /* * Unregister events and notify userspace. @@ -6611,7 +6578,14 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) kmem_cgroup_css_offline(memcg); mem_cgroup_invalidate_reclaim_iterators(memcg); - mem_cgroup_reparent_charges(memcg); + + /* + * This requires that offlining is serialized. Right now that is + * guaranteed because css_killed_work_fn() holds the cgroup_mutex. + */ + css_for_each_descendant_post(iter, css) + mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); + mem_cgroup_destroy_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } @@ -7264,9 +7238,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) mem_cgroup_from_css(root_css)->use_hierarchy = true; } -struct cgroup_subsys mem_cgroup_subsys = { - .name = "memory", - .subsys_id = mem_cgroup_subsys_id, +struct cgroup_subsys memory_cgrp_subsys = { .css_alloc = mem_cgroup_css_alloc, .css_online = mem_cgroup_css_online, .css_offline = mem_cgroup_css_offline, @@ -7292,7 +7264,7 @@ __setup("swapaccount=", enable_swap_account); static void __init memsw_file_init(void) { - WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files)); + WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files)); } static void __init enable_swap_cgroup(void) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2f2f34a4e77d..35ef28acf137 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p) return -EINVAL; css = mem_cgroup_css(mem); - /* root_mem_cgroup has NULL dentries */ - if (!css->cgroup->dentry) - return -EINVAL; - - ino = css->cgroup->dentry->d_inode->i_ino; + ino = cgroup_ino(css->cgroup); css_put(css); - if (ino != hwpoison_filter_memcg) + if (!ino || ino != hwpoison_filter_memcg) return -EINVAL; return 0; @@ -1651,7 +1647,7 @@ int soft_offline_page(struct page *page, int flags) { int ret; unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_trans_head(page); + struct page *hpage = compound_head(page); if (PageHWPoison(page)) { pr_info("soft offline: %#lx page already poisoned\n", pfn); diff --git a/mm/memory.c b/mm/memory.c index be6a0c0d4ae0..82c1e4cf00d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1705,15 +1705,6 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - /* - * Require read or write permissions. - * If FOLL_FORCE is set, we only require the "MAY" flags. - */ - vm_flags = (gup_flags & FOLL_WRITE) ? - (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= (gup_flags & FOLL_FORCE) ? - (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - /* * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault * would be called on PROT_NONE ranges. We must never invoke @@ -1741,7 +1732,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, /* user gate pages are read-only */ if (gup_flags & FOLL_WRITE) - return i ? : -EFAULT; + goto efault; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); else @@ -1751,12 +1742,12 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) - return i ? : -EFAULT; + goto efault; VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); - return i ? : -EFAULT; + goto efault; } vma = get_gate_vma(mm); if (pages) { @@ -1769,7 +1760,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, page = pte_page(*pte); else { pte_unmap(pte); - return i ? : -EFAULT; + goto efault; } } pages[i] = page; @@ -1780,10 +1771,42 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, goto next_page; } - if (!vma || - (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - !(vm_flags & vma->vm_flags)) - return i ? : -EFAULT; + if (!vma) + goto efault; + vm_flags = vma->vm_flags; + if (vm_flags & (VM_IO | VM_PFNMAP)) + goto efault; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * We used to let the write,force case do COW + * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so + * ptrace could set a breakpoint in a read-only + * mapping of an executable, without corrupting + * the file (yet only when that file had been + * opened for writing!). Anon pages in shared + * mappings are surprising: now just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + goto efault; + } + } + } else { + if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * Is there actually any vma we can reach here + * which does not have VM_MAYREAD set? + */ + if (!(vm_flags & VM_MAYREAD)) + goto efault; + } + } if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, @@ -1837,7 +1860,7 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return -EFAULT; } if (ret & VM_FAULT_SIGBUS) - return i ? i : -EFAULT; + goto efault; BUG(); } @@ -1895,6 +1918,8 @@ next_page: } while (nr_pages && start < vma->vm_end); } while (nr_pages); return i; +efault: + return i ? : -EFAULT; } EXPORT_SYMBOL(__get_user_pages); @@ -1962,9 +1987,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, * @start: starting user address * @nr_pages: number of pages from start to pin * @write: whether pages will be written to by the caller - * @force: whether to force write access even if user mapping is - * readonly. This will result in the page being COWed even - * in MAP_SHARED mappings. You do not want this. + * @force: whether to force access even when user mapping is currently + * protected (but never forces write access to shared mapping). * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. Or NULL, if caller * only intends to ensure the pages are faulted in. @@ -2587,6 +2611,38 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo } /* + * Notify the address space that the page is about to become writable so that + * it can prohibit this or wait for the page to get into an appropriate state. + * + * We do this without the lock held, so that it can sleep if it needs to. + */ +static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, + unsigned long address) +{ + struct vm_fault vmf; + int ret; + + vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.pgoff = page->index; + vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + vmf.page = page; + + ret = vma->vm_ops->page_mkwrite(vma, &vmf); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + return ret; + if (unlikely(!(ret & VM_FAULT_LOCKED))) { + lock_page(page); + if (!page->mapping) { + unlock_page(page); + return 0; /* retry */ + } + ret |= VM_FAULT_LOCKED; + } else + VM_BUG_ON_PAGE(!PageLocked(page), page); + return ret; +} + +/* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. @@ -2668,42 +2724,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { - struct vm_fault vmf; int tmp; - - vmf.virtual_address = (void __user *)(address & - PAGE_MASK); - vmf.pgoff = old_page->index; - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - vmf.page = old_page; - - /* - * Notify the address space that the page is about to - * become writable so that it can prohibit this or wait - * for the page to get into an appropriate state. - * - * We do this without the lock held, so that it can - * sleep if it needs to. - */ page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); - - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; + tmp = do_page_mkwrite(vma, old_page, address); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(old_page); + return tmp; } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(old_page); - if (!old_page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(old_page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(old_page), old_page); - /* * Since we dropped the lock we need to revalidate * the PTE as someone else may have changed it. If @@ -2748,7 +2777,7 @@ reuse: * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * - * __do_fault is protected similarly. + * do_shared_fault is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); @@ -2892,10 +2921,6 @@ oom: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM; - -unwritable_page: - page_cache_release(old_page); - return ret; } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -3286,53 +3311,11 @@ oom: return VM_FAULT_OOM; } -/* - * __do_fault() tries to create a new page mapping. It aggressively - * tries to share with existing pages, but makes a separate copy if - * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid - * the next page fault. - * - * As this is called only for pages that do not currently exist, we - * do not need to flush old virtual caches or the TLB. - * - * We enter with non-exclusive mmap_sem (to exclude vma changes, - * but allow concurrent faults), and pte neither mapped nor locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. - */ -static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pmd_t *pmd, - pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +static int __do_fault(struct vm_area_struct *vma, unsigned long address, + pgoff_t pgoff, unsigned int flags, struct page **page) { - pte_t *page_table; - spinlock_t *ptl; - struct page *page; - struct page *cow_page; - pte_t entry; - int anon = 0; - struct page *dirty_page = NULL; struct vm_fault vmf; int ret; - int page_mkwrite = 0; - - /* - * If we do COW later, allocate page befor taking lock_page() - * on the file cache page. This will reduce lock holding time. - */ - if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - - if (unlikely(anon_vma_prepare(vma))) - return VM_FAULT_OOM; - - cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!cow_page) - return VM_FAULT_OOM; - - if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { - page_cache_release(cow_page); - return VM_FAULT_OOM; - } - } else - cow_page = NULL; vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; @@ -3340,150 +3323,176 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | - VM_FAULT_RETRY))) - goto uncharge_out; + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - ret = VM_FAULT_HWPOISON; - goto uncharge_out; + page_cache_release(vmf.page); + return VM_FAULT_HWPOISON; } - /* - * For consistency in subsequent calls, make the faulted page always - * locked. - */ if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf.page); else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); - /* - * Should we do an early C-O-W break? - */ - page = vmf.page; - if (flags & FAULT_FLAG_WRITE) { - if (!(vma->vm_flags & VM_SHARED)) { - page = cow_page; - anon = 1; - copy_user_highpage(page, vmf.page, address, vma); - __SetPageUptodate(page); - } else { - /* - * If the page will be shareable, see if the backing - * address space wants to know that the page is about - * to become writable - */ - if (vma->vm_ops->page_mkwrite) { - int tmp; - - unlock_page(page); - vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; - tmp = vma->vm_ops->page_mkwrite(vma, &vmf); - if (unlikely(tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - ret = tmp; - goto unwritable_page; - } - if (unlikely(!(tmp & VM_FAULT_LOCKED))) { - lock_page(page); - if (!page->mapping) { - ret = 0; /* retry the fault */ - unlock_page(page); - goto unwritable_page; - } - } else - VM_BUG_ON_PAGE(!PageLocked(page), page); - page_mkwrite = 1; - } - } + *page = vmf.page; + return ret; +} + +static void do_set_pte(struct vm_area_struct *vma, unsigned long address, + struct page *page, pte_t *pte, bool write, bool anon) +{ + pte_t entry; + flush_icache_page(vma, page); + entry = mk_pte(page, vma->vm_page_prot); + if (write) + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) + pte_mksoft_dirty(entry); + if (anon) { + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, address); + } else { + inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES); + page_add_file_rmap(page); } + set_pte_at(vma->vm_mm, address, pte, entry); - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache(vma, address, pte); +} - /* - * This silly early PAGE_DIRTY setting removes a race - * due to the bad i386 page protection. But it's valid - * for other architectures too. - * - * Note that if FAULT_FLAG_WRITE is set, we either now have - * an exclusive copy of the page, or this is a shared mapping, - * so we can make it writable and dirty to avoid having to - * handle that later. - */ - /* Only go through if we didn't race with anybody else... */ - if (likely(pte_same(*page_table, orig_pte))) { - flush_icache_page(vma, page); - entry = mk_pte(page, vma->vm_page_prot); - if (flags & FAULT_FLAG_WRITE) - entry = maybe_mkwrite(pte_mkdirty(entry), vma); - else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) - pte_mksoft_dirty(entry); - if (anon) { - inc_mm_counter_fast(mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, address); - } else { - inc_mm_counter_fast(mm, MM_FILEPAGES); - page_add_file_rmap(page); - if (flags & FAULT_FLAG_WRITE) { - dirty_page = page; - get_page(dirty_page); - } - } - set_pte_at(mm, address, page_table, entry); +static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + spinlock_t *ptl; + pte_t *pte; + int ret; - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, page_table); - } else { - if (cow_page) - mem_cgroup_uncharge_page(cow_page); - if (anon) - page_cache_release(page); - else - anon = 1; /* no anon but release faulted_page */ + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; } + do_set_pte(vma, address, fault_page, pte, false, false); + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + return ret; +} - pte_unmap_unlock(page_table, ptl); +static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page, *new_page; + spinlock_t *ptl; + pte_t *pte; + int ret; - if (dirty_page) { - struct address_space *mapping = page->mapping; - int dirtied = 0; + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; - if (set_page_dirty(dirty_page)) - dirtied = 1; - unlock_page(dirty_page); - put_page(dirty_page); - if ((dirtied || page_mkwrite) && mapping) { - /* - * Some device drivers do not set page.mapping but still - * dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + return VM_FAULT_OOM; - /* file_update_time outside page_lock */ - if (vma->vm_file && !page_mkwrite) - file_update_time(vma->vm_file); - } else { - unlock_page(vmf.page); - if (anon) - page_cache_release(vmf.page); + if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) { + page_cache_release(new_page); + return VM_FAULT_OOM; } - return ret; + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + goto uncharge_out; -unwritable_page: - page_cache_release(page); + copy_user_highpage(new_page, fault_page, address, vma); + __SetPageUptodate(new_page); + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + goto uncharge_out; + } + do_set_pte(vma, address, new_page, pte, true, true); + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); return ret; uncharge_out: - /* fs's fault handler get error */ - if (cow_page) { - mem_cgroup_uncharge_page(cow_page); - page_cache_release(cow_page); + mem_cgroup_uncharge_page(new_page); + page_cache_release(new_page); + return ret; +} + +static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pgoff_t pgoff, unsigned int flags, pte_t orig_pte) +{ + struct page *fault_page; + struct address_space *mapping; + spinlock_t *ptl; + pte_t *pte; + int dirtied = 0; + int ret, tmp; + + ret = __do_fault(vma, address, pgoff, flags, &fault_page); + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) + return ret; + + /* + * Check if the backing address space wants to know that the page is + * about to become writable + */ + if (vma->vm_ops->page_mkwrite) { + unlock_page(fault_page); + tmp = do_page_mkwrite(vma, fault_page, address); + if (unlikely(!tmp || + (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + page_cache_release(fault_page); + return tmp; + } } + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + if (unlikely(!pte_same(*pte, orig_pte))) { + pte_unmap_unlock(pte, ptl); + unlock_page(fault_page); + page_cache_release(fault_page); + return ret; + } + do_set_pte(vma, address, fault_page, pte, true, false); + pte_unmap_unlock(pte, ptl); + + if (set_page_dirty(fault_page)) + dirtied = 1; + mapping = fault_page->mapping; + unlock_page(fault_page); + if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { + /* + * Some device drivers do not set page.mapping but still + * dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + + /* file_update_time outside page_lock */ + if (vma->vm_file && !vma->vm_ops->page_mkwrite) + file_update_time(vma->vm_file); + return ret; } @@ -3495,7 +3504,13 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pte_unmap(page_table); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } /* @@ -3527,10 +3542,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } pgoff = pte_to_pgoff(orig_pte); - return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); + if (!(flags & FAULT_FLAG_WRITE)) + return do_read_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + if (!(vma->vm_flags & VM_SHARED)) + return do_cow_fault(mm, vma, address, pmd, pgoff, flags, + orig_pte); + return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); } -int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, +static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags) { @@ -3545,7 +3566,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t *ptep, pmd_t *pmd) { struct page *page = NULL; @@ -3703,7 +3724,6 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); -retry: pgd = pgd_offset(mm, address); pud = pud_alloc(mm, pgd, address); if (!pud) @@ -3741,20 +3761,13 @@ retry: if (dirty && !pmd_write(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); - /* - * If COW results in an oom, the huge pmd will - * have been split, so retry the fault on the - * pte for a smaller charge. - */ - if (unlikely(ret & VM_FAULT_OOM)) - goto retry; - return ret; + if (!(ret & VM_FAULT_FALLBACK)) + return ret; } else { huge_pmd_set_accessed(mm, vma, address, pmd, orig_pmd, dirty); + return 0; } - - return 0; } } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ae3c8f3595d4..e3ab02822799 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1556,10 +1556,10 @@ SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, #ifdef CONFIG_COMPAT -asmlinkage long compat_sys_get_mempolicy(int __user *policy, - compat_ulong_t __user *nmask, - compat_ulong_t maxnode, - compat_ulong_t addr, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, + compat_ulong_t, addr, compat_ulong_t, flags) { long err; unsigned long __user *nm = NULL; @@ -1586,8 +1586,8 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, return err; } -asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode) +COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode) { long err = 0; unsigned long __user *nm = NULL; @@ -1609,9 +1609,9 @@ asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask, return sys_set_mempolicy(mode, nm, nr_bits+1); } -asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, - compat_ulong_t mode, compat_ulong_t __user *nmask, - compat_ulong_t maxnode, compat_ulong_t flags) +COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, + compat_ulong_t, mode, compat_ulong_t __user *, nmask, + compat_ulong_t, maxnode, compat_ulong_t, flags) { long err = 0; unsigned long __user *nm = NULL; @@ -1899,7 +1899,7 @@ int node_random(const nodemask_t *maskp) * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * - * Must be protected by get_mems_allowed() + * Must be protected by read_mems_allowed_begin() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -2063,7 +2063,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, retry_cpuset: pol = get_vma_policy(current, vma, addr); - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; @@ -2071,7 +2071,7 @@ retry_cpuset: nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2081,7 +2081,7 @@ retry_cpuset: policy_nodemask(gfp, pol)); if (unlikely(mpol_needs_cond_ref(pol))) __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; } @@ -2115,7 +2115,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) pol = &default_policy; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* * No reference counting needed for current->mempolicy @@ -2128,7 +2128,7 @@ retry_cpuset: policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return page; @@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -#ifdef CONFIG_NUMA_BALANCING -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - /* Never defer a private fault */ - if (cpupid_match_pid(p, last_cpupid)) - return false; - - if (p->numa_migrate_deferred) { - p->numa_migrate_deferred--; - return true; - } - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; -} -#else -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_cpupid; - int this_cpupid; - polnid = thisnid; - this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); - - /* - * Multi-stage node selection is used in conjunction - * with a periodic migration fault to build a temporal - * task<->page relation. By using a two-stage filter we - * remove short/unlikely relations. - * - * Using P(p) ~ n_p / n_t as per frequentist - * probability, we can equate a task's usage of a - * particular page (n_p) per total usage of this - * page (n_t) (in a given time-span) to a probability. - * - * Our periodic faults will sample this probability and - * getting the same result twice in a row, given these - * samples are fully independent, is then given by - * P(n)^2, provided our sample period is sufficiently - * short compared to the usage pattern. - * - * This quadric squishes small probabilities, making - * it less likely we act on an unlikely task<->page - * relation. - */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { - /* See sysctl_numa_balancing_migrate_deferred comment */ - if (!cpupid_match_pid(current, last_cpupid)) - defer_numa_migrate(current); - - goto out; - } - - /* - * The quadratic filter above reduces extraneous migration - * of shared pages somewhat. This code reduces it even more, - * reducing the overhead of page migrations of shared pages. - * This makes workloads with shared pages rely more on - * "move task near its memory", and less on "move memory - * towards its task", which is exactly what we want. - */ - if (numa_migrate_deferred(current, last_cpupid)) + if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index 482a33d89134..bed48809e5d0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -178,6 +178,37 @@ out: } /* + * Congratulations to trinity for discovering this bug. + * mm/fremap.c's remap_file_pages() accepts any range within a single vma to + * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then + * replace the specified range by file ptes throughout (maybe populated after). + * If page migration finds a page within that range, while it's still located + * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem: + * zap_pte() clears the temporary migration entry before mmap_sem is dropped. + * But if the migrating page is in a part of the vma outside the range to be + * remapped, then it will not be cleared, and remove_migration_ptes() needs to + * deal with it. Fortunately, this part of the vma is of course still linear, + * so we just need to use linear location on the nonlinear list. + */ +static int remove_linear_migration_ptes_from_nonlinear(struct page *page, + struct address_space *mapping, void *arg) +{ + struct vm_area_struct *vma; + /* hugetlbfs does not support remap_pages, so no huge pgoff worries */ + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + unsigned long addr; + + list_for_each_entry(vma, + &mapping->i_mmap_nonlinear, shared.nonlinear) { + + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (addr >= vma->vm_start && addr < vma->vm_end) + remove_migration_pte(page, vma, addr, arg); + } + return SWAP_AGAIN; +} + +/* * Get rid of all migration entries and replace them by * references to the indicated page. */ @@ -186,6 +217,7 @@ static void remove_migration_ptes(struct page *old, struct page *new) struct rmap_walk_control rwc = { .rmap_one = remove_migration_pte, .arg = old, + .file_nonlinear = remove_linear_migration_ptes_from_nonlinear, }; rmap_walk(new, &rwc); @@ -1158,7 +1190,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, pm->node); else return alloc_pages_exact_node(pm->node, - GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); + GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0); } /* @@ -1544,9 +1576,9 @@ static struct page *alloc_misplaced_dst_page(struct page *page, struct page *newpage; newpage = alloc_pages_exact_node(nid, - (GFP_HIGHUSER_MOVABLE | GFP_THISNODE | - __GFP_NOMEMALLOC | __GFP_NORETRY | - __GFP_NOWARN) & + (GFP_HIGHUSER_MOVABLE | + __GFP_THISNODE | __GFP_NOMEMALLOC | + __GFP_NORETRY | __GFP_NOWARN) & ~GFP_IOFS, 0); return newpage; @@ -1747,7 +1779,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, goto out_dropref; new_page = alloc_pages_node(node, - (GFP_TRANSHUGE | GFP_THISNODE) & ~__GFP_WAIT, HPAGE_PMD_ORDER); + (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT, + HPAGE_PMD_ORDER); if (!new_page) goto out_fail; diff --git a/mm/mincore.c b/mm/mincore.c index 101623378fbf..725c80961048 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -70,13 +70,21 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. */ - page = find_get_page(mapping, pgoff); #ifdef CONFIG_SWAP - /* shmem/tmpfs may return swap: account for swapcache page too. */ - if (radix_tree_exceptional_entry(page)) { - swp_entry_t swap = radix_to_swp_entry(page); - page = find_get_page(swap_address_space(swap), swap.val); - } + if (shmem_mapping(mapping)) { + page = find_get_entry(mapping, pgoff); + /* + * shmem/tmpfs may return swap: account for swapcache + * page too. + */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swp = radix_to_swp_entry(page); + page = find_get_page(swap_address_space(swp), swp.val); + } + } else + page = find_get_page(mapping, pgoff); +#else + page = find_get_page(mapping, pgoff); #endif if (page) { present = PageUptodate(page); diff --git a/mm/mmap.c b/mm/mmap.c index 20ff0c33274c..46433e137abc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -405,7 +405,7 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) } } -void validate_mm(struct mm_struct *mm) +static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; @@ -1299,7 +1299,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, /* * Make sure there are no mandatory locks on the file. */ - if (locks_verify_locked(inode)) + if (locks_verify_locked(file)) return -EAGAIN; vm_flags |= VM_SHARED | VM_MAYSHARE; @@ -2918,7 +2918,7 @@ static const struct vm_operations_struct special_mapping_vmops = { * The array pointer and the pages it points to are assumed to stay alive * for as long as this mapping might exist. */ -int install_special_mapping(struct mm_struct *mm, +struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { @@ -2927,7 +2927,7 @@ int install_special_mapping(struct mm_struct *mm, vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (unlikely(vma == NULL)) - return -ENOMEM; + return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; @@ -2948,11 +2948,23 @@ int install_special_mapping(struct mm_struct *mm, perf_event_mmap(vma); - return 0; + return vma; out: kmem_cache_free(vm_area_cachep, vma); - return ret; + return ERR_PTR(ret); +} + +int install_special_mapping(struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, struct page **pages) +{ + struct vm_area_struct *vma = _install_special_mapping(mm, + addr, len, vm_flags, pages); + + if (IS_ERR(vma)) + return PTR_ERR(vma); + return 0; } static DEFINE_MUTEX(mm_all_locks_mutex); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 8a8cd0265e52..f802c2d216a7 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -31,6 +31,9 @@ void use_mm(struct mm_struct *mm) tsk->mm = mm; switch_mm(active_mm, mm, tsk); task_unlock(tsk); +#ifdef finish_arch_post_lock_switch + finish_arch_post_lock_switch(); +#endif if (active_mm != mm) mmdrop(active_mm); diff --git a/mm/nobootmem.c b/mm/nobootmem.c index f73f2987a852..04a9d94333a5 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -334,7 +334,7 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0); } -void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, +static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { diff --git a/mm/nommu.c b/mm/nommu.c index 8740213b1647..a554e5a451cd 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -995,7 +995,7 @@ static int validate_mmap_request(struct file *file, (file->f_mode & FMODE_WRITE)) return -EACCES; - if (locks_verify_locked(file_inode(file))) + if (locks_verify_locked(file)) return -EAGAIN; if (!(capabilities & BDI_CAP_MAP_DIRECT)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e3758a09a009..979378deccbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -369,9 +369,11 @@ void prep_compound_page(struct page *page, unsigned long order) __SetPageHead(page); for (i = 1; i < nr_pages; i++) { struct page *p = page + i; - __SetPageTail(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } @@ -1236,6 +1238,15 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) } local_irq_restore(flags); } +static bool gfp_thisnode_allocation(gfp_t gfp_mask) +{ + return (gfp_mask & GFP_THISNODE) == GFP_THISNODE; +} +#else +static bool gfp_thisnode_allocation(gfp_t gfp_mask) +{ + return false; +} #endif /* @@ -1572,7 +1583,13 @@ again: get_pageblock_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + /* + * NOTE: GFP_THISNODE allocations do not partake in the kswapd + * aging protocol, so they can't be fair. + */ + if (!gfp_thisnode_allocation(gfp_flags)) + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -1944,8 +1961,12 @@ zonelist_scan: * ultimately fall back to remote zones that do not * partake in the fairness round-robin cycle of this * zonelist. + * + * NOTE: GFP_THISNODE allocations do not partake in + * the kswapd aging protocol, so they can't be fair. */ - if (alloc_flags & ALLOC_WMARK_LOW) { + if ((alloc_flags & ALLOC_WMARK_LOW) && + !gfp_thisnode_allocation(gfp_mask)) { if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) continue; if (!zone_local(preferred_zone, zone)) @@ -2501,8 +2522,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (IS_ENABLED(CONFIG_NUMA) && - (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (gfp_thisnode_allocation(gfp_mask)) goto nopage; restart: @@ -2719,7 +2739,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, return NULL; retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, @@ -2757,7 +2777,7 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. */ - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; memcg_kmem_commit_charge(page, memcg, order); @@ -3025,9 +3045,9 @@ bool skip_free_areas_node(unsigned int flags, int nid) goto out; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); ret = !node_isset(nid, cpuset_current_mems_allowed); - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); out: return ret; } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index cfd162882c00..3708264d2833 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -175,7 +175,7 @@ static void free_page_cgroup(void *addr) } } -void __free_page_cgroup(unsigned long pfn) +static void __free_page_cgroup(unsigned long pfn) { struct mem_section *ms; struct page_cgroup *base; @@ -188,9 +188,9 @@ void __free_page_cgroup(unsigned long pfn) ms->page_cgroup = NULL; } -int __meminit online_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, - int nid) +static int __meminit online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) { unsigned long start, end, pfn; int fail = 0; @@ -223,8 +223,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, return -ENOMEM; } -int __meminit offline_page_cgroup(unsigned long start_pfn, - unsigned long nr_pages, int nid) +static int __meminit offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) { unsigned long start, end, pfn; diff --git a/mm/percpu.c b/mm/percpu.c index 036cfe07050f..63e24fb4387b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -102,10 +102,11 @@ struct pcpu_chunk { int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ void *base_addr; /* base address of this chunk */ - int map_used; /* # of map entries used */ + int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ void *data; /* chunk data */ + int first_free; /* no free below this */ bool immutable; /* no [de]population allowed */ unsigned long populated[]; /* populated bitmap */ }; @@ -356,11 +357,11 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk) { int new_alloc; - if (chunk->map_alloc >= chunk->map_used + 2) + if (chunk->map_alloc >= chunk->map_used + 3) return 0; new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 2) + while (new_alloc < chunk->map_used + 3) new_alloc *= 2; return new_alloc; @@ -418,48 +419,6 @@ out_unlock: } /** - * pcpu_split_block - split a map block - * @chunk: chunk of interest - * @i: index of map block to split - * @head: head size in bytes (can be 0) - * @tail: tail size in bytes (can be 0) - * - * Split the @i'th map block into two or three blocks. If @head is - * non-zero, @head bytes block is inserted before block @i moving it - * to @i+1 and reducing its size by @head bytes. - * - * If @tail is non-zero, the target block, which can be @i or @i+1 - * depending on @head, is reduced by @tail bytes and @tail byte block - * is inserted after the target block. - * - * @chunk->map must have enough free slots to accommodate the split. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_split_block(struct pcpu_chunk *chunk, int i, - int head, int tail) -{ - int nr_extra = !!head + !!tail; - - BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra); - - /* insert new subblocks */ - memmove(&chunk->map[i + nr_extra], &chunk->map[i], - sizeof(chunk->map[0]) * (chunk->map_used - i)); - chunk->map_used += nr_extra; - - if (head) { - chunk->map[i + 1] = chunk->map[i] - head; - chunk->map[i++] = head; - } - if (tail) { - chunk->map[i++] -= tail; - chunk->map[i] = tail; - } -} - -/** * pcpu_alloc_area - allocate area from a pcpu_chunk * @chunk: chunk of interest * @size: wanted size in bytes @@ -483,19 +442,27 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) int oslot = pcpu_chunk_slot(chunk); int max_contig = 0; int i, off; + bool seen_free = false; + int *p; - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { - bool is_last = i + 1 == chunk->map_used; + for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) { int head, tail; + int this_size; + + off = *p; + if (off & 1) + continue; /* extra for alignment requirement */ head = ALIGN(off, align) - off; - BUG_ON(i == 0 && head != 0); - if (chunk->map[i] < 0) - continue; - if (chunk->map[i] < head + size) { - max_contig = max(chunk->map[i], max_contig); + this_size = (p[1] & ~1) - off; + if (this_size < head + size) { + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + max_contig = max(this_size, max_contig); continue; } @@ -505,44 +472,59 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) * than sizeof(int), which is very small but isn't too * uncommon for percpu allocations. */ - if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { - if (chunk->map[i - 1] > 0) - chunk->map[i - 1] += head; - else { - chunk->map[i - 1] -= head; + if (head && (head < sizeof(int) || !(p[-1] & 1))) { + *p = off += head; + if (p[-1] & 1) chunk->free_size -= head; - } - chunk->map[i] -= head; - off += head; + else + max_contig = max(*p - p[-1], max_contig); + this_size -= head; head = 0; } /* if tail is small, just keep it around */ - tail = chunk->map[i] - head - size; - if (tail < sizeof(int)) + tail = this_size - head - size; + if (tail < sizeof(int)) { tail = 0; + size = this_size - head; + } /* split if warranted */ if (head || tail) { - pcpu_split_block(chunk, i, head, tail); + int nr_extra = !!head + !!tail; + + /* insert new subblocks */ + memmove(p + nr_extra + 1, p + 1, + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + if (head) { - i++; - off += head; - max_contig = max(chunk->map[i - 1], max_contig); + if (!seen_free) { + chunk->first_free = i; + seen_free = true; + } + *++p = off += head; + ++i; + max_contig = max(head, max_contig); + } + if (tail) { + p[1] = off + size; + max_contig = max(tail, max_contig); } - if (tail) - max_contig = max(chunk->map[i + 1], max_contig); } + if (!seen_free) + chunk->first_free = i + 1; + /* update hint and mark allocated */ - if (is_last) + if (i + 1 == chunk->map_used) chunk->contig_hint = max_contig; /* fully scanned */ else chunk->contig_hint = max(chunk->contig_hint, max_contig); - chunk->free_size -= chunk->map[i]; - chunk->map[i] = -chunk->map[i]; + chunk->free_size -= size; + *p |= 1; pcpu_chunk_relocate(chunk, oslot); return off; @@ -570,34 +552,50 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) { int oslot = pcpu_chunk_slot(chunk); - int i, off; - - for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) - if (off == freeme) - break; + int off = 0; + unsigned i, j; + int to_free = 0; + int *p; + + freeme |= 1; /* we are searching for <given offset, in use> pair */ + + i = 0; + j = chunk->map_used; + while (i != j) { + unsigned k = (i + j) / 2; + off = chunk->map[k]; + if (off < freeme) + i = k + 1; + else if (off > freeme) + j = k; + else + i = j = k; + } BUG_ON(off != freeme); - BUG_ON(chunk->map[i] > 0); - chunk->map[i] = -chunk->map[i]; - chunk->free_size += chunk->map[i]; + if (i < chunk->first_free) + chunk->first_free = i; + p = chunk->map + i; + *p = off &= ~1; + chunk->free_size += (p[1] & ~1) - off; + + /* merge with next? */ + if (!(p[1] & 1)) + to_free++; /* merge with previous? */ - if (i > 0 && chunk->map[i - 1] >= 0) { - chunk->map[i - 1] += chunk->map[i]; - chunk->map_used--; - memmove(&chunk->map[i], &chunk->map[i + 1], - (chunk->map_used - i) * sizeof(chunk->map[0])); + if (i > 0 && !(p[-1] & 1)) { + to_free++; i--; + p--; } - /* merge with next? */ - if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { - chunk->map[i] += chunk->map[i + 1]; - chunk->map_used--; - memmove(&chunk->map[i + 1], &chunk->map[i + 2], - (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + if (to_free) { + chunk->map_used -= to_free; + memmove(p + 1, p + 1 + to_free, + (chunk->map_used - i) * sizeof(chunk->map[0])); } - chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint); pcpu_chunk_relocate(chunk, oslot); } @@ -617,7 +615,9 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) } chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; + chunk->map[0] = 0; + chunk->map[1] = pcpu_unit_size | 1; + chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); chunk->free_size = pcpu_unit_size; @@ -713,6 +713,16 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) unsigned long flags; void __percpu *ptr; + /* + * We want the lowest bit of offset available for in-use/free + * indicator, so force >= 16bit alignment and make size even. + */ + if (unlikely(align < 2)) + align = 2; + + if (unlikely(size & 1)) + size++; + if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " "percpu allocation\n", size, align); @@ -1343,9 +1353,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -ai->static_size; + schunk->map[0] = 1; + schunk->map[1] = ai->static_size; + schunk->map_used = 1; if (schunk->free_size) - schunk->map[schunk->map_used++] = schunk->free_size; + schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size); + else + schunk->map[1] |= 1; /* init dynamic chunk if necessary */ if (dyn_size) { @@ -1358,8 +1372,10 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; - dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; - dchunk->map[dchunk->map_used++] = dchunk->free_size; + dchunk->map[0] = 1; + dchunk->map[1] = pcpu_reserved_chunk_limit; + dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; + dchunk->map_used = 2; } /* link the first chunk in */ diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index fd26d0433509..cb79065c19e5 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -412,7 +412,7 @@ SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, #ifdef CONFIG_COMPAT -asmlinkage ssize_t +static ssize_t compat_process_vm_rw(compat_pid_t pid, const struct compat_iovec __user *lvec, unsigned long liovcnt, @@ -456,25 +456,23 @@ free_iovecs: return rc; } -asmlinkage ssize_t -compat_sys_process_vm_readv(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_readv, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); } -asmlinkage ssize_t -compat_sys_process_vm_writev(compat_pid_t pid, - const struct compat_iovec __user *lvec, - unsigned long liovcnt, - const struct compat_iovec __user *rvec, - unsigned long riovcnt, - unsigned long flags) +COMPAT_SYSCALL_DEFINE6(process_vm_writev, compat_pid_t, pid, + const struct compat_iovec __user *, lvec, + compat_ulong_t, liovcnt, + const struct compat_iovec __user *, rvec, + compat_ulong_t, riovcnt, + compat_ulong_t, flags) { return compat_process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); diff --git a/mm/readahead.c b/mm/readahead.c index 0de2360d65f3..29c5e1af5a0c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -179,7 +179,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); rcu_read_unlock(); - if (page) + if (page && !radix_tree_exceptional_entry(page)) continue; page = page_cache_alloc_readahead(mapping); @@ -233,14 +233,14 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, return 0; } +#define MAX_READAHEAD ((512*4096)/PAGE_CACHE_SIZE) /* * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a * sensible upper limit. */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) - + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); + return min(nr, MAX_READAHEAD); } /* @@ -347,7 +347,7 @@ static pgoff_t count_history_pages(struct address_space *mapping, pgoff_t head; rcu_read_lock(); - head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); + head = page_cache_prev_hole(mapping, offset - 1, max); rcu_read_unlock(); return offset - 1 - head; @@ -427,7 +427,7 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); + start = page_cache_next_hole(mapping, offset + 1, max); rcu_read_unlock(); if (!start || start - offset > max) diff --git a/mm/rmap.c b/mm/rmap.c index d9d42316a99a..11cf322f8133 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1165,6 +1165,16 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (pte_unused(pteval)) { + /* + * The guest indicated that the page content is of no + * interest anymore. Simply discard the pte, vmscan + * will take care of the rest. + */ + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; pte_t swp_pte; @@ -1360,8 +1370,9 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, } static int try_to_unmap_nonlinear(struct page *page, - struct address_space *mapping, struct vm_area_struct *vma) + struct address_space *mapping, void *arg) { + struct vm_area_struct *vma; int ret = SWAP_AGAIN; unsigned long cursor; unsigned long max_nl_cursor = 0; @@ -1663,7 +1674,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) if (list_empty(&mapping->i_mmap_nonlinear)) goto done; - ret = rwc->file_nonlinear(page, mapping, vma); + ret = rwc->file_nonlinear(page, mapping, rwc->arg); done: mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/shmem.c b/mm/shmem.c index 1f18c9d0d93e..a3ba988ec946 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -242,19 +242,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { void **pslot; - void *item = NULL; + void *item; VM_BUG_ON(!expected); + VM_BUG_ON(!replacement); pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (pslot) - item = radix_tree_deref_slot_protected(pslot, - &mapping->tree_lock); + if (!pslot) + return -ENOENT; + item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - if (replacement) - radix_tree_replace_slot(pslot, replacement); - else - radix_tree_delete(&mapping->page_tree, index); + radix_tree_replace_slot(pslot, replacement); return 0; } @@ -331,84 +329,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap) } /* - * Like find_get_pages, but collecting swap entries as well as pages. - */ -static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, - pgoff_t start, unsigned int nr_pages, - struct page **pages, pgoff_t *indices) -{ - void **slot; - unsigned int ret = 0; - struct radix_tree_iter iter; - - if (!nr_pages) - return 0; - - rcu_read_lock(); -restart: - radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { - struct page *page; -repeat: - page = radix_tree_deref_slot(slot); - if (unlikely(!page)) - continue; - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) - goto restart; - /* - * Otherwise, we must be storing a swap entry - * here as an exceptional entry: so return it - * without attempting to raise page count. - */ - goto export; - } - if (!page_cache_get_speculative(page)) - goto repeat; - - /* Has the page moved? */ - if (unlikely(page != *slot)) { - page_cache_release(page); - goto repeat; - } -export: - indices[ret] = iter.index; - pages[ret] = page; - if (++ret == nr_pages) - break; - } - rcu_read_unlock(); - return ret; -} - -/* * Remove swap entry from radix tree, free the swap and its page cache. */ static int shmem_free_swap(struct address_space *mapping, pgoff_t index, void *radswap) { - int error; + void *old; spin_lock_irq(&mapping->tree_lock); - error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + old = radix_tree_delete_item(&mapping->page_tree, index, radswap); spin_unlock_irq(&mapping->tree_lock); - if (!error) - free_swap_and_cache(radix_to_swp_entry(radswap)); - return error; -} - -/* - * Pagevec may contain swap entries, so shuffle up pages before releasing. - */ -static void shmem_deswap_pagevec(struct pagevec *pvec) -{ - int i, j; - - for (i = 0, j = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - if (!radix_tree_exceptional_entry(page)) - pvec->pages[j++] = page; - } - pvec->nr = j; + if (old != radswap) + return -ENOENT; + free_swap_and_cache(radix_to_swp_entry(radswap)); + return 0; } /* @@ -429,12 +363,12 @@ void shmem_unlock_mapping(struct address_space *mapping) * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it * has finished, if it hits a row of PAGEVEC_SIZE swap entries. */ - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + PAGEVEC_SIZE, pvec.pages, indices); if (!pvec.nr) break; index = indices[pvec.nr - 1] + 1; - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); check_move_unevictable_pages(pvec.pages, pvec.nr); pagevec_release(&pvec); cond_resched(); @@ -466,9 +400,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pagevec_init(&pvec, 0); index = start; while (index < end) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.nr = find_get_entries(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + pvec.pages, indices); if (!pvec.nr) break; mem_cgroup_uncharge_start(); @@ -497,7 +431,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -535,9 +469,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index = start; for ( ; ; ) { cond_resched(); - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + + pvec.nr = find_get_entries(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); + pvec.pages, indices); if (!pvec.nr) { if (index == start || unfalloc) break; @@ -545,7 +480,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; } if ((index == start || unfalloc) && indices[0] >= end) { - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -574,7 +509,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } unlock_page(page); } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -1080,7 +1015,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return -EFBIG; repeat: swap.val = 0; - page = find_lock_page(mapping, index); + page = find_lock_entry(mapping, index); if (radix_tree_exceptional_entry(page)) { swap = radix_to_swp_entry(page); page = NULL; @@ -1417,6 +1352,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } +bool shmem_mapping(struct address_space *mapping) +{ + return mapping->backing_dev_info == &shmem_backing_dev_info; +} + #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; @@ -1729,7 +1669,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, pagevec_init(&pvec, 0); pvec.nr = 1; /* start small: we may be there already */ while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + pvec.nr = find_get_entries(mapping, index, pvec.nr, pvec.pages, indices); if (!pvec.nr) { if (whence == SEEK_DATA) @@ -1756,7 +1696,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, break; } } - shmem_deswap_pagevec(&pvec); + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); pvec.nr = PAGEVEC_SIZE; cond_resched(); diff --git a/mm/slab.c b/mm/slab.c index b264214c77ea..9153c802e2fe 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3073,7 +3073,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(slab_node(), flags); retry: @@ -3131,7 +3131,7 @@ retry: } } - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + if (unlikely(!obj && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index 25f14ad8f817..fe6d7be22ef0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1684,7 +1684,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); + cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1696,19 +1696,17 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, object = get_partial_node(s, n, c, flags); if (object) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. It's a - * harmless race between the alloc - * and the cpuset update. + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); return object; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } @@ -3239,8 +3237,9 @@ int __kmem_cache_shutdown(struct kmem_cache *s) if (!rc) { /* - * We do the same lock strategy around sysfs_slab_add, see - * __kmem_cache_create. Because this is pretty much the last + * Since slab_attr_store may take the slab_mutex, we should + * release the lock while removing the sysfs entry in order to + * avoid a deadlock. Because this is pretty much the last * operation we do and the lock will be released shortly after * that in slab_common.c, we could just move sysfs_slab_remove * to a later point in common code. We should do that when we @@ -3780,10 +3779,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) return 0; memcg_propagate_slab_attrs(s); - mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); - mutex_lock(&slab_mutex); - if (err) kmem_cache_close(s); diff --git a/mm/sparse.c b/mm/sparse.c index 63c3ea5c119c..38cad8fd7397 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -268,7 +268,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while - * other sections referencing the usemap retmain active. Similarly, + * other sections referencing the usemap remain active. Similarly, * a pgdat can prevent a section being removed. If section A * contains a pgdat and section B contains the usemap, both * sections become inter-dependent. This allocates usemaps diff --git a/mm/swap.c b/mm/swap.c index b31ba67d440a..9ce43ba4498b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -98,7 +98,7 @@ static void put_compound_page(struct page *page) } /* __split_huge_page_refcount can run under us */ - page_head = compound_trans_head(page); + page_head = compound_head(page); /* * THP can not break up slab pages so avoid taking @@ -253,7 +253,7 @@ bool __get_page_tail(struct page *page) */ unsigned long flags; bool got; - struct page *page_head = compound_trans_head(page); + struct page *page_head = compound_head(page); /* Ref to put_compound_page() comment. */ if (!__compound_tail_refcounted(page_head)) { @@ -574,6 +574,8 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); + if (page_is_file_cache(page)) + workingset_activation(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); } @@ -948,6 +950,57 @@ void __pagevec_lru_add(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_lru_add); /** + * pagevec_lookup_entries - gang pagecache lookup + * @pvec: Where the resulting entries are placed + * @mapping: The address_space to search + * @start: The starting entry index + * @nr_entries: The maximum number of entries + * @indices: The cache indices corresponding to the entries in @pvec + * + * pagevec_lookup_entries() will search for and return a group of up + * to @nr_entries pages and shadow entries in the mapping. All + * entries are placed in @pvec. pagevec_lookup_entries() takes a + * reference against actual pages in @pvec. + * + * The search returns a group of mapping-contiguous entries with + * ascending indexes. There may be holes in the indices due to + * not-present entries. + * + * pagevec_lookup_entries() returns the number of entries which were + * found. + */ +unsigned pagevec_lookup_entries(struct pagevec *pvec, + struct address_space *mapping, + pgoff_t start, unsigned nr_pages, + pgoff_t *indices) +{ + pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->pages, indices); + return pagevec_count(pvec); +} + +/** + * pagevec_remove_exceptionals - pagevec exceptionals pruning + * @pvec: The pagevec to prune + * + * pagevec_lookup_entries() fills both pages and exceptional radix + * tree entries into the pagevec. This function prunes all + * exceptionals from @pvec without leaving holes, so that it can be + * passed on to page-only pagevec operations. + */ +void pagevec_remove_exceptionals(struct pagevec *pvec) +{ + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; +} + +/** * pagevec_lookup - gang pagecache lookup * @pvec: Where the resulting pages are placed * @mapping: The address_space to search diff --git a/mm/truncate.c b/mm/truncate.c index 353b683afd6e..e5cc39ab0751 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -22,6 +22,45 @@ #include <linux/cleancache.h> #include "internal.h" +static void clear_exceptional_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + /* Handled by shmem itself */ + if (shmem_mapping(mapping)) + return; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrshadows--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); +unlock: + spin_unlock_irq(&mapping->tree_lock); +} /** * do_invalidatepage - invalidate part or all of a page @@ -208,11 +247,12 @@ void truncate_inode_pages_range(struct address_space *mapping, unsigned int partial_start; /* inclusive */ unsigned int partial_end; /* exclusive */ struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index; int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0) + if (mapping->nrpages == 0 && mapping->nrshadows == 0) return; /* Offsets within partial pages */ @@ -238,17 +278,23 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_init(&pvec, 0); index = start; - while (index < end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + while (index < end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -259,6 +305,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -307,14 +354,16 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE))) { + if (!pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE), + indices)) { if (index == start) break; index = start; continue; } - if (index == start && pvec.pages[0]->index >= end) { + if (index == start && indices[0] >= end) { + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } @@ -323,16 +372,22 @@ void truncate_inode_pages_range(struct address_space *mapping, struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index >= end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); index++; @@ -360,6 +415,53 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) EXPORT_SYMBOL(truncate_inode_pages); /** + * truncate_inode_pages_final - truncate *all* pages before inode dies + * @mapping: mapping to truncate + * + * Called under (and serialized by) inode->i_mutex. + * + * Filesystems have to use this in the .evict_inode path to inform the + * VM that this is the final truncate and the inode is going away. + */ +void truncate_inode_pages_final(struct address_space *mapping) +{ + unsigned long nrshadows; + unsigned long nrpages; + + /* + * Page reclaim can not participate in regular inode lifetime + * management (can't call iput()) and thus can race with the + * inode teardown. Tell it when the address space is exiting, + * so that it does not install eviction information after the + * final truncate has begun. + */ + mapping_set_exiting(mapping); + + /* + * When reclaim installs eviction entries, it increases + * nrshadows first, then decreases nrpages. Make sure we see + * this in the right order or we might miss an entry. + */ + nrpages = mapping->nrpages; + smp_rmb(); + nrshadows = mapping->nrshadows; + + if (nrpages || nrshadows) { + /* + * As truncation uses a lockless tree lookup, cycle + * the tree lock to make sure any ongoing tree + * modification that does not see AS_EXITING is + * completed before starting the final truncate. + */ + spin_lock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + + truncate_inode_pages(mapping, 0); + } +} +EXPORT_SYMBOL(truncate_inode_pages_final); + +/** * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode * @mapping: the address_space which holds the pages to invalidate * @start: the offset 'from' which to invalidate @@ -375,6 +477,7 @@ EXPORT_SYMBOL(truncate_inode_pages); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index = start; unsigned long ret; @@ -390,17 +493,23 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, */ pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + if (!trylock_page(page)) continue; WARN_ON(page->index != index); @@ -414,6 +523,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, deactivate_page(page); count += ret; } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); @@ -444,7 +554,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) goto failed; BUG_ON(page_has_private(page)); - __delete_from_page_cache(page); + __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -481,6 +591,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page) int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { + pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; pgoff_t index; int i; @@ -491,17 +602,23 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cleancache_invalidate_inode(mapping); pagevec_init(&pvec, 0); index = start; - while (index <= end && pagevec_lookup(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + indices)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ - index = page->index; + index = indices[i]; if (index > end) break; + if (radix_tree_exceptional_entry(page)) { + clear_exceptional_entry(mapping, index, page); + continue; + } + lock_page(page); WARN_ON(page->index != index); if (page->mapping != mapping) { @@ -539,6 +656,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ret = ret2; unlock_page(page); } + pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); diff --git a/mm/vmscan.c b/mm/vmscan.c index a9c74b409681..1f56a80a7c41 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -224,15 +224,15 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, unsigned long freed = 0; unsigned long long delta; long total_scan; - long max_pass; + long freeable; long nr; long new_nr; int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; - max_pass = shrinker->count_objects(shrinker, shrinkctl); - if (max_pass == 0) + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) return 0; /* @@ -244,14 +244,14 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; + delta *= freeable; do_div(delta, lru_pages + 1); total_scan += delta; if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); - total_scan = max_pass; + total_scan = freeable; } /* @@ -260,26 +260,26 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * shrinkers to return -1 all the time. This results in a large * nr being built up so when a shrink that can do some work * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in + * freeable. This is bad for sustaining a working set in * memory. * * Hence only allow the shrinker to scan the entire cache when * a large delta change is calculated directly. */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; + if (total_scan > freeable * 2) + total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); + freeable, delta, total_scan); /* * Normally, we should not scan less than batch_size objects in one @@ -292,12 +292,12 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater - * than the total number of objects on slab (max_pass), we must be + * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || - total_scan >= max_pass) { + total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); @@ -523,7 +523,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. */ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) { BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); @@ -569,10 +570,23 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swapcache_free(swap, page); } else { void (*freepage)(struct page *); + void *shadow = NULL; freepage = mapping->a_ops->freepage; - - __delete_from_page_cache(page); + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optizimation, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. + */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); @@ -595,7 +609,7 @@ cannot_free: */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page)) { + if (__remove_mapping(mapping, page, false)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -1065,7 +1079,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } - if (!mapping || !__remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -2297,7 +2311,12 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + unsigned long lru_pages = 0; bool aborted_reclaim = false; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2307,6 +2326,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; + nodes_clear(shrink.nodes_to_scan); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) @@ -2318,6 +2339,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; + + lru_pages += zone_reclaimable_pages(zone); + node_set(zone_to_nid(zone), shrink.nodes_to_scan); + if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ @@ -2354,6 +2379,20 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) shrink_zone(zone, sc); } + /* + * Don't shrink slabs when reclaiming memory from over limit cgroups + * but do shrink slab at least once when aborting reclaim for + * compaction to avoid unevenly scanning file/anon LRU pages over slab + * pages. + */ + if (global_reclaim(sc)) { + shrink_slab(&shrink, sc->nr_scanned, lru_pages); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } + } + return aborted_reclaim; } @@ -2394,13 +2433,9 @@ static bool all_unreclaimable(struct zonelist *zonelist, * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc, - struct shrink_control *shrink) + struct scan_control *sc) { unsigned long total_scanned = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct zoneref *z; - struct zone *zone; unsigned long writeback_threshold; bool aborted_reclaim; @@ -2415,32 +2450,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; aborted_reclaim = shrink_zones(zonelist, sc); - /* - * Don't shrink slabs when reclaiming memory from over limit - * cgroups but do shrink slab at least once when aborting - * reclaim for compaction to avoid unevenly scanning file/anon - * LRU pages over slab pages. - */ - if (global_reclaim(sc)) { - unsigned long lru_pages = 0; - - nodes_clear(shrink->nodes_to_scan); - for_each_zone_zonelist(zone, z, zonelist, - gfp_zone(sc->gfp_mask)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_reclaimable_pages(zone); - node_set(zone_to_nid(zone), - shrink->nodes_to_scan); - } - - shrink_slab(shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; @@ -2602,9 +2611,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .target_mem_cgroup = NULL, .nodemask = nodemask, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; /* * Do not enter reclaim if fatal signal was delivered while throttled. @@ -2618,7 +2624,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, sc.may_writepage, gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); @@ -2685,9 +2691,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't @@ -2702,7 +2705,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.may_writepage, sc.gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3337,9 +3340,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .order = 0, .priority = DEF_PRIORITY, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; @@ -3349,7 +3349,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); diff --git a/mm/vmstat.c b/mm/vmstat.c index def5dd2fbe61..197b4c4a9587 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -770,6 +770,9 @@ const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "workingset_refault", + "workingset_activate", + "workingset_nodereclaim", "nr_anon_transparent_hugepages", "nr_free_cma", "nr_dirty_threshold", @@ -810,6 +813,9 @@ const char * const vmstat_text[] = { "pgrotated", + "drop_pagecache", + "drop_slab", + #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", "numa_huge_pte_updates", diff --git a/mm/workingset.c b/mm/workingset.c new file mode 100644 index 000000000000..f7216fa7da27 --- /dev/null +++ b/mm/workingset.c @@ -0,0 +1,414 @@ +/* + * Workingset detection + * + * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner + */ + +#include <linux/memcontrol.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/fs.h> +#include <linux/mm.h> + +/* + * Double CLOCK lists + * + * Per zone, two clock lists are maintained for file pages: the + * inactive and the active list. Freshly faulted pages start out at + * the head of the inactive list and page reclaim scans pages from the + * tail. Pages that are accessed multiple times on the inactive list + * are promoted to the active list, to protect them from reclaim, + * whereas active pages are demoted to the inactive list when the + * active list grows too big. + * + * fault ------------------------+ + * | + * +--------------+ | +-------------+ + * reclaim <- | inactive | <-+-- demotion | active | <--+ + * +--------------+ +-------------+ | + * | | + * +-------------- promotion ------------------+ + * + * + * Access frequency and refault distance + * + * A workload is thrashing when its pages are frequently used but they + * are evicted from the inactive list every time before another access + * would have promoted them to the active list. + * + * In cases where the average access distance between thrashing pages + * is bigger than the size of memory there is nothing that can be + * done - the thrashing set could never fit into memory under any + * circumstance. + * + * However, the average access distance could be bigger than the + * inactive list, yet smaller than the size of memory. In this case, + * the set could fit into memory if it weren't for the currently + * active pages - which may be used more, hopefully less frequently: + * + * +-memory available to cache-+ + * | | + * +-inactive------+-active----+ + * a b | c d e f g h i | J K L M N | + * +---------------+-----------+ + * + * It is prohibitively expensive to accurately track access frequency + * of pages. But a reasonable approximation can be made to measure + * thrashing on the inactive list, after which refaulting pages can be + * activated optimistically to compete with the existing active pages. + * + * Approximating inactive page access frequency - Observations: + * + * 1. When a page is accessed for the first time, it is added to the + * head of the inactive list, slides every existing inactive page + * towards the tail by one slot, and pushes the current tail page + * out of memory. + * + * 2. When a page is accessed for the second time, it is promoted to + * the active list, shrinking the inactive list by one slot. This + * also slides all inactive pages that were faulted into the cache + * more recently than the activated page towards the tail of the + * inactive list. + * + * Thus: + * + * 1. The sum of evictions and activations between any two points in + * time indicate the minimum number of inactive pages accessed in + * between. + * + * 2. Moving one inactive page N page slots towards the tail of the + * list requires at least N inactive page accesses. + * + * Combining these: + * + * 1. When a page is finally evicted from memory, the number of + * inactive pages accessed while the page was in cache is at least + * the number of page slots on the inactive list. + * + * 2. In addition, measuring the sum of evictions and activations (E) + * at the time of a page's eviction, and comparing it to another + * reading (R) at the time the page faults back into memory tells + * the minimum number of accesses while the page was not cached. + * This is called the refault distance. + * + * Because the first access of the page was the fault and the second + * access the refault, we combine the in-cache distance with the + * out-of-cache distance to get the complete minimum access distance + * of this page: + * + * NR_inactive + (R - E) + * + * And knowing the minimum access distance of a page, we can easily + * tell if the page would be able to stay in cache assuming all page + * slots in the cache were available: + * + * NR_inactive + (R - E) <= NR_inactive + NR_active + * + * which can be further simplified to + * + * (R - E) <= NR_active + * + * Put into words, the refault distance (out-of-cache) can be seen as + * a deficit in inactive list space (in-cache). If the inactive list + * had (R - E) more page slots, the page would not have been evicted + * in between accesses, but activated instead. And on a full system, + * the only thing eating into inactive list space is active pages. + * + * + * Activating refaulting pages + * + * All that is known about the active list is that the pages have been + * accessed more than once in the past. This means that at any given + * time there is actually a good chance that pages on the active list + * are no longer in active use. + * + * So when a refault distance of (R - E) is observed and there are at + * least (R - E) active pages, the refaulting page is activated + * optimistically in the hope that (R - E) active pages are actually + * used less frequently than the refaulting page - or even not used at + * all anymore. + * + * If this is wrong and demotion kicks in, the pages which are truly + * used more frequently will be reactivated while the less frequently + * used once will be evicted from memory. + * + * But if this is right, the stale pages will be pushed out of memory + * and the used pages get to stay in cache. + * + * + * Implementation + * + * For each zone's file LRU lists, a counter for inactive evictions + * and activations is maintained (zone->inactive_age). + * + * On eviction, a snapshot of this counter (along with some bits to + * identify the zone) is stored in the now empty page cache radix tree + * slot of the evicted page. This is called a shadow entry. + * + * On cache misses for which there are shadow entries, an eligible + * refault distance will immediately activate the refaulting page. + */ + +static void *pack_shadow(unsigned long eviction, struct zone *zone) +{ + eviction = (eviction << NODES_SHIFT) | zone_to_nid(zone); + eviction = (eviction << ZONES_SHIFT) | zone_idx(zone); + eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT); + + return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY); +} + +static void unpack_shadow(void *shadow, + struct zone **zone, + unsigned long *distance) +{ + unsigned long entry = (unsigned long)shadow; + unsigned long eviction; + unsigned long refault; + unsigned long mask; + int zid, nid; + + entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT; + zid = entry & ((1UL << ZONES_SHIFT) - 1); + entry >>= ZONES_SHIFT; + nid = entry & ((1UL << NODES_SHIFT) - 1); + entry >>= NODES_SHIFT; + eviction = entry; + + *zone = NODE_DATA(nid)->node_zones + zid; + + refault = atomic_long_read(&(*zone)->inactive_age); + mask = ~0UL >> (NODES_SHIFT + ZONES_SHIFT + + RADIX_TREE_EXCEPTIONAL_SHIFT); + /* + * The unsigned subtraction here gives an accurate distance + * across inactive_age overflows in most cases. + * + * There is a special case: usually, shadow entries have a + * short lifetime and are either refaulted or reclaimed along + * with the inode before they get too old. But it is not + * impossible for the inactive_age to lap a shadow entry in + * the field, which can then can result in a false small + * refault distance, leading to a false activation should this + * old entry actually refault again. However, earlier kernels + * used to deactivate unconditionally with *every* reclaim + * invocation for the longest time, so the occasional + * inappropriate activation leading to pressure on the active + * list is not a problem. + */ + *distance = (refault - eviction) & mask; +} + +/** + * workingset_eviction - note the eviction of a page from memory + * @mapping: address space the page was backing + * @page: the page being evicted + * + * Returns a shadow entry to be stored in @mapping->page_tree in place + * of the evicted @page so that a later refault can be detected. + */ +void *workingset_eviction(struct address_space *mapping, struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long eviction; + + eviction = atomic_long_inc_return(&zone->inactive_age); + return pack_shadow(eviction, zone); +} + +/** + * workingset_refault - evaluate the refault of a previously evicted page + * @shadow: shadow entry of the evicted page + * + * Calculates and evaluates the refault distance of the previously + * evicted page in the context of the zone it was allocated in. + * + * Returns %true if the page should be activated, %false otherwise. + */ +bool workingset_refault(void *shadow) +{ + unsigned long refault_distance; + struct zone *zone; + + unpack_shadow(shadow, &zone, &refault_distance); + inc_zone_state(zone, WORKINGSET_REFAULT); + + if (refault_distance <= zone_page_state(zone, NR_ACTIVE_FILE)) { + inc_zone_state(zone, WORKINGSET_ACTIVATE); + return true; + } + return false; +} + +/** + * workingset_activation - note a page activation + * @page: page that is being activated + */ +void workingset_activation(struct page *page) +{ + atomic_long_inc(&page_zone(page)->inactive_age); +} + +/* + * Shadow entries reflect the share of the working set that does not + * fit into memory, so their number depends on the access pattern of + * the workload. In most cases, they will refault or get reclaimed + * along with the inode, but a (malicious) workload that streams + * through files with a total size several times that of available + * memory, while preventing the inodes from being reclaimed, can + * create excessive amounts of shadow nodes. To keep a lid on this, + * track shadow nodes and reclaim them when they grow way past the + * point where they would still be useful. + */ + +struct list_lru workingset_shadow_nodes; + +static unsigned long count_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long shadow_nodes; + unsigned long max_nodes; + unsigned long pages; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); + local_irq_enable(); + + pages = node_present_pages(sc->nid); + /* + * Active cache pages are limited to 50% of memory, and shadow + * entries that represent a refault distance bigger than that + * do not have any effect. Limit the number of shadow nodes + * such that shadow entries do not exceed the number of active + * cache pages, assuming a worst-case node population density + * of 1/8th on average. + * + * On 64-bit with 7 radix_tree_nodes per page and 64 slots + * each, this will reclaim shadow entries when they consume + * ~2% of available memory: + * + * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + */ + max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + + if (shadow_nodes <= max_nodes) + return 0; + + return shadow_nodes - max_nodes; +} + +static enum lru_status shadow_lru_isolate(struct list_head *item, + spinlock_t *lru_lock, + void *arg) +{ + struct address_space *mapping; + struct radix_tree_node *node; + unsigned int i; + int ret; + + /* + * Page cache insertions and deletions synchroneously maintain + * the shadow node LRU under the mapping->tree_lock and the + * lru_lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the lru_lock pins any + * address_space that has radix tree nodes on the LRU. + * + * We can then safely transition to the mapping->tree_lock to + * pin only the address_space of the particular node we want + * to reclaim, take the node off-LRU, and drop the lru_lock. + */ + + node = container_of(item, struct radix_tree_node, private_list); + mapping = node->private_data; + + /* Coming from the list, invert the lock order */ + if (!spin_trylock(&mapping->tree_lock)) { + spin_unlock(lru_lock); + ret = LRU_RETRY; + goto out; + } + + list_del_init(item); + spin_unlock(lru_lock); + + /* + * The nodes should only contain one or more shadow entries, + * no pages, so we expect to be able to remove them all and + * delete and free the empty node afterwards. + */ + + BUG_ON(!node->count); + BUG_ON(node->count & RADIX_TREE_COUNT_MASK); + + for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[i]) { + BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + node->slots[i] = NULL; + BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); + node->count -= 1U << RADIX_TREE_COUNT_SHIFT; + BUG_ON(!mapping->nrshadows); + mapping->nrshadows--; + } + } + BUG_ON(node->count); + inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM); + if (!__radix_tree_delete_node(&mapping->page_tree, node)) + BUG(); + + spin_unlock(&mapping->tree_lock); + ret = LRU_REMOVED_RETRY; +out: + local_irq_enable(); + cond_resched(); + local_irq_disable(); + spin_lock(lru_lock); + return ret; +} + +static unsigned long scan_shadow_nodes(struct shrinker *shrinker, + struct shrink_control *sc) +{ + unsigned long ret; + + /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ + local_irq_disable(); + ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, + shadow_lru_isolate, NULL, &sc->nr_to_scan); + local_irq_enable(); + return ret; +} + +static struct shrinker workingset_shadow_shrinker = { + .count_objects = count_shadow_nodes, + .scan_objects = scan_shadow_nodes, + .seeks = DEFAULT_SEEKS, + .flags = SHRINKER_NUMA_AWARE, +}; + +/* + * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe + * mapping->tree_lock. + */ +static struct lock_class_key shadow_nodes_key; + +static int __init workingset_init(void) +{ + int ret; + + ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + if (ret) + goto err; + ret = register_shrinker(&workingset_shadow_shrinker); + if (ret) + goto err_list_lru; + return 0; +err_list_lru: + list_lru_destroy(&workingset_shadow_nodes); +err: + return ret; +} +module_init(workingset_init); |