Diffstat (limited to 'mm')
38 files changed, 784 insertions, 806 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 46ef77d5c332..48b1af447fa7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -161,7 +161,6 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG - depends on COMPILE_TEST || !KASAN config MEMORY_HOTPLUG_SPARSE def_bool y diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index da91df50ba31..9075aa54e955 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -24,7 +24,7 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) { unsigned long flags; struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY); + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO); if (!page) return NULL; @@ -59,7 +59,7 @@ const char *cma_get_name(const struct cma *cma) } static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, - int align_order) + unsigned int align_order) { if (align_order <= cma->order_per_bit) return 0; @@ -67,17 +67,14 @@ static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, } /* - * Find a PFN aligned to the specified order and return an offset represented in - * order_per_bits. + * Find the offset of the base PFN from the specified align_order. + * The value returned is represented in order_per_bits. */ static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, - int align_order) + unsigned int align_order) { - if (align_order <= cma->order_per_bit) - return 0; - - return (ALIGN(cma->base_pfn, (1UL << align_order)) - - cma->base_pfn) >> cma->order_per_bit; + return (cma->base_pfn & ((1UL << align_order) - 1)) + >> cma->order_per_bit; } static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, @@ -127,7 +124,7 @@ static int __init cma_activate_area(struct cma *cma) * to be in the same zone. */ if (page_zone(pfn_to_page(pfn)) != zone) - goto err; + goto not_in_zone; } init_cma_reserved_pageblock(pfn_to_page(base_pfn)); } while (--i); @@ -141,7 +138,8 @@ static int __init cma_activate_area(struct cma *cma) return 0; -err: +not_in_zone: + pr_err("CMA area %s could not be activated\n", cma->name); kfree(cma->bitmap); cma->count = 0; return -EINVAL; diff --git a/mm/filemap.c b/mm/filemap.c index 3247b4208034..a49702445ce0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -239,14 +239,16 @@ void __delete_from_page_cache(struct page *page, void *shadow) /* Leave page->index set: truncation lookup relies upon it */ /* hugetlb pages do not participate in page cache accounting. 
*/ - if (!PageHuge(page)) - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + if (PageHuge(page)) + return; + + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); if (PageSwapBacked(page)) { __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); if (PageTransHuge(page)) __dec_node_page_state(page, NR_SHMEM_THPS); } else { - VM_BUG_ON_PAGE(PageTransHuge(page) && !PageHuge(page), page); + VM_BUG_ON_PAGE(PageTransHuge(page), page); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1a88006ec634..bc48ee783dd9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -20,9 +20,9 @@ #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/rmap.h> +#include <linux/string_helpers.h> #include <linux/swap.h> #include <linux/swapops.h> -#include <linux/page-isolation.h> #include <linux/jhash.h> #include <asm/page.h> @@ -872,7 +872,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) struct page *page; list_for_each_entry(page, &h->hugepage_freelists[nid], lru) - if (!is_migrate_isolate_page(page)) + if (!PageHWPoison(page)) break; /* * if 'non-isolated free hugepage' not found on the list, @@ -887,19 +887,39 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) return page; } -static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid, + nodemask_t *nmask) { - struct page *page; - int node; + unsigned int cpuset_mems_cookie; + struct zonelist *zonelist; + struct zone *zone; + struct zoneref *z; + int node = -1; - if (nid != NUMA_NO_NODE) - return dequeue_huge_page_node_exact(h, nid); + zonelist = node_zonelist(nid, gfp_mask); + +retry_cpuset: + cpuset_mems_cookie = read_mems_allowed_begin(); + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) { + struct page *page; + + if (!cpuset_zone_allowed(zone, gfp_mask)) + continue; + /* + * no need to ask again on the same node. 
Pool is node rather than + * zone aware + */ + if (zone_to_nid(zone) == node) + continue; + node = zone_to_nid(zone); - for_each_online_node(node) { page = dequeue_huge_page_node_exact(h, node); if (page) return page; } + if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie))) + goto retry_cpuset; + return NULL; } @@ -917,15 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, unsigned long address, int avoid_reserve, long chg) { - struct page *page = NULL; + struct page *page; struct mempolicy *mpol; - nodemask_t *nodemask; gfp_t gfp_mask; + nodemask_t *nodemask; int nid; - struct zonelist *zonelist; - struct zone *zone; - struct zoneref *z; - unsigned int cpuset_mems_cookie; /* * A child process with MAP_PRIVATE mappings created by their parent @@ -940,32 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) goto err; -retry_cpuset: - cpuset_mems_cookie = read_mems_allowed_begin(); gfp_mask = htlb_alloc_mask(h); nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask); - zonelist = node_zonelist(nid, gfp_mask); - - for_each_zone_zonelist_nodemask(zone, z, zonelist, - MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed(zone, gfp_mask)) { - page = dequeue_huge_page_node(h, zone_to_nid(zone)); - if (page) { - if (avoid_reserve) - break; - if (!vma_has_reserves(vma, chg)) - break; - - SetPagePrivate(page); - h->resv_huge_pages--; - break; - } - } + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask); + if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { + SetPagePrivate(page); + h->resv_huge_pages--; } mpol_cond_put(mpol); - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) - goto retry_cpuset; return page; err: @@ -1385,7 +1384,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) page = __alloc_pages_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_REPEAT|__GFP_NOWARN, + __GFP_RETRY_MAYFAIL|__GFP_NOWARN, huge_page_order(h)); if (page) { prep_new_huge_page(h, page, nid); @@ -1460,7 +1459,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, * number of free hugepages would be reduced below the number of reserved * hugepages. */ -static int dissolve_free_huge_page(struct page *page) +int dissolve_free_huge_page(struct page *page) { int rc = 0; @@ -1473,6 +1472,14 @@ static int dissolve_free_huge_page(struct page *page) rc = -EBUSY; goto out; } + /* + * Move PageHWPoison flag from head page to the raw error page, + * which makes any subpages rather than the error page reusable. + */ + if (PageHWPoison(head) && page != head) { + SetPageHWPoison(page); + ClearPageHWPoison(head); + } list_del(&head->lru); h->free_huge_pages--; h->free_huge_pages_node[nid]--; @@ -1513,82 +1520,19 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -/* - * There are 3 ways this can get called: - * 1. With vma+addr: we use the VMA's memory policy - * 2. With !vma, but nid=NUMA_NO_NODE: We try to allocate a huge - * page from any node, and let the buddy allocator itself figure - * it out. - * 3. With !vma, but nid!=NUMA_NO_NODE. 
We allocate a huge page - * strictly from 'nid' - */ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr, int nid) + gfp_t gfp_mask, int nid, nodemask_t *nmask) { int order = huge_page_order(h); - gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN; - unsigned int cpuset_mems_cookie; - - /* - * We need a VMA to get a memory policy. If we do not - * have one, we use the 'nid' argument. - * - * The mempolicy stuff below has some non-inlined bits - * and calls ->vm_ops. That makes it hard to optimize at - * compile-time, even when NUMA is off and it does - * nothing. This helps the compiler optimize it out. - */ - if (!IS_ENABLED(CONFIG_NUMA) || !vma) { - /* - * If a specific node is requested, make sure to - * get memory from there, but only when a node - * is explicitly specified. - */ - if (nid != NUMA_NO_NODE) - gfp |= __GFP_THISNODE; - /* - * Make sure to call something that can handle - * nid=NUMA_NO_NODE - */ - return alloc_pages_node(nid, gfp, order); - } - - /* - * OK, so we have a VMA. Fetch the mempolicy and try to - * allocate a huge page with it. We will only reach this - * when CONFIG_NUMA=y. - */ - do { - struct page *page; - struct mempolicy *mpol; - int nid; - nodemask_t *nodemask; - - cpuset_mems_cookie = read_mems_allowed_begin(); - nid = huge_node(vma, addr, gfp, &mpol, &nodemask); - mpol_cond_put(mpol); - page = __alloc_pages_nodemask(gfp, order, nid, nodemask); - if (page) - return page; - } while (read_mems_allowed_retry(cpuset_mems_cookie)); - return NULL; + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); } -/* - * There are two ways to allocate a huge page: - * 1. When you have a VMA and an address (like a fault) - * 2. When you have no VMA (like when setting /proc/.../nr_hugepages) - * - * 'vma' and 'addr' are only for (1). 'nid' is always NUMA_NO_NODE in - * this case which signifies that the allocation should be done with - * respect for the VMA's memory policy. - * - * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This - * implies that memory policies will not be taken in to account. - */ -static struct page *__alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long addr, int nid) +static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) { struct page *page; unsigned int r_nid; @@ -1597,15 +1541,6 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, return NULL; /* - * Make sure that anyone specifying 'nid' is not also specifying a VMA. - * This makes sure the caller is picking _one_ of the modes with which - * we can call this function, not both. - */ - if (vma || (addr != -1)) { - VM_WARN_ON_ONCE(addr == -1); - VM_WARN_ON_ONCE(nid != NUMA_NO_NODE); - } - /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed * overcommit @@ -1638,7 +1573,7 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid); + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); spin_lock(&hugetlb_lock); if (page) { @@ -1663,26 +1598,23 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, } /* - * Allocate a huge page from 'nid'. 
Note, 'nid' may be - * NUMA_NO_NODE, which means that it may be allocated - * anywhere. - */ -static -struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid) -{ - unsigned long addr = -1; - - return __alloc_buddy_huge_page(h, NULL, addr, nid); -} - -/* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ static struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { - return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE); + struct page *page; + struct mempolicy *mpol; + gfp_t gfp_mask = htlb_alloc_mask(h); + int nid; + nodemask_t *nodemask; + + nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); + page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); + mpol_cond_put(mpol); + + return page; } /* @@ -1692,19 +1624,46 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { + gfp_t gfp_mask = htlb_alloc_mask(h); struct page *page = NULL; + if (nid != NUMA_NO_NODE) + gfp_mask |= __GFP_THISNODE; + spin_lock(&hugetlb_lock); if (h->free_huge_pages - h->resv_huge_pages > 0) - page = dequeue_huge_page_node(h, nid); + page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL); spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_buddy_huge_page_no_mpol(h, nid); + page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); return page; } + +struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, + nodemask_t *nmask) +{ + gfp_t gfp_mask = htlb_alloc_mask(h); + + spin_lock(&hugetlb_lock); + if (h->free_huge_pages - h->resv_huge_pages > 0) { + struct page *page; + + page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); + if (page) { + spin_unlock(&hugetlb_lock); + return page; + } + } + spin_unlock(&hugetlb_lock); + + /* No reservations, try to overcommit */ + + return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask); +} + /* * Increase the hugetlb pool such that it can accommodate a reservation * of size 'delta'. @@ -1730,12 +1689,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE); + page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), + NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; break; } list_add(&page->lru, &surplus_list); + cond_resched(); } allocated += i; @@ -2204,8 +2165,16 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) } else if (!alloc_fresh_huge_page(h, &node_states[N_MEMORY])) break; + cond_resched(); + } + if (i < h->max_huge_pages) { + char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, i); + h->max_huge_pages = i; } - h->max_huge_pages = i; } static void __init hugetlb_init_hstates(void) @@ -2223,26 +2192,16 @@ static void __init hugetlb_init_hstates(void) VM_BUG_ON(minimum_order == UINT_MAX); } -static char * __init memfmt(char *buf, unsigned long n) -{ - if (n >= (1UL << 30)) - sprintf(buf, "%lu GB", n >> 30); - else if (n >= (1UL << 20)) - sprintf(buf, "%lu MB", n >> 20); - else - sprintf(buf, "%lu KB", n >> 10); - return buf; -} - static void __init report_hugepages(void) { struct hstate *h; for_each_hstate(h) { char buf[32]; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); pr_info("HugeTLB registered %s page size, pre-allocated %ld pages\n", - memfmt(buf, huge_page_size(h)), - h->free_huge_pages); + buf, h->free_huge_pages); } } @@ -2801,6 +2760,11 @@ static int __init hugetlb_init(void) return 0; if (!size_to_hstate(default_hstate_size)) { + if (default_hstate_size != 0) { + pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n", + default_hstate_size, HPAGE_SIZE); + } + default_hstate_size = HPAGE_SIZE; if (!size_to_hstate(default_hstate_size)) hugetlb_add_hstate(HUGETLB_PAGE_ORDER); @@ -4739,40 +4703,6 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); } -#ifdef CONFIG_MEMORY_FAILURE - -/* - * This function is called from memory failure code. - */ -int dequeue_hwpoisoned_huge_page(struct page *hpage) -{ - struct hstate *h = page_hstate(hpage); - int nid = page_to_nid(hpage); - int ret = -EBUSY; - - spin_lock(&hugetlb_lock); - /* - * Just checking !page_huge_active is not enough, because that could be - * an isolated/hwpoisoned hugepage (which have >0 refcount). - */ - if (!page_huge_active(hpage) && !page_count(hpage)) { - /* - * Hwpoisoned hugepage isn't linked to activelist or freelist, - * but dangling hpage->lru can trigger list-debug warnings - * (this happens when we call unpoison_memory() on it), - * so let it point to itself with list_del_init(). - */ - list_del_init(&hpage->lru); - set_page_refcounted(hpage); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - ret = 0; - } - spin_unlock(&hugetlb_lock); - return ret; -} -#endif - bool isolate_huge_page(struct page *page, struct list_head *list) { bool ret = true; diff --git a/mm/internal.h b/mm/internal.h index 0e4f558412fb..24d88f084705 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -23,7 +23,7 @@ * hints such as HIGHMEM usage. */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ - __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ + __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ __GFP_ATOMIC) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index c81549d5c833..ca11bc4ce205 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -134,97 +134,33 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr) return false; } -static __always_inline bool memory_is_poisoned_2(unsigned long addr) +static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr, + unsigned long size) { - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 1)) - return true; - - /* - * If single shadow byte covers 2-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. 
- */ - if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0)) - return false; - - return unlikely(*(u8 *)shadow_addr); - } - - return false; -} - -static __always_inline bool memory_is_poisoned_4(unsigned long addr) -{ - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); + u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr); - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 3)) - return true; - - /* - * If single shadow byte covers 4-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. - */ - if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3)) - return false; - - return unlikely(*(u8 *)shadow_addr); - } - - return false; -} - -static __always_inline bool memory_is_poisoned_8(unsigned long addr) -{ - u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - if (memory_is_poisoned_1(addr + 7)) - return true; - - /* - * If single shadow byte covers 8-byte access, we don't - * need to do anything more. Otherwise, test the first - * shadow byte. - */ - if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) - return false; - - return unlikely(*(u8 *)shadow_addr); - } + /* + * Access crosses 8(shadow size)-byte boundary. Such access maps + * into 2 shadow bytes, so we need to check them both. + */ + if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1)) + return *shadow_addr || memory_is_poisoned_1(addr + size - 1); - return false; + return memory_is_poisoned_1(addr + size - 1); } static __always_inline bool memory_is_poisoned_16(unsigned long addr) { - u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr); - - if (unlikely(*shadow_addr)) { - u16 shadow_first_bytes = *(u16 *)shadow_addr; - - if (unlikely(shadow_first_bytes)) - return true; - - /* - * If two shadow bytes covers 16-byte access, we don't - * need to do anything more. Otherwise, test the last - * shadow byte. - */ - if (likely(IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) - return false; + u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr); - return memory_is_poisoned_1(addr + 15); - } + /* Unaligned 16-bytes access maps into 3 shadow bytes. 
*/ + if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE))) + return *shadow_addr || memory_is_poisoned_1(addr + 15); - return false; + return *shadow_addr; } -static __always_inline unsigned long bytes_is_zero(const u8 *start, +static __always_inline unsigned long bytes_is_nonzero(const u8 *start, size_t size) { while (size) { @@ -237,7 +173,7 @@ static __always_inline unsigned long bytes_is_zero(const u8 *start, return 0; } -static __always_inline unsigned long memory_is_zero(const void *start, +static __always_inline unsigned long memory_is_nonzero(const void *start, const void *end) { unsigned int words; @@ -245,11 +181,11 @@ static __always_inline unsigned long memory_is_zero(const void *start, unsigned int prefix = (unsigned long)start % 8; if (end - start <= 16) - return bytes_is_zero(start, end - start); + return bytes_is_nonzero(start, end - start); if (prefix) { prefix = 8 - prefix; - ret = bytes_is_zero(start, prefix); + ret = bytes_is_nonzero(start, prefix); if (unlikely(ret)) return ret; start += prefix; @@ -258,12 +194,12 @@ static __always_inline unsigned long memory_is_zero(const void *start, words = (end - start) / 8; while (words) { if (unlikely(*(u64 *)start)) - return bytes_is_zero(start, 8); + return bytes_is_nonzero(start, 8); start += 8; words--; } - return bytes_is_zero(start, (end - start) % 8); + return bytes_is_nonzero(start, (end - start) % 8); } static __always_inline bool memory_is_poisoned_n(unsigned long addr, @@ -271,7 +207,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr, { unsigned long ret; - ret = memory_is_zero(kasan_mem_to_shadow((void *)addr), + ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr), kasan_mem_to_shadow((void *)addr + size - 1) + 1); if (unlikely(ret)) { @@ -292,11 +228,9 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) case 1: return memory_is_poisoned_1(addr); case 2: - return memory_is_poisoned_2(addr); case 4: - return memory_is_poisoned_4(addr); case 8: - return memory_is_poisoned_8(addr); + return memory_is_poisoned_2_4_8(addr, size); case 16: return memory_is_poisoned_16(addr); default: @@ -803,17 +737,47 @@ void __asan_unpoison_stack_memory(const void *addr, size_t size) EXPORT_SYMBOL(__asan_unpoison_stack_memory); #ifdef CONFIG_MEMORY_HOTPLUG -static int kasan_mem_notifier(struct notifier_block *nb, +static int __meminit kasan_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { - return (action == MEM_GOING_ONLINE) ? 
NOTIFY_BAD : NOTIFY_OK; + struct memory_notify *mem_data = data; + unsigned long nr_shadow_pages, start_kaddr, shadow_start; + unsigned long shadow_end, shadow_size; + + nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT; + start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn); + shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr); + shadow_size = nr_shadow_pages << PAGE_SHIFT; + shadow_end = shadow_start + shadow_size; + + if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) || + WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT))) + return NOTIFY_BAD; + + switch (action) { + case MEM_GOING_ONLINE: { + void *ret; + + ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start, + shadow_end, GFP_KERNEL, + PAGE_KERNEL, VM_NO_GUARD, + pfn_to_nid(mem_data->start_pfn), + __builtin_return_address(0)); + if (!ret) + return NOTIFY_BAD; + + kmemleak_ignore(ret); + return NOTIFY_OK; + } + case MEM_OFFLINE: + vfree((void *)shadow_start); + } + + return NOTIFY_OK; } static int __init kasan_memhotplug_init(void) { - pr_info("WARNING: KASAN doesn't support memory hot-add\n"); - pr_info("Memory hot-add will be disabled\n"); - hotplug_memory_notifier(kasan_mem_notifier, 0); return 0; diff --git a/mm/kasan/kasan_init.c b/mm/kasan/kasan_init.c index b96a5f773d88..554e4c0f23a2 100644 --- a/mm/kasan/kasan_init.c +++ b/mm/kasan/kasan_init.c @@ -118,6 +118,18 @@ static void __init zero_p4d_populate(pgd_t *pgd, unsigned long addr, do { next = p4d_addr_end(addr, end); + if (IS_ALIGNED(addr, P4D_SIZE) && end - addr >= P4D_SIZE) { + pud_t *pud; + pmd_t *pmd; + + p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud)); + pud = pud_offset(p4d, addr); + pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd)); + pmd = pmd_offset(pud, addr); + pmd_populate_kernel(&init_mm, pmd, + lm_alias(kasan_zero_pte)); + continue; + } if (p4d_none(*p4d)) { p4d_populate(&init_mm, p4d, diff --git a/mm/kasan/report.c b/mm/kasan/report.c index beee0e980e2d..04bb1d3eb9ec 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -107,7 +107,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info) return bug_type; } -const char *get_wild_bug_type(struct kasan_access_info *info) +static const char *get_wild_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index df4ebdb2b10a..c01f177a1120 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -816,7 +816,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) static bool hugepage_vma_check(struct vm_area_struct *vma) { if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) + (vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; if (shmem_file(vma->vm_file)) { if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) diff --git a/mm/list_lru.c b/mm/list_lru.c index 234676e31edd..7a40fa2be858 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -117,6 +117,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item) l = list_lru_from_kmem(nlru, item); list_add_tail(item, &l->list); l->nr_items++; + nlru->nr_items++; spin_unlock(&nlru->lock); return true; } @@ -136,6 +137,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item) l = list_lru_from_kmem(nlru, item); list_del_init(item); l->nr_items--; + nlru->nr_items--; spin_unlock(&nlru->lock); return true; } @@ -183,15 +185,10 @@ EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned 
long list_lru_count_node(struct list_lru *lru, int nid) { - long count = 0; - int memcg_idx; + struct list_lru_node *nlru; - count += __list_lru_count_one(lru, nid, -1); - if (list_lru_memcg_aware(lru)) { - for_each_memcg_cache_index(memcg_idx) - count += __list_lru_count_one(lru, nid, memcg_idx); - } - return count; + nlru = &lru->node[nid]; + return nlru->nr_items; } EXPORT_SYMBOL_GPL(list_lru_count_node); @@ -226,6 +223,7 @@ restart: assert_spin_locked(&nlru->lock); case LRU_REMOVED: isolated++; + nlru->nr_items--; /* * If the lru lock has been dropped, our list * traversal is now invalid and so we have to diff --git a/mm/madvise.c b/mm/madvise.c index 25b78ee4fc2c..9976852f1e1c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -205,7 +205,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, continue; page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, - vma, index); + vma, index, false); if (page) put_page(page); } @@ -246,7 +246,7 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, } swap = radix_to_swp_entry(page); page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE, - NULL, 0); + NULL, 0, false); if (page) put_page(page); } @@ -451,9 +451,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) - return -EINVAL; - /* MADV_FREE works for only anon vma at the moment */ if (!vma_is_anonymous(vma)) return -EINVAL; @@ -477,14 +474,6 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, return 0; } -static long madvise_free(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) -{ - *prev = vma; - return madvise_free_single_vma(vma, start, end); -} - /* * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about @@ -504,9 +493,17 @@ static long madvise_free(struct vm_area_struct *vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). */ -static long madvise_dontneed(struct vm_area_struct *vma, - struct vm_area_struct **prev, - unsigned long start, unsigned long end) +static long madvise_dontneed_single_vma(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + zap_page_range(vma, start, end - start); + return 0; +} + +static long madvise_dontneed_free(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + int behavior) { *prev = vma; if (!can_madv_dontneed_vma(vma)) @@ -526,7 +523,8 @@ static long madvise_dontneed(struct vm_area_struct *vma, * is also < vma->vm_end. If start < * vma->vm_start it means an hole materialized * in the user address space within the - * virtual range passed to MADV_DONTNEED. + * virtual range passed to MADV_DONTNEED + * or MADV_FREE. */ return -ENOMEM; } @@ -537,7 +535,7 @@ static long madvise_dontneed(struct vm_area_struct *vma, * Don't fail if end > vma->vm_end. If the old * vma was splitted while the mmap_sem was * released the effect of the concurrent - * operation may not cause MADV_DONTNEED to + * operation may not cause madvise() to * have an undefined result. There may be an * adjacent next vma that we'll walk * next. 
userfaultfd_remove() will generate an @@ -549,8 +547,13 @@ static long madvise_dontneed(struct vm_area_struct *vma, } VM_WARN_ON(start >= end); } - zap_page_range(vma, start, end - start); - return 0; + + if (behavior == MADV_DONTNEED) + return madvise_dontneed_single_vma(vma, start, end); + else if (behavior == MADV_FREE) + return madvise_free_single_vma(vma, start, end); + else + return -EINVAL; } /* @@ -656,9 +659,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, case MADV_WILLNEED: return madvise_willneed(vma, prev, start, end); case MADV_FREE: - return madvise_free(vma, prev, start, end); case MADV_DONTNEED: - return madvise_dontneed(vma, prev, start, end); + return madvise_dontneed_free(vma, prev, start, end, behavior); default: return madvise_behavior(vma, prev, start, end, behavior); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 425aa0caa712..3df3c04d73ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -631,7 +631,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, val = __this_cpu_read(memcg->stat->nr_page_events); next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ - if ((long)next - (long)val < 0) { + if ((long)(next - val) < 0) { switch (target) { case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; @@ -5317,38 +5317,52 @@ struct cgroup_subsys memory_cgrp_subsys = { /** * mem_cgroup_low - check if memory consumption is below the normal range - * @root: the highest ancestor to consider + * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * Returns %true if memory consumption of @memcg, and that of all - * configurable ancestors up to @root, is below the normal range. + * ancestors up to (but not including) @root, is below the normal range. + * + * @root is exclusive; it is never low when looked at directly and isn't + * checked when traversing the hierarchy. + * + * Excluding @root enables using memory.low to prioritize memory usage + * between cgroups within a subtree of the hierarchy that is limited by + * memory.high or memory.max. + * + * For example, given cgroup A with children B and C: + * + * A + * / \ + * B C + * + * and + * + * 1. A/memory.current > A/memory.high + * 2. A/B/memory.current < A/B/memory.low + * 3. A/C/memory.current >= A/C/memory.low + * + * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we + * should reclaim from 'C' until 'A' is no longer high or until we can + * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by + * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered + * low and we will reclaim indiscriminately from both 'B' and 'C'. */ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return false; - /* - * The toplevel group doesn't have a configurable range, so - * it's never low when looked at directly, and it is not - * considered an ancestor when assessing the hierarchy. 
- */ - - if (memcg == root_mem_cgroup) - return false; - - if (page_counter_read(&memcg->memory) >= memcg->low) + if (!root) + root = root_mem_cgroup; + if (memcg == root) return false; - while (memcg != root) { - memcg = parent_mem_cgroup(memcg); - - if (memcg == root_mem_cgroup) - break; - + for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { if (page_counter_read(&memcg->memory) >= memcg->low) return false; } + return true; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dbe3e50c9aa5..1cd3b3569af8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -49,7 +49,6 @@ #include <linux/swap.h> #include <linux/backing-dev.h> #include <linux/migrate.h> -#include <linux/page-isolation.h> #include <linux/suspend.h> #include <linux/slab.h> #include <linux/swapops.h> @@ -555,6 +554,39 @@ static int delete_from_lru_cache(struct page *p) return -EIO; } +static int truncate_error_page(struct page *p, unsigned long pfn, + struct address_space *mapping) +{ + int ret = MF_FAILED; + + if (mapping->a_ops->error_remove_page) { + int err = mapping->a_ops->error_remove_page(mapping, p); + + if (err != 0) { + pr_info("Memory failure: %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_info("Memory failure: %#lx: failed to release buffers\n", + pfn); + } else { + ret = MF_RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = MF_RECOVERED; + else + pr_info("Memory failure: %#lx: Failed to invalidate\n", + pfn); + } + + return ret; +} + /* * Error hit kernel page. * Do nothing, try to be lucky and not touch this instead. For a few cases we @@ -579,8 +611,6 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { - int err; - int ret = MF_FAILED; struct address_space *mapping; delete_from_lru_cache(p); @@ -612,30 +642,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. */ - if (mapping->a_ops->error_remove_page) { - err = mapping->a_ops->error_remove_page(mapping, p); - if (err != 0) { - pr_info("Memory failure: %#lx: Failed to punch page: %d\n", - pfn, err); - } else if (page_has_private(p) && - !try_to_release_page(p, GFP_NOIO)) { - pr_info("Memory failure: %#lx: failed to release buffers\n", - pfn); - } else { - ret = MF_RECOVERED; - } - } else { - /* - * If the file system doesn't support it just invalidate - * This fails on dirty or anything with private pages - */ - if (invalidate_inode_page(p)) - ret = MF_RECOVERED; - else - pr_info("Memory failure: %#lx: Failed to invalidate\n", - pfn); - } - return ret; + return truncate_error_page(p, pfn, mapping); } /* @@ -741,24 +748,29 @@ static int me_huge_page(struct page *p, unsigned long pfn) { int res = 0; struct page *hpage = compound_head(p); + struct address_space *mapping; if (!PageHuge(hpage)) return MF_DELAYED; - /* - * We can safely recover from error on free or reserved (i.e. - * not in-use) hugepage by dequeuing it from freelist. - * To check whether a hugepage is in-use or not, we can't use - * page->lru because it can be used in other hugepage operations, - * such as __unmap_hugepage_range() and gather_surplus_pages(). - * So instead we use page_mapping() and PageAnon(). 
- */ - if (!(page_mapping(hpage) || PageAnon(hpage))) { - res = dequeue_hwpoisoned_huge_page(hpage); - if (!res) - return MF_RECOVERED; + mapping = page_mapping(hpage); + if (mapping) { + res = truncate_error_page(hpage, pfn, mapping); + } else { + unlock_page(hpage); + /* + * migration entry prevents later access on error anonymous + * hugepage, so we can free and dissolve it into buddy to + * save healthy subpages. + */ + if (PageAnon(hpage)) + put_page(hpage); + dissolve_free_huge_page(p); + res = MF_RECOVERED; + lock_page(hpage); } - return MF_DELAYED; + + return res; } /* @@ -857,7 +869,7 @@ static int page_action(struct page_state *ps, struct page *p, count = page_count(p) - 1; if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; - if (count != 0) { + if (count > 0) { pr_err("Memory failure: %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; @@ -1010,20 +1022,84 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, return unmap_success; } -static void set_page_hwpoison_huge_page(struct page *hpage) +static int identify_page_state(unsigned long pfn, struct page *p, + unsigned long page_flags) { - int i; - int nr_pages = 1 << compound_order(hpage); - for (i = 0; i < nr_pages; i++) - SetPageHWPoison(hpage + i); + struct page_state *ps; + + /* + * The first check uses the current page flags which may not have any + * relevant information. The second check with the saved page flags is + * carried out only if the first check can't determine the page status. + */ + for (ps = error_states;; ps++) + if ((p->flags & ps->mask) == ps->res) + break; + + page_flags |= (p->flags & (1UL << PG_dirty)); + + if (!ps->mask) + for (ps = error_states;; ps++) + if ((page_flags & ps->mask) == ps->res) + break; + return page_action(ps, p, pfn); } -static void clear_page_hwpoison_huge_page(struct page *hpage) +static int memory_failure_hugetlb(unsigned long pfn, int trapno, int flags) { - int i; - int nr_pages = 1 << compound_order(hpage); - for (i = 0; i < nr_pages; i++) - ClearPageHWPoison(hpage + i); + struct page *p = pfn_to_page(pfn); + struct page *head = compound_head(p); + int res; + unsigned long page_flags; + + if (TestSetPageHWPoison(head)) { + pr_err("Memory failure: %#lx: already hardware poisoned\n", + pfn); + return 0; + } + + num_poisoned_pages_inc(); + + if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { + /* + * Check "filter hit" and "race with other subpage." 
+ */ + lock_page(head); + if (PageHWPoison(head)) { + if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != head && TestSetPageHWPoison(head))) { + num_poisoned_pages_dec(); + unlock_page(head); + return 0; + } + } + unlock_page(head); + dissolve_free_huge_page(p); + action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED); + return 0; + } + + lock_page(head); + page_flags = head->flags; + + if (!PageHWPoison(head)) { + pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); + num_poisoned_pages_dec(); + unlock_page(head); + put_hwpoison_page(head); + return 0; + } + + if (!hwpoison_user_mappings(p, pfn, trapno, flags, &head)) { + action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); + res = -EBUSY; + goto out; + } + + res = identify_page_state(pfn, p, page_flags); +out: + unlock_page(head); + return res; } /** @@ -1046,12 +1122,10 @@ static void clear_page_hwpoison_huge_page(struct page *hpage) */ int memory_failure(unsigned long pfn, int trapno, int flags) { - struct page_state *ps; struct page *p; struct page *hpage; struct page *orig_head; int res; - unsigned int nr_pages; unsigned long page_flags; if (!sysctl_memory_failure_recovery) @@ -1064,34 +1138,22 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } p = pfn_to_page(pfn); - orig_head = hpage = compound_head(p); + if (PageHuge(p)) + return memory_failure_hugetlb(pfn, trapno, flags); if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); return 0; } - /* - * Currently errors on hugetlbfs pages are measured in hugepage units, - * so nr_pages should be 1 << compound_order. OTOH when errors are on - * transparent hugepages, they are supposed to be split and error - * measurement is done in normal page units. So nr_pages should be one - * in this case. - */ - if (PageHuge(p)) - nr_pages = 1 << compound_order(hpage); - else /* normal page or thp */ - nr_pages = 1; - num_poisoned_pages_add(nr_pages); + orig_head = hpage = compound_head(p); + num_poisoned_pages_inc(); /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's a free hugepage, which is also safe: - * an affected hugepage will be dequeued from hugepage freelist, - * so there's no concern about reusing it ever after. - * 3) it's part of a non-compound high order page. + * 2) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -1102,32 +1164,13 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (is_free_buddy_page(p)) { action_result(pfn, MF_MSG_BUDDY, MF_DELAYED); return 0; - } else if (PageHuge(hpage)) { - /* - * Check "filter hit" and "race with other subpage." - */ - lock_page(hpage); - if (PageHWPoison(hpage)) { - if ((hwpoison_filter(p) && TestClearPageHWPoison(p)) - || (p != hpage && TestSetPageHWPoison(hpage))) { - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - return 0; - } - } - set_page_hwpoison_huge_page(hpage); - res = dequeue_hwpoisoned_huge_page(hpage); - action_result(pfn, MF_MSG_FREE_HUGE, - res ? 
MF_IGNORED : MF_DELAYED); - unlock_page(hpage); - return res; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); return -EBUSY; } } - if (!PageHuge(p) && PageTransHuge(hpage)) { + if (PageTransHuge(hpage)) { lock_page(p); if (!PageAnon(p) || unlikely(split_huge_page(p))) { unlock_page(p); @@ -1138,7 +1181,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) pr_err("Memory failure: %#lx: thp split failed\n", pfn); if (TestClearPageHWPoison(p)) - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); put_hwpoison_page(p); return -EBUSY; } @@ -1165,7 +1208,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - lock_page(hpage); + lock_page(p); /* * The page could have changed compound pages during the locking. @@ -1194,42 +1237,23 @@ int memory_failure(unsigned long pfn, int trapno, int flags) */ if (!PageHWPoison(p)) { pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - put_hwpoison_page(hpage); + num_poisoned_pages_dec(); + unlock_page(p); + put_hwpoison_page(p); return 0; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) - num_poisoned_pages_sub(nr_pages); - unlock_page(hpage); - put_hwpoison_page(hpage); + num_poisoned_pages_dec(); + unlock_page(p); + put_hwpoison_page(p); return 0; } - if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + if (!PageTransTail(p) && !PageLRU(p)) goto identify_page_state; /* - * For error on the tail page, we should set PG_hwpoison - * on the head page to show that the hugepage is hwpoisoned - */ - if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { - action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED); - unlock_page(hpage); - put_hwpoison_page(hpage); - return 0; - } - /* - * Set PG_hwpoison on all pages in an error hugepage, - * because containment is done in hugepage unit for now. - * Since we have done TestSetPageHWPoison() for the head page with - * page lock held, we can safely set PG_hwpoison bits on tail pages. - */ - if (PageHuge(p)) - set_page_hwpoison_huge_page(hpage); - - /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ @@ -1258,25 +1282,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } identify_page_state: - res = -EBUSY; - /* - * The first check uses the current page flags which may not have any - * relevant information. The second check with the saved page flagss is - * carried out only if the first check can't determine the page status. - */ - for (ps = error_states;; ps++) - if ((p->flags & ps->mask) == ps->res) - break; - - page_flags |= (p->flags & (1UL << PG_dirty)); - - if (!ps->mask) - for (ps = error_states;; ps++) - if ((page_flags & ps->mask) == ps->res) - break; - res = page_action(ps, p, pfn); + res = identify_page_state(pfn, p, page_flags); out: - unlock_page(hpage); + unlock_page(p); return res; } EXPORT_SYMBOL_GPL(memory_failure); @@ -1398,7 +1406,6 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int freeit = 0; - unsigned int nr_pages; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1443,20 +1450,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); - if (!get_hwpoison_page(p)) { - /* - * Since HWPoisoned hugepage should have non-zero refcount, - * race between memory failure and unpoison seems to happen. 
- * In such case unpoison fails and memory failure runs - * to the end. - */ - if (PageHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n", - pfn, &unpoison_rs); - return 0; - } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", @@ -1474,10 +1468,8 @@ int unpoison_memory(unsigned long pfn) if (TestClearPageHWPoison(page)) { unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", pfn, &unpoison_rs); - num_poisoned_pages_sub(nr_pages); + num_poisoned_pages_dec(); freeit = 1; - if (PageHuge(page)) - clear_page_hwpoison_huge_page(page); } unlock_page(page); @@ -1492,16 +1484,8 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - if (PageHuge(p)) { - struct hstate *hstate = page_hstate(compound_head(p)); - - if (hstate_is_gigantic(hstate)) - return alloc_huge_page_node(hstate, NUMA_NO_NODE); - return alloc_huge_page_node(hstate, nid); - } else { - return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0); - } + return new_page_nodemask(p, nid, &node_states[N_MEMORY]); } /* @@ -1608,15 +1592,8 @@ static int soft_offline_huge_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { - /* overcommit hugetlb page will be freed to buddy */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - SetPageHWPoison(page); - num_poisoned_pages_inc(); - } + if (PageHuge(page)) + dissolve_free_huge_page(page); } return ret; } @@ -1732,15 +1709,12 @@ static int soft_offline_in_use_page(struct page *page, int flags) static void soft_offline_free_page(struct page *page) { - if (PageHuge(page)) { - struct page *hpage = compound_head(page); + struct page *head = compound_head(page); - set_page_hwpoison_huge_page(hpage); - if (!dequeue_hwpoisoned_huge_page(hpage)) - num_poisoned_pages_add(1 << compound_order(hpage)); - } else { - if (!TestSetPageHWPoison(page)) - num_poisoned_pages_inc(); + if (!TestSetPageHWPoison(head)) { + num_poisoned_pages_inc(); + if (PageHuge(head)) + dissolve_free_huge_page(page); } } diff --git a/mm/memory.c b/mm/memory.c index e31dd97e6114..0e517be91a89 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3262,14 +3262,14 @@ static int fault_around_bytes_set(void *data, u64 val) fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, +DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { void *ret; - ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); if (!ret) pr_warn("Failed to create fault_around_bytes in debugfs"); @@ -3591,7 +3591,7 @@ out: return 0; } -static int create_huge_pmd(struct vm_fault *vmf) +static inline int create_huge_pmd(struct vm_fault *vmf) { if (vma_is_anonymous(vmf->vma)) return do_huge_pmd_anonymous_page(vmf); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f79aac7a12b5..8dccc317aac2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -52,32 +52,17 @@ static void generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; static DEFINE_MUTEX(online_page_callback_lock); 
-/* The same as the cpu_hotplug lock, but for memory hotplug. */ -static struct { - struct task_struct *active_writer; - struct mutex lock; /* Synchronizes accesses to refcount, */ - /* - * Also blocks the new readers during - * an ongoing mem hotplug operation. - */ - int refcount; +DEFINE_STATIC_PERCPU_RWSEM(mem_hotplug_lock); -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} mem_hotplug = { - .active_writer = NULL, - .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), - .refcount = 0, -#ifdef CONFIG_DEBUG_LOCK_ALLOC - .dep_map = {.name = "mem_hotplug.lock" }, -#endif -}; +void get_online_mems(void) +{ + percpu_down_read(&mem_hotplug_lock); +} -/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ -#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) -#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) -#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +void put_online_mems(void) +{ + percpu_up_read(&mem_hotplug_lock); +} bool movable_node_enabled = false; @@ -99,60 +84,16 @@ static int __init setup_memhp_default_state(char *str) } __setup("memhp_default_state=", setup_memhp_default_state); -void get_online_mems(void) -{ - might_sleep(); - if (mem_hotplug.active_writer == current) - return; - memhp_lock_acquire_read(); - mutex_lock(&mem_hotplug.lock); - mem_hotplug.refcount++; - mutex_unlock(&mem_hotplug.lock); - -} - -void put_online_mems(void) -{ - if (mem_hotplug.active_writer == current) - return; - mutex_lock(&mem_hotplug.lock); - - if (WARN_ON(!mem_hotplug.refcount)) - mem_hotplug.refcount++; /* try to fix things up */ - - if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) - wake_up_process(mem_hotplug.active_writer); - mutex_unlock(&mem_hotplug.lock); - memhp_lock_release(); - -} - -/* Serializes write accesses to mem_hotplug.active_writer. 
*/ -static DEFINE_MUTEX(memory_add_remove_lock); - void mem_hotplug_begin(void) { - mutex_lock(&memory_add_remove_lock); - - mem_hotplug.active_writer = current; - - memhp_lock_acquire(); - for (;;) { - mutex_lock(&mem_hotplug.lock); - if (likely(!mem_hotplug.refcount)) - break; - __set_current_state(TASK_UNINTERRUPTIBLE); - mutex_unlock(&mem_hotplug.lock); - schedule(); - } + cpus_read_lock(); + percpu_down_write(&mem_hotplug_lock); } void mem_hotplug_done(void) { - mem_hotplug.active_writer = NULL; - mutex_unlock(&mem_hotplug.lock); - memhp_lock_release(); - mutex_unlock(&memory_add_remove_lock); + percpu_up_write(&mem_hotplug_lock); + cpus_read_unlock(); } /* add this memory to iomem resource */ @@ -580,11 +521,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) { struct pglist_data *pgdat = zone->zone_pgdat; int nr_pages = PAGES_PER_SECTION; - int zone_type; unsigned long flags; - zone_type = zone - pgdat->node_zones; - pgdat_resize_lock(zone->zone_pgdat, &flags); shrink_zone_span(zone, start_pfn, start_pfn + nr_pages); shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages); @@ -934,6 +872,19 @@ struct zone *default_zone_for_pfn(int nid, unsigned long start_pfn, return &pgdat->node_zones[ZONE_NORMAL]; } +static inline bool movable_pfn_range(int nid, struct zone *default_zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + if (!allow_online_pfn_range(nid, start_pfn, nr_pages, + MMOP_ONLINE_KERNEL)) + return true; + + if (!movable_node_is_enabled()) + return false; + + return !zone_intersects(default_zone, start_pfn, nr_pages); +} + /* * Associates the given pfn range with the given node and the zone appropriate * for the given online type. @@ -949,10 +900,10 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, /* * MMOP_ONLINE_KEEP defaults to MMOP_ONLINE_KERNEL but use * movable zone if that is not possible (e.g. we are within - * or past the existing movable zone) + * or past the existing movable zone). movable_node overrides + * this default and defaults to movable zone */ - if (!allow_online_pfn_range(nid, start_pfn, nr_pages, - MMOP_ONLINE_KERNEL)) + if (movable_pfn_range(nid, zone, start_pfn, nr_pages)) zone = movable_zone; } else if (online_type == MMOP_ONLINE_MOVABLE) { zone = &pgdat->node_zones[ZONE_MOVABLE]; @@ -1268,7 +1219,7 @@ register_fail: error: /* rollback pgdat allocation and others */ - if (new_pgdat) + if (new_pgdat && pgdat) rollback_node_hotadd(nid, pgdat); memblock_remove(start, size); @@ -1420,32 +1371,19 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) static struct page *new_node_page(struct page *page, unsigned long private, int **result) { - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; int nid = page_to_nid(page); nodemask_t nmask = node_states[N_MEMORY]; - struct page *new_page = NULL; /* - * TODO: allocate a destination hugepage from a nearest neighbor node, - * accordance with memory policy of the user process if possible. For - * now as a simple work-around, we use the next node for destination. + * try to allocate from a different node but reuse this node if there + * are no other online nodes to be used (e.g. 
we are offlining a part + * of the only existing node) */ - if (PageHuge(page)) - return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node_in(nid, nmask)); - node_clear(nid, nmask); + if (nodes_empty(nmask)) + node_set(nid, nmask); - if (PageHighMem(page) - || (zone_idx(page_zone(page)) == ZONE_MOVABLE)) - gfp_mask |= __GFP_HIGHMEM; - - if (!nodes_empty(nmask)) - new_page = __alloc_pages_nodemask(gfp_mask, 0, nid, &nmask); - if (!new_page) - new_page = __alloc_pages(gfp_mask, 0, nid); - - return new_page; + return new_page_nodemask(page, nid, &nmask); } #define NR_OFFLINE_AT_ONCE_PAGES (256) @@ -1728,7 +1666,7 @@ repeat: goto failed_removal; ret = 0; if (drain) { - lru_add_drain_all(); + lru_add_drain_all_cpuslocked(); cond_resched(); drain_all_pages(zone); } @@ -1749,7 +1687,7 @@ repeat: } } /* drain all zone's lru pagevec, this is asynchronous... */ - lru_add_drain_all(); + lru_add_drain_all_cpuslocked(); yield(); /* drain pcp pages, this is synchronous. */ drain_all_pages(zone); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7d8e56214ac0..d911fa5cb2a7 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1078,7 +1078,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) /* * if !vma, alloc_page_vma() will use task or system default policy */ - return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL, + vma, address); } #else diff --git a/mm/migrate.c b/mm/migrate.c index 051cc1555d36..627671551873 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1252,6 +1252,8 @@ put_anon: out: if (rc != -EAGAIN) putback_active_hugepage(hpage); + if (reason == MR_MEMORY_FAILURE && !test_set_page_hwpoison(hpage)) + num_poisoned_pages_inc(); /* * If migration was not successful and there's a freeing callback, use @@ -1914,7 +1916,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_cache(page); unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; - pmd_t orig_entry; /* * Rate-limit the amount of data that is being migrated to a node. 
@@ -1957,8 +1958,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Recheck the target PMD */ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ptl = pmd_lock(mm, pmd); - if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) { -fail_putback: + if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) { spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1980,7 +1980,6 @@ fail_putback: goto out_unlock; } - orig_entry = *pmd; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -1997,15 +1996,7 @@ fail_putback: set_pmd_at(mm, mmun_start, pmd, entry); update_mmu_cache_pmd(vma, address, &entry); - if (page_count(page) != 2) { - set_pmd_at(mm, mmun_start, pmd, orig_entry); - flush_pmd_tlb_range(vma, mmun_start, mmun_end); - mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); - update_mmu_cache_pmd(vma, address, &entry); - page_remove_rmap(new_page, true); - goto fail_putback; - } - + page_ref_unfreeze(page, 2); mlock_migrate_page(new_page, page); page_remove_rmap(page, true); set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); diff --git a/mm/mmap.c b/mm/mmap.c index 7f8cfe9d9b4d..f19efcf75418 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2177,7 +2177,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; - struct rlimit *rlim = current->signal->rlim; unsigned long new_start; /* address space limit tests */ @@ -2185,7 +2184,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, return -ENOMEM; /* Stack limit test */ - if (size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) + if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ @@ -2193,7 +2192,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long locked; unsigned long limit; locked = mm->locked_vm + grow; - limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); + limit = rlimit(RLIMIT_MEMLOCK); limit >>= PAGE_SHIFT; if (locked > limit && !capable(CAP_IPC_LOCK)) return -ENOMEM; @@ -2232,7 +2231,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; - if (address >= TASK_SIZE) + if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; @@ -2244,7 +2243,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr) { + if (next && next->vm_start < gap_addr && + (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2315,7 +2315,6 @@ int expand_downwards(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; - unsigned long gap_addr; int error; address &= PAGE_MASK; @@ -2324,14 +2323,12 @@ int expand_downwards(struct vm_area_struct *vma, return error; /* Enforce stack_guard_gap */ - gap_addr = address - stack_guard_gap; - if (gap_addr > address) - return -ENOMEM; prev = vma->vm_prev; - if (prev && prev->vm_end > gap_addr) { - if (!(prev->vm_flags & VM_GROWSDOWN)) + /* Check that both stack segments have the same anon_vma? 
*/ + if (prev && !(prev->vm_flags & VM_GROWSDOWN) && + (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + if (address - prev->vm_end < stack_guard_gap) return -ENOMEM; - /* Check that both stack segments have the same anon_vma? */ } /* We must make sure the anon_vma is allocated. */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0e2c925e7826..9e8b4f030c1c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -490,6 +490,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) if (!down_read_trylock(&mm->mmap_sem)) { ret = false; + trace_skip_task_reaping(tsk->pid); goto unlock_oom; } @@ -500,9 +501,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) */ if (!mmget_not_zero(mm)) { up_read(&mm->mmap_sem); + trace_skip_task_reaping(tsk->pid); goto unlock_oom; } + trace_start_task_reaping(tsk->pid); + /* * Tell all users of get_user/copy_from_user etc... that the content * is no longer stable. No barriers really needed because unmapping @@ -544,6 +548,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * put the oom_reaper out of the way. */ mmput_async(mm); + trace_finish_task_reaping(tsk->pid); unlock_oom: mutex_unlock(&oom_lock); return ret; @@ -615,6 +620,7 @@ static void wake_oom_reaper(struct task_struct *tsk) tsk->oom_reaper_list = oom_reaper_list; oom_reaper_list = tsk; spin_unlock(&oom_reaper_lock); + trace_wake_reaper(tsk->pid); wake_up(&oom_reaper_wait); } @@ -666,6 +672,7 @@ static void mark_oom_victim(struct task_struct *tsk) */ __thaw_task(tsk); atomic_inc(&oom_victims); + trace_mark_victim(tsk->pid); } /** diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0b60cc7ddac2..96e93b214d31 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -601,7 +601,7 @@ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { struct wb_domain *cgdom; - __inc_wb_stat(wb, WB_WRITTEN); + inc_wb_stat(wb, WB_WRITTEN); wb_domain_writeout_inc(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac); @@ -2435,8 +2435,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) __inc_lruvec_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING); __inc_node_page_state(page, NR_DIRTIED); - __inc_wb_stat(wb, WB_RECLAIMABLE); - __inc_wb_stat(wb, WB_DIRTIED); + inc_wb_stat(wb, WB_RECLAIMABLE); + inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2741,7 +2741,7 @@ int test_clear_page_writeback(struct page *page) if (bdi_cap_account_writeback(bdi)) { struct bdi_writeback *wb = inode_to_wb(inode); - __dec_wb_stat(wb, WB_WRITEBACK); + dec_wb_stat(wb, WB_WRITEBACK); __wb_writeout_inc(wb); } } @@ -2786,7 +2786,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) - __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); + inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK); /* * We can come through here when swapping anonymous diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd65b60939b6..6d30e914afb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2206,19 +2206,26 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * list of requested migratetype, possibly along with other pages from the same * block, depending on fragmentation avoidance heuristics. Returns true if * fallback was found so that __rmqueue_smallest() can grab it. 
+ * + * The use of signed ints for order and current_order is a deliberate + * deviation from the rest of this file, to make the for loop + * condition simpler. */ static inline bool -__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area *area; - unsigned int current_order; + int current_order; struct page *page; int fallback_mt; bool can_steal; - /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; - current_order >= order && current_order <= MAX_ORDER-1; + /* + * Find the largest available free page in the other list. This roughly + * approximates finding the pageblock with the most free pages, which + * would be too costly to do exactly. + */ + for (current_order = MAX_ORDER - 1; current_order >= order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2226,19 +2233,50 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) if (fallback_mt == -1) continue; - page = list_first_entry(&area->free_list[fallback_mt], - struct page, lru); + /* + * We cannot steal all free pages from the pageblock and the + * requested migratetype is movable. In that case it's better to + * steal and split the smallest available page instead of the + * largest available page, because even if the next movable + * allocation falls back into a different pageblock than this + * one, it won't cause permanent fragmentation. + */ + if (!can_steal && start_migratetype == MIGRATE_MOVABLE + && current_order > order) + goto find_smallest; - steal_suitable_fallback(zone, page, start_migratetype, - can_steal); + goto do_steal; + } - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, fallback_mt); + return false; - return true; +find_smallest: + for (current_order = order; current_order < MAX_ORDER; + current_order++) { + area = &(zone->free_area[current_order]); + fallback_mt = find_suitable_fallback(area, current_order, + start_migratetype, false, &can_steal); + if (fallback_mt != -1) + break; } - return false; + /* + * This should not happen - we already found a suitable fallback + * when looking for the largest page. + */ + VM_BUG_ON(current_order == MAX_ORDER); + +do_steal: + page = list_first_entry(&area->free_list[fallback_mt], + struct page, lru); + + steal_suitable_fallback(zone, page, start_migratetype, can_steal); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, fallback_mt); + + return true; + } /* @@ -3246,6 +3284,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer will not help higher order allocs */ if (order > PAGE_ALLOC_COSTLY_ORDER) goto out; + /* + * We have already exhausted all our reclaim opportunities without any + * success so it is time to admit defeat. We will skip the OOM killer + * because it is very likely that the caller has a more reasonable + * fallback than shooting a random task. 
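A hypothetical caller of the kind this comment has in mind (names below are illustrative, not from the kernel tree) tries hard for contiguous pages but keeps its own fallback instead of letting the OOM killer fire:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *alloc_big_buffer(size_t size)
{
        struct page *page;

        /* retry hard, but fail rather than trigger the OOM killer */
        page = alloc_pages(GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN,
                           get_order(size));
        if (page)
                return page_address(page);

        /* contiguous memory is tight: the caller's "more reasonable fallback" */
        return vzalloc(size);
}

Freeing such a buffer must remember which branch allocated it; that bookkeeping is essentially what kvmalloc()/kvfree() provide, and the kvmalloc_node() hunk in mm/util.c further down wires __GFP_RETRY_MAYFAIL into exactly that helper.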
+ */ + if (gfp_mask & __GFP_RETRY_MAYFAIL) + goto out; /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; @@ -3375,7 +3421,7 @@ should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, } /* - * !costly requests are much more important than __GFP_REPEAT + * !costly requests are much more important than __GFP_RETRY_MAYFAIL * costly ones because they are de facto nofail and invoke OOM * killer to move on while costly can fail and users are ready * to cope with that. 1/4 retries is rather arbitrary but we @@ -3882,9 +3928,9 @@ retry: /* * Do not retry costly high order allocations unless they are - * __GFP_REPEAT + * __GFP_RETRY_MAYFAIL */ - if (costly_order && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -5240,7 +5286,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) #endif /* we have to stop all cpus to guarantee there is no user of zonelist */ - stop_machine(__build_all_zonelists, pgdat, NULL); + stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); diff --git a/mm/page_io.c b/mm/page_io.c index 2da71e627812..b6c4ac388209 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -117,6 +117,7 @@ static void swap_slot_free_notify(struct page *page) static void end_swap_bio_read(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; + struct task_struct *waiter = bio->bi_private; if (bio->bi_status) { SetPageError(page); @@ -132,7 +133,9 @@ static void end_swap_bio_read(struct bio *bio) swap_slot_free_notify(page); out: unlock_page(page); + WRITE_ONCE(bio->bi_private, NULL); bio_put(bio); + wake_up_process(waiter); } int generic_swapfile_activate(struct swap_info_struct *sis, @@ -329,11 +332,13 @@ out: return ret; } -int swap_readpage(struct page *page) +int swap_readpage(struct page *page, bool do_poll) { struct bio *bio; int ret = 0; struct swap_info_struct *sis = page_swap_info(page); + blk_qc_t qc; + struct block_device *bdev; VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -372,9 +377,23 @@ int swap_readpage(struct page *page) ret = -ENOMEM; goto out; } + bdev = bio->bi_bdev; + bio->bi_private = current; bio_set_op_attrs(bio, REQ_OP_READ, 0); count_vm_event(PSWPIN); - submit_bio(bio); + bio_get(bio); + qc = submit_bio(bio); + while (do_poll) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio->bi_private)) + break; + + if (!blk_mq_poll(bdev_get_queue(bdev), qc)) + break; + } + __set_current_state(TASK_RUNNING); + bio_put(bio); + out: return ret; } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 3606104893e0..757410d9f758 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,6 +8,7 @@ #include <linux/memory.h> #include <linux/hugetlb.h> #include <linux/page_owner.h> +#include <linux/migrate.h> #include "internal.h" #define CREATE_TRACE_POINTS @@ -294,20 +295,5 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, struct page *alloc_migrate_target(struct page *page, unsigned long private, int **resultp) { - gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; - - /* - * TODO: allocate a destination hugepage from a nearest neighbor node, - * accordance with memory policy of the user process if possible. For - * now as a simple work-around, we use the next node for destination. 
- */ - if (PageHuge(page)) - return alloc_huge_page_node(page_hstate(compound_head(page)), - next_node_in(page_to_nid(page), - node_online_map)); - - if (PageHighMem(page)) - gfp_mask |= __GFP_HIGHMEM; - - return alloc_page(gfp_mask); + return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]); } diff --git a/mm/page_owner.c b/mm/page_owner.c index 60634dc53a88..0fd9dcf2c5dc 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -281,7 +281,11 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; if (PageBuddy(page)) { - pfn += (1UL << page_order(page)) - 1; + unsigned long freepage_order; + + freepage_order = page_order_unsafe(page); + if (freepage_order < MAX_ORDER) + pfn += (1UL << freepage_order) - 1; continue; } diff --git a/mm/shmem.c b/mm/shmem.c index 9418f5a9bc46..b0aa6075d164 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1977,10 +1977,12 @@ static int shmem_fault(struct vm_fault *vmf) } sgp = SGP_CACHE; - if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - else if (vma->vm_flags & VM_NOHUGEPAGE) + + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) sgp = SGP_NOHUGE; + else if (vma->vm_flags & VM_HUGEPAGE) + sgp = SGP_HUGE; error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a56c3989f773..c50b1a14d55e 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -56,11 +56,11 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (node_state(node, N_HIGH_MEMORY)) page = alloc_pages_node( - node, GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + node, GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, get_order(size)); else page = alloc_pages( - GFP_KERNEL | __GFP_ZERO | __GFP_REPEAT, + GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, get_order(size)); if (page) return page_address(page); diff --git a/mm/swap.c b/mm/swap.c index 4f44dbd7f780..60b1d2a75852 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -688,7 +688,7 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -void lru_add_drain_all(void) +void lru_add_drain_all_cpuslocked(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; @@ -702,7 +702,6 @@ void lru_add_drain_all(void) return; mutex_lock(&lock); - get_online_cpus(); cpumask_clear(&has_work); for_each_online_cpu(cpu) { @@ -722,10 +721,16 @@ void lru_add_drain_all(void) for_each_cpu(cpu, &has_work) flush_work(&per_cpu(lru_add_drain_work, cpu)); - put_online_cpus(); mutex_unlock(&lock); } +void lru_add_drain_all(void) +{ + get_online_cpus(); + lru_add_drain_all_cpuslocked(); + put_online_cpus(); +} + /** * release_pages - batched put_page() * @pages: array of pages to release diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 90c1032a8ac3..13a174006b91 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -273,11 +273,11 @@ int free_swap_slot(swp_entry_t entry) { struct swap_slots_cache *cache; - cache = &get_cpu_var(swp_slots); + cache = raw_cpu_ptr(&swp_slots); if (use_swap_slot_cache && cache->slots_ret) { spin_lock_irq(&cache->free_lock); /* Swap slots cache may be deactivated before acquiring lock */ - if (!use_swap_slot_cache) { + if (!use_swap_slot_cache || !cache->slots_ret) { spin_unlock_irq(&cache->free_lock); goto direct_free; } @@ -297,7 +297,6 @@ int free_swap_slot(swp_entry_t entry) direct_free: swapcache_free_entries(&entry, 1); } - put_cpu_var(swp_slots); return 0; } diff --git a/mm/swap_state.c 
b/mm/swap_state.c index 9c71b6b2562f..b68c93014f50 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -412,14 +412,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * the swap entry is no longer in use. */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr) + struct vm_area_struct *vma, unsigned long addr, bool do_poll) { bool page_was_allocated; struct page *retpage = __read_swap_cache_async(entry, gfp_mask, vma, addr, &page_was_allocated); if (page_was_allocated) - swap_readpage(retpage); + swap_readpage(retpage, do_poll); return retpage; } @@ -496,11 +496,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long start_offset, end_offset; unsigned long mask; struct blk_plug plug; + bool do_poll = true; mask = swapin_nr_pages(offset) - 1; if (!mask) goto skip; + do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -511,7 +513,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = read_swap_cache_async(swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr); + gfp_mask, vma, addr, false); if (!page) continue; if (offset != entry_offset && likely(!PageTransCompound(page))) @@ -522,7 +524,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: - return read_swap_cache_async(entry, gfp_mask, vma, addr); + return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) diff --git a/mm/swapfile.c b/mm/swapfile.c index 811d90e1c929..6ba4aab2db0b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1868,7 +1868,7 @@ int try_to_unuse(unsigned int type, bool frontswap, swap_map = &si->swap_map[i]; entry = swp_entry(type, i); page = read_swap_cache_async(entry, - GFP_HIGHUSER_MOVABLE, NULL, 0); + GFP_HIGHUSER_MOVABLE, NULL, 0, false); if (!page) { /* * Either swap_duplicate() failed because entry diff --git a/mm/truncate.c b/mm/truncate.c index 6479ed2afc53..2330223841fb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -530,9 +530,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } else if (PageTransHuge(page)) { index += HPAGE_PMD_NR - 1; i += HPAGE_PMD_NR - 1; - /* 'end' is in the middle of THP */ - if (index == round_down(end, HPAGE_PMD_NR)) + /* + * 'end' is in the middle of THP. Don't + * invalidate the page as the part outside of + * 'end' could be still useful. + */ + if (index > end) { + unlock_page(page); continue; + } } ret = invalidate_inode_page(page); diff --git a/mm/util.c b/mm/util.c index 26be6407abd7..7b07ec852e01 100644 --- a/mm/util.c +++ b/mm/util.c @@ -83,6 +83,8 @@ EXPORT_SYMBOL(kstrdup_const); * @s: the string to duplicate * @max: read at most @max chars from @s * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Note: Use kmemdup_nul() instead if the size is known exactly. 
*/ char *kstrndup(const char *s, size_t max, gfp_t gfp) { @@ -121,6 +123,28 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) EXPORT_SYMBOL(kmemdup); /** + * kmemdup_nul - Create a NUL-terminated string from unterminated data + * @s: The data to stringify + * @len: The size of the data + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + */ +char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) +{ + char *buf; + + if (!s) + return NULL; + + buf = kmalloc_track_caller(len + 1, gfp); + if (buf) { + memcpy(buf, s, len); + buf[len] = '\0'; + } + return buf; +} +EXPORT_SYMBOL(kmemdup_nul); + +/** * memdup_user - duplicate memory region from user space * * @src: source address in user space @@ -339,9 +363,9 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT - * is supported only for large (>32kB) allocations, and it should be used only if - * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks. + * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is + * preferable to the vmalloc fallback, due to visible performance drawbacks. * * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. */ @@ -366,13 +390,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (size > PAGE_SIZE) { kmalloc_flags |= __GFP_NOWARN; - /* - * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly - * requests because there is no other way to tell the allocator - * that we want to fail rather than retry endlessly. - */ - if (!(kmalloc_flags & __GFP_REPEAT) || - (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6211a807cb31..8698c1c86c4d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -325,6 +325,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ +#define VM_LAZY_FREE 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); @@ -1497,6 +1498,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_lock(&vmap_area_lock); va->vm = NULL; va->flags &= ~VM_VM_AREA; + va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); @@ -1793,7 +1795,7 @@ fail: * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. * - * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL * and __GFP_NOFAIL are not supported * * Any use of gfp flags outside of GFP_KERNEL should be consulted @@ -2704,8 +2706,14 @@ static int s_show(struct seq_file *m, void *p) * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. */ - if (!(va->flags & VM_VM_AREA)) + if (!(va->flags & VM_VM_AREA)) { + seq_printf(m, "0x%pK-0x%pK %7ld %s\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start, + va->flags & VM_LAZY_FREE ? 
"unpurged vm_area" : "vm_map_ram"); + return 0; + } v = va->vm; diff --git a/mm/vmpressure.c b/mm/vmpressure.c index ce0618bfa8d0..85350ce2d25d 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -93,12 +93,25 @@ enum vmpressure_levels { VMPRESSURE_NUM_LEVELS, }; +enum vmpressure_modes { + VMPRESSURE_NO_PASSTHROUGH = 0, + VMPRESSURE_HIERARCHY, + VMPRESSURE_LOCAL, + VMPRESSURE_NUM_MODES, +}; + static const char * const vmpressure_str_levels[] = { [VMPRESSURE_LOW] = "low", [VMPRESSURE_MEDIUM] = "medium", [VMPRESSURE_CRITICAL] = "critical", }; +static const char * const vmpressure_str_modes[] = { + [VMPRESSURE_NO_PASSTHROUGH] = "default", + [VMPRESSURE_HIERARCHY] = "hierarchy", + [VMPRESSURE_LOCAL] = "local", +}; + static enum vmpressure_levels vmpressure_level(unsigned long pressure) { if (pressure >= vmpressure_level_critical) @@ -141,27 +154,31 @@ out: struct vmpressure_event { struct eventfd_ctx *efd; enum vmpressure_levels level; + enum vmpressure_modes mode; struct list_head node; }; static bool vmpressure_event(struct vmpressure *vmpr, - enum vmpressure_levels level) + const enum vmpressure_levels level, + bool ancestor, bool signalled) { struct vmpressure_event *ev; - bool signalled = false; + bool ret = false; mutex_lock(&vmpr->events_lock); - list_for_each_entry(ev, &vmpr->events, node) { - if (level >= ev->level) { - eventfd_signal(ev->efd, 1); - signalled = true; - } + if (ancestor && ev->mode == VMPRESSURE_LOCAL) + continue; + if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH) + continue; + if (level < ev->level) + continue; + eventfd_signal(ev->efd, 1); + ret = true; } - mutex_unlock(&vmpr->events_lock); - return signalled; + return ret; } static void vmpressure_work_fn(struct work_struct *work) @@ -170,6 +187,8 @@ static void vmpressure_work_fn(struct work_struct *work) unsigned long scanned; unsigned long reclaimed; enum vmpressure_levels level; + bool ancestor = false; + bool signalled = false; spin_lock(&vmpr->sr_lock); /* @@ -194,12 +213,9 @@ static void vmpressure_work_fn(struct work_struct *work) level = vmpressure_calc_level(scanned, reclaimed); do { - if (vmpressure_event(vmpr, level)) - break; - /* - * If not handled, propagate the event upward into the - * hierarchy. 
- */ + if (vmpressure_event(vmpr, level, ancestor, signalled)) + signalled = true; + ancestor = true; } while ((vmpr = vmpressure_parent(vmpr))); } @@ -326,17 +342,40 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) vmpressure(gfp, memcg, true, vmpressure_win, 0); } +static enum vmpressure_levels str_to_level(const char *arg) +{ + enum vmpressure_levels level; + + for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) + if (!strcmp(vmpressure_str_levels[level], arg)) + return level; + return -1; +} + +static enum vmpressure_modes str_to_mode(const char *arg) +{ + enum vmpressure_modes mode; + + for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) + if (!strcmp(vmpressure_str_modes[mode], arg)) + return mode; + return -1; +} + +#define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) + /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd * @memcg: memcg that is interested in vmpressure notifications * @eventfd: eventfd context to link notifications with - * @args: event arguments (used to set up a pressure level threshold) + * @args: event arguments (pressure level threshold, optional mode) * * This function associates eventfd context with the vmpressure * infrastructure, so that the notifications will be delivered to the - * @eventfd. The @args parameter is a string that denotes pressure level - * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or - * "critical"). + * @eventfd. The @args parameter is a comma-delimited string that denotes a + * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium", + * or "critical") and an optional mode (one of vmpressure_str_modes, i.e. + * "hierarchy" or "local"). * * To be used as memcg event method. */ @@ -345,28 +384,53 @@ int vmpressure_register_event(struct mem_cgroup *memcg, { struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; - int level; + enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; + enum vmpressure_levels level = -1; + char *spec, *spec_orig; + char *token; + int ret = 0; + + spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); + if (!spec) { + ret = -ENOMEM; + goto out; + } + strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN); - for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { - if (!strcmp(vmpressure_str_levels[level], args)) - break; + /* Find required level */ + token = strsep(&spec, ","); + level = str_to_level(token); + if (level == -1) { + ret = -EINVAL; + goto out; } - if (level >= VMPRESSURE_NUM_LEVELS) - return -EINVAL; + /* Find optional mode */ + token = strsep(&spec, ","); + if (token) { + mode = str_to_mode(token); + if (mode == -1) { + ret = -EINVAL; + goto out; + } + } ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) - return -ENOMEM; + if (!ev) { + ret = -ENOMEM; + goto out; + } ev->efd = eventfd; ev->level = level; + ev->mode = mode; mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); - - return 0; +out: + kfree(spec_orig); + return ret; } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 9e95fafc026b..a1af041930a6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2228,8 +2228,17 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, } if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { - scan_balance = SCAN_ANON; - goto out; + /* + * Force SCAN_ANON if there are enough inactive + * anonymous pages on the LRU in eligible zones. + * Otherwise, the small LRU gets thrashed. 
+ */ + if (!inactive_list_is_low(lruvec, false, memcg, sc, false) && + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx) + >> sc->priority) { + scan_balance = SCAN_ANON; + goto out; + } } } @@ -2497,18 +2506,18 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; /* Consider stopping depending on scan and reclaim activity */ - if (sc->gfp_mask & __GFP_REPEAT) { + if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) { /* - * For __GFP_REPEAT allocations, stop reclaiming if the + * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the * full LRU list has been scanned and we are still failing * to reclaim pages. This full LRU scan is potentially - * expensive but a __GFP_REPEAT caller really wants to succeed + * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed */ if (!nr_reclaimed && !nr_scanned) return false; } else { /* - * For non-__GFP_REPEAT allocations which can presumably + * For non-__GFP_RETRY_MAYFAIL allocations which can presumably * fail without consequence, stop if we failed to reclaim * any pages from the last SWAP_CLUSTER_MAX number of * pages that were scanned. This will return to the diff --git a/mm/vmstat.c b/mm/vmstat.c index 744ceaeb42a0..9a4441bbeef2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1130,7 +1130,7 @@ static void frag_stop(struct seq_file *m, void *arg) * If @assert_populated is true, only use callback for zones that are populated. */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, - bool assert_populated, + bool assert_populated, bool nolock, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -1141,9 +1141,11 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, if (assert_populated && !populated_zone(zone)) continue; - spin_lock_irqsave(&zone->lock, flags); + if (!nolock) + spin_lock_irqsave(&zone->lock, flags); print(m, pgdat, zone); - spin_unlock_irqrestore(&zone->lock, flags); + if (!nolock) + spin_unlock_irqrestore(&zone->lock, flags); } } #endif @@ -1166,7 +1168,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, true, frag_show_print); + walk_zones_in_node(m, pgdat, true, false, frag_show_print); return 0; } @@ -1207,7 +1209,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, true, false, pagetypeinfo_showfree_print); return 0; } @@ -1258,7 +1260,8 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, true, false, + pagetypeinfo_showblockcount_print); return 0; } @@ -1284,7 +1287,8 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, true, + pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1446,7 +1450,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t 
*)arg; - walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, false, zoneinfo_show_print); return 0; } @@ -1852,7 +1856,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, true, unusable_show_print); + walk_zones_in_node(m, pgdat, true, false, unusable_show_print); return 0; } @@ -1904,7 +1908,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, true, extfrag_show_print); + walk_zones_in_node(m, pgdat, true, false, extfrag_show_print); return 0; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d41edd28298b..013eea76685e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -116,6 +116,11 @@ #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) +#define FULLNESS_BITS 2 +#define CLASS_BITS 8 +#define ISOLATED_BITS 3 +#define MAGIC_VAL_BITS 8 + #define MAX(a, b) ((a) >= (b) ? (a) : (b)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ @@ -137,6 +142,8 @@ * (reason above) */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) +#define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \ + ZS_SIZE_CLASS_DELTA) + 1) enum fullness_group { ZS_EMPTY, @@ -169,11 +176,6 @@ static struct vfsmount *zsmalloc_mnt; #endif /* - * number of size_classes - */ -static int zs_size_classes; - -/* * We assign a page to ZS_ALMOST_EMPTY fullness group when: * n <= N / f, where * n = number of allocated objects @@ -244,7 +246,7 @@ struct link_free { struct zs_pool { const char *name; - struct size_class **size_class; + struct size_class *size_class[ZS_SIZE_CLASSES]; struct kmem_cache *handle_cachep; struct kmem_cache *zspage_cachep; @@ -268,11 +270,6 @@ struct zs_pool { #endif }; -#define FULLNESS_BITS 2 -#define CLASS_BITS 8 -#define ISOLATED_BITS 3 -#define MAGIC_VAL_BITS 8 - struct zspage { struct { unsigned int fullness:FULLNESS_BITS; @@ -469,7 +466,7 @@ static bool is_zspage_isolated(struct zspage *zspage) return zspage->isolated; } -static int is_first_page(struct page *page) +static __maybe_unused int is_first_page(struct page *page) { return PagePrivate(page); } @@ -551,7 +548,7 @@ static int get_size_class_index(int size) idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA); - return min(zs_size_classes - 1, idx); + return min_t(int, ZS_SIZE_CLASSES - 1, idx); } static inline void zs_stat_inc(struct size_class *class, @@ -610,7 +607,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v) "obj_allocated", "obj_used", "pages_used", "pages_per_zspage", "freeable"); - for (i = 0; i < zs_size_classes; i++) { + for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = pool->size_class[i]; if (class->index != i) @@ -1294,17 +1291,6 @@ static int zs_cpu_dead(unsigned int cpu) return 0; } -static void __init init_zs_size_classes(void) -{ - int nr; - - nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1; - if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA) - nr += 1; - - zs_size_classes = nr; -} - static bool can_merge(struct size_class *prev, int pages_per_zspage, int objs_per_zspage) { @@ -2145,7 +2131,7 @@ static void async_free_zspage(struct work_struct *work) struct zs_pool *pool = container_of(work, struct zs_pool, free_work); - for (i = 0; i < zs_size_classes; i++) { + for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = 
pool->size_class[i]; if (class->index != i) continue; @@ -2263,7 +2249,7 @@ unsigned long zs_compact(struct zs_pool *pool) int i; struct size_class *class; - for (i = zs_size_classes - 1; i >= 0; i--) { + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (!class) continue; @@ -2309,7 +2295,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, struct zs_pool *pool = container_of(shrinker, struct zs_pool, shrinker); - for (i = zs_size_classes - 1; i >= 0; i--) { + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (!class) continue; @@ -2361,12 +2347,6 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); - pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), - GFP_KERNEL); - if (!pool->size_class) { - kfree(pool); - return NULL; - } pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) @@ -2379,7 +2359,7 @@ struct zs_pool *zs_create_pool(const char *name) * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size. */ - for (i = zs_size_classes - 1; i >= 0; i--) { + for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { int size; int pages_per_zspage; int objs_per_zspage; @@ -2453,7 +2433,7 @@ void zs_destroy_pool(struct zs_pool *pool) zs_unregister_migration(pool); zs_pool_stat_destroy(pool); - for (i = 0; i < zs_size_classes; i++) { + for (i = 0; i < ZS_SIZE_CLASSES; i++) { int fg; struct size_class *class = pool->size_class[i]; @@ -2492,8 +2472,6 @@ static int __init zs_init(void) if (ret) goto hp_setup_fail; - init_zs_size_classes(); - #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif
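The zsmalloc hunks above fold the removed init_zs_size_classes() computation into the compile-time ZS_SIZE_CLASSES constant. A quick stand-alone check (plain user-space C, not kernel code) that DIV_ROUND_UP(range, delta) + 1 yields the same count as the removed arithmetic:

#include <assert.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

/* the arithmetic removed from init_zs_size_classes() */
static int old_nr_classes(int range, int delta)
{
        int nr = range / delta + 1;

        if (range % delta)
                nr += 1;
        return nr;
}

int main(void)
{
        for (int delta = 1; delta <= 256; delta++)
                for (int range = 0; range <= 1 << 16; range++)
                        assert(old_nr_classes(range, delta) ==
                               DIV_ROUND_UP(range, delta) + 1);
        return 0;
}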