diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 5 | ||||
-rw-r--r-- | mm/memremap.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 8 | ||||
-rw-r--r-- | mm/shmem.c | 6 | ||||
-rw-r--r-- | mm/shuffle.c | 2 | ||||
-rw-r--r-- | mm/slab_common.c | 19 | ||||
-rw-r--r-- | mm/slob.c | 62 | ||||
-rw-r--r-- | mm/slub.c | 14 | ||||
-rw-r--r-- | mm/sparse.c | 2 | ||||
-rw-r--r-- | mm/vmpressure.c | 20 | ||||
-rw-r--r-- | mm/vmscan.c | 72 | ||||
-rw-r--r-- | mm/z3fold.c | 10 |
13 files changed, 181 insertions, 45 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d9daa3e422d0..c360f6a6c844 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -239,8 +239,8 @@ static int __init default_bdi_init(void) { int err; - bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | - WQ_UNBOUND | WQ_SYSFS, 0); + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | + WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c313c49074ca..bdac56009a38 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1567,6 +1567,11 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) return max; } +unsigned long mem_cgroup_size(struct mem_cgroup *memcg) +{ + return page_counter_read(&memcg->memory); +} + static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { diff --git a/mm/memremap.c b/mm/memremap.c index 32c79b51af86..68204912cc0a 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -13,8 +13,6 @@ #include <linux/xarray.h> static DEFINE_XARRAY(pgmap_array); -#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) -#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 15c2050c629b..c0b2e0306720 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1175,11 +1175,17 @@ static __always_inline bool free_pages_prepare(struct page *page, debug_check_no_obj_freed(page_address(page), PAGE_SIZE << order); } - arch_free_page(page, order); if (want_init_on_free()) kernel_init_free_pages(page, 1 << order); kernel_poison_pages(page, 1 << order, 0); + /* + * arch_free_page() can make the page's contents inaccessible. s390 + * does this. So nothing which can access the page's contents should + * happen after this. + */ + arch_free_page(page, order); + if (debug_pagealloc_enabled()) kernel_map_pages(page, 1 << order, 0); diff --git a/mm/shmem.c b/mm/shmem.c index cd570cc79c76..220be9fa2c41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3482,6 +3482,12 @@ static int shmem_parse_options(struct fs_context *fc, void *data) { char *options = data; + if (options) { + int err = security_sb_eat_lsm_opts(options, &fc->security); + if (err) + return err; + } + while (options != NULL) { char *this_char = options; for (;;) { diff --git a/mm/shuffle.c b/mm/shuffle.c index 3ce12481b1dc..b3fe97fd6654 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -33,7 +33,7 @@ __meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl) } static bool shuffle_param; -extern int shuffle_show(char *buffer, const struct kernel_param *kp) +static int shuffle_show(char *buffer, const struct kernel_param *kp) { return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state) ? 'Y' : 'N'); diff --git a/mm/slab_common.c b/mm/slab_common.c index 6491c3a41805..c29f03adca91 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1030,10 +1030,19 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, unsigned int useroffset, unsigned int usersize) { int err; + unsigned int align = ARCH_KMALLOC_MINALIGN; s->name = name; s->size = s->object_size = size; - s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); + + /* + * For power of two sizes, guarantee natural alignment for kmalloc + * caches, regardless of SL*B debugging options. + */ + if (is_power_of_2(size)) + align = max(align, size); + s->align = calculate_alignment(flags, align, size); + s->useroffset = useroffset; s->usersize = usersize; @@ -1287,12 +1296,16 @@ void __init create_kmalloc_caches(slab_flags_t flags) */ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) { - void *ret; + void *ret = NULL; struct page *page; flags |= __GFP_COMP; page = alloc_pages(flags, order); - ret = page ? page_address(page) : NULL; + if (likely(page)) { + ret = page_address(page); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, + 1 << order); + } ret = kasan_kmalloc_large(ret, size, flags); /* As ret might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ret, size, 1, flags); diff --git a/mm/slob.c b/mm/slob.c index cf377beab962..fa53e9f73893 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -190,7 +190,7 @@ static int slob_last(slob_t *s) static void *slob_new_pages(gfp_t gfp, int order, int node) { - void *page; + struct page *page; #ifdef CONFIG_NUMA if (node != NUMA_NO_NODE) @@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) if (!page) return NULL; + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, + 1 << order); return page_address(page); } static void slob_free_pages(void *b, int order) { + struct page *sp = virt_to_page(b); + if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; - free_pages((unsigned long)b, order); + + mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE, + -(1 << order)); + __free_pages(sp, order); } /* @@ -217,6 +224,7 @@ static void slob_free_pages(void *b, int order) * @sp: Page to look in. * @size: Size of the allocation. * @align: Allocation alignment. + * @align_offset: Offset in the allocated block that will be aligned. * @page_removed_from_list: Return parameter. * * Tries to find a chunk of memory at least @size bytes big within @page. @@ -227,7 +235,7 @@ static void slob_free_pages(void *b, int order) * true (set to false otherwise). */ static void *slob_page_alloc(struct page *sp, size_t size, int align, - bool *page_removed_from_list) + int align_offset, bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); @@ -236,8 +244,17 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); + /* + * 'aligned' will hold the address of the slob block so that the + * address 'aligned'+'align_offset' is aligned according to the + * 'align' parameter. This is for kmalloc() which prepends the + * allocated block with its size, so that the block itself is + * aligned when needed. + */ if (align) { - aligned = (slob_t *)ALIGN((unsigned long)cur, align); + aligned = (slob_t *) + (ALIGN((unsigned long)cur + align_offset, align) + - align_offset); delta = aligned - cur; } if (avail >= units + delta) { /* room enough? */ @@ -281,7 +298,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align, /* * slob_alloc: entry point into the slob allocator. */ -static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) +static void *slob_alloc(size_t size, gfp_t gfp, int align, int node, + int align_offset) { struct page *sp; struct list_head *slob_list; @@ -312,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (sp->units < SLOB_UNITS(size)) continue; - b = slob_page_alloc(sp, size, align, &page_removed_from_list); + b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list); if (!b) continue; @@ -349,7 +367,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) INIT_LIST_HEAD(&sp->slab_list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align, &_unused); + b = slob_page_alloc(sp, size, align, align_offset, &_unused); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } @@ -451,7 +469,7 @@ static __always_inline void * __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) { unsigned int *m; - int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); + int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; gfp &= gfp_allowed_mask; @@ -459,19 +477,28 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller) fs_reclaim_acquire(gfp); fs_reclaim_release(gfp); - if (size < PAGE_SIZE - align) { + if (size < PAGE_SIZE - minalign) { + int align = minalign; + + /* + * For power of two sizes, guarantee natural alignment for + * kmalloc()'d objects. + */ + if (is_power_of_2(size)) + align = max(minalign, (int) size); + if (!size) return ZERO_SIZE_PTR; - m = slob_alloc(size + align, gfp, align, node); + m = slob_alloc(size + minalign, gfp, align, node, minalign); if (!m) return NULL; *m = size; - ret = (void *)m + align; + ret = (void *)m + minalign; trace_kmalloc_node(caller, ret, - size, size + align, gfp, node); + size, size + minalign, gfp, node); } else { unsigned int order = get_order(size); @@ -521,8 +548,13 @@ void kfree(const void *block) int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); unsigned int *m = (unsigned int *)(block - align); slob_free(m, *m + align); - } else - __free_pages(sp, compound_order(sp)); + } else { + unsigned int order = compound_order(sp); + mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE, + -(1 << order)); + __free_pages(sp, order); + + } } EXPORT_SYMBOL(kfree); @@ -567,7 +599,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) fs_reclaim_release(flags); if (c->size < PAGE_SIZE) { - b = slob_alloc(c->size, flags, c->align, node); + b = slob_alloc(c->size, flags, c->align, node, 0); trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size, SLOB_UNITS(c->size) * SLOB_UNIT, flags, node); diff --git a/mm/slub.c b/mm/slub.c index 42c1b3af3c98..3d63ae320d31 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3821,11 +3821,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; void *ptr = NULL; + unsigned int order = get_order(size); flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, get_order(size)); - if (page) + page = alloc_pages_node(node, flags, order); + if (page) { ptr = page_address(page); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, + 1 << order); + } return kmalloc_large_node_hook(ptr, size, flags); } @@ -3951,9 +3955,13 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { + unsigned int order = compound_order(page); + BUG_ON(!PageCompound(page)); kfree_hook(object); - __free_pages(page, compound_order(page)); + mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE, + -(1 << order)); + __free_pages(page, order); return; } slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); diff --git a/mm/sparse.c b/mm/sparse.c index bf32de9e666b..f6891c1992b1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -219,7 +219,7 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } -void subsection_mask_set(unsigned long *map, unsigned long pfn, +static void subsection_mask_set(unsigned long *map, unsigned long pfn, unsigned long nr_pages) { int idx = subsection_map_index(pfn); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index f3b50811497a..4bac22fe1aa2 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -355,6 +355,9 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * "hierarchy" or "local"). * * To be used as memcg event method. + * + * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could + * not be parsed. */ int vmpressure_register_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd, const char *args) @@ -362,7 +365,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg, struct vmpressure *vmpr = memcg_to_vmpressure(memcg); struct vmpressure_event *ev; enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH; - enum vmpressure_levels level = -1; + enum vmpressure_levels level; char *spec, *spec_orig; char *token; int ret = 0; @@ -375,20 +378,18 @@ int vmpressure_register_event(struct mem_cgroup *memcg, /* Find required level */ token = strsep(&spec, ","); - level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); - if (level < 0) { - ret = level; + ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); + if (ret < 0) goto out; - } + level = ret; /* Find optional mode */ token = strsep(&spec, ","); if (token) { - mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); - if (mode < 0) { - ret = mode; + ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); + if (ret < 0) goto out; - } + mode = ret; } ev = kzalloc(sizeof(*ev), GFP_KERNEL); @@ -404,6 +405,7 @@ int vmpressure_register_event(struct mem_cgroup *memcg, mutex_lock(&vmpr->events_lock); list_add(&ev->node, &vmpr->events); mutex_unlock(&vmpr->events_lock); + ret = 0; out: kfree(spec_orig); return ret; diff --git a/mm/vmscan.c b/mm/vmscan.c index e5d52d6a24af..c6659bb758a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2459,17 +2459,70 @@ out: *lru_pages = 0; for_each_evictable_lru(lru) { int file = is_file_lru(lru); - unsigned long size; + unsigned long lruvec_size; unsigned long scan; + unsigned long protection; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + protection = mem_cgroup_protection(memcg, + sc->memcg_low_reclaim); + + if (protection) { + /* + * Scale a cgroup's reclaim pressure by proportioning + * its current usage to its memory.low or memory.min + * setting. + * + * This is important, as otherwise scanning aggression + * becomes extremely binary -- from nothing as we + * approach the memory protection threshold, to totally + * nominal as we exceed it. This results in requiring + * setting extremely liberal protection thresholds. It + * also means we simply get no protection at all if we + * set it too low, which is not ideal. + * + * If there is any protection in place, we reduce scan + * pressure by how much of the total memory used is + * within protection thresholds. + * + * There is one special case: in the first reclaim pass, + * we skip over all groups that are within their low + * protection. If that fails to reclaim enough pages to + * satisfy the reclaim goal, we come back and override + * the best-effort low protection. However, we still + * ideally want to honor how well-behaved groups are in + * that case instead of simply punishing them all + * equally. As such, we reclaim them based on how much + * memory they are using, reducing the scan pressure + * again by how much of the total memory used is under + * hard protection. + */ + unsigned long cgroup_size = mem_cgroup_size(memcg); + + /* Avoid TOCTOU with earlier protection check */ + cgroup_size = max(cgroup_size, protection); + + scan = lruvec_size - lruvec_size * protection / + cgroup_size; + + /* + * Minimally target SWAP_CLUSTER_MAX pages to keep + * reclaim moving forwards, avoiding decremeting + * sc->priority further than desirable. + */ + scan = max(scan, SWAP_CLUSTER_MAX); + } else { + scan = lruvec_size; + } + + scan >>= sc->priority; - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; /* * If the cgroup's already been deleted, make sure to * scrape out the remaining cache. */ if (!scan && !mem_cgroup_online(memcg)) - scan = min(size, SWAP_CLUSTER_MAX); + scan = min(lruvec_size, SWAP_CLUSTER_MAX); switch (scan_balance) { case SCAN_EQUAL: @@ -2489,7 +2542,7 @@ out: case SCAN_ANON: /* Scan one type exclusively */ if ((scan_balance == SCAN_FILE) != file) { - size = 0; + lruvec_size = 0; scan = 0; } break; @@ -2498,7 +2551,7 @@ out: BUG(); } - *lru_pages += size; + *lru_pages += lruvec_size; nr[lru] = scan; } } @@ -2742,6 +2795,13 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); break; case MEMCG_PROT_NONE: + /* + * All protection thresholds breached. We may + * still choose to vary the scan pressure + * applied based on by how much the cgroup in + * question has exceeded its protection + * thresholds (see get_scan_count). + */ break; } diff --git a/mm/z3fold.c b/mm/z3fold.c index 05bdf90646e7..6d3d3f698ebb 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -998,9 +998,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) struct z3fold_header *zhdr; struct page *page; enum buddy bud; + bool page_claimed; zhdr = handle_to_z3fold_header(handle); page = virt_to_page(zhdr); + page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private); if (test_bit(PAGE_HEADLESS, &page->private)) { /* if a headless page is under reclaim, just leave. @@ -1008,7 +1010,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) * has not been set before, we release this page * immediately so we don't care about its value any more. */ - if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) { + if (!page_claimed) { spin_lock(&pool->lock); list_del(&page->lru); spin_unlock(&pool->lock); @@ -1044,13 +1046,15 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) atomic64_dec(&pool->pages_nr); return; } - if (test_bit(PAGE_CLAIMED, &page->private)) { + if (page_claimed) { + /* the page has not been claimed by us */ z3fold_page_unlock(zhdr); return; } if (unlikely(PageIsolated(page)) || test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); + clear_bit(PAGE_CLAIMED, &page->private); return; } if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) { @@ -1060,10 +1064,12 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) zhdr->cpu = -1; kref_get(&zhdr->refcount); do_compact_page(zhdr, true); + clear_bit(PAGE_CLAIMED, &page->private); return; } kref_get(&zhdr->refcount); queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work); + clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } |