diff options
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r-- | mm/vmalloc.c | 192 |
1 file changed, 134 insertions, 58 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4a7d7459c4f9..4d3b3d60d893 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -331,6 +331,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn); static DEFINE_SPINLOCK(vmap_area_lock); +static DEFINE_SPINLOCK(free_vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); static LLIST_HEAD(vmap_purge_list); @@ -682,7 +683,7 @@ insert_vmap_area_augment(struct vmap_area *va, * free area is inserted. If VA has been merged, it is * freed. */ -static __always_inline void +static __always_inline struct vmap_area * merge_or_add_vmap_area(struct vmap_area *va, struct rb_root *root, struct list_head *head) { @@ -749,7 +750,10 @@ merge_or_add_vmap_area(struct vmap_area *va, /* Free vmap_area object. */ kmem_cache_free(vmap_area_cachep, va); - return; + + /* Point to the new merged area. */ + va = sibling; + merged = true; } } @@ -758,6 +762,8 @@ insert: link_va(va, root, parent, link, head); augment_tree_propagate_from(va); } + + return va; } static __always_inline bool @@ -968,6 +974,19 @@ adjust_va_to_fit_type(struct vmap_area *va, * There are a few exceptions though, as an example it is * a first allocation (early boot up) when we have "one" * big free space that has to be split. + * + * Also we can hit this path in case of regular "vmap" + * allocations, if "this" current CPU was not preloaded. + * See the comment in alloc_vmap_area() why. If so, then + * GFP_NOWAIT is used instead to get an extra object for + * split purpose. That is rare and most time does not + * occur. + * + * What happens if an allocation gets failed. Basically, + * an "overflow" path is triggered to purge lazily freed + * areas to free some memory, then, the "retry" path is + * triggered to repeat one more time. See more details + * in alloc_vmap_area() function. 
*/ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT); if (!lva) @@ -1063,9 +1082,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, return ERR_PTR(-EBUSY); might_sleep(); + gfp_mask = gfp_mask & GFP_RECLAIM_MASK; - va = kmem_cache_alloc_node(vmap_area_cachep, - gfp_mask & GFP_RECLAIM_MASK, node); + va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); if (unlikely(!va)) return ERR_PTR(-ENOMEM); @@ -1073,49 +1092,55 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, * Only scan the relevant parts containing pointers to other objects * to avoid false negatives. */ - kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask); retry: /* - * Preload this CPU with one extra vmap_area object to ensure - * that we have it available when fit type of free area is - * NE_FIT_TYPE. + * Preload this CPU with one extra vmap_area object. It is used + * when fit type of free area is NE_FIT_TYPE. Please note, it + * does not guarantee that an allocation occurs on a CPU that + * is preloaded, instead we minimize the case when it is not. + * It can happen because of cpu migration, because there is a + * race until the below spinlock is taken. * * The preload is done in non-atomic context, thus it allows us * to use more permissive allocation masks to be more stable under - * low memory condition and high memory pressure. + * low memory condition and high memory pressure. In rare case, + * if not preloaded, GFP_NOWAIT is used. * - * Even if it fails we do not really care about that. Just proceed - * as it is. "overflow" path will refill the cache we allocate from. + * Set "pva" to NULL here, because of "retry" path. 
*/ - preempt_disable(); - if (!__this_cpu_read(ne_fit_preload_node)) { - preempt_enable(); - pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node); - preempt_disable(); - - if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) { - if (pva) - kmem_cache_free(vmap_area_cachep, pva); - } - } + pva = NULL; - spin_lock(&vmap_area_lock); - preempt_enable(); + if (!this_cpu_read(ne_fit_preload_node)) + /* + * Even if it fails we do not really care about that. + * Just proceed as it is. If needed "overflow" path + * will refill the cache we allocate from. + */ + pva = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node); + + spin_lock(&free_vmap_area_lock); + + if (pva && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) + kmem_cache_free(vmap_area_cachep, pva); /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. */ addr = __alloc_vmap_area(size, align, vstart, vend); + spin_unlock(&free_vmap_area_lock); + if (unlikely(addr == vend)) goto overflow; va->va_start = addr; va->va_end = addr + size; va->vm = NULL; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + spin_lock(&vmap_area_lock); + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); BUG_ON(!IS_ALIGNED(va->va_start, align)); @@ -1125,7 +1150,6 @@ retry: return va; overflow: - spin_unlock(&vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); purged = 1; @@ -1161,28 +1185,24 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); -static void __free_vmap_area(struct vmap_area *va) +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) { /* * Remove from the busy tree/list. */ + spin_lock(&vmap_area_lock); unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); /* - * Merge VA with its neighbors, otherwise just add it. + * Insert/Merge it back to the free tree/list. 
*/ - merge_or_add_vmap_area(va, - &free_vmap_area_root, &free_vmap_area_list); -} - -/* - * Free a region of KVA allocated by alloc_vmap_area - */ -static void free_vmap_area(struct vmap_area *va) -{ - spin_lock(&vmap_area_lock); - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); + spin_lock(&free_vmap_area_lock); + merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); + spin_unlock(&free_vmap_area_lock); } /* @@ -1275,24 +1295,30 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) flush_tlb_kernel_range(start, end); resched_threshold = lazy_max_pages() << 1; - spin_lock(&vmap_area_lock); + spin_lock(&free_vmap_area_lock); llist_for_each_entry_safe(va, n_va, valist, purge_list) { unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + unsigned long orig_start = va->va_start; + unsigned long orig_end = va->va_end; /* * Finally insert or merge lazily-freed area. It is * detached and there is no need to "unlink" it from * anything. */ - merge_or_add_vmap_area(va, - &free_vmap_area_root, &free_vmap_area_list); + va = merge_or_add_vmap_area(va, &free_vmap_area_root, + &free_vmap_area_list); + + if (is_vmalloc_or_module_addr((void *)orig_start)) + kasan_release_vmalloc(orig_start, orig_end, + va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) - cond_resched_lock(&vmap_area_lock); + cond_resched_lock(&free_vmap_area_lock); } - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); return true; } @@ -2014,15 +2040,21 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) } EXPORT_SYMBOL_GPL(map_vm_area); -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, const void *caller) +static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, + struct vmap_area *va, unsigned long flags, const void *caller) { - spin_lock(&vmap_area_lock); vm->flags = flags; vm->addr = (void 
*)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->vm = vm; +} + +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, const void *caller) +{ + spin_lock(&vmap_area_lock); + setup_vmalloc_vm_locked(vm, va, flags, caller); spin_unlock(&vmap_area_lock); } @@ -2068,6 +2100,22 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, setup_vmalloc_vm(area, va, flags, caller); + /* + * For KASAN, if we are in vmalloc space, we need to cover the shadow + * area with real memory. If we come here through VM_ALLOC, this is + * done by a higher level function that has access to the true size, + * which might not be a full page. + * + * We assume module space comes via VM_ALLOC path. + */ + if (is_vmalloc_addr(area->addr) && !(area->flags & VM_ALLOC)) { + if (kasan_populate_vmalloc(area->size, area)) { + unmap_vmap_area(va); + kfree(area); + return NULL; + } + } + return area; } @@ -2245,6 +2293,9 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); + if (area->flags & VM_KASAN) + kasan_poison_vmalloc(area->addr, area->size); + vm_remove_mappings(area, deallocate_pages); if (deallocate_pages) { @@ -2440,7 +2491,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; - if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) + if (gfpflags_allow_blocking(gfp_mask)) cond_resched(); } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); @@ -2497,6 +2548,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!addr) return NULL; + if (is_vmalloc_or_module_addr(area->addr)) { + if (kasan_populate_vmalloc(real_size, area)) + return NULL; + } + /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. 
@@ -3282,7 +3338,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, goto err_free; } retry: - spin_lock(&vmap_area_lock); + spin_lock(&free_vmap_area_lock); /* start scanning - we scan from the top, begin with the last area */ area = term_area = last_area; @@ -3364,29 +3420,44 @@ retry: va = vas[area]; va->va_start = start; va->va_end = start + size; - - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); } - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); /* insert all vm's */ - for (area = 0; area < nr_vms; area++) - setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + spin_lock(&vmap_area_lock); + for (area = 0; area < nr_vms; area++) { + insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list); + + setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC, pcpu_get_vm_areas); + } + spin_unlock(&vmap_area_lock); + + /* populate the shadow space outside of the lock */ + for (area = 0; area < nr_vms; area++) { + /* assume success here */ + kasan_populate_vmalloc(sizes[area], vms[area]); + } kfree(vas); return vms; recovery: - /* Remove previously inserted areas. */ + /* + * Remove previously allocated areas. There is no + * need in removing these areas from the busy tree, + * because they are inserted only on the final step + * and when pcpu_get_vm_areas() is success. 
+ */ while (area--) { - __free_vmap_area(vas[area]); + merge_or_add_vmap_area(vas[area], &free_vmap_area_root, + &free_vmap_area_list); vas[area] = NULL; } overflow: - spin_unlock(&vmap_area_lock); + spin_unlock(&free_vmap_area_lock); if (!purged) { purge_vmap_area_lazy(); purged = true; @@ -3437,9 +3508,12 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmap_purge_lock) __acquires(&vmap_area_lock) { + mutex_lock(&vmap_purge_lock); spin_lock(&vmap_area_lock); + return seq_list_start(&vmap_area_list, *pos); } @@ -3449,8 +3523,10 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmap_purge_lock) __releases(&vmap_area_lock) { + mutex_unlock(&vmap_purge_lock); spin_unlock(&vmap_area_lock); } |