diff options
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r-- | mm/vmalloc.c | 461 |
1 files changed, 266 insertions, 195 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 61f5bec0f2b6..ef910bf349e1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -89,17 +89,6 @@ struct vfree_deferred { }; static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred); -static void __vunmap(const void *, int); - -static void free_work(struct work_struct *w) -{ - struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); - struct llist_node *t, *llnode; - - llist_for_each_safe(llnode, t, llist_del_all(&p->list)) - __vunmap((void *)llnode, 1); -} - /*** Page table manipulation functions ***/ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, phys_addr_t phys_addr, pgprot_t prot, @@ -1590,7 +1579,8 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node) static struct vmap_area *alloc_vmap_area(unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, - int node, gfp_t gfp_mask) + int node, gfp_t gfp_mask, + unsigned long va_flags) { struct vmap_area *va; unsigned long freed; @@ -1598,9 +1588,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, int purged = 0; int ret; - BUG_ON(!size); - BUG_ON(offset_in_page(size)); - BUG_ON(!is_power_of_2(align)); + if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align))) + return ERR_PTR(-EINVAL); if (unlikely(!vmap_initialized)) return ERR_PTR(-EBUSY); @@ -1636,6 +1625,7 @@ retry: va->va_start = addr; va->va_end = addr + size; va->vm = NULL; + va->flags = va_flags; spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); @@ -1816,9 +1806,9 @@ static void drain_vmap_area_work(struct work_struct *work) } /* - * Free a vmap area, caller ensuring that the area has been unmapped - * and flush_cache_vunmap had been called for the correct range - * previously. + * Free a vmap area, caller ensuring that the area has been unmapped, + * unlinked and flush_cache_vunmap had been called for the correct + * range previously. */ static void free_vmap_area_noflush(struct vmap_area *va) { @@ -1826,9 +1816,8 @@ static void free_vmap_area_noflush(struct vmap_area *va) unsigned long va_start = va->va_start; unsigned long nr_lazy; - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); + if (WARN_ON_ONCE(!list_empty(&va->list))) + return; nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -1872,6 +1861,19 @@ struct vmap_area *find_vmap_area(unsigned long addr) return va; } +static struct vmap_area *find_unlink_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr, &vmap_area_root); + if (va) + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + + return va; +} + /*** Per cpu kva allocator ***/ /* @@ -1902,6 +1904,10 @@ struct vmap_area *find_vmap_area(unsigned long addr) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) +#define VMAP_RAM 0x1 /* indicates vm_map_ram area*/ +#define VMAP_BLOCK 0x2 /* mark out the vmap_block sub-type*/ +#define VMAP_FLAGS_MASK 0x3 + struct vmap_block_queue { spinlock_t lock; struct list_head free; @@ -1911,6 +1917,7 @@ struct vmap_block { spinlock_t lock; struct vmap_area *va; unsigned long free, dirty; + DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS); unsigned long dirty_min, dirty_max; /*< dirty range */ struct list_head free_list; struct rcu_head rcu_head; @@ -1976,7 +1983,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, - node, gfp_mask); + node, gfp_mask, + VMAP_RAM|VMAP_BLOCK); if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); @@ -1987,10 +1995,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) vb->va = va; /* At least something should be left free */ BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); + bitmap_zero(vb->used_map, VMAP_BBMAP_BITS); vb->free = VMAP_BBMAP_BITS - (1UL << order); vb->dirty = 0; vb->dirty_min = VMAP_BBMAP_BITS; vb->dirty_max = 0; + bitmap_set(vb->used_map, 0, (1UL << order)); INIT_LIST_HEAD(&vb->free_list); vb_idx = addr_to_vb_idx(va->va_start); @@ -2016,6 +2026,10 @@ static void free_vmap_block(struct vmap_block *vb) tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start)); BUG_ON(tmp != vb); + spin_lock(&vmap_area_lock); + unlink_va(vb->va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + free_vmap_area_noflush(vb->va); kfree_rcu(vb, rcu_head); } @@ -2096,6 +2110,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) pages_off = VMAP_BBMAP_BITS - vb->free; vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); vb->free -= 1UL << order; + bitmap_set(vb->used_map, pages_off, (1UL << order)); if (vb->free == 0) { spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); @@ -2129,6 +2144,9 @@ static void vb_free(unsigned long addr, unsigned long size) order = get_order(size); offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT; vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr)); + spin_lock(&vb->lock); + bitmap_clear(vb->used_map, offset, (1UL << order)); + spin_unlock(&vb->lock); vunmap_range_noflush(addr, addr + size); @@ -2237,8 +2255,10 @@ void vm_unmap_ram(const void *mem, unsigned int count) return; } - va = find_vmap_area(addr); - BUG_ON(!va); + va = find_unlink_vmap_area(addr); + if (WARN_ON_ONCE(!va)) + return; + debug_check_no_locks_freed((void *)va->va_start, (va->va_end - va->va_start)); free_unmap_vmap_area(va); @@ -2273,7 +2293,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node) } else { struct vmap_area *va; va = alloc_vmap_area(size, PAGE_SIZE, - VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + VMALLOC_START, VMALLOC_END, + node, GFP_KERNEL, VMAP_RAM); if (IS_ERR(va)) return NULL; @@ -2417,48 +2438,6 @@ static void vmap_init_free_space(void) } } -void __init vmalloc_init(void) -{ - struct vmap_area *va; - struct vm_struct *tmp; - int i; - - /* - * Create the cache for vmap_area objects. - */ - vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); - - for_each_possible_cpu(i) { - struct vmap_block_queue *vbq; - struct vfree_deferred *p; - - vbq = &per_cpu(vmap_block_queue, i); - spin_lock_init(&vbq->lock); - INIT_LIST_HEAD(&vbq->free); - p = &per_cpu(vfree_deferred, i); - init_llist_head(&p->list); - INIT_WORK(&p->wq, free_work); - } - - /* Import existing vmlist entries. */ - for (tmp = vmlist; tmp; tmp = tmp->next) { - va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); - if (WARN_ON_ONCE(!va)) - continue; - - va->va_start = (unsigned long)tmp->addr; - va->va_end = va->va_start + tmp->size; - va->vm = tmp; - insert_vmap_area(va, &vmap_area_root, &vmap_area_list); - } - - /* - * Now we can initialize a free vmap space. - */ - vmap_init_free_space(); - vmap_initialized = true; -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2513,7 +2492,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (!(flags & VM_NO_GUARD)) size += PAGE_SIZE; - va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0); if (IS_ERR(va)) { kfree(area); return NULL; @@ -2605,25 +2584,26 @@ struct vm_struct *find_vm_area(const void *addr) struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; + struct vm_struct *vm; might_sleep(); - spin_lock(&vmap_area_lock); - va = __find_vmap_area((unsigned long)addr, &vmap_area_root); - if (va && va->vm) { - struct vm_struct *vm = va->vm; - - va->vm = NULL; - spin_unlock(&vmap_area_lock); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) + return NULL; - kasan_free_module_shadow(vm); - free_unmap_vmap_area(va); + va = find_unlink_vmap_area((unsigned long)addr); + if (!va || !va->vm) + return NULL; + vm = va->vm; - return vm; - } + debug_check_no_locks_freed(vm->addr, get_vm_area_size(vm)); + debug_check_no_obj_freed(vm->addr, get_vm_area_size(vm)); + kasan_free_module_shadow(vm); + kasan_poison_vmalloc(vm->addr, get_vm_area_size(vm)); - spin_unlock(&vmap_area_lock); - return NULL; + free_unmap_vmap_area(va); + return vm; } static inline void set_area_direct_map(const struct vm_struct *area, @@ -2637,37 +2617,23 @@ static inline void set_area_direct_map(const struct vm_struct *area, set_direct_map(area->pages[i]); } -/* Handle removing and resetting vm mappings related to the vm_struct. */ -static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) +/* + * Flush the vm mapping and reset the direct map. + */ +static void vm_reset_perms(struct vm_struct *area) { unsigned long start = ULONG_MAX, end = 0; unsigned int page_order = vm_area_page_order(area); - int flush_reset = area->flags & VM_FLUSH_RESET_PERMS; int flush_dmap = 0; int i; - remove_vm_area(area->addr); - - /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */ - if (!flush_reset) - return; - /* - * If not deallocating pages, just do the flush of the VM area and - * return. - */ - if (!deallocate_pages) { - vm_unmap_aliases(); - return; - } - - /* - * If execution gets here, flush the vm mapping and reset the direct - * map. Find the start and end range of the direct mappings to make sure + * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. */ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); + if (addr) { unsigned long page_size; @@ -2688,66 +2654,13 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) set_area_direct_map(area, set_direct_map_default_noflush); } -static void __vunmap(const void *addr, int deallocate_pages) +static void delayed_vfree_work(struct work_struct *w) { - struct vm_struct *area; - - if (!addr) - return; - - if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", - addr)) - return; - - area = find_vm_area(addr); - if (unlikely(!area)) { - WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", - addr); - return; - } - - debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); - debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - - kasan_poison_vmalloc(area->addr, get_vm_area_size(area)); - - vm_remove_mappings(area, deallocate_pages); - - if (deallocate_pages) { - int i; - - for (i = 0; i < area->nr_pages; i++) { - struct page *page = area->pages[i]; - - BUG_ON(!page); - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); - /* - * High-order allocs for huge vmallocs are split, so - * can be freed as an array of order-0 allocations - */ - __free_pages(page, 0); - cond_resched(); - } - atomic_long_sub(area->nr_pages, &nr_vmalloc_pages); - - kvfree(area->pages); - } - - kfree(area); -} - -static inline void __vfree_deferred(const void *addr) -{ - /* - * Use raw_cpu_ptr() because this can be called from preemptible - * context. Preemption is absolutely fine here, because the llist_add() - * implementation is lockless, so it works even if we are adding to - * another cpu's list. schedule_work() should be fine with this too. - */ - struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq); + struct llist_node *t, *llnode; - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); + llist_for_each_safe(llnode, t, llist_del_all(&p->list)) + vfree(llnode); } /** @@ -2759,21 +2672,19 @@ static inline void __vfree_deferred(const void *addr) */ void vfree_atomic(const void *addr) { - BUG_ON(in_nmi()); + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + BUG_ON(in_nmi()); kmemleak_free(addr); - if (!addr) - return; - __vfree_deferred(addr); -} - -static void __vfree(const void *addr) -{ - if (unlikely(in_interrupt())) - __vfree_deferred(addr); - else - __vunmap(addr, 1); + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * another cpu's list. schedule_work() should be fine with this too. + */ + if (addr && llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); } /** @@ -2795,16 +2706,45 @@ static void __vfree(const void *addr) */ void vfree(const void *addr) { - BUG_ON(in_nmi()); + struct vm_struct *vm; + int i; - kmemleak_free(addr); + if (unlikely(in_interrupt())) { + vfree_atomic(addr); + return; + } - might_sleep_if(!in_interrupt()); + BUG_ON(in_nmi()); + kmemleak_free(addr); + might_sleep(); if (!addr) return; - __vfree(addr); + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", + addr); + return; + } + + if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) + vm_reset_perms(vm); + for (i = 0; i < vm->nr_pages; i++) { + struct page *page = vm->pages[i]; + + BUG_ON(!page); + mod_memcg_page_state(page, MEMCG_VMALLOC, -1); + /* + * High-order allocs for huge vmallocs are split, so + * can be freed as an array of order-0 allocations + */ + __free_pages(page, 0); + cond_resched(); + } + atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages); + kvfree(vm->pages); + kfree(vm); } EXPORT_SYMBOL(vfree); @@ -2819,10 +2759,20 @@ EXPORT_SYMBOL(vfree); */ void vunmap(const void *addr) { + struct vm_struct *vm; + BUG_ON(in_interrupt()); might_sleep(); - if (addr) - __vunmap(addr, 0); + + if (!addr) + return; + vm = remove_vm_area(addr); + if (unlikely(!vm)) { + WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n", + addr); + return; + } + kfree(vm); } EXPORT_SYMBOL(vunmap); @@ -2850,6 +2800,9 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); + if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS)) + return NULL; + /* * Your top guard is someone else's bottom guard. Not having a top * guard compromises someone else's mappings too. @@ -3032,7 +2985,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); - gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; @@ -3068,7 +3021,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* * If not enough pages were obtained to accomplish an - * allocation request, free them via __vfree() if any. + * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { warn_alloc(gfp_mask, NULL, @@ -3108,7 +3061,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - __vfree(area->addr); + vfree(area->addr); return NULL; } @@ -3511,6 +3464,68 @@ static int aligned_vread(char *buf, char *addr, unsigned long count) return copied; } +static void vmap_ram_vread(char *buf, char *addr, int count, unsigned long flags) +{ + char *start; + struct vmap_block *vb; + unsigned long offset; + unsigned int rs, re, n; + + /* + * If it's area created by vm_map_ram() interface directly, but + * not further subdividing and delegating management to vmap_block, + * handle it here. + */ + if (!(flags & VMAP_BLOCK)) { + aligned_vread(buf, addr, count); + return; + } + + /* + * Area is split into regions and tracked with vmap_block, read out + * each region and zero fill the hole between regions. + */ + vb = xa_load(&vmap_blocks, addr_to_vb_idx((unsigned long)addr)); + if (!vb) + goto finished; + + spin_lock(&vb->lock); + if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) { + spin_unlock(&vb->lock); + goto finished; + } + for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) { + if (!count) + break; + start = vmap_block_vaddr(vb->va->va_start, rs); + while (addr < start) { + if (count == 0) + goto unlock; + *buf = '\0'; + buf++; + addr++; + count--; + } + /*it could start reading from the middle of used region*/ + offset = offset_in_page(addr); + n = ((re - rs + 1) << PAGE_SHIFT) - offset; + if (n > count) + n = count; + aligned_vread(buf, start+offset, n); + + buf += n; + addr += n; + count -= n; + } +unlock: + spin_unlock(&vb->lock); + +finished: + /* zero-fill the left dirty or free regions */ + if (count) + memset(buf, 0, count); +} + /** * vread() - read vmalloc area in a safe way. * @buf: buffer for reading data @@ -3541,7 +3556,7 @@ long vread(char *buf, char *addr, unsigned long count) struct vm_struct *vm; char *vaddr, *buf_start = buf; unsigned long buflen = count; - unsigned long n; + unsigned long n, size, flags; addr = kasan_reset_tag(addr); @@ -3562,12 +3577,26 @@ long vread(char *buf, char *addr, unsigned long count) if (!count) break; - if (!va->vm) + vm = va->vm; + flags = va->flags & VMAP_FLAGS_MASK; + /* + * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need + * be set together with VMAP_RAM. + */ + WARN_ON(flags == VMAP_BLOCK); + + if (!vm && !flags) continue; - vm = va->vm; - vaddr = (char *) vm->addr; - if (addr >= vaddr + get_vm_area_size(vm)) + if (vm && (vm->flags & VM_UNINITIALIZED)) + continue; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + + vaddr = (char *) va->va_start; + size = vm ? get_vm_area_size(vm) : va_size(va); + + if (addr >= vaddr + size) continue; while (addr < vaddr) { if (count == 0) @@ -3577,10 +3606,13 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + get_vm_area_size(vm) - addr; + n = vaddr + size - addr; if (n > count) n = count; - if (!(vm->flags & VM_IOREMAP)) + + if (flags & VMAP_RAM) + vmap_ram_vread(buf, addr, n, flags); + else if (!(vm->flags & VM_IOREMAP)) aligned_vread(buf, addr, n); else /* IOREMAP area is treated as memory hole */ memset(buf, 0, n); @@ -3658,7 +3690,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, size -= PAGE_SIZE; } while (size > 0); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP); return 0; } @@ -4126,14 +4158,11 @@ static int s_show(struct seq_file *m, void *p) va = list_entry(p, struct vmap_area, list); - /* - * s_show can encounter race with remove_vm_area, !vm on behalf - * of vmap area is being tear down or vm_map_ram allocation. - */ if (!va->vm) { - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + if (va->flags & VMAP_RAM) + seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", + (void *)va->va_start, (void *)va->va_end, + va->va_end - va->va_start); goto final; } @@ -4203,3 +4232,45 @@ static int __init proc_vmalloc_init(void) module_init(proc_vmalloc_init); #endif + +void __init vmalloc_init(void) +{ + struct vmap_area *va; + struct vm_struct *tmp; + int i; + + /* + * Create the cache for vmap_area objects. + */ + vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC); + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + struct vfree_deferred *p; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + p = &per_cpu(vfree_deferred, i); + init_llist_head(&p->list); + INIT_WORK(&p->wq, delayed_vfree_work); + } + + /* Import existing vmlist entries. */ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT); + if (WARN_ON_ONCE(!va)) + continue; + + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + va->vm = tmp; + insert_vmap_area(va, &vmap_area_root, &vmap_area_list); + } + + /* + * Now we can initialize a free vmap space. + */ + vmap_init_free_space(); + vmap_initialized = true; +} |