From d98c9e83b5e7ca78175df1b13ac4a6d460d3962d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Tue, 17 Dec 2019 20:51:38 -0800 Subject: kasan: fix crashes on access to memory mapped by vm_map_ram() With CONFIG_KASAN_VMALLOC=y any use of memory obtained via vm_map_ram() will crash because there is no shadow backing that memory. Instead of sprinkling additional kasan_populate_vmalloc() calls all over the vmalloc code, move it into alloc_vmap_area(). This will fix vm_map_ram() and simplify the code a bit. [aryabinin@virtuozzo.com: v2] Link: http://lkml.kernel.org/r/20191205095942.1761-1-aryabinin@virtuozzo.comLink: http://lkml.kernel.org/r/20191204204534.32202-1-aryabinin@virtuozzo.com Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory") Signed-off-by: Andrey Ryabinin Reported-by: Dmitry Vyukov Reviewed-by: Uladzislau Rezki (Sony) Cc: Daniel Axtens Cc: Alexander Potapenko Cc: Daniel Axtens Cc: Qian Cai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 15 +++++---- mm/kasan/common.c | 27 ++++++++++------ mm/vmalloc.c | 85 ++++++++++++++++++++++++--------------------------- 3 files changed, 67 insertions(+), 60 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 4f404c565db1..e18fe54969e9 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -205,20 +205,23 @@ static inline void *kasan_reset_tag(const void *addr) #endif /* CONFIG_KASAN_SW_TAGS */ #ifdef CONFIG_KASAN_VMALLOC -int kasan_populate_vmalloc(unsigned long requested_size, - struct vm_struct *area); -void kasan_poison_vmalloc(void *start, unsigned long size); +int kasan_populate_vmalloc(unsigned long addr, unsigned long size); +void kasan_poison_vmalloc(const void *start, unsigned long size); +void kasan_unpoison_vmalloc(const void *start, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, unsigned long free_region_end); #else -static inline int kasan_populate_vmalloc(unsigned long requested_size, - struct vm_struct *area) +static inline int kasan_populate_vmalloc(unsigned long start, + unsigned long size) { return 0; } -static inline void kasan_poison_vmalloc(void *start, unsigned long size) {} +static inline void kasan_poison_vmalloc(const void *start, unsigned long size) +{ } +static inline void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ } static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2fa710bb6358..e04e73603dfc 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -778,15 +778,17 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr, return 0; } -int kasan_populate_vmalloc(unsigned long requested_size, struct vm_struct *area) +int kasan_populate_vmalloc(unsigned long addr, unsigned long size) { unsigned long shadow_start, shadow_end; int ret; - shadow_start = (unsigned long)kasan_mem_to_shadow(area->addr); + if (!is_vmalloc_or_module_addr((void *)addr)) + return 0; + + shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr); shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE); - shadow_end = (unsigned long)kasan_mem_to_shadow(area->addr + - area->size); + shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size); shadow_end = ALIGN(shadow_end, PAGE_SIZE); ret = apply_to_page_range(&init_mm, shadow_start, @@ -797,10 +799,6 @@ int kasan_populate_vmalloc(unsigned long requested_size, struct vm_struct *area) flush_cache_vmap(shadow_start, shadow_end); - kasan_unpoison_shadow(area->addr, requested_size); - - area->flags |= VM_KASAN; - /* * We need to be careful about inter-cpu effects here. Consider: * @@ -843,12 +841,23 @@ int kasan_populate_vmalloc(unsigned long requested_size, struct vm_struct *area) * Poison the shadow for a vmalloc region. Called as part of the * freeing process at the time the region is freed. */ -void kasan_poison_vmalloc(void *start, unsigned long size) +void kasan_poison_vmalloc(const void *start, unsigned long size) { + if (!is_vmalloc_or_module_addr(start)) + return; + size = round_up(size, KASAN_SHADOW_SCALE_SIZE); kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID); } +void kasan_unpoison_vmalloc(const void *start, unsigned long size) +{ + if (!is_vmalloc_or_module_addr(start)) + return; + + kasan_unpoison_shadow(start, size); +} + static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, void *unused) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d3b3d60d893..6e865cea846c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1061,6 +1061,26 @@ __alloc_vmap_area(unsigned long size, unsigned long align, return nva_start_addr; } +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) +{ + /* + * Remove from the busy tree/list. + */ + spin_lock(&vmap_area_lock); + unlink_va(va, &vmap_area_root); + spin_unlock(&vmap_area_lock); + + /* + * Insert/Merge it back to the free tree/list. + */ + spin_lock(&free_vmap_area_lock); + merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); + spin_unlock(&free_vmap_area_lock); +} + /* * Allocate a region of KVA of the specified size and alignment, within the * vstart and vend. @@ -1073,6 +1093,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, struct vmap_area *va, *pva; unsigned long addr; int purged = 0; + int ret; BUG_ON(!size); BUG_ON(offset_in_page(size)); @@ -1139,6 +1160,7 @@ retry: va->va_end = addr + size; va->vm = NULL; + spin_lock(&vmap_area_lock); insert_vmap_area(va, &vmap_area_root, &vmap_area_list); spin_unlock(&vmap_area_lock); @@ -1147,6 +1169,12 @@ retry: BUG_ON(va->va_start < vstart); BUG_ON(va->va_end > vend); + ret = kasan_populate_vmalloc(addr, size); + if (ret) { + free_vmap_area(va); + return ERR_PTR(ret); + } + return va; overflow: @@ -1185,26 +1213,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier); -/* - * Free a region of KVA allocated by alloc_vmap_area - */ -static void free_vmap_area(struct vmap_area *va) -{ - /* - * Remove from the busy tree/list. - */ - spin_lock(&vmap_area_lock); - unlink_va(va, &vmap_area_root); - spin_unlock(&vmap_area_lock); - - /* - * Insert/Merge it back to the free tree/list. - */ - spin_lock(&free_vmap_area_lock); - merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list); - spin_unlock(&free_vmap_area_lock); -} - /* * Clear the pagetable entries of a given vmap_area */ @@ -1771,6 +1779,8 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); + kasan_poison_vmalloc(mem, size); + if (likely(count <= VMAP_MAX_ALLOC)) { debug_check_no_locks_freed(mem, size); vb_free(mem, size); @@ -1821,6 +1831,9 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro addr = va->va_start; mem = (void *)addr; } + + kasan_unpoison_vmalloc(mem, size); + if (vmap_page_range(addr, addr + size, prot, pages) < 0) { vm_unmap_ram(mem, count); return NULL; @@ -2075,6 +2088,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, { struct vmap_area *va; struct vm_struct *area; + unsigned long requested_size = size; BUG_ON(in_interrupt()); size = PAGE_ALIGN(size); @@ -2098,23 +2112,9 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - setup_vmalloc_vm(area, va, flags, caller); + kasan_unpoison_vmalloc((void *)va->va_start, requested_size); - /* - * For KASAN, if we are in vmalloc space, we need to cover the shadow - * area with real memory. If we come here through VM_ALLOC, this is - * done by a higher level function that has access to the true size, - * which might not be a full page. - * - * We assume module space comes via VM_ALLOC path. - */ - if (is_vmalloc_addr(area->addr) && !(area->flags & VM_ALLOC)) { - if (kasan_populate_vmalloc(area->size, area)) { - unmap_vmap_area(va); - kfree(area); - return NULL; - } - } + setup_vmalloc_vm(area, va, flags, caller); return area; } @@ -2293,8 +2293,7 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); - if (area->flags & VM_KASAN) - kasan_poison_vmalloc(area->addr, area->size); + kasan_poison_vmalloc(area->addr, area->size); vm_remove_mappings(area, deallocate_pages); @@ -2539,7 +2538,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages()) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | + area = __get_vm_area_node(real_size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail; @@ -2548,11 +2547,6 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!addr) return NULL; - if (is_vmalloc_or_module_addr(area->addr)) { - if (kasan_populate_vmalloc(real_size, area)) - return NULL; - } - /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED * flag. It means that vm_struct is not fully initialized. @@ -3437,7 +3431,8 @@ retry: /* populate the shadow space outside of the lock */ for (area = 0; area < nr_vms; area++) { /* assume success here */ - kasan_populate_vmalloc(sizes[area], vms[area]); + kasan_populate_vmalloc(vas[area]->va_start, sizes[area]); + kasan_unpoison_vmalloc((void *)vms[area]->addr, sizes[area]); } kfree(vas); -- cgit v1.2.3 From be1db4753ee6a0db80a900df9dbbf6ad2acc4bd1 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 17 Dec 2019 20:51:41 -0800 Subject: mm/memory.c: add apply_to_existing_page_range() helper apply_to_page_range() takes an address range, and if any parts of it are not covered by the existing page table hierarchy, it allocates memory to fill them in. In some use cases, this is not what we want - we want to be able to operate exclusively on PTEs that are already in the tables. Add apply_to_existing_page_range() for this. Adjust the walker functions for apply_to_page_range to take 'create', which switches them between the old and new modes. This will be used in KASAN vmalloc. [akpm@linux-foundation.org: reduce code duplication] [akpm@linux-foundation.org: s/apply_to_existing_pages/apply_to_existing_page_range/] [akpm@linux-foundation.org: initialize __apply_to_page_range::err] Link: http://lkml.kernel.org/r/20191205140407.1874-1-dja@axtens.net Signed-off-by: Daniel Axtens Cc: Dmitry Vyukov Cc: Uladzislau Rezki (Sony) Cc: Alexander Potapenko Cc: Daniel Axtens Cc: Qian Cai Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++ mm/memory.c | 136 ++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 97 insertions(+), 42 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c97ea3b694e6..80a9162b406c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2621,6 +2621,9 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +extern int apply_to_existing_page_range(struct mm_struct *mm, + unsigned long address, unsigned long size, + pte_fn_t fn, void *data); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); diff --git a/mm/memory.c b/mm/memory.c index 606da187d1de..45442d9a4f52 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2021,26 +2021,34 @@ EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) + pte_fn_t fn, void *data, bool create) { pte_t *pte; - int err; + int err = 0; spinlock_t *uninitialized_var(ptl); - pte = (mm == &init_mm) ? - pte_alloc_kernel(pmd, addr) : - pte_alloc_map_lock(mm, pmd, addr, &ptl); - if (!pte) - return -ENOMEM; + if (create) { + pte = (mm == &init_mm) ? + pte_alloc_kernel(pmd, addr) : + pte_alloc_map_lock(mm, pmd, addr, &ptl); + if (!pte) + return -ENOMEM; + } else { + pte = (mm == &init_mm) ? + pte_offset_kernel(pmd, addr) : + pte_offset_map_lock(mm, pmd, addr, &ptl); + } BUG_ON(pmd_huge(*pmd)); arch_enter_lazy_mmu_mode(); do { - err = fn(pte++, addr, data); - if (err) - break; + if (create || !pte_none(*pte)) { + err = fn(pte++, addr, data); + if (err) + break; + } } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -2052,77 +2060,95 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) + pte_fn_t fn, void *data, bool create) { pmd_t *pmd; unsigned long next; - int err; + int err = 0; BUG_ON(pud_huge(*pud)); - pmd = pmd_alloc(mm, pud, addr); - if (!pmd) - return -ENOMEM; + if (create) { + pmd = pmd_alloc(mm, pud, addr); + if (!pmd) + return -ENOMEM; + } else { + pmd = pmd_offset(pud, addr); + } do { next = pmd_addr_end(addr, end); - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); - if (err) - break; + if (create || !pmd_none_or_clear_bad(pmd)) { + err = apply_to_pte_range(mm, pmd, addr, next, fn, data, + create); + if (err) + break; + } } while (pmd++, addr = next, addr != end); return err; } static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) + pte_fn_t fn, void *data, bool create) { pud_t *pud; unsigned long next; - int err; + int err = 0; - pud = pud_alloc(mm, p4d, addr); - if (!pud) - return -ENOMEM; + if (create) { + pud = pud_alloc(mm, p4d, addr); + if (!pud) + return -ENOMEM; + } else { + pud = pud_offset(p4d, addr); + } do { next = pud_addr_end(addr, end); - err = apply_to_pmd_range(mm, pud, addr, next, fn, data); - if (err) - break; + if (create || !pud_none_or_clear_bad(pud)) { + err = apply_to_pmd_range(mm, pud, addr, next, fn, data, + create); + if (err) + break; + } } while (pud++, addr = next, addr != end); return err; } static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) + pte_fn_t fn, void *data, bool create) { p4d_t *p4d; unsigned long next; - int err; + int err = 0; - p4d = p4d_alloc(mm, pgd, addr); - if (!p4d) - return -ENOMEM; + if (create) { + p4d = p4d_alloc(mm, pgd, addr); + if (!p4d) + return -ENOMEM; + } else { + p4d = p4d_offset(pgd, addr); + } do { next = p4d_addr_end(addr, end); - err = apply_to_pud_range(mm, p4d, addr, next, fn, data); - if (err) - break; + if (create || !p4d_none_or_clear_bad(p4d)) { + err = apply_to_pud_range(mm, p4d, addr, next, fn, data, + create); + if (err) + break; + } } while (p4d++, addr = next, addr != end); return err; } -/* - * Scan a region of virtual memory, filling in page tables as necessary - * and calling a provided function on each leaf page table. - */ -int apply_to_page_range(struct mm_struct *mm, unsigned long addr, - unsigned long size, pte_fn_t fn, void *data) +static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, + void *data, bool create) { pgd_t *pgd; unsigned long next; unsigned long end = addr + size; - int err; + int err = 0; if (WARN_ON(addr >= end)) return -EINVAL; @@ -2130,15 +2156,41 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); + if (!create && pgd_none_or_clear_bad(pgd)) + continue; + err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create); if (err) break; } while (pgd++, addr = next, addr != end); return err; } + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + return __apply_to_page_range(mm, addr, size, fn, data, true); +} EXPORT_SYMBOL_GPL(apply_to_page_range); +/* + * Scan a region of virtual memory, calling a provided function on + * each leaf page table where it exists. + * + * Unlike apply_to_page_range, this does _not_ fill in page tables + * where they are absent. + */ +int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + return __apply_to_page_range(mm, addr, size, fn, data, false); +} +EXPORT_SYMBOL_GPL(apply_to_existing_page_range); + /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. Before making any commitment, on those architectures -- cgit v1.2.3 From e218f1ca3971e5bcaae1fe8e6f007f9a206e32e9 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 17 Dec 2019 20:51:46 -0800 Subject: kasan: use apply_to_existing_page_range() for releasing vmalloc shadow kasan_release_vmalloc uses apply_to_page_range to release vmalloc shadow. Unfortunately, apply_to_page_range can allocate memory to fill in page table entries, which is not what we want. Also, kasan_release_vmalloc is called under free_vmap_area_lock, so if apply_to_page_range does allocate memory, we get a sleep in atomic bug: BUG: sleeping function called from invalid context at mm/page_alloc.c:4681 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 15087, name: Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x199/0x216 lib/dump_stack.c:118 ___might_sleep.cold.97+0x1f5/0x238 kernel/sched/core.c:6800 __might_sleep+0x95/0x190 kernel/sched/core.c:6753 prepare_alloc_pages mm/page_alloc.c:4681 [inline] __alloc_pages_nodemask+0x3cd/0x890 mm/page_alloc.c:4730 alloc_pages_current+0x10c/0x210 mm/mempolicy.c:2211 alloc_pages include/linux/gfp.h:532 [inline] __get_free_pages+0xc/0x40 mm/page_alloc.c:4786 __pte_alloc_one_kernel include/asm-generic/pgalloc.h:21 [inline] pte_alloc_one_kernel include/asm-generic/pgalloc.h:33 [inline] __pte_alloc_kernel+0x1d/0x200 mm/memory.c:459 apply_to_pte_range mm/memory.c:2031 [inline] apply_to_pmd_range mm/memory.c:2068 [inline] apply_to_pud_range mm/memory.c:2088 [inline] apply_to_p4d_range mm/memory.c:2108 [inline] apply_to_page_range+0x77d/0xa00 mm/memory.c:2133 kasan_release_vmalloc+0xa7/0xc0 mm/kasan/common.c:970 __purge_vmap_area_lazy+0xcbb/0x1f30 mm/vmalloc.c:1313 try_purge_vmap_area_lazy mm/vmalloc.c:1332 [inline] free_vmap_area_noflush+0x2ca/0x390 mm/vmalloc.c:1368 free_unmap_vmap_area mm/vmalloc.c:1381 [inline] remove_vm_area+0x1cc/0x230 mm/vmalloc.c:2209 vm_remove_mappings mm/vmalloc.c:2236 [inline] __vunmap+0x223/0xa20 mm/vmalloc.c:2299 __vfree+0x3f/0xd0 mm/vmalloc.c:2356 __vmalloc_area_node mm/vmalloc.c:2507 [inline] __vmalloc_node_range+0x5d5/0x810 mm/vmalloc.c:2547 __vmalloc_node mm/vmalloc.c:2607 [inline] __vmalloc_node_flags mm/vmalloc.c:2621 [inline] vzalloc+0x6f/0x80 mm/vmalloc.c:2666 alloc_one_pg_vec_page net/packet/af_packet.c:4233 [inline] alloc_pg_vec net/packet/af_packet.c:4258 [inline] packet_set_ring+0xbc0/0x1b50 net/packet/af_packet.c:4342 packet_setsockopt+0xed7/0x2d90 net/packet/af_packet.c:3695 __sys_setsockopt+0x29b/0x4d0 net/socket.c:2117 __do_sys_setsockopt net/socket.c:2133 [inline] __se_sys_setsockopt net/socket.c:2130 [inline] __x64_sys_setsockopt+0xbe/0x150 net/socket.c:2130 do_syscall_64+0xfa/0x780 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe Switch to using the apply_to_existing_page_range() helper instead, which won't allocate memory. [akpm@linux-foundation.org: s/apply_to_existing_pages/apply_to_existing_page_range/] Link: http://lkml.kernel.org/r/20191205140407.1874-2-dja@axtens.net Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory") Signed-off-by: Daniel Axtens Reported-by: Dmitry Vyukov Reviewed-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Qian Cai Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/common.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index e04e73603dfc..c15d8ae68c96 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -957,6 +957,7 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, { void *shadow_start, *shadow_end; unsigned long region_start, region_end; + unsigned long size; region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE); @@ -979,9 +980,11 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, shadow_end = kasan_mem_to_shadow((void *)region_end); if (shadow_end > shadow_start) { - apply_to_page_range(&init_mm, (unsigned long)shadow_start, - (unsigned long)(shadow_end - shadow_start), - kasan_depopulate_vmalloc_pte, NULL); + size = shadow_end - shadow_start; + apply_to_existing_page_range(&init_mm, + (unsigned long)shadow_start, + size, kasan_depopulate_vmalloc_pte, + NULL); flush_tlb_kernel_range((unsigned long)shadow_start, (unsigned long)shadow_end); } -- cgit v1.2.3 From 253a496d8e57275d458eb3c988470525b0b2c545 Mon Sep 17 00:00:00 2001 From: Daniel Axtens Date: Tue, 17 Dec 2019 20:51:49 -0800 Subject: kasan: don't assume percpu shadow allocations will succeed syzkaller and the fault injector showed that I was wrong to assume that we could ignore percpu shadow allocation failures. Handle failures properly. Merge all the allocated areas back into the free list and release the shadow, then clean up and return NULL. The shadow is released unconditionally, which relies upon the fact that the release function is able to tolerate pages not being present. Also clean up shadows in the recovery path - currently they are not released, which leaks a bit of memory. Link: http://lkml.kernel.org/r/20191205140407.1874-3-dja@axtens.net Fixes: 3c5c3cfb9ef4 ("kasan: support backing vmalloc space with real shadow memory") Signed-off-by: Daniel Axtens Reported-by: syzbot+82e323920b78d54aaed5@syzkaller.appspotmail.com Reported-by: syzbot+59b7daa4315e07a994f1@syzkaller.appspotmail.com Reviewed-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Alexander Potapenko Cc: Qian Cai Cc: Uladzislau Rezki (Sony) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e865cea846c..e9681dc4aa75 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3288,7 +3288,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, struct vmap_area **vas, *va; struct vm_struct **vms; int area, area2, last_area, term_area; - unsigned long base, start, size, end, last_end; + unsigned long base, start, size, end, last_end, orig_start, orig_end; bool purged = false; enum fit_type type; @@ -3418,6 +3418,15 @@ retry: spin_unlock(&free_vmap_area_lock); + /* populate the kasan shadow space */ + for (area = 0; area < nr_vms; area++) { + if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area])) + goto err_free_shadow; + + kasan_unpoison_vmalloc((void *)vas[area]->va_start, + sizes[area]); + } + /* insert all vm's */ spin_lock(&vmap_area_lock); for (area = 0; area < nr_vms; area++) { @@ -3428,13 +3437,6 @@ retry: } spin_unlock(&vmap_area_lock); - /* populate the shadow space outside of the lock */ - for (area = 0; area < nr_vms; area++) { - /* assume success here */ - kasan_populate_vmalloc(vas[area]->va_start, sizes[area]); - kasan_unpoison_vmalloc((void *)vms[area]->addr, sizes[area]); - } - kfree(vas); return vms; @@ -3446,8 +3448,12 @@ recovery: * and when pcpu_get_vm_areas() is success. */ while (area--) { - merge_or_add_vmap_area(vas[area], &free_vmap_area_root, - &free_vmap_area_list); + orig_start = vas[area]->va_start; + orig_end = vas[area]->va_end; + va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, + &free_vmap_area_list); + kasan_release_vmalloc(orig_start, orig_end, + va->va_start, va->va_end); vas[area] = NULL; } @@ -3482,6 +3488,28 @@ err_free2: kfree(vas); kfree(vms); return NULL; + +err_free_shadow: + spin_lock(&free_vmap_area_lock); + /* + * We release all the vmalloc shadows, even the ones for regions that + * hadn't been successfully added. This relies on kasan_release_vmalloc + * being able to tolerate this case. + */ + for (area = 0; area < nr_vms; area++) { + orig_start = vas[area]->va_start; + orig_end = vas[area]->va_end; + va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root, + &free_vmap_area_list); + kasan_release_vmalloc(orig_start, orig_end, + va->va_start, va->va_end); + vas[area] = NULL; + kfree(vms[area]); + } + spin_unlock(&free_vmap_area_lock); + kfree(vas); + kfree(vms); + return NULL; } /** -- cgit v1.2.3 From 42a9a53bb394a1de2247ef78f0b802ae86798122 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 17 Dec 2019 20:51:52 -0800 Subject: mm: vmscan: protect shrinker idr replace with CONFIG_MEMCG Since commit 0a432dcbeb32 ("mm: shrinker: make shrinker not depend on memcg kmem"), shrinkers' idr is protected by CONFIG_MEMCG instead of CONFIG_MEMCG_KMEM, so it makes no sense to protect shrinker idr replace with CONFIG_MEMCG_KMEM. And in the CONFIG_MEMCG && CONFIG_SLOB case, shrinker_idr contains only shrinker, and it is deferred_split_shrinker. But it is never actually called, since idr_replace() is never compiled due to the wrong #ifdef. The deferred_split_shrinker all the time is staying in half-registered state, and it's never called for subordinate mem cgroups. Link: http://lkml.kernel.org/r/1575486978-45249-1-git-send-email-yang.shi@linux.alibaba.com Fixes: 0a432dcbeb32 ("mm: shrinker: make shrinker not depend on memcg kmem") Signed-off-by: Yang Shi Reviewed-by: Kirill Tkhai Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Cc: Roman Gushchin Cc: [5.4+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 74e8edce83ca..572fb17c6273 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -387,7 +387,7 @@ void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_MEMCG if (shrinker->flags & SHRINKER_MEMCG_AWARE) idr_replace(&shrinker_idr, shrinker, shrinker->id); #endif -- cgit v1.2.3 From 045f6d7942be248fbda6e85b2393f2735695ed39 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 17 Dec 2019 20:51:56 -0800 Subject: lib/Kconfig.debug: fix some messed up configurations Some configuration items are messed up during conflict resolving. For example, STRICT_DEVMEM should not in testing menu, but kunit should. This patch fixes all of them. [akpm@linux-foundation.org: coding style fixes] Link: http://lkml.kernel.org/r/20191209155653.7509-1-changbin.du@gmail.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 100 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d1842fe756d5..5ffe144c9794 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1483,6 +1483,55 @@ config PROVIDE_OHCI1394_DMA_INIT See Documentation/debugging-via-ohci1394.txt for more information. +source "samples/Kconfig" + +config ARCH_HAS_DEVMEM_IS_ALLOWED + bool + +config STRICT_DEVMEM + bool "Filter access to /dev/mem" + depends on MMU && DEVMEM + depends on ARCH_HAS_DEVMEM_IS_ALLOWED + default y if PPC || X86 || ARM64 + help + If this option is disabled, you allow userspace (root) access to all + of memory, including kernel and userspace memory. Accidental + access to this is obviously disastrous, but specific access can + be used by people debugging the kernel. Note that with PAT support + enabled, even in this case there are restrictions on /dev/mem + use due to the cache aliasing requirements. + + If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem + file only allows userspace access to PCI space and the BIOS code and + data regions. This is sufficient for dosemu and X and all common + users of /dev/mem. + + If in doubt, say Y. + +config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM + help + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that + range. Accidental access to this is obviously disastrous, but + specific access can be used by people debugging kernel drivers. + + If this option is switched on, the /dev/mem file only allows + userspace access to *idle* io-memory ranges (see /proc/iomem) This + may break traditional users of /dev/mem (dosemu, legacy X, etc...) + if the driver using a given range cannot be disabled. + + If in doubt, say Y. + +menu "$(SRCARCH) Debugging" + +source "arch/$(SRCARCH)/Kconfig.debug" + +endmenu + +menu "Kernel Testing and Coverage" + source "lib/kunit/Kconfig" config NOTIFIER_ERROR_INJECTION @@ -1643,10 +1692,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER help Provide stacktrace filter for fault-injection capabilities -endmenu # "Kernel Testing and Coverage" - -menu "Kernel Testing and Coverage" - config ARCH_HAS_KCOV bool help @@ -2130,52 +2175,7 @@ config MEMTEST memtest=17, mean do 17 test patterns. If you are unsure how to answer this question, answer N. -source "samples/Kconfig" - -config ARCH_HAS_DEVMEM_IS_ALLOWED - bool - -config STRICT_DEVMEM - bool "Filter access to /dev/mem" - depends on MMU && DEVMEM - depends on ARCH_HAS_DEVMEM_IS_ALLOWED - default y if PPC || X86 || ARM64 - ---help--- - If this option is disabled, you allow userspace (root) access to all - of memory, including kernel and userspace memory. Accidental - access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. Note that with PAT support - enabled, even in this case there are restrictions on /dev/mem - use due to the cache aliasing requirements. - - If this option is switched on, and IO_STRICT_DEVMEM=n, the /dev/mem - file only allows userspace access to PCI space and the BIOS code and - data regions. This is sufficient for dosemu and X and all common - users of /dev/mem. - - If in doubt, say Y. -config IO_STRICT_DEVMEM - bool "Filter I/O access to /dev/mem" - depends on STRICT_DEVMEM - ---help--- - If this option is disabled, you allow userspace (root) access to all - io-memory regardless of whether a driver is actively using that - range. Accidental access to this is obviously disastrous, but - specific access can be used by people debugging kernel drivers. - - If this option is switched on, the /dev/mem file only allows - userspace access to *idle* io-memory ranges (see /proc/iomem) This - may break traditional users of /dev/mem (dosemu, legacy X, etc...) - if the driver using a given range cannot be disabled. - - If in doubt, say Y. - -menu "$(SRCARCH) Debugging" - -source "arch/$(SRCARCH)/Kconfig.debug" - -endmenu config HYPERV_TESTING bool "Microsoft Hyper-V driver testing" @@ -2184,4 +2184,6 @@ config HYPERV_TESTING help Select this option to enable Hyper-V vmbus testing. +endmenu # "Kernel Testing and Coverage" + endmenu # Kernel hacking -- cgit v1.2.3