diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/cma.c | 12 | ||||
-rw-r--r-- | mm/huge_memory.c | 25 | ||||
-rw-r--r-- | mm/hugetlb.c | 4 | ||||
-rw-r--r-- | mm/iov_iter.c | 753 | ||||
-rw-r--r-- | mm/kasan/kasan.c | 14 | ||||
-rw-r--r-- | mm/memcontrol.c | 20 | ||||
-rw-r--r-- | mm/memory.c | 17 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 13 | ||||
-rw-r--r-- | mm/mlock.c | 4 | ||||
-rw-r--r-- | mm/mmap.c | 4 | ||||
-rw-r--r-- | mm/mprotect.c | 3 | ||||
-rw-r--r-- | mm/mremap.c | 10 | ||||
-rw-r--r-- | mm/nommu.c | 5 | ||||
-rw-r--r-- | mm/page-writeback.c | 7 | ||||
-rw-r--r-- | mm/page_alloc.c | 12 | ||||
-rw-r--r-- | mm/page_isolation.c | 1 | ||||
-rw-r--r-- | mm/pagewalk.c | 9 | ||||
-rw-r--r-- | mm/rmap.c | 7 | ||||
-rw-r--r-- | mm/shmem.c | 3 | ||||
-rw-r--r-- | mm/slub.c | 6 | ||||
-rw-r--r-- | mm/vmalloc.c | 1 |
22 files changed, 119 insertions, 813 deletions
diff --git a/mm/Makefile b/mm/Makefile index 3c1caa2693bd..15dbe9903c27 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -21,7 +21,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - iov_iter.o debug.o $(mmu-y) + debug.o $(mmu-y) obj-y += init-mm.o @@ -64,15 +64,17 @@ static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) return (1UL << (align_order - cma->order_per_bit)) - 1; } +/* + * Find a PFN aligned to the specified order and return an offset represented in + * order_per_bits. + */ static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) { - unsigned int alignment; - if (align_order <= cma->order_per_bit) return 0; - alignment = 1UL << (align_order - cma->order_per_bit); - return ALIGN(cma->base_pfn, alignment) - - (cma->base_pfn >> cma->order_per_bit); + + return (ALIGN(cma->base_pfn, (1UL << align_order)) + - cma->base_pfn) >> cma->order_per_bit; } static unsigned long cma_bitmap_maxno(struct cma *cma) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fc00c8cb5a82..6817b0350c71 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1260,6 +1260,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int target_nid, last_cpupid = -1; bool page_locked; bool migrated = false; + bool was_writable; int flags = 0; /* A PROT_NONE fault should not end up here */ @@ -1291,12 +1292,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, flags |= TNF_FAULT_LOCAL; } - /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. - */ - if (!pmd_write(pmd)) + /* See similar comment in do_numa_page for explanation */ + if (!(vma->vm_flags & VM_WRITE)) flags |= TNF_NO_GROUP; /* @@ -1353,12 +1350,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; - } + } else + flags |= TNF_MIGRATE_FAIL; goto out; clear_pmdnuma: BUG_ON(!PageLocked(page)); + was_writable = pmd_write(pmd); pmd = pmd_modify(pmd, vma->vm_page_prot); + pmd = pmd_mkyoung(pmd); + if (was_writable) + pmd = pmd_mkwrite(pmd); set_pmd_at(mm, haddr, pmdp, pmd); update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); @@ -1482,6 +1484,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; + bool preserve_write = prot_numa && pmd_write(*pmd); + ret = 1; /* * Avoid trapping faults against the zero page. The read-only @@ -1490,16 +1494,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (prot_numa && is_huge_zero_pmd(*pmd)) { spin_unlock(ptl); - return 0; + return ret; } if (!prot_numa || !pmd_protnone(*pmd)) { - ret = 1; entry = pmdp_get_and_clear_notify(mm, addr, pmd); entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mkwrite(entry); ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); - BUG_ON(pmd_write(entry)); + BUG_ON(!preserve_write && pmd_write(entry)); } spin_unlock(ptl); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0a9ac6c26832..c41b2a0ee273 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -917,7 +917,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __SetPageHead(page); __ClearPageReserved(page); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic @@ -933,6 +932,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) __ClearPageReserved(p); set_page_count(p, 0); p->first_page = page; + /* Make sure p->first_page is always valid for PageTail() */ + smp_wmb(); + __SetPageTail(p); } } diff --git a/mm/iov_iter.c b/mm/iov_iter.c deleted file mode 100644 index 827732047da1..000000000000 --- a/mm/iov_iter.c +++ /dev/null @@ -1,753 +0,0 @@ -#include <linux/export.h> -#include <linux/uio.h> -#include <linux/pagemap.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <net/checksum.h> - -#define iterate_iovec(i, n, __v, __p, skip, STEP) { \ - size_t left; \ - size_t wanted = n; \ - __p = i->iov; \ - __v.iov_len = min(n, __p->iov_len - skip); \ - if (likely(__v.iov_len)) { \ - __v.iov_base = __p->iov_base + skip; \ - left = (STEP); \ - __v.iov_len -= left; \ - skip += __v.iov_len; \ - n -= __v.iov_len; \ - } else { \ - left = 0; \ - } \ - while (unlikely(!left && n)) { \ - __p++; \ - __v.iov_len = min(n, __p->iov_len); \ - if (unlikely(!__v.iov_len)) \ - continue; \ - __v.iov_base = __p->iov_base; \ - left = (STEP); \ - __v.iov_len -= left; \ - skip = __v.iov_len; \ - n -= __v.iov_len; \ - } \ - n = wanted - n; \ -} - -#define iterate_kvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->kvec; \ - __v.iov_len = min(n, __p->iov_len - skip); \ - if (likely(__v.iov_len)) { \ - __v.iov_base = __p->iov_base + skip; \ - (void)(STEP); \ - skip += __v.iov_len; \ - n -= __v.iov_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.iov_len = min(n, __p->iov_len); \ - if (unlikely(!__v.iov_len)) \ - continue; \ - __v.iov_base = __p->iov_base; \ - (void)(STEP); \ - skip = __v.iov_len; \ - n -= __v.iov_len; \ - } \ - n = wanted; \ -} - -#define iterate_bvec(i, n, __v, __p, skip, STEP) { \ - size_t wanted = n; \ - __p = i->bvec; \ - __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ - if (likely(__v.bv_len)) { \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset + skip; \ - (void)(STEP); \ - skip += __v.bv_len; \ - n -= __v.bv_len; \ - } \ - while (unlikely(n)) { \ - __p++; \ - __v.bv_len = min_t(size_t, n, __p->bv_len); \ - if (unlikely(!__v.bv_len)) \ - continue; \ - __v.bv_page = __p->bv_page; \ - __v.bv_offset = __p->bv_offset; \ - (void)(STEP); \ - skip = __v.bv_len; \ - n -= __v.bv_len; \ - } \ - n = wanted; \ -} - -#define iterate_all_kinds(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ - struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ - } \ -} - -#define iterate_and_advance(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - const struct bio_vec *bvec; \ - struct bio_vec v; \ - iterate_bvec(i, n, v, bvec, skip, (B)) \ - if (skip == bvec->bv_len) { \ - bvec++; \ - skip = 0; \ - } \ - i->nr_segs -= bvec - i->bvec; \ - i->bvec = bvec; \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - if (skip == kvec->iov_len) { \ - kvec++; \ - skip = 0; \ - } \ - i->nr_segs -= kvec - i->kvec; \ - i->kvec = kvec; \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ - if (skip == iov->iov_len) { \ - iov++; \ - skip = 0; \ - } \ - i->nr_segs -= iov - i->iov; \ - i->iov = iov; \ - } \ - i->count -= n; \ - i->iov_offset = skip; \ -} - -static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *from; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_writeable(buf, copy)) { - kaddr = kmap_atomic(page); - from = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user_inatomic(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = from - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - from = kaddr + offset; - left = __copy_to_user(buf, from, copy); - copy -= left; - skip += copy; - from += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_to_user(buf, from, copy); - copy -= left; - skip = copy; - from += copy; - bytes -= copy; - } - kunmap(page); -done: - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - size_t skip, copy, left, wanted; - const struct iovec *iov; - char __user *buf; - void *kaddr, *to; - - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - wanted = bytes; - iov = i->iov; - skip = i->iov_offset; - buf = iov->iov_base + skip; - copy = min(bytes, iov->iov_len - skip); - - if (!fault_in_pages_readable(buf, copy)) { - kaddr = kmap_atomic(page); - to = kaddr + offset; - - /* first chunk, usually the only one */ - left = __copy_from_user_inatomic(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user_inatomic(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - if (likely(!bytes)) { - kunmap_atomic(kaddr); - goto done; - } - offset = to - kaddr; - buf += copy; - kunmap_atomic(kaddr); - copy = min(bytes, iov->iov_len - skip); - } - /* Too bad - revert to non-atomic kmap */ - kaddr = kmap(page); - to = kaddr + offset; - left = __copy_from_user(to, buf, copy); - copy -= left; - skip += copy; - to += copy; - bytes -= copy; - while (unlikely(!left && bytes)) { - iov++; - buf = iov->iov_base; - copy = min(bytes, iov->iov_len); - left = __copy_from_user(to, buf, copy); - copy -= left; - skip = copy; - to += copy; - bytes -= copy; - } - kunmap(page); -done: - if (skip == iov->iov_len) { - iov++; - skip = 0; - } - i->count -= wanted - bytes; - i->nr_segs -= iov - i->iov; - i->iov = iov; - i->iov_offset = skip; - return wanted - bytes; -} - -/* - * Fault in the first iovec of the given iov_iter, to a maximum length - * of bytes. Returns 0 on success, or non-zero if the memory could not be - * accessed (ie. because it is an invalid address). - * - * writev-intensive code may want this to prefault several iovecs -- that - * would be possible (callers must not rely on the fact that _only_ the - * first iovec will be faulted with the current implementation). - */ -int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) -{ - if (!(i->type & (ITER_BVEC|ITER_KVEC))) { - char __user *buf = i->iov->iov_base + i->iov_offset; - bytes = min(bytes, i->iov->iov_len - i->iov_offset); - return fault_in_pages_readable(buf, bytes); - } - return 0; -} -EXPORT_SYMBOL(iov_iter_fault_in_readable); - -void iov_iter_init(struct iov_iter *i, int direction, - const struct iovec *iov, unsigned long nr_segs, - size_t count) -{ - /* It will get better. Eventually... */ - if (segment_eq(get_fs(), KERNEL_DS)) { - direction |= ITER_KVEC; - i->type = direction; - i->kvec = (struct kvec *)iov; - } else { - i->type = direction; - i->iov = iov; - } - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_init); - -static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) -{ - char *from = kmap_atomic(page); - memcpy(to, from + offset, len); - kunmap_atomic(from); -} - -static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) -{ - char *to = kmap_atomic(page); - memcpy(to + offset, from, len); - kunmap_atomic(to); -} - -static void memzero_page(struct page *page, size_t offset, size_t len) -{ - char *addr = kmap_atomic(page); - memset(addr + offset, 0, len); - kunmap_atomic(addr); -} - -size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - char *from = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, - v.iov_len), - memcpy_to_page(v.bv_page, v.bv_offset, - (from += v.bv_len) - v.bv_len, v.bv_len), - memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_to_iter); - -size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) -{ - char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, - v.iov_len), - memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_from_iter); - -size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) -{ - char *to = addr; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __copy_from_user_nocache((to += v.iov_len) - v.iov_len, - v.iov_base, v.iov_len), - memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(copy_from_iter_nocache); - -size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - if (i->type & (ITER_BVEC|ITER_KVEC)) { - void *kaddr = kmap_atomic(page); - size_t wanted = copy_to_iter(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; - } else - return copy_page_to_iter_iovec(page, offset, bytes, i); -} -EXPORT_SYMBOL(copy_page_to_iter); - -size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, - struct iov_iter *i) -{ - if (i->type & (ITER_BVEC|ITER_KVEC)) { - void *kaddr = kmap_atomic(page); - size_t wanted = copy_from_iter(kaddr + offset, bytes, i); - kunmap_atomic(kaddr); - return wanted; - } else - return copy_page_from_iter_iovec(page, offset, bytes, i); -} -EXPORT_SYMBOL(copy_page_from_iter); - -size_t iov_iter_zero(size_t bytes, struct iov_iter *i) -{ - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - iterate_and_advance(i, bytes, v, - __clear_user(v.iov_base, v.iov_len), - memzero_page(v.bv_page, v.bv_offset, v.bv_len), - memset(v.iov_base, 0, v.iov_len) - ) - - return bytes; -} -EXPORT_SYMBOL(iov_iter_zero); - -size_t iov_iter_copy_from_user_atomic(struct page *page, - struct iov_iter *i, unsigned long offset, size_t bytes) -{ - char *kaddr = kmap_atomic(page), *p = kaddr + offset; - iterate_all_kinds(i, bytes, v, - __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, - v.iov_base, v.iov_len), - memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, - v.bv_offset, v.bv_len), - memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) - ) - kunmap_atomic(kaddr); - return bytes; -} -EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); - -void iov_iter_advance(struct iov_iter *i, size_t size) -{ - iterate_and_advance(i, size, v, 0, 0, 0) -} -EXPORT_SYMBOL(iov_iter_advance); - -/* - * Return the count of just the current iov_iter segment. - */ -size_t iov_iter_single_seg_count(const struct iov_iter *i) -{ - if (i->nr_segs == 1) - return i->count; - else if (i->type & ITER_BVEC) - return min(i->count, i->bvec->bv_len - i->iov_offset); - else - return min(i->count, i->iov->iov_len - i->iov_offset); -} -EXPORT_SYMBOL(iov_iter_single_seg_count); - -void iov_iter_kvec(struct iov_iter *i, int direction, - const struct kvec *kvec, unsigned long nr_segs, - size_t count) -{ - BUG_ON(!(direction & ITER_KVEC)); - i->type = direction; - i->kvec = kvec; - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_kvec); - -void iov_iter_bvec(struct iov_iter *i, int direction, - const struct bio_vec *bvec, unsigned long nr_segs, - size_t count) -{ - BUG_ON(!(direction & ITER_BVEC)); - i->type = direction; - i->bvec = bvec; - i->nr_segs = nr_segs; - i->iov_offset = 0; - i->count = count; -} -EXPORT_SYMBOL(iov_iter_bvec); - -unsigned long iov_iter_alignment(const struct iov_iter *i) -{ - unsigned long res = 0; - size_t size = i->count; - - if (!size) - return 0; - - iterate_all_kinds(i, size, v, - (res |= (unsigned long)v.iov_base | v.iov_len, 0), - res |= v.bv_offset | v.bv_len, - res |= (unsigned long)v.iov_base | v.iov_len - ) - return res; -} -EXPORT_SYMBOL(iov_iter_alignment); - -ssize_t iov_iter_get_pages(struct iov_iter *i, - struct page **pages, size_t maxsize, unsigned maxpages, - size_t *start) -{ - if (maxsize > i->count) - maxsize = i->count; - - if (!maxsize) - return 0; - - iterate_all_kinds(i, maxsize, v, ({ - unsigned long addr = (unsigned long)v.iov_base; - size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); - int n; - int res; - - if (len > maxpages * PAGE_SIZE) - len = maxpages * PAGE_SIZE; - addr &= ~(PAGE_SIZE - 1); - n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); - if (unlikely(res < 0)) - return res; - return (res == n ? len : res * PAGE_SIZE) - *start; - 0;}),({ - /* can't be more than PAGE_SIZE */ - *start = v.bv_offset; - get_page(*pages = v.bv_page); - return v.bv_len; - }),({ - return -EFAULT; - }) - ) - return 0; -} -EXPORT_SYMBOL(iov_iter_get_pages); - -static struct page **get_pages_array(size_t n) -{ - struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); - if (!p) - p = vmalloc(n * sizeof(struct page *)); - return p; -} - -ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, - struct page ***pages, size_t maxsize, - size_t *start) -{ - struct page **p; - - if (maxsize > i->count) - maxsize = i->count; - - if (!maxsize) - return 0; - - iterate_all_kinds(i, maxsize, v, ({ - unsigned long addr = (unsigned long)v.iov_base; - size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); - int n; - int res; - - addr &= ~(PAGE_SIZE - 1); - n = DIV_ROUND_UP(len, PAGE_SIZE); - p = get_pages_array(n); - if (!p) - return -ENOMEM; - res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); - if (unlikely(res < 0)) { - kvfree(p); - return res; - } - *pages = p; - return (res == n ? len : res * PAGE_SIZE) - *start; - 0;}),({ - /* can't be more than PAGE_SIZE */ - *start = v.bv_offset; - *pages = p = get_pages_array(1); - if (!p) - return -ENOMEM; - get_page(*p = v.bv_page); - return v.bv_len; - }),({ - return -EFAULT; - }) - ) - return 0; -} -EXPORT_SYMBOL(iov_iter_get_pages_alloc); - -size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, - struct iov_iter *i) -{ - char *to = addr; - __wsum sum, next; - size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - sum = *csum; - iterate_and_advance(i, bytes, v, ({ - int err = 0; - next = csum_and_copy_from_user(v.iov_base, - (to += v.iov_len) - v.iov_len, - v.iov_len, 0, &err); - if (!err) { - sum = csum_block_add(sum, next, off); - off += v.iov_len; - } - err ? v.iov_len : 0; - }), ({ - char *p = kmap_atomic(v.bv_page); - next = csum_partial_copy_nocheck(p + v.bv_offset, - (to += v.bv_len) - v.bv_len, - v.bv_len, 0); - kunmap_atomic(p); - sum = csum_block_add(sum, next, off); - off += v.bv_len; - }),({ - next = csum_partial_copy_nocheck(v.iov_base, - (to += v.iov_len) - v.iov_len, - v.iov_len, 0); - sum = csum_block_add(sum, next, off); - off += v.iov_len; - }) - ) - *csum = sum; - return bytes; -} -EXPORT_SYMBOL(csum_and_copy_from_iter); - -size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, - struct iov_iter *i) -{ - char *from = addr; - __wsum sum, next; - size_t off = 0; - if (unlikely(bytes > i->count)) - bytes = i->count; - - if (unlikely(!bytes)) - return 0; - - sum = *csum; - iterate_and_advance(i, bytes, v, ({ - int err = 0; - next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, - v.iov_base, - v.iov_len, 0, &err); - if (!err) { - sum = csum_block_add(sum, next, off); - off += v.iov_len; - } - err ? v.iov_len : 0; - }), ({ - char *p = kmap_atomic(v.bv_page); - next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, - p + v.bv_offset, - v.bv_len, 0); - kunmap_atomic(p); - sum = csum_block_add(sum, next, off); - off += v.bv_len; - }),({ - next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len, - v.iov_base, - v.iov_len, 0); - sum = csum_block_add(sum, next, off); - off += v.iov_len; - }) - ) - *csum = sum; - return bytes; -} -EXPORT_SYMBOL(csum_and_copy_to_iter); - -int iov_iter_npages(const struct iov_iter *i, int maxpages) -{ - size_t size = i->count; - int npages = 0; - - if (!size) - return 0; - - iterate_all_kinds(i, size, v, ({ - unsigned long p = (unsigned long)v.iov_base; - npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - - p / PAGE_SIZE; - if (npages >= maxpages) - return maxpages; - 0;}),({ - npages++; - if (npages >= maxpages) - return maxpages; - }),({ - unsigned long p = (unsigned long)v.iov_base; - npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) - - p / PAGE_SIZE; - if (npages >= maxpages) - return maxpages; - }) - ) - return npages; -} -EXPORT_SYMBOL(iov_iter_npages); diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 78fee632a7ee..936d81661c47 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -29,6 +29,7 @@ #include <linux/stacktrace.h> #include <linux/string.h> #include <linux/types.h> +#include <linux/vmalloc.h> #include <linux/kasan.h> #include "kasan.h" @@ -414,12 +415,19 @@ int kasan_module_alloc(void *addr, size_t size) GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); - return ret ? 0 : -ENOMEM; + + if (ret) { + find_vm_area(addr)->flags |= VM_KASAN; + return 0; + } + + return -ENOMEM; } -void kasan_module_free(void *addr) +void kasan_free_shadow(const struct vm_struct *vm) { - vfree(kasan_mem_to_shadow(addr)); + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); } static void register_global(struct kasan_global *global) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d18d3a6e7337..b34ef4a32a3b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5232,7 +5232,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) * on for the root memcg is enough. */ if (cgroup_on_dfl(root_css->cgroup)) - mem_cgroup_from_css(root_css)->use_hierarchy = true; + root_mem_cgroup->use_hierarchy = true; + else + root_mem_cgroup->use_hierarchy = false; } static u64 memory_current_read(struct cgroup_subsys_state *css, @@ -5247,7 +5249,7 @@ static int memory_low_show(struct seq_file *m, void *v) unsigned long low = ACCESS_ONCE(memcg->low); if (low == PAGE_COUNTER_MAX) - seq_puts(m, "infinity\n"); + seq_puts(m, "max\n"); else seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE); @@ -5262,7 +5264,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, int err; buf = strstrip(buf); - err = page_counter_memparse(buf, "infinity", &low); + err = page_counter_memparse(buf, "max", &low); if (err) return err; @@ -5277,7 +5279,7 @@ static int memory_high_show(struct seq_file *m, void *v) unsigned long high = ACCESS_ONCE(memcg->high); if (high == PAGE_COUNTER_MAX) - seq_puts(m, "infinity\n"); + seq_puts(m, "max\n"); else seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE); @@ -5292,7 +5294,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, int err; buf = strstrip(buf); - err = page_counter_memparse(buf, "infinity", &high); + err = page_counter_memparse(buf, "max", &high); if (err) return err; @@ -5307,7 +5309,7 @@ static int memory_max_show(struct seq_file *m, void *v) unsigned long max = ACCESS_ONCE(memcg->memory.limit); if (max == PAGE_COUNTER_MAX) - seq_puts(m, "infinity\n"); + seq_puts(m, "max\n"); else seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE); @@ -5322,7 +5324,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, int err; buf = strstrip(buf); - err = page_counter_memparse(buf, "infinity", &max); + err = page_counter_memparse(buf, "max", &max); if (err) return err; @@ -5426,7 +5428,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) if (memcg == root_mem_cgroup) return false; - if (page_counter_read(&memcg->memory) > memcg->low) + if (page_counter_read(&memcg->memory) >= memcg->low) return false; while (memcg != root) { @@ -5435,7 +5437,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) if (memcg == root_mem_cgroup) break; - if (page_counter_read(&memcg->memory) > memcg->low) + if (page_counter_read(&memcg->memory) >= memcg->low) return false; } return true; diff --git a/mm/memory.c b/mm/memory.c index 8068893697bb..97839f5c8c30 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3035,6 +3035,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, int last_cpupid; int target_nid; bool migrated = false; + bool was_writable = pte_write(pte); int flags = 0; /* A PROT_NONE fault should not end up here */ @@ -3059,6 +3060,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Make it present again */ pte = pte_modify(pte, vma->vm_page_prot); pte = pte_mkyoung(pte); + if (was_writable) + pte = pte_mkwrite(pte); set_pte_at(mm, addr, ptep, pte); update_mmu_cache(vma, addr, ptep); @@ -3069,11 +3072,14 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. + * Avoid grouping on RO pages in general. RO pages shouldn't hurt as + * much anyway since they can be in shared cache state. This misses + * the case where a mapping is writable but the process never writes + * to it but pte_write gets cleared during protection updates and + * pte_dirty has unpredictable behaviour between PTE scan updates, + * background writeback, dirty balancing and application behaviour. */ - if (!pte_write(pte)) + if (!(vma->vm_flags & VM_WRITE)) flags |= TNF_NO_GROUP; /* @@ -3097,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { page_nid = target_nid; flags |= TNF_MIGRATED; - } + } else + flags |= TNF_MIGRATE_FAIL; out: if (page_nid != -1) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9fab10795bea..65842d688b7c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1092,6 +1092,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) return NULL; arch_refresh_nodedata(nid, pgdat); + } else { + /* Reset the nr_zones and classzone_idx to 0 before reuse */ + pgdat->nr_zones = 0; + pgdat->classzone_idx = 0; } /* we can use NODE_DATA(nid) from here */ @@ -1977,15 +1981,6 @@ void try_offline_node(int nid) if (is_vmalloc_addr(zone->wait_table)) vfree(zone->wait_table); } - - /* - * Since there is no way to guarentee the address of pgdat/zone is not - * on stack of any kernel threads or used by other kernel objects - * without reference counting or other symchronizing method, do not - * reset node_data and free pgdat here. Just reset it to 0 and reuse - * the memory when the node is online again. - */ - memset(pgdat, 0, sizeof(*pgdat)); } EXPORT_SYMBOL(try_offline_node); diff --git a/mm/mlock.c b/mm/mlock.c index 73cf0987088c..8a54cd214925 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -26,10 +26,10 @@ int can_do_mlock(void) { - if (capable(CAP_IPC_LOCK)) - return 1; if (rlimit(RLIMIT_MEMLOCK) != 0) return 1; + if (capable(CAP_IPC_LOCK)) + return 1; return 0; } EXPORT_SYMBOL(can_do_mlock); diff --git a/mm/mmap.c b/mm/mmap.c index da9990acc08b..9ec50a368634 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -774,10 +774,8 @@ again: remove_next = 1 + (end > next->vm_end); importer->anon_vma = exporter->anon_vma; error = anon_vma_clone(importer, exporter); - if (error) { - importer->anon_vma = NULL; + if (error) return error; - } } } diff --git a/mm/mprotect.c b/mm/mprotect.c index 44727811bf4c..88584838e704 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; + bool preserve_write = prot_numa && pte_write(oldpte); /* * Avoid trapping faults against the zero or KSM @@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ptent = ptep_modify_prot_start(mm, addr, pte); ptent = pte_modify(ptent, newprot); + if (preserve_write) + ptent = pte_mkwrite(ptent); /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && diff --git a/mm/mremap.c b/mm/mremap.c index 57dadc025c64..2dc44b1cb1df 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -286,8 +286,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, old_len = new_len; old_addr = new_addr; new_addr = -ENOMEM; - } else if (vma->vm_file && vma->vm_file->f_op->mremap) - vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + } else if (vma->vm_file && vma->vm_file->f_op->mremap) { + err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); + if (err < 0) { + move_page_tables(new_vma, new_addr, vma, old_addr, + moved_len, true); + return err; + } + } /* Conceal VM_ACCOUNT so old reservation is not undone */ if (vm_flags & VM_ACCOUNT) { diff --git a/mm/nommu.c b/mm/nommu.c index 7296360fc057..3fba2dc97c44 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -62,6 +62,7 @@ void *high_memory; EXPORT_SYMBOL(high_memory); struct page *mem_map; unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ @@ -1213,11 +1214,9 @@ static int do_mmap_private(struct vm_area_struct *vma, if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) { total = point; kdebug("try to alloc exact %lu pages", total); - base = alloc_pages_exact(len, GFP_KERNEL); - } else { - base = (void *)__get_free_pages(GFP_KERNEL, order); } + base = alloc_pages_exact(total << PAGE_SHIFT, GFP_KERNEL); if (!base) goto enomem; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 45e187b2d971..644bcb665773 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -857,8 +857,11 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * bw * elapsed + write_bandwidth * (period - elapsed) * write_bandwidth = --------------------------------------------------- * period + * + * @written may have decreased due to account_page_redirty(). + * Avoid underflowing @bw calculation. */ - bw = written - bdi->written_stamp; + bw = written - min(written, bdi->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); @@ -922,7 +925,7 @@ static void global_update_bandwidth(unsigned long thresh, unsigned long now) { static DEFINE_SPINLOCK(dirty_lock); - static unsigned long update_time; + static unsigned long update_time = INITIAL_JIFFIES; /* * check locklessly first to optimize away locking for the most time diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a47f0b229a1a..40e29429e7b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2353,8 +2353,15 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (ac->high_zoneidx < ZONE_NORMAL) goto out; /* The OOM killer does not compensate for light reclaim */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS)) { + /* + * XXX: Page reclaim didn't yield anything, + * and the OOM killer can't be invoked, but + * keep looping as per should_alloc_retry(). + */ + *did_some_progress = 1; goto out; + } /* * GFP_THISNODE contains __GFP_NORETRY and we never hit this. * Sanity check for bare calls of __GFP_THISNODE, not real OOM. @@ -2366,7 +2373,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; } /* Exhausted what can be done so it's blamo time */ - if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) + if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) + || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) *did_some_progress = 1; out: oom_zonelist_unlock(ac->zonelist, gfp_mask); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 72f5ac381ab3..755a42c76eb4 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -103,6 +103,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) if (!is_migrate_isolate_page(buddy)) { __isolate_free_page(page, order); + kernel_map_pages(page, (1 << order), 1); set_page_refcounted(page); isolated_page = page; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 75c1f2878519..29f2f8b853ae 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -265,8 +265,15 @@ int walk_page_range(unsigned long start, unsigned long end, vma = vma->vm_next; err = walk_page_test(start, next, walk); - if (err > 0) + if (err > 0) { + /* + * positive return values are purely for + * controlling the pagewalk, so should never + * be passed to the callers. + */ + err = 0; continue; + } if (err < 0) break; } diff --git a/mm/rmap.c b/mm/rmap.c index 5e3e09081164..c161a14b6a8f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -287,6 +287,13 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return 0; enomem_failure: + /* + * dst->anon_vma is dropped here otherwise its degree can be incorrectly + * decremented in unlink_anon_vmas(). + * We can safely do this because callers of anon_vma_clone() don't care + * about dst->anon_vma if anon_vma_clone() failed. + */ + dst->anon_vma = NULL; unlink_anon_vmas(dst); return -ENOMEM; } diff --git a/mm/shmem.c b/mm/shmem.c index 2f17cb5f00a4..cf2d0ca010bc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1455,6 +1455,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode bool shmem_mapping(struct address_space *mapping) { + if (!mapping->host) + return false; + return mapping->host->i_sb->s_op == &shmem_ops; } diff --git a/mm/slub.c b/mm/slub.c index 6832c4eab104..82c473780c91 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2449,7 +2449,8 @@ redo: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); /* * Irqless object alloc/free algorithm used here depends on sequence @@ -2718,7 +2719,8 @@ redo: do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); - } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); /* Same with comment on barrier() in slab_alloc_node() */ barrier(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35b25e1340ca..49abccf29a29 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1418,6 +1418,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); + kasan_free_shadow(vm); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; |