From fce86ff5802bac3a7b19db171aa1949ef9caac31 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 13 May 2019 17:15:33 -0700 Subject: mm/huge_memory: fix vmf_insert_pfn_{pmd, pud}() crash, handle unaligned addresses Starting with c6f3c5ee40c1 ("mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()") vmf_insert_pfn_pmd() internally calls pmdp_set_access_flags(). That helper enforces a pmd aligned @address argument via VM_BUG_ON() assertion. Update the implementation to take a 'struct vm_fault' argument directly and apply the address alignment fixup internally to fix crash signatures like: kernel BUG at arch/x86/mm/pgtable.c:515! invalid opcode: 0000 [#1] SMP NOPTI CPU: 51 PID: 43713 Comm: java Tainted: G OE 4.19.35 #1 [..] RIP: 0010:pmdp_set_access_flags+0x48/0x50 [..] Call Trace: vmf_insert_pfn_pmd+0x198/0x350 dax_iomap_fault+0xe82/0x1190 ext4_dax_huge_fault+0x103/0x1f0 ? __switch_to_asm+0x40/0x70 __handle_mm_fault+0x3f6/0x1370 ? __switch_to_asm+0x34/0x70 ? __switch_to_asm+0x40/0x70 handle_mm_fault+0xda/0x200 __do_page_fault+0x249/0x4f0 do_page_fault+0x32/0x110 ? page_fault+0x8/0x30 page_fault+0x1e/0x30 Link: http://lkml.kernel.org/r/155741946350.372037.11148198430068238140.stgit@dwillia2-desk3.amr.corp.intel.com Fixes: c6f3c5ee40c1 ("mm/huge_memory.c: fix modifying of page protection by insert_pfn_pmd()") Signed-off-by: Dan Williams Reported-by: Piotr Balcer Tested-by: Yan Ma Tested-by: Pankaj Gupta Reviewed-by: Matthew Wilcox Reviewed-by: Jan Kara Reviewed-by: Aneesh Kumar K.V Cc: Chandan Rajendra Cc: Souptick Joarder Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dax/device.c | 6 ++---- fs/dax.c | 6 ++---- include/linux/huge_mm.h | 6 ++---- mm/huge_memory.c | 16 ++++++++++------ 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index e428468ab661..996d68ff992a 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -184,8 +184,7 @@ static vm_fault_t __dev_dax_pmd_fault(struct dev_dax *dev_dax, *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, *pfn, - vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pmd(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD @@ -235,8 +234,7 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, *pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - return vmf_insert_pfn_pud(vmf->vma, vmf->address, vmf->pud, *pfn, - vmf->flags & FAULT_FLAG_WRITE); + return vmf_insert_pfn_pud(vmf, *pfn, vmf->flags & FAULT_FLAG_WRITE); } #else static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, diff --git a/fs/dax.c b/fs/dax.c index e5e54da1715f..83009875308c 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1575,8 +1575,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, } trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); - result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, - write); + result = vmf_insert_pfn_pmd(vmf, pfn, write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: @@ -1686,8 +1685,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); #ifdef CONFIG_FS_DAX_PMD else if (order == PMD_ORDER) - ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, true); + ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); #endif else ret = VM_FAULT_FALLBACK; diff --git a/include/linux/huge_mm.h 
b/include/linux/huge_mm.h index 381e872bfde0..7cd5c150c21d 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -47,10 +47,8 @@ extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); -vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, bool write); -vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, pfn_t pfn, bool write); +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write); +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6a34b32d8ac..c314a362c167 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -793,11 +793,13 @@ out_unlock: pte_free(mm, pgtable); } -vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, bool write) +vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { + unsigned long addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; pgtable_t pgtable = NULL; + /* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that @@ -820,7 +822,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, pfn); - insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable); + insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); @@ -869,10 +871,12 @@ out_unlock: spin_unlock(ptl); } -vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, - pud_t *pud, pfn_t pfn, bool write) +vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { + unsigned long addr = vmf->address & PUD_MASK; + struct vm_area_struct *vma = vmf->vma; pgprot_t pgprot = vma->vm_page_prot; + /* * If we had pud_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that @@ -889,7 +893,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr, track_pfn_insert(vma, &pgprot, pfn); - insert_pfn_pud(vma, addr, pud, pfn, pgprot, write); + insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); -- cgit v1.2.3 From 2bf753e64b4a702e27ce26ff520c59563c62f96b Mon Sep 17 00:00:00 2001 From: Kai Shen Date: Mon, 13 May 2019 17:15:37 -0700 Subject: mm/hugetlb.c: don't put_page in lock of hugetlb_lock spinlock recursion happened when do LTP test: #!/bin/bash ./runltp -p -f hugetlb & ./runltp -p -f hugetlb & ./runltp -p -f hugetlb & ./runltp -p -f hugetlb & ./runltp -p -f hugetlb & The dtor returned by get_compound_page_dtor in __put_compound_page may be the function of free_huge_page which will lock the hugetlb_lock, so don't put_page in lock of hugetlb_lock. 
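The ordering rule behind the fix can be stated simply: never call put_page() (or anything else that may end up in the hugetlb destructor) while hugetlb_lock is still held. A minimal, hedged userspace analogue of that rule, with a pthread mutex standing in for hugetlb_lock and a stand-in for free_huge_page() (the names and the overcommit check here are purely illustrative, not the kernel code):

	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for hugetlb_lock */

	/* stands in for free_huge_page(): the destructor takes the pool lock itself */
	static void release_to_pool(void)
	{
		pthread_mutex_lock(&pool_lock);
		puts("page returned to the pool");
		pthread_mutex_unlock(&pool_lock);
	}

	int main(void)
	{
		pthread_mutex_lock(&pool_lock);
		/* ... decide the page is over the overcommit limit and unwanted ... */

		/*
		 * Calling release_to_pool() here, with pool_lock still held, would
		 * recurse on the lock -- the same shape as put_page() ->
		 * free_huge_page() -> spin_lock(&hugetlb_lock) in the report below.
		 */
		pthread_mutex_unlock(&pool_lock);	/* drop the lock first ... */
		release_to_pool();			/* ... then let the destructor run */
		return 0;
	}

The spinlock-recursion report below is exactly this pattern hitting hugetlb_lock.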
BUG: spinlock recursion on CPU#0, hugemmap05/1079 lock: hugetlb_lock+0x0/0x18, .magic: dead4ead, .owner: hugemmap05/1079, .owner_cpu: 0 Call trace: dump_backtrace+0x0/0x198 show_stack+0x24/0x30 dump_stack+0xa4/0xcc spin_dump+0x84/0xa8 do_raw_spin_lock+0xd0/0x108 _raw_spin_lock+0x20/0x30 free_huge_page+0x9c/0x260 __put_compound_page+0x44/0x50 __put_page+0x2c/0x60 alloc_surplus_huge_page.constprop.19+0xf0/0x140 hugetlb_acct_memory+0x104/0x378 hugetlb_reserve_pages+0xe0/0x250 hugetlbfs_file_mmap+0xc0/0x140 mmap_region+0x3e8/0x5b0 do_mmap+0x280/0x460 vm_mmap_pgoff+0xf4/0x128 ksys_mmap_pgoff+0xb4/0x258 __arm64_sys_mmap+0x34/0x48 el0_svc_common+0x78/0x130 el0_svc_handler+0x38/0x78 el0_svc+0x8/0xc Link: http://lkml.kernel.org/r/b8ade452-2d6b-0372-32c2-703644032b47@huawei.com Fixes: 9980d744a0 ("mm, hugetlb: get rid of surplus page accounting tricks") Signed-off-by: Kai Shen Signed-off-by: Feilong Lin Reported-by: Wang Wang Reviewed-by: Oscar Salvador Reviewed-by: Mike Kravetz Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 641cedfc8c0f..dffe5d9d03ae 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1574,8 +1574,9 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetPageHugeTemporary(page); + spin_unlock(&hugetlb_lock); put_page(page); - page = NULL; + return NULL; } else { h->surplus_huge_pages++; h->surplus_huge_pages_node[page_to_nid(page)]++; -- cgit v1.2.3 From a9e73998f9d705c94a8dca9687633adc0f24a19a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 13 May 2019 17:15:40 -0700 Subject: kernel/sys.c: prctl: fix false positive in validate_prctl_map() While validating new map we require the @start_data to be strictly less than @end_data, which is fine for regular applications (this is why this nit didn't trigger for that long). These members are set from executable loaders such as elf handers, still it is pretty valid to have a loadable data section with zero size in file, in such case the start_data is equal to end_data once kernel loader finishes. As a result when we're trying to restore such programs the procedure fails and the kernel returns -EINVAL. From the image dump of a program: | "mm_start_code": "0x400000", | "mm_end_code": "0x8f5fb4", | "mm_start_data": "0xf1bfb0", | "mm_end_data": "0xf1bfb0", Thus we need to change validate_prctl_map from strictly less to less or equal operator use. Link: http://lkml.kernel.org/r/20190408143554.GY1421@uranus.lan Fixes: f606b77f1a9e3 ("prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation") Signed-off-by: Cyrill Gorcunov Cc: Andrey Vagin Cc: Dmitry Safonov <0x7f454c46@gmail.com> Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sys.c b/kernel/sys.c index 12df0e5434b8..bdbfe8d37418 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1924,7 +1924,7 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) ((unsigned long)prctl_map->__m1 __op \ (unsigned long)prctl_map->__m2) ? 
0 : -EINVAL error = __prctl_check_order(start_code, <, end_code); - error |= __prctl_check_order(start_data, <, end_data); + error |= __prctl_check_order(start_data,<=, end_data); error |= __prctl_check_order(start_brk, <=, brk); error |= __prctl_check_order(arg_start, <=, arg_end); error |= __prctl_check_order(env_start, <=, env_end); -- cgit v1.2.3 From e602b26ce4758e0eb97252363d32bd294afea530 Mon Sep 17 00:00:00 2001 From: Sabyasachi Gupta Date: Mon, 13 May 2019 17:15:46 -0700 Subject: arch/sh/boards/mach-dreamcast/irq.c: Remove duplicate header Remove linux/irq.h, which is included more than once. Link: http://lkml.kernel.org/r/5c8682ef.1c69fb81.5a1ea.2e7f@mx.google.com Signed-off-by: Sabyasachi Gupta Acked-by: Souptick Joarder Cc: Yoshinori Sato Cc: Rich Felker Cc: Mukesh Ojha Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/boards/mach-dreamcast/irq.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/boards/mach-dreamcast/irq.c b/arch/sh/boards/mach-dreamcast/irq.c index a929f764ae04..cc06e4cdb4cd 100644 --- a/arch/sh/boards/mach-dreamcast/irq.c +++ b/arch/sh/boards/mach-dreamcast/irq.c @@ -10,7 +10,6 @@ */ #include #include -#include #include #include #include -- cgit v1.2.3 From 3fde8c60b398823afd6fde0bc0cdbff72ad01a3e Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Mon, 13 May 2019 17:15:49 -0700 Subject: MAINTAINERS: add Joseph as ocfs2 co-maintainer I have been contributing to and reviewing the ocfs2 filesystem in recent years and I'm willing to continue doing so. I volunteer as a co-maintainer for the ocfs2 filesystem. Link: http://lkml.kernel.org/r/f56d75b3-2be5-25c2-51f2-c3f5423d4f14@gmail.com Signed-off-by: Joseph Qi Acked-by: Andrew Morton Reviewed-by: Mark Fasheh Cc: piaojun Cc: "Gang He" Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index fb9f9d71f7a2..372e60e416f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11746,6 +11746,7 @@ F: include/linux/oprofile.h ORACLE CLUSTER FILESYSTEM 2 (OCFS2) M: Mark Fasheh M: Joel Becker +M: Joseph Qi L: ocfs2-devel@oss.oracle.com (moderated for non-subscribers) W: http://ocfs2.wiki.kernel.org S: Supported -- cgit v1.2.3 From 9dc2108d667da44c7b147b185b64e31c0a60f583 Mon Sep 17 00:00:00 2001 From: Phillip Potter Date: Mon, 13 May 2019 17:15:53 -0700 Subject: ocfs2: use common file type conversion Deduplicate the ocfs2 file type conversion implementation and remove the OCFS2_FT_* definitions - file systems that use the same file types as defined by POSIX do not need to define their own versions and can use the common helper functions declared in fs_types.h and implemented in fs_types.c. The common implementation can be found via bbe7449e2599 ("fs: common implementation of file type").
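To make the conversion concrete: the common helpers boil down to a mapping from the S_IFMT bits of the inode mode to the generic file-type and directory-entry type constants. A hedged, self-contained userspace sketch of the overall mode -> d_type mapping that the fs_umode_to_ftype()/fs_ftype_to_dtype() pair provides (this is only an illustration, not the kernel implementation):

	#include <stdio.h>
	#include <dirent.h>
	#include <sys/stat.h>

	static unsigned char mode_to_dtype(mode_t mode)
	{
		switch (mode & S_IFMT) {
		case S_IFREG:  return DT_REG;
		case S_IFDIR:  return DT_DIR;
		case S_IFCHR:  return DT_CHR;
		case S_IFBLK:  return DT_BLK;
		case S_IFIFO:  return DT_FIFO;
		case S_IFSOCK: return DT_SOCK;
		case S_IFLNK:  return DT_LNK;
		default:       return DT_UNKNOWN;
		}
	}

	int main(void)
	{
		printf("regular file -> d_type %u\n", (unsigned)mode_to_dtype(S_IFREG | 0644));
		printf("directory    -> d_type %u\n", (unsigned)mode_to_dtype(S_IFDIR | 0755));
		return 0;
	}

The switch is safe for ocfs2 because its on-disk OCFS2_FT_* values (0 through 7, removed below) coincide with the generic FT_* values used by the common helpers.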
Link: http://lkml.kernel.org/r/20190326213919.GA20878@pathfinder Signed-off-by: Amir Goldstein Signed-off-by: Phillip Potter Reviewed-by: Jan Kara Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 20 ++++---------------- fs/ocfs2/ocfs2_fs.h | 28 +--------------------------- 2 files changed, 5 insertions(+), 43 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index c121abbdfc7d..85f21caaa6ec 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -69,10 +69,6 @@ #define NAMEI_RA_BLOCKS 4 #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) -static unsigned char ocfs2_filetype_table[] = { - DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK -}; - static int ocfs2_do_extend_dir(struct super_block *sb, handle_t *handle, struct inode *dir, @@ -1718,7 +1714,7 @@ int __ocfs2_add_entry(handle_t *handle, de->rec_len = cpu_to_le16(OCFS2_DIR_REC_LEN(de->name_len)); de = de1; } - de->file_type = OCFS2_FT_UNKNOWN; + de->file_type = FT_UNKNOWN; if (blkno) { de->inode = cpu_to_le64(blkno); ocfs2_set_de_type(de, inode->i_mode); @@ -1803,13 +1799,9 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, } offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { - unsigned char d_type = DT_UNKNOWN; - - if (de->file_type < OCFS2_FT_MAX) - d_type = ocfs2_filetype_table[de->file_type]; - if (!dir_emit(ctx, de->name, de->name_len, - le64_to_cpu(de->inode), d_type)) + le64_to_cpu(de->inode), + fs_ftype_to_dtype(de->file_type))) goto out; } ctx->pos += le16_to_cpu(de->rec_len); @@ -1900,14 +1892,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, break; } if (le64_to_cpu(de->inode)) { - unsigned char d_type = DT_UNKNOWN; - - if (de->file_type < OCFS2_FT_MAX) - d_type = ocfs2_filetype_table[de->file_type]; if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), - d_type)) { + fs_ftype_to_dtype(de->file_type))) { brelse(bh); return 0; } diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 7071ad0dec90..b86bf5e74348 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -391,21 +391,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { #define OCFS2_HB_LOCAL "heartbeat=local" #define OCFS2_HB_GLOBAL "heartbeat=global" -/* - * OCFS2 directory file types. Only the low 3 bits are used. The - * other bits are reserved for now. 
- */ -#define OCFS2_FT_UNKNOWN 0 -#define OCFS2_FT_REG_FILE 1 -#define OCFS2_FT_DIR 2 -#define OCFS2_FT_CHRDEV 3 -#define OCFS2_FT_BLKDEV 4 -#define OCFS2_FT_FIFO 5 -#define OCFS2_FT_SOCK 6 -#define OCFS2_FT_SYMLINK 7 - -#define OCFS2_FT_MAX 8 - /* * OCFS2_DIR_PAD defines the directory entries boundaries * @@ -424,17 +409,6 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { #define OCFS2_LINKS_HI_SHIFT 16 #define OCFS2_DX_ENTRIES_MAX (0xffffffffU) -#define S_SHIFT 12 -static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { - [S_IFREG >> S_SHIFT] = OCFS2_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = OCFS2_FT_DIR, - [S_IFCHR >> S_SHIFT] = OCFS2_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = OCFS2_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = OCFS2_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = OCFS2_FT_SOCK, - [S_IFLNK >> S_SHIFT] = OCFS2_FT_SYMLINK, -}; - /* * Convenience casts @@ -1629,7 +1603,7 @@ static inline int ocfs2_sprintf_system_inode_name(char *buf, int len, static inline void ocfs2_set_de_type(struct ocfs2_dir_entry *de, umode_t mode) { - de->file_type = ocfs2_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; + de->file_type = fs_umode_to_ftype(mode); } static inline int ocfs2_gd_is_discontig(struct ocfs2_group_desc *gd) -- cgit v1.2.3 From e091eab028f9253eac5c04f9141bbc9d170acab3 Mon Sep 17 00:00:00 2001 From: Shuning Zhang Date: Mon, 13 May 2019 17:15:56 -0700 Subject: ocfs2: fix ocfs2 read inode data panic in ocfs2_iget In some cases, ocfs2_iget() reads the data of inode, which has been deleted for some reason. That will make the system panic. So We should judge whether this inode has been deleted, and tell the caller that the inode is a bad inode. For example, the ocfs2 is used as the backed of nfs, and the client is nfsv3. This issue can be reproduced by the following steps. on the nfs server side, ..../patha/pathb Step 1: The process A was scheduled before calling the function fh_verify. Step 2: The process B is removing the 'pathb', and just completed the call to function dput. Then the dentry of 'pathb' has been deleted from the dcache, and all ancestors have been deleted also. The relationship of dentry and inode was deleted through the function hlist_del_init. The following is the call stack. dentry_iput->hlist_del_init(&dentry->d_u.d_alias) At this time, the inode is still in the dcache. Step 3: The process A call the function ocfs2_get_dentry, which get the inode from dcache. Then the refcount of inode is 1. The following is the call stack. nfsd3_proc_getacl->fh_verify->exportfs_decode_fh->fh_to_dentry(ocfs2_get_dentry) Step 4: Dirty pages are flushed by bdi threads. So the inode of 'patha' is evicted, and this directory was deleted. But the inode of 'pathb' can't be evicted, because the refcount of the inode was 1. Step 5: The process A keep running, and call the function reconnect_path(in exportfs_decode_fh), which call function ocfs2_get_parent of ocfs2. Get the block number of parent directory(patha) by the name of ... Then read the data from disk by the block number. But this inode has been deleted, so the system panic. Process A Process B 1. in nfsd3_proc_getacl | 2. | dput 3. fh_to_dentry(ocfs2_get_dentry) | 4. bdi flush dirty cache | 5. 
ocfs2_iget | [283465.542049] OCFS2: ERROR (device sdp): ocfs2_validate_inode_block: Invalid dinode #580640: OCFS2_VALID_FL not set [283465.545490] Kernel panic - not syncing: OCFS2: (device sdp): panic forced after error [283465.546889] CPU: 5 PID: 12416 Comm: nfsd Tainted: G W 4.1.12-124.18.6.el6uek.bug28762940v3.x86_64 #2 [283465.548382] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 09/21/2015 [283465.549657] 0000000000000000 ffff8800a56fb7b8 ffffffff816e839c ffffffffa0514758 [283465.550392] 000000000008dc20 ffff8800a56fb838 ffffffff816e62d3 0000000000000008 [283465.551056] ffff880000000010 ffff8800a56fb848 ffff8800a56fb7e8 ffff88005df9f000 [283465.551710] Call Trace: [283465.552516] [] dump_stack+0x63/0x81 [283465.553291] [] panic+0xcb/0x21b [283465.554037] [] ocfs2_handle_error+0xf0/0xf0 [ocfs2] [283465.554882] [] __ocfs2_error+0x67/0x70 [ocfs2] [283465.555768] [] ocfs2_validate_inode_block+0x229/0x230 [ocfs2] [283465.556683] [] ocfs2_read_blocks+0x46c/0x7b0 [ocfs2] [283465.557408] [] ? ocfs2_inode_cache_io_unlock+0x20/0x20 [ocfs2] [283465.557973] [] ocfs2_read_inode_block_full+0x3b/0x60 [ocfs2] [283465.558525] [] ocfs2_iget+0x4aa/0x880 [ocfs2] [283465.559082] [] ocfs2_get_parent+0x9e/0x220 [ocfs2] [283465.559622] [] reconnect_path+0xb5/0x300 [283465.560156] [] exportfs_decode_fh+0xf6/0x2b0 [283465.560708] [] ? nfsd_proc_getattr+0xa0/0xa0 [nfsd] [283465.561262] [] ? prepare_creds+0x26/0x110 [283465.561932] [] fh_verify+0x350/0x660 [nfsd] [283465.562862] [] ? nfsd_cache_lookup+0x44/0x630 [nfsd] [283465.563697] [] nfsd3_proc_getattr+0x69/0xf0 [nfsd] [283465.564510] [] nfsd_dispatch+0xe0/0x290 [nfsd] [283465.565358] [] ? svc_tcp_adjust_wspace+0x12/0x30 [sunrpc] [283465.566272] [] svc_process_common+0x412/0x6a0 [sunrpc] [283465.567155] [] svc_process+0x123/0x210 [sunrpc] [283465.568020] [] nfsd+0xff/0x170 [nfsd] [283465.568962] [] ? nfsd_destroy+0x80/0x80 [nfsd] [283465.570112] [] kthread+0xcb/0xf0 [283465.571099] [] ? kthread_create_on_node+0x180/0x180 [283465.572114] [] ret_from_fork+0x58/0x90 [283465.573156] [] ? 
kthread_create_on_node+0x180/0x180 Link: http://lkml.kernel.org/r/1554185919-3010-1-git-send-email-sunny.s.zhang@oracle.com Signed-off-by: Shuning Zhang Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: piaojun Cc: "Gang He" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/export.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 4bf8d5854b27..af2888d23de3 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -148,16 +148,24 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) u64 blkno; struct dentry *parent; struct inode *dir = d_inode(child); + int set; trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); + status = ocfs2_nfs_sync_lock(OCFS2_SB(dir->i_sb), 1); + if (status < 0) { + mlog(ML_ERROR, "getting nfs sync lock(EX) failed %d\n", status); + parent = ERR_PTR(status); + goto bail; + } + status = ocfs2_inode_lock(dir, NULL, 0); if (status < 0) { if (status != -ENOENT) mlog_errno(status); parent = ERR_PTR(status); - goto bail; + goto unlock_nfs_sync; } status = ocfs2_lookup_ino_from_name(dir, "..", 2, &blkno); @@ -166,11 +174,31 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) goto bail_unlock; } + status = ocfs2_test_inode_bit(OCFS2_SB(dir->i_sb), blkno, &set); + if (status < 0) { + if (status == -EINVAL) { + status = -ESTALE; + } else + mlog(ML_ERROR, "test inode bit failed %d\n", status); + parent = ERR_PTR(status); + goto bail_unlock; + } + + trace_ocfs2_get_dentry_test_bit(status, set); + if (!set) { + status = -ESTALE; + parent = ERR_PTR(status); + goto bail_unlock; + } + parent = d_obtain_alias(ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0, 0)); bail_unlock: ocfs2_inode_unlock(dir, 0); +unlock_nfs_sync: + ocfs2_nfs_sync_unlock(OCFS2_SB(dir->i_sb), 1); + bail: trace_ocfs2_get_parent_end(parent); -- cgit v1.2.3 From a16b53849913e742d086bb2b6f5e069ea2850c56 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Mon, 13 May 2019 17:15:59 -0700 Subject: list: add function list_rotate_to_front() Patch series "mm: Use slab_list list_head instead of lru", v5. Currently the slab allocators (ab)use the struct page 'lru' list_head. We have a list head for slab allocators to use, 'slab_list'. During v2 it was noted by Christoph that the SLOB allocator was reaching into a list_head, this version adds 2 patches to the front of the set to fix that. Clean up all three allocators by using the 'slab_list' list_head instead of overloading the 'lru' list_head. This patch (of 7): Currently if we wish to rotate a list until a specific item is at the front of the list we can call list_move_tail(head, list). Note that the arguments are the reverse way to the usual use of list_move_tail(list, head). This is a hack, it depends on the developer knowing how the list_head operates internally which violates the layer of abstraction offered by the list_head. Also, it is not intuitive so the next developer to come along must study list.h in order to fully understand what is meant by the call, while this is 'good for' the developer it makes reading the code harder. We should have an function appropriately named that does this if there are users for it intree. By grep'ing the tree for list_move_tail() and list_tail() and attempting to guess the argument order from the names it seems there is only one place currently in the tree that does this - the slob allocatator. 
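To see what the helper introduced by this patch (list_rotate_to_front(), defined in the diff below) actually does, here is a hedged, self-contained userspace re-implementation of just the list_head primitives involved, modelled on include/linux/list.h but not the kernel code itself; rotating to 'c' turns the order a-b-c-d into c-d-a-b:

	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };

	static void INIT_LIST_HEAD(struct list_head *head)
	{
		head->next = head;
		head->prev = head;
	}

	static void __list_add(struct list_head *new,
			       struct list_head *prev, struct list_head *next)
	{
		next->prev = new;
		new->next = next;
		new->prev = prev;
		prev->next = new;
	}

	static void list_add_tail(struct list_head *new, struct list_head *head)
	{
		__list_add(new, head->prev, head);
	}

	static void __list_del(struct list_head *prev, struct list_head *next)
	{
		next->prev = prev;
		prev->next = next;
	}

	/* delete @list from wherever it is and queue it just before @head */
	static void list_move_tail(struct list_head *list, struct list_head *head)
	{
		__list_del(list->prev, list->next);
		list_add_tail(list, head);
	}

	/* the new helper: rotate so that @list becomes the front of the list */
	static void list_rotate_to_front(struct list_head *list, struct list_head *head)
	{
		list_move_tail(head, list);
	}

	struct item {
		struct list_head node;	/* first member, so a cast recovers the item */
		char name;
	};

	int main(void)
	{
		struct list_head head;
		struct item a = { .name = 'a' }, b = { .name = 'b' },
			    c = { .name = 'c' }, d = { .name = 'd' };
		struct list_head *pos;

		INIT_LIST_HEAD(&head);
		list_add_tail(&a.node, &head);
		list_add_tail(&b.node, &head);
		list_add_tail(&c.node, &head);
		list_add_tail(&d.node, &head);

		list_rotate_to_front(&c.node, &head);

		for (pos = head.next; pos != &head; pos = pos->next)
			printf("%c ", ((struct item *)pos)->name);
		printf("\n");	/* prints: c d a b */
		return 0;
	}

Under the hood the rotation is still list_move_tail(head, list), exactly as the slob call site used to open-code it; the gain is purely that the intent now has a name.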
Add function list_rotate_to_front() to rotate a list until the specified item is at the front of the list. Link: http://lkml.kernel.org/r/20190402230545.2929-2-tobin@kernel.org Signed-off-by: Tobin C. Harding Reviewed-by: Christoph Lameter Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/list.h b/include/linux/list.h index 58aa3adf94e6..9e9a6403dbe4 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -270,6 +270,24 @@ static inline void list_rotate_left(struct list_head *head) } } +/** + * list_rotate_to_front() - Rotate list to specific item. + * @list: The desired new front of the list. + * @head: The head of the list. + * + * Rotates list so that @list becomes the new front of the list. + */ +static inline void list_rotate_to_front(struct list_head *list, + struct list_head *head) +{ + /* + * Deletes the list head from the list denoted by @head and + * places it as the tail of @list, this effectively rotates the + * list so that @list is at the front. + */ + list_move_tail(head, list); +} + /** * list_is_singular - tests whether a list has just one entry. * @head: the list to test. -- cgit v1.2.3 From 130e8e09e2675bbc484581825fe29e2e5a6b8b0a Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Mon, 13 May 2019 17:16:03 -0700 Subject: slob: respect list_head abstraction layer Currently we reach inside the list_head. This is a violation of the layer of abstraction provided by the list_head. It makes the code fragile. More importantly it makes the code wicked hard to understand. The code reaches into the list_head structure to counteract the fact that the list _may_ have been changed during slob_page_alloc(). Instead of this we can add a return parameter to slob_page_alloc() to signal that the list was modified (list_del() called with page->lru to remove page from the freelist). This code is concerned with an optimisation that counters the tendency for first fit allocation algorithm to fragment memory into many small chunks at the front of the memory pool. Since the page is only removed from the list when an allocation uses _all_ the remaining memory in the page then in this special case fragmentation does not occur and we therefore do not need the optimisation. Add a return parameter to slob_page_alloc() to signal that the allocation used up the whole page and that the page was removed from the free list. After calling slob_page_alloc() check the return value just added and only attempt optimisation if the page is still on the list. Use list_head API instead of reaching into the list_head structure to check if sp is at the front of the list. Link: http://lkml.kernel.org/r/20190402230545.2929-3-tobin@kernel.org Signed-off-by: Tobin C. Harding Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Cc: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 51 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/mm/slob.c b/mm/slob.c index 307c2c9feb44..07356e9feaaa 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -213,13 +213,26 @@ static void slob_free_pages(void *b, int order) } /* - * Allocate a slob block within a given slob_page sp. 
+ * slob_page_alloc() - Allocate a slob block within a given slob_page sp. + * @sp: Page to look in. + * @size: Size of the allocation. + * @align: Allocation alignment. + * @page_removed_from_list: Return parameter. + * + * Tries to find a chunk of memory at least @size bytes big within @page. + * + * Return: Pointer to memory if allocated, %NULL otherwise. If the + * allocation fills up @page then the page is removed from the + * freelist, in this case @page_removed_from_list will be set to + * true (set to false otherwise). */ -static void *slob_page_alloc(struct page *sp, size_t size, int align) +static void *slob_page_alloc(struct page *sp, size_t size, int align, + bool *page_removed_from_list) { slob_t *prev, *cur, *aligned = NULL; int delta = 0, units = SLOB_UNITS(size); + *page_removed_from_list = false; for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { slobidx_t avail = slob_units(cur); @@ -254,8 +267,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align) } sp->units -= units; - if (!sp->units) + if (!sp->units) { clear_slob_page_free(sp); + *page_removed_from_list = true; + } return cur; } if (slob_last(cur)) @@ -269,10 +284,10 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align) static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) { struct page *sp; - struct list_head *prev; struct list_head *slob_list; slob_t *b = NULL; unsigned long flags; + bool _unused; if (size < SLOB_BREAK1) slob_list = &free_slob_small; @@ -284,6 +299,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ list_for_each_entry(sp, slob_list, lru) { + bool page_removed_from_list = false; #ifdef CONFIG_NUMA /* * If there's a node specification, search for a partial @@ -296,18 +312,25 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) if (sp->units < SLOB_UNITS(size)) continue; - /* Attempt to alloc */ - prev = sp->lru.prev; - b = slob_page_alloc(sp, size, align); + b = slob_page_alloc(sp, size, align, &page_removed_from_list); if (!b) continue; - /* Improve fragment distribution and reduce our average - * search time by starting our next search here. (see - * Knuth vol 1, sec 2.5, pg 449) */ - if (prev != slob_list->prev && - slob_list->next != prev->next) - list_move_tail(slob_list, prev->next); + /* + * If slob_page_alloc() removed sp from the list then we + * cannot call list functions on sp. If so allocation + * did not fragment the page anyway so optimisation is + * unnecessary. + */ + if (!page_removed_from_list) { + /* + * Improve fragment distribution and reduce our average + * search time by starting our next search here. (see + * Knuth vol 1, sec 2.5, pg 449) + */ + if (!list_is_first(&sp->lru, slob_list)) + list_rotate_to_front(&sp->lru, slob_list); + } break; } spin_unlock_irqrestore(&slob_lock, flags); @@ -326,7 +349,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) INIT_LIST_HEAD(&sp->lru); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); - b = slob_page_alloc(sp, size, align); + b = slob_page_alloc(sp, size, align, &_unused); BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } -- cgit v1.2.3 From adab7b68189d14504e9f690ee7eb7e98af68301b Mon Sep 17 00:00:00 2001 From: "Tobin C. 
Harding" Date: Mon, 13 May 2019 17:16:06 -0700 Subject: slob: use slab_list instead of lru Currently we use the page->lru list for maintaining lists of slabs. We have a list_head in the page structure (slab_list) that can be used for this purpose. Doing so makes the code cleaner since we are not overloading the lru list. The slab_list is part of a union within the page struct (included here stripped down): union { struct { /* Page cache and anonymous pages */ struct list_head lru; ... }; struct { dma_addr_t dma_addr; }; struct { /* slab, slob and slub */ union { struct list_head slab_list; struct { /* Partial pages */ struct page *next; int pages; /* Nr of pages left */ int pobjects; /* Approximate count */ }; }; ... Here we see that slab_list and lru are the same bits. We can verify that this change is safe to do by examining the object file produced from slob.c before and after this patch is applied. Steps taken to verify: 1. checkout current tip of Linus' tree commit a667cb7a94d4 ("Merge branch 'akpm' (patches from Andrew)") 2. configure and build (select SLOB allocator) CONFIG_SLOB=y CONFIG_SLAB_MERGE_DEFAULT=y 3. dissasemble object file `objdump -dr mm/slub.o > before.s 4. apply patch 5. build 6. dissasemble object file `objdump -dr mm/slub.o > after.s 7. diff before.s after.s Use slab_list list_head instead of the lru list_head for maintaining lists of slabs. Link: http://lkml.kernel.org/r/20190402230545.2929-4-tobin@kernel.org Signed-off-by: Tobin C. Harding Reviewed-by: Roman Gushchin Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/slob.c b/mm/slob.c index 07356e9feaaa..84aefd9b91ee 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -112,13 +112,13 @@ static inline int slob_page_free(struct page *sp) static void set_slob_page_free(struct page *sp, struct list_head *list) { - list_add(&sp->lru, list); + list_add(&sp->slab_list, list); __SetPageSlobFree(sp); } static inline void clear_slob_page_free(struct page *sp) { - list_del(&sp->lru); + list_del(&sp->slab_list); __ClearPageSlobFree(sp); } @@ -298,7 +298,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); /* Iterate through each partially free page, try to find room */ - list_for_each_entry(sp, slob_list, lru) { + list_for_each_entry(sp, slob_list, slab_list) { bool page_removed_from_list = false; #ifdef CONFIG_NUMA /* @@ -328,8 +328,8 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) * search time by starting our next search here. (see * Knuth vol 1, sec 2.5, pg 449) */ - if (!list_is_first(&sp->lru, slob_list)) - list_rotate_to_front(&sp->lru, slob_list); + if (!list_is_first(&sp->slab_list, slob_list)) + list_rotate_to_front(&sp->slab_list, slob_list); } break; } @@ -346,7 +346,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) spin_lock_irqsave(&slob_lock, flags); sp->units = SLOB_UNITS(PAGE_SIZE); sp->freelist = b; - INIT_LIST_HEAD(&sp->lru); + INIT_LIST_HEAD(&sp->slab_list); set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); set_slob_page_free(sp, slob_list); b = slob_page_alloc(sp, size, align, &_unused); -- cgit v1.2.3 From 6dfd1b653c49df2dad1dcfe063a196e940e02dbd Mon Sep 17 00:00:00 2001 From: "Tobin C. 
Harding" Date: Mon, 13 May 2019 17:16:09 -0700 Subject: slub: add comments to endif pre-processor macros SLUB allocator makes heavy use of ifdef/endif pre-processor macros. The pairing of these statements is at times hard to follow e.g. if the pair are further than a screen apart or if there are nested pairs. We can reduce cognitive load by adding a comment to the endif statement of form #ifdef CONFIG_FOO ... #endif /* CONFIG_FOO */ Add comments to endif pre-processor macros if ifdef/endif pair is not immediately apparent. Link: http://lkml.kernel.org/r/20190402230545.2929-5-tobin@kernel.org Signed-off-by: Tobin C. Harding Acked-by: Christoph Lameter Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 6b28cd2b5a58..43935b4f2b9e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1942,7 +1942,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, } } } while (read_mems_allowed_retry(cpuset_mems_cookie)); -#endif +#endif /* CONFIG_NUMA */ return NULL; } @@ -2240,7 +2240,7 @@ static void unfreeze_partials(struct kmem_cache *s, discard_slab(s, page); stat(s, FREE_SLAB); } -#endif +#endif /* CONFIG_SLUB_CPU_PARTIAL */ } /* @@ -2299,7 +2299,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) local_irq_restore(flags); } preempt_enable(); -#endif +#endif /* CONFIG_SLUB_CPU_PARTIAL */ } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -2804,7 +2804,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s, } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif -#endif +#endif /* CONFIG_NUMA */ /* * Slow path handling. This may still be called frequently since objects @@ -3839,7 +3839,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return ret; } EXPORT_SYMBOL(__kmalloc_node); -#endif +#endif /* CONFIG_NUMA */ #ifdef CONFIG_HARDENED_USERCOPY /* @@ -4057,7 +4057,7 @@ void __kmemcg_cache_deactivate(struct kmem_cache *s) */ slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu); } -#endif +#endif /* CONFIG_MEMCG */ static int slab_mem_going_offline_callback(void *arg) { @@ -4690,7 +4690,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf, "No data\n"); return len; } -#endif +#endif /* CONFIG_SLUB_DEBUG */ #ifdef SLUB_RESILIENCY_TEST static void __init resiliency_test(void) @@ -4750,7 +4750,7 @@ static void __init resiliency_test(void) #ifdef CONFIG_SYSFS static void resiliency_test(void) {}; #endif -#endif +#endif /* SLUB_RESILIENCY_TEST */ #ifdef CONFIG_SYSFS enum slab_stat_type { @@ -5407,7 +5407,7 @@ STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); -#endif +#endif /* CONFIG_SLUB_STATS */ static struct attribute *slab_attrs[] = { &slab_size_attr.attr, @@ -5608,7 +5608,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s) if (buffer) free_page((unsigned long)buffer); -#endif +#endif /* CONFIG_MEMCG */ } static void kmem_cache_release(struct kobject *k) -- cgit v1.2.3 From 916ac0527837aa0be46d82804f93dd46f03aaedc Mon Sep 17 00:00:00 2001 From: "Tobin C. 
Harding" Date: Mon, 13 May 2019 17:16:12 -0700 Subject: slub: use slab_list instead of lru Currently we use the page->lru list for maintaining lists of slabs. We have a list in the page structure (slab_list) that can be used for this purpose. Doing so makes the code cleaner since we are not overloading the lru list. Use the slab_list instead of the lru list for maintaining lists of slabs. Link: http://lkml.kernel.org/r/20190402230545.2929-6-tobin@kernel.org Signed-off-by: Tobin C. Harding Acked-by: Christoph Lameter Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 43935b4f2b9e..ce6917b7451d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1014,7 +1014,7 @@ static void add_full(struct kmem_cache *s, return; lockdep_assert_held(&n->list_lock); - list_add(&page->lru, &n->full); + list_add(&page->slab_list, &n->full); } static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) @@ -1023,7 +1023,7 @@ static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct return; lockdep_assert_held(&n->list_lock); - list_del(&page->lru); + list_del(&page->slab_list); } /* Tracking of the number of slabs for debugging purposes */ @@ -1764,9 +1764,9 @@ __add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) - list_add_tail(&page->lru, &n->partial); + list_add_tail(&page->slab_list, &n->partial); else - list_add(&page->lru, &n->partial); + list_add(&page->slab_list, &n->partial); } static inline void add_partial(struct kmem_cache_node *n, @@ -1780,7 +1780,7 @@ static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { lockdep_assert_held(&n->list_lock); - list_del(&page->lru); + list_del(&page->slab_list); n->nr_partial--; } @@ -1854,7 +1854,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, return NULL; spin_lock(&n->list_lock); - list_for_each_entry_safe(page, page2, &n->partial, lru) { + list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; if (!pfmemalloc_match(page, flags)) @@ -2398,7 +2398,7 @@ static unsigned long count_partial(struct kmem_cache_node *n, struct page *page; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) + list_for_each_entry(page, &n->partial, slab_list) x += get_count(page); spin_unlock_irqrestore(&n->list_lock, flags); return x; @@ -3696,10 +3696,10 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) BUG_ON(irqs_disabled()); spin_lock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &n->partial, lru) { + list_for_each_entry_safe(page, h, &n->partial, slab_list) { if (!page->inuse) { remove_partial(n, page); - list_add(&page->lru, &discard); + list_add(&page->slab_list, &discard); } else { list_slab_objects(s, page, "Objects remaining in %s on __kmem_cache_shutdown()"); @@ -3707,7 +3707,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) } spin_unlock_irq(&n->list_lock); - list_for_each_entry_safe(page, h, &discard, lru) + list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); } @@ -3987,7 +3987,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) * Note that concurrent frees may occur while we hold the 
* list_lock. page->inuse here is the upper limit. */ - list_for_each_entry_safe(page, t, &n->partial, lru) { + list_for_each_entry_safe(page, t, &n->partial, slab_list) { int free = page->objects - page->inuse; /* Do not reread page->inuse */ @@ -3997,10 +3997,10 @@ int __kmem_cache_shrink(struct kmem_cache *s) BUG_ON(free <= 0); if (free == page->objects) { - list_move(&page->lru, &discard); + list_move(&page->slab_list, &discard); n->nr_partial--; } else if (free <= SHRINK_PROMOTE_MAX) - list_move(&page->lru, promote + free - 1); + list_move(&page->slab_list, promote + free - 1); } /* @@ -4013,7 +4013,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, &discard, lru) + list_for_each_entry_safe(page, t, &discard, slab_list) discard_slab(s, page); if (slabs_node(s, node)) @@ -4205,11 +4205,11 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) for_each_kmem_cache_node(s, node, n) { struct page *p; - list_for_each_entry(p, &n->partial, lru) + list_for_each_entry(p, &n->partial, slab_list) p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) + list_for_each_entry(p, &n->full, slab_list) p->slab_cache = s; #endif } @@ -4426,7 +4426,7 @@ static int validate_slab_node(struct kmem_cache *s, spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) { + list_for_each_entry(page, &n->partial, slab_list) { validate_slab_slab(s, page, map); count++; } @@ -4437,7 +4437,7 @@ static int validate_slab_node(struct kmem_cache *s, if (!(s->flags & SLAB_STORE_USER)) goto out; - list_for_each_entry(page, &n->full, lru) { + list_for_each_entry(page, &n->full, slab_list) { validate_slab_slab(s, page, map); count++; } @@ -4633,9 +4633,9 @@ static int list_locations(struct kmem_cache *s, char *buf, continue; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->partial, lru) + list_for_each_entry(page, &n->partial, slab_list) process_slab(&t, s, page, alloc, map); - list_for_each_entry(page, &n->full, lru) + list_for_each_entry(page, &n->full, slab_list) process_slab(&t, s, page, alloc, map); spin_unlock_irqrestore(&n->list_lock, flags); } -- cgit v1.2.3 From 16cb0ec75b346ec4fce11c5ce40d68b173f4e2f4 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Mon, 13 May 2019 17:16:15 -0700 Subject: slab: use slab_list instead of lru Currently we use the page->lru list for maintaining lists of slabs. We have a list in the page structure (slab_list) that can be used for this purpose. Doing so makes the code cleaner since we are not overloading the lru list. Use the slab_list instead of the lru list for maintaining lists of slabs. Link: http://lkml.kernel.org/r/20190402230545.2929-7-tobin@kernel.org Signed-off-by: Tobin C. 
Harding Acked-by: Christoph Lameter Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 284ab737faee..e9eaa8fce231 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1674,8 +1674,8 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) { struct page *page, *n; - list_for_each_entry_safe(page, n, list, lru) { - list_del(&page->lru); + list_for_each_entry_safe(page, n, list, slab_list) { + list_del(&page->slab_list); slab_destroy(cachep, page); } } @@ -2231,8 +2231,8 @@ static int drain_freelist(struct kmem_cache *cache, goto out; } - page = list_entry(p, struct page, lru); - list_del(&page->lru); + page = list_entry(p, struct page, slab_list); + list_del(&page->slab_list); n->free_slabs--; n->total_slabs--; /* @@ -2691,13 +2691,13 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) if (!page) return; - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&page->slab_list); n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); n->total_slabs++; if (!page->active) { - list_add_tail(&page->lru, &(n->slabs_free)); + list_add_tail(&page->slab_list, &n->slabs_free); n->free_slabs++; } else fixup_slab_list(cachep, n, page, &list); @@ -2806,9 +2806,9 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, void **list) { /* move slabp to correct slabp list: */ - list_del(&page->lru); + list_del(&page->slab_list); if (page->active == cachep->num) { - list_add(&page->lru, &n->slabs_full); + list_add(&page->slab_list, &n->slabs_full); if (OBJFREELIST_SLAB(cachep)) { #if DEBUG /* Poisoning will be done without holding the lock */ @@ -2822,7 +2822,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, page->freelist = NULL; } } else - list_add(&page->lru, &n->slabs_partial); + list_add(&page->slab_list, &n->slabs_partial); } /* Try to find non-pfmemalloc slab if needed */ @@ -2845,20 +2845,20 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, } /* Move pfmemalloc slab to the end of list to speed up next search */ - list_del(&page->lru); + list_del(&page->slab_list); if (!page->active) { - list_add_tail(&page->lru, &n->slabs_free); + list_add_tail(&page->slab_list, &n->slabs_free); n->free_slabs++; } else - list_add_tail(&page->lru, &n->slabs_partial); + list_add_tail(&page->slab_list, &n->slabs_partial); - list_for_each_entry(page, &n->slabs_partial, lru) { + list_for_each_entry(page, &n->slabs_partial, slab_list) { if (!PageSlabPfmemalloc(page)) return page; } n->free_touched = 1; - list_for_each_entry(page, &n->slabs_free, lru) { + list_for_each_entry(page, &n->slabs_free, slab_list) { if (!PageSlabPfmemalloc(page)) { n->free_slabs--; return page; @@ -2873,11 +2873,12 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) struct page *page; assert_spin_locked(&n->list_lock); - page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); + page = list_first_entry_or_null(&n->slabs_partial, struct page, + slab_list); if (!page) { n->free_touched = 1; page = list_first_entry_or_null(&n->slabs_free, struct page, - lru); + slab_list); if (page) n->free_slabs--; } @@ -3378,29 +3379,29 @@ static void free_block(struct kmem_cache *cachep, void **objpp, objp = objpp[i]; page = 
virt_to_head_page(objp); - list_del(&page->lru); + list_del(&page->slab_list); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ if (page->active == 0) { - list_add(&page->lru, &n->slabs_free); + list_add(&page->slab_list, &n->slabs_free); n->free_slabs++; } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. */ - list_add_tail(&page->lru, &n->slabs_partial); + list_add_tail(&page->slab_list, &n->slabs_partial); } } while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { n->free_objects -= cachep->num; - page = list_last_entry(&n->slabs_free, struct page, lru); - list_move(&page->lru, list); + page = list_last_entry(&n->slabs_free, struct page, slab_list); + list_move(&page->slab_list, list); n->free_slabs--; n->total_slabs--; } @@ -3438,7 +3439,7 @@ free_done: int i = 0; struct page *page; - list_for_each_entry(page, &n->slabs_free, lru) { + list_for_each_entry(page, &n->slabs_free, slab_list) { BUG_ON(page->active); i++; @@ -4302,9 +4303,9 @@ static int leaks_show(struct seq_file *m, void *p) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(page, &n->slabs_full, lru) + list_for_each_entry(page, &n->slabs_full, slab_list) handle_slab(x, cachep, page); - list_for_each_entry(page, &n->slabs_partial, lru) + list_for_each_entry(page, &n->slabs_partial, slab_list) handle_slab(x, cachep, page); spin_unlock_irq(&n->list_lock); } -- cgit v1.2.3 From 3e05617ceaa42838084daee209f9c4965bf03379 Mon Sep 17 00:00:00 2001 From: "Tobin C. Harding" Date: Mon, 13 May 2019 17:16:19 -0700 Subject: mm: remove stale comment from page struct We now use the slab_list list_head instead of the lru list_head. This comment has become stale. Remove stale comment from page struct slab_list list_head. Link: http://lkml.kernel.org/r/20190402230545.2929-8-tobin@kernel.org Signed-off-by: Tobin C. Harding Acked-by: Christoph Lameter Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Joonsoo Kim Cc: Matthew Wilcox Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4ef4bbe78a1d..e1f42a07d8f0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -103,7 +103,7 @@ struct page { }; struct { /* slab, slob and slub */ union { - struct list_head slab_list; /* uses lru */ + struct list_head slab_list; struct { /* Partial pages */ struct page *next; #ifdef CONFIG_64BIT -- cgit v1.2.3 From a4d3f8916c65b63e6b9457cdec74ceaaff7e6b12 Mon Sep 17 00:00:00 2001 From: Liu Xiang Date: Mon, 13 May 2019 17:16:22 -0700 Subject: slub: remove useless kmem_cache_debug() before remove_full() When CONFIG_SLUB_DEBUG is not enabled, remove_full() is empty. While CONFIG_SLUB_DEBUG is enabled, remove_full() can check s->flags by itself. So kmem_cache_debug() is useless and can be removed. 
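The reasoning is an instance of a common refactor: once the callee checks the relevant flag itself, an outer guard only duplicates the test. A hedged userspace sketch of the pattern (SLAB_STORE_USER is the flag remove_full() tests under CONFIG_SLUB_DEBUG in current kernels, but its value and the bodies here are purely illustrative):

	#include <stdio.h>

	#define SLAB_STORE_USER 0x00010000U	/* illustrative value only */

	struct kmem_cache { unsigned int flags; };

	/* the callee guards itself, as remove_full() does when CONFIG_SLUB_DEBUG=y */
	static void remove_full(struct kmem_cache *s)
	{
		if (!(s->flags & SLAB_STORE_USER))
			return;
		puts("removed page from the full list");
	}

	int main(void)
	{
		struct kmem_cache nodebug = { .flags = 0 };
		struct kmem_cache debug = { .flags = SLAB_STORE_USER };

		/* no kmem_cache_debug()-style guard needed at the call site */
		remove_full(&nodebug);	/* silently a no-op */
		remove_full(&debug);	/* does the work */
		return 0;
	}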
Link: http://lkml.kernel.org/r/1552577313-2830-1-git-send-email-liu.xiang6@zte.com.cn Signed-off-by: Liu Xiang Acked-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ce6917b7451d..af3b72b5de82 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2903,8 +2903,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * then add it. */ if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { - if (kmem_cache_debug(s)) - remove_full(s, n, page); + remove_full(s, n, page); add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } -- cgit v1.2.3 From 517f9f1ee5ed0a05d0f6f884f6d9b5c46ac5a810 Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Mon, 13 May 2019 17:16:25 -0700 Subject: mm/slab.c: remove unneed check in cpuup_canceled nc is a member of percpu allocation memory, and cannot be NULL. Link: http://lkml.kernel.org/r/1553159353-5056-1-git-send-email-lirongqing@baidu.com Signed-off-by: Li RongQing Reviewed-by: Andrew Morton Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index e9eaa8fce231..da4068a95307 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -990,10 +990,8 @@ static void cpuup_canceled(long cpu) /* cpu is dead; no one can alloc from it. */ nc = per_cpu_ptr(cachep->cpu_cache, cpu); - if (nc) { - free_block(cachep, nc->entry, nc->avail, node, &list); - nc->avail = 0; - } + free_block(cachep, nc->entry, nc->avail, node, &list); + nc->avail = 0; if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); -- cgit v1.2.3 From 632b2ef0c72d9ea0f76619ad04ca53aae121f8b9 Mon Sep 17 00:00:00 2001 From: Liu Xiang Date: Mon, 13 May 2019 17:16:28 -0700 Subject: mm/slub.c: update the comment about slab frozen Now frozen slab can only be on the per cpu partial list. Link: http://lkml.kernel.org/r/1554022325-11305-1-git-send-email-liu.xiang6@zte.com.cn Signed-off-by: Liu Xiang Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index af3b72b5de82..cd04dbd2b5d0 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -58,10 +58,11 @@ * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not - * on any list. The processor that froze the slab is the one who can - * perform list operations on the page. Other processors may put objects - * onto the freelist but the processor that froze the slab is the only - * one that can retrieve the objects from the page's freelist. + * on any list except per cpu partial list. The processor that froze the + * slab is the one who can perform list operations on the page. Other + * processors may put objects onto the freelist but the processor that + * froze the slab is the only one that can retrieve the objects from the + * page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. 
If taken then no new slabs may be added or -- cgit v1.2.3 From 745e10146c31b1c6ed3326286704ae251b17f663 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 13 May 2019 17:16:31 -0700 Subject: mm/slab.c: fix an infinite loop in leaks_show() "cat /proc/slab_allocators" could hang forever on SMP machines with kmemleak or object debugging enabled due to other CPUs running do_drain() will keep making kmemleak_object or debug_objects_cache dirty and unable to escape the first loop in leaks_show(), do { set_store_user_clean(cachep); drain_cpu_caches(cachep); ... } while (!is_store_user_clean(cachep)); For example, do_drain slabs_destroy slab_destroy kmem_cache_free __cache_free ___cache_free kmemleak_free_recursive delete_object_full __delete_object put_object free_object_rcu kmem_cache_free cache_free_debugcheck --> dirty kmemleak_object One approach is to check cachep->name and skip both kmemleak_object and debug_objects_cache in leaks_show(). The other is to set store_user_clean after drain_cpu_caches() which leaves a small window between drain_cpu_caches() and set_store_user_clean() where per-CPU caches could be dirty again lead to slightly wrong information has been stored but could also speed up things significantly which sounds like a good compromise. For example, # cat /proc/slab_allocators 0m42.778s # 1st approach 0m0.737s # 2nd approach [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/20190411032635.10325-1-cai@lca.pw Fixes: d31676dfde25 ("mm/slab: alternative implementation for DEBUG_SLAB_LEAK") Signed-off-by: Qian Cai Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index da4068a95307..2915d912e89a 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4291,8 +4291,12 @@ static int leaks_show(struct seq_file *m, void *p) * whole processing. */ do { - set_store_user_clean(cachep); drain_cpu_caches(cachep); + /* + * drain_cpu_caches() could make kmemleak_object and + * debug_objects_cache dirty, so reset afterwards. + */ + set_store_user_clean(cachep); x[1] = 0; -- cgit v1.2.3 From 3b775998eca7fca0e470e0871feb1c9ec07dd84a Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:16:34 -0700 Subject: include/trace/events/vmscan.h: drop zone id from kswapd tracepoints It is not clear how the zone id is useful in kswapd tracepoints and the id itself is not really easy to process because it depends on the configuration (available zones). Let's drop the id for now. If somebody really needs that information then the zone name should be used instead. 
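For consumers of the tracepoint, the only visible change is that the rendered line loses the zid field; a hedged example of the output before and after, with the format taken from the TP_printk strings in the diff below and the field values invented:

	mm_vmscan_kswapd_wake: nid=0 zid=2 order=9	(before)
	mm_vmscan_kswapd_wake: nid=0 order=9		(after)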
[mhocko@suse.com: new changelog] Link: http://lkml.kernel.org/r/1552451813-10833-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 252327dbfa51..e8709ab22d68 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -73,7 +73,9 @@ TRACE_EVENT(mm_vmscan_kswapd_wake, __entry->order = order; ), - TP_printk("nid=%d zid=%d order=%d", __entry->nid, __entry->zid, __entry->order) + TP_printk("nid=%d order=%d", + __entry->nid, + __entry->order) ); TRACE_EVENT(mm_vmscan_wakeup_kswapd, @@ -96,9 +98,8 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, __entry->gfp_flags = gfp_flags; ), - TP_printk("nid=%d zid=%d order=%d gfp_flags=%s", + TP_printk("nid=%d order=%d gfp_flags=%s", __entry->nid, - __entry->zid, __entry->order, show_gfp_flags(__entry->gfp_flags)) ); -- cgit v1.2.3 From f0fd50504a54f5548eb666dc16ddf8394e44e4b7 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Mon, 13 May 2019 17:16:37 -0700 Subject: mm/cma_debug.c: fix the break condition in cma_maxchunk_get() If find_next_zero_bit() does not find a zero bit, it returns the size parameter passed in, so the start bit should be compared with bitmap_maxno rather than cma->count. Although getting maxchunk currently works fine because order_per_bit is zero, the operation will get stuck if order_per_bit is set to a non-zero value. Link: http://lkml.kernel.org/r/20190319092734.276-1-zbestahu@gmail.com Signed-off-by: Yue Hu Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Joe Perches Cc: David Rientjes Cc: Dmitry Safonov Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma_debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma_debug.c b/mm/cma_debug.c index 8d7b2fd52225..a7dd9e8e10d5 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -56,7 +56,7 @@ static int cma_maxchunk_get(void *data, u64 *val) mutex_lock(&cma->lock); for (;;) { start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end); - if (start >= cma->count) + if (start >= bitmap_maxno) break; end = find_next_bit(cma->bitmap, bitmap_maxno, start); maxchunk = max(end - start, maxchunk); -- cgit v1.2.3 From cefdca0a86be517bc390fc4541e3674b8e7803b0 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 13 May 2019 17:16:41 -0700 Subject: userfaultfd/sysctl: add vm.unprivileged_userfaultfd Userfaultfd can be misused to make it easier to exploit existing use-after-free (and similar) bugs that might otherwise only make a short window or race condition available. By using userfaultfd to stall a kernel thread, a malicious program can keep some state that it wrote, stable for an extended period, which it can then access using an existing exploit. While it doesn't cause the exploit itself, and while it's not the only thing that can stall a kernel thread when accessing a memory location, it's one of the few that never needs privilege. We can add a flag, allowing userfaultfd to be restricted, so that in general it won't be usable by arbitrary user programs, but in environments that require userfaultfd it can be turned back on. Add a global sysctl knob "vm.unprivileged_userfaultfd" to control whether userfaultfd is allowed by unprivileged users.
When this is set to zero, only privileged users (root user, or users with the CAP_SYS_PTRACE capability) will be able to use the userfaultfd syscalls. Andrea said: : The only difference between the bpf sysctl and the userfaultfd sysctl : this way is that the bpf sysctl adds the CAP_SYS_ADMIN capability : requirement, while userfaultfd adds the CAP_SYS_PTRACE requirement, : because the userfaultfd monitor is more likely to need CAP_SYS_PTRACE : already if it's doing other kind of tracking on processes runtime, in : addition of userfaultfd. In other words both syscalls works only for : root, when the two sysctl are opt-in set to 1. [dgilbert@redhat.com: changelog additions] [akpm@linux-foundation.org: documentation tweak, per Mike] Link: http://lkml.kernel.org/r/20190319030722.12441-2-peterx@redhat.com Signed-off-by: Peter Xu Suggested-by: Andrea Arcangeli Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport Reviewed-by: Andrea Arcangeli Cc: Paolo Bonzini Cc: Hugh Dickins Cc: Luis Chamberlain Cc: Maxime Coquelin Cc: Maya Gokhale Cc: Jerome Glisse Cc: Pavel Emelyanov Cc: Johannes Weiner Cc: Martin Cracauer Cc: Denis Plotnikov Cc: Marty McFadden Cc: Mike Kravetz Cc: Kees Cook Cc: Mel Gorman Cc: "Kirill A . Shutemov" Cc: "Dr . David Alan Gilbert" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 12 ++++++++++++ fs/userfaultfd.c | 5 +++++ include/linux/userfaultfd_k.h | 2 ++ kernel/sysctl.c | 12 ++++++++++++ 4 files changed, 31 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3f13d8599337..749322060f10 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -61,6 +61,7 @@ Currently, these files are in /proc/sys/vm: - stat_refresh - numa_stat - swappiness +- unprivileged_userfaultfd - user_reserve_kbytes - vfs_cache_pressure - watermark_boost_factor @@ -818,6 +819,17 @@ The default value is 60. ============================================================== +unprivileged_userfaultfd + +This flag controls whether unprivileged users can use the userfaultfd +system calls. Set this to 1 to allow unprivileged users to use the +userfaultfd system calls, or set this to 0 to restrict userfaultfd to only +privileged users (with SYS_CAP_PTRACE capability). + +The default value is 1. + +============================================================== + - user_reserve_kbytes When overcommit_memory is set to 2, "never overcommit" mode, reserve diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f5de1e726356..3b30301c90ec 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,8 @@ #include #include +int sysctl_unprivileged_userfaultfd __read_mostly = 1; + static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; enum userfaultfd_state { @@ -1930,6 +1932,9 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) struct userfaultfd_ctx *ctx; int fd; + if (!sysctl_unprivileged_userfaultfd && !capable(CAP_SYS_PTRACE)) + return -EPERM; + BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. 
*/ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 37c9eba75c98..ac9d71e24b81 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -28,6 +28,8 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) +extern int sysctl_unprivileged_userfaultfd; + extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 599510a3355e..ba158f61aab4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "../lib/kstrtox.h" @@ -1719,6 +1720,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_USERFAULTFD + { + .procname = "unprivileged_userfaultfd", + .data = &sysctl_unprivileged_userfaultfd, + .maxlen = sizeof(sysctl_unprivileged_userfaultfd), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.2.3 From 5fd4ca2d84b249f0858ce28cf637cf25b61a398f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 13 May 2019 17:16:44 -0700 Subject: mm: page cache: store only head pages in i_pages Transparent Huge Pages are currently stored in i_pages as pointers to consecutive subpages. This patch changes that to storing consecutive pointers to the head page in preparation for storing huge pages more efficiently in i_pages. Large parts of this are "inspired" by Kirill's patch https://lore.kernel.org/lkml/20170126115819.58875-2-kirill.shutemov@linux.intel.com/ [willy@infradead.org: fix swapcache pages] Link: http://lkml.kernel.org/r/20190324155441.GF10344@bombadil.infradead.org [kirill@shutemov.name: hugetlb stores pages in page cache differently] Link: http://lkml.kernel.org/r/20190404134553.vuvhgmghlkiw2hgl@kshutemo-mobl1 Link: http://lkml.kernel.org/r/20190307153051.18815-1-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Jan Kara Reviewed-by: Kirill Shutemov Reviewed-and-tested-by: Song Liu Tested-by: William Kucharski Reviewed-by: William Kucharski Tested-by: Qian Cai Cc: Hugh Dickins Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 13 ++++ mm/filemap.c | 159 +++++++++++++++++++----------------------------- mm/huge_memory.c | 3 + mm/khugepaged.c | 4 +- mm/memfd.c | 2 + mm/migrate.c | 2 +- mm/shmem.c | 2 +- mm/swap_state.c | 4 +- 8 files changed, 86 insertions(+), 103 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bcf909d0de5f..2e8438a1216a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -333,6 +333,19 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } +static inline struct page *find_subpage(struct page *page, pgoff_t offset) +{ + unsigned long mask; + + if (PageHuge(page)) + return page; + + VM_BUG_ON_PAGE(PageTail(page), page); + + mask = (1UL << compound_order(page)) - 1; + return page + (offset & mask); +} + struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, diff --git a/mm/filemap.c b/mm/filemap.c index d78f577baef2..4157f858a9c6 100644 --- a/mm/filemap.c +++ 
b/mm/filemap.c @@ -279,11 +279,11 @@ EXPORT_SYMBOL(delete_from_page_cache); * @pvec: pagevec with pages to delete * * The function walks over mapping->i_pages and removes pages passed in @pvec - * from the mapping. The function expects @pvec to be sorted by page index. + * from the mapping. The function expects @pvec to be sorted by page index + * and is optimised for it to be dense. * It tolerates holes in @pvec (mapping entries at those indices are not * modified). The function expects only THP head pages to be present in the - * @pvec and takes care to delete all corresponding tail pages from the - * mapping as well. + * @pvec. * * The function expects the i_pages lock to be held. */ @@ -292,40 +292,44 @@ static void page_cache_delete_batch(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index); int total_pages = 0; - int i = 0, tail_pages = 0; + int i = 0; struct page *page; mapping_set_update(&xas, mapping); xas_for_each(&xas, page, ULONG_MAX) { - if (i >= pagevec_count(pvec) && !tail_pages) + if (i >= pagevec_count(pvec)) break; + + /* A swap/dax/shadow entry got inserted? Skip it. */ if (xa_is_value(page)) continue; - if (!tail_pages) { - /* - * Some page got inserted in our range? Skip it. We - * have our pages locked so they are protected from - * being removed. - */ - if (page != pvec->pages[i]) { - VM_BUG_ON_PAGE(page->index > - pvec->pages[i]->index, page); - continue; - } - WARN_ON_ONCE(!PageLocked(page)); - if (PageTransHuge(page) && !PageHuge(page)) - tail_pages = HPAGE_PMD_NR - 1; + /* + * A page got inserted in our range? Skip it. We have our + * pages locked so they are protected from being removed. + * If we see a page whose index is higher than ours, it + * means our page has been removed, which shouldn't be + * possible because we're holding the PageLock. + */ + if (page != pvec->pages[i]) { + VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index, + page); + continue; + } + + WARN_ON_ONCE(!PageLocked(page)); + + if (page->index == xas.xa_index) page->mapping = NULL; - /* - * Leave page->index set: truncation lookup relies - * upon it - */ + /* Leave page->index set: truncation lookup relies on it */ + + /* + * Move to the next page in the vector if this is a regular + * page or the index is of the last sub-page of this compound + * page. + */ + if (page->index + (1UL << compound_order(page)) - 1 == + xas.xa_index) i++; - } else { - VM_BUG_ON_PAGE(page->index + HPAGE_PMD_NR - tail_pages - != pvec->pages[i]->index, page); - tail_pages--; - } xas_store(&xas, NULL); total_pages++; } @@ -1491,7 +1495,7 @@ EXPORT_SYMBOL(page_cache_prev_miss); struct page *find_get_entry(struct address_space *mapping, pgoff_t offset) { XA_STATE(xas, &mapping->i_pages, offset); - struct page *head, *page; + struct page *page; rcu_read_lock(); repeat: @@ -1506,25 +1510,19 @@ repeat: if (!page || xa_is_value(page)) goto out; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto repeat; - /* The page was split under us? */ - if (compound_head(page) != head) { - put_page(head); - goto repeat; - } - /* - * Has the page moved? + * Has the page moved or been split? * This is part of the lockless pagecache protocol. See * include/linux/pagemap.h for details. 
*/ if (unlikely(page != xas_reload(&xas))) { - put_page(head); + put_page(page); goto repeat; } + page = find_subpage(page, offset); out: rcu_read_unlock(); @@ -1706,7 +1704,6 @@ unsigned find_get_entries(struct address_space *mapping, rcu_read_lock(); xas_for_each(&xas, page, ULONG_MAX) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1717,17 +1714,13 @@ unsigned find_get_entries(struct address_space *mapping, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -1736,7 +1729,7 @@ export: break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1778,33 +1771,27 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, rcu_read_lock(); xas_for_each(&xas, page, end) { - struct page *head; if (xas_retry(&xas, page)) continue; /* Skip over shadow, swap and DAX entries */ if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *start = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1849,7 +1836,6 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1859,24 +1845,19 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, if (xa_is_value(page)) break; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1912,7 +1893,6 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, rcu_read_lock(); xas_for_each_marked(&xas, page, end, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -1923,26 +1903,21 @@ unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, if (xa_is_value(page)) continue; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? 
*/ if (unlikely(page != xas_reload(&xas))) goto put_page; - pages[ret] = page; + pages[ret] = find_subpage(page, xas.xa_index); if (++ret == nr_pages) { *index = xas.xa_index + 1; goto out; } continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -1991,7 +1966,6 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, rcu_read_lock(); xas_for_each_marked(&xas, page, ULONG_MAX, tag) { - struct page *head; if (xas_retry(&xas, page)) continue; /* @@ -2002,17 +1976,13 @@ unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, if (xa_is_value(page)) goto export; - head = compound_head(page); - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto retry; - /* The page was split under us? */ - if (compound_head(page) != head) - goto put_page; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; + page = find_subpage(page, xas.xa_index); export: indices[ret] = xas.xa_index; @@ -2021,7 +1991,7 @@ export: break; continue; put_page: - put_page(head); + put_page(page); retry: xas_reset(&xas); } @@ -2691,7 +2661,7 @@ void filemap_map_pages(struct vm_fault *vmf, pgoff_t last_pgoff = start_pgoff; unsigned long max_idx; XA_STATE(xas, &mapping->i_pages, start_pgoff); - struct page *head, *page; + struct page *page; rcu_read_lock(); xas_for_each(&xas, page, end_pgoff) { @@ -2700,24 +2670,19 @@ void filemap_map_pages(struct vm_fault *vmf, if (xa_is_value(page)) goto next; - head = compound_head(page); - /* * Check for a locked page first, as a speculative * reference may adversely influence page migration. */ - if (PageLocked(head)) + if (PageLocked(page)) goto next; - if (!page_cache_get_speculative(head)) + if (!page_cache_get_speculative(page)) goto next; - /* The page was split under us? */ - if (compound_head(page) != head) - goto skip; - - /* Has the page moved? */ + /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto skip; + page = find_subpage(page, xas.xa_index); if (!PageUptodate(page) || PageReadahead(page) || diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c314a362c167..50c665b12cf1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2496,6 +2496,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head)) shmem_uncharge(head->mapping->host, 1); put_page(head + i); + } else if (!PageAnon(page)) { + __xa_store(&head->mapping->i_pages, head[i].index, + head + i, 0); } } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 449044378782..7ba7a1e4fa79 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1374,7 +1374,7 @@ static void collapse_shmem(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); nr_none++; continue; } @@ -1450,7 +1450,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. 
*/ - xas_store(&xas, new_page + (index % HPAGE_PMD_NR)); + xas_store(&xas, new_page); continue; out_unlock: unlock_page(page); diff --git a/mm/memfd.c b/mm/memfd.c index 650e65a46b9c..2647c898990c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -39,6 +39,7 @@ static void memfd_tag_pins(struct xa_state *xas) xas_for_each(xas, page, ULONG_MAX) { if (xa_is_value(page)) continue; + page = find_subpage(page, xas->xa_index); if (page_count(page) - page_mapcount(page) > 1) xas_set_mark(xas, MEMFD_TAG_PINNED); @@ -88,6 +89,7 @@ static int memfd_wait_for_pins(struct address_space *mapping) bool clear = true; if (xa_is_value(page)) continue; + page = find_subpage(page, xas.xa_index); if (page_count(page) - page_mapcount(page) != 1) { /* * On the last scan, we clean up all those tags diff --git a/mm/migrate.c b/mm/migrate.c index 663a5449367a..a1770403ff7f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -463,7 +463,7 @@ int migrate_page_move_mapping(struct address_space *mapping, for (i = 1; i < HPAGE_PMD_NR; i++) { xas_next(&xas); - xas_store(&xas, newpage + i); + xas_store(&xas, newpage); } } diff --git a/mm/shmem.c b/mm/shmem.c index f4dce9c8670d..1bb3b8dc8bb2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -614,7 +614,7 @@ static int shmem_add_to_page_cache(struct page *page, if (xas_error(&xas)) goto unlock; next: - xas_store(&xas, page + i); + xas_store(&xas, page); if (++i < nr) { xas_next(&xas); goto next; diff --git a/mm/swap_state.c b/mm/swap_state.c index 85245fdec8d9..eb714165afd2 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -132,7 +132,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp) for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); set_page_private(page + i, entry.val + i); - xas_store(&xas, page + i); + xas_store(&xas, page); xas_next(&xas); } address_space->nrpages += nr; @@ -167,7 +167,7 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry) for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, NULL); - VM_BUG_ON_PAGE(entry != page + i, entry); + VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } -- cgit v1.2.3 From 63931eb97508cd67515dbcc049defaebd7b1fcd0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 13 May 2019 17:16:47 -0700 Subject: mm, page_alloc: disallow __GFP_COMP in alloc_pages_exact() alloc_pages_exact*() allocates a page of sufficient order and then splits it to return only the number of pages requested. That makes it incompatible with __GFP_COMP, because compound pages cannot be split. As shown by [1] things may silently work until the requested size (possibly depending on user) stops being power of two. Then for CONFIG_DEBUG_VM, BUG_ON() triggers in split_page(). Without CONFIG_DEBUG_VM, consequences are unclear. There are several options here, none of them great: 1) Don't do the splitting when __GFP_COMP is passed, and return the whole compound page. However if caller then returns it via free_pages_exact(), that will be unexpected and the freeing actions there will be wrong. 2) Warn and remove __GFP_COMP from the flags. But the caller may have really wanted it, so things may break later somewhere. 3) Warn and return NULL. However NULL may be unexpected, especially for small sizes. This patch picks option 2, because as Michal Hocko put it: "callers wanted it" is much less probable than "caller is simply confused and more gfp flags is surely better than fewer". 
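As a rough illustration of why the split matters, here is a userspace sketch (plain C, not kernel code, assuming 4 KiB pages; order_for() is only a stand-in for get_order()) of how many tail pages alloc_pages_exact() hands back after rounding a request up to a power-of-two block:

	#include <stdio.h>

	#define PAGE_SIZE 4096UL

	/* stand-in for get_order(): smallest order whose block covers @size */
	static unsigned int order_for(unsigned long size)
	{
		unsigned int order = 0;

		while ((PAGE_SIZE << order) < size)
			order++;
		return order;
	}

	int main(void)
	{
		unsigned long sizes[] = { 4096, 12288, 200000 };

		for (unsigned int i = 0; i < 3; i++) {
			unsigned long size = sizes[i];
			unsigned int order = order_for(size);
			unsigned long need = (size + PAGE_SIZE - 1) / PAGE_SIZE;
			unsigned long got = 1UL << order;

			/* the (got - need) tail pages are split off and freed;
			 * a compound (__GFP_COMP) block cannot be split that way */
			printf("size=%lu order=%u pages used/allocated=%lu/%lu freed=%lu\n",
			       size, order, need, got, got - need);
		}
		return 0;
	}

Because those tail pages must be given back individually, option 2 below warns and clears __GFP_COMP rather than trying to honour it.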
[1] https://lore.kernel.org/lkml/20181126002805.GI18977@shao2-debian/T/#u Link: http://lkml.kernel.org/r/0c6393eb-b28d-4607-c386-862a71f09de6@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Acked-by: Mel Gorman Cc: Takashi Iwai Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59661106da16..07a0d722d481 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4821,7 +4821,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, /** * alloc_pages_exact - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * This function is similar to alloc_pages(), except that it allocates the * minimum number of pages to satisfy the request. alloc_pages() can only @@ -4838,6 +4838,9 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned int order = get_order(size); unsigned long addr; + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) + gfp_mask &= ~__GFP_COMP; + addr = __get_free_pages(gfp_mask, order); return make_alloc_exact(addr, order, size); } @@ -4848,7 +4851,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * pages on a node. * @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate - * @gfp_mask: GFP flags for the allocation + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * * Like alloc_pages_exact(), but try to allocate on node nid first before falling * back. @@ -4858,7 +4861,12 @@ EXPORT_SYMBOL(alloc_pages_exact); void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); - struct page *p = alloc_pages_node(nid, gfp_mask, order); + struct page *p; + + if (WARN_ON_ONCE(gfp_mask & __GFP_COMP)) + gfp_mask &= ~__GFP_COMP; + + p = alloc_pages_node(nid, gfp_mask, order); if (!p) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); -- cgit v1.2.3 From 886cf1901db962cee5f8b82b9b260079a5e8a4eb Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 13 May 2019 17:16:51 -0700 Subject: mm: move recent_rotated pages calculation to shrink_inactive_list() Patch series "mm: Generalize putback functions". putback_inactive_pages() and move_active_pages_to_lru() are very similar, so this patchset merges them into a single function. This patch (of 4): The patch moves the calculation from putback_inactive_pages() to shrink_inactive_list(). This makes putback_inactive_pages() look more similar to move_active_pages_to_lru(). To do that, we account activated pages in reclaim_stat::nr_activate. Since a page may change its LRU type from anon to file cache inside shrink_page_list() (see ClearPageSwapBacked()), we have to account pages for both types. So, nr_activate becomes an array. Previously we used nr_activate to account PGACTIVATE events, but now we account them into the pgactivate variable (since these events count pages in general, not the sum of hpage_nr_pages).
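A small userspace sketch (plain C, not kernel code; the page mix below is made-up sample data) of the split accounting this introduces, with nr_activate[] counting base pages per type and a separate pgactivate counter for PGACTIVATE events:

	#include <stdio.h>

	struct reclaim_stat { unsigned nr_activate[2]; };

	int main(void)
	{
		struct reclaim_stat stat = { { 0, 0 } };
		unsigned pgactivate = 0;
		/* {is_file_cache, nr_subpages}: one anon THP, two file pages */
		unsigned pages[][2] = { { 0, 512 }, { 1, 1 }, { 1, 1 } };

		for (unsigned i = 0; i < 3; i++) {
			unsigned type = pages[i][0];	/* page_is_file_cache() */
			unsigned nr = pages[i][1];	/* hpage_nr_pages() */

			pgactivate++;			/* one PGACTIVATE event per page */
			stat.nr_activate[type] += nr;	/* per-type base page count */
		}

		/* prints: anon=512 file=2 pgactivate=3 */
		printf("anon=%u file=%u pgactivate=%u\n",
		       stat.nr_activate[0], stat.nr_activate[1], pgactivate);
		return 0;
	}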
Link: http://lkml.kernel.org/r/155290127956.31489.3393586616054413298.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Daniel Jordan Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../trace/postprocess/trace-vmscan-postprocess.pl | 7 ++++--- include/linux/vmstat.h | 2 +- include/trace/events/vmscan.h | 13 ++++++++----- mm/vmscan.c | 15 +++++++-------- 4 files changed, 20 insertions(+), 17 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 66bfd8396877..995da15b16ca 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -113,7 +113,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*) gfp_flags=([A-Z_|]*)'; my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) classzone_idx=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_skipped=([0-9]*) nr_taken=([0-9]*) lru=([a-z_]*)'; -my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) nr_dirty=([0-9]*) nr_writeback=([0-9]*) nr_congested=([0-9]*) nr_immediate=([0-9]*) nr_activate_anon=([0-9]*) nr_activate_file=([0-9]*) nr_ref_keep=([0-9]*) nr_unmap_fail=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -212,7 +212,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, "nid", "nr_scanned", "nr_reclaimed", "nr_dirty", "nr_writeback", - "nr_congested", "nr_immediate", "nr_activate", "nr_ref_keep", + "nr_congested", "nr_immediate", "nr_activate_anon", + "nr_activate_file", "nr_ref_keep", "nr_unmap_fail", "priority", "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", @@ -407,7 +408,7 @@ EVENT_PROCESS: } my $nr_reclaimed = $3; - my $flags = $12; + my $flags = $13; my $file = 0; if ($flags =~ /RECLAIM_WB_FILE/) { $file = 1; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 2db8d60981fe..bdeda4b079fe 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -26,7 +26,7 @@ struct reclaim_stat { unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; - unsigned nr_activate; + unsigned nr_activate[2]; unsigned nr_ref_keep; unsigned nr_unmap_fail; }; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index e8709ab22d68..cb2add69301a 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -359,7 +359,8 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __field(unsigned long, nr_writeback) __field(unsigned long, nr_congested) __field(unsigned long, nr_immediate) - __field(unsigned long, nr_activate) + __field(unsigned int, nr_activate0) + __field(unsigned int, nr_activate1) __field(unsigned long, nr_ref_keep) __field(unsigned long, nr_unmap_fail) 
__field(int, priority) @@ -374,20 +375,22 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nr_writeback = stat->nr_writeback; __entry->nr_congested = stat->nr_congested; __entry->nr_immediate = stat->nr_immediate; - __entry->nr_activate = stat->nr_activate; + __entry->nr_activate0 = stat->nr_activate[0]; + __entry->nr_activate1 = stat->nr_activate[1]; __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; __entry->reclaim_flags = trace_shrink_flags(file); ), - TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate=%ld nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", __entry->nid, __entry->nr_scanned, __entry->nr_reclaimed, __entry->nr_dirty, __entry->nr_writeback, __entry->nr_congested, __entry->nr_immediate, - __entry->nr_activate, __entry->nr_ref_keep, - __entry->nr_unmap_fail, __entry->priority, + __entry->nr_activate0, __entry->nr_activate1, + __entry->nr_ref_keep, __entry->nr_unmap_fail, + __entry->priority, show_reclaim_flags(__entry->reclaim_flags)) ); diff --git a/mm/vmscan.c b/mm/vmscan.c index fd9de504e516..e6913e68db2e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1107,6 +1107,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, LIST_HEAD(ret_pages); LIST_HEAD(free_pages); unsigned nr_reclaimed = 0; + unsigned pgactivate = 0; memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1466,8 +1467,10 @@ activate_locked: try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { + int type = page_is_file_cache(page); SetPageActive(page); - stat->nr_activate++; + pgactivate++; + stat->nr_activate[type] += hpage_nr_pages(page); count_memcg_page_event(page, PGACTIVATE); } keep_locked: @@ -1482,7 +1485,7 @@ keep: free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); - count_vm_events(PGACTIVATE, stat->nr_activate); + count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } @@ -1807,7 +1810,6 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, static noinline_for_stack void putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) { - struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; struct pglist_data *pgdat = lruvec_pgdat(lruvec); LIST_HEAD(pages_to_free); @@ -1833,11 +1835,6 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) lru = page_lru(page); add_page_to_lru_list(page, lruvec, lru); - if (is_active_lru(lru)) { - int file = is_file_lru(lru); - int numpages = hpage_nr_pages(page); - reclaim_stat->recent_rotated[file] += numpages; - } if (put_page_testzero(page)) { __ClearPageLRU(page); __ClearPageActive(page); @@ -1945,6 +1942,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, nr_reclaimed); } + reclaim_stat->recent_rotated[0] = stat.nr_activate[0]; + reclaim_stat->recent_rotated[1] = stat.nr_activate[1]; putback_inactive_pages(lruvec, &page_list); -- cgit v1.2.3 From 9851ac13592df77958ae7bac6ba39e71420c38ec Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 13 May 2019 17:16:54 -0700 Subject: mm: move nr_deactivate accounting to shrink_active_list() We know which LRU is not active. 
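Because the PGDEACTIVATE accounting now sits in common code, the patch also carries a no-op __count_memcg_events() stub for !CONFIG_MEMCG builds; a minimal userspace sketch (plain C, not kernel code; count_events() is a made-up name) of that config-stub pattern:

	#include <stdio.h>

	/* #define CONFIG_MEMCG 1 */

	#ifdef CONFIG_MEMCG
	static inline void count_events(const char *what, unsigned long nr)
	{
		printf("%s += %lu\n", what, nr);
	}
	#else
	static inline void count_events(const char *what, unsigned long nr)
	{
		/* compiled out: the caller stays free of #ifdefs */
		(void)what;
		(void)nr;
	}
	#endif

	int main(void)
	{
		/* single, unconditional call site, as in shrink_active_list() */
		count_events("PGDEACTIVATE", 32);
		return 0;
	}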
[chris@chrisdown.name: fix build on !CONFIG_MEMCG] Link: http://lkml.kernel.org/r/20190322150513.GA22021@chrisdown.name Link: http://lkml.kernel.org/r/155290128498.31489.18250485448913338607.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Signed-off-by: Chris Down Reviewed-by: Daniel Jordan Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++++ mm/vmscan.c | 10 ++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dbb6118370c1..b238403f95b2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1117,6 +1117,12 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, { } +static inline void __count_memcg_events(struct mem_cgroup *memcg, + enum vm_event_item idx, + unsigned long count) +{ +} + static inline void count_memcg_page_event(struct page *page, int idx) { diff --git a/mm/vmscan.c b/mm/vmscan.c index e6913e68db2e..5002cc43e32f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2040,12 +2040,6 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, } } - if (!is_active_lru(lru)) { - __count_vm_events(PGDEACTIVATE, nr_moved); - count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, - nr_moved); - } - return nr_moved; } @@ -2137,6 +2131,10 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + + __count_vm_events(PGDEACTIVATE, nr_deactivate); + __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); -- cgit v1.2.3 From f372d89e5dbbf2bc8e37089bacd526afd4e1d6c2 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 13 May 2019 17:16:57 -0700 Subject: mm: remove pages_to_free argument of move_active_pages_to_lru() We may use input argument list as output argument too. This makes the function more similar to putback_inactive_pages(). Link: http://lkml.kernel.org/r/155290129079.31489.16180612694090502942.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Daniel Jordan Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5002cc43e32f..4c5f4b862420 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2004,10 +2004,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, static unsigned move_active_pages_to_lru(struct lruvec *lruvec, struct list_head *list, - struct list_head *pages_to_free, enum lru_list lru) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); + LIST_HEAD(pages_to_free); struct page *page; int nr_pages; int nr_moved = 0; @@ -2034,12 +2034,17 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec, (*get_compound_page_dtor(page))(page); spin_lock_irq(&pgdat->lru_lock); } else - list_add(&page->lru, pages_to_free); + list_add(&page->lru, &pages_to_free); } else { nr_moved += nr_pages; } } + /* + * To save our caller's stack, now use input list for pages to free. 
+ */ + list_splice(&pages_to_free, list); + return nr_moved; } @@ -2129,8 +2134,10 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru); - nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE); + nr_activate = move_active_pages_to_lru(lruvec, &l_active, lru); + nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, lru - LRU_ACTIVE); + /* Keep all free pages in l_active list */ + list_splice(&l_inactive, &l_active); __count_vm_events(PGDEACTIVATE, nr_deactivate); __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); @@ -2138,8 +2145,8 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge_list(&l_hold); - free_unref_page_list(&l_hold); + mem_cgroup_uncharge_list(&l_active); + free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } -- cgit v1.2.3 From a222f341586834073c2bbea225be38216eb5d993 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 13 May 2019 17:17:00 -0700 Subject: mm: generalize putback scan functions This combines two similar functions move_active_pages_to_lru() and putback_inactive_pages() into single move_pages_to_lru(). This remove duplicate code and makes object file size smaller. Before: text data bss dec hex filename 57082 4732 128 61942 f1f6 mm/vmscan.o After: text data bss dec hex filename 55112 4600 128 59840 e9c0 mm/vmscan.o Note, that now we are checking for !page_evictable() coming from shrink_active_list(), which shouldn't change any behavior since that path works with evictable pages only. Link: http://lkml.kernel.org/r/155290129627.31489.8321971028677203248.stgit@localhost.localdomain Signed-off-by: Kirill Tkhai Reviewed-by: Daniel Jordan Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 122 ++++++++++++++++++++---------------------------------------- 1 file changed, 40 insertions(+), 82 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4c5f4b862420..40ff747e0b33 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1807,33 +1807,53 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, return isolated > inactive; } -static noinline_for_stack void -putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) +/* + * This moves pages from @list to corresponding LRU list. + * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold zone_lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop zone_lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->_refcount against each page. + * But we had to alter page->flags anyway. + * + * Returns the number of pages moved to the given lruvec. 
+ */ + +static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec, + struct list_head *list) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); + int nr_pages, nr_moved = 0; LIST_HEAD(pages_to_free); + struct page *page; + enum lru_list lru; - /* - * Put back any unfreeable pages. - */ - while (!list_empty(page_list)) { - struct page *page = lru_to_page(page_list); - int lru; - + while (!list_empty(list)) { + page = lru_to_page(list); VM_BUG_ON_PAGE(PageLRU(page), page); - list_del(&page->lru); if (unlikely(!page_evictable(page))) { + list_del(&page->lru); spin_unlock_irq(&pgdat->lru_lock); putback_lru_page(page); spin_lock_irq(&pgdat->lru_lock); continue; } - lruvec = mem_cgroup_page_lruvec(page, pgdat); SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(page, lruvec, lru); + + nr_pages = hpage_nr_pages(page); + update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); + list_move(&page->lru, &lruvec->lists[lru]); if (put_page_testzero(page)) { __ClearPageLRU(page); @@ -1847,13 +1867,17 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) spin_lock_irq(&pgdat->lru_lock); } else list_add(&page->lru, &pages_to_free); + } else { + nr_moved += nr_pages; } } /* * To save our caller's stack, now use input list for pages to free. */ - list_splice(&pages_to_free, page_list); + list_splice(&pages_to_free, list); + + return nr_moved; } /* @@ -1945,7 +1969,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_rotated[0] = stat.nr_activate[0]; reclaim_stat->recent_rotated[1] = stat.nr_activate[1]; - putback_inactive_pages(lruvec, &page_list); + move_pages_to_lru(lruvec, &page_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); @@ -1982,72 +2006,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return nr_reclaimed; } -/* - * This moves pages from the active list to the inactive list. - * - * We move them the other way if the page is referenced by one or more - * processes, from rmap. - * - * If the pages are mostly unmapped, the processing is fast and it is - * appropriate to hold pgdat->lru_lock across the whole operation. But if - * the pages are mapped, the processing is slow (page_referenced()) so we - * should drop pgdat->lru_lock around each page. It's impossible to balance - * this, so instead we remove the pages from the LRU while processing them. - * It is safe to rely on PG_active against the non-LRU pages in here because - * nobody will play with that bit on a non-LRU page. - * - * The downside is that we have to touch page->_refcount against each page. - * But we had to alter page->flags anyway. - * - * Returns the number of pages moved to the given lru. 
- */ - -static unsigned move_active_pages_to_lru(struct lruvec *lruvec, - struct list_head *list, - enum lru_list lru) -{ - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - LIST_HEAD(pages_to_free); - struct page *page; - int nr_pages; - int nr_moved = 0; - - while (!list_empty(list)) { - page = lru_to_page(list); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - - VM_BUG_ON_PAGE(PageLRU(page), page); - SetPageLRU(page); - - nr_pages = hpage_nr_pages(page); - update_lru_size(lruvec, lru, page_zonenum(page), nr_pages); - list_move(&page->lru, &lruvec->lists[lru]); - - if (put_page_testzero(page)) { - __ClearPageLRU(page); - __ClearPageActive(page); - del_page_from_lru_list(page, lruvec, lru); - - if (unlikely(PageCompound(page))) { - spin_unlock_irq(&pgdat->lru_lock); - mem_cgroup_uncharge(page); - (*get_compound_page_dtor(page))(page); - spin_lock_irq(&pgdat->lru_lock); - } else - list_add(&page->lru, &pages_to_free); - } else { - nr_moved += nr_pages; - } - } - - /* - * To save our caller's stack, now use input list for pages to free. - */ - list_splice(&pages_to_free, list); - - return nr_moved; -} - static void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, @@ -2134,8 +2092,8 @@ static void shrink_active_list(unsigned long nr_to_scan, */ reclaim_stat->recent_rotated[file] += nr_rotated; - nr_activate = move_active_pages_to_lru(lruvec, &l_active, lru); - nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, lru - LRU_ACTIVE); + nr_activate = move_pages_to_lru(lruvec, &l_active); + nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); /* Keep all free pages in l_active list */ list_splice(&l_inactive, &l_active); -- cgit v1.2.3 From 932f4a630a695212bdc7379b05f9bd0dafc5d968 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:03 -0700 Subject: mm/gup: replace get_user_pages_longterm() with FOLL_LONGTERM Pach series "Add FOLL_LONGTERM to GUP fast and use it". HFI1, qib, and mthca, use get_user_pages_fast() due to its performance advantages. These pages can be held for a significant time. But get_user_pages_fast() does not protect against mapping FS DAX pages. Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which retains the performance while also adding the FS DAX checks. XDP has also shown interest in using this functionality.[1] In addition we change get_user_pages() to use the new FOLL_LONGTERM flag and remove the specialized get_user_pages_longterm call. [1] https://lkml.org/lkml/2019/3/19/939 "longterm" is a relative thing and at this point is probably a misnomer. This is really flagging a pin which is going to be given to hardware and can't move. I've thought of a couple of alternative names but I think we have to settle on if we are going to use FL_LAYOUT or something else to solve the "longterm" problem. Then I think we can change the flag to a better name. Secondly, it depends on how often you are registering memory. I have spoken with some RDMA users who consider MR in the performance path... For the overall application performance. I don't have the numbers as the tests for HFI1 were done a long time ago. But there was a significant advantage. Some of which is probably due to the fact that you don't have to hold mmap_sem. Finally, architecturally I think it would be good for everyone to use *_fast. 
There are patches submitted to the RDMA list which would allow the use of *_fast (they rework the use of mmap_sem) and as soon as they are accepted I'll submit a patch to convert the RDMA core as well. Also to this point others are looking to use *_fast. As an aside, Jason pointed out in my previous submission that *_fast and *_unlocked look very much the same. I agree and I think further cleanup will be coming. But I'm focused on getting the final solution for DAX at the moment. This patch (of 7): This patch starts a series which aims to support FOLL_LONGTERM in get_user_pages_fast(). Some callers would like to do a longterm (user controlled) pin of pages with the fast variant of GUP for performance purposes. Rather than have a separate get_user_pages_longterm() call, introduce FOLL_LONGTERM and change the longterm callers to use it. This patch does not change any functionality. In the short term, "longterm" or user controlled pins are unsafe for filesystems, and FS DAX in particular has been blocked. However, callers of get_user_pages_fast() were not "protected". FOLL_LONGTERM can _only_ be supported with get_user_pages[_fast]() as it requires vmas to determine if DAX is in use. NOTE: In merging with the CMA changes we opt to change the get_user_pages() call in check_and_migrate_cma_pages() to a call of __get_user_pages_locked() on the newly migrated pages. This makes the code read better in that we are calling __get_user_pages_locked() on the pages before and after a potential migration. As a side effect some of the interfaces are cleaned up, but this is not the primary purpose of the series. In review[1] it was asked: > This I don't get - if you do lock down long term mappings performance > of the actual get_user_pages call shouldn't matter to start with. > > What do I miss? A couple of points. First "longterm" is a relative thing and at this point is probably a misnomer. This is really flagging a pin which is going to be given to hardware and can't move. I've thought of a couple of alternative names but I think we have to settle on if we are going to use FL_LAYOUT or something else to solve the "longterm" problem. Then I think we can change the flag to a better name. Second, it depends on how often you are registering memory. I have spoken with some RDMA users who consider MR in the performance path... For the overall application performance. I don't have the numbers as the tests for HFI1 were done a long time ago. But there was a significant advantage. Some of which is probably due to the fact that you don't have to hold mmap_sem. Finally, architecturally I think it would be good for everyone to use *_fast. There are patches submitted to the RDMA list which would allow the use of *_fast (they rework the use of mmap_sem) and as soon as they are accepted I'll submit a patch to convert the RDMA core as well. Also to this point others are looking to use *_fast. As an aside, Jason pointed out in my previous submission that *_fast and *_unlocked look very much the same. I agree and I think further cleanup will be coming. But I'm focused on getting the final solution for DAX at the moment.
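A rough userspace sketch (plain C, not kernel code; GUP_LONGTERM and pin_pages() are made-up names) of the shape of this conversion: the dedicated longterm entry point becomes a flag handled inside the common path, which allocates a scratch vma array only when the flag requires the extra checks, mirroring the __gup_longterm_locked() wrapper added below:

	#include <stdio.h>
	#include <stdlib.h>

	#define GUP_LONGTERM 0x1u

	static long pin_pages(unsigned long start, unsigned long nr,
			      unsigned int flags, void **vmas)
	{
		void **vmas_tmp = vmas;
		long rc;

		(void)start;	/* a real implementation would walk from @start */

		if (flags & GUP_LONGTERM) {
			/* the longterm path needs per-page vma info for its
			 * safety checks, so allocate scratch space when the
			 * caller did not pass any */
			if (!vmas_tmp) {
				vmas_tmp = calloc(nr, sizeof(*vmas_tmp));
				if (!vmas_tmp)
					return -1;
			}
		}

		rc = (long)nr;			/* stand-in for the real pin */

		if ((flags & GUP_LONGTERM) && vmas_tmp)
			printf("checked %ld vmas for longterm safety\n", rc);

		if (vmas_tmp != vmas)
			free(vmas_tmp);
		return rc;
	}

	int main(void)
	{
		/* old style: one dedicated *_longterm() helper
		 * new style: the common call plus a flag */
		return pin_pages(0x1000, 4, GUP_LONGTERM, NULL) < 0;
	}

In the actual patch the same change shows up at the call sites: get_user_pages_longterm(start, nr, flags, pages, vmas) becomes get_user_pages(start, nr, flags | FOLL_LONGTERM, pages, vmas), as the driver hunks below illustrate.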
[1] https://lore.kernel.org/lkml/20190220180255.GA12020@iweiny-DESK2.sc.intel.com/T/#md6abad2569f3bf6c1f03686c8097ab6563e94965 [ira.weiny@intel.com: v3] Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190328084422.29911-2-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-2-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Michal Hocko Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Jason Gunthorpe Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Rich Felker Cc: Yoshinori Sato Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Ralf Baechle Cc: James Hogan Cc: Dan Williams Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/book3s64/iommu_api.c | 5 +- drivers/infiniband/core/umem.c | 5 +- drivers/infiniband/hw/qib/qib_user_pages.c | 8 +- drivers/infiniband/hw/usnic/usnic_uiom.c | 9 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 6 +- drivers/vfio/vfio_iommu_type1.c | 3 +- fs/io_uring.c | 5 +- include/linux/mm.h | 41 +++++-- mm/gup.c | 190 ++++++++++++++++++----------- mm/gup_benchmark.c | 5 +- net/xdp/xdp_umem.c | 4 +- 11 files changed, 173 insertions(+), 108 deletions(-) diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index 8330f135294f..5c521f3924a5 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -141,8 +141,9 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, for (entry = 0; entry < entries; entry += chunk) { unsigned long n = min(entries - entry, chunk); - ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, - FOLL_WRITE, mem->hpages + entry, NULL); + ret = get_user_pages(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE | FOLL_LONGTERM, + mem->hpages + entry, NULL); if (ret == n) { pinned += n; continue; diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 0a23048db523..e7ea819fcb11 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -295,10 +295,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr, while (npages) { down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(cur_base, + ret = get_user_pages(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), - gup_flags, page_list, NULL); + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) { up_read(&mm->mmap_sem); goto umem_release; diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 123ca8f64f75..f712fb7fa82f 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, down_read(¤t->mm->mmap_sem); for (got = 0; got < num_pages; got += ret) { - ret = get_user_pages_longterm(start_page + got * PAGE_SIZE, - num_pages - got, - FOLL_WRITE | FOLL_FORCE, - p + got, NULL); + ret = get_user_pages(start_page + got * PAGE_SIZE, + num_pages - got, + FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE, + p + got, NULL); if (ret < 0) { up_read(¤t->mm->mmap_sem); goto bail_release; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index da35d6fdfc5e..e312f522a66d 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ 
b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, ret = 0; while (npages) { - ret = get_user_pages_longterm(cur_base, - min_t(unsigned long, npages, - PAGE_SIZE / sizeof(struct page *)), - gup_flags, page_list, NULL); + ret = get_user_pages(cur_base, + min_t(unsigned long, npages, + PAGE_SIZE / sizeof(struct page *)), + gup_flags | FOLL_LONGTERM, + page_list, NULL); if (ret < 0) goto out; diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 08929c087e27..870a2a526e0b 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct videobuf_dmabuf *dma, dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n", data, size, dma->nr_pages); - err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages, - flags, dma->pages, NULL); + err = get_user_pages(data & PAGE_MASK, dma->nr_pages, + flags | FOLL_LONGTERM, dma->pages, NULL); if (err != dma->nr_pages) { dma->nr_pages = (err >= 0) ? err : 0; - dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err, + dprintk(1, "get_user_pages: err=%d [%d]\n", err, dma->nr_pages); return err < 0 ? err : -EINVAL; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 3be1db3501cc..3ddc375e7063 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -358,7 +358,8 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); if (mm == current->mm) { - ret = get_user_pages_longterm(vaddr, 1, flags, page, vmas); + ret = get_user_pages(vaddr, 1, flags | FOLL_LONGTERM, page, + vmas); } else { ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, vmas, NULL); diff --git a/fs/io_uring.c b/fs/io_uring.c index 48ea3977012a..fdc18321d70c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -2697,8 +2697,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, ret = 0; down_read(¤t->mm->mmap_sem); - pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, - pages, vmas); + pret = get_user_pages(ubuf, nr_pages, + FOLL_WRITE | FOLL_LONGTERM, + pages, vmas); if (pret == nr_pages) { /* don't support file backed memory */ for (j = 0; j < nr_pages; j++) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 083d7b4863ed..8bc677ce8f01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1505,19 +1505,6 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -#if defined(CONFIG_FS_DAX) || defined(CONFIG_CMA) -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); -#else -static inline long get_user_pages_longterm(unsigned long start, - unsigned long nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) -{ - return get_user_pages(start, nr_pages, gup_flags, pages, vmas); -} -#endif /* CONFIG_FS_DAX */ - int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); @@ -2583,6 +2570,34 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ #define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 
0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ + +/* + * NOTE on FOLL_LONGTERM: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is contrasted with + * iov_iter_get_pages() where usages which are transient. + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. + * Currently only get_user_pages() and get_user_pages_fast() support this flag + * and calls to get_user_pages_[un]locked are specifically not allowed. This + * is due to an incompatibility with the FS DAX check and + * FAULT_FLAG_ALLOW_RETRY + * + * In the CMA case: longterm pins in a CMA region would unnecessarily fragment + * that region. And so CMA attempts to migrate the page before pinning when + * FOLL_LONGTERM is specified. + */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { diff --git a/mm/gup.c b/mm/gup.c index 91819b8ad9cc..25381102e21e 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1018,6 +1018,15 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, gup_flags | FOLL_TOUCH); @@ -1046,6 +1055,15 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + down_read(&mm->mmap_sem); ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL, &locked, gup_flags | FOLL_TOUCH); @@ -1116,32 +1134,22 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. 
+ */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, locked, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); -/* - * This is the same as get_user_pages_remote(), just with a - * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's and don't allow - * passing of a locked parameter. We also obviously don't pass - * FOLL_REMOTE in here. - */ -long get_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) -{ - return __get_user_pages_locked(current, current->mm, start, nr_pages, - pages, vmas, NULL, - gup_flags | FOLL_TOUCH); -} -EXPORT_SYMBOL(get_user_pages); - #if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA) - -#ifdef CONFIG_FS_DAX static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) { long i; @@ -1160,12 +1168,6 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) } return false; } -#else -static inline bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages) -{ - return false; -} -#endif #ifdef CONFIG_CMA static struct page *new_non_cma_page(struct page *page, unsigned long private) @@ -1219,10 +1221,13 @@ static struct page *new_non_cma_page(struct page *page, unsigned long private) return __alloc_pages_node(nid, gfp_mask, 0); } -static long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, + unsigned int gup_flags) { long i; bool drain_allow = true; @@ -1278,10 +1283,14 @@ check_again: putback_movable_pages(&cma_page_list); } /* - * We did migrate all the pages, Try to get the page references again - * migrating any new CMA pages which we failed to isolate earlier. + * We did migrate all the pages, Try to get the page references + * again migrating any new CMA pages which we failed to isolate + * earlier. */ - nr_pages = get_user_pages(start, nr_pages, gup_flags, pages, vmas); + nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages, + pages, vmas, NULL, + gup_flags); + if ((nr_pages > 0) && migrate_allow) { drain_allow = true; goto check_again; @@ -1291,66 +1300,101 @@ check_again: return nr_pages; } #else -static inline long check_and_migrate_cma_pages(unsigned long start, long nr_pages, - unsigned int gup_flags, - struct page **pages, - struct vm_area_struct **vmas) +static long check_and_migrate_cma_pages(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { return nr_pages; } #endif /* - * This is the same as get_user_pages() in that it assumes we are - * operating on the current task's mm, but it goes further to validate - * that the vmas associated with the address range are suitable for - * longterm elevated page reference counts. For example, filesystem-dax - * mappings are subject to the lifetime enforced by the filesystem and - * we need guarantees that longterm users like RDMA and V4L2 only - * establish mappings that have a kernel enforced revocation mechanism. - * - * "longterm" == userspace controlled elevated page count lifetime. 
- * Contrast this to iov_iter_get_pages() usages which are transient. + * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which + * allows us to process the FOLL_LONGTERM flag. */ -long get_user_pages_longterm(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas_arg) +static long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int gup_flags) { - struct vm_area_struct **vmas = vmas_arg; - unsigned long flags; + struct vm_area_struct **vmas_tmp = vmas; + unsigned long flags = 0; long rc, i; - if (!pages) - return -EINVAL; - - if (!vmas) { - vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!vmas) - return -ENOMEM; + if (gup_flags & FOLL_LONGTERM) { + if (!pages) + return -EINVAL; + + if (!vmas_tmp) { + vmas_tmp = kcalloc(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!vmas_tmp) + return -ENOMEM; + } + flags = memalloc_nocma_save(); } - flags = memalloc_nocma_save(); - rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas); - memalloc_nocma_restore(flags); - if (rc < 0) - goto out; + rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages, + vmas_tmp, NULL, gup_flags); - if (check_dax_vmas(vmas, rc)) { - for (i = 0; i < rc; i++) - put_page(pages[i]); - rc = -EOPNOTSUPP; - goto out; + if (gup_flags & FOLL_LONGTERM) { + memalloc_nocma_restore(flags); + if (rc < 0) + goto out; + + if (check_dax_vmas(vmas_tmp, rc)) { + for (i = 0; i < rc; i++) + put_page(pages[i]); + rc = -EOPNOTSUPP; + goto out; + } + + rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages, + vmas_tmp, gup_flags); } - rc = check_and_migrate_cma_pages(start, rc, gup_flags, pages, vmas); out: - if (vmas != vmas_arg) - kfree(vmas); + if (vmas_tmp != vmas) + kfree(vmas_tmp); return rc; } -EXPORT_SYMBOL(get_user_pages_longterm); -#endif /* CONFIG_FS_DAX */ +#else /* !CONFIG_FS_DAX && !CONFIG_CMA */ +static __always_inline long __gup_longterm_locked(struct task_struct *tsk, + struct mm_struct *mm, + unsigned long start, + unsigned long nr_pages, + struct page **pages, + struct vm_area_struct **vmas, + unsigned int flags) +{ + return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, + NULL, flags); +} +#endif /* CONFIG_FS_DAX || CONFIG_CMA */ + +/* + * This is the same as get_user_pages_remote(), just with a + * less-flexible calling convention where we assume that the task + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. + */ +long get_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + return __gup_longterm_locked(current, current->mm, start, nr_pages, + pages, vmas, gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(get_user_pages); /** * populate_vma_page_range() - populate a range of pages in the vma. 
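The call-site hunks in this patch all follow the same conversion. As a minimal sketch (hypothetical driver code; uaddr, nr_pages and pages are illustrative names, not taken from any hunk), a long-lived pin that used to go through the dedicated helper is now requested with a gup flag:

	/* before: dedicated helper for indefinitely-held pins */
	npages = get_user_pages_longterm(uaddr, nr_pages, FOLL_WRITE, pages, NULL);

	/* after: plain GUP, with the lifetime expressed as FOLL_LONGTERM */
	npages = get_user_pages(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages, NULL);

With this patch, get_user_pages() funnels into __gup_longterm_locked(), which performs the FS DAX check and the CMA migration pass only when FOLL_LONGTERM is set, so callers that do not pass the flag see no behavioural change.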
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c index 6c0279e70cc4..7dd602d7f8db 100644 --- a/mm/gup_benchmark.c +++ b/mm/gup_benchmark.c @@ -54,8 +54,9 @@ static int __gup_benchmark_ioctl(unsigned int cmd, pages + i); break; case GUP_LONGTERM_BENCHMARK: - nr = get_user_pages_longterm(addr, nr, gup->flags & 1, - pages + i, NULL); + nr = get_user_pages(addr, nr, + (gup->flags & 1) | FOLL_LONGTERM, + pages + i, NULL); break; case GUP_BENCHMARK: nr = get_user_pages(addr, nr, gup->flags & 1, pages + i, diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 989e52386c35..2b18223e7eb8 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -253,8 +253,8 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(¤t->mm->mmap_sem); - npgs = get_user_pages_longterm(umem->address, umem->npgs, - gup_flags, &umem->pgs[0], NULL); + npgs = get_user_pages(umem->address, umem->npgs, + gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(¤t->mm->mmap_sem); if (npgs != umem->npgs) { -- cgit v1.2.3 From b798bec4741bdd80224214fdd004c8e52698e425 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:07 -0700 Subject: mm/gup: change write parameter to flags in fast walk In order to support more options in the GUP fast walk, change the write parameter to flags throughout the call stack. This patch does not change functionality and passes FOLL_WRITE where write was previously used. Link: http://lkml.kernel.org/r/20190328084422.29911-3-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-3-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Dan Williams Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 25381102e21e..113c18a98cf5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1615,7 +1615,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { struct dev_pagemap *pgmap = NULL; int nr_start = *nr, ret = 0; @@ -1633,7 +1633,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (pte_protnone(pte)) goto pte_unmap; - if (!pte_access_permitted(pte, write)) + if (!pte_access_permitted(pte, flags & FOLL_WRITE)) goto pte_unmap; if (pte_devmap(pte)) { @@ -1685,7 +1685,7 @@ pte_unmap: * useful to have gup_huge_pmd even if we can't operate on ptes. 
*/ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { return 0; } @@ -1768,12 +1768,12 @@ static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr, #endif static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (!pmd_access_permitted(orig, write)) + if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pmd_devmap(orig)) @@ -1806,12 +1806,12 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) + unsigned long end, unsigned int flags, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (!pud_access_permitted(orig, write)) + if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; if (pud_devmap(orig)) @@ -1844,13 +1844,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, } static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, - unsigned long end, int write, + unsigned long end, unsigned int flags, struct page **pages, int *nr) { int refs; struct page *head, *page; - if (!pgd_access_permitted(orig, write)) + if (!pgd_access_permitted(orig, flags & FOLL_WRITE)) return 0; BUILD_BUG_ON(pgd_devmap(orig)); @@ -1881,7 +1881,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, } static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; @@ -1904,7 +1904,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, if (pmd_protnone(pmd)) return 0; - if (!gup_huge_pmd(pmd, pmdp, addr, next, write, + if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, pages, nr)) return 0; @@ -1914,9 +1914,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * pmd format and THP pmd format */ if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr, - PMD_SHIFT, next, write, pages, nr)) + PMD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr)) return 0; } while (pmdp++, addr = next, addr != end); @@ -1924,7 +1924,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, } static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; @@ -1937,14 +1937,14 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, if (pud_none(pud)) return 0; if (unlikely(pud_huge(pud))) { - if (!gup_huge_pud(pud, pudp, addr, next, write, + if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) { if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, - PUD_SHIFT, next, write, pages, nr)) + PUD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); @@ -1952,7 +1952,7 @@ 
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, } static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; @@ -1967,9 +1967,9 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, BUILD_BUG_ON(p4d_huge(p4d)); if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) { if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, - P4D_SHIFT, next, write, pages, nr)) + P4D_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -1977,7 +1977,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, } static void gup_pgd_range(unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) + unsigned int flags, struct page **pages, int *nr) { unsigned long next; pgd_t *pgdp; @@ -1990,14 +1990,14 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, if (pgd_none(pgd)) return; if (unlikely(pgd_huge(pgd))) { - if (!gup_huge_pgd(pgd, pgdp, addr, next, write, + if (!gup_huge_pgd(pgd, pgdp, addr, next, flags, pages, nr)) return; } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) { if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, - PGDIR_SHIFT, next, write, pages, nr)) + PGDIR_SHIFT, next, flags, pages, nr)) return; - } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr)) + } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -2051,7 +2051,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_save(flags); - gup_pgd_range(start, end, write, pages, &nr); + gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); } @@ -2093,7 +2093,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); - gup_pgd_range(addr, end, write, pages, &nr); + gup_pgd_range(addr, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_enable(); ret = nr; } -- cgit v1.2.3 From 73b0140bf0fe9df90fb267c00673c4b9bf285430 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:11 -0700 Subject: mm/gup: change GUP fast to use flags rather than a write 'bool' To facilitate additional options to get_user_pages_fast() change the singular write parameter to be gup_flags. This patch does not change any functionality. New functionality will follow in subsequent patches. Some of the get_user_pages_fast() call sites were unchanged because they already passed FOLL_WRITE or 0 for the write parameter. NOTE: It was suggested to change the ordering of the get_user_pages_fast() arguments to ensure that callers were converted. This breaks the current GUP call site convention of having the returned pages be the final parameter. So the suggestion was rejected. Link: http://lkml.kernel.org/r/20190328084422.29911-4-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-4-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Mike Marshall Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 11 ++++++----- arch/powerpc/kvm/book3s_64_mmu_hv.c | 4 ++-- arch/powerpc/kvm/e500_mmu.c | 2 +- arch/s390/kvm/interrupt.c | 2 +- arch/sh/mm/gup.c | 11 ++++++----- arch/sparc/mm/gup.c | 9 +++++---- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/svm.c | 2 +- drivers/fpga/dfl-afu-dma-region.c | 2 +- drivers/gpu/drm/via/via_dmablit.c | 3 ++- drivers/infiniband/hw/hfi1/user_pages.c | 3 ++- drivers/misc/genwqe/card_utils.c | 2 +- drivers/misc/vmw_vmci/vmci_host.c | 2 +- drivers/misc/vmw_vmci/vmci_queue_pair.c | 6 ++++-- drivers/platform/goldfish/goldfish_pipe.c | 3 ++- drivers/rapidio/devices/rio_mport_cdev.c | 4 +++- drivers/sbus/char/oradax.c | 2 +- drivers/scsi/st.c | 3 ++- drivers/staging/gasket/gasket_page_table.c | 4 ++-- drivers/tee/tee_shm.c | 2 +- drivers/vfio/vfio_iommu_spapr_tce.c | 3 ++- drivers/vhost/vhost.c | 2 +- drivers/video/fbdev/pvr2fb.c | 2 +- drivers/virt/fsl_hypervisor.c | 2 +- drivers/xen/gntdev.c | 2 +- fs/orangefs/orangefs-bufmap.c | 2 +- include/linux/mm.h | 4 ++-- kernel/futex.c | 2 +- lib/iov_iter.c | 7 +++++-- mm/gup.c | 10 +++++----- mm/util.c | 8 ++++---- net/ceph/pagevec.c | 2 +- net/rds/info.c | 2 +- net/rds/rdma.c | 3 ++- 34 files changed, 73 insertions(+), 57 deletions(-) diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 0d14e0d8eacf..4c2b4483683c 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -289,7 +290,7 @@ slow_irqon: pages += nr; ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, - pages, write ? 
FOLL_WRITE : 0); + pages, gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index be7bc070eae5..ab3d484c5e2e 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -600,7 +600,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* If writing != 0, then the HPTE must allow writing, if we get here */ write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, writing, pages); + npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); if (npages < 1) { /* Check if it's an I/O mapping */ down_read(¤t->mm->mmap_sem); @@ -1193,7 +1193,7 @@ void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) goto err; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, 1, pages); + npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages); if (npages < 1) goto err; page = pages[0]; diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 24296f4cadc6..e0af53fd78c5 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -783,7 +783,7 @@ int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, if (!pages) return -ENOMEM; - ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages); if (ret < 0) goto free_pages; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 37503ae62486..1fd706f6206c 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2376,7 +2376,7 @@ static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) ret = -EFAULT; goto out; } - ret = get_user_pages_fast(map->addr, 1, 1, &map->page); + ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); if (ret < 0) goto out; BUG_ON(ret != 1); diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index 3e27f6d1f1ec..277c882f7489 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -204,7 +204,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -216,8 +216,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -241,7 +241,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); @@ -261,7 +262,7 @@ slow_irqon: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? 
FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index aee6dba83d0e..1e770a517d4a 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -245,8 +245,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { struct mm_struct *mm = current->mm; unsigned long addr, len, end; @@ -303,7 +303,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE, + pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); @@ -324,7 +325,7 @@ slow: ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bdca39829bc..08715034e315 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -140,7 +140,7 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t *table; struct page *page; - npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); /* Check if the user is doing something meaningless. */ if (unlikely(npages != 1)) return -EFAULT; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 406b558abfef..6b92eaf4a3b1 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1805,7 +1805,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, return NULL; /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); goto err; diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c index e18a786fc943..c438722bf4e1 100644 --- a/drivers/fpga/dfl-afu-dma-region.c +++ b/drivers/fpga/dfl-afu-dma-region.c @@ -102,7 +102,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata, goto unlock_vm; } - pinned = get_user_pages_fast(region->user_addr, npages, 1, + pinned = get_user_pages_fast(region->user_addr, npages, FOLL_WRITE, region->pages); if (pinned < 0) { ret = pinned; diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index 8bf3a7c23ed3..062067438f1d 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -243,7 +243,8 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg, drm_via_dmablit_t *xfer) if (NULL == vsg->pages) return -ENOMEM; ret = get_user_pages_fast((unsigned long)xfer->mem_addr, - vsg->num_pages, vsg->direction == DMA_FROM_DEVICE, + vsg->num_pages, + vsg->direction == DMA_FROM_DEVICE ? 
FOLL_WRITE : 0, vsg->pages); if (ret != vsg->num_pages) { if (ret < 0) diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 24b592c6522e..78ccacaf97d0 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -105,7 +105,8 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np { int ret; - ret = get_user_pages_fast(vaddr, npages, writable, pages); + ret = get_user_pages_fast(vaddr, npages, writable ? FOLL_WRITE : 0, + pages); if (ret < 0) return ret; diff --git a/drivers/misc/genwqe/card_utils.c b/drivers/misc/genwqe/card_utils.c index 25265fd0fd6e..89cff9d1012b 100644 --- a/drivers/misc/genwqe/card_utils.c +++ b/drivers/misc/genwqe/card_utils.c @@ -603,7 +603,7 @@ int genwqe_user_vmap(struct genwqe_dev *cd, struct dma_mapping *m, void *uaddr, /* pin user pages in memory */ rc = get_user_pages_fast(data & PAGE_MASK, /* page aligned addr */ m->nr_pages, - m->write, /* readable/writable */ + m->write ? FOLL_WRITE : 0, /* readable/writable */ m->page_list); /* ptrs to pages */ if (rc < 0) goto fail_get_user_pages; diff --git a/drivers/misc/vmw_vmci/vmci_host.c b/drivers/misc/vmw_vmci/vmci_host.c index 997f92543dd4..422d08da3244 100644 --- a/drivers/misc/vmw_vmci/vmci_host.c +++ b/drivers/misc/vmw_vmci/vmci_host.c @@ -242,7 +242,7 @@ static int vmci_host_setup_notify(struct vmci_ctx *context, /* * Lock physical page backing a given user VA. */ - retval = get_user_pages_fast(uva, 1, 1, &context->notify_page); + retval = get_user_pages_fast(uva, 1, FOLL_WRITE, &context->notify_page); if (retval != 1) { context->notify_page = NULL; return VMCI_ERROR_GENERIC; diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c index f5f1aac9d163..1174735f003d 100644 --- a/drivers/misc/vmw_vmci/vmci_queue_pair.c +++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c @@ -659,7 +659,8 @@ static int qp_host_get_user_memory(u64 produce_uva, int err = VMCI_SUCCESS; retval = get_user_pages_fast((uintptr_t) produce_uva, - produce_q->kernel_if->num_pages, 1, + produce_q->kernel_if->num_pages, + FOLL_WRITE, produce_q->kernel_if->u.h.header_page); if (retval < (int)produce_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(produce) failed (retval=%d)", @@ -671,7 +672,8 @@ static int qp_host_get_user_memory(u64 produce_uva, } retval = get_user_pages_fast((uintptr_t) consume_uva, - consume_q->kernel_if->num_pages, 1, + consume_q->kernel_if->num_pages, + FOLL_WRITE, consume_q->kernel_if->u.h.header_page); if (retval < (int)consume_q->kernel_if->num_pages) { pr_debug("get_user_pages_fast(consume) failed (retval=%d)", diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 321bc673c417..cef0133aa47a 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -274,7 +274,8 @@ static int pin_user_pages(unsigned long first_page, *iter_last_page_size = last_page_size; } - ret = get_user_pages_fast(first_page, requested_pages, !is_write, + ret = get_user_pages_fast(first_page, requested_pages, + !is_write ? 
FOLL_WRITE : 0, pages); if (ret <= 0) return -EFAULT; diff --git a/drivers/rapidio/devices/rio_mport_cdev.c b/drivers/rapidio/devices/rio_mport_cdev.c index 1e1f42e210a0..4a4a75fa26d5 100644 --- a/drivers/rapidio/devices/rio_mport_cdev.c +++ b/drivers/rapidio/devices/rio_mport_cdev.c @@ -868,7 +868,9 @@ rio_dma_transfer(struct file *filp, u32 transfer_mode, pinned = get_user_pages_fast( (unsigned long)xfer->loc_addr & PAGE_MASK, - nr_pages, dir == DMA_FROM_DEVICE, page_list); + nr_pages, + dir == DMA_FROM_DEVICE ? FOLL_WRITE : 0, + page_list); if (pinned != nr_pages) { if (pinned < 0) { diff --git a/drivers/sbus/char/oradax.c b/drivers/sbus/char/oradax.c index acd9ba40eabe..8090dc9a1514 100644 --- a/drivers/sbus/char/oradax.c +++ b/drivers/sbus/char/oradax.c @@ -437,7 +437,7 @@ static int dax_lock_page(void *va, struct page **p) dax_dbg("uva %p", va); - ret = get_user_pages_fast((unsigned long)va, 1, 1, p); + ret = get_user_pages_fast((unsigned long)va, 1, FOLL_WRITE, p); if (ret == 1) { dax_dbg("locked page %p, for VA %p", *p, va); return 0; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 19c022e66d63..3c6a18ad9a87 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -4922,7 +4922,8 @@ static int sgl_map_user_pages(struct st_buffer *STbp, /* Try to fault in all of the necessary pages */ /* rw==READ means read from drive, write into memory area */ - res = get_user_pages_fast(uaddr, nr_pages, rw == READ, pages); + res = get_user_pages_fast(uaddr, nr_pages, rw == READ ? FOLL_WRITE : 0, + pages); /* Errors and no page mapped should return here */ if (res < nr_pages) diff --git a/drivers/staging/gasket/gasket_page_table.c b/drivers/staging/gasket/gasket_page_table.c index 600928f63577..d35c4fb19e28 100644 --- a/drivers/staging/gasket/gasket_page_table.c +++ b/drivers/staging/gasket/gasket_page_table.c @@ -486,8 +486,8 @@ static int gasket_perform_mapping(struct gasket_page_table *pg_tbl, ptes[i].dma_addr = pg_tbl->coherent_pages[0].paddr + off + i * PAGE_SIZE; } else { - ret = get_user_pages_fast(page_addr - offset, 1, 1, - &page); + ret = get_user_pages_fast(page_addr - offset, 1, + FOLL_WRITE, &page); if (ret <= 0) { dev_err(pg_tbl->device, diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c index 0b9ab1d0dd45..49fd7312e2aa 100644 --- a/drivers/tee/tee_shm.c +++ b/drivers/tee/tee_shm.c @@ -273,7 +273,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr, goto err; } - rc = get_user_pages_fast(start, num_pages, 1, shm->pages); + rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, shm->pages); if (rc > 0) shm->num_pages = rc; if (rc != num_pages) { diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 6b64e45a5269..40ddc0c5f677 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -532,7 +532,8 @@ static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa) enum dma_data_direction direction = iommu_tce_direction(tce); if (get_user_pages_fast(tce & PAGE_MASK, 1, - direction != DMA_TO_DEVICE, &page) != 1) + direction != DMA_TO_DEVICE ? 
FOLL_WRITE : 0, + &page) != 1) return -EFAULT; *hpa = __pa((unsigned long) page_address(page)); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 351af88231ad..1e3ed41ae1f3 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1704,7 +1704,7 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, 1, &page); + r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); diff --git a/drivers/video/fbdev/pvr2fb.c b/drivers/video/fbdev/pvr2fb.c index dfed532ed606..4e4d6a0df978 100644 --- a/drivers/video/fbdev/pvr2fb.c +++ b/drivers/video/fbdev/pvr2fb.c @@ -686,7 +686,7 @@ static ssize_t pvr2fb_write(struct fb_info *info, const char *buf, if (!pages) return -ENOMEM; - ret = get_user_pages_fast((unsigned long)buf, nr_pages, true, pages); + ret = get_user_pages_fast((unsigned long)buf, nr_pages, FOLL_WRITE, pages); if (ret < nr_pages) { nr_pages = ret; ret = -EINVAL; diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 8ba726e600e9..6446bcab4185 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -244,7 +244,7 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) /* Get the physical addresses of the source buffer */ num_pinned = get_user_pages_fast(param.local_vaddr - lb_offset, - num_pages, param.source != -1, pages); + num_pages, param.source != -1 ? FOLL_WRITE : 0, pages); if (num_pinned != num_pages) { /* get_user_pages() failed */ diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 7cf9c51318aa..02bc815982d4 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -852,7 +852,7 @@ static int gntdev_get_page(struct gntdev_copy_batch *batch, void __user *virt, unsigned long xen_pfn; int ret; - ret = get_user_pages_fast(addr, 1, writeable, &page); + ret = get_user_pages_fast(addr, 1, writeable ? FOLL_WRITE : 0, &page); if (ret < 0) return ret; diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index d4811f981608..2bb916d68576 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -269,7 +269,7 @@ orangefs_bufmap_map(struct orangefs_bufmap *bufmap, /* map the pages */ ret = get_user_pages_fast((unsigned long)user_desc->ptr, - bufmap->page_count, 1, bufmap->page_array); + bufmap->page_count, FOLL_WRITE, bufmap->page_array); if (ret < 0) return ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index 8bc677ce8f01..c3c73b3c9adc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1505,8 +1505,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); /* Container for pinned pfns / pages */ struct frame_vector { diff --git a/kernel/futex.c b/kernel/futex.c index 6262f1534ac9..2268b97d5439 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -543,7 +543,7 @@ again: if (unlikely(should_fail_futex(fshared))) return -EFAULT; - err = get_user_pages_fast(address, 1, 1, &page); + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); /* * If write access is not required (eg. FUTEX_WAIT), try * and get read-only access. 
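Taken together, the get_user_pages_fast() caller updates in this patch reduce to three cases. An illustrative summary (variable names are hypothetical, not lifted from any one hunk):

	/* pages that will definitely be written: 1 becomes FOLL_WRITE */
	n = get_user_pages_fast(addr, nr_pages, FOLL_WRITE, pages);

	/* read-only pins already passed 0 and need no change */
	n = get_user_pages_fast(addr, nr_pages, 0, pages);

	/* a runtime write decision is translated at the call site */
	n = get_user_pages_fast(addr, nr_pages, writable ? FOLL_WRITE : 0, pages);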
diff --git a/lib/iov_iter.c b/lib/iov_iter.c index b396d328a764..f74fa832f3aa 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1293,7 +1293,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, len = maxpages * PAGE_SIZE; addr &= ~(PAGE_SIZE - 1); n = DIV_ROUND_UP(len, PAGE_SIZE); - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, pages); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, + pages); if (unlikely(res < 0)) return res; return (res == n ? len : res * PAGE_SIZE) - *start; @@ -1374,7 +1376,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, p = get_pages_array(n); if (!p) return -ENOMEM; - res = get_user_pages_fast(addr, n, iov_iter_rw(i) != WRITE, p); + res = get_user_pages_fast(addr, n, + iov_iter_rw(i) != WRITE ? FOLL_WRITE : 0, p); if (unlikely(res < 0)) { kvfree(p); return res; diff --git a/mm/gup.c b/mm/gup.c index 113c18a98cf5..3dde6a8da670 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2062,7 +2062,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -2074,8 +2074,8 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; @@ -2093,7 +2093,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, if (gup_fast_permitted(start, nr_pages)) { local_irq_disable(); - gup_pgd_range(addr, end, write ? FOLL_WRITE : 0, pages, &nr); + gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); ret = nr; } @@ -2104,7 +2104,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, pages += nr; ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); + gup_flags); /* Have to be a bit careful with return values */ if (nr > 0) { diff --git a/mm/util.c b/mm/util.c index 43a2984bccaa..05a464929b3e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -318,7 +318,7 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * get_user_pages_fast() - pin user pages in memory * @start: starting user address * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to + * @gup_flags: flags modifying pin behaviour * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * @@ -339,10 +339,10 @@ EXPORT_SYMBOL_GPL(__get_user_pages_fast); * were pinned, returns -errno. */ int __weak get_user_pages_fast(unsigned long start, - int nr_pages, int write, struct page **pages) + int nr_pages, unsigned int gup_flags, + struct page **pages) { - return get_user_pages_unlocked(start, nr_pages, pages, - write ? 
FOLL_WRITE : 0); + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL_GPL(get_user_pages_fast); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index d3736f5bffec..74cafc0142ea 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -27,7 +27,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data, while (got < num_pages) { rc = get_user_pages_fast( (unsigned long)data + ((unsigned long)got * PAGE_SIZE), - num_pages - got, write_page, pages + got); + num_pages - got, write_page ? FOLL_WRITE : 0, pages + got); if (rc < 0) break; BUG_ON(rc == 0); diff --git a/net/rds/info.c b/net/rds/info.c index e367a97a18c8..03f6fd56d237 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -193,7 +193,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, 1, pages); + ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 182ab8430594..b340ed4fc43a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -158,7 +158,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, { int ret; - ret = get_user_pages_fast(user_addr, nr_pages, write, pages); + ret = get_user_pages_fast(user_addr, nr_pages, write ? FOLL_WRITE : 0, + pages); if (ret >= 0 && ret < nr_pages) { while (ret--) -- cgit v1.2.3 From 7af75561e17132b20b5bc047d222f34b3e7a3e6e Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:14 -0700 Subject: mm/gup: add FOLL_LONGTERM capability to GUP fast DAX pages were previously unprotected from longterm pins when users called get_user_pages_fast(). Use the new FOLL_LONGTERM flag to check for DEVMAP pages and fall back to regular GUP processing if a DEVMAP page is encountered. [ira.weiny@intel.com: v3] Link: http://lkml.kernel.org/r/20190328084422.29911-5-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190328084422.29911-5-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-5-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 3dde6a8da670..8e0a0a3a2b2d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1637,6 +1637,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, goto pte_unmap; if (pte_devmap(pte)) { + if (unlikely(flags & FOLL_LONGTERM)) + goto pte_unmap; + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); if (unlikely(!pgmap)) { undo_dev_pagemap(nr, nr_start, pages); @@ -1776,8 +1779,11 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!pmd_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pmd_devmap(orig)) + if (pmd_devmap(orig)) { + if (unlikely(flags & FOLL_LONGTERM)) + return 0; return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr); + } refs = 0; page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1814,8 +1820,11 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, if (!pud_access_permitted(orig, flags & FOLL_WRITE)) return 0; - if (pud_devmap(orig)) + if (pud_devmap(orig)) { + if (unlikely(flags & FOLL_LONGTERM)) + return 0; return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr); + } refs = 0; page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); @@ -2058,6 +2067,29 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } +static int __gup_longterm_unlocked(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + int ret; + + /* + * FIXME: FOLL_LONGTERM does not work with + * get_user_pages_unlocked() (see comments in that function) + */ + if (gup_flags & FOLL_LONGTERM) { + down_read(¤t->mm->mmap_sem); + ret = __gup_longterm_locked(current, current->mm, + start, nr_pages, + pages, NULL, gup_flags); + up_read(¤t->mm->mmap_sem); + } else { + ret = get_user_pages_unlocked(start, nr_pages, + pages, gup_flags); + } + + return ret; +} + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -2103,8 +2135,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, start += nr << PAGE_SHIFT; pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - gup_flags); + ret = __gup_longterm_unlocked(start, nr_pages - nr, + gup_flags, pages); /* Have to be a bit careful with return values */ if (nr > 0) { -- cgit v1.2.3 From 9fdf4aa156733e3f075a9d7d0b026648b3874afe Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:18 -0700 Subject: IB/hfi1: use the new FOLL_LONGTERM flag to get_user_pages_fast() Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against FS DAX pages being mapped. [ira.weiny@intel.com: v3] Link: http://lkml.kernel.org/r/20190328084422.29911-6-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190328084422.29911-6-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-6-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/hfi1/user_pages.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 78ccacaf97d0..02eee8eff1db 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -104,9 +104,9 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np bool writable, struct page **pages) { int ret; + unsigned int gup_flags = FOLL_LONGTERM | (writable ? FOLL_WRITE : 0); - ret = get_user_pages_fast(vaddr, npages, writable ? FOLL_WRITE : 0, - pages); + ret = get_user_pages_fast(vaddr, npages, gup_flags, pages); if (ret < 0) return ret; -- cgit v1.2.3 From 664b21e717cfe4781137263f2555da335549210e Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:21 -0700 Subject: IB/qib: use the new FOLL_LONGTERM flag to get_user_pages_fast() Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against FS DAX pages being mapped. Link: http://lkml.kernel.org/r/20190328084422.29911-7-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-7-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Dan Williams Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/qib/qib_user_sdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index ef19d39a44b1..0c204776263f 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -670,7 +670,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, else j = npages; - ret = get_user_pages_fast(addr, j, 0, pages); + ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages); if (ret != j) { i = 0; j = ret; -- cgit v1.2.3 From f3b4fdb18cb51bd6ca2c245fbe630ccbea95b3c9 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Mon, 13 May 2019 17:17:25 -0700 Subject: IB/mthca: use the new FOLL_LONGTERM flag to get_user_pages_fast() Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against FS DAX pages being mapped. Link: http://lkml.kernel.org/r/20190328084422.29911-8-ira.weiny@intel.com Link: http://lkml.kernel.org/r/20190317183438.2057-8-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Dan Williams Cc: "David S. Miller" Cc: Heiko Carstens Cc: Ingo Molnar Cc: James Hogan Cc: Jason Gunthorpe Cc: John Hubbard Cc: "Kirill A. 
Shutemov" Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Rich Felker Cc: Thomas Gleixner Cc: Yoshinori Sato Cc: Mike Marshall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/mthca/mthca_memfree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 112d2f38e0de..8ff0e90d7564 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -472,7 +472,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, goto out; } - ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages); + ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, + FOLL_WRITE | FOLL_LONGTERM, pages); if (ret < 0) goto out; -- cgit v1.2.3 From 10eeadf3045c35fc83649ac586973eb28255add9 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 13 May 2019 17:17:29 -0700 Subject: mm,memory_hotplug: unlock 1GB-hugetlb on x86_64 On x86_64, 1GB-hugetlb pages could never be offlined due to the fact that hugepage_migration_supported() returned false for PUD_SHIFT. So whenever we wanted to offline a memblock containing a gigantic hugetlb page, we never got beyond has_unmovable_pages() check. This changed with [1], where now we also return true for PUD_SHIFT. After that patch, the check in has_unmovable_pages() and scan_movable_pages() returned true, but we still had a final barrier in do_migrate_range(): if (compound_order(head) > PFN_SECTION_SHIFT) { ret = -EBUSY; break; } This is not really nice, and we do not really need it. It is perfectly possible to migrate a gigantic page as long as another node has a spare gigantic page for us. In alloc_huge_page_nodemask(), we calculate the __real__ number of free pages, and if any, we try to dequeue one from another node. This all works fine when we do have another node with a spare gigantic page, but if that is not the case, alloc_huge_page_nodemask() ends up calling alloc_migrate_huge_page() which bails out if the wanted page is gigantic. That is mainly because finding a 1GB (or even 16GB on powerpc) contiguous memory is quite unlikely when the system has been running for a while. In that situation, we will keep looping forever because scan_movable_pages() will give us the same page and we will fail again because there is no node where we can dequeue a gigantic page from. This is not nice, and it has been raised that we might want to treat -ENOMEM as a fatal error in do_migrate_range(), but this has to be checked further. Anyway, I would tend say that this is the administrator's job, to make sure that the system can keep up with the memory to be offlined, so that would mean that if we want to use gigantic pages, make sure that the other nodes have at least enough gigantic pages to keep up in case we need to offline memory. 
Just for the sake of completeness, this is one of the tests done: # echo 1 > /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages # echo 1 > /sys/devices/system/node/node2/hugepages/hugepages-1048576kB/nr_hugepages # cat /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages 1 # cat /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/free_hugepages 1 # cat /sys/devices/system/node/node2/hugepages/hugepages-1048576kB/nr_hugepages 1 # cat /sys/devices/system/node/node2/hugepages/hugepages-1048576kB/free_hugepages 1 (hugetlb1gb is a program that maps 1GB region using MAP_HUGE_1GB) # numactl -m 1 ./hugetlb1gb # cat /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/free_hugepages 0 # cat /sys/devices/system/node/node2/hugepages/hugepages-1048576kB/free_hugepages 1 # offline node1 memory # cat /sys/devices/system/node/node2/hugepages/hugepages-1048576kB/free_hugepages 0 [1] https://lore.kernel.org/patchwork/patch/998796/ Link: http://lkml.kernel.org/r/20190320152658.10855-2-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b236069ff0d8..df41b467e020 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1382,10 +1382,6 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHuge(page)) { struct page *head = compound_head(page); - if (compound_order(head) > PFN_SECTION_SHIFT) { - ret = -EBUSY; - break; - } pfn = page_to_pfn(head) + (1< Date: Mon, 13 May 2019 17:17:32 -0700 Subject: mm,memory_hotplug: drop redundant hugepage_migration_supported check has_unmovable_pages() already checks whether the hugetlb page supports migration, so all non-migratable hugetlb pages should have been caught there. Let us drop the check from scan_movable_pages() as is redundant. Link: http://lkml.kernel.org/r/20190320152658.10855-3-osalvador@suse.de Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index df41b467e020..ed4e70c501e6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1341,8 +1341,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) if (!PageHuge(page)) continue; head = compound_head(page); - if (hugepage_migration_supported(page_hstate(head)) && - page_huge_active(head)) + if (page_huge_active(head)) return pfn; skip = (1 << compound_order(head)) - (page - head); pfn += skip - 1; -- cgit v1.2.3 From d3ba3ae19751e476b0840a0c9a673a5766fa3219 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 13 May 2019 17:17:35 -0700 Subject: mm/memory_hotplug.c: fix the wrong usage of N_HIGH_MEMORY In node_states_check_changes_online(), N_HIGH_MEMORY is used to substitute ZONE_HIGHMEM directly. This is not right. N_HIGH_MEMORY is to mark the memory state of node. Here zone index is checked, which should be compared with 'ZONE_HIGHMEM' accordingly. Replace it with ZONE_HIGHMEM. This is a code cleanup - no known runtime effects. 
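For illustration, a condensed sketch of why the two constants are not interchangeable (the enum bodies are abridged here, not the exact kernel definitions):

	/* enum zone_type indexes the zones within a node... */
	enum zone_type   { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, ZONE_MOVABLE };

	/* ...while enum node_states labels whole nodes in the global nodemasks */
	enum node_states { N_POSSIBLE, N_ONLINE, N_NORMAL_MEMORY, N_HIGH_MEMORY };

	/* zone_idx() yields a zone_type, so it is compared against ZONE_HIGHMEM;
	 * node_state() tests a node_states bit, where N_HIGH_MEMORY is the right name.
	 */
	if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
		arg->status_change_nid_high = nid;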
Link: http://lkml.kernel.org/r/20190320080732.14933-1-bhe@redhat.com Fixes: 8efe33f40f3e ("mm/memory_hotplug.c: simplify node_states_check_changes_online") Signed-off-by: Baoquan He Reviewed-by: David Hildenbrand Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Cc: Wei Yang Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ed4e70c501e6..a279671b9968 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -714,7 +714,7 @@ static void node_states_check_changes_online(unsigned long nr_pages, if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY)) arg->status_change_nid_normal = nid; #ifdef CONFIG_HIGHMEM - if (zone_idx(zone) <= N_HIGH_MEMORY && !node_state(nid, N_HIGH_MEMORY)) + if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY)) arg->status_change_nid_high = nid; #endif } -- cgit v1.2.3 From dd7ef7bd14640f11763b54f55131000165f48321 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Mon, 13 May 2019 17:17:38 -0700 Subject: mm/compaction.c: fix an undefined behaviour In a low-memory situation, cc->fast_search_fail can keep increasing as it is unable to find an available page to isolate in fast_isolate_freepages(). As the result, it could trigger an error below, so just compare with the maximum bits can be shifted first. UBSAN: Undefined behaviour in mm/compaction.c:1160:30 shift exponent 64 is too large for 64-bit type 'unsigned long' CPU: 131 PID: 1308 Comm: kcompactd1 Kdump: loaded Tainted: G W L 5.0.0+ #17 Call trace: dump_backtrace+0x0/0x450 show_stack+0x20/0x2c dump_stack+0xc8/0x14c __ubsan_handle_shift_out_of_bounds+0x7e8/0x8c4 compaction_alloc+0x2344/0x2484 unmap_and_move+0xdc/0x1dbc migrate_pages+0x274/0x1310 compact_zone+0x26ec/0x43bc kcompactd+0x15b8/0x1a24 kthread+0x374/0x390 ret_from_fork+0x10/0x18 [akpm@linux-foundation.org: code cleanup] Link: http://lkml.kernel.org/r/20190320203338.53367-1-cai@lca.pw Fixes: 70b44595eafe ("mm, compaction: use free lists to quickly locate a migration source") Signed-off-by: Qian Cai Acked-by: Vlastimil Babka Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3319e0872d01..6cc4bea33dcb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1164,7 +1164,9 @@ static bool suitable_migration_target(struct compact_control *cc, static inline unsigned int freelist_scan_limit(struct compact_control *cc) { - return (COMPACT_CLUSTER_MAX >> cc->fast_search_fail) + 1; + unsigned short shift = BITS_PER_LONG - 1; + + return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1; } /* -- cgit v1.2.3 From 2b59e01a3aa665f751d1410b99fae9336bd424e1 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Mon, 13 May 2019 17:17:41 -0700 Subject: mm/cma.c: fix the bitmap status to show failed allocation reason Currently one bit in cma bitmap represents number of pages rather than one page, cma->count means cma size in pages. So to find available pages via find_next_zero_bit()/find_next_bit() we should use cma size not in pages but in bits although current free pages number is correct due to zero value of order_per_bit. Once order_per_bit is changed the bitmap status will be incorrect. The size input in cma_debug_show_areas() is not correct. It will affect the available pages at some position to debug the failure issue. 
This is an example with order_per_bit = 1 Before this change: [ 4.120060] cma: number of available pages: 1@93+4@108+7@121+7@137+7@153+7@169+7@185+7@201+3@213+3@221+3@229+3@237+3@245+3@253+3@261+3@269+3@277+3@285+3@293+3@301+3@309+3@317+3@325+19@333+15@369+512@512=> 638 free of 1024 total pages After this change: [ 4.143234] cma: number of available pages: 2@93+8@108+14@121+14@137+14@153+14@169+14@185+14@201+6@213+6@221+6@229+6@237+6@245+6@253+6@261+6@269+6@277+6@285+6@293+6@301+6@309+6@317+6@325+38@333+30@369=> 252 free of 1024 total pages Obviously the bitmap status before is incorrect. Link: http://lkml.kernel.org/r/20190320060829.9144-1-zbestahu@gmail.com Signed-off-by: Yue Hu Reviewed-by: Andrew Morton Cc: Joonsoo Kim Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Mike Rapoport Cc: Randy Dunlap Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index bb2d333ffcb3..d72a02fb7759 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -367,23 +367,26 @@ err: #ifdef CONFIG_CMA_DEBUG static void cma_debug_show_areas(struct cma *cma) { - unsigned long next_zero_bit, next_set_bit; + unsigned long next_zero_bit, next_set_bit, nr_zero; unsigned long start = 0; - unsigned int nr_zero, nr_total = 0; + unsigned long nr_part, nr_total = 0; + unsigned long nbits = cma_bitmap_maxno(cma); mutex_lock(&cma->lock); pr_info("number of available pages: "); for (;;) { - next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start); - if (next_zero_bit >= cma->count) + next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start); + if (next_zero_bit >= nbits) break; - next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit); + next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit); nr_zero = next_set_bit - next_zero_bit; - pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit); - nr_total += nr_zero; + nr_part = nr_zero << cma->order_per_bit; + pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part, + next_zero_bit); + nr_total += nr_part; start = next_zero_bit + nr_zero; } - pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count); + pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count); mutex_unlock(&cma->lock); } #else -- cgit v1.2.3 From b1746b991d621e35151386224f455fd6c0d291f0 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:17:44 -0700 Subject: mm: compaction: show gfp flag names in try_to_compact_pages tracepoint Showing the gfp flag names instead of the gfp_mask makes trace more convenient. 
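A rough userspace sketch of the decoding that show_gfp_flags() performs; the table below holds only three flags and is an approximation, not the kernel's generated flag table:

    #include <stdio.h>

    struct flag_name { unsigned long bit; const char *name; };

    /* Small illustrative subset; bit values are assumptions for the demo. */
    static const struct flag_name names[] = {
        { 0x01, "__GFP_DMA" },
        { 0x02, "__GFP_HIGHMEM" },
        { 0x08, "__GFP_MOVABLE" },
    };

    static void show_flags(unsigned long mask)
    {
        const char *sep = "";

        for (size_t i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
            if (mask & names[i].bit) {
                printf("%s%s", sep, names[i].name);
                sep = "|";
            }
        }
    }

    int main(void)
    {
        unsigned long gfp_mask = 0x0a;

        printf("raw     : gfp_mask=0x%lx\n", gfp_mask);
        printf("decoded : gfp_mask=");
        show_flags(gfp_mask);
        printf("\n");
        return 0;
    }

The trace consumer then sees a name string such as "__GFP_HIGHMEM|__GFP_MOVABLE" rather than a bare hex value, which is the readability gain the patch is after.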
Link: http://lkml.kernel.org/r/1552527998-13162-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 6074eff3d766..e66afb81815a 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -189,9 +189,9 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, __entry->prio = prio; ), - TP_printk("order=%d gfp_mask=0x%x priority=%d", + TP_printk("order=%d gfp_mask=%s priority=%d", __entry->order, - __entry->gfp_mask, + show_gfp_flags(__entry->gfp_mask), __entry->prio) ); -- cgit v1.2.3 From b6cfab7ad19d4920d5765e53042cfa62f2fced3d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:17:47 -0700 Subject: mm, compaction: some tracepoints should be defined only when CONFIG_COMPACTION is set Only mm_compaction_isolate_{free, migrate}pages may be used when CONFIG_COMPACTION is not set. All others are used only when CONFIG_COMPACTION is set. After this change, if CONFIG_COMPACTION is not set, the tracepoints that only work when CONFIG_COMPACTION is set will not be exposed to userspace. Without this change, they will always be exposed in debugfs whether CONFIG_COMPACTION is set or not. This is an improvement. Link: http://lkml.kernel.org/r/1552440403-11780-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e66afb81815a..e5bf6ee4e814 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -64,6 +64,7 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); +#ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned long nr_all, @@ -132,7 +133,6 @@ TRACE_EVENT(mm_compaction_begin, __entry->sync ? "sync" : "async") ); -#ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_end, TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, unsigned long free_pfn, unsigned long zone_end, bool sync, @@ -166,7 +166,6 @@ TRACE_EVENT(mm_compaction_end, __entry->sync ? 
"sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); -#endif TRACE_EVENT(mm_compaction_try_to_compact_pages, @@ -195,7 +194,6 @@ TRACE_EVENT(mm_compaction_try_to_compact_pages, __entry->prio) ); -#ifdef CONFIG_COMPACTION DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, @@ -296,7 +294,6 @@ DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, TP_ARGS(zone, order) ); -#endif TRACE_EVENT(mm_compaction_kcompactd_sleep, @@ -352,6 +349,7 @@ DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, TP_ARGS(nid, order, classzone_idx) ); +#endif #endif /* _TRACE_COMPACTION_H */ -- cgit v1.2.3 From 5e65af19e89ac33dc83e1869c78b33ed7099469b Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 13 May 2019 17:17:50 -0700 Subject: mm/page_isolation.c: remove redundant pfn_valid_within() in __first_valid_page() pfn_valid_within() calls pfn_valid() when CONFIG_HOLES_IN_ZONE making it redundant for both definitions (w/wo CONFIG_MEMORY_HOTPLUG) of the helper pfn_to_online_page() which either calls pfn_valid() or pfn_valid_within(). pfn_valid_within() being 1 when !CONFIG_HOLES_IN_ZONE is irrelevant either way. This does not change functionality. Link: http://lkml.kernel.org/r/1553141595-26907-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Zi Yan Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_isolation.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 019280712e1b..e3638a5bafff 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -151,8 +151,6 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) for (i = 0; i < nr_pages; i++) { struct page *page; - if (!pfn_valid_within(pfn + i)) - continue; page = pfn_to_online_page(pfn + i); if (!page) continue; -- cgit v1.2.3 From 132bb8cfc9e081238e7e2fd0c37c8c75ad0d2963 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:17:53 -0700 Subject: mm/vmscan: add tracepoints for node reclaim The page alloc fast path it may perform node reclaim, which may cause a latency spike. We should add tracepoint for this event, and also measure the latency it causes. 
So bellow two tracepoints are introduced, mm_vmscan_node_reclaim_begin mm_vmscan_node_reclaim_end Link: http://lkml.kernel.org/r/1551421452-5385-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Souptick Joarder Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 32 ++++++++++++++++++++++++++++++++ mm/vmscan.c | 6 ++++++ 2 files changed, 38 insertions(+) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index cb2add69301a..b8b9d42944f9 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -469,6 +469,38 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low, __entry->ratio, show_reclaim_flags(__entry->reclaim_flags)) ); + +TRACE_EVENT(mm_vmscan_node_reclaim_begin, + + TP_PROTO(int nid, int order, gfp_t gfp_flags), + + TP_ARGS(nid, order, gfp_flags), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, order) + __field(gfp_t, gfp_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->order = order; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("nid=%d order=%d gfp_flags=%s", + __entry->nid, + __entry->order, + show_gfp_flags(__entry->gfp_flags)) +); + +DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_node_reclaim_end, + + TP_PROTO(unsigned long nr_reclaimed), + + TP_ARGS(nr_reclaimed) +); + #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 40ff747e0b33..39912c6b7181 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4111,6 +4111,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in .reclaim_idx = gfp_zone(gfp_mask), }; + trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order, + sc.gfp_mask); + cond_resched(); fs_reclaim_acquire(sc.gfp_mask); /* @@ -4137,6 +4140,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); + + trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed); + return sc.nr_reclaimed >= nr_pages; } -- cgit v1.2.3 From e0ee0e71078abbcadd4cbc38fb8570551fccc103 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:17:57 -0700 Subject: mm: memcontrol: track LRU counts in the vmstats array Patch series "mm: memcontrol: clean up the LRU counts tracking". The memcg LRU stats usage is currently a bit messy. Memcg has private per-zone counters because reclaim needs zone granularity sometimes, but we also have plenty of users that need to awkwardly sum them up to node or memcg granularity. Meanwhile the canonical per-memcg vmstats do not track the LRU counts (NR_INACTIVE_ANON etc.) as you'd expect. This series enables LRU count tracking in the per-memcg vmstats array such that lruvec_page_state() and memcg_page_state() work on the enum node_stat_item items for the LRU counters. Then it converts all the callers that don't specifically need per-zone numbers over to that. This patch (of 6): The memcg code currently maintains private per-zone breakdowns of the LRU counters. This is necessary for reclaim decisions which are still zone-based, but there are a variety of users of these counters that only want the aggregate per-lruvec or per-memcg LRU counts, and they need to painfully sum up the zone counters on each request for that. These would be better served using the memcg vmstats arrays, which track VM statistics at the desired scope already. 
They just don't have the LRU counts right now. So to kick off the conversion, begin tracking LRU counts in those. Link: http://lkml.kernel.org/r/20190228163020.24100-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Tejun Heo Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 04ec454d44ce..6f2fef7b0784 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -29,7 +29,7 @@ static __always_inline void __update_lru_size(struct lruvec *lruvec, { struct pglist_data *pgdat = lruvec_pgdat(lruvec); - __mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages); + __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } -- cgit v1.2.3 From 1a61ab8038e724a6d8aa59e7d4931a119483294d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:18:00 -0700 Subject: mm: memcontrol: replace zone summing with lruvec_page_state() Instead of adding up the zone counters, use lruvec_page_state() to get the node state directly. This is a bit cheaper and more stream-lined. Link: http://lkml.kernel.org/r/20190228163020.24100-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 ------------------ mm/memcontrol.c | 2 +- mm/vmscan.c | 2 +- 3 files changed, 2 insertions(+), 20 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b238403f95b2..65f381b27a2d 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -504,19 +504,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); -static inline -unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - struct mem_cgroup_per_node *mz; - unsigned long nr_pages = 0; - int zid; - - mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for (zid = 0; zid < MAX_NR_ZONES; zid++) - nr_pages += mz->lru_zone_size[zid][lru]; - return nr_pages; -} - static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -960,11 +947,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return true; } -static inline unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) -{ - return 0; -} static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 81a0d3914ec9..f30381481c45 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -737,7 +737,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - nr += mem_cgroup_get_lru_size(lruvec, lru); + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); } return nr; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 39912c6b7181..e869f9e25a3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -346,7 +346,7 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone int zid; if (!mem_cgroup_disabled()) - lru_size = mem_cgroup_get_lru_size(lruvec, lru); + lru_size = 
lruvec_page_state(lruvec, NR_LRU_BASE + lru); else lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); -- cgit v1.2.3 From 22796c844fcb85f3b289c0e698713b7fa4d9c178 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:18:03 -0700 Subject: mm: memcontrol: replace node summing with memcg_page_state() Instead of adding up the node counters, use memcg_page_state() to get the memcg state directly. This is a bit cheaper and more stream-lined. Link: http://lkml.kernel.org/r/20190228163020.24100-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f30381481c45..4000ea4ea0ec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -746,10 +746,13 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { unsigned long nr = 0; - int nid; + enum lru_list lru; - for_each_node_state(nid, N_MEMORY) - nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + } return nr; } -- cgit v1.2.3 From 2b487e59f00aaa885ebf9c47d44d09f3ef4df80e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:18:05 -0700 Subject: mm: memcontrol: push down mem_cgroup_node_nr_lru_pages() mem_cgroup_node_nr_lru_pages() is just a convenience wrapper around lruvec_page_state() that takes bitmasks of lru indexes and aggregates the counts for those. Replace callsites where the bitmask is simple enough with direct lruvec_page_state() calls. This removes the last extern user of mem_cgroup_node_nr_lru_pages(), so make that function private again, too. 
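A small userspace model of what the wrapper being pushed down actually does, and of the direct per-list reads that replace it at simple call sites; the counter values are made up and lruvec_page_state() is reduced to an array lookup:

    #include <stdio.h>

    enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
                    LRU_INACTIVE_FILE, LRU_ACTIVE_FILE, NR_LRU_LISTS };

    #define BIT(n)       (1UL << (n))
    #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))

    /* Stand-in for lruvec_page_state(lruvec, NR_LRU_BASE + lru). */
    static unsigned long lru_count[NR_LRU_LISTS] = { 10, 20, 30, 40 };

    /* The wrapper: walk every list and filter through a bitmask. */
    static unsigned long nr_lru_pages(unsigned long lru_mask)
    {
        unsigned long nr = 0;

        for (int lru = 0; lru < NR_LRU_LISTS; lru++)
            if (BIT(lru) & lru_mask)
                nr += lru_count[lru];
        return nr;
    }

    int main(void)
    {
        /* A simple mask can just read the two counters it cares about. */
        unsigned long direct = lru_count[LRU_INACTIVE_FILE] +
                               lru_count[LRU_ACTIVE_FILE];

        printf("via bitmask wrapper: %lu, via direct reads: %lu\n",
               nr_lru_pages(LRU_ALL_FILE), direct);
        return 0;
    }

Both paths return the same number; dropping the bitmask indirection at such call sites is what allows the wrapper to become private.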
Link: http://lkml.kernel.org/r/20190228163020.24100-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 10 ---------- mm/memcontrol.c | 10 +++++++--- mm/workingset.c | 5 +++-- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 65f381b27a2d..30561a954ee0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -501,9 +501,6 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); -unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask); - static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) @@ -954,13 +951,6 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, return 0; } -static inline unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - return 0; -} - static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4000ea4ea0ec..268a0bd83773 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -725,7 +725,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } -unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); @@ -1425,11 +1425,15 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, int nid, bool noswap) { - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + + if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) || + lruvec_page_state(lruvec, NR_ACTIVE_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) + if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) || + lruvec_page_state(lruvec, NR_ACTIVE_ANON)) return true; return false; diff --git a/mm/workingset.c b/mm/workingset.c index 0bedf67502d5..6419baebd306 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -426,10 +426,11 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, #ifdef CONFIG_MEMCG if (sc->memcg) { struct lruvec *lruvec; + int i; - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL); lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg); + for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) + pages += lruvec_page_state(lruvec, NR_LRU_BASE + i); pages += lruvec_page_state(lruvec, NR_SLAB_RECLAIMABLE); pages += lruvec_page_state(lruvec, NR_SLAB_UNRECLAIMABLE); } else -- cgit v1.2.3 From 21d89d151bb42bea1bcf0343f724ef62509d6161 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:18:08 -0700 Subject: mm: memcontrol: push down mem_cgroup_nr_lru_pages() mem_cgroup_nr_lru_pages() is just a convenience wrapper around memcg_page_state() that takes bitmasks of lru indexes and aggregates the counts for those. 
Replace callsites where the bitmask is simple enough with direct memcg_page_state() call(s). Link: http://lkml.kernel.org/r/20190228163020.24100-6-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 268a0bd83773..d29417b93a8b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1361,7 +1361,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) for (i = 0; i < NR_LRU_LISTS; i++) pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], - K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); + K(memcg_page_state(iter, NR_LRU_BASE + i))); pr_cont("\n"); } @@ -2997,8 +2997,8 @@ static void accumulate_memcg_tree(struct mem_cgroup *memcg, acc->events_array ? acc->events_array[i] : i); for (i = 0; i < NR_LRU_LISTS; i++) - acc->lru_pages[i] += - mem_cgroup_nr_lru_pages(mi, BIT(i)); + acc->lru_pages[i] += memcg_page_state(mi, + NR_LRU_BASE + i); } } @@ -3428,7 +3428,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) for (i = 0; i < NR_LRU_LISTS; i++) seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], - mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); + memcg_page_state(memcg, NR_LRU_BASE + i) * + PAGE_SIZE); /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; @@ -3934,8 +3935,8 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); - *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) | - (1 << LRU_ACTIVE_FILE)); + *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { -- cgit v1.2.3 From 113b7dfd827175977ea71cc4a29c1ac24acb9fce Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:18:11 -0700 Subject: mm: memcontrol: quarantine the mem_cgroup_[node_]nr_lru_pages() API Only memcg_numa_stat_show() uses those wrappers and the lru bitmasks, group them together. 
Link: http://lkml.kernel.org/r/20190228163020.24100-7-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Roman Gushchin Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 5 ---- mm/memcontrol.c | 67 +++++++++++++++++++++++++++----------------------- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fba7741533be..5a4aedc160bd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -247,11 +247,6 @@ struct lruvec { #endif }; -/* Mask used at gathering information at once (see memcontrol.c) */ -#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) -#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) -#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) - /* Isolate unmapped file */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d29417b93a8b..287933005e11 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -725,37 +725,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } -static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, - int nid, unsigned int lru_mask) -{ - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); - unsigned long nr = 0; - enum lru_list lru; - - VM_BUG_ON((unsigned)nid >= nr_node_ids); - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); - } - return nr; -} - -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, - unsigned int lru_mask) -{ - unsigned long nr = 0; - enum lru_list lru; - - for_each_lru(lru) { - if (!(BIT(lru) & lru_mask)) - continue; - nr += memcg_page_state(memcg, NR_LRU_BASE + lru); - } - return nr; -} - static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, enum mem_cgroup_events_target target) { @@ -3338,6 +3307,42 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, #endif #ifdef CONFIG_NUMA + +#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) +#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) +#define LRU_ALL ((1 << NR_LRU_LISTS) - 1) + +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, + int nid, unsigned int lru_mask) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); + unsigned long nr = 0; + enum lru_list lru; + + VM_BUG_ON((unsigned)nid >= nr_node_ids); + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru); + } + return nr; +} + +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, + unsigned int lru_mask) +{ + unsigned long nr = 0; + enum lru_list lru; + + for_each_lru(lru) { + if (!(BIT(lru) & lru_mask)) + continue; + nr += memcg_page_state(memcg, NR_LRU_BASE + lru); + } + return nr; +} + static int memcg_numa_stat_show(struct seq_file *m, void *v) { struct numa_stat { -- cgit v1.2.3 From 1df3a339074e31db95c4790ea9236874b13ccd87 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Mon, 13 May 2019 17:18:14 -0700 Subject: mm/cma.c: fix crash on CMA allocation if bitmap allocation fails f022d8cb7ec7 ("mm: cma: Don't crash on allocation if CMA area can't be activated") fixes the crash issue when activation fails via setting cma->count as 0, same logic exists if bitmap allocation fails. 
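A minimal sketch, under the assumption that the allocation path bails out early when the area size is zero (as cma_alloc() does), of why zeroing count on a failed bitmap allocation keeps later allocations away from a NULL bitmap; this is a userspace model, not the kernel code:

    #include <stdio.h>

    struct cma_area {
        unsigned long count;    /* size in pages; 0 marks the area unusable */
        unsigned long *bitmap;  /* NULL when activation failed */
    };

    static void *cma_area_alloc(struct cma_area *cma)
    {
        if (!cma || !cma->count)    /* early return: nothing to hand out */
            return NULL;
        /* ...a real allocator would scan cma->bitmap here... */
        return cma->bitmap;
    }

    int main(void)
    {
        struct cma_area failed = { .count = 0, .bitmap = NULL };

        /* With count forced to 0, the NULL bitmap is never dereferenced. */
        printf("allocation from failed area: %p\n", cma_area_alloc(&failed));
        return 0;
    }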
Link: http://lkml.kernel.org/r/20190325081309.6004-1-zbestahu@gmail.com Signed-off-by: Yue Hu Reviewed-by: Anshuman Khandual Cc: Joonsoo Kim Cc: Laura Abbott Cc: Mike Rapoport Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/cma.c b/mm/cma.c index d72a02fb7759..5e36d7418031 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -106,8 +106,10 @@ static int __init cma_activate_area(struct cma *cma) cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!cma->bitmap) + if (!cma->bitmap) { + cma->count = 0; return -ENOMEM; + } WARN_ON_ONCE(!pfn_valid(pfn)); zone = page_zone(pfn_to_page(pfn)); -- cgit v1.2.3 From 54c7a8916a887f357088f99e9c3a7720cd57d2c8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:17 -0700 Subject: initramfs: free initrd memory if opening /initrd.image fails Patch series "initramfs tidyups". I've spent some time chasing down behavior in initramfs and found plenty of opportunity to improve the code. A first stab on that is contained in this series. This patch (of 7): We free the initrd memory for all successful or error cases except for the case where opening /initrd.image fails, which looks like an oversight. Steven said: : This also changes the behaviour when CONFIG_INITRAMFS_FORCE is enabled : - specifically it means that the initrd is freed (previously it was : ignored and never freed). But that seems like reasonable behaviour and : the previous behaviour looks like another oversight. Link: http://lkml.kernel.org/r/20190213174621.29297-3-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Steven Price Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Alexander Viro Cc: Russell King Cc: Will Deacon Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 4749e1115eef..c322e1099f43 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -612,13 +612,12 @@ static int __init populate_rootfs(void) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); - if (!err) { - free_initrd(); + if (!err) goto done; - } else { - clean_rootfs(); - unpack_to_rootfs(__initramfs_start, __initramfs_size); - } + + clean_rootfs(); + unpack_to_rootfs(__initramfs_start, __initramfs_size); + printk(KERN_INFO "rootfs image is not initramfs (%s)" "; looks like an initrd\n", err); fd = ksys_open("/initrd.image", @@ -632,7 +631,6 @@ static int __init populate_rootfs(void) written, initrd_end - initrd_start); ksys_close(fd); - free_initrd(); } done: /* empty statement */; @@ -642,9 +640,9 @@ static int __init populate_rootfs(void) initrd_end - initrd_start); if (err) printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); - free_initrd(); #endif } + free_initrd(); flush_delayed_fput(); return 0; } -- cgit v1.2.3 From 23091e287355440fb680868c23bcada594d3f399 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:21 -0700 Subject: initramfs: cleanup initrd freeing Factor the kexec logic into a separate helper, and then inline the rest of free_initrd into the only caller. 
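The interval arithmetic the new helper performs can be sketched in userspace as follows; the addresses are invented and only illustrate which pieces of the initrd get released when it overlaps the crashkernel window:

    #include <stdio.h>

    int main(void)
    {
        unsigned long initrd_start = 0x1000, initrd_end = 0x9000;
        unsigned long crashk_start = 0x4000, crashk_end = 0x6000;

        if (initrd_start >= crashk_end || initrd_end <= crashk_start) {
            printf("no overlap: free [%#lx, %#lx)\n", initrd_start, initrd_end);
            return 0;
        }

        /* Overlap: free only the parts outside the crashkernel region. */
        if (initrd_start < crashk_start)
            printf("free low part : [%#lx, %#lx)\n", initrd_start, crashk_start);
        if (initrd_end > crashk_end)
            printf("free high part: [%#lx, %#lx)\n", crashk_end, initrd_end);
        return 0;
    }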
Link: http://lkml.kernel.org/r/20190213174621.29297-4-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 53 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index c322e1099f43..5fda9557a134 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -518,37 +518,35 @@ extern unsigned long __initramfs_size; #include #include -static void __init free_initrd(void) -{ #ifdef CONFIG_KEXEC_CORE +static bool kexec_free_initrd(void) +{ unsigned long crashk_start = (unsigned long)__va(crashk_res.start); unsigned long crashk_end = (unsigned long)__va(crashk_res.end); -#endif - if (do_retain_initrd) - goto skip; -#ifdef CONFIG_KEXEC_CORE /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ - if (initrd_start < crashk_end && initrd_end > crashk_start) { - /* - * Initialize initrd memory region since the kexec boot does - * not do. - */ - memset((void *)initrd_start, 0, initrd_end - initrd_start); - if (initrd_start < crashk_start) - free_initrd_mem(initrd_start, crashk_start); - if (initrd_end > crashk_end) - free_initrd_mem(crashk_end, initrd_end); - } else -#endif - free_initrd_mem(initrd_start, initrd_end); -skip: - initrd_start = 0; - initrd_end = 0; + if (initrd_start >= crashk_end || initrd_end <= crashk_start) + return false; + + /* + * Initialize initrd memory region since the kexec boot does not do. + */ + memset((void *)initrd_start, 0, initrd_end - initrd_start); + if (initrd_start < crashk_start) + free_initrd_mem(initrd_start, crashk_start); + if (initrd_end > crashk_end) + free_initrd_mem(crashk_end, initrd_end); + return true; } +#else +static inline bool kexec_free_initrd(void) +{ + return false; +} +#endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_BLK_DEV_RAM #define BUF_SIZE 1024 @@ -642,7 +640,16 @@ static int __init populate_rootfs(void) printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); #endif } - free_initrd(); + + /* + * If the initrd region is overlapped with crashkernel reserved region, + * free only memory that is not part of crashkernel region. + */ + if (!do_retain_initrd && !kexec_free_initrd()) + free_initrd_mem(initrd_start, initrd_end); + initrd_start = 0; + initrd_end = 0; + flush_delayed_fput(); return 0; } -- cgit v1.2.3 From 7c184ecd262fe64fe8cf4e099e0f7cefe88d88b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:24 -0700 Subject: initramfs: factor out a helper to populate the initrd image This will allow for cleaner code sharing in the caller. 
Link: http://lkml.kernel.org/r/20190213174621.29297-5-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index 5fda9557a134..e3de626dbd98 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -597,6 +597,28 @@ static void __init clean_rootfs(void) } #endif +#ifdef CONFIG_BLK_DEV_RAM +static void populate_initrd_image(char *err) +{ + ssize_t written; + int fd; + + unpack_to_rootfs(__initramfs_start, __initramfs_size); + + printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", + err); + fd = ksys_open("/initrd.image", O_WRONLY | O_CREAT, 0700); + if (fd < 0) + return; + + written = xwrite(fd, (char *)initrd_start, initrd_end - initrd_start); + if (written != initrd_end - initrd_start) + pr_err("/initrd.image: incomplete write (%zd != %ld)\n", + written, initrd_end - initrd_start); + ksys_close(fd); +} +#endif /* CONFIG_BLK_DEV_RAM */ + static int __init populate_rootfs(void) { /* Load the built in initramfs */ @@ -606,7 +628,6 @@ static int __init populate_rootfs(void) /* If available load the bootloader supplied initrd */ if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { #ifdef CONFIG_BLK_DEV_RAM - int fd; printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); @@ -614,22 +635,7 @@ static int __init populate_rootfs(void) goto done; clean_rootfs(); - unpack_to_rootfs(__initramfs_start, __initramfs_size); - - printk(KERN_INFO "rootfs image is not initramfs (%s)" - "; looks like an initrd\n", err); - fd = ksys_open("/initrd.image", - O_WRONLY|O_CREAT, 0700); - if (fd >= 0) { - ssize_t written = xwrite(fd, (char *)initrd_start, - initrd_end - initrd_start); - - if (written != initrd_end - initrd_start) - pr_err("/initrd.image: incomplete write (%zd != %ld)\n", - written, initrd_end - initrd_start); - - ksys_close(fd); - } + populate_initrd_image(err); done: /* empty statement */; #else -- cgit v1.2.3 From afef7889c480ed134247f16c2ebdeabd75e77fd0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:27 -0700 Subject: initramfs: cleanup populate_rootfs The code for kernels that support ramdisks or not is mostly the same. Unify it by using an IS_ENABLED for the info message, and moving the error message into a stub for populate_initrd_image. 
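For readers unfamiliar with the idiom, here is a toy userspace version of the IS_ENABLED() pattern; the constant below merely stands in for the Kconfig symbol, which the kernel's own helper macro evaluates to a compile-time 1 or 0:

    #include <stdio.h>

    /* Stand-in for IS_ENABLED(CONFIG_BLK_DEV_RAM). */
    #define BLK_DEV_RAM_ENABLED 1

    int main(void)
    {
        /* Unlike #ifdef, both branches are always parsed and type-checked;
         * the compiler simply discards the dead one because the condition
         * is a constant. */
        if (BLK_DEV_RAM_ENABLED)
            printf("Trying to unpack rootfs image as initramfs...\n");
        else
            printf("Unpacking initramfs...\n");
        return 0;
    }

Keeping both branches visible to the compiler is what lets the ramdisk and non-ramdisk paths share one function body without a maze of preprocessor conditionals.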
[cai@lca.pw: fix a compilation error] Link: http://lkml.kernel.org/r/20190328014806.36375-1-cai@lca.pw Link: http://lkml.kernel.org/r/20190213174621.29297-6-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Qian Cai Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/initramfs.c | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/init/initramfs.c b/init/initramfs.c index e3de626dbd98..32f940473d67 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -595,7 +595,11 @@ static void __init clean_rootfs(void) ksys_close(fd); kfree(buf); } -#endif +#else +static inline void clean_rootfs(void) +{ +} +#endif /* CONFIG_BLK_DEV_RAM */ #ifdef CONFIG_BLK_DEV_RAM static void populate_initrd_image(char *err) @@ -617,6 +621,11 @@ static void populate_initrd_image(char *err) written, initrd_end - initrd_start); ksys_close(fd); } +#else +static void populate_initrd_image(char *err) +{ + printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); +} #endif /* CONFIG_BLK_DEV_RAM */ static int __init populate_rootfs(void) @@ -625,28 +634,22 @@ static int __init populate_rootfs(void) char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic("%s", err); /* Failed to decompress INTERNAL initramfs */ - /* If available load the bootloader supplied initrd */ - if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) { -#ifdef CONFIG_BLK_DEV_RAM + + if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) + goto done; + + if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); - err = unpack_to_rootfs((char *)initrd_start, - initrd_end - initrd_start); - if (!err) - goto done; + else + printk(KERN_INFO "Unpacking initramfs...\n"); + err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); + if (err) { clean_rootfs(); populate_initrd_image(err); - done: - /* empty statement */; -#else - printk(KERN_INFO "Unpacking initramfs...\n"); - err = unpack_to_rootfs((char *)initrd_start, - initrd_end - initrd_start); - if (err) - printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); -#endif } +done: /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. -- cgit v1.2.3 From d8ae8a3765bfa1f9bf977e2496fcc9cf64fbfabd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:30 -0700 Subject: initramfs: move the legacy keepinitrd parameter to core code No need to handle the freeing disable in arch code when we already have a core hook (and a different name for the option) for it. 
Link: http://lkml.kernel.org/r/20190213174621.29297-7-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Catalin Marinas [arm64] Acked-by: Mike Rapoport Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/arm/Kconfig | 1 + arch/arm/mm/init.c | 25 ++++++------------------- arch/arm64/Kconfig | 1 + arch/arm64/mm/init.c | 17 ++--------------- arch/unicore32/Kconfig | 1 + arch/unicore32/mm/init.c | 14 +------------- init/initramfs.c | 9 +++++++++ 8 files changed, 28 insertions(+), 47 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 5e43fcbad4ca..f11f0698b148 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -245,6 +245,13 @@ config ARCH_HAS_FORTIFY_SOURCE An architecture should select this when it can successfully build and run with CONFIG_FORTIFY_SOURCE. +# +# Select if the arch provides a historic keepinit alias for the retain_initrd +# command line option +# +config ARCH_HAS_KEEPINITRD + bool + # Select if arch has all set_memory_ro/rw/x/nx() functions in asm/cacheflush.h config ARCH_HAS_SET_MEMORY bool diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index dc9855c4a3b4..a11dfcc2a130 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -9,6 +9,7 @@ config ARM select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_KEEPINITRD select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index c2daabbe0af0..68dcd5f8d7c6 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -695,27 +695,14 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd; - void free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) { - if (start == initrd_start) - start = round_down(start, PAGE_SIZE); - if (end == initrd_end) - end = round_up(end, PAGE_SIZE); + if (start == initrd_start) + start = round_down(start, PAGE_SIZE); + if (end == initrd_end) + end = round_up(end, PAGE_SIZE); - poison_init_mem((void *)start, PAGE_ALIGN(end) - start); - free_reserved_area((void *)start, (void *)end, -1, "initrd"); - } + poison_init_mem((void *)start, PAGE_ALIGN(end) - start); + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; -} - -__setup("keepinitrd", keepinitrd_setup); #endif diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3f957443f286..e24dc16453aa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -21,6 +21,7 @@ config ARM64 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 40e2d7e5efcb..007c05a4cce0 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -578,24 +578,11 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd __initdata; - void __init free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) { - free_reserved_area((void *)start, (void *)end, 0, "initrd"); - memblock_free(__virt_to_phys(start), end - start); - } -} - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; + 
free_reserved_area((void *)start, (void *)end, 0, "initrd"); + memblock_free(__virt_to_phys(start), end - start); } - -__setup("keepinitrd", keepinitrd_setup); #endif /* diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 2445dfcf6444..afe4949cfc2d 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -3,6 +3,7 @@ config UNICORE32 def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_DEVMEM_IS_ALLOWED + select ARCH_HAS_KEEPINITRD select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_KERNEL_GZIP diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 74b6a2e29809..c0573feb5064 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -294,20 +294,8 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD - -static int keep_initrd; - void free_initrd_mem(unsigned long start, unsigned long end) { - if (!keep_initrd) - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} - -static int __init keepinitrd_setup(char *__unused) -{ - keep_initrd = 1; - return 1; + free_reserved_area((void *)start, (void *)end, -1, "initrd"); } - -__setup("keepinitrd", keepinitrd_setup); #endif diff --git a/init/initramfs.c b/init/initramfs.c index 32f940473d67..cb3d17735c66 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -513,6 +513,15 @@ static int __init retain_initrd_param(char *str) } __setup("retain_initrd", retain_initrd_param); +#ifdef CONFIG_ARCH_HAS_KEEPINITRD +static int __init keepinitrd_setup(char *__unused) +{ + do_retain_initrd = 1; + return 1; +} +__setup("keepinitrd", keepinitrd_setup); +#endif + extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include -- cgit v1.2.3 From 4afd58e14dd415e456fd236755373f52e6055ec7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:34 -0700 Subject: initramfs: provide a generic free_initrd_mem implementation For most architectures free_initrd_mem just expands to the same free_reserved_area call. Provide that as a generic implementation marked __weak. 
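The weak-symbol mechanism this relies on can be demonstrated with a self-contained GCC/Clang userspace example; the function name is suffixed here to make clear it is a demo, not the kernel symbol:

    #include <stdio.h>

    /* Generic default, analogous to the new __weak free_initrd_mem(). */
    __attribute__((weak))
    void free_initrd_mem_demo(unsigned long start, unsigned long end)
    {
        printf("generic: free [%#lx, %#lx)\n", start, end);
    }

    /*
     * An architecture that needs special handling just provides a strong
     * definition of the same symbol; linking it in silently overrides the
     * weak default. Uncomment to see the override take effect:
     *
     * void free_initrd_mem_demo(unsigned long start, unsigned long end)
     * {
     *         printf("arch-specific: free [%#lx, %#lx)\n", start, end);
     * }
     */

    int main(void)
    {
        free_initrd_mem_demo(0x1000, 0x2000);
        return 0;
    }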
Link: http://lkml.kernel.org/r/20190213174621.29297-8-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Geert Uytterhoeven [m68k] Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 8 -------- arch/arc/mm/init.c | 7 ------- arch/c6x/mm/init.c | 7 ------- arch/h8300/mm/init.c | 8 -------- arch/m68k/mm/init.c | 7 ------- arch/microblaze/mm/init.c | 7 ------- arch/nds32/mm/init.c | 7 ------- arch/nios2/mm/init.c | 7 ------- arch/openrisc/mm/init.c | 7 ------- arch/parisc/mm/init.c | 7 ------- arch/powerpc/mm/mem.c | 7 ------- arch/sh/mm/init.c | 7 ------- arch/um/kernel/mem.c | 7 ------- arch/unicore32/mm/init.c | 7 ------- init/initramfs.c | 5 +++++ 15 files changed, 5 insertions(+), 100 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index a42fc5c4db89..97f4940f11e3 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -291,11 +291,3 @@ free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void -free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index e1ab2d7f1d64..c357a3bd1532 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -214,10 +214,3 @@ void __ref free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index fe582c3a1794..6fd43ec53507 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -69,13 +69,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __init free_initmem(void) { free_initmem_default(-1); diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 0f04a5e9aa4f..ea635c9025fe 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -103,14 +103,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 8868a4c9adae..778cacb7d57b 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -147,10 +147,3 @@ void __init mem_init(void) init_pointer_tables(); mem_init_print_info(NULL); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 7e97d44f6538..b675bc666e68 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -186,13 +186,6 @@ void __init setup_memory(void) paging_init(); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { free_initmem_default(-1); diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 1d03633f89a9..9a7065c1fb83 100644 --- 
a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -257,13 +257,6 @@ void free_initmem(void) free_initmem_default(-1); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 16cea5776b87..60736a725883 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,13 +82,6 @@ void __init mmu_init(void) flush_tlb_all(); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void __ref free_initmem(void) { free_initmem_default(-1); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index caeb4184e8a6..08df7f0b1d96 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -224,13 +224,6 @@ void __init mem_init(void) return; } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - void free_initmem(void) { free_initmem_default(-1); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 3b0f9eab7f2c..11ec1f1221a6 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -917,10 +917,3 @@ void flush_tlb_all(void) spin_unlock(&sid_lock); } #endif - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index cd525d709072..20266898f3a8 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -338,13 +338,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* * This is called when a page has been modified by the kernel. * It just marks the page as not i-cache clean. We do the i-cache diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 70621324db41..3e68d98af1bd 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -408,13 +408,6 @@ void free_initmem(void) free_initmem_default(-1); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 99aa11bf53d1..a9c9a94c096f 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -188,13 +188,6 @@ void free_initmem(void) { } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif - /* Allocate and free page tables. 
*/ pgd_t *pgd_alloc(struct mm_struct *mm) diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index c0573feb5064..6e352de80038 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -292,10 +292,3 @@ void free_initmem(void) { free_initmem_default(-1); } - -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, -1, "initrd"); -} -#endif diff --git a/init/initramfs.c b/init/initramfs.c index cb3d17735c66..fcb759a106be 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -527,6 +527,11 @@ extern unsigned long __initramfs_size; #include #include +void __weak free_initrd_mem(unsigned long start, unsigned long end) +{ + free_reserved_area((void *)start, (void *)end, -1, "initrd"); +} + #ifdef CONFIG_KEXEC_CORE static bool kexec_free_initrd(void) { -- cgit v1.2.3 From f94f7434cbbb02f7eb55ed5ad66284023c47968f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:18:37 -0700 Subject: initramfs: poison freed initrd memory Various architectures including x86 poison the freed initrd memory. Do the same in the generic free_initrd_mem implementation and switch a few more architectures that are identical to the generic code over to it now. Link: http://lkml.kernel.org/r/20190213174621.29297-9-hch@lst.de Signed-off-by: Christoph Hellwig Acked-by: Mike Rapoport Cc: Catalin Marinas [arm64] Cc: Geert Uytterhoeven [m68k] Cc: Steven Price Cc: Alexander Viro Cc: Guan Xuetao Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/init.c | 8 -------- arch/s390/mm/init.c | 8 -------- arch/sparc/mm/init_32.c | 8 -------- arch/sparc/mm/init_64.c | 8 -------- init/initramfs.c | 3 ++- 5 files changed, 2 insertions(+), 33 deletions(-) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index bbb196ad5f26..8a038b30d3c4 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -504,14 +504,6 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - void (*free_init_pages_eva)(void *begin, void *end) = NULL; void __ref free_initmem(void) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 7cf48eefec8f..5f48fc7e61d5 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -157,14 +157,6 @@ void free_initmem(void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void __init free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - unsigned long memory_block_size_bytes(void) { /* diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index a8ff29821bdb..417f89d5e0b2 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -299,14 +299,6 @@ void free_initmem (void) free_initmem_default(POISON_FREE_INITMEM); } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 
bc2aaa47bc8a..4b099dd7a767 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2572,14 +2572,6 @@ void free_initmem(void) } } -#ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) -{ - free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, - "initrd"); -} -#endif - pgprot_t PAGE_KERNEL __read_mostly; EXPORT_SYMBOL(PAGE_KERNEL); diff --git a/init/initramfs.c b/init/initramfs.c index fcb759a106be..435a428c2af1 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -529,7 +529,8 @@ extern unsigned long __initramfs_size; void __weak free_initrd_mem(unsigned long start, unsigned long end) { - free_reserved_area((void *)start, (void *)end, -1, "initrd"); + free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, + "initrd"); } #ifdef CONFIG_KEXEC_CORE -- cgit v1.2.3 From 997aef68af3ef1f2cb97da1c0b41a5afa87f63e2 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:40 -0700 Subject: init: provide a generic free_initmem implementation Patch series "provide a generic free_initmem implementation", v2. Many architectures implement free_initmem() in exactly the same or very similar way: they wrap the call to free_initmem_default() with sometimes different 'poison' parameter. These patches switch those architectures to use a generic implementation that does free_initmem_default(POISON_FREE_INITMEM). This was inspired by Christoph's patches for free_initrd_mem [1] and I shamelessly copied changelog entries from his patches :) [1] https://lore.kernel.org/lkml/20190213174621.29297-1-hch@lst.de/ This patch (of 2): For most architectures free_initmem just a wrapper for the same free_initmem_default(-1) call. Provide that as a generic implementation marked __weak. Link: http://lkml.kernel.org/r/1550515285-17446-2-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/mm/init.c | 6 ------ arch/arc/mm/init.c | 8 -------- arch/c6x/mm/init.c | 5 ----- arch/h8300/mm/init.c | 6 ------ arch/microblaze/mm/init.c | 5 ----- arch/nds32/mm/init.c | 5 ----- arch/nios2/mm/init.c | 5 ----- arch/openrisc/mm/init.c | 5 ----- arch/sh/mm/init.c | 5 ----- arch/unicore32/mm/init.c | 5 ----- arch/xtensa/mm/init.c | 5 ----- init/main.c | 5 +++++ 12 files changed, 5 insertions(+), 60 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 97f4940f11e3..e2cbec3789e8 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -285,9 +285,3 @@ mem_init(void) memblock_free_all(); mem_init_print_info(NULL); } - -void -free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index c357a3bd1532..02b7a3b20d7c 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -206,11 +206,3 @@ void __init mem_init(void) memblock_free_all(); mem_init_print_info(NULL); } - -/* - * free_initmem: Free all the __init memory. 
- */ -void __ref free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c index 6fd43ec53507..573242b160e1 100644 --- a/arch/c6x/mm/init.c +++ b/arch/c6x/mm/init.c @@ -68,8 +68,3 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -void __init free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index ea635c9025fe..1eab16b1a0bc 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -102,9 +102,3 @@ void __init mem_init(void) mem_init_print_info(NULL); } - -void -free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index b675bc666e68..a015a951c8b7 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -186,11 +186,6 @@ void __init setup_memory(void) paging_init(); } -void free_initmem(void) -{ - free_initmem_default(-1); -} - void __init mem_init(void) { high_memory = (void *)__va(memory_start + lowmem_size - 1); diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index 9a7065c1fb83..1a4ab1b7525f 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -252,11 +252,6 @@ void __init mem_init(void) return; } -void free_initmem(void) -{ - free_initmem_default(-1); -} - void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 60736a725883..2c609c2516b2 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -82,11 +82,6 @@ void __init mmu_init(void) flush_tlb_all(); } -void __ref free_initmem(void) -{ - free_initmem_default(-1); -} - #define __page_aligned(order) __aligned(PAGE_SIZE << (order)) pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned(PGD_ORDER); pte_t invalid_pte_table[PTRS_PER_PTE] __page_aligned(PTE_ORDER); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 08df7f0b1d96..abe87e54e231 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -223,8 +223,3 @@ void __init mem_init(void) mem_init_done = 1; return; } - -void free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3e68d98af1bd..aeb9f45c7a39 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -403,11 +403,6 @@ void __init mem_init(void) mem_init_done = 1; } -void free_initmem(void) -{ - free_initmem_default(-1); -} - #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock) diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 6e352de80038..b4442f3060ce 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -287,8 +287,3 @@ void __init mem_init(void) sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; } } - -void free_initmem(void) -{ - free_initmem_default(-1); -} diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index d49861099684..b51746f2b80b 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -216,11 +216,6 @@ void free_initrd_mem(unsigned long start, unsigned long end) } #endif -void free_initmem(void) -{ - free_initmem_default(-1); -} - static void __init parse_memmap_one(char *p) { char *oldp; diff --git a/init/main.c b/init/main.c index 33c87e91dc37..26234570a324 100644 --- a/init/main.c +++ b/init/main.c @@ -1074,6 +1074,11 @@ static inline void mark_readonly(void) } #endif +void __weak free_initmem(void) +{ + free_initmem_default(-1); +} + static int __ref kernel_init(void *unused) { int ret; -- 
cgit v1.2.3 From 522c99194549e50a9bd76427a06922d7a68237d6 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:43 -0700 Subject: hexagon: switch over to generic free_initmem() hexagon implementation of free_initmem() is currently empty and marked with comment * Todo: free pages between __init_begin and __init_end; possibly * some devtree related stuff as well. Switch it to the generic implementation. Link: http://lkml.kernel.org/r/1550515285-17446-3-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/mm/init.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 1719ede9e9bd..41cf34243ea1 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -84,16 +84,6 @@ void __init mem_init(void) init_mm.context.ptbase = __pa(init_mm.pgd); } -/* - * free_initmem - frees memory used by stuff declared with __init - * - * Todo: free pages between __init_begin and __init_end; possibly - * some devtree related stuff as well. - */ -void __ref free_initmem(void) -{ -} - /* * free_initrd_mem - frees... initrd memory. * @start - start of init memory -- cgit v1.2.3 From f40399992a245c852ad446e265d1567010db5e10 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:46 -0700 Subject: init: free_initmem: poison freed init memory Various architectures including x86 poison the freed init memory. Do the same in the generic free_initmem implementation and switch sparc32 architecture that is identical to the generic code over to it now. Link: http://lkml.kernel.org/r/1550515285-17446-4-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Palmer Dabbelt Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/init_32.c | 5 ----- init/main.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 417f89d5e0b2..046ab116cc8c 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -294,11 +294,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -void free_initmem (void) -{ - free_initmem_default(POISON_FREE_INITMEM); -} - void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); diff --git a/init/main.c b/init/main.c index 26234570a324..5a2c69b4d7b3 100644 --- a/init/main.c +++ b/init/main.c @@ -1076,7 +1076,7 @@ static inline void mark_readonly(void) void __weak free_initmem(void) { - free_initmem_default(-1); + free_initmem_default(POISON_FREE_INITMEM); } static int __ref kernel_init(void *unused) -- cgit v1.2.3 From 0d7b4a607d8f0f95f1ce49e993a04317d10a4ecd Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:18:49 -0700 Subject: riscv: switch over to generic free_initmem() The riscv version of free_initmem() differs from the generic one only in that it sets the freed memory to zero. Make ricsv use the generic version and poison the freed memory. 
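The point of poisoning rather than merely freeing is that a stale reference into discarded init memory then reads an unmistakable byte pattern instead of plausible-looking data. A minimal userspace sketch of the idea follows; the poison value DEMO_POISON and the buffer are invented for illustration and only mimic what free_initmem_default(POISON_FREE_INITMEM) does to the init sections.

/* poison_demo.c - why freed memory is filled with a poison pattern.
 * Build: cc poison_demo.c -o poison_demo && ./poison_demo
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_POISON 0xcc	/* stand-in for POISON_FREE_INITMEM */

int main(void)
{
	size_t size = 64;
	unsigned char *initdata = malloc(size);

	if (!initdata)
		return 1;

	memset(initdata, 0, size);		/* "init" data while it is still valid */
	memset(initdata, DEMO_POISON, size);	/* poison when it is handed back */

	/* A buggy late access now sees 0xcc everywhere, which stands out in a
	 * crash dump far more than stale zeroes would. */
	printf("byte after poisoning: 0x%02x\n", initdata[0]);

	free(initdata);
	return 0;
}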
Link: http://lkml.kernel.org/r/1550515285-17446-5-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Palmer Dabbelt Reviewed-by: Andrew Morton Cc: Christoph Hellwig Cc: Richard Kuo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/riscv/mm/init.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index bc7b77e34d09..8bf6f9c2d48c 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -66,11 +66,6 @@ void __init mem_init(void) mem_init_print_info(NULL); } -void free_initmem(void) -{ - free_initmem_default(0); -} - #ifdef CONFIG_BLK_DEV_INITRD static void __init setup_initrd(void) { -- cgit v1.2.3 From a861bbce27634160ae0330126b4ef001d6941c8f Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:18:53 -0700 Subject: sh: advertise gigantic page support Patch series "Fix free/allocation of runtime gigantic pages", v8. This series fixes sh and sparc that did not advertise their gigantic page support and then were not able to allocate and free those pages at runtime. It renames MEMORY_ISOLATION && COMPACTION || CMA condition into the more accurate CONTIG_ALLOC, since it allows the definition of alloc_contig_range function. Finally, it then fixes the wrong definition of ARCH_HAS_GIGANTIC_PAGE config that, without MEMORY_ISOLATION && COMPACTION || CMA defined, did not allow architectures to free boottime allocated gigantic pages although unrelated. This patch (of 4): sh actually supports gigantic pages and selecting ARCH_HAS_GIGANTIC_PAGE allows it to allocate and free gigantic pages at runtime. At least sdk7786_defconfig exposes such a configuration with huge pages of 64MB, pages of 4KB and MAX_ORDER = 11: HPAGE_SHIFT (26) - PAGE_SHIFT (12) = 14 >= MAX_ORDER (11) Link: http://lkml.kernel.org/r/20190327063626.18421-2-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Reviewed-by: Andrew Morton Cc: Aneesh Kumar K.V Cc: Michael Ellerman Cc: Vlastimil Babka Cc: Catalin Marinas Cc: Will Deacon Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H . Peter Anvin" Cc: Dave Hansen Cc: Andy Lutomirsky Cc: Peter Zijlstra Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 0be08d586d40..6349396317a9 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -53,6 +53,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA help The SuperH is a RISC processor targeted for use in embedded systems -- cgit v1.2.3 From b53f45695449f692d5d2ab89cecc3316cdb636e8 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:18:56 -0700 Subject: sparc: advertise gigantic page support sparc actually supports gigantic pages and selecting ARCH_HAS_GIGANTIC_PAGE allows it to allocate and free gigantic pages at runtime. sparc allows configuration such as huge pages of 16GB, pages of 8KB and MAX_ORDER = 13 (default): HPAGE_SHIFT (34) - PAGE_SHIFT (13) = 21 >= MAX_ORDER (13) Link: http://lkml.kernel.org/r/20190327063626.18421-3-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: David S. 
Miller Cc: Andy Lutomirsky Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Heiko Carstens Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Mike Kravetz Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index f6421c9ce5d3..b848c8ddd92e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,6 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI + select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA config ARCH_DEFCONFIG string -- cgit v1.2.3 From 8df995f6bde01de96ce93373785f41c3bd13ad1c Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:19:00 -0700 Subject: mm: simplify MEMORY_ISOLATION && COMPACTION || CMA into CONTIG_ALLOC This condition allows to define alloc_contig_range, so simplify it into a more accurate naming. Link: http://lkml.kernel.org/r/20190327063626.18421-4-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Andy Lutomirsky Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Mike Kravetz Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/powerpc/platforms/Kconfig.cputype | 2 +- arch/s390/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/mm/hugetlbpage.c | 2 +- include/linux/gfp.h | 2 +- mm/Kconfig | 3 +++ mm/page_alloc.c | 3 +-- 10 files changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e24dc16453aa..7f7fbd8bd9d5 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -19,7 +19,7 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index d0e172d47574..3a31d4289ea4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 && HUGETLB_PAGE - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select PPC_HAVE_KUEP select PPC_HAVE_KUAP default y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 07485582d027..724dbc6b7d33 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -63,7 +63,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select 
ARCH_HAS_SET_MEMORY diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6349396317a9..2a5ec643fec0 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -53,7 +53,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC help The SuperH is a RISC processor targeted for use in embedded systems diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index b848c8ddd92e..566de738e487 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,7 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC config ARCH_DEFCONFIG string diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e7212731cffb..526d95abfe5e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA + select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 92e4c4b85bba..fab095362c50 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -203,7 +203,7 @@ static __init int setup_hugepagesz(char *opt) } __setup("hugepagesz=", setup_hugepagesz); -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC static __init int gigantic_pages_init(void) { /* With compaction or CMA we can allocate gigantic pages at runtime */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fdab7de7490d..e77ab30e9328 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -585,7 +585,7 @@ static inline bool pm_suspended_storage(void) } #endif /* CONFIG_PM_SLEEP */ -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) +#ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..137eadc18732 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -258,6 +258,9 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config ARCH_ENABLE_THP_MIGRATION bool +config CONTIG_ALLOC + def_bool (MEMORY_ISOLATION && COMPACTION) || CMA + config PHYS_ADDR_T_64BIT def_bool 64BIT diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07a0d722d481..2efb6525d932 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8137,8 +8137,7 @@ unmovable: return true; } -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) - +#ifdef CONFIG_CONTIG_ALLOC static unsigned long pfn_max_align_down(unsigned long pfn) { return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES, -- cgit v1.2.3 From 4eb0716e868eed963967adb0b1b11d9bd8ca1d01 Mon Sep 17 00:00:00 2001 From: Alexandre Ghiti Date: Mon, 13 May 2019 17:19:04 -0700 Subject: hugetlb: allow to free gigantic pages regardless of the configuration On systems without CONTIG_ALLOC activated but that support gigantic pages, boottime reserved gigantic pages can not be freed at all. 
This patch simply enables the possibility to hand back those pages to memory allocator. Link: http://lkml.kernel.org/r/20190327063626.18421-5-alex@ghiti.fr Signed-off-by: Alexandre Ghiti Acked-by: David S. Miller [sparc] Reviewed-by: Mike Kravetz Cc: Andy Lutomirsky Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dave Hansen Cc: Heiko Carstens Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/hugetlb.h | 4 --- arch/powerpc/include/asm/book3s/64/hugetlb.h | 5 ++- arch/powerpc/platforms/Kconfig.cputype | 2 +- arch/s390/Kconfig | 2 +- arch/s390/include/asm/hugetlb.h | 8 +++-- arch/sh/Kconfig | 2 +- arch/sparc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/hugetlb.h | 4 --- include/asm-generic/hugetlb.h | 7 ++++ include/linux/gfp.h | 2 +- mm/hugetlb.c | 54 +++++++++++++++++++--------- mm/page_alloc.c | 4 +-- 14 files changed, 61 insertions(+), 39 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7f7fbd8bd9d5..7a1aa53d188d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -19,7 +19,7 @@ config ARM64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index c6a07a3b433e..4aad6382f631 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -70,8 +70,4 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, #include -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif - #endif /* __ASM_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index 56140d19c85f..12e150e615b7 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -36,8 +36,8 @@ static inline int hstate_get_psize(struct hstate *hstate) } } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) +#define __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +static inline bool gigantic_page_runtime_supported(void) { /* * We used gigantic page reservation with hypervisor assist in some case. 
@@ -49,7 +49,6 @@ static inline bool gigantic_page_supported(void) return true; } -#endif /* hugepd entry valid bit */ #define HUGEPD_VAL_BITS (0x8000000000000000UL) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3a31d4289ea4..2794235e9d3e 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -331,7 +331,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" depends on PPC_BOOK3S_64 && HUGETLB_PAGE - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select PPC_HAVE_KUEP select PPC_HAVE_KUAP default y diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 724dbc6b7d33..d0c046af65fa 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -63,7 +63,7 @@ config S390 select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_KCOV select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 2d1afa58a4b6..bb59dd964590 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -116,7 +116,9 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif +static inline bool gigantic_page_runtime_supported(void) +{ + return true; +} + #endif /* _ASM_S390_HUGETLB_H */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2a5ec643fec0..2a77033e1e7c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -53,7 +53,7 @@ config SUPERH select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_NMI select NEED_SG_DMA_LENGTH - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE help The SuperH is a RISC processor targeted for use in embedded systems diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 566de738e487..7c93f3121ee6 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -92,7 +92,7 @@ config SPARC64 select ARCH_CLOCKSOURCE_DATA select ARCH_HAS_PTE_SPECIAL select PCI_DOMAINS if PCI - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE config ARCH_DEFCONFIG string diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 526d95abfe5e..f21bc56e5d7b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,7 @@ config X86_64 def_bool y depends on 64BIT # Options that are inherently 64-bit kernel only: - select ARCH_HAS_GIGANTIC_PAGE if CONTIG_ALLOC + select ARCH_HAS_GIGANTIC_PAGE select ARCH_SUPPORTS_INT128 select ARCH_USE_CMPXCHG_LOCKREF select HAVE_ARCH_SOFT_DIRTY diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index 7469d321f072..f65cfb48cfdd 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -17,8 +17,4 @@ static inline void arch_clear_hugepage_flags(struct page *page) { } -#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static inline bool gigantic_page_supported(void) { return true; } -#endif - #endif /* _ASM_X86_HUGETLB_H */ diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 71d7b77eea50..822f433ac95c 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -126,4 +126,11 @@ static inline pte_t huge_ptep_get(pte_t *ptep) } #endif +#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED +static inline 
bool gigantic_page_runtime_supported(void) +{ + return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE); +} +#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */ + #endif /* _ASM_GENERIC_HUGETLB_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index e77ab30e9328..fb07b503dc45 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -589,8 +589,8 @@ static inline bool pm_suspended_storage(void) /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); -extern void free_contig_range(unsigned long pfn, unsigned nr_pages); #endif +void free_contig_range(unsigned long pfn, unsigned int nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dffe5d9d03ae..2f901a6e13d2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1059,6 +1059,7 @@ static void free_gigantic_page(struct page *page, unsigned int order) free_contig_range(page_to_pfn(page), 1 << order); } +#ifdef CONFIG_CONTIG_ALLOC static int __alloc_gigantic_page(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { @@ -1143,11 +1144,20 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); +#else /* !CONFIG_CONTIG_ALLOC */ +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) +{ + return NULL; +} +#endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static inline bool gigantic_page_supported(void) { return false; } static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) { return NULL; } + int nid, nodemask_t *nodemask) +{ + return NULL; +} static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } @@ -1157,7 +1167,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; h->nr_huge_pages--; @@ -2278,13 +2288,27 @@ found: } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, - nodemask_t *nodes_allowed) +static int set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; - if (hstate_is_gigantic(h) && !gigantic_page_supported()) - return h->max_huge_pages; + spin_lock(&hugetlb_lock); + + /* + * Gigantic pages runtime allocation depend on the capability for large + * page range allocation. + * If the system does not provide this feature, return an error when + * the user tries to allocate gigantic pages but let the user free the + * boottime allocated gigantic pages. + */ + if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) { + if (count > persistent_huge_pages(h)) { + spin_unlock(&hugetlb_lock); + return -EINVAL; + } + /* Fall through to decrease pool */ + } /* * Increase the pool size @@ -2297,7 +2321,6 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * pool might be one hugepage larger than it needs to be, but * within all the constraints specified by the sysctls. 
*/ - spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; @@ -2352,9 +2375,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, break; } out: - ret = persistent_huge_pages(h); + h->max_huge_pages = persistent_huge_pages(h); spin_unlock(&hugetlb_lock); - return ret; + + return 0; } #define HSTATE_ATTR_RO(_name) \ @@ -2406,7 +2430,7 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, int err; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - if (hstate_is_gigantic(h) && !gigantic_page_supported()) { + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) { err = -EINVAL; goto out; } @@ -2430,15 +2454,13 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, } else nodes_allowed = &node_states[N_MEMORY]; - h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); + err = set_max_huge_pages(h, count, nodes_allowed); +out: if (nodes_allowed != &node_states[N_MEMORY]) NODEMASK_FREE(nodes_allowed); - return len; -out: - NODEMASK_FREE(nodes_allowed); - return err; + return err ? err : len; } static ssize_t nr_hugepages_store_common(bool obey_mempolicy, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2efb6525d932..4ea71bc70413 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8346,8 +8346,9 @@ done: pfn_max_align_up(end), migratetype); return ret; } +#endif /* CONFIG_CONTIG_ALLOC */ -void free_contig_range(unsigned long pfn, unsigned nr_pages) +void free_contig_range(unsigned long pfn, unsigned int nr_pages) { unsigned int count = 0; @@ -8359,7 +8360,6 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } WARN(count != 0, "%d pages are still in use!\n", count); } -#endif #ifdef CONFIG_MEMORY_HOTPLUG /* -- cgit v1.2.3 From fc1d8e7cca2daa18d2fe56b94874848adf89d7f5 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 13 May 2019 17:19:08 -0700 Subject: mm: introduce put_user_page*(), placeholder versions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A discussion of the overall problem is below. As mentioned in patch 0001, the steps are to fix the problem are: 1) Provide put_user_page*() routines, intended to be used for releasing pages that were pinned via get_user_pages*(). 2) Convert all of the call sites for get_user_pages*(), to invoke put_user_page*(), instead of put_page(). This involves dozens of call sites, and will take some time. 3) After (2) is complete, use get_user_pages*() and put_user_page*() to implement tracking of these pages. This tracking will be separate from the existing struct page refcounting. 4) Use the tracking and identification of these pages, to implement special handling (especially in writeback paths) when the pages are backed by a filesystem. Overview ======== Some kernel components (file systems, device drivers) need to access memory that is specified via process virtual address. For a long time, the API to achieve that was get_user_pages ("GUP") and its variations. However, GUP has critical limitations that have been overlooked; in particular, GUP does not interact correctly with filesystems in all situations. That means that file-backed memory + GUP is a recipe for potential problems, some of which have already occurred in the field. 
GUP was first introduced for Direct IO (O_DIRECT), allowing filesystem code to get the struct page behind a virtual address and to let storage hardware perform a direct copy to or from that page. This is a short-lived access pattern, and as such, the window for a concurrent writeback of GUP'd page was small enough that there were not (we think) any reported problems. Also, userspace was expected to understand and accept that Direct IO was not synchronized with memory-mapped access to that data, nor with any process address space changes such as munmap(), mremap(), etc. Over the years, more GUP uses have appeared (virtualization, device drivers, RDMA) that can keep the pages they get via GUP for a long period of time (seconds, minutes, hours, days, ...). This long-term pinning makes an underlying design problem more obvious. In fact, there are a number of key problems inherent to GUP: Interactions with file systems ============================== File systems expect to be able to write back data, both to reclaim pages, and for data integrity. Allowing other hardware (NICs, GPUs, etc) to gain write access to the file memory pages means that such hardware can dirty the pages, without the filesystem being aware. This can, in some cases (depending on filesystem, filesystem options, block device, block device options, and other variables), lead to data corruption, and also to kernel bugs of the form: kernel BUG at /build/linux-fQ94TU/linux-4.4.0/fs/ext4/inode.c:1899! backtrace: ext4_writepage __writepage write_cache_pages ext4_writepages do_writepages __writeback_single_inode writeback_sb_inodes __writeback_inodes_wb wb_writeback wb_workfn process_one_work worker_thread kthread ret_from_fork ...which is due to the file system asserting that there are still buffer heads attached: ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) Dave Chinner's description of this is very clear: "The fundamental issue is that ->page_mkwrite must be called on every write access to a clean file backed page, not just the first one. How long the GUP reference lasts is irrelevant, if the page is clean and you need to dirty it, you must call ->page_mkwrite before it is marked writeable and dirtied. Every. Time." This is just one symptom of the larger design problem: real filesystems that actually write to a backing device, do not actually support get_user_pages() being called on their pages, and letting hardware write directly to those pages--even though that pattern has been going on since about 2005 or so. Long term GUP ============= Long term GUP is an issue when FOLL_WRITE is specified to GUP (so, a writeable mapping is created), and the pages are file-backed. That can lead to filesystem corruption. What happens is that when a file-backed page is being written back, it is first mapped read-only in all of the CPU page tables; the file system then assumes that nobody can write to the page, and that the page content is therefore stable. Unfortunately, the GUP callers generally do not monitor changes to the CPU pages tables; they instead assume that the following pattern is safe (it's not): get_user_pages() Hardware can keep a reference to those pages for a very long time, and write to it at any time. Because "hardware" here means "devices that are not a CPU", this activity occurs without any interaction with the kernel's file system code. for each page set_page_dirty put_page() In fact, the GUP documentation even recommends that pattern. 
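For concreteness, the conversion this series asks of such call sites looks roughly like the sketch below. It is illustrative kernel-style code, not a buildable unit; the driver_release_* names are placeholders, and the only interface assumed beyond existing ones is the put_user_pages_dirty() helper added by this patch.

/* Before: the open-coded release loop described above. */
static void driver_release_old(struct page **pages, unsigned long npages)
{
	unsigned long i;

	for (i = 0; i < npages; i++) {
		set_page_dirty(pages[i]);
		put_page(pages[i]);
	}
}

/* After: gup-pinned pages go back through put_user_page*(), so they can
 * eventually be tracked separately from ordinary page references. */
static void driver_release_new(struct page **pages, unsigned long npages)
{
	put_user_pages_dirty(pages, npages);
}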
Anyway, the file system assumes that the page is stable (nothing is writing to the page), and that is a problem: stable page content is necessary for many filesystem actions during writeback, such as checksum, encryption, RAID striping, etc. Furthermore, filesystem features like COW (copy on write) or snapshot also rely on being able to use a new page for as memory for that memory range inside the file. Corruption during write back is clearly possible here. To solve that, one idea is to identify pages that have active GUP, so that we can use a bounce page to write stable data to the filesystem. The filesystem would work on the bounce page, while any of the active GUP might write to the original page. This would avoid the stable page violation problem, but note that it is only part of the overall solution, because other problems remain. Other filesystem features that need to replace the page with a new one can be inhibited for pages that are GUP-pinned. This will, however, alter and limit some of those filesystem features. The only fix for that would be to require GUP users to monitor and respond to CPU page table updates. Subsystems such as ODP and HMM do this, for example. This aspect of the problem is still under discussion. Direct IO ========= Direct IO can cause corruption, if userspace does Direct-IO that writes to a range of virtual addresses that are mmap'd to a file. The pages written to are file-backed pages that can be under write back, while the Direct IO is taking place. Here, Direct IO races with a write back: it calls GUP before page_mkclean() has replaced the CPU pte with a read-only entry. The race window is pretty small, which is probably why years have gone by before we noticed this problem: Direct IO is generally very quick, and tends to finish up before the filesystem gets around to do anything with the page contents. However, it's still a real problem. The solution is to never let GUP return pages that are under write back, but instead, force GUP to take a write fault on those pages. That way, GUP will properly synchronize with the active write back. This does not change the required GUP behavior, it just avoids that race. Details ======= Introduces put_user_page(), which simply calls put_page(). This provides a way to update all get_user_pages*() callers, so that they call put_user_page(), instead of put_page(). Also introduces put_user_pages(), and a few dirty/locked variations, as a replacement for release_pages(), and also as a replacement for open-coded loops that release multiple pages. These may be used for subsequent performance improvements, via batching of pages to be released. This is the first step of fixing a problem (also described in [1] and [2]) with interactions between get_user_pages ("gup") and filesystems. Problem description: let's start with a bug report. Below, is what happens sometimes, under memory pressure, when a driver pins some pages via gup, and then marks those pages dirty, and releases them. Note that the gup documentation actually recommends that pattern. The problem is that the filesystem may do a writeback while the pages were gup-pinned, and then the filesystem believes that the pages are clean. So, when the driver later marks the pages as dirty, that conflicts with the filesystem's page tracking and results in a BUG(), like this one that I experienced: kernel BUG at /build/linux-fQ94TU/linux-4.4.0/fs/ext4/inode.c:1899! 
backtrace: ext4_writepage __writepage write_cache_pages ext4_writepages do_writepages __writeback_single_inode writeback_sb_inodes __writeback_inodes_wb wb_writeback wb_workfn process_one_work worker_thread kthread ret_from_fork ...which is due to the file system asserting that there are still buffer heads attached: ({ \ BUG_ON(!PagePrivate(page)); \ ((struct buffer_head *)page_private(page)); \ }) Dave Chinner's description of this is very clear: "The fundamental issue is that ->page_mkwrite must be called on every write access to a clean file backed page, not just the first one. How long the GUP reference lasts is irrelevant, if the page is clean and you need to dirty it, you must call ->page_mkwrite before it is marked writeable and dirtied. Every. Time." This is just one symptom of the larger design problem: real filesystems that actually write to a backing device, do not actually support get_user_pages() being called on their pages, and letting hardware write directly to those pages--even though that pattern has been going on since about 2005 or so. The steps are to fix it are: 1) (This patch): provide put_user_page*() routines, intended to be used for releasing pages that were pinned via get_user_pages*(). 2) Convert all of the call sites for get_user_pages*(), to invoke put_user_page*(), instead of put_page(). This involves dozens of call sites, and will take some time. 3) After (2) is complete, use get_user_pages*() and put_user_page*() to implement tracking of these pages. This tracking will be separate from the existing struct page refcounting. 4) Use the tracking and identification of these pages, to implement special handling (especially in writeback paths) when the pages are backed by a filesystem. [1] https://lwn.net/Articles/774411/ : "DMA and get_user_pages()" [2] https://lwn.net/Articles/753027/ : "The Trouble with get_user_pages()" Link: http://lkml.kernel.org/r/20190327023632.13307-2-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Mike Rapoport [docs] Reviewed-by: Ira Weiny Reviewed-by: Jérôme Glisse Reviewed-by: Christoph Lameter Tested-by: Ira Weiny Cc: Al Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jason Gunthorpe Cc: Matthew Wilcox Cc: Michal Hocko Cc: Ralph Campbell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 24 ++++++++++++ mm/gup.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index c3c73b3c9adc..e6b6be15609e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1007,6 +1007,30 @@ static inline void put_page(struct page *page) __put_page(page); } +/** + * put_user_page() - release a gup-pinned page + * @page: pointer to page to be released + * + * Pages that were pinned via get_user_pages*() must be released via + * either put_user_page(), or one of the put_user_pages*() routines + * below. This is so that eventually, pages that are pinned via + * get_user_pages*() can be separately tracked and uniquely handled. In + * particular, interactions with RDMA and filesystems need special + * handling. + * + * put_user_page() and put_page() are not interchangeable, despite this early + * implementation that makes them look the same. put_user_page() calls must + * be perfectly matched up with get_user_page() calls. 
+ */ +static inline void put_user_page(struct page *page) +{ + put_page(page); +} + +void put_user_pages_dirty(struct page **pages, unsigned long npages); +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages); +void put_user_pages(struct page **pages, unsigned long npages); + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif diff --git a/mm/gup.c b/mm/gup.c index 8e0a0a3a2b2d..2c08248d4fa2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -28,6 +28,111 @@ struct follow_page_context { unsigned int page_mask; }; +typedef int (*set_dirty_func_t)(struct page *page); + +static void __put_user_pages_dirty(struct page **pages, + unsigned long npages, + set_dirty_func_t sdf) +{ + unsigned long index; + + for (index = 0; index < npages; index++) { + struct page *page = compound_head(pages[index]); + + /* + * Checking PageDirty at this point may race with + * clear_page_dirty_for_io(), but that's OK. Two key cases: + * + * 1) This code sees the page as already dirty, so it skips + * the call to sdf(). That could happen because + * clear_page_dirty_for_io() called page_mkclean(), + * followed by set_page_dirty(). However, now the page is + * going to get written back, which meets the original + * intention of setting it dirty, so all is well: + * clear_page_dirty_for_io() goes on to call + * TestClearPageDirty(), and write the page back. + * + * 2) This code sees the page as clean, so it calls sdf(). + * The page stays dirty, despite being written back, so it + * gets written back again in the next writeback cycle. + * This is harmless. + */ + if (!PageDirty(page)) + sdf(page); + + put_user_page(page); + } +} + +/** + * put_user_pages_dirty() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * "gup-pinned page" refers to a page that has had one of the get_user_pages() + * variants called on that page. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * set_page_dirty(), which does not lock the page, is used here. + * Therefore, it is the caller's responsibility to ensure that this is + * safe. If not, then put_user_pages_dirty_lock() should be called instead. + * + */ +void put_user_pages_dirty(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty); +} +EXPORT_SYMBOL(put_user_pages_dirty); + +/** + * put_user_pages_dirty_lock() - release and dirty an array of gup-pinned pages + * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, make that page (or its head page, if a + * compound page) dirty, if it was previously listed as clean. Then, release + * the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + * + * This is just like put_user_pages_dirty(), except that it invokes + * set_page_dirty_lock(), instead of set_page_dirty(). + * + */ +void put_user_pages_dirty_lock(struct page **pages, unsigned long npages) +{ + __put_user_pages_dirty(pages, npages, set_page_dirty_lock); +} +EXPORT_SYMBOL(put_user_pages_dirty_lock); + +/** + * put_user_pages() - release an array of gup-pinned pages. 
+ * @pages: array of pages to be marked dirty and released. + * @npages: number of pages in the @pages array. + * + * For each page in the @pages array, release the page using put_user_page(). + * + * Please see the put_user_page() documentation for details. + */ +void put_user_pages(struct page **pages, unsigned long npages) +{ + unsigned long index; + + /* + * TODO: this can be optimized for huge pages: if a series of pages is + * physically contiguous and part of the same compound page, then a + * single operation to the head page should suffice. + */ + for (index = 0; index < npages; index++) + put_user_page(pages[index]); +} +EXPORT_SYMBOL(put_user_pages); + static struct page *no_page_table(struct vm_area_struct *vma, unsigned int flags) { -- cgit v1.2.3 From 024eee0e83f0df52317be607ca521e0fc572aa07 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 13 May 2019 17:19:11 -0700 Subject: mm: page_mkclean vs MADV_DONTNEED race MADV_DONTNEED is handled with mmap_sem taken in read mode. We call page_mkclean without holding mmap_sem. MADV_DONTNEED implies that pages in the region are unmapped and subsequent access to the pages in that range is handled as a new page fault. This implies that if we don't have parallel access to the region when MADV_DONTNEED is run we expect those range to be unallocated. w.r.t page_mkclean() we need to make sure that we don't break the MADV_DONTNEED semantics. MADV_DONTNEED check for pmd_none without holding pmd_lock. This implies we skip the pmd if we temporarily mark pmd none. Avoid doing that while marking the page clean. Keep the sequence same for dax too even though we don't support MADV_DONTNEED for dax mapping The bug was noticed by code review and I didn't observe any failures w.r.t test run. This is similar to commit 58ceeb6bec86d9140f9d91d71a710e963523d063 Author: Kirill A. Shutemov Date: Thu Apr 13 14:56:26 2017 -0700 thp: fix MADV_DONTNEED vs. MADV_FREE race commit ced108037c2aa542b3ed8b7afd1576064ad1362a Author: Kirill A. Shutemov Date: Thu Apr 13 14:56:20 2017 -0700 thp: fix MADV_DONTNEED vs. numa balancing race Link: http://lkml.kernel.org/r/20190321040610.14226-1-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Andrew Morton Cc: Dan Williams Cc:"Kirill A . 
Shutemov" Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- mm/rmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 83009875308c..f74386293632 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -814,7 +814,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, goto unlock_pmd; flush_cache_page(vma, address, pfn); - pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmdp_invalidate(vma, address, pmdp); pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); diff --git a/mm/rmap.c b/mm/rmap.c index b30c7c71d1d9..76c8dfd3ae1c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -928,7 +928,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, continue; flush_cache_page(vma, address, page_to_pfn(page)); - entry = pmdp_huge_clear_flush(vma, address, pmd); + entry = pmdp_invalidate(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); -- cgit v1.2.3 From 3481c37ffa1de58ef140d0fe9eabf56305e74666 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:19:14 -0700 Subject: mm/vmscan: drop may_writepage and classzone_idx from direct reclaim begin template There are three tracepoints using this template, which are mm_vmscan_direct_reclaim_begin, mm_vmscan_memcg_reclaim_begin, mm_vmscan_memcg_softlimit_reclaim_begin. Regarding mm_vmscan_direct_reclaim_begin, sc.may_writepage is !laptop_mode, that's a static setting, and reclaim_idx is derived from gfp_mask which is already show in this tracepoint. Regarding mm_vmscan_memcg_reclaim_begin, may_writepage is !laptop_mode too, and reclaim_idx is (MAX_NR_ZONES-1), which are both static value. mm_vmscan_memcg_softlimit_reclaim_begin is the same with mm_vmscan_memcg_reclaim_begin. So we can drop them all. 
Link: http://lkml.kernel.org/r/1553736322-32235-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 26 ++++++++++---------------- mm/vmscan.c | 14 +++----------- 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index b8b9d42944f9..0aa882a4e870 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -106,51 +106,45 @@ TRACE_EVENT(mm_vmscan_wakeup_kswapd, DECLARE_EVENT_CLASS(mm_vmscan_direct_reclaim_begin_template, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), + TP_PROTO(int order, gfp_t gfp_flags), - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx), + TP_ARGS(order, gfp_flags), TP_STRUCT__entry( __field( int, order ) - __field( int, may_writepage ) __field( gfp_t, gfp_flags ) - __field( int, classzone_idx ) ), TP_fast_assign( __entry->order = order; - __entry->may_writepage = may_writepage; __entry->gfp_flags = gfp_flags; - __entry->classzone_idx = classzone_idx; ), - TP_printk("order=%d may_writepage=%d gfp_flags=%s classzone_idx=%d", + TP_printk("order=%d gfp_flags=%s", __entry->order, - __entry->may_writepage, - show_gfp_flags(__entry->gfp_flags), - __entry->classzone_idx) + show_gfp_flags(__entry->gfp_flags)) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_direct_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), + TP_PROTO(int order, gfp_t gfp_flags), - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) + TP_ARGS(order, gfp_flags) ); #ifdef CONFIG_MEMCG DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), + TP_PROTO(int order, gfp_t gfp_flags), - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) + TP_ARGS(order, gfp_flags) ); DEFINE_EVENT(mm_vmscan_direct_reclaim_begin_template, mm_vmscan_memcg_softlimit_reclaim_begin, - TP_PROTO(int order, int may_writepage, gfp_t gfp_flags, int classzone_idx), + TP_PROTO(int order, gfp_t gfp_flags), - TP_ARGS(order, may_writepage, gfp_flags, classzone_idx) + TP_ARGS(order, gfp_flags) ); #endif /* CONFIG_MEMCG */ diff --git a/mm/vmscan.c b/mm/vmscan.c index e869f9e25a3d..41a14eed2e16 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3174,10 +3174,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; - trace_mm_vmscan_direct_reclaim_begin(order, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3208,9 +3205,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + sc.gfp_mask); /* * NOTE: Although we can get the priority field, using it @@ -3259,10 +3254,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK]; - trace_mm_vmscan_memcg_reclaim_begin(0, - sc.may_writepage, - sc.gfp_mask, - sc.reclaim_idx); + trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask); psi_memstall_enter(&pflags); noreclaim_flag = memalloc_noreclaim_save(); -- cgit v1.2.3 From 
299c83dce9ea3a79bb4b5511d2cb996b6b8e5111 Mon Sep 17 00:00:00 2001 From: Linxu Fang Date: Mon, 13 May 2019 17:19:17 -0700 Subject: mem-hotplug: fix node spanned pages when we have a node with only ZONE_MOVABLE 342332e6a925 ("mm/page_alloc.c: introduce kernelcore=mirror option") and later patches rewrote the calculation of node spanned pages. e506b99696a2 ("mem-hotplug: fix node spanned pages when we have a movable node"), but the current code still has problems, When we have a node with only zone_movable and the node id is not zero, the size of node spanned pages is double added. That's because we have an empty normal zone, and zone_start_pfn or zone_end_pfn is not between arch_zone_lowest_possible_pfn and arch_zone_highest_possible_pfn, so we need to use clamp to constrain the range just like the commit <96e907d13602> (bootmem: Reimplement __absent_pages_in_range() using for_each_mem_pfn_range()). e.g. Zone ranges: DMA [mem 0x0000000000001000-0x0000000000ffffff] DMA32 [mem 0x0000000001000000-0x00000000ffffffff] Normal [mem 0x0000000100000000-0x000000023fffffff] Movable zone start for each node Node 0: 0x0000000100000000 Node 1: 0x0000000140000000 Early memory node ranges node 0: [mem 0x0000000000001000-0x000000000009efff] node 0: [mem 0x0000000000100000-0x00000000bffdffff] node 0: [mem 0x0000000100000000-0x000000013fffffff] node 1: [mem 0x0000000140000000-0x000000023fffffff] node 0 DMA spanned:0xfff present:0xf9e absent:0x61 node 0 DMA32 spanned:0xff000 present:0xbefe0 absent:0x40020 node 0 Normal spanned:0 present:0 absent:0 node 0 Movable spanned:0x40000 present:0x40000 absent:0 On node 0 totalpages(node_present_pages): 1048446 node_spanned_pages:1310719 node 1 DMA spanned:0 present:0 absent:0 node 1 DMA32 spanned:0 present:0 absent:0 node 1 Normal spanned:0x100000 present:0x100000 absent:0 node 1 Movable spanned:0x100000 present:0x100000 absent:0 On node 1 totalpages(node_present_pages): 2097152 node_spanned_pages:2097152 Memory: 6967796K/12582392K available (16388K kernel code, 3686K rwdata, 4468K rodata, 2160K init, 10444K bss, 5614596K reserved, 0K cma-reserved) It shows that the current memory of node 1 is double added. After this patch, the problem is fixed. 
node 0 DMA spanned:0xfff present:0xf9e absent:0x61 node 0 DMA32 spanned:0xff000 present:0xbefe0 absent:0x40020 node 0 Normal spanned:0 present:0 absent:0 node 0 Movable spanned:0x40000 present:0x40000 absent:0 On node 0 totalpages(node_present_pages): 1048446 node_spanned_pages:1310719 node 1 DMA spanned:0 present:0 absent:0 node 1 DMA32 spanned:0 present:0 absent:0 node 1 Normal spanned:0 present:0 absent:0 node 1 Movable spanned:0x100000 present:0x100000 absent:0 On node 1 totalpages(node_present_pages): 1048576 node_spanned_pages:1048576 memory: 6967796K/8388088K available (16388K kernel code, 3686K rwdata, 4468K rodata, 2160K init, 10444K bss, 1420292K reserved, 0K cma-reserved) Link: http://lkml.kernel.org/r/1554178276-10372-1-git-send-email-fanglinxu@huawei.com Signed-off-by: Linxu Fang Cc: Taku Izumi Cc: Xishi Qiu Cc: Michal Hocko Cc: Vlastimil Babka Cc: Pavel Tatashin Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ea71bc70413..909adce33398 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6255,13 +6255,15 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, unsigned long *zone_end_pfn, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; /* When hotadd a new node from cpu_up(), the node should be empty */ if (!node_start_pfn && !node_end_pfn) return 0; /* Get the start and end of the zone */ - *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; - *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; + *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, zone_start_pfn, zone_end_pfn); -- cgit v1.2.3 From fd875dca7c71744cbb0ebbcde7d45e5ee05b7637 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 13 May 2019 17:19:20 -0700 Subject: hugetlbfs: fix potential over/underflow setting node specific nr_hugepages The number of node specific huge pages can be set via a file such as: /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages When a node specific value is specified, the global number of huge pages must also be adjusted. This adjustment is calculated as the specified node specific value + (global value - current node value). If the node specific value provided by the user is large enough, this calculation could overflow an unsigned long leading to a smaller than expected number of huge pages. To fix, check the calculation for overflow. If overflow is detected, use ULONG_MAX as the requested value. This is inline with the user request to allocate as many huge pages as possible. It was also noticed that the above calculation was done outside the hugetlb_lock. Therefore, the values could be inconsistent and result in underflow. To fix, the calculation is moved within the routine set_max_huge_pages() where the lock is held. In addition, the code in __nr_hugepages_store_common() which tries to handle the case of not being able to allocate a node mask would likely result in incorrect behavior. Luckily, it is very unlikely we will ever take this path. If we do, simply return ENOMEM. 
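The wraparound being guarded against is plain unsigned arithmetic and can be reproduced in userspace. In the sketch below all numbers are invented; only the count adjustment and the count < old_count check mirror what set_max_huge_pages() now does under hugetlb_lock.

/* overflow_demo.c - the unsigned wraparound the new check catches.
 * Build: cc overflow_demo.c -o overflow_demo && ./overflow_demo
 */
#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long nr_global = 100;		/* stands in for h->nr_huge_pages */
	unsigned long nr_node = 5;		/* stands in for h->nr_huge_pages_node[nid] */
	unsigned long count = ULONG_MAX - 10;	/* very large user-supplied node value */
	unsigned long old_count = count;

	count += nr_global - nr_node;		/* node request adjusted to a global count */

	/* The sum wrapped around, so the user clearly wanted "as many as possible". */
	if (count < old_count)
		count = ULONG_MAX;

	printf("adjusted global count: %lu\n", count);
	return 0;
}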
Link: http://lkml.kernel.org/r/20190328220533.19884-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Jing Xiangfeng Reviewed-by: Naoya Horiguchi Reviewed-by: Oscar Salvador Cc: David Rientjes Cc: Alex Ghiti Cc: Jing Xiangfeng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2f901a6e13d2..a81f2a8556c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2288,13 +2288,33 @@ found: } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static int set_max_huge_pages(struct hstate *h, unsigned long count, +static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, nodemask_t *nodes_allowed) { unsigned long min_count, ret; spin_lock(&hugetlb_lock); + /* + * Check for a node specific request. + * Changing node specific huge page count may require a corresponding + * change to the global count. In any case, the passed node mask + * (nodes_allowed) will restrict alloc/free to the specified node. + */ + if (nid != NUMA_NO_NODE) { + unsigned long old_count = count; + + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + /* + * User may have specified a large count value which caused the + * above calculation to overflow. In this case, they wanted + * to allocate as many huge pages as possible. Set count to + * largest possible value to align with their intention. + */ + if (count < old_count) + count = ULONG_MAX; + } + /* * Gigantic pages runtime allocation depend on the capability for large * page range allocation. @@ -2446,15 +2466,22 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, } } else if (nodes_allowed) { /* - * per node hstate attribute: adjust count to global, - * but restrict alloc/free to the specified node. + * Node specific request. count adjustment happens in + * set_max_huge_pages() after acquiring hugetlb_lock. */ - count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; init_nodemask_of_node(nodes_allowed, nid); - } else - nodes_allowed = &node_states[N_MEMORY]; + } else { + /* + * Node specific request, but we could not allocate the few + * words required for a node mask. We are unlikely to hit + * this condition. Since we can not pass down the appropriate + * node mask, just return ENOMEM. + */ + err = -ENOMEM; + goto out; + } - err = set_max_huge_pages(h, count, nodes_allowed); + err = set_max_huge_pages(h, count, nid, nodes_allowed); out: if (nodes_allowed != &node_states[N_MEMORY]) -- cgit v1.2.3 From 2d0adf7e0d7ac1e18da874c5b19ef30a0db59658 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 13 May 2019 17:19:23 -0700 Subject: mm/hugetlb: get rid of NODEMASK_ALLOC NODEMASK_ALLOC is used to allocate a nodemask bitmap, and it does it by first determining whether it should be allocated on the stack or dynamically, depending on NODES_SHIFT. Right now, it goes the dynamic path whenever the nodemask_t is above 32 bytes. Although we could bump it to a reasonable value, the largest a nodemask_t can get is 128 bytes, so since __nr_hugepages_store_common is called from a rather short stack we can just get rid of the NODEMASK_ALLOC call here. This reduces some code churn and complexity. 
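For a rough picture of the trade-off being dropped, NODEMASK_ALLOC amounts to a size-based choice between a stack variable and a dynamic allocation. The sketch below shows that choice in plain C with invented names; the 32-byte cutoff and the 128-byte worst case are the figures quoted above, and the patch simply settles on the stack side for this caller.

/* nodemask_demo.c - the stack-versus-heap decision NODEMASK_ALLOC encodes.
 * Build: cc nodemask_demo.c -o nodemask_demo && ./nodemask_demo
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MASK_BYTES	128	/* largest nodemask_t cited above */
#define STACK_CUTOFF	32	/* threshold NODEMASK_ALLOC uses today */

int main(void)
{
#if MASK_BYTES <= STACK_CUTOFF
	unsigned char mask[MASK_BYTES];			/* small enough: stack */

	memset(mask, 0, sizeof(mask));
	puts("mask lives on the stack");
#else
	unsigned char *mask = calloc(1, MASK_BYTES);	/* otherwise: heap */

	if (!mask)
		return 1;
	puts("mask was allocated dynamically");
	free(mask);
#endif
	return 0;
}

Even a 128-byte mask is harmless on the short stack of __nr_hugepages_store_common(), which is why the dynamic branch can go away entirely for this path.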
Link: http://lkml.kernel.org/r/20190402133415.21983-1-osalvador@suse.de Signed-off-by: Oscar Salvador Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Alex Ghiti Cc: David Rientjes Cc: Jing Xiangfeng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 36 +++++++++++------------------------- 1 file changed, 11 insertions(+), 25 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a81f2a8556c8..2b0abc30685d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2448,44 +2448,30 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, unsigned long count, size_t len) { int err; - NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); + nodemask_t nodes_allowed, *n_mask; - if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) { - err = -EINVAL; - goto out; - } + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) + return -EINVAL; if (nid == NUMA_NO_NODE) { /* * global hstate attribute */ if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - } else if (nodes_allowed) { + init_nodemask_of_mempolicy(&nodes_allowed))) + n_mask = &node_states[N_MEMORY]; + else + n_mask = &nodes_allowed; + } else { /* * Node specific request. count adjustment happens in * set_max_huge_pages() after acquiring hugetlb_lock. */ - init_nodemask_of_node(nodes_allowed, nid); - } else { - /* - * Node specific request, but we could not allocate the few - * words required for a node mask. We are unlikely to hit - * this condition. Since we can not pass down the appropriate - * node mask, just return ENOMEM. - */ - err = -ENOMEM; - goto out; + init_nodemask_of_node(&nodes_allowed, nid); + n_mask = &nodes_allowed; } - err = set_max_huge_pages(h, count, nid, nodes_allowed); - -out: - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); + err = set_max_huge_pages(h, count, nid, n_mask); return err ? err : len; } -- cgit v1.2.3 From dae966dc8f883f202887cc9cf257f92f068eac1e Mon Sep 17 00:00:00 2001 From: Peng Fan Date: Mon, 13 May 2019 17:19:26 -0700 Subject: mm/swap.c: __pagevec_lru_add_fn: typo fix There is no function named munlock_vma_pages(). Correct it to munlock_vma_page(). Link: http://lkml.kernel.org/r/20190402095609.27181-1-peng.fan@nxp.com Signed-off-by: Peng Fan Reviewed-by: Andrew Morton Reviewed-by: Mukesh Ojha Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swap.c b/mm/swap.c index 301ed4e04320..3a75722e68a9 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -867,7 +867,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); /* * Page becomes evictable in two ways: - * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()]. + * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()]. * 2) Before acquiring LRU lock to put the page to correct LRU and then * a) do PageLRU check with lock [check_move_unevictable_pages] * b) do PageLRU check before lock [clear_page_mlock] -- cgit v1.2.3 From 926e5d1cb525ec4faa66ddb24ac3b61c0102cb5c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:19:29 -0700 Subject: include/linux/balloon_compaction.h: drop unused function stubs These are leftovers from the pre-"general non-lru movable page" era. 
Link: http://lkml.kernel.org/r/20190329122649.28404-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Mukesh Ojha Acked-by: Michael S. Tsirkin Acked-by: Pankaj Gupta Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/balloon_compaction.h | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index f111c780ef1d..f31521dcb09a 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -151,21 +151,6 @@ static inline void balloon_page_delete(struct page *page) list_del(&page->lru); } -static inline bool __is_movable_balloon_page(struct page *page) -{ - return false; -} - -static inline bool balloon_page_movable(struct page *page) -{ - return false; -} - -static inline bool isolated_balloon_page(struct page *page) -{ - return false; -} - static inline bool balloon_page_isolate(struct page *page) { return false; -- cgit v1.2.3 From 7567cfc5da9faadbe56dbd65c802b6b828a57d8b Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 13 May 2019 17:19:32 -0700 Subject: mm/sparse.c: clean up obsolete code comment The code comment above sparse_add_one_section() is obsolete and incorrect. Clean it up and write a new one. Link: http://lkml.kernel.org/r/20190329144250.14315-1-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Michal Hocko Reviewed-by: Oscar Salvador Reviewed-by: Mukesh Ojha Reviewed-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 56e057c432f9..fd13166949b5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -684,10 +684,18 @@ static void free_map_bootmem(struct page *memmap) #endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ -/* - * returns the number of sections whose mem_maps were properly - * set. If this is <=0, then that means that the passed-in - * map was not consumed and must be freed. +/** + * sparse_add_one_section - add a memory section + * @nid: The node to add section on + * @start_pfn: start pfn of the memory range + * @altmap: device page map + * + * This is only intended for hotplug. + * + * Return: + * * 0 - On success. + * * -EEXIST - Section has been present. + * * -ENOMEM - Out of memory. */ int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, struct vmem_altmap *altmap) -- cgit v1.2.3 From 063b8a4cee8088224bcdb79bcd08db98df16178e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Mon, 13 May 2019 17:19:35 -0700 Subject: drivers/base/memory.c: clean up relics in function parameters The input parameter 'phys_index' of memory_block_action() is actually the section number, but not the phys_index of memory_block. This is a relic from the past when one memory block could only contain one section. Rename it to start_section_nr. And also in remove_memory_section(), the 'node_id' and 'phys_device' arguments are not used by anyone. Remove them. Link: http://lkml.kernel.org/r/20190329144250.14315-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Michal Hocko Reviewed-by: Rafael J. 
Wysocki Reviewed-by: Mukesh Ojha Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index e49028a60429..0c9e22ffa47a 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -231,13 +231,14 @@ static bool pages_correctly_probed(unsigned long start_pfn) * OK to have direct references to sparsemem variables in here. */ static int -memory_block_action(unsigned long phys_index, unsigned long action, int online_type) +memory_block_action(unsigned long start_section_nr, unsigned long action, + int online_type) { unsigned long start_pfn; unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; int ret; - start_pfn = section_nr_to_pfn(phys_index); + start_pfn = section_nr_to_pfn(start_section_nr); switch (action) { case MEM_ONLINE: @@ -251,7 +252,7 @@ memory_block_action(unsigned long phys_index, unsigned long action, int online_t break; default: WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: " - "%ld\n", __func__, phys_index, action, action); + "%ld\n", __func__, start_section_nr, action, action); ret = -EINVAL; } @@ -738,8 +739,7 @@ unregister_memory(struct memory_block *memory) device_unregister(&memory->dev); } -static int remove_memory_section(unsigned long node_id, - struct mem_section *section, int phys_device) +static int remove_memory_section(struct mem_section *section) { struct memory_block *mem; @@ -771,7 +771,7 @@ int unregister_memory_section(struct mem_section *section) if (!present_section(section)) return -EINVAL; - return remove_memory_section(0, section, 0); + return remove_memory_section(section); } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3 From 0919e1b69ab459e06df45d3ba6658d281962db80 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 13 May 2019 17:19:38 -0700 Subject: hugetlbfs: on restore reserve error path retain subpool reservation When a huge page is allocated, PagePrivate() is set if the allocation consumed a reservation. When freeing a huge page, PagePrivate is checked. If set, it indicates the reservation should be restored. PagePrivate being set at free huge page time mostly happens on error paths. When huge page reservations are created, a check is made to determine if the mapping is associated with an explicitly mounted filesystem. If so, pages are also reserved within the filesystem. The default action when freeing a huge page is to decrement the usage count in any associated explicitly mounted filesystem. However, if the reservation is to be restored the reservation/use count within the filesystem should not be decrementd. Otherwise, a subsequent page allocation and free for the same mapping location will cause the file filesystem usage to go 'negative'. Filesystem Size Used Avail Use% Mounted on nodev 4.0G -4.0M 4.1G - /opt/hugepool To fix, when freeing a huge page do not adjust filesystem usage if PagePrivate() is set to indicate the reservation should be restored. I did not cc stable as the problem has been around since reserves were added to hugetlbfs and nobody has noticed. Link: http://lkml.kernel.org/r/20190328234704.27083-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Davidlohr Bueso Cc: Joonsoo Kim Cc: Michal Hocko Cc: "Kirill A . 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2b0abc30685d..c33c5cbb67ff 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1268,12 +1268,23 @@ void free_huge_page(struct page *page) ClearPagePrivate(page); /* - * A return code of zero implies that the subpool will be under its - * minimum size if the reservation is not restored after page is free. - * Therefore, force restore_reserve operation. + * If PagePrivate() was set on page, page allocation consumed a + * reservation. If the page was associated with a subpool, there + * would have been a page reserved in the subpool before allocation + * via hugepage_subpool_get_pages(). Since we are 'restoring' the + * reservtion, do not call hugepage_subpool_put_pages() as this will + * remove the reserved page from the subpool. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) - restore_reserve = true; + if (!restore_reserve) { + /* + * A return code of zero implies that the subpool will be + * under its minimum size if the reservation is not restored + * after page is free. Therefore, force restore_reserve + * operation. + */ + if (hugepage_subpool_put_pages(spool, 1) == 0) + restore_reserve = true; + } spin_lock(&hugetlb_lock); clear_page_huge_active(page); -- cgit v1.2.3 From 1b426bac66e6cc83c9f2d92b96e4e72acf43419a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 13 May 2019 17:19:41 -0700 Subject: hugetlb: use same fault hash key for shared and private mappings hugetlb uses a fault mutex hash table to prevent page faults of the same pages concurrently. The key for shared and private mappings is different. Shared keys off address_space and file index. Private keys off mm and virtual address. Consider a private mappings of a populated hugetlbfs file. A fault will map the page from the file and if needed do a COW to map a writable page. Hugetlbfs hole punch uses the fault mutex to prevent mappings of file pages. It uses the address_space file index key. However, private mappings will use a different key and could race with this code to map the file page. This causes problems (BUG) for the page cache remove code as it expects the page to be unmapped. A sample stack is: page dumped because: VM_BUG_ON_PAGE(page_mapped(page)) kernel BUG at mm/filemap.c:169! ... RIP: 0010:unaccount_page_cache_page+0x1b8/0x200 ... Call Trace: __delete_from_page_cache+0x39/0x220 delete_from_page_cache+0x45/0x70 remove_inode_hugepages+0x13c/0x380 ? __add_to_page_cache_locked+0x162/0x380 hugetlbfs_fallocate+0x403/0x540 ? _cond_resched+0x15/0x30 ? __inode_security_revalidate+0x5d/0x70 ? selinux_file_permission+0x100/0x130 vfs_fallocate+0x13f/0x270 ksys_fallocate+0x3c/0x80 __x64_sys_fallocate+0x1a/0x20 do_syscall_64+0x5b/0x180 entry_SYSCALL_64_after_hwframe+0x44/0xa9 There seems to be another potential COW issue/race with this approach of different private and shared keys as noted in commit 8382d914ebf7 ("mm, hugetlb: improve page-fault scalability"). Since every hugetlb mapping (even anon and private) is actually a file mapping, just use the address_space index key for all mappings. This results in potentially more hash collisions. However, this should not be the common case. 
Link: http://lkml.kernel.org/r/20190328234704.27083-3-mike.kravetz@oracle.com Link: http://lkml.kernel.org/r/20190412165235.t4sscoujczfhuiyt@linux-r8p5 Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages") Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: Joonsoo Kim Cc: "Kirill A . Shutemov" Cc: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 7 ++----- include/linux/hugetlb.h | 4 +--- mm/hugetlb.c | 22 ++++++---------------- mm/userfaultfd.c | 3 +-- 4 files changed, 10 insertions(+), 26 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c74ef4426282..f23237135163 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -440,9 +440,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash; index = page->index; - hash = hugetlb_fault_mutex_hash(h, current->mm, - &pseudo_vma, - mapping, index, 0); + hash = hugetlb_fault_mutex_hash(h, mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -639,8 +637,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, addr = index * hpage_size; /* mutex taken here, fault path and hole punch */ - hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping, - index, addr); + hash = hugetlb_fault_mutex_hash(h, mapping, index, addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 11943b60f208..edf476c8cfb9 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -123,9 +123,7 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address); pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c33c5cbb67ff..98a3c7c224cb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3824,8 +3824,7 @@ retry: * handling userfault. Reacquire after handling * fault to make calling code simpler. 
*/ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -3933,21 +3932,14 @@ backout_unlocked: } #ifdef CONFIG_SMP -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { unsigned long key[2]; u32 hash; - if (vma->vm_flags & VM_SHARED) { - key[0] = (unsigned long) mapping; - key[1] = idx; - } else { - key[0] = (unsigned long) mm; - key[1] = address >> huge_page_shift(h); - } + key[0] = (unsigned long) mapping; + key[1] = idx; hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0); @@ -3958,9 +3950,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, * For uniprocesor systems we always use a single mutex, so just * return 0 and avoid the hashing overhead. */ -u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, - struct vm_area_struct *vma, - struct address_space *mapping, +u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping, pgoff_t idx, unsigned long address) { return 0; @@ -4005,7 +3995,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index d59b5a73dfb3..9932d5755e4c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -271,8 +271,7 @@ retry: */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; - hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping, - idx, dst_addr); + hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr); mutex_lock(&hugetlb_fault_mutex_table[hash]); err = -ENOMEM; -- cgit v1.2.3 From 734fb89968900b5c5f8edd5038bd4cdeab8c61d2 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:19:45 -0700 Subject: mm/hmm: select mmu notifier when selecting HMM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To avoid random config build issue, select mmu notifier when HMM is selected. In any cases when HMM get selected it will be by users that will also wants the mmu notifier. 
Link: http://lkml.kernel.org/r/20190403193318.16478-2-jglisse@redhat.com Signed-off-by: Jérôme Glisse Acked-by: Balbir Singh Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/Kconfig b/mm/Kconfig index 137eadc18732..0eada3f818fa 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -697,12 +697,12 @@ config DEV_PAGEMAP_OPS config HMM bool + select MMU_NOTIFIER select MIGRATE_VMA_HELPER config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" depends on ARCH_HAS_HMM - select MMU_NOTIFIER select HMM help Select HMM_MIRROR if you want to mirror range of the CPU page table of a -- cgit v1.2.3 From 704f3f2cf63cdb76925ac2ff432182c73574b20b Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:19:48 -0700 Subject: mm/hmm: use reference counting for HMM struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every time I read the code to check that the HMM structure does not vanish before it should thanks to the many lock protecting its removal i get a headache. Switch to reference counting instead it is much easier to follow and harder to break. This also remove some code that is no longer needed with refcounting. Link: http://lkml.kernel.org/r/20190403193318.16478-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 2 + mm/hmm.c | 190 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 124 insertions(+), 68 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ad50b7b4f141..716fc61fa6d4 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -131,6 +131,7 @@ enum hmm_pfn_value_e { /* * struct hmm_range - track invalidation lock on virtual address range * + * @hmm: the core HMM structure this range is active against * @vma: the vm area struct for the range * @list: all range lock are on a list * @start: range virtual start address (inclusive) @@ -142,6 +143,7 @@ enum hmm_pfn_value_e { * @valid: pfns array did not change since it has been fill by an HMM function */ struct hmm_range { + struct hmm *hmm; struct vm_area_struct *vma; struct list_head list; unsigned long start; diff --git a/mm/hmm.c b/mm/hmm.c index fe1cd87e49ac..919d78fd21c5 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -50,6 +50,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; */ struct hmm { struct mm_struct *mm; + struct kref kref; spinlock_t lock; struct list_head ranges; struct list_head mirrors; @@ -57,24 +58,33 @@ struct hmm { struct rw_semaphore mirrors_sem; }; -/* - * hmm_register - register HMM against an mm (HMM internal) +static inline struct hmm *mm_get_hmm(struct mm_struct *mm) +{ + struct hmm *hmm = READ_ONCE(mm->hmm); + + if (hmm && kref_get_unless_zero(&hmm->kref)) + return hmm; + + return NULL; +} + +/** + * hmm_get_or_create - register HMM against an mm (HMM internal) * * @mm: mm struct to attach to + * Returns: returns an HMM object, either by referencing the existing + * (per-process) object, or by creating a new one. * - * This is not intended to be used directly by device drivers. 
It allocates an - * HMM struct if mm does not have one, and initializes it. + * This is not intended to be used directly by device drivers. If mm already + * has an HMM struct then it get a reference on it and returns it. Otherwise + * it allocates an HMM struct, initializes it, associate it with the mm and + * returns it. */ -static struct hmm *hmm_register(struct mm_struct *mm) +static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = READ_ONCE(mm->hmm); + struct hmm *hmm = mm_get_hmm(mm); bool cleanup = false; - /* - * The hmm struct can only be freed once the mm_struct goes away, - * hence we should always have pre-allocated an new hmm struct - * above. - */ if (hmm) return hmm; @@ -86,6 +96,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); spin_lock_init(&hmm->lock); + kref_init(&hmm->kref); hmm->mm = mm; spin_lock(&mm->page_table_lock); @@ -106,7 +117,7 @@ static struct hmm *hmm_register(struct mm_struct *mm) if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) goto error_mm; - return mm->hmm; + return hmm; error_mm: spin_lock(&mm->page_table_lock); @@ -118,9 +129,41 @@ error: return NULL; } +static void hmm_free(struct kref *kref) +{ + struct hmm *hmm = container_of(kref, struct hmm, kref); + struct mm_struct *mm = hmm->mm; + + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + + spin_lock(&mm->page_table_lock); + if (mm->hmm == hmm) + mm->hmm = NULL; + spin_unlock(&mm->page_table_lock); + + kfree(hmm); +} + +static inline void hmm_put(struct hmm *hmm) +{ + kref_put(&hmm->kref, hmm_free); +} + void hmm_mm_destroy(struct mm_struct *mm) { - kfree(mm->hmm); + struct hmm *hmm; + + spin_lock(&mm->page_table_lock); + hmm = mm_get_hmm(mm); + mm->hmm = NULL; + if (hmm) { + hmm->mm = NULL; + spin_unlock(&mm->page_table_lock); + hmm_put(hmm); + return; + } + + spin_unlock(&mm->page_table_lock); } static int hmm_invalidate_range(struct hmm *hmm, bool device, @@ -165,7 +208,7 @@ static int hmm_invalidate_range(struct hmm *hmm, bool device, static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm_mirror *mirror; - struct hmm *hmm = mm->hmm; + struct hmm *hmm = mm_get_hmm(mm); down_write(&hmm->mirrors_sem); mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, @@ -186,13 +229,16 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) struct hmm_mirror, list); } up_write(&hmm->mirrors_sem); + + hmm_put(hmm); } static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { + struct hmm *hmm = mm_get_hmm(range->mm); struct hmm_update update; - struct hmm *hmm = range->mm->hmm; + int ret; VM_BUG_ON(!hmm); @@ -200,14 +246,16 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, update.end = range->end; update.event = HMM_UPDATE_INVALIDATE; update.blockable = range->blockable; - return hmm_invalidate_range(hmm, true, &update); + ret = hmm_invalidate_range(hmm, true, &update); + hmm_put(hmm); + return ret; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *range) { + struct hmm *hmm = mm_get_hmm(range->mm); struct hmm_update update; - struct hmm *hmm = range->mm->hmm; VM_BUG_ON(!hmm); @@ -216,6 +264,7 @@ static void hmm_invalidate_range_end(struct mmu_notifier *mn, update.event = HMM_UPDATE_INVALIDATE; update.blockable = true; hmm_invalidate_range(hmm, false, &update); + hmm_put(hmm); } static const struct mmu_notifier_ops hmm_mmu_notifier_ops = 
{ @@ -241,24 +290,13 @@ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) if (!mm || !mirror || !mirror->ops) return -EINVAL; -again: - mirror->hmm = hmm_register(mm); + mirror->hmm = hmm_get_or_create(mm); if (!mirror->hmm) return -ENOMEM; down_write(&mirror->hmm->mirrors_sem); - if (mirror->hmm->mm == NULL) { - /* - * A racing hmm_mirror_unregister() is about to destroy the hmm - * struct. Try again to allocate a new one. - */ - up_write(&mirror->hmm->mirrors_sem); - mirror->hmm = NULL; - goto again; - } else { - list_add(&mirror->list, &mirror->hmm->mirrors); - up_write(&mirror->hmm->mirrors_sem); - } + list_add(&mirror->list, &mirror->hmm->mirrors); + up_write(&mirror->hmm->mirrors_sem); return 0; } @@ -273,33 +311,18 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - bool should_unregister = false; - struct mm_struct *mm; - struct hmm *hmm; + struct hmm *hmm = READ_ONCE(mirror->hmm); - if (mirror->hmm == NULL) + if (hmm == NULL) return; - hmm = mirror->hmm; down_write(&hmm->mirrors_sem); list_del_init(&mirror->list); - should_unregister = list_empty(&hmm->mirrors); + /* To protect us against double unregister ... */ mirror->hmm = NULL; - mm = hmm->mm; - hmm->mm = NULL; up_write(&hmm->mirrors_sem); - if (!should_unregister || mm == NULL) - return; - - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); - - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); - - kfree(hmm); + hmm_put(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -708,23 +731,29 @@ int hmm_vma_get_pfns(struct hmm_range *range) struct mm_walk mm_walk; struct hmm *hmm; + range->hmm = NULL; + /* Sanity check, this really should not happen ! */ if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; - hmm = hmm_register(vma->vm_mm); + hmm = hmm_get_or_create(vma->vm_mm); if (!hmm) return -ENOMEM; - /* Caller must have registered a mirror, via hmm_mirror_register() ! */ - if (!hmm->mmu_notifier.ops) + + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL) { + hmm_put(hmm); return -EINVAL; + } /* FIXME support hugetlb fs */ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || vma_is_dax(vma)) { hmm_pfns_special(range); + hmm_put(hmm); return -EINVAL; } @@ -736,6 +765,7 @@ int hmm_vma_get_pfns(struct hmm_range *range) * operations such has atomic access would not work. */ hmm_pfns_clear(range, range->pfns, range->start, range->end); + hmm_put(hmm); return -EPERM; } @@ -758,6 +788,12 @@ int hmm_vma_get_pfns(struct hmm_range *range) mm_walk.pte_hole = hmm_vma_walk_hole; walk_page_range(range->start, range->end, &mm_walk); + /* + * Transfer hmm reference to the range struct it will be drop inside + * the hmm_vma_range_done() function (which _must_ be call if this + * function return 0). + */ + range->hmm = hmm; return 0; } EXPORT_SYMBOL(hmm_vma_get_pfns); @@ -802,25 +838,27 @@ EXPORT_SYMBOL(hmm_vma_get_pfns); */ bool hmm_vma_range_done(struct hmm_range *range) { - unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; - struct hmm *hmm; + bool ret = false; - if (range->end <= range->start) { + /* Sanity check this really should not happen. 
*/ + if (range->hmm == NULL || range->end <= range->start) { BUG(); return false; } - hmm = hmm_register(range->vma->vm_mm); - if (!hmm) { - memset(range->pfns, 0, sizeof(*range->pfns) * npages); - return false; - } - - spin_lock(&hmm->lock); + spin_lock(&range->hmm->lock); list_del_rcu(&range->list); - spin_unlock(&hmm->lock); + ret = range->valid; + spin_unlock(&range->hmm->lock); - return range->valid; + /* Is the mm still alive ? */ + if (range->hmm->mm == NULL) + ret = false; + + /* Drop reference taken by hmm_vma_fault() or hmm_vma_get_pfns() */ + hmm_put(range->hmm); + range->hmm = NULL; + return ret; } EXPORT_SYMBOL(hmm_vma_range_done); @@ -880,25 +918,31 @@ int hmm_vma_fault(struct hmm_range *range, bool block) struct hmm *hmm; int ret; + range->hmm = NULL; + /* Sanity check, this really should not happen ! */ if (range->start < vma->vm_start || range->start >= vma->vm_end) return -EINVAL; if (range->end < vma->vm_start || range->end > vma->vm_end) return -EINVAL; - hmm = hmm_register(vma->vm_mm); + hmm = hmm_get_or_create(vma->vm_mm); if (!hmm) { hmm_pfns_clear(range, range->pfns, range->start, range->end); return -ENOMEM; } - /* Caller must have registered a mirror using hmm_mirror_register() */ - if (!hmm->mmu_notifier.ops) + + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL) { + hmm_put(hmm); return -EINVAL; + } /* FIXME support hugetlb fs */ if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || vma_is_dax(vma)) { hmm_pfns_special(range); + hmm_put(hmm); return -EINVAL; } @@ -910,6 +954,7 @@ int hmm_vma_fault(struct hmm_range *range, bool block) * operations such has atomic access would not work. */ hmm_pfns_clear(range, range->pfns, range->start, range->end); + hmm_put(hmm); return -EPERM; } @@ -945,7 +990,16 @@ int hmm_vma_fault(struct hmm_range *range, bool block) hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, range->end); hmm_vma_range_done(range); + hmm_put(hmm); + } else { + /* + * Transfer hmm reference to the range struct it will be drop + * inside the hmm_vma_range_done() function (which _must_ be + * call if this function return 0). + */ + range->hmm = hmm; } + return ret; } EXPORT_SYMBOL(hmm_vma_fault); -- cgit v1.2.3 From 9f454612f602d02204b1f6e86b6bec2bfb368c4b Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:19:52 -0700 Subject: mm/hmm: do not erase snapshot when a range is invalidated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users of HMM might be using the snapshot information to do preparatory step like dma mapping pages to a device before checking for invalidation through hmm_vma_range_done() so do not erase that information and assume users will do the right thing. 
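A minimal sketch of the new behaviour, using an illustrative toy structure rather than the real struct hmm_range: invalidation flips only the validity flag and leaves the snapshotted pfns in place for the driver to reuse on retry:

/*
 * Toy model: invalidation marks the range invalid but no longer wipes the
 * snapshot, so preparatory work (e.g. dma mappings) keyed on the pfns can
 * be kept across a retry.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_range {
	unsigned long pfns[4];
	bool valid;
};

static void invalidate(struct toy_range *range)
{
	/* Before this patch the overlapping pfns were also memset() to 0. */
	range->valid = false;
}

int main(void)
{
	struct toy_range r = { .pfns = { 100, 101, 102, 103 }, .valid = true };

	invalidate(&r);
	/* Snapshot content survives; only the validity flag changed. */
	printf("valid=%d pfn[0]=%lu\n", r.valid, r.pfns[0]);
	return 0;
}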
Link: http://lkml.kernel.org/r/20190403193318.16478-4-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 919d78fd21c5..84e0577a912a 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -174,16 +174,10 @@ static int hmm_invalidate_range(struct hmm *hmm, bool device, spin_lock(&hmm->lock); list_for_each_entry(range, &hmm->ranges, list) { - unsigned long addr, idx, npages; - if (update->end < range->start || update->start >= range->end) continue; range->valid = false; - addr = max(update->start, range->start); - idx = (addr - range->start) >> PAGE_SHIFT; - npages = (min(range->end, update->end) - addr) >> PAGE_SHIFT; - memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); } spin_unlock(&hmm->lock); -- cgit v1.2.3 From 25f23a0c7127b65c4d8200ccda8a352ad5ce1e1d Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:19:55 -0700 Subject: mm/hmm: improve and rename hmm_vma_get_pfns() to hmm_range_snapshot() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename for consistency between code, comments and documentation. Also improves the comments on all the possible returns values. Improve the function by returning the number of populated entries in pfns array. Link: http://lkml.kernel.org/r/20190403193318.16478-5-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: John Hubbard Reviewed-by: Ira Weiny Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.rst | 26 ++++++++++++++++++-------- include/linux/hmm.h | 4 ++-- mm/hmm.c | 31 +++++++++++++++++-------------- 3 files changed, 37 insertions(+), 24 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 44205f0b671f..d9b27bdadd1b 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -189,11 +189,7 @@ the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can use either:: - int hmm_vma_get_pfns(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns); + long hmm_range_snapshot(struct hmm_range *range); int hmm_vma_fault(struct vm_area_struct *vma, struct hmm_range *range, unsigned long start, @@ -202,7 +198,7 @@ use either:: bool write, bool block); -The first one (hmm_vma_get_pfns()) will only fetch present CPU page table +The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. The second one does trigger a page fault on missing or read-only entry if the write parameter is true. Page faults use the generic mm page fault code path @@ -220,19 +216,33 @@ respect in order to keep things properly synchronized. The usage pattern is:: { struct hmm_range range; ... 
+ + range.start = ...; + range.end = ...; + range.pfns = ...; + range.flags = ...; + range.values = ...; + range.pfn_shift = ...; + again: - ret = hmm_vma_get_pfns(vma, &range, start, end, pfns); - if (ret) + down_read(&mm->mmap_sem); + range.vma = ...; + ret = hmm_range_snapshot(&range); + if (ret) { + up_read(&mm->mmap_sem); return ret; + } take_lock(driver->update); if (!hmm_vma_range_done(vma, &range)) { release_lock(driver->update); + up_read(&mm->mmap_sem); goto again; } // Use pfns array content to update device page table release_lock(driver->update); + up_read(&mm->mmap_sem); return 0; } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 716fc61fa6d4..32206b0b1bfd 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -365,11 +365,11 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); * table invalidation serializes on it. * * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_vma_get_pfns() WITHOUT ERROR ! + * hmm_range_snapshot() WITHOUT ERROR ! * * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! */ -int hmm_vma_get_pfns(struct hmm_range *range); +long hmm_range_snapshot(struct hmm_range *range); bool hmm_vma_range_done(struct hmm_range *range); diff --git a/mm/hmm.c b/mm/hmm.c index 84e0577a912a..bd957a9f10d1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -702,23 +702,25 @@ static void hmm_pfns_special(struct hmm_range *range) } /* - * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses - * @range: range being snapshotted - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * vma permission, 0 success + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range + * Returns: number of valid pages in range->pfns[] (from range start + * address). This may be zero. If the return value is negative, + * then one of the following values may be returned: + * + * -EINVAL invalid arguments or mm or virtual address are in an + * invalid vma (ie either hugetlbfs or device file vma). + * -EPERM For example, asking for write, when the range is + * read-only + * -EAGAIN Caller needs to retry + * -EFAULT Either no valid vma exists for this range, or it is + * illegal to access the range * * This snapshots the CPU page table for a range of virtual addresses. Snapshot * validity is tracked by range struct. See hmm_vma_range_done() for further * information. - * - * The range struct is initialized here. It tracks the CPU page table, but only - * if the function returns success (0), in which case the caller must then call - * hmm_vma_range_done() to stop CPU page table update tracking on this range. - * - * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS - * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! */ -int hmm_vma_get_pfns(struct hmm_range *range) +long hmm_range_snapshot(struct hmm_range *range) { struct vm_area_struct *vma = range->vma; struct hmm_vma_walk hmm_vma_walk; @@ -772,6 +774,7 @@ int hmm_vma_get_pfns(struct hmm_range *range) hmm_vma_walk.fault = false; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; + hmm_vma_walk.last = range->start; mm_walk.vma = vma; mm_walk.mm = vma->vm_mm; @@ -788,9 +791,9 @@ int hmm_vma_get_pfns(struct hmm_range *range) * function return 0). 
*/ range->hmm = hmm; - return 0; + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -EXPORT_SYMBOL(hmm_vma_get_pfns); +EXPORT_SYMBOL(hmm_range_snapshot); /* * hmm_vma_range_done() - stop tracking change to CPU page table over a range -- cgit v1.2.3 From 73231612dc7c907bd96880a4086ee55eef6b6888 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:19:58 -0700 Subject: mm/hmm: improve and rename hmm_vma_fault() to hmm_range_fault() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minor optimization around hmm_pte_need_fault(). Rename for consistency between code, comments and documentation. Also improves the comments on all the possible returns values. Improve the function by returning the number of populated entries in pfns array. Link: http://lkml.kernel.org/r/20190403193318.16478-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.rst | 8 +---- include/linux/hmm.h | 13 ++++++- mm/hmm.c | 91 +++++++++++++++++++++--------------------------- 3 files changed, 52 insertions(+), 60 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index d9b27bdadd1b..61f073215a8d 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -190,13 +190,7 @@ When the device driver wants to populate a range of virtual addresses, it can use either:: long hmm_range_snapshot(struct hmm_range *range); - int hmm_vma_fault(struct vm_area_struct *vma, - struct hmm_range *range, - unsigned long start, - unsigned long end, - hmm_pfn_t *pfns, - bool write, - bool block); + long hmm_range_fault(struct hmm_range *range, bool block); The first one (hmm_range_snapshot()) will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 32206b0b1bfd..e9afd23c2eac 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -391,7 +391,18 @@ bool hmm_vma_range_done(struct hmm_range *range); * * See the function description in mm/hmm.c for further documentation. */ -int hmm_vma_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, bool block); + +/* This is a temporary helper to avoid merge conflict between trees. */ +static inline int hmm_vma_fault(struct hmm_range *range, bool block) +{ + long ret = hmm_range_fault(range, block); + if (ret == -EBUSY) + ret = -EAGAIN; + else if (ret == -EAGAIN) + ret = -EBUSY; + return ret < 0 ? ret : 0; +} /* Below are for HMM internal use only! Not to be used by device driver! */ void hmm_mm_destroy(struct mm_struct *mm); diff --git a/mm/hmm.c b/mm/hmm.c index bd957a9f10d1..b7e4034d96e1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -340,13 +340,13 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, flags |= write_fault ? FAULT_FLAG_WRITE : 0; ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) - return -EBUSY; + return -EAGAIN; if (ret & VM_FAULT_ERROR) { *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; } - return -EAGAIN; + return -EBUSY; } static int hmm_pfns_bad(unsigned long addr, @@ -372,7 +372,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? 
* @walk: mm_walk structure - * Returns: 0 on success, -EAGAIN after page fault, or page fault error + * Returns: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -395,12 +395,12 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, ret = hmm_vma_do_fault(walk, addr, write_fault, &pfns[i]); - if (ret != -EAGAIN) + if (ret != -EBUSY) return ret; } } - return (fault || write_fault) ? -EAGAIN : 0; + return (fault || write_fault) ? -EBUSY : 0; } static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, @@ -531,11 +531,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, uint64_t orig_pfn = *pfn; *pfn = range->values[HMM_PFN_NONE]; - cpu_flags = pte_to_hmm_pfn_flags(range, pte); - hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, - &fault, &write_fault); + fault = write_fault = false; if (pte_none(pte)) { + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0, + &fault, &write_fault); if (fault || write_fault) goto fault; return 0; @@ -574,7 +574,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm, pmdp, addr); - return -EAGAIN; + return -EBUSY; } return 0; } @@ -582,6 +582,10 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, /* Report error for everything else */ *pfn = range->values[HMM_PFN_ERROR]; return -EFAULT; + } else { + cpu_flags = pte_to_hmm_pfn_flags(range, pte); + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); } if (fault || write_fault) @@ -632,7 +636,7 @@ again: if (fault || write_fault) { hmm_vma_walk->last = addr; pmd_migration_entry_wait(vma->vm_mm, pmdp); - return -EAGAIN; + return -EBUSY; } return 0; } else if (!pmd_present(pmd)) @@ -860,53 +864,34 @@ bool hmm_vma_range_done(struct hmm_range *range) EXPORT_SYMBOL(hmm_vma_range_done); /* - * hmm_vma_fault() - try to fault some address in a virtual address range + * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) + * Returns: number of valid pages in range->pfns[] (from range start + * address). This may be zero. If the return value is negative, + * then one of the following values may be returned: + * + * -EINVAL invalid arguments or mm or virtual address are in an + * invalid vma (ie either hugetlbfs or device file vma). + * -ENOMEM: Out of memory. + * -EPERM: Invalid permission (for instance asking for write and + * range is read only). + * -EAGAIN: If you need to retry and mmap_sem was drop. This can only + * happens if block argument is false. + * -EBUSY: If the the range is being invalidated and you should wait + * for invalidation to finish. + * -EFAULT: Invalid (ie either no valid vma or it is illegal to access + * that range), number of valid pages in range->pfns[] (from + * range start address). * * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs. + * any memory migration if the memory being faulted is not accessible by CPUs + * and caller does not ask for migration. 
* * On error, for one virtual address in the range, the function will mark the * corresponding HMM pfn entry with an error flag. - * - * Expected use pattern: - * retry: - * down_read(&mm->mmap_sem); - * // Find vma and address device wants to fault, initialize hmm_pfn_t - * // array accordingly - * ret = hmm_vma_fault(range, write, block); - * switch (ret) { - * case -EAGAIN: - * hmm_vma_range_done(range); - * // You might want to rate limit or yield to play nicely, you may - * // also commit any valid pfn in the array assuming that you are - * // getting true from hmm_vma_range_monitor_end() - * goto retry; - * case 0: - * break; - * case -ENOMEM: - * case -EINVAL: - * case -EPERM: - * default: - * // Handle error ! - * up_read(&mm->mmap_sem) - * return; - * } - * // Take device driver lock that serialize device page table update - * driver_lock_device_page_table_update(); - * hmm_vma_range_done(range); - * // Commit pfns we got from hmm_vma_fault() - * driver_unlock_device_page_table_update(); - * up_read(&mm->mmap_sem) - * - * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) - * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! - * - * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct hmm_range *range, bool block) +long hmm_range_fault(struct hmm_range *range, bool block) { struct vm_area_struct *vma = range->vma; unsigned long start = range->start; @@ -978,7 +963,8 @@ int hmm_vma_fault(struct hmm_range *range, bool block) do { ret = walk_page_range(start, range->end, &mm_walk); start = hmm_vma_walk.last; - } while (ret == -EAGAIN); + /* Keep trying while the range is valid. */ + } while (ret == -EBUSY && range->valid); if (ret) { unsigned long i; @@ -988,6 +974,7 @@ int hmm_vma_fault(struct hmm_range *range, bool block) range->end); hmm_vma_range_done(range); hmm_put(hmm); + return ret; } else { /* * Transfer hmm reference to the range struct it will be drop @@ -997,9 +984,9 @@ int hmm_vma_fault(struct hmm_range *range, bool block) range->hmm = hmm; } - return ret; + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -EXPORT_SYMBOL(hmm_vma_fault); +EXPORT_SYMBOL(hmm_range_fault); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- cgit v1.2.3 From a3e0d41c2b1f86b483b202d642140d8b86d677ca Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:01 -0700 Subject: mm/hmm: improve driver API to work and wait over a range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A common use case for HMM mirror is user trying to mirror a range and before they could program the hardware it get invalidated by some core mm event. Instead of having user re-try right away to mirror the range provide a completion mechanism for them to wait for any active invalidation affecting the range. This also changes how hmm_range_snapshot() and hmm_range_fault() works by not relying on vma so that we can drop the mmap_sem when waiting and lookup the vma again on retry. 
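The retry pattern this series converges on can be sketched as below. Every hmm_* call and lock here is a local stub (assumed names, not the real API), so only the shape of the loop (register, wait for validity, snapshot under the mmap_sem, commit only while range.valid still holds under the driver lock) reflects the change:

/*
 * Compile-checkable sketch of the wait-and-retry usage pattern. All helpers
 * are stubs standing in for hmm_range_register()/hmm_range_snapshot() and
 * the mm/driver locks; they are NOT the kernel functions.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_range { bool valid; };

static void range_register(struct toy_range *r)         { r->valid = false; }
static void range_wait_until_valid(struct toy_range *r) { r->valid = true; }
static int  range_snapshot(struct toy_range *r)         { (void)r; return 0; }
static void range_unregister(struct toy_range *r)       { (void)r; }
static void mmap_read_lock(void)   { }
static void mmap_read_unlock(void) { }
static void driver_lock(void)      { }
static void driver_unlock(void)    { }

static int mirror_range(struct toy_range *range)
{
	int ret;

	range_register(range);
again:
	range_wait_until_valid(range);	/* the real API takes a timeout */
	mmap_read_lock();
	ret = range_snapshot(range);
	if (ret) {
		mmap_read_unlock();
		range_unregister(range);
		return ret;
	}

	driver_lock();
	if (!range->valid) {
		/* Invalidated while we worked: drop the locks and retry. */
		driver_unlock();
		mmap_read_unlock();
		goto again;
	}
	/* ... commit pfns to the device page table here ... */
	driver_unlock();
	mmap_read_unlock();
	range_unregister(range);
	return 0;
}

int main(void)
{
	struct toy_range r;

	return mirror_range(&r) ? 1 : 0;
}

In the real driver the lock taken before checking range.valid must be the same lock used in the sync_cpu_device_pagetables() callback, otherwise a concurrent CPU page table update can slip between the check and the device page table commit.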
Link: http://lkml.kernel.org/r/20190403193318.16478-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Arnd Bergmann Cc: Balbir Singh Cc: Ira Weiny Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.rst | 25 ++- include/linux/hmm.h | 145 +++++++++---- mm/hmm.c | 531 +++++++++++++++++++++++------------------------ 3 files changed, 387 insertions(+), 314 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 61f073215a8d..945d5fb6d14a 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -217,17 +217,33 @@ respect in order to keep things properly synchronized. The usage pattern is:: range.flags = ...; range.values = ...; range.pfn_shift = ...; + hmm_range_register(&range); + + /* + * Just wait for range to be valid, safe to ignore return value as we + * will use the return value of hmm_range_snapshot() below under the + * mmap_sem to ascertain the validity of the range. + */ + hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); again: down_read(&mm->mmap_sem); - range.vma = ...; ret = hmm_range_snapshot(&range); if (ret) { up_read(&mm->mmap_sem); + if (ret == -EAGAIN) { + /* + * No need to check hmm_range_wait_until_valid() return value + * on retry we will get proper error with hmm_range_snapshot() + */ + hmm_range_wait_until_valid(&range, TIMEOUT_IN_MSEC); + goto again; + } + hmm_mirror_unregister(&range); return ret; } take_lock(driver->update); - if (!hmm_vma_range_done(vma, &range)) { + if (!range.valid) { release_lock(driver->update); up_read(&mm->mmap_sem); goto again; @@ -235,14 +251,15 @@ respect in order to keep things properly synchronized. The usage pattern is:: // Use pfns array content to update device page table + hmm_mirror_unregister(&range); release_lock(driver->update); up_read(&mm->mmap_sem); return 0; } The driver->update lock is the same lock that the driver takes inside its -update() callback. That lock must be held before hmm_vma_range_done() to avoid -any race with a concurrent CPU page table update. +update() callback. That lock must be held before checking the range.valid +field to avoid any race with a concurrent CPU page table update. HMM implements all this on top of the mmu_notifier API because we wanted a simpler API and also to be able to perform optimizations latter on like doing diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e9afd23c2eac..ec4bfa91648f 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -77,8 +77,34 @@ #include #include #include +#include -struct hmm; + +/* + * struct hmm - HMM per mm struct + * + * @mm: mm struct this HMM struct is bound to + * @lock: lock protecting ranges list + * @ranges: list of range being snapshotted + * @mirrors: list of mirrors for this mm + * @mmu_notifier: mmu notifier to track updates to CPU page table + * @mirrors_sem: read/write semaphore protecting the mirrors list + * @wq: wait queue for user waiting on a range invalidation + * @notifiers: count of active mmu notifiers + * @dead: is the mm dead ? 
+ */ +struct hmm { + struct mm_struct *mm; + struct kref kref; + struct mutex lock; + struct list_head ranges; + struct list_head mirrors; + struct mmu_notifier mmu_notifier; + struct rw_semaphore mirrors_sem; + wait_queue_head_t wq; + long notifiers; + bool dead; +}; /* * hmm_pfn_flag_e - HMM flag enums @@ -155,6 +181,38 @@ struct hmm_range { bool valid; }; +/* + * hmm_range_wait_until_valid() - wait for range to be valid + * @range: range affected by invalidation to wait on + * @timeout: time out for wait in ms (ie abort wait after that period of time) + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_wait_until_valid(struct hmm_range *range, + unsigned long timeout) +{ + /* Check if mm is dead ? */ + if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { + range->valid = false; + return false; + } + if (range->valid) + return true; + wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, + msecs_to_jiffies(timeout)); + /* Return current valid status just in case we get lucky */ + return range->valid; +} + +/* + * hmm_range_valid() - test if a range is valid or not + * @range: range + * Returns: true if the range is valid, false otherwise. + */ +static inline bool hmm_range_valid(struct hmm_range *range) +{ + return range->valid; +} + /* * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn * @range: range use to decode HMM pfn value @@ -357,51 +415,66 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); /* - * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device - * driver lock that serializes device page table updates, then call - * hmm_vma_range_done(), to check if the snapshot is still valid. The same - * device driver page table update lock must also be used in the - * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page - * table invalidation serializes on it. - * - * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL - * hmm_range_snapshot() WITHOUT ERROR ! - * - * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID ! + * Please see Documentation/vm/hmm.rst for how to use the range API. */ +int hmm_range_register(struct hmm_range *range, + struct mm_struct *mm, + unsigned long start, + unsigned long end); +void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); -bool hmm_vma_range_done(struct hmm_range *range); - +long hmm_range_fault(struct hmm_range *range, bool block); /* - * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will - * not migrate any device memory back to system memory. The HMM pfn array will - * be updated with the fault result and current snapshot of the CPU page table - * for the range. - * - * The mmap_sem must be taken in read mode before entering and it might be - * dropped by the function if the block argument is false. In that case, the - * function returns -EAGAIN. - * - * Return value does not reflect if the fault was successful for every single - * address or not. Therefore, the caller must to inspect the HMM pfn array to - * determine fault status for each address. - * - * Trying to fault inside an invalid vma will result in -EINVAL. + * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range * - * See the function description in mm/hmm.c for further documentation. 
+ * When waiting for mmu notifiers we need some kind of time out otherwise we + * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to + * wait already. */ -long hmm_range_fault(struct hmm_range *range, bool block); +#define HMM_RANGE_DEFAULT_TIMEOUT 1000 + +/* This is a temporary helper to avoid merge conflict between trees. */ +static inline bool hmm_vma_range_done(struct hmm_range *range) +{ + bool ret = hmm_range_valid(range); + + hmm_range_unregister(range); + return ret; +} /* This is a temporary helper to avoid merge conflict between trees. */ static inline int hmm_vma_fault(struct hmm_range *range, bool block) { - long ret = hmm_range_fault(range, block); - if (ret == -EBUSY) - ret = -EAGAIN; - else if (ret == -EAGAIN) - ret = -EBUSY; - return ret < 0 ? ret : 0; + long ret; + + ret = hmm_range_register(range, range->vma->vm_mm, + range->start, range->end); + if (ret) + return (int)ret; + + if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { + /* + * The mmap_sem was taken by driver we release it here and + * returns -EAGAIN which correspond to mmap_sem have been + * drop in the old API. + */ + up_read(&range->vma->vm_mm->mmap_sem); + return -EAGAIN; + } + + ret = hmm_range_fault(range, block); + if (ret <= 0) { + if (ret == -EBUSY || !ret) { + /* Same as above drop mmap_sem to match old API. */ + up_read(&range->vma->vm_mm->mmap_sem); + ret = -EBUSY; + } else if (ret == -EAGAIN) + ret = -EBUSY; + hmm_range_unregister(range); + return ret; + } + return 0; } /* Below are for HMM internal use only! Not to be used by device driver! */ diff --git a/mm/hmm.c b/mm/hmm.c index b7e4034d96e1..3e07f32b94f8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -38,26 +38,6 @@ #if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -/* - * struct hmm - HMM per mm struct - * - * @mm: mm struct this HMM struct is bound to - * @lock: lock protecting ranges list - * @ranges: list of range being snapshotted - * @mirrors: list of mirrors for this mm - * @mmu_notifier: mmu notifier to track updates to CPU page table - * @mirrors_sem: read/write semaphore protecting the mirrors list - */ -struct hmm { - struct mm_struct *mm; - struct kref kref; - spinlock_t lock; - struct list_head ranges; - struct list_head mirrors; - struct mmu_notifier mmu_notifier; - struct rw_semaphore mirrors_sem; -}; - static inline struct hmm *mm_get_hmm(struct mm_struct *mm) { struct hmm *hmm = READ_ONCE(mm->hmm); @@ -91,12 +71,15 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) return NULL; + init_waitqueue_head(&hmm->wq); INIT_LIST_HEAD(&hmm->mirrors); init_rwsem(&hmm->mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); - spin_lock_init(&hmm->lock); + mutex_init(&hmm->lock); kref_init(&hmm->kref); + hmm->notifiers = 0; + hmm->dead = false; hmm->mm = mm; spin_lock(&mm->page_table_lock); @@ -158,6 +141,7 @@ void hmm_mm_destroy(struct mm_struct *mm) mm->hmm = NULL; if (hmm) { hmm->mm = NULL; + hmm->dead = true; spin_unlock(&mm->page_table_lock); hmm_put(hmm); return; @@ -166,43 +150,22 @@ void hmm_mm_destroy(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } -static int hmm_invalidate_range(struct hmm *hmm, bool device, - const struct hmm_update *update) +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { + struct hmm *hmm = mm_get_hmm(mm); struct hmm_mirror *mirror; struct hmm_range *range; - spin_lock(&hmm->lock); - list_for_each_entry(range, 
&hmm->ranges, list) { - if (update->end < range->start || update->start >= range->end) - continue; + /* Report this HMM as dying. */ + hmm->dead = true; + /* Wake-up everyone waiting on any range. */ + mutex_lock(&hmm->lock); + list_for_each_entry(range, &hmm->ranges, list) { range->valid = false; } - spin_unlock(&hmm->lock); - - if (!device) - return 0; - - down_read(&hmm->mirrors_sem); - list_for_each_entry(mirror, &hmm->mirrors, list) { - int ret; - - ret = mirror->ops->sync_cpu_device_pagetables(mirror, update); - if (!update->blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); - return -EAGAIN; - } - } - up_read(&hmm->mirrors_sem); - - return 0; -} - -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) -{ - struct hmm_mirror *mirror; - struct hmm *hmm = mm_get_hmm(mm); + wake_up_all(&hmm->wq); + mutex_unlock(&hmm->lock); down_write(&hmm->mirrors_sem); mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, @@ -228,36 +191,80 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static int hmm_invalidate_range_start(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) + const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(range->mm); + struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm_mirror *mirror; struct hmm_update update; - int ret; + struct hmm_range *range; + int ret = 0; VM_BUG_ON(!hmm); - update.start = range->start; - update.end = range->end; + update.start = nrange->start; + update.end = nrange->end; update.event = HMM_UPDATE_INVALIDATE; - update.blockable = range->blockable; - ret = hmm_invalidate_range(hmm, true, &update); + update.blockable = nrange->blockable; + + if (nrange->blockable) + mutex_lock(&hmm->lock); + else if (!mutex_trylock(&hmm->lock)) { + ret = -EAGAIN; + goto out; + } + hmm->notifiers++; + list_for_each_entry(range, &hmm->ranges, list) { + if (update.end < range->start || update.start >= range->end) + continue; + + range->valid = false; + } + mutex_unlock(&hmm->lock); + + if (nrange->blockable) + down_read(&hmm->mirrors_sem); + else if (!down_read_trylock(&hmm->mirrors_sem)) { + ret = -EAGAIN; + goto out; + } + list_for_each_entry(mirror, &hmm->mirrors, list) { + int ret; + + ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + if (!update.blockable && ret == -EAGAIN) { + up_read(&hmm->mirrors_sem); + ret = -EAGAIN; + goto out; + } + } + up_read(&hmm->mirrors_sem); + +out: hmm_put(hmm); return ret; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, - const struct mmu_notifier_range *range) + const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(range->mm); - struct hmm_update update; + struct hmm *hmm = mm_get_hmm(nrange->mm); VM_BUG_ON(!hmm); - update.start = range->start; - update.end = range->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = true; - hmm_invalidate_range(hmm, false, &update); + mutex_lock(&hmm->lock); + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; + + list_for_each_entry(range, &hmm->ranges, list) { + if (range->valid) + continue; + range->valid = true; + } + wake_up_all(&hmm->wq); + } + mutex_unlock(&hmm->lock); + hmm_put(hmm); } @@ -409,7 +416,6 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { struct hmm_range *range = hmm_vma_walk->range; - *fault = *write_fault = false; if (!hmm_vma_walk->fault) return; @@ -448,10 +454,11 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, return; } + 
*fault = *write_fault = false; for (i = 0; i < npages; ++i) { hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, fault, write_fault); - if ((*fault) || (*write_fault)) + if ((*write_fault)) return; } } @@ -706,162 +713,155 @@ static void hmm_pfns_special(struct hmm_range *range) } /* - * hmm_range_snapshot() - snapshot CPU page table for a range + * hmm_range_register() - start tracking change to CPU page table over a range * @range: range - * Returns: number of valid pages in range->pfns[] (from range start - * address). This may be zero. If the return value is negative, - * then one of the following values may be returned: + * @mm: the mm struct for the range of virtual address + * @start: start virtual address (inclusive) + * @end: end virtual address (exclusive) + * Returns 0 on success, -EFAULT if the address space is no longer valid * - * -EINVAL invalid arguments or mm or virtual address are in an - * invalid vma (ie either hugetlbfs or device file vma). - * -EPERM For example, asking for write, when the range is - * read-only - * -EAGAIN Caller needs to retry - * -EFAULT Either no valid vma exists for this range, or it is - * illegal to access the range - * - * This snapshots the CPU page table for a range of virtual addresses. Snapshot - * validity is tracked by range struct. See hmm_vma_range_done() for further - * information. + * Track updates to the CPU page table see include/linux/hmm.h */ -long hmm_range_snapshot(struct hmm_range *range) +int hmm_range_register(struct hmm_range *range, + struct mm_struct *mm, + unsigned long start, + unsigned long end) { - struct vm_area_struct *vma = range->vma; - struct hmm_vma_walk hmm_vma_walk; - struct mm_walk mm_walk; - struct hmm *hmm; - + range->start = start & PAGE_MASK; + range->end = end & PAGE_MASK; + range->valid = false; range->hmm = NULL; - /* Sanity check, this really should not happen ! */ - if (range->start < vma->vm_start || range->start >= vma->vm_end) - return -EINVAL; - if (range->end < vma->vm_start || range->end > vma->vm_end) + if (range->start >= range->end) return -EINVAL; - hmm = hmm_get_or_create(vma->vm_mm); - if (!hmm) - return -ENOMEM; + range->start = start; + range->end = end; + + range->hmm = hmm_get_or_create(mm); + if (!range->hmm) + return -EFAULT; /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL) { - hmm_put(hmm); - return -EINVAL; + if (range->hmm->mm == NULL || range->hmm->dead) { + hmm_put(range->hmm); + return -EFAULT; } - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) { - hmm_pfns_special(range); - hmm_put(hmm); - return -EINVAL; - } + /* Initialize range to track CPU page table update */ + mutex_lock(&range->hmm->lock); - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it does - * not allow write access, either. Architecture that allow - * write without read access are not supported by HMM, because - * operations such has atomic access would not work. 
- */ - hmm_pfns_clear(range, range->pfns, range->start, range->end); - hmm_put(hmm); - return -EPERM; - } + list_add_rcu(&range->list, &range->hmm->ranges); - /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); - - hmm_vma_walk.fault = false; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - hmm_vma_walk.last = range->start; - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; - - walk_page_range(range->start, range->end, &mm_walk); /* - * Transfer hmm reference to the range struct it will be drop inside - * the hmm_vma_range_done() function (which _must_ be call if this - * function return 0). + * If there are any concurrent notifiers we have to wait for them for + * the range to be valid (see hmm_range_wait_until_valid()). */ - range->hmm = hmm; - return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + if (!range->hmm->notifiers) + range->valid = true; + mutex_unlock(&range->hmm->lock); + + return 0; } -EXPORT_SYMBOL(hmm_range_snapshot); +EXPORT_SYMBOL(hmm_range_register); /* - * hmm_vma_range_done() - stop tracking change to CPU page table over a range - * @range: range being tracked - * Returns: false if range data has been invalidated, true otherwise + * hmm_range_unregister() - stop tracking change to CPU page table over a range + * @range: range * * Range struct is used to track updates to the CPU page table after a call to - * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done - * using the data, or wants to lock updates to the data it got from those - * functions, it must call the hmm_vma_range_done() function, which will then - * stop tracking CPU page table updates. - * - * Note that device driver must still implement general CPU page table update - * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using - * the mmu_notifier API directly. - * - * CPU page table update tracking done through hmm_range is only temporary and - * to be used while trying to duplicate CPU page table contents for a range of - * virtual addresses. - * - * There are two ways to use this : - * again: - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); - * trans = device_build_page_table_update_transaction(pfns); - * device_page_table_lock(); - * if (!hmm_vma_range_done(range)) { - * device_page_table_unlock(); - * goto again; - * } - * device_commit_transaction(trans); - * device_page_table_unlock(); - * - * Or: - * hmm_vma_get_pfns(range); or hmm_vma_fault(...); - * device_page_table_lock(); - * hmm_vma_range_done(range); - * device_update_page_table(range->pfns); - * device_page_table_unlock(); + * hmm_range_register(). See include/linux/hmm.h for how to use it. */ -bool hmm_vma_range_done(struct hmm_range *range) +void hmm_range_unregister(struct hmm_range *range) { - bool ret = false; - /* Sanity check this really should not happen. */ - if (range->hmm == NULL || range->end <= range->start) { - BUG(); - return false; - } + if (range->hmm == NULL || range->end <= range->start) + return; - spin_lock(&range->hmm->lock); + mutex_lock(&range->hmm->lock); list_del_rcu(&range->list); - ret = range->valid; - spin_unlock(&range->hmm->lock); + mutex_unlock(&range->hmm->lock); - /* Is the mm still alive ? 
*/ - if (range->hmm->mm == NULL) - ret = false; - - /* Drop reference taken by hmm_vma_fault() or hmm_vma_get_pfns() */ + /* Drop reference taken by hmm_range_register() */ + range->valid = false; hmm_put(range->hmm); range->hmm = NULL; - return ret; } -EXPORT_SYMBOL(hmm_vma_range_done); +EXPORT_SYMBOL(hmm_range_unregister); + +/* + * hmm_range_snapshot() - snapshot CPU page table for a range + * @range: range + * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * permission (for instance asking for write and range is read only), + * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid + * vma or it is illegal to access that range), number of valid pages + * in range->pfns[] (from range start address). + * + * This snapshots the CPU page table for a range of virtual addresses. Snapshot + * validity is tracked by range struct. See in include/linux/hmm.h for example + * on how to use. + */ +long hmm_range_snapshot(struct hmm_range *range) +{ + unsigned long start = range->start, end; + struct hmm_vma_walk hmm_vma_walk; + struct hmm *hmm = range->hmm; + struct vm_area_struct *vma; + struct mm_walk mm_walk; + + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL || hmm->dead) + return -EFAULT; + + do { + /* If range is no longer valid force retry. */ + if (!range->valid) + return -EAGAIN; + + vma = find_vma(hmm->mm, start); + if (vma == NULL || (vma->vm_flags & VM_SPECIAL)) + return -EFAULT; + + /* FIXME support hugetlb fs/dax */ + if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) { + hmm_pfns_special(range); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it + * does not allow write access, either. HMM does not + * support architecture that allow write without read. + */ + hmm_pfns_clear(range, range->pfns, + range->start, range->end); + return -EPERM; + } + + range->vma = vma; + hmm_vma_walk.last = start; + hmm_vma_walk.fault = false; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + end = min(range->end, vma->vm_end); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + walk_page_range(start, end, &mm_walk); + start = end; + } while (start < range->end); + + return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; +} +EXPORT_SYMBOL(hmm_range_snapshot); /* * hmm_range_fault() - try to fault some address in a virtual address range @@ -893,96 +893,79 @@ EXPORT_SYMBOL(hmm_vma_range_done); */ long hmm_range_fault(struct hmm_range *range, bool block) { - struct vm_area_struct *vma = range->vma; - unsigned long start = range->start; + unsigned long start = range->start, end; struct hmm_vma_walk hmm_vma_walk; + struct hmm *hmm = range->hmm; + struct vm_area_struct *vma; struct mm_walk mm_walk; - struct hmm *hmm; int ret; - range->hmm = NULL; - - /* Sanity check, this really should not happen ! */ - if (range->start < vma->vm_start || range->start >= vma->vm_end) - return -EINVAL; - if (range->end < vma->vm_start || range->end > vma->vm_end) - return -EINVAL; + /* Check if hmm_mm_destroy() was call. */ + if (hmm->mm == NULL || hmm->dead) + return -EFAULT; - hmm = hmm_get_or_create(vma->vm_mm); - if (!hmm) { - hmm_pfns_clear(range, range->pfns, range->start, range->end); - return -ENOMEM; - } + do { + /* If range is no longer valid force retry. 
*/ + if (!range->valid) { + up_read(&hmm->mm->mmap_sem); + return -EAGAIN; + } - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL) { - hmm_put(hmm); - return -EINVAL; - } + vma = find_vma(hmm->mm, start); + if (vma == NULL || (vma->vm_flags & VM_SPECIAL)) + return -EFAULT; - /* FIXME support hugetlb fs */ - if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) || - vma_is_dax(vma)) { - hmm_pfns_special(range); - hmm_put(hmm); - return -EINVAL; - } + /* FIXME support hugetlb fs/dax */ + if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) { + hmm_pfns_special(range); + return -EINVAL; + } - if (!(vma->vm_flags & VM_READ)) { - /* - * If vma do not allow read access, then assume that it does - * not allow write access, either. Architecture that allow - * write without read access are not supported by HMM, because - * operations such has atomic access would not work. - */ - hmm_pfns_clear(range, range->pfns, range->start, range->end); - hmm_put(hmm); - return -EPERM; - } + if (!(vma->vm_flags & VM_READ)) { + /* + * If vma do not allow read access, then assume that it + * does not allow write access, either. HMM does not + * support architecture that allow write without read. + */ + hmm_pfns_clear(range, range->pfns, + range->start, range->end); + return -EPERM; + } - /* Initialize range to track CPU page table update */ - spin_lock(&hmm->lock); - range->valid = true; - list_add_rcu(&range->list, &hmm->ranges); - spin_unlock(&hmm->lock); - - hmm_vma_walk.fault = true; - hmm_vma_walk.block = block; - hmm_vma_walk.range = range; - mm_walk.private = &hmm_vma_walk; - hmm_vma_walk.last = range->start; - - mm_walk.vma = vma; - mm_walk.mm = vma->vm_mm; - mm_walk.pte_entry = NULL; - mm_walk.test_walk = NULL; - mm_walk.hugetlb_entry = NULL; - mm_walk.pmd_entry = hmm_vma_walk_pmd; - mm_walk.pte_hole = hmm_vma_walk_hole; + range->vma = vma; + hmm_vma_walk.last = start; + hmm_vma_walk.fault = true; + hmm_vma_walk.block = block; + hmm_vma_walk.range = range; + mm_walk.private = &hmm_vma_walk; + end = min(range->end, vma->vm_end); + + mm_walk.vma = vma; + mm_walk.mm = vma->vm_mm; + mm_walk.pte_entry = NULL; + mm_walk.test_walk = NULL; + mm_walk.hugetlb_entry = NULL; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + + do { + ret = walk_page_range(start, end, &mm_walk); + start = hmm_vma_walk.last; + + /* Keep trying while the range is valid. */ + } while (ret == -EBUSY && range->valid); + + if (ret) { + unsigned long i; + + i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; + hmm_pfns_clear(range, &range->pfns[i], + hmm_vma_walk.last, range->end); + return ret; + } + start = end; - do { - ret = walk_page_range(start, range->end, &mm_walk); - start = hmm_vma_walk.last; - /* Keep trying while the range is valid. */ - } while (ret == -EBUSY && range->valid); - - if (ret) { - unsigned long i; - - i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; - hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, - range->end); - hmm_vma_range_done(range); - hmm_put(hmm); - return ret; - } else { - /* - * Transfer hmm reference to the range struct it will be drop - * inside the hmm_vma_range_done() function (which _must_ be - * call if this function return 0). 
- */ - range->hmm = hmm; - } + } while (start < range->end); return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } -- cgit v1.2.3 From 023a019a9b4e90b9df8ed5be591787b5c914d74f Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:05 -0700 Subject: mm/hmm: add default fault flags to avoid the need to pre-fill pfns arrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HMM mirror API can be use in two fashions. The first one where the HMM user coalesce multiple page faults into one request and set flags per pfns for of those faults. The second one where the HMM user want to pre-fault a range with specific flags. For the latter one it is a waste to have the user pre-fill the pfn arrays with a default flags value. This patch adds a default flags value allowing user to set them for a range without having to pre-fill the pfn array. Link: http://lkml.kernel.org/r/20190403193318.16478-8-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hmm.rst | 35 +++++++++++++++++++++++++++++++++++ include/linux/hmm.h | 13 +++++++++++++ mm/hmm.c | 12 ++++++++++++ 3 files changed, 60 insertions(+) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 945d5fb6d14a..ec1efa32af3c 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -276,6 +276,41 @@ report commands as executed is serialized (there is no point in doing this concurrently). +Leverage default_flags and pfn_flags_mask +========================================= + +The hmm_range struct has 2 fields default_flags and pfn_flags_mask that allows +to set fault or snapshot policy for a whole range instead of having to set them +for each entries in the range. + +For instance if the device flags for device entries are: + VALID (1 << 63) + WRITE (1 << 62) + +Now let say that device driver wants to fault with at least read a range then +it does set: + range->default_flags = (1 << 63) + range->pfn_flags_mask = 0; + +and calls hmm_range_fault() as described above. This will fill fault all page +in the range with at least read permission. + +Now let say driver wants to do the same except for one page in the range for +which its want to have write. Now driver set: + range->default_flags = (1 << 63); + range->pfn_flags_mask = (1 << 62); + range->pfns[index_of_write] = (1 << 62); + +With this HMM will fault in all page with at least read (ie valid) and for the +address == range->start + (index_of_write << PAGE_SHIFT) it will fault with +write permission ie if the CPU pte does not have write permission set then HMM +will call handle_mm_fault(). + +Note that HMM will populate the pfns array with write permission for any entry +that have write permission within the CPU pte no matter what are the values set +in default_flags or pfn_flags_mask. 
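A minimal sketch of the two policies described above, written against the default_flags/pfn_flags_mask fields this patch adds. The bit values (1 << 63 for VALID, 1 << 62 for WRITE) and the mydev_* function names are the hypothetical ones from the example, not real HMM constants:

    /* Sketch: fault every page in the range with at least read permission. */
    static long mydev_fault_readable(struct hmm_range *range)
    {
        range->default_flags = 1ULL << 63;   /* hypothetical VALID bit */
        range->pfn_flags_mask = 0;           /* ignore per-entry flags */
        return hmm_range_fault(range, true);
    }

    /* Sketch: as above, but additionally require write for one entry. */
    static long mydev_fault_one_writable(struct hmm_range *range,
                                         unsigned long index_of_write)
    {
        range->default_flags = 1ULL << 63;   /* hypothetical VALID bit */
        range->pfn_flags_mask = 1ULL << 62;  /* keep per-entry WRITE bit */
        range->pfns[index_of_write] = 1ULL << 62;
        return hmm_range_fault(range, true);
    }

With pfn_flags_mask set to 0 only default_flags matter; a non-zero mask lets individual pfns[] entries request extra permissions on top of the default.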
+ + Represent and manage device memory from core kernel point of view ================================================================= diff --git a/include/linux/hmm.h b/include/linux/hmm.h index ec4bfa91648f..dee2f8953b2e 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -165,6 +165,8 @@ enum hmm_pfn_value_e { * @pfns: array of pfns (big enough for the range) * @flags: pfn flags to match device driver page table * @values: pfn value for some special case (none, special, error, ...) + * @default_flags: default flags for the range (write, read, ... see hmm doc) + * @pfn_flags_mask: allows to mask pfn flags so that only default_flags matter * @pfn_shifts: pfn shift value (should be <= PAGE_SHIFT) * @valid: pfns array did not change since it has been fill by an HMM function */ @@ -177,6 +179,8 @@ struct hmm_range { uint64_t *pfns; const uint64_t *flags; const uint64_t *values; + uint64_t default_flags; + uint64_t pfn_flags_mask; uint8_t pfn_shift; bool valid; }; @@ -448,6 +452,15 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) { long ret; + /* + * With the old API the driver must set each individual entries with + * the requested flags (valid, write, ...). So here we set the mask to + * keep intact the entries provided by the driver and zero out the + * default_flags. + */ + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + ret = hmm_range_register(range, range->vma->vm_mm, range->start, range->end); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index 3e07f32b94f8..0e21d3594ab6 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -419,6 +419,18 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, if (!hmm_vma_walk->fault) return; + /* + * So we not only consider the individual per page request we also + * consider the default flags requested for the range. The API can + * be use in 2 fashions. The first one where the HMM user coalesce + * multiple page fault into one request and set flags per pfns for + * of those faults. The second one where the HMM user want to pre- + * fault a range with specific flags. For the latter one it is a + * waste to have the user pre-fill the pfn arrays with a default + * flags value. + */ + pfns = (pfns & range->pfn_flags_mask) | range->default_flags; + /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; -- cgit v1.2.3 From 63d5066f6e5a1713d0247ef38f0add545408896b Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:18 -0700 Subject: mm/hmm: mirror hugetlbfs (snapshoting, faulting and DMA mapping) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM mirror is a device driver helpers to mirror range of virtual address. It means that the process jobs running on the device can access the same virtual address as the CPU threads of that process. This patch adds support for hugetlbfs mapping (ie range of virtual address that are mmap of a hugetlbfs). 
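For the hugetlbfs support added here, a driver registers the range with the page shift of the backing huge pages (the new page_shift argument to hmm_range_register() in the diff below), so that one pfns[] entry describes one huge page. A rough sketch, assuming PMD-sized huge pages, with the mydev_* name and error handling purely illustrative:

    /* Sketch: mirror a hugetlbfs mapping, one pfns[] entry per huge page. */
    static int mydev_register_huge_range(struct hmm_range *range,
                                         struct mm_struct *mm,
                                         unsigned long start, unsigned long end)
    {
        int ret;

        /* start/end must be aligned to the huge page size. */
        ret = hmm_range_register(range, mm, start, end, PMD_SHIFT);
        if (ret)
            return ret;

        if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
            hmm_range_unregister(range);
            return -EAGAIN;
        }

        /* Each range->pfns[] slot now covers hmm_range_page_size(range) bytes. */
        return 0;
    }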
[rcampbell@nvidia.com: fix initial PFN for hugetlbfs pages] Link: http://lkml.kernel.org/r/20190419233536.8080-1-rcampbell@nvidia.com Link: http://lkml.kernel.org/r/20190403193318.16478-9-jglisse@redhat.com Signed-off-by: Jérôme Glisse Signed-off-by: Ralph Campbell Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: John Hubbard Cc: Dan Williams Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 27 +++++++++++- mm/hmm.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 134 insertions(+), 16 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index dee2f8953b2e..e5834082de60 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -181,10 +181,31 @@ struct hmm_range { const uint64_t *values; uint64_t default_flags; uint64_t pfn_flags_mask; + uint8_t page_shift; uint8_t pfn_shift; bool valid; }; +/* + * hmm_range_page_shift() - return the page shift for the range + * @range: range being queried + * Returns: page shift (page size = 1 << page shift) for the range + */ +static inline unsigned hmm_range_page_shift(const struct hmm_range *range) +{ + return range->page_shift; +} + +/* + * hmm_range_page_size() - return the page size for the range + * @range: range being queried + * Returns: page size for the range in bytes + */ +static inline unsigned long hmm_range_page_size(const struct hmm_range *range) +{ + return 1UL << hmm_range_page_shift(range); +} + /* * hmm_range_wait_until_valid() - wait for range to be valid * @range: range affected by invalidation to wait on @@ -424,7 +445,8 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror); int hmm_range_register(struct hmm_range *range, struct mm_struct *mm, unsigned long start, - unsigned long end); + unsigned long end, + unsigned page_shift); void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); @@ -462,7 +484,8 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->pfn_flags_mask = -1UL; ret = hmm_range_register(range, range->vma->vm_mm, - range->start, range->end); + range->start, range->end, + PAGE_SHIFT); if (ret) return (int)ret; diff --git a/mm/hmm.c b/mm/hmm.c index 0e21d3594ab6..52e40be56dc7 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -391,11 +391,13 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; - unsigned long i; + unsigned long i, page_size; hmm_vma_walk->last = addr; - i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) { + page_size = hmm_range_page_size(range); + i = (addr - range->start) >> range->page_shift; + + for (; addr < end; addr += page_size, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { int ret; @@ -707,6 +709,69 @@ again: return 0; } +static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ +#ifdef CONFIG_HUGETLB_PAGE + unsigned long addr = start, i, pfn, mask, size, pfn_inc; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + struct hstate *h = hstate_vma(vma); + uint64_t orig_pfn, cpu_flags; + bool fault, write_fault; + spinlock_t 
*ptl; + pte_t entry; + int ret = 0; + + size = 1UL << huge_page_shift(h); + mask = size - 1; + if (range->page_shift != PAGE_SHIFT) { + /* Make sure we are looking at full page. */ + if (start & mask) + return -EINVAL; + if (end < (start + size)) + return -EINVAL; + pfn_inc = size >> PAGE_SHIFT; + } else { + pfn_inc = 1; + size = PAGE_SIZE; + } + + + ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); + entry = huge_ptep_get(pte); + + i = (start - range->start) >> range->page_shift; + orig_pfn = range->pfns[i]; + range->pfns[i] = range->values[HMM_PFN_NONE]; + cpu_flags = pte_to_hmm_pfn_flags(range, entry); + fault = write_fault = false; + hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, + &fault, &write_fault); + if (fault || write_fault) { + ret = -ENOENT; + goto unlock; + } + + pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); + for (; addr < end; addr += size, i++, pfn += pfn_inc) + range->pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + hmm_vma_walk->last = end; + +unlock: + spin_unlock(ptl); + + if (ret == -ENOENT) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); + + return ret; +#else /* CONFIG_HUGETLB_PAGE */ + return -EINVAL; +#endif +} + static void hmm_pfns_clear(struct hmm_range *range, uint64_t *pfns, unsigned long addr, @@ -730,6 +795,7 @@ static void hmm_pfns_special(struct hmm_range *range) * @mm: the mm struct for the range of virtual address * @start: start virtual address (inclusive) * @end: end virtual address (exclusive) + * @page_shift: expect page shift for the range * Returns 0 on success, -EFAULT if the address space is no longer valid * * Track updates to the CPU page table see include/linux/hmm.h @@ -737,16 +803,20 @@ static void hmm_pfns_special(struct hmm_range *range) int hmm_range_register(struct hmm_range *range, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + unsigned page_shift) { - range->start = start & PAGE_MASK; - range->end = end & PAGE_MASK; + unsigned long mask = ((1UL << page_shift) - 1UL); + range->valid = false; range->hmm = NULL; - if (range->start >= range->end) + if ((start & mask) || (end & mask)) + return -EINVAL; + if (start >= end) return -EINVAL; + range->page_shift = page_shift; range->start = start; range->end = end; @@ -816,6 +886,7 @@ EXPORT_SYMBOL(hmm_range_unregister); */ long hmm_range_snapshot(struct hmm_range *range) { + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; unsigned long start = range->start, end; struct hmm_vma_walk hmm_vma_walk; struct hmm *hmm = range->hmm; @@ -832,15 +903,26 @@ long hmm_range_snapshot(struct hmm_range *range) return -EAGAIN; vma = find_vma(hmm->mm, start); - if (vma == NULL || (vma->vm_flags & VM_SPECIAL)) + if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; - /* FIXME support hugetlb fs/dax */ - if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) { + /* FIXME support dax */ + if (vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } + if (is_vm_hugetlb_page(vma)) { + struct hstate *h = hstate_vma(vma); + + if (huge_page_shift(h) != range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + if (!(vma->vm_flags & VM_READ)) { /* * If vma do not allow read access, then assume that it @@ -866,6 +948,7 @@ long hmm_range_snapshot(struct hmm_range *range) mm_walk.hugetlb_entry = NULL; mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; + mm_walk.hugetlb_entry = 
hmm_vma_walk_hugetlb_entry; walk_page_range(start, end, &mm_walk); start = end; @@ -884,7 +967,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * then one of the following values may be returned: * * -EINVAL invalid arguments or mm or virtual address are in an - * invalid vma (ie either hugetlbfs or device file vma). + * invalid vma (for instance device file vma). * -ENOMEM: Out of memory. * -EPERM: Invalid permission (for instance asking for write and * range is read only). @@ -905,6 +988,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); */ long hmm_range_fault(struct hmm_range *range, bool block) { + const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; unsigned long start = range->start, end; struct hmm_vma_walk hmm_vma_walk; struct hmm *hmm = range->hmm; @@ -924,15 +1008,25 @@ long hmm_range_fault(struct hmm_range *range, bool block) } vma = find_vma(hmm->mm, start); - if (vma == NULL || (vma->vm_flags & VM_SPECIAL)) + if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; - /* FIXME support hugetlb fs/dax */ - if (is_vm_hugetlb_page(vma) || vma_is_dax(vma)) { + /* FIXME support dax */ + if (vma_is_dax(vma)) { hmm_pfns_special(range); return -EINVAL; } + if (is_vm_hugetlb_page(vma)) { + if (huge_page_shift(hstate_vma(vma)) != + range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + if (!(vma->vm_flags & VM_READ)) { /* * If vma do not allow read access, then assume that it @@ -959,6 +1053,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) mm_walk.hugetlb_entry = NULL; mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; do { ret = walk_page_range(start, end, &mm_walk); -- cgit v1.2.3 From 992de9a8b7511673156df7d2bb1039dea3b2f7f3 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:21 -0700 Subject: mm/hmm: allow to mirror vma of a file on a DAX backed filesystem MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM mirror is a device driver helpers to mirror range of virtual address. It means that the process jobs running on the device can access the same virtual address as the CPU threads of that process. This patch adds support for mirroring mapping of file that are on a DAX block device (ie range of virtual address that is an mmap of a file in a filesystem on a DAX block device). There is no reason to not support such case when mirroring virtual address on a device. Note that unlike GUP code we do not take page reference hence when we back-off we have nothing to undo. 
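The back-off note above holds because the walk only ever holds dev_pagemap references, never page references. A sketch of that reference pattern, for illustration only (first_pfn/last_pfn are placeholders and the function name is made up); get_dev_pagemap() keeps reusing the pgmap passed in while the pfn stays inside it and only switches references when it does not:

    /* Sketch: hold a single dev_pagemap reference across a run of device pfns.
     * Needs <linux/memremap.h> for get_dev_pagemap()/put_dev_pagemap().
     */
    static int mydev_collect_device_pfns(unsigned long first_pfn,
                                         unsigned long last_pfn)
    {
        struct dev_pagemap *pgmap = NULL;
        unsigned long pfn;

        for (pfn = first_pfn; pfn < last_pfn; pfn++) {
            pgmap = get_dev_pagemap(pfn, pgmap);
            if (!pgmap)
                return -EBUSY;
            /* ... record the device entry for pfn ... */
        }
        put_dev_pagemap(pgmap);  /* handles NULL when the range was empty */
        return 0;
    }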
[jglisse@redhat.com: move THP and hugetlbfs code path behind #if KCONFIG] Link: http://lkml.kernel.org/r/20190422163741.13029-1-jglisse@redhat.com Link: http://lkml.kernel.org/r/20190403193318.16478-10-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Cc: Dan Williams Cc: John Hubbard Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 126 insertions(+), 21 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 52e40be56dc7..b1c9b05bf26f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -329,6 +329,7 @@ EXPORT_SYMBOL(hmm_mirror_unregister); struct hmm_vma_walk { struct hmm_range *range; + struct dev_pagemap *pgmap; unsigned long last; bool fault; bool block; @@ -503,12 +504,22 @@ static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) range->flags[HMM_PFN_VALID]; } +static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud) +{ + if (!pud_present(pud)) + return 0; + return pud_write(pud) ? range->flags[HMM_PFN_VALID] | + range->flags[HMM_PFN_WRITE] : + range->flags[HMM_PFN_VALID]; +} + static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, unsigned long end, uint64_t *pfns, pmd_t pmd) { +#ifdef CONFIG_TRANSPARENT_HUGEPAGE struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; unsigned long pfn, npages, i; @@ -524,10 +535,25 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + if (pmd_devmap(pmd)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } hmm_vma_walk->last = end; return 0; +#else + /* If THP is not enabled then we should never reach that code ! */ + return -EINVAL; +#endif } static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) @@ -612,10 +638,24 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (fault || write_fault) goto fault; + if (pte_devmap(pte)) { + hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte), + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { + *pfn = range->values[HMM_PFN_SPECIAL]; + return -EFAULT; + } + *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep); /* Fault any virtual address we were asked to fault */ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); @@ -703,12 +743,93 @@ again: return r; } } + if (hmm_vma_walk->pgmap) { + /* + * We do put_dev_pagemap() here and not in hmm_vma_handle_pte() + * so that we can leverage get_dev_pagemap() optimization which + * will not re-take a reference on a pgmap if we already have + * one. 
+ */ + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } pte_unmap(ptep - 1); hmm_vma_walk->last = addr; return 0; } +static int hmm_vma_walk_pud(pud_t *pudp, + unsigned long start, + unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + unsigned long addr = start, next; + pmd_t *pmdp; + pud_t pud; + int ret; + +again: + pud = READ_ONCE(*pudp); + if (pud_none(pud)) + return hmm_vma_walk_hole(start, end, walk); + + if (pud_huge(pud) && pud_devmap(pud)) { + unsigned long i, npages, pfn; + uint64_t *pfns, cpu_flags; + bool fault, write_fault; + + if (!pud_present(pud)) + return hmm_vma_walk_hole(start, end, walk); + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + + cpu_flags = pud_to_hmm_pfn_flags(range, pud); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + cpu_flags, &fault, &write_fault); + if (fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, + write_fault, walk); + +#ifdef CONFIG_HUGETLB_PAGE + pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + for (i = 0; i < npages; ++i, ++pfn) { + hmm_vma_walk->pgmap = get_dev_pagemap(pfn, + hmm_vma_walk->pgmap); + if (unlikely(!hmm_vma_walk->pgmap)) + return -EBUSY; + pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + } + if (hmm_vma_walk->pgmap) { + put_dev_pagemap(hmm_vma_walk->pgmap); + hmm_vma_walk->pgmap = NULL; + } + hmm_vma_walk->last = end; + return 0; +#else + return -EINVAL; +#endif + } + + split_huge_pud(walk->vma, pudp, addr); + if (pud_none(*pudp)) + goto again; + + pmdp = pmd_offset(pudp, addr); + do { + next = pmd_addr_end(addr, end); + ret = hmm_vma_walk_pmd(pmdp, addr, next, walk); + if (ret) + return ret; + } while (pmdp++, addr = next, addr != end); + + return 0; +} + static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, unsigned long start, unsigned long end, struct mm_walk *walk) @@ -781,14 +902,6 @@ static void hmm_pfns_clear(struct hmm_range *range, *pfns = range->values[HMM_PFN_NONE]; } -static void hmm_pfns_special(struct hmm_range *range) -{ - unsigned long addr = range->start, i = 0; - - for (; addr < range->end; addr += PAGE_SIZE, i++) - range->pfns[i] = range->values[HMM_PFN_SPECIAL]; -} - /* * hmm_range_register() - start tracking change to CPU page table over a range * @range: range @@ -906,12 +1019,6 @@ long hmm_range_snapshot(struct hmm_range *range) if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; - /* FIXME support dax */ - if (vma_is_dax(vma)) { - hmm_pfns_special(range); - return -EINVAL; - } - if (is_vm_hugetlb_page(vma)) { struct hstate *h = hstate_vma(vma); @@ -935,6 +1042,7 @@ long hmm_range_snapshot(struct hmm_range *range) } range->vma = vma; + hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; hmm_vma_walk.fault = false; hmm_vma_walk.range = range; @@ -946,6 +1054,7 @@ long hmm_range_snapshot(struct hmm_range *range) mm_walk.pte_entry = NULL; mm_walk.test_walk = NULL; mm_walk.hugetlb_entry = NULL; + mm_walk.pud_entry = hmm_vma_walk_pud; mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; @@ -1011,12 +1120,6 @@ long hmm_range_fault(struct hmm_range *range, bool block) if (vma == NULL || (vma->vm_flags & device_vma)) return -EFAULT; - /* FIXME support dax */ - if (vma_is_dax(vma)) { - hmm_pfns_special(range); - return -EINVAL; - } - if (is_vm_hugetlb_page(vma)) { if 
(huge_page_shift(hstate_vma(vma)) != range->page_shift && @@ -1039,6 +1142,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) } range->vma = vma; + hmm_vma_walk.pgmap = NULL; hmm_vma_walk.last = start; hmm_vma_walk.fault = true; hmm_vma_walk.block = block; @@ -1051,6 +1155,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) mm_walk.pte_entry = NULL; mm_walk.test_walk = NULL; mm_walk.hugetlb_entry = NULL; + mm_walk.pud_entry = hmm_vma_walk_pud; mm_walk.pmd_entry = hmm_vma_walk_pmd; mm_walk.pte_hole = hmm_vma_walk_hole; mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; -- cgit v1.2.3 From 202394178d027f8a1530df65d4a25229138fab62 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:24 -0700 Subject: mm/hmm: add helpers to test if mm is still alive or not MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The device driver can have kernel thread or worker doing work against a process mm and it is useful for those to test wether the mm is dead or alive to avoid doing useless work. Add an helper to test that so that driver can bail out early if a process is dying. Note that the helper does not perform any lock synchronization and thus is just a hint ie a process might be dying but the helper might still return the process as alive. All HMM functions are safe to use in that case as HMM internal properly protect itself with lock. If driver use this helper with non HMM functions it should ascertain that it is safe to do so. Link: http://lkml.kernel.org/r/20190403193318.16478-11-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Ira Weiny Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index e5834082de60..a79fcc6681f5 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -438,6 +438,30 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); +/* + * hmm_mirror_mm_is_alive() - test if mm is still alive + * @mirror: the HMM mm mirror for which we want to lock the mmap_sem + * Returns: false if the mm is dead, true otherwise + * + * This is an optimization it will not accurately always return -EINVAL if the + * mm is dead ie there can be false negative (process is being kill but HMM is + * not yet inform of that). It is only intented to be use to optimize out case + * where driver is about to do something time consuming and it would be better + * to skip it if the mm is dead. + */ +static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) +{ + struct mm_struct *mm; + + if (!mirror || !mirror->hmm) + return false; + mm = READ_ONCE(mirror->hmm->mm); + if (mirror->hmm->dead || !mm) + return false; + + return true; +} + /* * Please see Documentation/vm/hmm.rst for how to use the range API. 
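As a usage illustration only (the mydev_* name is made up), a driver's background worker can gate expensive work on the helper added above, while still relying on HMM's own locking for correctness since the check is merely a hint:

    /* Sketch: skip useless work when the mirrored process is being torn down. */
    static void mydev_update_device_pagetable(struct hmm_mirror *mirror)
    {
        if (!hmm_mirror_mm_is_alive(mirror))
            return;  /* hint only: mm is dead or dying */

        /* ... snapshot/fault ranges and program the device page table ... */
    }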
-- cgit v1.2.3 From 55c0ece82ac6ad018a71465d332847dce023eeb3 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:28 -0700 Subject: mm/hmm: add a helper function that fault pages and map them to a device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a all in one helper that fault pages in a range and map them to a device so that every single device driver do not have to re-implement this common pattern. This is taken from ODP RDMA in preparation of ODP RDMA convertion. It will be use by nouveau and other drivers. [jglisse@redhat.com: Was using wrong field and wrong enum] Link: http://lkml.kernel.org/r/20190409175340.26614-1-jglisse@redhat.com Link: http://lkml.kernel.org/r/20190403193318.16478-12-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Souptick Joarder Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Ira Weiny Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 9 ++++ mm/hmm.c | 152 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index a79fcc6681f5..f81fe2c0f343 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -474,6 +474,15 @@ int hmm_range_register(struct hmm_range *range, void hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block); +long hmm_range_dma_unmap(struct hmm_range *range, + struct vm_area_struct *vma, + struct device *device, + dma_addr_t *daddrs, + bool dirty); /* * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range diff --git a/mm/hmm.c b/mm/hmm.c index b1c9b05bf26f..95fa7abb9d67 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1182,6 +1183,157 @@ long hmm_range_fault(struct hmm_range *range, bool block) return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; } EXPORT_SYMBOL(hmm_range_fault); + +/** + * hmm_range_dma_map() - hmm_range_fault() and dma map page all in one. + * @range: range being faulted + * @device: device against to dma map page to + * @daddrs: dma address of mapped pages + * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) + * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * drop and you need to try again, some other error value otherwise + * + * Note same usage pattern as hmm_range_fault(). + */ +long hmm_range_dma_map(struct hmm_range *range, + struct device *device, + dma_addr_t *daddrs, + bool block) +{ + unsigned long i, npages, mapped; + long ret; + + ret = hmm_range_fault(range, block); + if (ret <= 0) + return ret ? ret : -EBUSY; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0, mapped = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; + + /* + * FIXME need to update DMA API to provide invalid DMA address + * value instead of a function to test dma address value. This + * would remove lot of dumb code duplicated accross many arch. + * + * For now setting it to 0 here is good enough as the pfns[] + * value is what is use to check what is valid and what isn't. 
+ */ + daddrs[i] = 0; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* Check if range is being invalidated */ + if (!range->valid) { + ret = -EBUSY; + goto unmap; + } + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir); + if (dma_mapping_error(device, daddrs[i])) { + ret = -EFAULT; + goto unmap; + } + + mapped++; + } + + return mapped; + +unmap: + for (npages = i, i = 0; (i < npages) && mapped; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + if (dma_mapping_error(device, daddrs[i])) + continue; + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) + dir = DMA_BIDIRECTIONAL; + + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + mapped--; + } + + return ret; +} +EXPORT_SYMBOL(hmm_range_dma_map); + +/** + * hmm_range_dma_unmap() - unmap range of that was map with hmm_range_dma_map() + * @range: range being unmapped + * @vma: the vma against which the range (optional) + * @device: device against which dma map was done + * @daddrs: dma address of mapped pages + * @dirty: dirty page if it had the write flag set + * Returns: number of page unmapped on success, -EINVAL otherwise + * + * Note that caller MUST abide by mmu notifier or use HMM mirror and abide + * to the sync_cpu_device_pagetables() callback so that it is safe here to + * call set_page_dirty(). Caller must also take appropriate locks to avoid + * concurrent mmu notifier or sync_cpu_device_pagetables() to make progress. + */ +long hmm_range_dma_unmap(struct hmm_range *range, + struct vm_area_struct *vma, + struct device *device, + dma_addr_t *daddrs, + bool dirty) +{ + unsigned long i, npages; + long cpages = 0; + + /* Sanity check. */ + if (range->end <= range->start) + return -EINVAL; + if (!daddrs) + return -EINVAL; + if (!range->pfns) + return -EINVAL; + + npages = (range->end - range->start) >> PAGE_SHIFT; + for (i = 0; i < npages; ++i) { + enum dma_data_direction dir = DMA_TO_DEVICE; + struct page *page; + + page = hmm_pfn_to_page(range, range->pfns[i]); + if (page == NULL) + continue; + + /* If it is read and write than map bi-directional. */ + if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) { + dir = DMA_BIDIRECTIONAL; + + /* + * See comments in function description on why it is + * safe here to call set_page_dirty() + */ + if (dirty) + set_page_dirty(page); + } + + /* Unmap and clear pfns/dma address */ + dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir); + range->pfns[i] = range->values[HMM_PFN_NONE]; + /* FIXME see comments in hmm_vma_dma_map() */ + daddrs[i] = 0; + cpages++; + } + + return cpages; +} +EXPORT_SYMBOL(hmm_range_dma_unmap); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -- cgit v1.2.3 From 391aab11e93f36c421abeab62526954d08ac3eed Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:31 -0700 Subject: mm/hmm: convert various hmm_pfn_* to device_entry which is a better name MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert hmm_pfn_* to device_entry_* as here we are dealing with device driver specific entry format and hmm provide helpers to allow differents components (including HMM) to create/parse device entry. 
We keep wrapper with the old name so that we can convert driver to use the new API in stages in each device driver tree. This will get remove once all driver are converted. Link: http://lkml.kernel.org/r/20190403193318.16478-13-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Ralph Campbell Cc: John Hubbard Cc: Dan Williams Cc: Ira Weiny Cc: Arnd Bergmann Cc: Balbir Singh Cc: Dan Carpenter Cc: Matthew Wilcox Cc: Souptick Joarder Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 93 ++++++++++++++++++++++++++++++++++++----------------- mm/hmm.c | 19 ++++++----- 2 files changed, 75 insertions(+), 37 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index f81fe2c0f343..51ec27a84668 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -239,36 +239,36 @@ static inline bool hmm_range_valid(struct hmm_range *range) } /* - * hmm_pfn_to_page() - return struct page pointed to by a valid HMM pfn - * @range: range use to decode HMM pfn value - * @pfn: HMM pfn value to get corresponding struct page from - * Returns: struct page pointer if pfn is a valid HMM pfn, NULL otherwise + * hmm_device_entry_to_page() - return struct page pointed to by a device entry + * @range: range use to decode device entry value + * @entry: device entry value to get corresponding struct page from + * Returns: struct page pointer if entry is a valid, NULL otherwise * - * If the HMM pfn is valid (ie valid flag set) then return the struct page - * matching the pfn value stored in the HMM pfn. Otherwise return NULL. + * If the device entry is valid (ie valid flag set) then return the struct page + * matching the entry value. Otherwise return NULL. */ -static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, - uint64_t pfn) +static inline struct page *hmm_device_entry_to_page(const struct hmm_range *range, + uint64_t entry) { - if (pfn == range->values[HMM_PFN_NONE]) + if (entry == range->values[HMM_PFN_NONE]) return NULL; - if (pfn == range->values[HMM_PFN_ERROR]) + if (entry == range->values[HMM_PFN_ERROR]) return NULL; - if (pfn == range->values[HMM_PFN_SPECIAL]) + if (entry == range->values[HMM_PFN_SPECIAL]) return NULL; - if (!(pfn & range->flags[HMM_PFN_VALID])) + if (!(entry & range->flags[HMM_PFN_VALID])) return NULL; - return pfn_to_page(pfn >> range->pfn_shift); + return pfn_to_page(entry >> range->pfn_shift); } /* - * hmm_pfn_to_pfn() - return pfn value store in a HMM pfn - * @range: range use to decode HMM pfn value - * @pfn: HMM pfn value to extract pfn from - * Returns: pfn value if HMM pfn is valid, -1UL otherwise + * hmm_device_entry_to_pfn() - return pfn value store in a device entry + * @range: range use to decode device entry value + * @entry: device entry to extract pfn from + * Returns: pfn value if device entry is valid, -1UL otherwise */ -static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, - uint64_t pfn) +static inline unsigned long +hmm_device_entry_to_pfn(const struct hmm_range *range, uint64_t pfn) { if (pfn == range->values[HMM_PFN_NONE]) return -1UL; @@ -282,31 +282,66 @@ static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, } /* - * hmm_pfn_from_page() - create a valid HMM pfn value from struct page + * hmm_device_entry_from_page() - create a valid device entry for a page * @range: range use to encode HMM pfn value - * @page: struct page pointer for which to create the HMM pfn - * Returns: valid HMM pfn for the page + * @page: page for which to create the device entry + * 
Returns: valid device entry for the page */ -static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, - struct page *page) +static inline uint64_t hmm_device_entry_from_page(const struct hmm_range *range, + struct page *page) { return (page_to_pfn(page) << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } /* - * hmm_pfn_from_pfn() - create a valid HMM pfn value from pfn + * hmm_device_entry_from_pfn() - create a valid device entry value from pfn * @range: range use to encode HMM pfn value - * @pfn: pfn value for which to create the HMM pfn - * Returns: valid HMM pfn for the pfn + * @pfn: pfn value for which to create the device entry + * Returns: valid device entry for the pfn */ -static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, - unsigned long pfn) +static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, + unsigned long pfn) { return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID]; } +/* + * Old API: + * hmm_pfn_to_page() + * hmm_pfn_to_pfn() + * hmm_pfn_from_page() + * hmm_pfn_from_pfn() + * + * This are the OLD API please use new API, it is here to avoid cross-tree + * merge painfullness ie we convert things to new API in stages. + */ +static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, + uint64_t pfn) +{ + return hmm_device_entry_to_page(range, pfn); +} + +static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, + uint64_t pfn) +{ + return hmm_device_entry_to_pfn(range, pfn); +} + +static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, + struct page *page) +{ + return hmm_device_entry_from_page(range, page); +} + +static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, + unsigned long pfn) +{ + return hmm_device_entry_from_pfn(range, pfn); +} + + #if IS_ENABLED(CONFIG_HMM_MIRROR) /* diff --git a/mm/hmm.c b/mm/hmm.c index 95fa7abb9d67..44a238642b1d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -543,7 +543,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, if (unlikely(!hmm_vma_walk->pgmap)) return -EBUSY; } - pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; } if (hmm_vma_walk->pgmap) { put_dev_pagemap(hmm_vma_walk->pgmap); @@ -611,7 +611,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, &fault, &write_fault); if (fault || write_fault) goto fault; - *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); + *pfn = hmm_device_entry_from_pfn(range, + swp_offset(entry)); *pfn |= cpu_flags; return 0; } @@ -649,7 +650,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, return -EFAULT; } - *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; + *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; return 0; fault: @@ -803,7 +804,8 @@ again: hmm_vma_walk->pgmap); if (unlikely(!hmm_vma_walk->pgmap)) return -EBUSY; - pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + pfns[i] = hmm_device_entry_from_pfn(range, pfn) | + cpu_flags; } if (hmm_vma_walk->pgmap) { put_dev_pagemap(hmm_vma_walk->pgmap); @@ -879,7 +881,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, pfn = pte_pfn(entry) + ((start & mask) >> range->page_shift); for (; addr < end; addr += size, i++, pfn += pfn_inc) - range->pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; + range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | + cpu_flags; hmm_vma_walk->last = end; unlock: @@ -1222,7 +1225,7 @@ long hmm_range_dma_map(struct hmm_range 
*range, */ daddrs[i] = 0; - page = hmm_pfn_to_page(range, range->pfns[i]); + page = hmm_device_entry_to_page(range, range->pfns[i]); if (page == NULL) continue; @@ -1252,7 +1255,7 @@ unmap: enum dma_data_direction dir = DMA_TO_DEVICE; struct page *page; - page = hmm_pfn_to_page(range, range->pfns[i]); + page = hmm_device_entry_to_page(range, range->pfns[i]); if (page == NULL) continue; @@ -1307,7 +1310,7 @@ long hmm_range_dma_unmap(struct hmm_range *range, enum dma_data_direction dir = DMA_TO_DEVICE; struct page *page; - page = hmm_pfn_to_page(range, range->pfns[i]); + page = hmm_device_entry_to_page(range, range->pfns[i]); if (page == NULL) continue; -- cgit v1.2.3 From 4a83bfe916f3d2100df5bc8389bd182a537ced3e Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:34 -0700 Subject: mm/mmu_notifier: helper to test if a range invalidation is blockable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mmu notifier provide context informations", v6. Here I am not posting users of this, they already have been posted to appropriate mailing list [6] and will be merge through the appropriate tree once this patchset is upstream. Note that this serie does not change any behavior for any existing code. It just pass down more information to mmu notifier listener. The rationale for this patchset: CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patchset introduce a set of enums that can be associated with each of the events triggering a mmu notifier: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Without this serie, driver are force to assume that every notification is an munmap which triggers useless trashing within drivers that associate structure with range of virtual address. Each driver is force to free up its tracking structure and then restore it on next device page fault. With this series we can also optimize device page table update. Patches to use this are at https://lkml.org/lkml/2019/1/23/833 https://lkml.org/lkml/2019/1/23/834 https://lkml.org/lkml/2019/1/23/832 https://lkml.org/lkml/2019/1/23/831 Moreover this can also be used to optimize out some page table updates such as for KVM where we can update the secondary MMU directly from the callback instead of clearing it. ACKS AMD/RADEON https://lkml.org/lkml/2019/2/1/395 ACKS RDMA https://lkml.org/lkml/2018/12/6/1473 This patch (of 8): Simple helpers to test if range invalidation is blockable. Latter patches use cocinnelle to convert all direct dereference of range-> blockable to use this function instead so that we can convert the blockable field to an unsigned for more flags. 
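A minimal sketch of the intended call site, with the mydev_* names invented for illustration; the follow-up patch converts real drivers to the same pattern:

    static int mydev_invalidate_range_start(struct mmu_notifier *mn,
                                            const struct mmu_notifier_range *range)
    {
        /* Non-blocking invalidation: back off instead of sleeping. */
        if (!mmu_notifier_range_blockable(range))
            return -EAGAIN;

        /* ... may sleep here while invalidating [range->start, range->end) ... */
        return 0;
    }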
Link: http://lkml.kernel.org/r/20190326164747.24405-2-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..e630def131ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -226,6 +226,12 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return range->blockable; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -455,6 +461,11 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, #define mmu_notifier_range_init(range, mm, start, end) \ _mmu_notifier_range_init(range, start, end) +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return true; +} static inline int mm_has_notifiers(struct mm_struct *mm) { -- cgit v1.2.3 From dfcd66604c1c116ffc7a94375becbed1d7ecbef1 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:38 -0700 Subject: mm/mmu_notifier: convert user range->blockable to helper function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the mmu_notifier_range_blockable() helper function instead of directly dereferencing the range->blockable field. This is done to make it easier to change the mmu_notifier range field. This patch is the outcome of the following coccinelle patch: %<------------------------------------------------------------------- @@ identifier I1, FN; @@ FN(..., struct mmu_notifier_range *I1, ...) { <... -I1->blockable +mmu_notifier_range_blockable(I1) ...> } ------------------------------------------------------------------->% spatch --in-place --sp-file blockable.spatch --dir . 
Link: http://lkml.kernel.org/r/20190326164747.24405-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 ++++---- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 4 ++-- drivers/infiniband/core/umem_odp.c | 5 +++-- drivers/xen/gntdev.c | 6 +++--- mm/hmm.c | 6 +++--- mm/mmu_notifier.c | 2 +- virt/kvm/kvm_main.c | 3 ++- 8 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3e6823fdd939..58ed401c5996 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(&amn->objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } @@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end = range->end - 1; - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(&amn->objects, range->start, end); @@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct amdgpu_mn_node *node; struct amdgpu_bo *bo; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 215bf3fef10c..8079ea3af103 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, while (it) { struct drm_i915_gem_object *obj; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; break; } diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index b3019505065a..c9bd1278f573 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * the tear down. 
*/ - if (range->blockable) + if (mmu_notifier_range_blockable(range)) mutex_lock(&rmn->lock); else if (!mutex_trylock(&rmn->lock)) return -EAGAIN; @@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; goto out_unlock; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index c7226cf52acc..f962b5bbfa40 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -152,7 +152,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); - if (range->blockable) + if (mmu_notifier_range_blockable(range)) down_read(&per_mm->umem_rwsem); else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; @@ -170,7 +170,8 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, range->start, range->end, invalidate_range_start_trampoline, - range->blockable, NULL); + mmu_notifier_range_blockable(range), + NULL); } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 02bc815982d4..559d4b7f807d 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -526,20 +526,20 @@ static int mn_invl_range_start(struct mmu_notifier *mn, struct gntdev_grant_map *map; int ret = 0; - if (range->blockable) + if (mmu_notifier_range_blockable(range)) mutex_lock(&priv->lock); else if (!mutex_trylock(&priv->lock)) return -EAGAIN; list_for_each_entry(map, &priv->maps, next) { ret = unmap_if_in_range(map, range->start, range->end, - range->blockable); + mmu_notifier_range_blockable(range)); if (ret) goto out_unlock; } list_for_each_entry(map, &priv->freeable_maps, next) { ret = unmap_if_in_range(map, range->start, range->end, - range->blockable); + mmu_notifier_range_blockable(range)); if (ret) goto out_unlock; } diff --git a/mm/hmm.c b/mm/hmm.c index 44a238642b1d..0db8491090b8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -205,9 +205,9 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, update.start = nrange->start; update.end = nrange->end; update.event = HMM_UPDATE_INVALIDATE; - update.blockable = nrange->blockable; + update.blockable = mmu_notifier_range_blockable(nrange); - if (nrange->blockable) + if (mmu_notifier_range_blockable(nrange)) mutex_lock(&hmm->lock); else if (!mutex_trylock(&hmm->lock)) { ret = -EAGAIN; @@ -222,7 +222,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, } mutex_unlock(&hmm->lock); - if (nrange->blockable) + if (mmu_notifier_range_blockable(nrange)) down_read(&hmm->mirrors_sem); else if (!down_read_trylock(&hmm->mirrors_sem)) { ret = -EAGAIN; diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9c884abc7850..abd88c466eb2 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -180,7 +180,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) if (_ret) { pr_info("%pS callback failed with %d in %sblockable context.\n", mn->ops->invalidate_range_start, _ret, - !range->blockable ? "non-" : ""); + !mmu_notifier_range_blockable(range) ? 
"non-" : ""); ret = _ret; } } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a704d1f9bd96..5fb0f1656a96 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -391,7 +391,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, spin_unlock(&kvm->mmu_lock); ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, - range->end, range->blockable); + range->end, + mmu_notifier_range_blockable(range)); srcu_read_unlock(&kvm->srcu, idx); -- cgit v1.2.3 From 27560ee96f40017075bcb975b85f85dae3622f01 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:42 -0700 Subject: mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use an unsigned field for flags other than blockable and convert the blockable field to be one of those flags. Link: http://lkml.kernel.org/r/20190326164747.24405-4-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e630def131ce..c8672c366f67 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,11 +25,13 @@ struct mmu_notifier_mm { spinlock_t lock; }; +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) + struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; - bool blockable; + unsigned flags; }; struct mmu_notifier_ops { @@ -229,7 +231,7 @@ extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { - return range->blockable; + return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) @@ -275,7 +277,7 @@ static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = true; + range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } } @@ -284,7 +286,7 @@ static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = false; + range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; return __mmu_notifier_invalidate_range_start(range); } return 0; @@ -331,6 +333,7 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, range->mm = mm; range->start = start; range->end = end; + range->flags = 0; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- cgit v1.2.3 From d87f055b94ea9270c491b5e650dd776ecc30d7c9 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:45 -0700 Subject: mm/mmu_notifier: contextual information for event enums MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) 
but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier. Latter patches take advantages of those enum values. - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Link: http://lkml.kernel.org/r/20190326164747.24405-5-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index c8672c366f67..2386e71ac1b8 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* -- cgit v1.2.3 From 6f4f13e8d9e27cefd2cd88dd4fd80aa6d68b9131 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:49 -0700 Subject: mm/mmu_notifier: contextual information for event triggering invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. 
While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<---------------------------------------------------------------------- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... 
mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } ---------------------------------------------------------------------->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Link: http://lkml.kernel.org/r/20190326164747.24405-6-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 5 ++++- kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 ++++++++---- mm/hugetlb.c | 12 ++++++++---- mm/khugepaged.c | 3 ++- mm/ksm.c | 6 ++++-- mm/madvise.c | 3 ++- mm/memory.c | 25 ++++++++++++++++--------- mm/migrate.c | 5 ++++- mm/mprotect.c | 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c | 3 ++- mm/rmap.c | 6 ++++-- 14 files changed, 62 insertions(+), 30 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 95ca1fe7283c..ea464f2b9867 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1169,7 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(&range, mm, 0, -1UL); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2386e71ac1b8..62f94cd85455 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -356,6 +356,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, + enum mmu_notifier_event event, + unsigned flags, + struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -491,7 +494,7 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, range->end = end; } -#define mmu_notifier_range_init(range, mm, start, end) \ +#define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ _mmu_notifier_range_init(range, start, end) static inline bool diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4ca7364c956d..e34b699f3865 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 50c665b12cf1..428b5794f4b8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1224,7 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(&range, vma->vm_mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); 
mmu_notifier_invalidate_range_start(&range); @@ -1388,7 +1389,8 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, vma->vm_mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2064,7 +2066,8 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PUD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pud_lock(vma->vm_mm, pud); @@ -2282,7 +2285,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, address & HPAGE_PMD_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); ptl = pmd_lock(vma->vm_mm, pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 98a3c7c224cb..89d206d6ecf3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3294,7 +3294,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(&range, src, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, src, + vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); } @@ -3406,7 +3407,8 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, /* * If sharing possible, alert mmu notifiers of worst case. */ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); mmu_notifier_invalidate_range_start(&range); address = start; @@ -3673,7 +3675,8 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, haddr, haddr + huge_page_size(h)); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, haddr, + haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); /* @@ -4408,7 +4411,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. 
*/ - mmu_notifier_range_init(&range, mm, start, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, + end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7ba7a1e4fa79..14581dbf62a5 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1016,7 +1016,8 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmu_notifier_range_init(&range, mm, address, address + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ /* diff --git a/mm/ksm.c b/mm/ksm.c index fc64874dc6f4..01f5fe2c90cf 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1066,7 +1066,8 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, mm, pvmw.address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1154,7 +1155,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(&range, mm, addr, addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); diff --git a/mm/madvise.c b/mm/madvise.c index bb3a4554d5d5..1c52bdf1b696 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -472,7 +472,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, mm, range.start, range.end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + range.start, range.end); lru_add_drain(); tlb_gather_mmu(&tlb, mm, range.start, range.end); diff --git a/mm/memory.c b/mm/memory.c index f7d962d7de19..90672674c582 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1010,7 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow = is_cow_mapping(vma->vm_flags); if (is_cow) { - mmu_notifier_range_init(&range, src_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, + src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } @@ -1334,7 +1335,8 @@ void unmap_vmas(struct mmu_gather *tlb, { struct mmu_notifier_range range; - mmu_notifier_range_init(&range, vma->vm_mm, start_addr, end_addr); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); @@ -1356,7 +1358,8 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, vma->vm_mm, start, start + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + start, start + size); tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -1382,7 +1385,8 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mmu_gather tlb; lru_add_drain(); - 
mmu_notifier_range_init(&range, vma->vm_mm, address, address + size); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, address + size); tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); @@ -2279,7 +2283,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, mm, vmf->address & PAGE_MASK, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -4104,8 +4109,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PMD_MASK, - (address & PMD_MASK) + PMD_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, + NULL, mm, address & PMD_MASK, + (address & PMD_MASK) + PMD_SIZE); mmu_notifier_invalidate_range_start(range); } *ptlp = pmd_lock(mm, pmd); @@ -4122,8 +4128,9 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, mm, address & PAGE_MASK, - (address & PAGE_MASK) + PAGE_SIZE); + mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + address & PAGE_MASK, + (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); } ptep = pte_offset_map_lock(mm, pmd, address, ptlp); diff --git a/mm/migrate.c b/mm/migrate.c index a1770403ff7f..855bdb3b3333 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2356,7 +2356,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_range_init(&range, mm_walk.mm, migrate->start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm_walk.mm, + migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); walk_page_range(migrate->start, migrate->end, &mm_walk); @@ -2764,6 +2765,8 @@ static void migrate_vma_pages(struct migrate_vma *migrate) notified = true; mmu_notifier_range_init(&range, + MMU_NOTIFY_UNMAP, 0, + NULL, migrate->vma->vm_mm, addr, migrate->end); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/mprotect.c b/mm/mprotect.c index 028c724dcb1a..b10984052ae9 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -185,7 +185,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* invoke the mmu notifier if the pmd is populated */ if (!range.start) { - mmu_notifier_range_init(&range, vma->vm_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/mremap.c b/mm/mremap.c index e3edef6b7a12..fc241d23cd97 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -249,7 +249,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); - mmu_notifier_range_init(&range, vma->vm_mm, old_addr, old_end); + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + old_addr, old_end); mmu_notifier_invalidate_range_start(&range); for (; old_addr < old_end; old_addr += extent, new_addr += extent) { diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3a2484884cfd..539c91d0b26a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -531,7 +531,8 @@ bool __oom_reap_task_mm(struct mm_struct *mm) struct mmu_notifier_range range; struct mmu_gather tlb; - 
mmu_notifier_range_init(&range, mm, vma->vm_start, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, + vma, mm, vma->vm_start, vma->vm_end); tlb_gather_mmu(&tlb, mm, range.start, range.end); if (mmu_notifier_invalidate_range_start_nonblock(&range)) { diff --git a/mm/rmap.c b/mm/rmap.c index 76c8dfd3ae1c..288e636b7813 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -896,7 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); mmu_notifier_invalidate_range_start(&range); @@ -1371,7 +1372,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, vma->vm_mm, address, + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); if (PageHuge(page)) { -- cgit v1.2.3 From 7269f999934b289da7972e975b781417b07ef836 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:53 -0700 Subject: mm/mmu_notifier: use correct mmu_notifier events for each invalidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This updates each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. 
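Editorial illustration (not part of this patch): once a later patch in this series stores the event in struct mmu_notifier_range, a listener can act on exactly the distinction motivating this conversion, for example inside the hypothetical invalidate_range_start() callback sketched earlier. my_mirror_free_tracking() and my_mirror_invalidate() are made-up names.

%<-------------------------------------------------------------------
	if (range->event == MMU_NOTIFY_UNMAP) {
		/*
		 * munmap()/mremap(): the virtual address range itself is
		 * going away, so the per-range tracking can be freed.
		 */
		my_mirror_free_tracking(mirror, range->start, range->end);
	} else {
		/*
		 * CLEAR, PROTECTION_VMA, PROTECTION_PAGE or SOFT_DIRTY:
		 * the range stays valid, only the device mapping needs
		 * to be invalidated or updated.
		 */
		my_mirror_invalidate(mirror, range->start, range->end);
	}
------------------------------------------------------------------->%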
Link: http://lkml.kernel.org/r/20190326164747.24405-7-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 ++-- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 14 ++++++-------- mm/hugetlb.c | 8 ++++---- mm/khugepaged.c | 2 +- mm/ksm.c | 4 ++-- mm/madvise.c | 2 +- mm/memory.c | 14 +++++++------- mm/migrate.c | 4 ++-- mm/mprotect.c | 5 +++-- mm/rmap.c | 6 +++--- 11 files changed, 32 insertions(+), 33 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ea464f2b9867..01d4eb0e6bd1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1169,8 +1169,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - NULL, mm, 0, -1UL); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, + 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } walk_page_range(0, mm->highest_vm_end, &clear_refs_walk); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e34b699f3865..78f61bfc6b79 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 428b5794f4b8..61b1e05e86ee 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1224,9 +1224,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1389,9 +1388,8 @@ alloc: vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); spin_lock(vmf->ptl); @@ -2066,7 +2064,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -2285,7 +2283,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); 
mmu_notifier_invalidate_range_start(&range); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 89d206d6ecf3..cab38ef30238 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3294,7 +3294,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, src, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src, vma->vm_start, vma->vm_end); mmu_notifier_invalidate_range_start(&range); @@ -3675,7 +3675,7 @@ retry_avoidcopy: pages_per_huge_page(h)); __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, haddr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr, haddr + huge_page_size(h)); mmu_notifier_invalidate_range_start(&range); @@ -4411,8 +4411,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * start/end. Set range.start/range.end to cover the maximum possible * range if PMD sharing is possible. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, - end); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, + 0, vma, mm, start, end); adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); BUG_ON(address >= end); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 14581dbf62a5..a335f7c1fac4 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1016,7 +1016,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte = pte_offset_map(pmd, address); pte_ptl = pte_lockptr(mm, pmd); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address, address + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ diff --git a/mm/ksm.c b/mm/ksm.c index 01f5fe2c90cf..81c20ed57bf6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1066,7 +1066,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -1155,7 +1155,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); diff --git a/mm/madvise.c b/mm/madvise.c index 1c52bdf1b696..628022e674a7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -472,7 +472,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, range.end = min(vma->vm_end, end_addr); if (range.end <= vma->vm_start) return -EINVAL; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, range.start, range.end); lru_add_drain(); diff --git a/mm/memory.c b/mm/memory.c index 90672674c582..9b68a72f8c17 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1010,8 +1010,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow = is_cow_mapping(vma->vm_flags); if (is_cow) { - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, - src_mm, addr, end); + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, src_mm, addr, end); 
mmu_notifier_invalidate_range_start(&range); } @@ -1358,7 +1358,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, start, start + size); tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); update_hiwater_rss(vma->vm_mm); @@ -1385,7 +1385,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr struct mmu_gather tlb; lru_add_drain(); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, address + size); tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); update_hiwater_rss(vma->vm_mm); @@ -2283,7 +2283,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) __SetPageUptodate(new_page); - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); @@ -4109,7 +4109,7 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address & PMD_MASK, (address & PMD_MASK) + PMD_SIZE); mmu_notifier_invalidate_range_start(range); @@ -4128,7 +4128,7 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address, goto out; if (range) { - mmu_notifier_range_init(range, MMU_NOTIFY_UNMAP, 0, NULL, mm, + mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm, address & PAGE_MASK, (address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(range); diff --git a/mm/migrate.c b/mm/migrate.c index 855bdb3b3333..f2ecc2855a12 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2356,7 +2356,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate) mm_walk.mm = migrate->vma->vm_mm; mm_walk.private = migrate; - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, NULL, mm_walk.mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm_walk.mm, migrate->start, migrate->end); mmu_notifier_invalidate_range_start(&range); @@ -2765,7 +2765,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) notified = true; mmu_notifier_range_init(&range, - MMU_NOTIFY_UNMAP, 0, + MMU_NOTIFY_CLEAR, 0, NULL, migrate->vma->vm_mm, addr, migrate->end); diff --git a/mm/mprotect.c b/mm/mprotect.c index b10984052ae9..65242f1e4457 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -185,8 +185,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* invoke the mmu notifier if the pmd is populated */ if (!range.start) { - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, - vma, vma->vm_mm, addr, end); + mmu_notifier_range_init(&range, + MMU_NOTIFY_PROTECTION_VMA, 0, + vma, vma->vm_mm, addr, end); mmu_notifier_invalidate_range_start(&range); } diff --git a/mm/rmap.c b/mm/rmap.c index 288e636b7813..0cbed70700ed 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -896,8 +896,8 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, * We have to assume the worse case ie pmd for invalidation. Note that * the page can not be free from this function. 
*/ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - address, + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, + 0, vma, vma->vm_mm, address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); mmu_notifier_invalidate_range_start(&range); @@ -1372,7 +1372,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, min(vma->vm_end, address + (PAGE_SIZE << compound_order(page)))); -- cgit v1.2.3 From bf198b2b34bfd4bc9bd6abb33bf650b74329a2ac Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:20:57 -0700 Subject: mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening This patch is just passing down the new informations by adding it to the mmu_notifier_range structure. Link: http://lkml.kernel.org/r/20190326164747.24405-8-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 62f94cd85455..0379956fff23 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -58,10 +58,12 @@ struct mmu_notifier_mm { #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; + enum mmu_notifier_event event; }; struct mmu_notifier_ops { @@ -363,10 +365,12 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; - range->flags = 0; + range->flags = flags; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- cgit v1.2.3 From c6d23413f81bd69935afedaf1da9d55b03febf58 Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:21:00 -0700 Subject: mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Helper to test if a range is updated to read only (it is still valid to read from the range). This is useful for device driver or anyone who wish to optimize out update when they know that they already have the range map read only. 
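Editorial illustration (not part of this patch): a minimal sketch of the driver-side optimization this helper enables, again placed inside the hypothetical invalidate_range_start() callback used in the earlier sketches. my_mirror_range_is_read_only() and my_mirror_invalidate() are made-up names.

%<-------------------------------------------------------------------
	/*
	 * If the CPU range is only losing write permission and the device
	 * already maps it read-only, the device mapping can stay in place.
	 */
	if (mmu_notifier_range_update_to_read_only(range) &&
	    my_mirror_range_is_read_only(mirror, range->start, range->end))
		return 0;

	my_mirror_invalidate(mirror, range->start, range->end);
------------------------------------------------------------------->%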
Link: http://lkml.kernel.org/r/20190326164747.24405-9-jglisse@redhat.com Signed-off-by: Jérôme Glisse Reviewed-by: Ralph Campbell Reviewed-by: Ira Weiny Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krcmar Cc: Michal Hocko Cc: Christian Koenig Cc: John Hubbard Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 4 ++++ mm/mmu_notifier.c | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 0379956fff23..b6c004bd9f6a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -259,6 +259,8 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) @@ -568,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index abd88c466eb2..ee36068077b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); -- cgit v1.2.3 From cfcbfb1382dbac331d8aa92d3a218a16b803b2a9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 13 May 2019 17:21:04 -0700 Subject: mm/filemap.c: enable error injection at add_to_page_cache() Recently I messed up the error handling in filemap_fault() because of an unexpected ENOMEM (related to cgroup memory limits) in add_to_page_cache. Enable error injection at this point so I can add a testcase to xfstests to verify I don't mess this up again. 
[akpm@linux-foundation.org: include linux/error-injection.h] Link: http://lkml.kernel.org/r/20190403152604.14008-1-josef@toxicpanda.com Signed-off-by: Josef Bacik Reviewed-by: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/filemap.c b/mm/filemap.c index 4157f858a9c6..0e929b4da48b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -882,6 +883,7 @@ error: put_page(page); return xas_error(&xas); } +ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); /** * add_to_page_cache_locked - add a locked page to the pagecache -- cgit v1.2.3 From 059d8442ea77dd995c7ec075c6a6cca527b1c244 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Mon, 13 May 2019 17:21:07 -0700 Subject: mm/rmap.c: use the pra.mapcount to do the check We have the pra.mapcount already, and there is no need to call the page_mapped() which may do some complicated computing for compound page. Link: http://lkml.kernel.org/r/20190404054828.2731-1-sjhuang@iluvatar.ai Signed-off-by: Huang Shijie Acked-by: Kirill A. Shutemov Cc: Mike Kravetz Cc: Rik van Riel Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/rmap.c b/mm/rmap.c index 0cbed70700ed..e5dfe2ae6b0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -850,7 +850,7 @@ int page_referenced(struct page *page, }; *vm_flags = 0; - if (!page_mapped(page)) + if (!pra.mapcount) return 0; if (!page_rmapping(page)) -- cgit v1.2.3 From 5470dea49f5382257c242ac617d908267727f1a8 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 13 May 2019 17:21:10 -0700 Subject: mm: use mm_zero_struct_page from SPARC on all 64b architectures Patch series "Deferred page init improvements", v7. This patchset is essentially a refactor of the page initialization logic that is meant to provide for better code reuse while providing a significant improvement in deferred page initialization performance. In my testing on an x86_64 system with 384GB of RAM I have seen the following. In the case of regular memory initialization the deferred init time was decreased from 3.75s to 1.38s on average. This amounts to a 172% improvement for the deferred memory initialization performance. I have called out the improvement observed with each patch. This patch (of 4): Use the same approach that was already in use on Sparc on all the architectures that support a 64b long. This is mostly motivated by the fact that 7 to 10 store/move instructions are likely always going to be faster than having to call into a function that is not specialized for handling page init. An added advantage to doing it this way is that the compiler can get away with combining writes in the __init_single_page call. As a result the memset call will be reduced to only about 4 write operations, or at least that is what I am seeing with GCC 6.2 as the flags, LRU pointers, and count/mapcount seem to be cancelling out at least 4 of the 8 assignments on my system. One change I had to make to the function was to reduce the minimum page size to 56 to support some powerpc64 configurations. This change should introduce no change on SPARC since it already had this code. In the case of x86_64 I saw a reduction from 3.75s to 2.80s when initializing 384GB of RAM per node. 
Pavel Tatashin tested on a system with Broadcom's Stingray CPU and 48GB of RAM and found that __init_single_page() takes 19.30ns / 64-byte struct page before this patch and with this patch it takes 17.33ns / 64-byte struct page. Mike Rapoport ran a similar test on a OpenPower (S812LC 8348-21C) with Power8 processor and 128GB or RAM. His results per 64-byte struct page were 4.68ns before, and 4.59ns after this patch. Link: http://lkml.kernel.org/r/20190405221213.12227.9392.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Mike Rapoport Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: "Kirill A. Shutemov" Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 30 --------------------------- include/linux/mm.h | 41 ++++++++++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1393a8ac596b..22500c3be7a9 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -231,36 +231,6 @@ extern unsigned long _PAGE_ALL_SZ_BITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) -/* This macro must be updated when the size of struct page grows above 80 - * or reduces below 64. - * The idea that compiler optimizes out switch() statement, and only - * leaves clrx instructions - */ -#define mm_zero_struct_page(pp) do { \ - unsigned long *_pp = (void *)(pp); \ - \ - /* Check that struct page is either 64, 72, or 80 bytes */ \ - BUILD_BUG_ON(sizeof(struct page) & 7); \ - BUILD_BUG_ON(sizeof(struct page) < 64); \ - BUILD_BUG_ON(sizeof(struct page) > 80); \ - \ - switch (sizeof(struct page)) { \ - case 80: \ - _pp[9] = 0; /* fallthrough */ \ - case 72: \ - _pp[8] = 0; /* fallthrough */ \ - default: \ - _pp[7] = 0; \ - _pp[6] = 0; \ - _pp[5] = 0; \ - _pp[4] = 0; \ - _pp[3] = 0; \ - _pp[2] = 0; \ - _pp[1] = 0; \ - _pp[0] = 0; \ - } \ -} while (0) - /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. This is to handle systems where * the first physical page in the machine is at some huge physical address, diff --git a/include/linux/mm.h b/include/linux/mm.h index e6b6be15609e..abb7eb7ef0f2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -124,10 +124,45 @@ extern int mmap_rnd_compat_bits __read_mostly; /* * On some architectures it is expensive to call memset() for small sizes. - * Those architectures should provide their own implementation of "struct page" - * zeroing by defining this macro in . + * If an architecture decides to implement their own version of + * mm_zero_struct_page they should wrap the defines below in a #ifndef and + * define their own version of this macro in */ -#ifndef mm_zero_struct_page +#if BITS_PER_LONG == 64 +/* This function must be updated when the size of struct page grows above 80 + * or reduces below 56. The idea that compiler optimizes out switch() + * statement, and only leaves move/store instructions. Also the compiler can + * combine write statments if they are both assignments and can be reordered, + * this can result in several of the writes here being dropped. 
+ */ +#define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) +static inline void __mm_zero_struct_page(struct page *page) +{ + unsigned long *_pp = (void *)page; + + /* Check that struct page is either 56, 64, 72, or 80 bytes */ + BUILD_BUG_ON(sizeof(struct page) & 7); + BUILD_BUG_ON(sizeof(struct page) < 56); + BUILD_BUG_ON(sizeof(struct page) > 80); + + switch (sizeof(struct page)) { + case 80: + _pp[9] = 0; /* fallthrough */ + case 72: + _pp[8] = 0; /* fallthrough */ + case 64: + _pp[7] = 0; /* fallthrough */ + case 56: + _pp[6] = 0; + _pp[5] = 0; + _pp[4] = 0; + _pp[3] = 0; + _pp[2] = 0; + _pp[1] = 0; + _pp[0] = 0; + } +} +#else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif -- cgit v1.2.3 From 56ec43d8b02719402c9fcf984feb52ec2300f8a5 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 13 May 2019 17:21:13 -0700 Subject: mm: drop meminit_pfn_in_nid as it is redundant As best as I can tell the meminit_pfn_in_nid call is completely redundant. The deferred memory initialization is already making use of for_each_free_mem_range which in turn will call into __next_mem_range which will only return a memory range if it matches the node ID provided assuming it is not NUMA_NO_NODE. I am operating on the assumption that there are no zones or pgdata_t structures that have a NUMA node of NUMA_NO_NODE associated with them. If that is the case then __next_mem_range will never return a memory range that doesn't match the zone's node ID and as such the check is redundant. So one piece I would like to verify on this is if this works for ia64. Technically it was using a different approach to get the node ID, but it seems to have the node ID also encoded into the memblock. So I am assuming this is okay, but would like to get confirmation on that. On my x86_64 test system with 384GB of memory per node I saw a reduction in initialization time from 2.80s to 1.85s as a result of this patch. Link: http://lkml.kernel.org/r/20190405221219.12227.93957.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Mike Rapoport Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 51 ++++++++++++++------------------------------------- 1 file changed, 14 insertions(+), 37 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 909adce33398..25b82be438d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1416,36 +1416,22 @@ int __meminit early_pfn_to_nid(unsigned long pfn) #endif #ifdef CONFIG_NODES_SPAN_OTHER_NODES -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) +/* Only safe to use early in boot when initialisation is single-threaded */ +static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { int nid; - nid = __early_pfn_to_nid(pfn, state); + nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); if (nid >= 0 && nid != node) return false; return true; } -/* Only safe to use early in boot when initialisation is single-threaded */ -static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) -{ - return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache); -} - #else - static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node) { return true; } -static inline bool __meminit __maybe_unused -meminit_pfn_in_nid(unsigned long pfn, int node, - struct mminit_pfnnid_cache *state) -{ - return true; -} #endif @@ -1574,21 +1560,13 @@ static inline void __init pgdat_init_report_one_done(void) * * Then, we check if a current large page is valid by only checking the validity * of the head pfn. - * - * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not belong - * to this memory node. */ -static inline bool __init -deferred_pfn_valid(int nid, unsigned long pfn, - struct mminit_pfnnid_cache *nid_init_state) +static inline bool __init deferred_pfn_valid(unsigned long pfn) { if (!pfn_valid_within(pfn)) return false; if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) return false; - if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) - return false; return true; } @@ -1596,15 +1574,14 @@ deferred_pfn_valid(int nid, unsigned long pfn, * Free pages to buddy allocator. Try to free aligned pages in * pageblock_nr_pages sizes. */ -static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, +static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; } else if (!(pfn & nr_pgmask)) { @@ -1624,17 +1601,18 @@ static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, * by performing it only once every pageblock_nr_pages. * Return number of pages initialized. 
*/ -static unsigned long __init deferred_init_pages(int nid, int zid, +static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; + int nid = zone_to_nid(zone); unsigned long nr_pages = 0; + int zid = zone_idx(zone); struct page *page = NULL; for (; pfn < end_pfn; pfn++) { - if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + if (!deferred_pfn_valid(pfn)) { page = NULL; continue; } else if (!page || !(pfn & nr_pgmask)) { @@ -1697,12 +1675,12 @@ static int __init deferred_init_memmap(void *data) for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_pages(nid, zid, spfn, epfn); + nr_pages += deferred_init_pages(zone, spfn, epfn); } for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); + deferred_free_pages(spfn, epfn); } pgdat_resize_unlock(pgdat, &flags); @@ -1734,7 +1712,6 @@ static int __init deferred_init_memmap(void *data) static noinline bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - int zid = zone_idx(zone); int nid = zone_to_nid(zone); pg_data_t *pgdat = NODE_DATA(nid); unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); @@ -1784,7 +1761,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) while (spfn < epfn && nr_pages < nr_pages_needed) { t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); first_deferred_pfn = min(t, epfn); - nr_pages += deferred_init_pages(nid, zid, spfn, + nr_pages += deferred_init_pages(zone, spfn, first_deferred_pfn); spfn = first_deferred_pfn; } @@ -1796,7 +1773,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order) for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); - deferred_free_pages(nid, zid, spfn, epfn); + deferred_free_pages(spfn, epfn); if (first_deferred_pfn == epfn) break; -- cgit v1.2.3 From 837566e7e08e3f89444166444836a8a49b9f9322 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 13 May 2019 17:21:17 -0700 Subject: mm: implement new zone specific memblock iterator Introduce a new iterator for_each_free_mem_pfn_range_in_zone. This iterator will take care of making sure a given memory range provided is in fact contained within a zone. It takes are of all the bounds checking we were doing in deferred_grow_zone, and deferred_init_memmap. In addition it should help to speed up the search a bit by iterating until the end of a range is greater than the start of the zone pfn range, and will exit completely if the start is beyond the end of the zone. Link: http://lkml.kernel.org/r/20190405221225.12227.22573.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Reviewed-by: Mike Rapoport Cc: Dan Williams Cc: Dave Jiang Cc: David S. Miller Cc: Ingo Molnar Cc: Khalid Aziz Cc: "Kirill A. 
Shutemov" Cc: Laurent Dufour Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 25 +++++++++++++++++++ mm/memblock.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 31 +++++++++-------------- 3 files changed, 101 insertions(+), 19 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 294d5d80e150..f8b78892b977 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -240,6 +240,31 @@ void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, + unsigned long *out_epfn); +/** + * for_each_free_mem_range_in_zone - iterate through zone specific free + * memblock areas + * @i: u64 used as loop variable + * @zone: zone in which all of the memory blocks reside + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in a specific + * zone. Available once memblock and an empty zone is initialized. The main + * assumption is that the zone start, end, and pgdat have been associated. + * This way we can use the zone to determine NUMA node, and if a given part + * of the memblock is valid for the zone. + */ +#define for_each_free_mem_pfn_range_in_zone(i, zone, p_start, p_end) \ + for (i = 0, \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ + i != U64_MAX; \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ + /** * for_each_free_mem_range - iterate through free memblock areas * @i: u64 used as loop variable diff --git a/mm/memblock.c b/mm/memblock.c index a48f520c2d01..f315eca9f4a1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1255,6 +1255,70 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, return 0; } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +/** + * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone() + * + * @idx: pointer to u64 loop variable + * @zone: zone in which all of the memory blocks reside + * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL + * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL + * + * This function is meant to be a zone/pfn specific wrapper for the + * for_each_mem_range type iterators. Specifically they are used in the + * deferred memory init routines and as such we were duplicating much of + * this logic throughout the code. So instead of having it in multiple + * locations it seemed like it would make more sense to centralize this to + * one new iterator that does everything they need. 
+ */ +void __init_memblock +__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, + unsigned long *out_spfn, unsigned long *out_epfn) +{ + int zone_nid = zone_to_nid(zone); + phys_addr_t spa, epa; + int nid; + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + + while (*idx != U64_MAX) { + unsigned long epfn = PFN_DOWN(epa); + unsigned long spfn = PFN_UP(spa); + + /* + * Verify the end is at least past the start of the zone and + * that we have at least one PFN to initialize. + */ + if (zone->zone_start_pfn < epfn && spfn < epfn) { + /* if we went too far just stop searching */ + if (zone_end_pfn(zone) <= spfn) { + *idx = U64_MAX; + break; + } + + if (out_spfn) + *out_spfn = max(zone->zone_start_pfn, spfn); + if (out_epfn) + *out_epfn = min(zone_end_pfn(zone), epfn); + + return; + } + + __next_mem_range(idx, zone_nid, MEMBLOCK_NONE, + &memblock.memory, &memblock.reserved, + &spa, &epa, &nid); + } + + /* signal end of iteration */ + if (out_spfn) + *out_spfn = ULONG_MAX; + if (out_epfn) + *out_epfn = 0; +} + +#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** * memblock_alloc_range_nid - allocate boot memory block diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25b82be438d7..fd42321c02f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1631,11 +1631,9 @@ static unsigned long __init deferred_init_pages(struct zone *zone, static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; - int nid = pgdat->node_id; unsigned long start = jiffies; unsigned long nr_pages = 0; unsigned long spfn, epfn, first_init_pfn, flags; - phys_addr_t spa, epa; int zid; struct zone *zone; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -1672,14 +1670,12 @@ static int __init deferred_init_memmap(void *data) * freeing pages we can access pages that are ahead (computing buddy * page in __free_one_page()). 
*/ - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); nr_pages += deferred_init_pages(zone, spfn, epfn); } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); deferred_free_pages(spfn, epfn); } pgdat_resize_unlock(pgdat, &flags); @@ -1687,8 +1683,8 @@ static int __init deferred_init_memmap(void *data) /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); - pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages, - jiffies_to_msecs(jiffies - start)); + pr_info("node %d initialised, %lu pages in %ums\n", + pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start)); pgdat_init_report_one_done(); return 0; @@ -1712,13 +1708,11 @@ static int __init deferred_init_memmap(void *data) static noinline bool __init deferred_grow_zone(struct zone *zone, unsigned int order) { - int nid = zone_to_nid(zone); - pg_data_t *pgdat = NODE_DATA(nid); unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); + pg_data_t *pgdat = zone->zone_pgdat; unsigned long nr_pages = 0; unsigned long first_init_pfn, spfn, epfn, t, flags; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; - phys_addr_t spa, epa; u64 i; /* Only the last zone may have deferred pages */ @@ -1754,9 +1748,8 @@ deferred_grow_zone(struct zone *zone, unsigned int order) return false; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); while (spfn < epfn && nr_pages < nr_pages_needed) { t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); @@ -1770,9 +1763,9 @@ deferred_grow_zone(struct zone *zone, unsigned int order) break; } - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { - spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); - epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa)); + for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { + spfn = max_t(unsigned long, first_init_pfn, spfn); + epfn = min_t(unsigned long, first_deferred_pfn, epfn); deferred_free_pages(spfn, epfn); if (first_deferred_pfn == epfn) -- cgit v1.2.3 From 0e56acae4b4dd4a9fbe897854ab83a109e2a9e11 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 13 May 2019 17:21:20 -0700 Subject: mm: initialize MAX_ORDER_NR_PAGES at a time instead of doing larger sections Add yet another iterator, for_each_free_mem_range_in_zone_from, and then use it to support initializing and freeing pages in groups no larger than MAX_ORDER_NR_PAGES. By doing this we can greatly improve the cache locality of the pages while we do several loops over them in the init and freeing process. 
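As a rough sketch of the idea (the patch's real helper, deferred_init_maxorder(), appears in the diff below and additionally keeps the memblock iterator state in sync; the simplified helper name here and its restriction to a single contiguous pfn range are illustrative only):

	static unsigned long __init init_free_in_chunks(struct zone *zone,
							unsigned long spfn,
							unsigned long epfn)
	{
		unsigned long nr_pages = 0;

		while (spfn < epfn) {
			/* stop both passes at the next MAX_ORDER aligned boundary */
			unsigned long chunk = min(ALIGN(spfn + 1, MAX_ORDER_NR_PAGES), epfn);

			/* pass 1: initialize struct pages while the range is cache hot */
			nr_pages += deferred_init_pages(zone, spfn, chunk);
			/* pass 2: hand the same, still hot, range to the buddy allocator */
			deferred_free_pages(spfn, chunk);

			spfn = chunk;
		}

		return nr_pages;
	}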
We are able to tighten the loops further as a result of the "from" iterator as we can perform the initial checks for first_init_pfn in our first call to the iterator, and continue without the need for those checks via the "from" iterator. I have added this functionality in the function called deferred_init_mem_pfn_range_in_zone that primes the iterator and causes us to exit if we encounter any failure. On my x86_64 test system with 384GB of memory per node I saw a reduction in initialization time from 1.85s to 1.38s as a result of this patch. Link: http://lkml.kernel.org/r/20190405221231.12227.85836.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Reviewed-by: Pavel Tatashin Cc: Mike Rapoport Cc: Michal Hocko Cc: Dave Jiang Cc: Matthew Wilcox Cc: Ingo Molnar Cc: Cc: Khalid Aziz Cc: Mike Rapoport Cc: Vlastimil Babka Cc: Dan Williams Cc: Laurent Dufour Cc: Mel Gorman Cc: David S. Miller Cc: "Kirill A. Shutemov" Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 16 +++++ mm/page_alloc.c | 162 +++++++++++++++++++++++++++++++++++------------ 2 files changed, 137 insertions(+), 41 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f8b78892b977..47e3c0612592 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -263,6 +263,22 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end); \ i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) + +/** + * for_each_free_mem_range_in_zone_from - iterate through zone specific + * free memblock areas from a given point + * @i: u64 used as loop variable + * @zone: zone in which all of the memory blocks reside + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in a specific + * zone, continuing from current position. Available as soon as memblock is + * initialized. + */ +#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \ + for (; i != U64_MAX; \ + __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fd42321c02f0..96ca65636e40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1627,16 +1627,100 @@ static unsigned long __init deferred_init_pages(struct zone *zone, return (nr_pages); } +/* + * This function is meant to pre-load the iterator for the zone init. + * Specifically it walks through the ranges until we are caught up to the + * first_init_pfn value and exits there. If we never encounter the value we + * return false indicating there are no valid ranges left. + */ +static bool __init +deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, + unsigned long *spfn, unsigned long *epfn, + unsigned long first_init_pfn) +{ + u64 j; + + /* + * Start out by walking through the ranges in this zone that have + * already been initialized. We don't need to do anything with them + * so we just need to flush them out of the system. + */ + for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { + if (*epfn <= first_init_pfn) + continue; + if (*spfn < first_init_pfn) + *spfn = first_init_pfn; + *i = j; + return true; + } + + return false; +} + +/* + * Initialize and free pages. 
We do it in two loops: first we initialize + * struct page, then free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). + * + * In order to try and keep some memory in the cache we have the loop + * broken along max page order boundaries. This way we will not cause + * any issues with the buddy page computation. + */ +static unsigned long __init +deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, + unsigned long *end_pfn) +{ + unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); + unsigned long spfn = *start_pfn, epfn = *end_pfn; + unsigned long nr_pages = 0; + u64 j = *i; + + /* First we loop through and initialize the page values */ + for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { + unsigned long t; + + if (mo_pfn <= *start_pfn) + break; + + t = min(mo_pfn, *end_pfn); + nr_pages += deferred_init_pages(zone, *start_pfn, t); + + if (mo_pfn < *end_pfn) { + *start_pfn = mo_pfn; + break; + } + } + + /* Reset values and now loop through freeing pages as needed */ + swap(j, *i); + + for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { + unsigned long t; + + if (mo_pfn <= spfn) + break; + + t = min(mo_pfn, epfn); + deferred_free_pages(spfn, t); + + if (mo_pfn <= epfn) + break; + } + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + unsigned long spfn = 0, epfn = 0, nr_pages = 0; + unsigned long first_init_pfn, flags; unsigned long start = jiffies; - unsigned long nr_pages = 0; - unsigned long spfn, epfn, first_init_pfn, flags; - int zid; struct zone *zone; - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + int zid; u64 i; /* Bind memory initialisation thread to a local node if possible */ @@ -1662,22 +1746,20 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } - first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_init_pfn)) + goto zone_empty; /* - * Initialize and free pages. We do it in two loops: first we initialize - * struct page, than free to buddy allocator, because while we are - * freeing pages we can access pages that are ahead (computing buddy - * page in __free_one_page()). + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. 
*/ - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - nr_pages += deferred_init_pages(zone, spfn, epfn); - } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - deferred_free_pages(spfn, epfn); - } + while (spfn < epfn) + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); +zone_empty: pgdat_resize_unlock(pgdat, &flags); /* Sanity check that the next zone really is unpopulated */ @@ -1710,9 +1792,9 @@ deferred_grow_zone(struct zone *zone, unsigned int order) { unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); pg_data_t *pgdat = zone->zone_pgdat; - unsigned long nr_pages = 0; - unsigned long first_init_pfn, spfn, epfn, t, flags; unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; + unsigned long spfn, epfn, flags; + unsigned long nr_pages = 0; u64 i; /* Only the last zone may have deferred pages */ @@ -1741,37 +1823,35 @@ deferred_grow_zone(struct zone *zone, unsigned int order) return true; } - first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn); - - if (first_init_pfn >= pgdat_end_pfn(pgdat)) { + /* If the zone is empty somebody else may have cleared out the zone */ + if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, + first_deferred_pfn)) { + pgdat->first_deferred_pfn = ULONG_MAX; pgdat_resize_unlock(pgdat, &flags); - return false; + return true; } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); + /* + * Initialize and free pages in MAX_ORDER sized increments so + * that we can avoid introducing any issues with the buddy + * allocator. + */ + while (spfn < epfn) { + /* update our first deferred PFN for this section */ + first_deferred_pfn = spfn; + + nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); - while (spfn < epfn && nr_pages < nr_pages_needed) { - t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION); - first_deferred_pfn = min(t, epfn); - nr_pages += deferred_init_pages(zone, spfn, - first_deferred_pfn); - spfn = first_deferred_pfn; - } + /* We should only stop along section boundaries */ + if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) + continue; + /* If our quota has been met we can stop here */ if (nr_pages >= nr_pages_needed) break; } - for_each_free_mem_pfn_range_in_zone(i, zone, &spfn, &epfn) { - spfn = max_t(unsigned long, first_init_pfn, spfn); - epfn = min_t(unsigned long, first_deferred_pfn, epfn); - deferred_free_pages(spfn, epfn); - - if (first_deferred_pfn == epfn) - break; - } - pgdat->first_deferred_pfn = first_deferred_pfn; + pgdat->first_deferred_pfn = spfn; pgdat_resize_unlock(pgdat, &flags); return nr_pages > 0; -- cgit v1.2.3 From 5557c766abad25acc8091ccb9641b96e3b3da06f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:24 -0700 Subject: mm, memory_hotplug: cleanup memory offline path check_pages_isolated_cb currently accounts the whole pfn range as being offlined if test_pages_isolated suceeds on the range. This is based on the assumption that all pages in the range are freed which is currently the case in most cases but it won't be with later changes, as pages marked as vmemmap won't be isolated. Move the offlined pages counting to offline_isolated_pages_cb and rely on __offline_isolated_pages to return the correct value. check_pages_isolated_cb will still do it's primary job and check the pfn range. 
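Condensed from the diff below (only the relevant calls are shown; the surrounding __offline_pages() context is elided), the reworked accounting looks roughly like this:

	unsigned long offlined_pages = 0;

	/* each System RAM chunk reports how many pages it actually offlined */
	walk_system_ram_range(start_pfn, end_pfn - start_pfn,
			      &offlined_pages, offline_isolated_pages_cb);
	pr_info("Offlined Pages %ld\n", offlined_pages);

where offline_isolated_pages_cb() simply adds the return value of __offline_isolated_pages() to the counter passed through the void *data cookie.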
While we are at it remove check_pages_isolated and offline_isolated_pages and use directly walk_system_ram_range as do in online_pages. Link: http://lkml.kernel.org/r/20190408082633.2864-2-osalvador@suse.de Reviewed-by: David Hildenbrand Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 45 +++++++++++------------------------------- mm/page_alloc.c | 11 +++++++++-- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 8ade08c50d26..3c8cf347804c 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -87,7 +87,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); extern int online_pages(unsigned long, unsigned long, int); extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); -extern void __offline_isolated_pages(unsigned long, unsigned long); +extern unsigned long __offline_isolated_pages(unsigned long start_pfn, + unsigned long end_pfn); typedef void (*online_page_callback_t)(struct page *page, unsigned int order); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a279671b9968..75f9f6590677 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1449,15 +1449,10 @@ static int offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, void *data) { - __offline_isolated_pages(start, start + nr_pages); - return 0; -} + unsigned long *offlined_pages = (unsigned long *)data; -static void -offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) -{ - walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, - offline_isolated_pages_cb); + *offlined_pages += __offline_isolated_pages(start, start + nr_pages); + return 0; } /* @@ -1467,26 +1462,7 @@ static int check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, void *data) { - int ret; - long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); - offlined = nr_pages; - if (!ret) - *(long *)data += offlined; - return ret; -} - -static long -check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) -{ - long offlined = 0; - int ret; - - ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, - check_pages_isolated_cb); - if (ret < 0) - offlined = (long)ret; - return offlined; + return test_pages_isolated(start_pfn, start_pfn + nr_pages, true); } static int __init cmdline_parse_movable_node(char *p) @@ -1571,7 +1547,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn, nr_pages; - long offlined_pages; + unsigned long offlined_pages = 0; int ret, node, nr_isolate_pageblock; unsigned long flags; unsigned long valid_start, valid_end; @@ -1647,14 +1623,15 @@ static int __ref __offline_pages(unsigned long start_pfn, goto failed_removal_isolated; } /* check again */ - offlined_pages = check_pages_isolated(start_pfn, end_pfn); - } while (offlined_pages < 0); + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, + NULL, check_pages_isolated_cb); + } while (ret); - pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. 
*/ - offline_isolated_pages(start_pfn, end_pfn); - + walk_system_ram_range(start_pfn, end_pfn - start_pfn, + &offlined_pages, offline_isolated_pages_cb); + pr_info("Offlined Pages %ld\n", offlined_pages); /* * Onlining will reset pagetype flags and makes migrate type * MOVABLE, so just need to decrease the number of isolated diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 96ca65636e40..c45da9fe3ce1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8453,7 +8453,7 @@ void zone_pcp_reset(struct zone *zone) * All pages in the range must be in a single zone and isolated * before calling this. */ -void +unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; @@ -8461,12 +8461,15 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) unsigned int order, i; unsigned long pfn; unsigned long flags; + unsigned long offlined_pages = 0; + /* find the first valid pfn */ for (pfn = start_pfn; pfn < end_pfn; pfn++) if (pfn_valid(pfn)) break; if (pfn == end_pfn) - return; + return offlined_pages; + offline_mem_sections(pfn, end_pfn); zone = page_zone(pfn_to_page(pfn)); spin_lock_irqsave(&zone->lock, flags); @@ -8484,12 +8487,14 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { pfn++; SetPageReserved(page); + offlined_pages++; continue; } BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); order = page_order(page); + offlined_pages += 1 << order; #ifdef CONFIG_DEBUG_VM pr_info("remove from free list %lx %d %lx\n", pfn, 1 << order, end_pfn); @@ -8502,6 +8507,8 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); + + return offlined_pages; } #endif -- cgit v1.2.3 From 940519f0c8b757fdcbc5d14c93cdaada20ded14c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 13 May 2019 17:21:26 -0700 Subject: mm, memory_hotplug: provide a more generic restrictions for memory hotplug arch_add_memory, __add_pages take a want_memblock which controls whether the newly added memory should get the sysfs memblock user API (e.g. ZONE_DEVICE users do not want/need this interface). Some callers even want to control where do we allocate the memmap from by configuring altmap. Add a more generic hotplug context for arch_add_memory and __add_pages. struct mhp_restrictions contains flags which contains additional features to be enabled by the memory hotplug (MHP_MEMBLOCK_API currently) and altmap for alternative memmap allocator. This patch shouldn't introduce any functional change. 
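For illustration, a caller that wants the sysfs memblock API but no altmap now passes something like the following (this mirrors the add_memory_resource() and devm_memremap_pages() hunks below; nothing beyond the diff is assumed):

	struct mhp_restrictions restrictions = {
		.flags	= MHP_MEMBLOCK_API,	/* create sysfs memory block devices */
		.altmap	= NULL,			/* use the default memmap allocator */
	};

	ret = arch_add_memory(nid, start, size, &restrictions);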
[akpm@linux-foundation.org: build fix] Link: http://lkml.kernel.org/r/20190408082633.2864-3-osalvador@suse.de Signed-off-by: Michal Hocko Signed-off-by: Oscar Salvador Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/mmu.c | 6 +++--- arch/ia64/mm/init.c | 6 +++--- arch/powerpc/mm/mem.c | 6 +++--- arch/s390/mm/init.c | 6 +++--- arch/sh/mm/init.c | 6 +++--- arch/x86/mm/init_32.c | 6 +++--- arch/x86/mm/init_64.c | 10 +++++----- include/linux/memory_hotplug.h | 31 ++++++++++++++++++++++++------- kernel/memremap.c | 12 +++++++++--- mm/memory_hotplug.c | 11 +++++++---- 10 files changed, 63 insertions(+), 37 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ef82312860ac..ef32d4839c3f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1065,8 +1065,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { int flags = 0; @@ -1077,6 +1077,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, - altmap, want_memblock); + restrictions); } #endif diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index e49200e31750..379eb1f9adc9 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -666,14 +666,14 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 20266898f3a8..de5c591a550d 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int __ref arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -127,7 +127,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altm } flush_inval_dcache_range(start, start + size); - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 5f48fc7e61d5..06bd05137a00 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -219,8 +219,8 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -230,7 
+230,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock); + rc = __add_pages(nid, start_pfn, size_pages, restrictions); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index aeb9f45c7a39..d3cd07bd2dc1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -404,15 +404,15 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 85c94f9a87f8..755dbed85531 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -850,13 +850,13 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bccff68e3267..db42c11b48fb 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size) } int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock) + struct mhp_restrictions *restrictions) { int ret; - ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, restrictions); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ @@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, return ret; } -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, - bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size); - return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return add_pages(nid, start_pfn, nr_pages, restrictions); } #define PAGE_INUSE 0xFD diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 3c8cf347804c..b24aca54353e 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -53,6 +53,16 @@ enum { MMOP_ONLINE_MOVABLE, }; +/* + * Restrictions for the memory hotplug: + * flags: MHP_ flags + * altmap: alternative allocator for memmap array + */ +struct mhp_restrictions { + unsigned long flags; + struct vmem_altmap *altmap; +}; + /* * Zone resizing functions * @@ -101,6 +111,8 @@ extern void __online_page_free(struct page *page); extern int try_online_node(int nid); +extern 
int arch_add_memory(int nid, u64 start, u64 size, + struct mhp_restrictions *restrictions); extern u64 max_mem_size; extern bool memhp_auto_online; @@ -118,20 +130,27 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ +/* + * Do we want sysfs memblock files created. This will allow userspace to online + * and offline memory explicitly. Lack of this bit means that the caller has to + * call move_pfn_range_to_zone to finish the initialization. + */ + +#define MHP_MEMBLOCK_API (1<<0) + /* reasonably generic interface to expand the physical pages */ extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #ifndef CONFIG_ARCH_HAS_ADD_PAGES static inline int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { - return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, restrictions); } #else /* ARCH_HAS_ADD_PAGES */ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, - struct vmem_altmap *altmap, bool want_memblock); + struct mhp_restrictions *restrictions); #endif /* ARCH_HAS_ADD_PAGES */ #ifdef CONFIG_NUMA @@ -332,8 +351,6 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, extern int __add_memory(int nid, u64 start, u64 size); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource); -extern int arch_add_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages, struct vmem_altmap *altmap); extern bool is_memblock_offlined(struct memory_block *mem); diff --git a/kernel/memremap.c b/kernel/memremap.c index a856cb5ff192..4e59d29245f4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -148,6 +148,12 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) &pgmap->altmap : NULL; struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; + struct mhp_restrictions restrictions = { + /* + * We do not want any optional features only our own memmap + */ + .altmap = altmap, + }; pgprot_t pgprot = PAGE_KERNEL; int error, nid, is_ram; @@ -214,7 +220,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, NULL, false); + align_size >> PAGE_SHIFT, &restrictions); } else { error = kasan_add_zero_shadow(__va(align_start), align_size); if (error) { @@ -222,8 +228,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) goto err_kasan; } - error = arch_add_memory(nid, align_start, align_size, altmap, - false); + error = arch_add_memory(nid, align_start, align_size, + &restrictions); } if (!error) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 75f9f6590677..339d5a62d5d5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -273,12 +273,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, * add the new pages. 
*/ int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap, - bool want_memblock) + unsigned long nr_pages, struct mhp_restrictions *restrictions) { unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap = restrictions->altmap; /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); @@ -299,7 +299,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn, for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, section_nr_to_pfn(i), altmap, - want_memblock); + restrictions->flags & MHP_MEMBLOCK_API); /* * EEXIST is finally dealt with by ioresource collision @@ -1097,6 +1097,9 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { + struct mhp_restrictions restrictions = { + .flags = MHP_MEMBLOCK_API, + }; u64 start, size; bool new_node = false; int ret; @@ -1124,7 +1127,7 @@ int __ref add_memory_resource(int nid, struct resource *res) new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, NULL, true); + ret = arch_add_memory(nid, start, size, &restrictions); if (ret < 0) goto error; -- cgit v1.2.3 From 2346a560599a4438d66b17d83f102b2ec59f167c Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 13 May 2019 17:21:29 -0700 Subject: mm/filemap.c: fix minor typo Link: http://lkml.kernel.org/r/20190304155240.19215-1-ldufour@linux.ibm.com Signed-off-by: Laurent Dufour Reviewed-by: William Kucharski Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 0e929b4da48b..3ad18fa56057 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1446,7 +1446,7 @@ pgoff_t page_cache_next_miss(struct address_space *mapping, EXPORT_SYMBOL(page_cache_next_miss); /** - * page_cache_prev_miss() - Find the next gap in the page cache. + * page_cache_prev_miss() - Find the previous gap in the page cache. * @mapping: Mapping. * @index: Index. * @max_scan: Maximum range to search. -- cgit v1.2.3 From d9eb1417c77df7ce19abd2e41619e9dceccbdf2a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:21:33 -0700 Subject: mm/memory_hotplug: release memory resource after arch_remove_memory() Patch series "mm/memory_hotplug: Better error handling when removing memory", v1. Error handling when removing memory is somewhat messed up right now. Some errors result in warnings, others are completely ignored. Memory unplug code can essentially not deal with errors properly as of now. remove_memory() will never fail. We have basically two choices: 1. Allow arch_remov_memory() and friends to fail, propagating errors via remove_memory(). Might be problematic (e.g. DIMMs consisting of multiple pieces added/removed separately). 2. Don't allow the functions to fail, handling errors in a nicer way. It seems like most errors that can theoretically happen are really corner cases and mostly theoretical (e.g. "section not valid"). However e.g. aborting removal of sections while all callers simply continue in case of errors is not nice. If we can gurantee that removal of memory always works (and WARN/skip in case of theoretical errors so we can figure out what is going on), we can go ahead and implement better error handling when adding memory. E.g. 
via add_memory():

	arch_add_memory()
	ret = do_stuff()
	if (ret) {
		arch_remove_memory();
		goto error;
	}

Handling the possibility that arch_remove_memory() fails at this point is basically impossible. So I suggest: let's avoid reporting errors while removing memory, warn on theoretical errors instead, and continue rather than abort.

This patch (of 4): __add_pages() doesn't add the memory resource, so __remove_pages() shouldn't remove it. Let's factor it out. Especially as it is a special case for memory used as system memory, added via add_memory() and friends. We now remove the resource after removing the sections instead of doing it the other way around. I don't think this change is problematic.

	add_memory()
		register memory resource
		arch_add_memory()

	remove_memory
		arch_remove_memory()
		release memory resource

While at it, explain why we ignore errors and that this can only happen if we remove memory in a different granularity than it was added. [david@redhat.com: fix printk warning] Link: http://lkml.kernel.org/r/20190417120204.6997-1-david@redhat.com Link: http://lkml.kernel.org/r/20190409100148.24703-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Pavel Tatashin Cc: Wei Yang Cc: Qian Cai Cc: Arun KS Cc: Mathieu Malaterre Cc: Andrew Banman Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ingo Molnar Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: Mike Travis Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Stefan Agner Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 339d5a62d5d5..65f166ec2e4c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -561,20 +561,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, if (is_dev_zone(zone)) { if (altmap) map_offset = vmem_altmap_offset(altmap); - } else { - resource_size_t start, size; - - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - ret = release_mem_region_adjustable(&iomem_resource, start, - size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } } clear_zone_contiguous(zone); @@ -1818,6 +1804,26 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); +static void __release_memory_resource(resource_size_t start, + resource_size_t size) +{ + int ret; + + /* + * When removing memory in the same granularity as it was added, + * this function never fails. It might only fail if resources + * have to be adjusted or split. We'll ignore the error, as + * removing of memory cannot fail.
+ */ + ret = release_mem_region_adjustable(&iomem_resource, start, size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } +} + /** * remove_memory * @nid: the node ID @@ -1852,6 +1858,7 @@ void __ref __remove_memory(int nid, u64 start, u64 size) memblock_remove(start, size); arch_remove_memory(nid, start, size, NULL); + __release_memory_resource(start, size); try_offline_node(nid); -- cgit v1.2.3 From cb7b3a3685b20d3b5900ff24b2cb96d002960189 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:21:37 -0700 Subject: mm/memory_hotplug: make unregister_memory_section() never fail Failing while removing memory is mostly ignored and cannot really be handled. Let's treat errors in unregister_memory_section() in a nice way, warning, but continuing. Link: http://lkml.kernel.org/r/20190409100148.24703-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Andrew Banman Cc: Mike Travis Cc: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Pavel Tatashin Cc: Qian Cai Cc: Wei Yang Cc: Arun KS Cc: Mathieu Malaterre Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Rich Felker Cc: Rob Herring Cc: Stefan Agner Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 16 +++++----------- include/linux/memory.h | 2 +- mm/memory_hotplug.c | 4 +--- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 0c9e22ffa47a..f180427e48f4 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -734,15 +734,18 @@ unregister_memory(struct memory_block *memory) { BUG_ON(memory->dev.bus != &memory_subsys); - /* drop the ref. we got in remove_memory_section() */ + /* drop the ref. 
we got via find_memory_block() */ put_device(&memory->dev); device_unregister(&memory->dev); } -static int remove_memory_section(struct mem_section *section) +void unregister_memory_section(struct mem_section *section) { struct memory_block *mem; + if (WARN_ON_ONCE(!present_section(section))) + return; + mutex_lock(&mem_sysfs_mutex); /* @@ -763,15 +766,6 @@ static int remove_memory_section(struct mem_section *section) out_unlock: mutex_unlock(&mem_sysfs_mutex); - return 0; -} - -int unregister_memory_section(struct mem_section *section) -{ - if (!present_section(section)) - return -EINVAL; - - return remove_memory_section(section); } #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/include/linux/memory.h b/include/linux/memory.h index a6ddefc60517..e1dc1bb2b787 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -113,7 +113,7 @@ extern int register_memory_isolate_notifier(struct notifier_block *nb); extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int hotplug_memory_register(int nid, struct mem_section *section); #ifdef CONFIG_MEMORY_HOTREMOVE -extern int unregister_memory_section(struct mem_section *); +extern void unregister_memory_section(struct mem_section *); #endif extern int memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 65f166ec2e4c..1f3707ab7a63 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -526,9 +526,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, if (!valid_section(ms)) return ret; - ret = unregister_memory_section(ms); - if (ret) - return ret; + unregister_memory_section(ms); scn_nr = __section_nr(ms); start_pfn = section_nr_to_pfn((unsigned long)scn_nr); -- cgit v1.2.3 From 9d1d887d785b4fe0590bd3c5e71acaa3908044e2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:21:41 -0700 Subject: mm/memory_hotplug: make __remove_section() never fail Let's just warn in case a section is not valid instead of failing to remove somewhere in the middle of the process, returning an error that will be mostly ignored by callers. Link: http://lkml.kernel.org/r/20190409100148.24703-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Michal Hocko Cc: David Hildenbrand Cc: Pavel Tatashin Cc: Qian Cai Cc: Wei Yang Cc: Arun KS Cc: Mathieu Malaterre Cc: Andrew Banman Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Greg Kroah-Hartman Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ingo Molnar Cc: Joonsoo Kim Cc: "Kirill A. Shutemov" Cc: Martin Schwidefsky Cc: Masahiro Yamada Cc: Michael Ellerman Cc: Mike Rapoport Cc: Mike Travis Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Paul Mackerras Cc: Peter Zijlstra Cc: "Rafael J. 
Wysocki" Cc: Rich Felker Cc: Rob Herring Cc: Stefan Agner Cc: Thomas Gleixner Cc: Tony Luck Cc: Vasily Gorbik Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1f3707ab7a63..3512bba20e2b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -516,15 +516,15 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, struct vmem_altmap *altmap) +static void __remove_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset, + struct vmem_altmap *altmap) { unsigned long start_pfn; int scn_nr; - int ret = -EINVAL; - if (!valid_section(ms)) - return ret; + if (WARN_ON_ONCE(!valid_section(ms))) + return; unregister_memory_section(ms); @@ -533,7 +533,6 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, __remove_zone(zone, start_pfn); sparse_remove_one_section(zone, ms, map_offset, altmap); - return 0; } /** @@ -553,7 +552,7 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, { unsigned long i; unsigned long map_offset = 0; - int sections_to_remove, ret = 0; + int sections_to_remove; /* In the ZONE_DEVICE case device driver owns the memory region */ if (is_dev_zone(zone)) { @@ -574,16 +573,13 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; cond_resched(); - ret = __remove_section(zone, __pfn_to_section(pfn), map_offset, - altmap); + __remove_section(zone, __pfn_to_section(pfn), map_offset, + altmap); map_offset = 0; - if (ret) - break; } set_zone_contiguous(zone); - - return ret; + return 0; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3 From ac5c94264580f498e484c854031d0226b3c1038f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 May 2019 17:21:46 -0700 Subject: mm/memory_hotplug: make __remove_pages() and arch_remove_memory() never fail All callers of arch_remove_memory() ignore errors. And we should really try to remove any errors from the memory removal path. No more errors are reported from __remove_pages(). BUG() in s390x code in case arch_remove_memory() is triggered. We may implement that properly later. WARN in case powerpc code failed to remove the section mapping, which is better than ignoring the error completely right now. Link: http://lkml.kernel.org/r/20190409100148.24703-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Tony Luck Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Michal Hocko Cc: Mike Rapoport Cc: Oscar Salvador Cc: "Kirill A. Shutemov" Cc: Christophe Leroy Cc: Stefan Agner Cc: Nicholas Piggin Cc: Pavel Tatashin Cc: Vasily Gorbik Cc: Arun KS Cc: Geert Uytterhoeven Cc: Masahiro Yamada Cc: Rob Herring Cc: Joonsoo Kim Cc: Wei Yang Cc: Qian Cai Cc: Mathieu Malaterre Cc: Andrew Banman Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Mike Travis Cc: Oscar Salvador Cc: "Rafael J. 
Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/init.c | 11 +++-------- arch/powerpc/mm/mem.c | 9 +++------ arch/s390/mm/init.c | 5 +++-- arch/sh/mm/init.c | 11 +++-------- arch/x86/mm/init_32.c | 5 +++-- arch/x86/mm/init_64.c | 10 +++------- include/linux/memory_hotplug.h | 8 ++++---- mm/memory_hotplug.c | 5 ++--- 8 files changed, 24 insertions(+), 40 deletions(-) diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 379eb1f9adc9..d28e29103bdb 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -682,20 +682,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - if (ret) - pr_warn("%s: Problem encountered in __remove_pages() as" - " ret=%d\n", __func__, ret); - - return ret; + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index de5c591a550d..e885fe2aafcc 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -131,7 +131,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int __ref arch_remove_memory(int nid, u64 start, u64 size, +void __ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; @@ -147,14 +147,13 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, if (altmap) page += vmem_altmap_offset(altmap); - ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); - if (ret) - return ret; + __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); /* Remove htab bolted mappings for this section of memory */ start = (unsigned long)__va(start); flush_inval_dcache_range(start, start + size); ret = remove_section_mapping(start, start + size); + WARN_ON_ONCE(ret); /* Ensure all vmalloc mappings are flushed in case they also * hit that section of memory @@ -163,8 +162,6 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, if (resize_hpt_for_hotplug(memblock_phys_mem_size()) == -ENOSPC) pr_warn("Hash collision while resizing HPT\n"); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 06bd05137a00..14d1eae9fe43 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -237,14 +237,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { /* * There is no hardware or firmware interface which could trigger a * hot memory remove on s390. So there is nothing that needs to be * implemented. 
*/ - return -EBUSY; + BUG(); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index d3cd07bd2dc1..b95e343e3c9d 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -429,20 +429,15 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; - int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - if (unlikely(ret)) - pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, - ret); - - return ret; + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 755dbed85531..075e568098f2 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -860,14 +860,15 @@ int arch_add_memory(int nid, u64 start, u64 size, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap) +void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages, altmap); + __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index db42c11b48fb..20d14254b686 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1141,24 +1141,20 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true, NULL); } -int __ref arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap) +void __ref arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(start_pfn); struct zone *zone; - int ret; /* With altmap the first mapped page is offset from @start */ if (altmap) page += vmem_altmap_offset(altmap); zone = page_zone(page); - ret = __remove_pages(zone, start_pfn, nr_pages, altmap); - WARN_ON_ONCE(ret); + __remove_pages(zone, start_pfn, nr_pages, altmap); kernel_physical_mapping_remove(start, start + size); - - return ret; } #endif #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index b24aca54353e..ae892eef8b82 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -124,10 +124,10 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE -extern int arch_remove_memory(int nid, u64 start, u64 size, - struct vmem_altmap *altmap); -extern int __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap); +extern void arch_remove_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap); +extern void __remove_pages(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ /* diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3512bba20e2b..6c0c4f48638e 100644 --- a/mm/memory_hotplug.c +++ 
b/mm/memory_hotplug.c @@ -547,8 +547,8 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages, struct vmem_altmap *altmap) +void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; unsigned long map_offset = 0; @@ -579,7 +579,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } set_zone_contiguous(zone); - return 0; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3 From 8c7829b04c523cdc732cb77f59f03320e09f3386 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 13 May 2019 17:21:50 -0700 Subject: mm: fix false-positive OVERCOMMIT_GUESS failures With the default overcommit==guess we occasionally run into mmap rejections despite plenty of memory that would get dropped under pressure but just isn't accounted reclaimable. One example of this is dying cgroups pinned by some page cache. A previous case was auxiliary path name memory associated with dentries; we have since annotated those allocations to avoid overcommit failures (see d79f7aa496fc ("mm: treat indirectly reclaimable memory as free in overcommit logic")). But trying to classify all allocated memory reliably as reclaimable and unreclaimable is a bit of a fool's errand. There could be a myriad of dependencies that constantly change with kernel versions. It becomes even more questionable of an effort when considering how this estimate of available memory is used: it's not compared to the system-wide allocated virtual memory in any way. It's not even compared to the allocating process's address space. It's compared to the single allocation request at hand! So we have an elaborate left-hand side of the equation that tries to assess the exact breathing room the system has available down to a page - and then compare it to an isolated allocation request with no additional context. We could fail an allocation of N bytes, but for two allocations of N/2 bytes we'd do this elaborate dance twice in a row and then still let N bytes of virtual memory through. This doesn't make a whole lot of sense. Let's take a step back and look at the actual goal of the heuristic. From the documentation: Heuristic overcommit handling. Obvious overcommits of address space are refused. Used for a typical system. It ensures a seriously wild allocation fails while allowing overcommit to reduce swap usage. root is allowed to allocate slightly more memory in this mode. This is the default. If all we want to do is catch clearly bogus allocation requests irrespective of the general virtual memory situation, the physical memory counter-part doesn't need to be that complicated, either. When in GUESS mode, catch wild allocations by comparing their request size to total amount of ram and swap in the system. 
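For illustration, a condensed sketch of the check described above as it ends up looking inside __vm_enough_memory(); the surrounding accounting and the non-GUESS paths are elided:

	if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
		/*
		 * Reject only requests that could never fit in RAM plus
		 * swap; no attempt is made to estimate "free" memory.
		 */
		if (pages > totalram_pages() + total_swap_pages)
			goto error;
		return 0;
	}
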
Link: http://lkml.kernel.org/r/20190412191418.26333-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/util.c | 51 +++++---------------------------------------------- 1 file changed, 5 insertions(+), 46 deletions(-) diff --git a/mm/util.c b/mm/util.c index 05a464929b3e..e2e4f8c3fa12 100644 --- a/mm/util.c +++ b/mm/util.c @@ -652,7 +652,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - long free, allowed, reserve; + long allowed; VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < -(s64)vm_committed_as_batch * num_online_cpus(), @@ -667,51 +667,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - free = global_zone_page_state(NR_FREE_PAGES); - free += global_node_page_state(NR_FILE_PAGES); - - /* - * shmem pages shouldn't be counted as free in this - * case, they can't be purged, only swapped out, and - * that won't affect the overall amount of available - * memory in the system. - */ - free -= global_node_page_state(NR_SHMEM); - - free += get_nr_swap_pages(); - - /* - * Any slabs which are created with the - * SLAB_RECLAIM_ACCOUNT flag claim to have contents - * which are reclaimable, under pressure. The dentry - * cache and most inode caches should fall into this - */ - free += global_node_page_state(NR_SLAB_RECLAIMABLE); - - /* - * Part of the kernel memory, which can be released - * under memory pressure. - */ - free += global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE); - - /* - * Leave reserved pages. The pages are not for anonymous pages. - */ - if (free <= totalreserve_pages) + if (pages > totalram_pages() + total_swap_pages) goto error; - else - free -= totalreserve_pages; - - /* - * Reserve some for root - */ - if (!cap_sys_admin) - free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - - if (free > pages) - return 0; - - goto error; + return 0; } allowed = vm_commit_limit(); @@ -725,7 +683,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) * Don't let a single process grow so big a user can't recover */ if (mm) { - reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min_t(long, mm->total_vm / 32, reserve); } -- cgit v1.2.3 From 62afcd1cb8e355330a699b456f05f781e877cc4f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 13 May 2019 17:21:53 -0700 Subject: mm: remove redundant 'default n' from Kconfig-s 'default n' is the default value for any bool or tristate Kconfig setting so there is no need to write it explicitly. Also since commit f467c5640c29 ("kconfig: only write '# CONFIG_FOO is not set' for visible symbols") the Kconfig behavior is the same regardless of 'default n' being present or not: ... One side effect of (and the main motivation for) this change is making the following two definitions behave exactly the same: config FOO bool config FOO bool default n With this change, neither of these will generate a '# CONFIG_FOO is not set' line (assuming FOO isn't selected/implied). That might make it clearer to people that a bare 'default n' is redundant. ... 
Link: http://lkml.kernel.org/r/c3385916-e4d4-37d3-b330-e6b7dff83a52@samsung.com Signed-off-by: Bartlomiej Zolnierkiewicz Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 11 ----------- mm/Kconfig.debug | 1 - 2 files changed, 12 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index 0eada3f818fa..4c2e6b63c064 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -161,7 +161,6 @@ config MEMORY_HOTPLUG_SPARSE config MEMORY_HOTPLUG_DEFAULT_ONLINE bool "Online the newly added memory blocks by default" - default n depends on MEMORY_HOTPLUG help This option sets the default policy setting for memory hotplug @@ -439,7 +438,6 @@ config NEED_PER_CPU_KM config CLEANCACHE bool "Enable cleancache driver to cache clean pages if tmem is present" - default n help Cleancache can be thought of as a page-granularity victim cache for clean pages that the kernel's pageframe replacement algorithm @@ -463,7 +461,6 @@ config CLEANCACHE config FRONTSWAP bool "Enable frontswap to cache swap pages if tmem is present" depends on SWAP - default n help Frontswap is so named because it can be thought of as the opposite of a "backing" store for a swap device. The data is stored into @@ -535,7 +532,6 @@ config ZSWAP depends on FRONTSWAP && CRYPTO=y select CRYPTO_LZO select ZPOOL - default n help A lightweight compressed cache for swap pages. It takes pages that are in the process of being swapped out and attempts to @@ -552,14 +548,12 @@ config ZSWAP config ZPOOL tristate "Common API for compressed memory storage" - default n help Compressed memory storage API. This allows using either zbud or zsmalloc. config ZBUD tristate "Low (Up to 2x) density storage for compressed pages" - default n help A special purpose allocator for storing compressed pages. It is designed to store up to two compressed pages per physical @@ -570,7 +564,6 @@ config ZBUD config Z3FOLD tristate "Up to 3x density storage for compressed pages" depends on ZPOOL - default n help A special purpose allocator for storing compressed pages. It is designed to store up to three compressed pages per physical @@ -580,7 +573,6 @@ config Z3FOLD config ZSMALLOC tristate "Memory allocator for compressed pages" depends on MMU - default n help zsmalloc is a slab-based memory allocator designed to store compressed RAM pages. zsmalloc uses virtual memory mapping @@ -631,7 +623,6 @@ config MAX_STACK_SIZE_MB config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" - default n depends on SPARSEMEM depends on !NEED_PER_CPU_KM depends on 64BIT @@ -743,7 +734,6 @@ config ARCH_HAS_PKEYS config PERCPU_STATS bool "Collect percpu memory statistics" - default n help This feature collects and exposes statistics via debugfs. The information includes global and per chunk statistics, which can @@ -751,7 +741,6 @@ config PERCPU_STATS config GUP_BENCHMARK bool "Enable infrastructure for get_user_pages_fast() benchmarking" - default n help Provides /sys/kernel/debug/gup_benchmark that helps with testing performance of get_user_pages_fast(). diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index e3df921208c0..e980ceb775a4 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -33,7 +33,6 @@ config DEBUG_PAGEALLOC config DEBUG_PAGEALLOC_ENABLE_DEFAULT bool "Enable debug page memory allocations by default?" - default n depends on DEBUG_PAGEALLOC ---help--- Enable debug page memory allocations by default? 
This value -- cgit v1.2.3 From a667d7456f189e3422725dddcd067537feac49c0 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:21:56 -0700 Subject: mm: introduce new vm_map_pages() and vm_map_pages_zero() API Patch series "mm: Use vm_map_pages() and vm_map_pages_zero() API", v5. This patch (of 5): Previouly drivers have their own way of mapping range of kernel pages/memory into user vma and this was done by invoking vm_insert_page() within a loop. As this pattern is common across different drivers, it can be generalized by creating new functions and using them across the drivers. vm_map_pages() is the API which can be used to map kernel memory/pages in drivers which have considered vm_pgoff vm_map_pages_zero() is the API which can be used to map a range of kernel memory/pages in drivers which have not considered vm_pgoff. vm_pgoff is passed as default 0 for those drivers. We _could_ then at a later "fix" these drivers which are using vm_map_pages_zero() to behave according to the normal vm_pgoff offsetting simply by removing the _zero suffix on the function name and if that causes regressions, it gives us an easy way to revert. Tested on Rockchip hardware and display is working, including talking to Lima via prime. Link: http://lkml.kernel.org/r/751cb8a0f4c3e67e95c58a3b072937617f338eea.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Suggested-by: Russell King Suggested-by: Matthew Wilcox Reviewed-by: Mike Rapoport Tested-by: Heiko Stuebner Cc: Michal Hocko Cc: "Kirill A. Shutemov" Cc: Vlastimil Babka Cc: Rik van Riel Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: Robin Murphy Cc: Joonsoo Kim Cc: Thierry Reding Cc: Kees Cook Cc: Marek Szyprowski Cc: Stefan Richter Cc: Sandy Huang Cc: David Airlie Cc: Oleksandr Andrushchenko Cc: Joerg Roedel Cc: Pawel Osciak Cc: Kyungmin Park Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +++ mm/memory.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/nommu.c | 14 ++++++++++ 3 files changed, 99 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index abb7eb7ef0f2..912614fbbef3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2579,6 +2579,10 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num); +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num); vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 9b68a72f8c17..96f1d473c89a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1527,6 +1527,87 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +/* + * __vm_map_pages - maps range of kernel pages into user vma + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * @offset: user's requested vm_pgoff + * + * This allows drivers to map range of kernel pages into a user vma. + * + * Return: 0 on success and error code otherwise. 
+ */ +static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num, unsigned long offset) +{ + unsigned long count = vma_pages(vma); + unsigned long uaddr = vma->vm_start; + int ret, i; + + /* Fail if the user requested offset is beyond the end of the object */ + if (offset > num) + return -ENXIO; + + /* Fail if the user requested size exceeds available object size */ + if (count > num - offset) + return -ENXIO; + + for (i = 0; i < count; i++) { + ret = vm_insert_page(vma, uaddr, pages[offset + i]); + if (ret < 0) + return ret; + uaddr += PAGE_SIZE; + } + + return 0; +} + +/** + * vm_map_pages - maps range of kernel pages starts with non zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Maps an object consisting of @num pages, catering for the user's + * requested vm_pgoff + * + * If we fail to insert any page into the vma, the function will return + * immediately leaving any previously inserted pages present. Callers + * from the mmap handler may immediately return the error as their caller + * will destroy the vma, removing any successfully inserted pages. Other + * callers should make their own arrangements for calling unmap_region(). + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, vma->vm_pgoff); +} +EXPORT_SYMBOL(vm_map_pages); + +/** + * vm_map_pages_zero - map range of kernel pages starts with zero offset + * @vma: user vma to map to + * @pages: pointer to array of source kernel pages + * @num: number of pages in page array + * + * Similar to vm_map_pages(), except that it explicitly sets the offset + * to 0. This function is intended for the drivers that did not consider + * vm_pgoff. + * + * Context: Process context. Called by mmap handlers. + * Return: 0 on success and error code otherwise. + */ +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return __vm_map_pages(vma, pages, num, 0); +} +EXPORT_SYMBOL(vm_map_pages_zero); + static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { diff --git a/mm/nommu.c b/mm/nommu.c index 749276beb109..b492fd1fcf9f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -473,6 +473,20 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_page); +int vm_map_pages(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages); + +int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, + unsigned long num) +{ + return -EINVAL; +} +EXPORT_SYMBOL(vm_map_pages_zero); + /* * sys_brk() for the most part doesn't need the global kernel * lock, except when an application is doing something nasty -- cgit v1.2.3 From 6248461d2168233321601cd6bf2528179b6ac3d1 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:00 -0700 Subject: arm: mm: dma-mapping: convert to use vm_map_pages() Convert to use vm_map_pages() to map range of kernel memory to user vma. 
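For illustration, a hypothetical driver mmap handler built on the new helper; my_drv_mmap(), "my_buf" and its fields are made-up names and not part of this series:

	static int my_drv_mmap(struct file *filp, struct vm_area_struct *vma)
	{
		struct my_buf *buf = filp->private_data;

		/*
		 * vm_map_pages() honours vma->vm_pgoff and range-checks it
		 * against the page array, so the open-coded vm_insert_page()
		 * loop and its offset/size checks are no longer needed.
		 */
		return vm_map_pages(vma, buf->pages, buf->page_count);
	}
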
Link: http://lkml.kernel.org/r/936e5e107c746a7310e3a3c471188ca3ac8f9754.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Cc: Boris Ostrovsky Cc: David Airlie Cc: Heiko Stuebner Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Oleksandr Andrushchenko Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 43f46aa7ef33..0a75058c11f3 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1577,31 +1577,21 @@ static int __arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; struct page **pages = __iommu_get_pages(cpu_addr, attrs); unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long off = vma->vm_pgoff; + int err; if (!pages) return -ENXIO; - if (off >= nr_pages || (usize >> PAGE_SHIFT) > nr_pages - off) + if (vma->vm_pgoff >= nr_pages) return -ENXIO; - pages += off; - - do { - int ret = vm_insert_page(vma, uaddr, *pages++); - if (ret) { - pr_err("Remapping memory failed: %d\n", ret); - return ret; - } - uaddr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); + err = vm_map_pages(vma, pages, nr_pages); + if (err) + pr_err("Remapping memory failed: %d\n", err); - return 0; + return err; } static int arm_iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, -- cgit v1.2.3 From 22660db8926268d6f5cd7dfebc125a9c67bb0276 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:03 -0700 Subject: drivers/firewire/core-iso.c: convert to use vm_map_pages_zero() Convert to use vm_map_pages_zero() to map range of kernel memory to user vma. This driver has ignored vm_pgoff and mapped the entire pages. We could later "fix" these drivers to behave according to the normal vm_pgoff offsetting simply by removing the _zero suffix on the function name and if that causes regressions, it gives us an easy way to revert. Link: http://lkml.kernel.org/r/88645f5ea8202784a8baaf389e592aeb8c505e8e.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Cc: Boris Ostrovsky Cc: David Airlie Cc: Heiko Stuebner Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. 
Shutemov" Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Oleksandr Andrushchenko Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firewire/core-iso.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c index 35e784cffc23..5414eb1306aa 100644 --- a/drivers/firewire/core-iso.c +++ b/drivers/firewire/core-iso.c @@ -107,19 +107,8 @@ EXPORT_SYMBOL(fw_iso_buffer_init); int fw_iso_buffer_map_vma(struct fw_iso_buffer *buffer, struct vm_area_struct *vma) { - unsigned long uaddr; - int i, err; - - uaddr = vma->vm_start; - for (i = 0; i < buffer->page_count; i++) { - err = vm_insert_page(vma, uaddr, buffer->pages[i]); - if (err) - return err; - - uaddr += PAGE_SIZE; - } - - return 0; + return vm_map_pages_zero(vma, buffer->pages, + buffer->page_count); } void fw_iso_buffer_destroy(struct fw_iso_buffer *buffer, -- cgit v1.2.3 From 2f69b3c8bad0c95311878959ae53ec838723bb31 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:07 -0700 Subject: drm/rockchip/rockchip_drm_gem.c: convert to use vm_map_pages() Convert to use vm_map_pages() to map range of kernel memory to user vma. Tested on Rockchip hardware and display is working, including talking to Lima via prime. Link: http://lkml.kernel.org/r/7ba359eb1aceac388d05983c1f29b915bdf291f9.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Tested-by: Heiko Stuebner Cc: Boris Ostrovsky Cc: David Airlie Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. 
Shutemov" Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Oleksandr Andrushchenko Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/rockchip/rockchip_drm_gem.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c index a8db758d523e..a2ebb08990e9 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_gem.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_gem.c @@ -221,26 +221,13 @@ static int rockchip_drm_gem_object_mmap_iommu(struct drm_gem_object *obj, struct vm_area_struct *vma) { struct rockchip_gem_object *rk_obj = to_rockchip_obj(obj); - unsigned int i, count = obj->size >> PAGE_SHIFT; + unsigned int count = obj->size >> PAGE_SHIFT; unsigned long user_count = vma_pages(vma); - unsigned long uaddr = vma->vm_start; - unsigned long offset = vma->vm_pgoff; - unsigned long end = user_count + offset; - int ret; if (user_count == 0) return -ENXIO; - if (end > count) - return -ENXIO; - for (i = offset; i < end; i++) { - ret = vm_insert_page(vma, uaddr, rk_obj->pages[i]); - if (ret) - return ret; - uaddr += PAGE_SIZE; - } - - return 0; + return vm_map_pages(vma, rk_obj->pages, count); } static int rockchip_drm_gem_object_mmap_dma(struct drm_gem_object *obj, -- cgit v1.2.3 From e60b72b1a9f5244931bb0c09918f7992fafa9e56 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:11 -0700 Subject: drm/xen/xen_drm_front_gem.c: convert to use vm_map_pages() Convert to use vm_map_pages() to map range of kernel memory to user vma. Link: http://lkml.kernel.org/r/ff8e10ba778d79419c66ee8215bccf01560540fd.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reviewed-by: Oleksandr Andrushchenko Cc: Boris Ostrovsky Cc: David Airlie Cc: Heiko Stuebner Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. 
Shutemov" Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/xen/xen_drm_front_gem.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/xen/xen_drm_front_gem.c b/drivers/gpu/drm/xen/xen_drm_front_gem.c index 53c376d55fcf..a24548489dde 100644 --- a/drivers/gpu/drm/xen/xen_drm_front_gem.c +++ b/drivers/gpu/drm/xen/xen_drm_front_gem.c @@ -224,8 +224,7 @@ xen_drm_front_gem_import_sg_table(struct drm_device *dev, static int gem_mmap_obj(struct xen_gem_object *xen_obj, struct vm_area_struct *vma) { - unsigned long addr = vma->vm_start; - int i; + int ret; /* * clear the VM_PFNMAP flag that was set by drm_gem_mmap(), and set the @@ -252,18 +251,11 @@ static int gem_mmap_obj(struct xen_gem_object *xen_obj, * FIXME: as we insert all the pages now then no .fault handler must * be called, so don't provide one */ - for (i = 0; i < xen_obj->num_pages; i++) { - int ret; - - ret = vm_insert_page(vma, addr, xen_obj->pages[i]); - if (ret < 0) { - DRM_ERROR("Failed to insert pages into vma: %d\n", ret); - return ret; - } + ret = vm_map_pages(vma, xen_obj->pages, xen_obj->num_pages); + if (ret < 0) + DRM_ERROR("Failed to map pages into vma: %d\n", ret); - addr += PAGE_SIZE; - } - return 0; + return ret; } int xen_drm_front_gem_mmap(struct file *filp, struct vm_area_struct *vma) -- cgit v1.2.3 From b0d0084fd906c0a067909e45d3cc4cc01ceee33f Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:15 -0700 Subject: iommu/dma-iommu.c: convert to use vm_map_pages() Convert to use vm_map_pages() to map range of kernel memory to user vma. Link: http://lkml.kernel.org/r/80c3d220fc6ada73a88ce43ca049afb55a889258.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Cc: Boris Ostrovsky Cc: David Airlie Cc: Heiko Stuebner Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. 
Shutemov" Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Oleksandr Andrushchenko Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/iommu/dma-iommu.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 77aabe637a60..20abd19bbfbe 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -619,17 +619,7 @@ out_free_pages: int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma) { - unsigned long uaddr = vma->vm_start; - unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT; - int ret = -ENXIO; - - for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) { - ret = vm_insert_page(vma, uaddr, pages[i]); - if (ret) - break; - uaddr += PAGE_SIZE; - } - return ret; + return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, -- cgit v1.2.3 From a17ae14766935aaccd7e7629d1ccf980341ffd24 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:19 -0700 Subject: videobuf2/videobuf2-dma-sg.c: convert to use vm_map_pages() Convert to use vm_map_pages() to map range of kernel memory to user vma. vm_pgoff is treated in V4L2 API as a 'cookie' to select a buffer, not as a in-buffer offset by design and it always want to mmap a whole buffer from its beginning. Link: http://lkml.kernel.org/r/a953fe6b3056de1cc6eab654effdd4a22f125375.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Suggested-by: Marek Szyprowski Reviewed-by: Marek Szyprowski Cc: Boris Ostrovsky Cc: David Airlie Cc: Heiko Stuebner Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. Shutemov" Cc: Kyungmin Park Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Oleksandr Andrushchenko Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/media/common/videobuf2/videobuf2-core.c | 7 +++++++ .../media/common/videobuf2/videobuf2-dma-contig.c | 6 ------ drivers/media/common/videobuf2/videobuf2-dma-sg.c | 22 ++++++---------------- 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/drivers/media/common/videobuf2/videobuf2-core.c b/drivers/media/common/videobuf2/videobuf2-core.c index 7ebd58a1c431..3cf25abf5807 100644 --- a/drivers/media/common/videobuf2/videobuf2-core.c +++ b/drivers/media/common/videobuf2/videobuf2-core.c @@ -2201,6 +2201,13 @@ int vb2_mmap(struct vb2_queue *q, struct vm_area_struct *vma) goto unlock; } + /* + * vm_pgoff is treated in V4L2 API as a 'cookie' to select a buffer, + * not as a in-buffer offset. We always want to mmap a whole buffer + * from its beginning. 
+ */ + vma->vm_pgoff = 0; + ret = call_memop(vb, mmap, vb->planes[plane].mem_priv, vma); unlock: diff --git a/drivers/media/common/videobuf2/videobuf2-dma-contig.c b/drivers/media/common/videobuf2/videobuf2-dma-contig.c index 82389aead6ed..ecbef266130b 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-contig.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-contig.c @@ -186,12 +186,6 @@ static int vb2_dc_mmap(void *buf_priv, struct vm_area_struct *vma) return -EINVAL; } - /* - * dma_mmap_* uses vm_pgoff as in-buffer offset, but we want to - * map whole buffer - */ - vma->vm_pgoff = 0; - ret = dma_mmap_attrs(buf->dev, vma, buf->cookie, buf->dma_addr, buf->size, buf->attrs); diff --git a/drivers/media/common/videobuf2/videobuf2-dma-sg.c b/drivers/media/common/videobuf2/videobuf2-dma-sg.c index 270c3162fdcb..4a4c49d6085c 100644 --- a/drivers/media/common/videobuf2/videobuf2-dma-sg.c +++ b/drivers/media/common/videobuf2/videobuf2-dma-sg.c @@ -328,28 +328,18 @@ static unsigned int vb2_dma_sg_num_users(void *buf_priv) static int vb2_dma_sg_mmap(void *buf_priv, struct vm_area_struct *vma) { struct vb2_dma_sg_buf *buf = buf_priv; - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; - int i = 0; + int err; if (!buf) { printk(KERN_ERR "No memory to map\n"); return -EINVAL; } - do { - int ret; - - ret = vm_insert_page(vma, uaddr, buf->pages[i++]); - if (ret) { - printk(KERN_ERR "Remapping memory, error: %d\n", ret); - return ret; - } - - uaddr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); - + err = vm_map_pages(vma, buf->pages, buf->num_pages); + if (err) { + printk(KERN_ERR "Remapping memory, error: %d\n", err); + return err; + } /* * Use common vm_area operations to track buffer refcount. -- cgit v1.2.3 From df9bde015a72ffd978e39a750662c7cf579b1715 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:23 -0700 Subject: xen/gntdev.c: convert to use vm_map_pages() Convert to use vm_map_pages() to map range of kernel memory to user vma. map->count is passed to vm_map_pages() and internal API verify map->count against count ( count = vma_pages(vma)) for page array boundary overrun condition. Link: http://lkml.kernel.org/r/88e56e82d2db98705c2d842e9c9806c00b366d67.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reviewed-by: Boris Ostrovsky Cc: David Airlie Cc: Heiko Stuebner Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. 
Shutemov" Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Oleksandr Andrushchenko Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/xen/gntdev.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 559d4b7f807d..469dfbd6cf90 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -1084,7 +1084,7 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) int index = vma->vm_pgoff; int count = vma_pages(vma); struct gntdev_grant_map *map; - int i, err = -EINVAL; + int err = -EINVAL; if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) return -EINVAL; @@ -1145,12 +1145,9 @@ static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) goto out_put_map; if (!use_ptemod) { - for (i = 0; i < count; i++) { - err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, - map->pages[i]); - if (err) - goto out_put_map; - } + err = vm_map_pages(vma, map->pages, map->count); + if (err) + goto out_put_map; } else { #ifdef CONFIG_X86 /* -- cgit v1.2.3 From 5326905798dee047bc6216da63ecf2c93c15968e Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 13 May 2019 17:22:27 -0700 Subject: xen/privcmd-buf.c: convert to use vm_map_pages_zero() Convert to use vm_map_pages_zero() to map range of kernel memory to user vma. This driver has ignored vm_pgoff. We could later "fix" these drivers to behave according to the normal vm_pgoff offsetting simply by removing the _zero suffix on the function name and if that causes regressions, it gives us an easy way to revert. Link: http://lkml.kernel.org/r/acf678e81d554d01a9b590716ac0ccbdcdf71c25.1552921225.git.jrdr.linux@gmail.com Signed-off-by: Souptick Joarder Reviewed-by: Boris Ostrovsky Cc: David Airlie Cc: Heiko Stuebner Cc: Joerg Roedel Cc: Joonsoo Kim Cc: Juergen Gross Cc: Kees Cook Cc: "Kirill A. 
Shutemov" Cc: Kyungmin Park Cc: Marek Szyprowski Cc: Matthew Wilcox Cc: Mauro Carvalho Chehab Cc: Michal Hocko Cc: Mike Rapoport Cc: Oleksandr Andrushchenko Cc: Pawel Osciak Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Murphy Cc: Russell King Cc: Sandy Huang Cc: Stefan Richter Cc: Stephen Rothwell Cc: Thierry Reding Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/xen/privcmd-buf.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/xen/privcmd-buf.c b/drivers/xen/privcmd-buf.c index a1c61e351d3f..dd5bbb6e1b6b 100644 --- a/drivers/xen/privcmd-buf.c +++ b/drivers/xen/privcmd-buf.c @@ -165,12 +165,8 @@ static int privcmd_buf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_priv->n_pages != count) ret = -ENOMEM; else - for (i = 0; i < vma_priv->n_pages; i++) { - ret = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, - vma_priv->pages[i]); - if (ret) - break; - } + ret = vm_map_pages_zero(vma, vma_priv->pages, + vma_priv->n_pages); if (ret) privcmd_buf_vmapriv_free(vma_priv); -- cgit v1.2.3 From c553ea4fdf2701d64b9e9cca4497a8a2512bb025 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 13 May 2019 17:22:30 -0700 Subject: fs/sync.c: sync_file_range(2) may use WB_SYNC_ALL writeback 23d0127096cb ("fs/sync.c: make sync_file_range(2) use WB_SYNC_NONE writeback") claims that sync_file_range(2) syscall was "created for userspace to be able to issue background writeout and so waiting for in-flight IO is undesirable there" and changes the writeback (back) to WB_SYNC_NONE. This claim is only partially true. It is true for users that use the flag SYNC_FILE_RANGE_WRITE by itself, as does PostgreSQL, the user that was the reason for changing to WB_SYNC_NONE writeback. However, that claim is not true for users that use that flag combination SYNC_FILE_RANGE_{WAIT_BEFORE|WRITE|_WAIT_AFTER}. Those users explicitly requested to wait for in-flight IO as well as to writeback of dirty pages. Re-brand that flag combination as SYNC_FILE_RANGE_WRITE_AND_WAIT and use WB_SYNC_ALL writeback to perform the full range sync request. Link: http://lkml.kernel.org/r/20190409114922.30095-1-amir73il@gmail.com Link: http://lkml.kernel.org/r/20190419072938.31320-1-amir73il@gmail.com Fixes: 23d0127096cb ("fs/sync.c: make sync_file_range(2) use WB_SYNC_NONE") Signed-off-by: Amir Goldstein Acked-by: Jan Kara Cc: Dave Chinner Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sync.c | 21 +++++++++++++++------ include/uapi/linux/fs.h | 3 +++ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 01e82170545a..4d1ff010bc5a 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -292,8 +292,14 @@ int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, } if (flags & SYNC_FILE_RANGE_WRITE) { + int sync_mode = WB_SYNC_NONE; + + if ((flags & SYNC_FILE_RANGE_WRITE_AND_WAIT) == + SYNC_FILE_RANGE_WRITE_AND_WAIT) + sync_mode = WB_SYNC_ALL; + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + sync_mode); if (ret < 0) goto out; } @@ -306,9 +312,9 @@ out: } /* - * sys_sync_file_range() permits finely controlled syncing over a segment of + * ksys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is - * zero then sys_sync_file_range() will operate from offset out to EOF. + * zero then ksys_sync_file_range() will operate from offset out to EOF. 
* * The flag bits are: * @@ -325,7 +331,7 @@ out: * Useful combinations of the flag bits are: * * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages - * in the range which were dirty on entry to sys_sync_file_range() are placed + * in the range which were dirty on entry to ksys_sync_file_range() are placed * under writeout. This is a start-write-for-data-integrity operation. * * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which @@ -337,10 +343,13 @@ out: * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait * for that operation to complete and to return the result. * - * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER + * (a.k.a. SYNC_FILE_RANGE_WRITE_AND_WAIT): * a traditional sync() operation. This is a write-for-data-integrity operation * which will ensure that all pages in the range which were dirty on entry to - * sys_sync_file_range() are committed to disk. + * ksys_sync_file_range() are written to disk. It should be noted that disk + * caches are not flushed by this call, so there are no guarantees here that the + * data will be available on disk after a crash. * * * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 121e82ce296b..59c71fa8c553 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -320,6 +320,9 @@ struct fscrypt_key { #define SYNC_FILE_RANGE_WAIT_BEFORE 1 #define SYNC_FILE_RANGE_WRITE 2 #define SYNC_FILE_RANGE_WAIT_AFTER 4 +#define SYNC_FILE_RANGE_WRITE_AND_WAIT (SYNC_FILE_RANGE_WRITE | \ + SYNC_FILE_RANGE_WAIT_BEFORE | \ + SYNC_FILE_RANGE_WAIT_AFTER) /* * Flags for preadv2/pwritev2: -- cgit v1.2.3 From f46b79120e94f92843a83db1d9ece482ee735d3a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 13 May 2019 17:22:33 -0700 Subject: mm/vmscan.c: simplify shrink_inactive_list() This merges together duplicated patterns of code. Also, replace count_memcg_events() with its irq-careless namesake, because they are already called in interrupts disabled context. 
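For illustration, the merged counting pattern in context; both counts run under pgdat->lru_lock with interrupts disabled, which is why the non-irq-safe __count_*() variants are sufficient (condensed from the hunk below):

	spin_lock_irq(&pgdat->lru_lock);
	...
	item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
	if (global_reclaim(sc))
		__count_vm_events(item, nr_scanned);
	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
	spin_unlock_irq(&pgdat->lru_lock);
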
Link: http://lkml.kernel.org/r/2ece1df4-2989-bc9b-6172-61e9fdde5bfd@virtuozzo.com Signed-off-by: Kirill Tkhai Acked-by: Michal Hocko Reviewed-by: Daniel Jordan Acked-by: Johannes Weiner Cc: Baoquan He Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 41a14eed2e16..888076701c33 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1907,6 +1907,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_taken; struct reclaim_stat stat; int file = is_file_lru(lru); + enum vm_event_item item; struct pglist_data *pgdat = lruvec_pgdat(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; bool stalled = false; @@ -1934,17 +1935,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (current_is_kswapd()) { - if (global_reclaim(sc)) - __count_vm_events(PGSCAN_KSWAPD, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD, - nr_scanned); - } else { - if (global_reclaim(sc)) - __count_vm_events(PGSCAN_DIRECT, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT, - nr_scanned); - } + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (global_reclaim(sc)) + __count_vm_events(item, nr_scanned); + __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); if (nr_taken == 0) @@ -1955,17 +1949,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&pgdat->lru_lock); - if (current_is_kswapd()) { - if (global_reclaim(sc)) - __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD, - nr_reclaimed); - } else { - if (global_reclaim(sc)) - __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed); - count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT, - nr_reclaimed); - } + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (global_reclaim(sc)) + __count_vm_events(item, nr_reclaimed); + __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); reclaim_stat->recent_rotated[0] = stat.nr_activate[0]; reclaim_stat->recent_rotated[1] = stat.nr_activate[1]; -- cgit v1.2.3 From 2c8fc3dcf2dc5cc06c30b63deada4237cd5bf8af Mon Sep 17 00:00:00 2001 From: Jérôme Glisse Date: Mon, 13 May 2019 17:22:37 -0700 Subject: mm/hmm: add ARCH_HAS_HMM_MIRROR ARCH_HAS_HMM_DEVICE Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 2 new Kconfig variables that are not used by anyone. I check that various make ARCH=somearch allmodconfig do work and do not complain. This new Kconfig needs to be added first so that device drivers that depend on HMM can be updated. Once drivers are updated then I can update the HMM Kconfig to depend on this new Kconfig in a followup patch. This is about solving Kconfig for HMM given that device driver are going through their own tree we want to avoid changing them from the mm tree. 
So plan is: 1 - Kernel release N add the new Kconfig to mm/Kconfig (this patch) 2 - Kernel release N+1 update driver to depend on new Kconfig ie stop using ARCH_HASH_HMM and start using ARCH_HAS_HMM_MIRROR and ARCH_HAS_HMM_DEVICE (one or the other or both depending on the driver) 3 - Kernel release N+2 remove ARCH_HASH_HMM and do final Kconfig update in mm/Kconfig Link: http://lkml.kernel.org/r/20190417211141.17580-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Guenter Roeck Cc: Leon Romanovsky Cc: Jason Gunthorpe Cc: Ralph Campbell Cc: John Hubbard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/mm/Kconfig b/mm/Kconfig index 4c2e6b63c064..71e697e693df 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -670,6 +670,22 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. +config ARCH_HAS_HMM_MIRROR + bool + default y + depends on (X86_64 || PPC64) + depends on MMU && 64BIT + +config ARCH_HAS_HMM_DEVICE + bool + default y + depends on (X86_64 || PPC64) + depends on MEMORY_HOTPLUG + depends on MEMORY_HOTREMOVE + depends on SPARSEMEM_VMEMMAP + depends on ARCH_HAS_ZONE_DEVICE + select XARRAY_MULTI + config ARCH_HAS_HMM bool default y -- cgit v1.2.3 From 1c52e6d0681020e2272c0cbce270dd91a215e7d3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:22:40 -0700 Subject: mm/page_alloc.c: remove unnecessary parameter in rmqueue_pcplist Because rmqueue_pcplist() is only called when order is 0, we don't need to use order as a parameter. Link: http://lkml.kernel.org/r/1555591709-11744-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Michal Hocko Acked-by: Pankaj Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c45da9fe3ce1..cbda9aea0bf5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3170,9 +3170,8 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, - struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype, - unsigned int alloc_flags) + struct zone *zone, gfp_t gfp_flags, + int migratetype, unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3184,7 +3183,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } local_irq_restore(flags); @@ -3204,8 +3203,8 @@ struct page *rmqueue(struct zone *preferred_zone, struct page *page; if (likely(order == 0)) { - page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype, alloc_flags); + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, + migratetype, alloc_flags); goto out; } -- cgit v1.2.3 From 9050cce104c04982fb7b5dea9aee4f66f245d6d3 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 13 May 2019 17:22:43 -0700 Subject: mm/z3fold.c: introduce helper functions Patch series "z3fold: support page migration", v2. This patchset implements page migration support and slightly better buddy search. 
To implement page migration support, z3fold has to move away from the current scheme of handle encoding. i. e. stop encoding page address in handles. Instead, a small per-page structure is created which will contain actual addresses for z3fold objects, while pointers to fields of that structure will be used as handles. Thus, it will be possible to change the underlying addresses to reflect page migration. To support migration itself, 3 callbacks will be implemented: 1: isolation callback: z3fold_page_isolate(): try to isolate the page by removing it from all lists. Pages scheduled for some activity and mapped pages will not be isolated. Return true if isolation was successful or false otherwise 2: migration callback: z3fold_page_migrate(): re-check critical conditions and migrate page contents to the new page provided by the system. Returns 0 on success or negative error code otherwise 3: putback callback: z3fold_page_putback(): put back the page if z3fold_page_migrate() for it failed permanently (i. e. not with -EAGAIN code). To make sure an isolated page doesn't get freed, its kref is incremented in z3fold_page_isolate() and decremented during post-migration compaction, if migration was successful, or by z3fold_page_putback() in the other case. Since the new handle encoding scheme implies slight memory consumption increase, better buddy search (which decreases memory consumption) is included in this patchset. This patch (of 4): Introduce a separate helper function for object allocation, as well as 2 smaller helpers to add a buddy to the list and to get a pointer to the pool from the z3fold header. No functional changes here. Link: http://lkml.kernel.org/r/20190417103633.a4bb770b5bf0fb7e43ce1666@gmail.com Signed-off-by: Vitaly Wool Cc: Dan Streetman Cc: Bartlomiej Zolnierkiewicz Cc: Krzysztof Kozlowski Cc: Oleksiy Avramchenko Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 184 +++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 100 insertions(+), 84 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index aee9b0b8d907..7a59875d880c 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -255,10 +255,15 @@ static enum buddy handle_to_buddy(unsigned long handle) return (handle - zhdr->first_num) & BUDDY_MASK; } +static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) +{ + return zhdr->pool; +} + static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) { struct page *page = virt_to_page(zhdr); - struct z3fold_pool *pool = zhdr->pool; + struct z3fold_pool *pool = zhdr_to_pool(zhdr); WARN_ON(!list_empty(&zhdr->buddy)); set_bit(PAGE_STALE, &page->private); @@ -295,9 +300,10 @@ static void release_z3fold_page_locked_list(struct kref *ref) { struct z3fold_header *zhdr = container_of(ref, struct z3fold_header, refcount); - spin_lock(&zhdr->pool->lock); + struct z3fold_pool *pool = zhdr_to_pool(zhdr); + spin_lock(&pool->lock); list_del_init(&zhdr->buddy); - spin_unlock(&zhdr->pool->lock); + spin_unlock(&pool->lock); WARN_ON(z3fold_page_trylock(zhdr)); __release_z3fold_page(zhdr, true); @@ -349,6 +355,23 @@ static int num_free_chunks(struct z3fold_header *zhdr) return nfree; } +/* Add to the appropriate unbuddied list */ +static inline void add_to_unbuddied(struct z3fold_pool *pool, + struct z3fold_header *zhdr) +{ + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || + zhdr->middle_chunks == 0) { + struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); + + int freechunks = 
num_free_chunks(zhdr); + spin_lock(&pool->lock); + list_add(&zhdr->buddy, &unbuddied[freechunks]); + spin_unlock(&pool->lock); + zhdr->cpu = smp_processor_id(); + put_cpu_ptr(pool->unbuddied); + } +} + static inline void *mchunk_memmove(struct z3fold_header *zhdr, unsigned short dst_chunk) { @@ -406,10 +429,8 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) static void do_compact_page(struct z3fold_header *zhdr, bool locked) { - struct z3fold_pool *pool = zhdr->pool; + struct z3fold_pool *pool = zhdr_to_pool(zhdr); struct page *page; - struct list_head *unbuddied; - int fchunks; page = virt_to_page(zhdr); if (locked) @@ -430,18 +451,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) } z3fold_compact_page(zhdr); - unbuddied = get_cpu_ptr(pool->unbuddied); - fchunks = num_free_chunks(zhdr); - if (fchunks < NCHUNKS && - (!zhdr->first_chunks || !zhdr->middle_chunks || - !zhdr->last_chunks)) { - /* the page's not completely free and it's unbuddied */ - spin_lock(&pool->lock); - list_add(&zhdr->buddy, &unbuddied[fchunks]); - spin_unlock(&pool->lock); - zhdr->cpu = smp_processor_id(); - } - put_cpu_ptr(pool->unbuddied); + add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); } @@ -453,6 +463,67 @@ static void compact_page_work(struct work_struct *w) do_compact_page(zhdr, false); } +/* returns _locked_ z3fold page header or NULL */ +static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, + size_t size, bool can_sleep) +{ + struct z3fold_header *zhdr = NULL; + struct page *page; + struct list_head *unbuddied; + int chunks = size_to_chunks(size), i; + +lookup: + /* First, try to find an unbuddied z3fold page. */ + unbuddied = get_cpu_ptr(pool->unbuddied); + for_each_unbuddied_list(i, chunks) { + struct list_head *l = &unbuddied[i]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), + struct z3fold_header, buddy); + + if (!zhdr) + continue; + + /* Re-check under lock. */ + spin_lock(&pool->lock); + l = &unbuddied[i]; + if (unlikely(zhdr != list_first_entry(READ_ONCE(l), + struct z3fold_header, buddy)) || + !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + put_cpu_ptr(pool->unbuddied); + if (can_sleep) + cond_resched(); + goto lookup; + } + + /* + * this page could not be removed from its unbuddied + * list while pool lock was held, and then we've taken + * page lock so kref_put could not be called before + * we got here, so it's safe to just call kref_get() + */ + kref_get(&zhdr->refcount); + break; + } + put_cpu_ptr(pool->unbuddied); + + return zhdr; +} /* * API Functions @@ -546,7 +617,7 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool) static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { - int chunks = 0, i, freechunks; + int chunks = size_to_chunks(size); struct z3fold_header *zhdr = NULL; struct page *page = NULL; enum buddy bud; @@ -561,56 +632,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) bud = HEADLESS; else { - struct list_head *unbuddied; - chunks = size_to_chunks(size); - -lookup: - /* First, try to find an unbuddied z3fold page. 
*/ - unbuddied = get_cpu_ptr(pool->unbuddied); - for_each_unbuddied_list(i, chunks) { - struct list_head *l = &unbuddied[i]; - - zhdr = list_first_entry_or_null(READ_ONCE(l), - struct z3fold_header, buddy); - - if (!zhdr) - continue; - - /* Re-check under lock. */ - spin_lock(&pool->lock); - l = &unbuddied[i]; - if (unlikely(zhdr != list_first_entry(READ_ONCE(l), - struct z3fold_header, buddy)) || - !z3fold_page_trylock(zhdr)) { - spin_unlock(&pool->lock); - put_cpu_ptr(pool->unbuddied); - goto lookup; - } - list_del_init(&zhdr->buddy); - zhdr->cpu = -1; - spin_unlock(&pool->lock); - - page = virt_to_page(zhdr); - if (test_bit(NEEDS_COMPACTING, &page->private)) { - z3fold_page_unlock(zhdr); - zhdr = NULL; - put_cpu_ptr(pool->unbuddied); - if (can_sleep) - cond_resched(); - goto lookup; - } - - /* - * this page could not be removed from its unbuddied - * list while pool lock was held, and then we've taken - * page lock so kref_put could not be called before - * we got here, so it's safe to just call kref_get() - */ - kref_get(&zhdr->refcount); - break; - } - put_cpu_ptr(pool->unbuddied); - +retry: + zhdr = __z3fold_alloc(pool, size, can_sleep); if (zhdr) { if (zhdr->first_chunks == 0) { if (zhdr->middle_chunks != 0 && @@ -630,8 +653,9 @@ lookup: z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); - goto lookup; + goto retry; } + page = virt_to_page(zhdr); goto found; } bud = FIRST; @@ -662,8 +686,12 @@ lookup: if (!page) return -ENOMEM; - atomic64_inc(&pool->pages_nr); zhdr = init_z3fold_page(page, pool); + if (!zhdr) { + __free_page(page); + return -ENOMEM; + } + atomic64_inc(&pool->pages_nr); if (bud == HEADLESS) { set_bit(PAGE_HEADLESS, &page->private); @@ -680,19 +708,7 @@ found: zhdr->middle_chunks = chunks; zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS; } - - if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || - zhdr->middle_chunks == 0) { - struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied); - - /* Add to unbuddied list */ - freechunks = num_free_chunks(zhdr); - spin_lock(&pool->lock); - list_add(&zhdr->buddy, &unbuddied[freechunks]); - spin_unlock(&pool->lock); - zhdr->cpu = smp_processor_id(); - put_cpu_ptr(pool->unbuddied); - } + add_to_unbuddied(pool, zhdr); headless: spin_lock(&pool->lock); -- cgit v1.2.3 From 351618b203acef13946a03ecf18fbe328c3cdb58 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 13 May 2019 17:22:46 -0700 Subject: mm/z3fold.c: improve compression by extending search The current z3fold implementation only searches this CPU's page lists for a fitting page to put a new object into. This patch adds quick search for very well fitting pages (i. e. those having exactly the required number of free space) on other CPUs too, before allocating a new page for that object. 
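For illustration, the shape of the added lookup: only the unbuddied list for an exact chunk-count match is probed on each other CPU, so the extra scan stays cheap (condensed from the hunk below, with the NEEDS_COMPACTING re-check and kref handling omitted):

	for_each_online_cpu(cpu) {
		unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
		spin_lock(&pool->lock);
		zhdr = list_first_entry_or_null(&unbuddied[chunks],
						struct z3fold_header, buddy);
		if (!zhdr || !z3fold_page_trylock(zhdr)) {
			spin_unlock(&pool->lock);
			zhdr = NULL;
			continue;
		}
		list_del_init(&zhdr->buddy);
		zhdr->cpu = -1;
		spin_unlock(&pool->lock);
		break;
	}
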
Link: http://lkml.kernel.org/r/20190417103733.72ae81abe1552397c95a008e@gmail.com Signed-off-by: Vitaly Wool Cc: Bartlomiej Zolnierkiewicz Cc: Dan Streetman Cc: Krzysztof Kozlowski Cc: Oleksiy Avramchenko Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/mm/z3fold.c b/mm/z3fold.c index 7a59875d880c..29a4f1249bef 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -522,6 +522,42 @@ lookup: } put_cpu_ptr(pool->unbuddied); + if (!zhdr) { + int cpu; + + /* look for _exact_ match on other cpus' lists */ + for_each_online_cpu(cpu) { + struct list_head *l; + + unbuddied = per_cpu_ptr(pool->unbuddied, cpu); + spin_lock(&pool->lock); + l = &unbuddied[chunks]; + + zhdr = list_first_entry_or_null(READ_ONCE(l), + struct z3fold_header, buddy); + + if (!zhdr || !z3fold_page_trylock(zhdr)) { + spin_unlock(&pool->lock); + zhdr = NULL; + continue; + } + list_del_init(&zhdr->buddy); + zhdr->cpu = -1; + spin_unlock(&pool->lock); + + page = virt_to_page(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private)) { + z3fold_page_unlock(zhdr); + zhdr = NULL; + if (can_sleep) + cond_resched(); + continue; + } + kref_get(&zhdr->refcount); + break; + } + } + return zhdr; } -- cgit v1.2.3 From 7c2b8baa61fe578af905342938ad12f8dbaeae79 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 13 May 2019 17:22:49 -0700 Subject: mm/z3fold.c: add structure for buddy handles For z3fold to be able to move its pages per request of the memory subsystem, it should not use direct object addresses in handles. Instead, it will create abstract handles (3 per page) which will contain pointers to z3fold objects. Thus, it will be possible to change these pointers when z3fold page is moved. Link: http://lkml.kernel.org/r/20190417103826.484eaf18c1294d682769880f@gmail.com Signed-off-by: Vitaly Wool Cc: Bartlomiej Zolnierkiewicz Cc: Dan Streetman Cc: Krzysztof Kozlowski Cc: Oleksiy Avramchenko Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 185 +++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 145 insertions(+), 40 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 29a4f1249bef..bebc10083f1c 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -34,6 +34,29 @@ #include #include +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks + * in the beginning of an allocated page are occupied by z3fold header, so + * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), + * which shows the max number of free chunks in z3fold page, also there will + * be 63, or 62, respectively, freelists per pool. 
+ */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) +#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) +#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +#define BUDDY_MASK (0x3) +#define BUDDY_SHIFT 2 +#define SLOTS_ALIGN (0x40) + /***************** * Structures *****************/ @@ -47,9 +70,19 @@ enum buddy { FIRST, MIDDLE, LAST, - BUDDIES_MAX + BUDDIES_MAX = LAST }; +struct z3fold_buddy_slots { + /* + * we are using BUDDY_MASK in handle_to_buddy etc. so there should + * be enough slots to hold all possible variants + */ + unsigned long slot[BUDDY_MASK + 1]; + unsigned long pool; /* back link + flags */ +}; +#define HANDLE_FLAG_MASK (0x03) + /* * struct z3fold_header - z3fold page metadata occupying first chunks of each * z3fold page, except for HEADLESS pages @@ -58,7 +91,7 @@ enum buddy { * @page_lock: per-page lock * @refcount: reference count for the z3fold page * @work: work_struct for page layout optimization - * @pool: pointer to the pool which this page belongs to + * @slots: pointer to the structure holding buddy slots * @cpu: CPU which this page "belongs" to * @first_chunks: the size of the first buddy in chunks, 0 if free * @middle_chunks: the size of the middle buddy in chunks, 0 if free @@ -70,7 +103,7 @@ struct z3fold_header { spinlock_t page_lock; struct kref refcount; struct work_struct work; - struct z3fold_pool *pool; + struct z3fold_buddy_slots *slots; short cpu; unsigned short first_chunks; unsigned short middle_chunks; @@ -79,28 +112,6 @@ struct z3fold_header { unsigned short first_num:2; }; -/* - * NCHUNKS_ORDER determines the internal allocation granularity, effectively - * adjusting internal fragmentation. It also determines the number of - * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the - * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks - * in the beginning of an allocated page are occupied by z3fold header, so - * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y), - * which shows the max number of free chunks in z3fold page, also there will - * be 63, or 62, respectively, freelists per pool. - */ -#define NCHUNKS_ORDER 6 - -#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) -#define CHUNK_SIZE (1 << CHUNK_SHIFT) -#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE) -#define ZHDR_CHUNKS (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT) -#define TOTAL_CHUNKS (PAGE_SIZE >> CHUNK_SHIFT) -#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) - -#define BUDDY_MASK (0x3) -#define BUDDY_SHIFT 2 - /** * struct z3fold_pool - stores metadata for each z3fold pool * @name: pool name @@ -113,6 +124,7 @@ struct z3fold_header { * added buddy. * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. + * @c_handle: cache for z3fold_buddy_slots allocation * @ops: pointer to a structure of user defined operations specified at * pool creation time. 
* @compact_wq: workqueue for page layout background optimization @@ -130,6 +142,7 @@ struct z3fold_pool { struct list_head lru; struct list_head stale; atomic64_t pages_nr; + struct kmem_cache *c_handle; const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; @@ -164,11 +177,65 @@ static int size_to_chunks(size_t size) static void compact_page_work(struct work_struct *w); +static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool) +{ + struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle, + GFP_KERNEL); + + if (slots) { + memset(slots->slot, 0, sizeof(slots->slot)); + slots->pool = (unsigned long)pool; + } + + return slots; +} + +static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s) +{ + return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK); +} + +static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle) +{ + return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1)); +} + +static inline void free_handle(unsigned long handle) +{ + struct z3fold_buddy_slots *slots; + int i; + bool is_free; + + if (handle & (1 << PAGE_HEADLESS)) + return; + + WARN_ON(*(unsigned long *)handle == 0); + *(unsigned long *)handle = 0; + slots = handle_to_slots(handle); + is_free = true; + for (i = 0; i <= BUDDY_MASK; i++) { + if (slots->slot[i]) { + is_free = false; + break; + } + } + + if (is_free) { + struct z3fold_pool *pool = slots_to_pool(slots); + + kmem_cache_free(pool->c_handle, slots); + } +} + /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, struct z3fold_pool *pool) { struct z3fold_header *zhdr = page_address(page); + struct z3fold_buddy_slots *slots = alloc_slots(pool); + + if (!slots) + return NULL; INIT_LIST_HEAD(&page->lru); clear_bit(PAGE_HEADLESS, &page->private); @@ -185,7 +252,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, zhdr->first_num = 0; zhdr->start_middle = 0; zhdr->cpu = -1; - zhdr->pool = pool; + zhdr->slots = slots; INIT_LIST_HEAD(&zhdr->buddy); INIT_WORK(&zhdr->work, compact_page_work); return zhdr; @@ -215,33 +282,57 @@ static inline void z3fold_page_unlock(struct z3fold_header *zhdr) spin_unlock(&zhdr->page_lock); } +/* Helper function to build the index */ +static inline int __idx(struct z3fold_header *zhdr, enum buddy bud) +{ + return (bud + zhdr->first_num) & BUDDY_MASK; +} + /* * Encodes the handle of a particular buddy within a z3fold page * Pool lock should be held as this function accesses first_num */ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) { - unsigned long handle; + struct z3fold_buddy_slots *slots; + unsigned long h = (unsigned long)zhdr; + int idx = 0; - handle = (unsigned long)zhdr; - if (bud != HEADLESS) { - handle |= (bud + zhdr->first_num) & BUDDY_MASK; - if (bud == LAST) - handle |= (zhdr->last_chunks << BUDDY_SHIFT); - } - return handle; + /* + * For a headless page, its handle is its pointer with the extra + * PAGE_HEADLESS bit set + */ + if (bud == HEADLESS) + return h | (1 << PAGE_HEADLESS); + + /* otherwise, return pointer to encoded handle */ + idx = __idx(zhdr, bud); + h += idx; + if (bud == LAST) + h |= (zhdr->last_chunks << BUDDY_SHIFT); + + slots = zhdr->slots; + slots->slot[idx] = h; + return (unsigned long)&slots->slot[idx]; } /* Returns the z3fold page where a given handle is stored */ -static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) +static inline struct 
z3fold_header *handle_to_z3fold_header(unsigned long handle) { - return (struct z3fold_header *)(handle & PAGE_MASK); + unsigned long addr = handle; + + if (!(addr & (1 << PAGE_HEADLESS))) + addr = *(unsigned long *)handle; + + return (struct z3fold_header *)(addr & PAGE_MASK); } /* only for LAST bud, returns zero otherwise */ static unsigned short handle_to_chunks(unsigned long handle) { - return (handle & ~PAGE_MASK) >> BUDDY_SHIFT; + unsigned long addr = *(unsigned long *)handle; + + return (addr & ~PAGE_MASK) >> BUDDY_SHIFT; } /* @@ -251,13 +342,18 @@ static unsigned short handle_to_chunks(unsigned long handle) */ static enum buddy handle_to_buddy(unsigned long handle) { - struct z3fold_header *zhdr = handle_to_z3fold_header(handle); - return (handle - zhdr->first_num) & BUDDY_MASK; + struct z3fold_header *zhdr; + unsigned long addr; + + WARN_ON(handle & (1 << PAGE_HEADLESS)); + addr = *(unsigned long *)handle; + zhdr = (struct z3fold_header *)(addr & PAGE_MASK); + return (addr - zhdr->first_num) & BUDDY_MASK; } static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) { - return zhdr->pool; + return slots_to_pool(zhdr->slots); } static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) @@ -583,6 +679,11 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool = kzalloc(sizeof(struct z3fold_pool), gfp); if (!pool) goto out; + pool->c_handle = kmem_cache_create("z3fold_handle", + sizeof(struct z3fold_buddy_slots), + SLOTS_ALIGN, 0, NULL); + if (!pool->c_handle) + goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); @@ -613,6 +714,8 @@ out_wq: out_unbuddied: free_percpu(pool->unbuddied); out_pool: + kmem_cache_destroy(pool->c_handle); +out_c: kfree(pool); out: return NULL; @@ -626,6 +729,7 @@ out: */ static void z3fold_destroy_pool(struct z3fold_pool *pool) { + kmem_cache_destroy(pool->c_handle); destroy_workqueue(pool->release_wq); destroy_workqueue(pool->compact_wq); kfree(pool); @@ -818,6 +922,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) return; } + free_handle(handle); if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) { atomic64_dec(&pool->pages_nr); return; -- cgit v1.2.3 From 1f862989b04ade61d3aab49184c50e9957f84c7d Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Mon, 13 May 2019 17:22:52 -0700 Subject: mm/z3fold.c: support page migration Now that we are not using page address in handles directly, we can make z3fold pages movable to decrease the memory fragmentation z3fold may create over time. This patch starts advertising non-headless z3fold pages as movable and uses the existing kernel infrastructure to implement moving of such pages per memory management subsystem's request. It thus implements 3 required callbacks for page migration: * isolation callback: z3fold_page_isolate(): try to isolate the page by removing it from all lists. Pages scheduled for some activity and mapped pages will not be isolated. Return true if isolation was successful or false otherwise * migration callback: z3fold_page_migrate(): re-check critical conditions and migrate page contents to the new page provided by the memory subsystem. Returns 0 on success or negative error code otherwise * putback callback: z3fold_page_putback(): put back the page if z3fold_page_migrate() for it failed permanently (i. e. not with -EAGAIN code). 
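The indirection that makes these callbacks workable can be sketched in plain C. The types below are illustrative stand-ins for this sketch only, not the kernel's z3fold_buddy_slots: a handle is the address of a slot, the slot records where the object currently lives, and migrating the object only has to rewrite the slot, so handles already given out stay valid.

#include <stdio.h>
#include <string.h>

/* illustrative stand-in for the buddy slots: one level of indirection */
struct slots {
        unsigned long slot[3];          /* current object addresses, one per buddy */
};

struct object {
        char payload[32];
        struct slots *slots;            /* back link so migration can find the slot */
        int idx;                        /* which slot describes this object */
};

/* a handle is a pointer to the slot, not to the object itself
 * (assumes unsigned long can hold a pointer, as on Linux) */
static unsigned long encode_handle(struct object *obj)
{
        obj->slots->slot[obj->idx] = (unsigned long)obj;
        return (unsigned long)&obj->slots->slot[obj->idx];
}

static struct object *resolve_handle(unsigned long handle)
{
        return (struct object *)*(unsigned long *)handle;
}

/* "migrate": copy the object somewhere else and repoint its slot */
static void migrate(struct object *from, struct object *to)
{
        memcpy(to, from, sizeof(*to));
        to->slots->slot[to->idx] = (unsigned long)to;
}

int main(void)
{
        struct slots s = { { 0 } };
        struct object a = { "hello", &s, 0 }, b;
        unsigned long h = encode_handle(&a);

        migrate(&a, &b);
        printf("%s\n", resolve_handle(h)->payload); /* still prints "hello" */
        return 0;
}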
[lkp@intel.com: z3fold_page_isolate() can be static] Link: http://lkml.kernel.org/r/20190419130924.GA161478@ivb42 Link: http://lkml.kernel.org/r/20190417103922.31253da5c366c4ebe0419cfc@gmail.com Signed-off-by: Vitaly Wool Signed-off-by: kbuild test robot Cc: Bartlomiej Zolnierkiewicz Cc: Dan Streetman Cc: Krzysztof Kozlowski Cc: Oleksiy Avramchenko Cc: Uladzislau Rezki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 241 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 231 insertions(+), 10 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index bebc10083f1c..1ffecd6333e5 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -24,10 +24,18 @@ #include #include +#include +#include #include #include #include +#include +#include +#include +#include #include +#include +#include #include #include #include @@ -97,6 +105,7 @@ struct z3fold_buddy_slots { * @middle_chunks: the size of the middle buddy in chunks, 0 if free * @last_chunks: the size of the last buddy in chunks, 0 if free * @first_num: the starting number (for the first handle) + * @mapped_count: the number of objects currently mapped */ struct z3fold_header { struct list_head buddy; @@ -110,6 +119,7 @@ struct z3fold_header { unsigned short last_chunks; unsigned short start_middle; unsigned short first_num:2; + unsigned short mapped_count:2; }; /** @@ -130,6 +140,7 @@ struct z3fold_header { * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release + * @inode: inode for z3fold pseudo filesystem * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. @@ -149,6 +160,7 @@ struct z3fold_pool { struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; struct work_struct work; + struct inode *inode; }; /* @@ -227,6 +239,59 @@ static inline void free_handle(unsigned long handle) } } +static struct dentry *z3fold_do_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static const struct dentry_operations ops = { + .d_dname = simple_dname, + }; + + return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33); +} + +static struct file_system_type z3fold_fs = { + .name = "z3fold", + .mount = z3fold_do_mount, + .kill_sb = kill_anon_super, +}; + +static struct vfsmount *z3fold_mnt; +static int z3fold_mount(void) +{ + int ret = 0; + + z3fold_mnt = kern_mount(&z3fold_fs); + if (IS_ERR(z3fold_mnt)) + ret = PTR_ERR(z3fold_mnt); + + return ret; +} + +static void z3fold_unmount(void) +{ + kern_unmount(z3fold_mnt); +} + +static const struct address_space_operations z3fold_aops; +static int z3fold_register_migration(struct z3fold_pool *pool) +{ + pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb); + if (IS_ERR(pool->inode)) { + pool->inode = NULL; + return 1; + } + + pool->inode->i_mapping->private_data = pool; + pool->inode->i_mapping->a_ops = &z3fold_aops; + return 0; +} + +static void z3fold_unregister_migration(struct z3fold_pool *pool) +{ + if (pool->inode) + iput(pool->inode); + } + /* Initializes the z3fold header of a newly allocated z3fold page */ static struct z3fold_header *init_z3fold_page(struct page *page, struct z3fold_pool *pool) @@ -259,8 +324,14 @@ static struct z3fold_header *init_z3fold_page(struct page *page, } /* Resets the struct page fields and frees the page */ -static void free_z3fold_page(struct page *page) +static void free_z3fold_page(struct page *page, bool headless) { + 
if (!headless) { + lock_page(page); + __ClearPageMovable(page); + unlock_page(page); + } + ClearPagePrivate(page); __free_page(page); } @@ -317,12 +388,12 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) } /* Returns the z3fold page where a given handle is stored */ -static inline struct z3fold_header *handle_to_z3fold_header(unsigned long handle) +static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) { - unsigned long addr = handle; + unsigned long addr = h; if (!(addr & (1 << PAGE_HEADLESS))) - addr = *(unsigned long *)handle; + addr = *(unsigned long *)h; return (struct z3fold_header *)(addr & PAGE_MASK); } @@ -366,7 +437,7 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) clear_bit(NEEDS_COMPACTING, &page->private); spin_lock(&pool->lock); if (!list_empty(&page->lru)) - list_del(&page->lru); + list_del_init(&page->lru); spin_unlock(&pool->lock); if (locked) z3fold_page_unlock(zhdr); @@ -420,7 +491,7 @@ static void free_pages_work(struct work_struct *w) continue; spin_unlock(&pool->stale_lock); cancel_work_sync(&zhdr->work); - free_z3fold_page(page); + free_z3fold_page(page, false); cond_resched(); spin_lock(&pool->stale_lock); } @@ -486,6 +557,9 @@ static int z3fold_compact_page(struct z3fold_header *zhdr) if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private)) return 0; /* can't move middle chunk, it's used */ + if (unlikely(PageIsolated(page))) + return 0; + if (zhdr->middle_chunks == 0) return 0; /* nothing to compact */ @@ -546,6 +620,12 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) return; } + if (unlikely(PageIsolated(page) || + test_bit(PAGE_STALE, &page->private))) { + z3fold_page_unlock(zhdr); + return; + } + z3fold_compact_page(zhdr); add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); @@ -705,10 +785,14 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, pool->release_wq = create_singlethread_workqueue(pool->name); if (!pool->release_wq) goto out_wq; + if (z3fold_register_migration(pool)) + goto out_rwq; INIT_WORK(&pool->work, free_pages_work); pool->ops = ops; return pool; +out_rwq: + destroy_workqueue(pool->release_wq); out_wq: destroy_workqueue(pool->compact_wq); out_unbuddied: @@ -730,6 +814,7 @@ out: static void z3fold_destroy_pool(struct z3fold_pool *pool) { kmem_cache_destroy(pool->c_handle); + z3fold_unregister_migration(pool); destroy_workqueue(pool->release_wq); destroy_workqueue(pool->compact_wq); kfree(pool); @@ -837,6 +922,7 @@ retry: set_bit(PAGE_HEADLESS, &page->private); goto headless; } + __SetPageMovable(page, pool->inode->i_mapping); z3fold_page_lock(zhdr); found: @@ -895,7 +981,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) spin_lock(&pool->lock); list_del(&page->lru); spin_unlock(&pool->lock); - free_z3fold_page(page); + free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); } return; @@ -931,7 +1017,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) z3fold_page_unlock(zhdr); return; } - if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) { + if (unlikely(PageIsolated(page)) || + test_and_set_bit(NEEDS_COMPACTING, &page->private)) { z3fold_page_unlock(zhdr); return; } @@ -1012,10 +1099,12 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) if (test_and_set_bit(PAGE_CLAIMED, &page->private)) continue; - zhdr = page_address(page); + if (unlikely(PageIsolated(page))) + continue; if (test_bit(PAGE_HEADLESS, &page->private)) break; 
+ zhdr = page_address(page); if (!z3fold_page_trylock(zhdr)) { zhdr = NULL; continue; /* can't evict at this point */ @@ -1076,7 +1165,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) next: if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { - free_z3fold_page(page); + free_z3fold_page(page, true); atomic64_dec(&pool->pages_nr); return 0; } @@ -1153,6 +1242,8 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) break; } + if (addr) + zhdr->mapped_count++; z3fold_page_unlock(zhdr); out: return addr; @@ -1179,6 +1270,7 @@ static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) buddy = handle_to_buddy(handle); if (buddy == MIDDLE) clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + zhdr->mapped_count--; z3fold_page_unlock(zhdr); } @@ -1193,6 +1285,128 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) return atomic64_read(&pool->pages_nr); } +static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) +{ + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + + if (test_bit(PAGE_HEADLESS, &page->private)) + return false; + + zhdr = page_address(page); + z3fold_page_lock(zhdr); + if (test_bit(NEEDS_COMPACTING, &page->private) || + test_bit(PAGE_STALE, &page->private)) + goto out; + + pool = zhdr_to_pool(zhdr); + + if (zhdr->mapped_count == 0) { + kref_get(&zhdr->refcount); + if (!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + spin_lock(&pool->lock); + if (!list_empty(&page->lru)) + list_del(&page->lru); + spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); + return true; + } +out: + z3fold_page_unlock(zhdr); + return false; +} + +static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct z3fold_header *zhdr, *new_zhdr; + struct z3fold_pool *pool; + struct address_space *new_mapping; + + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(!PageIsolated(page), page); + + zhdr = page_address(page); + pool = zhdr_to_pool(zhdr); + + if (!trylock_page(page)) + return -EAGAIN; + + if (!z3fold_page_trylock(zhdr)) { + unlock_page(page); + return -EAGAIN; + } + if (zhdr->mapped_count != 0) { + z3fold_page_unlock(zhdr); + unlock_page(page); + return -EBUSY; + } + new_zhdr = page_address(newpage); + memcpy(new_zhdr, zhdr, PAGE_SIZE); + newpage->private = page->private; + page->private = 0; + z3fold_page_unlock(zhdr); + spin_lock_init(&new_zhdr->page_lock); + new_mapping = page_mapping(page); + __ClearPageMovable(page); + ClearPagePrivate(page); + + get_page(newpage); + z3fold_page_lock(new_zhdr); + if (new_zhdr->first_chunks) + encode_handle(new_zhdr, FIRST); + if (new_zhdr->last_chunks) + encode_handle(new_zhdr, LAST); + if (new_zhdr->middle_chunks) + encode_handle(new_zhdr, MIDDLE); + set_bit(NEEDS_COMPACTING, &newpage->private); + new_zhdr->cpu = smp_processor_id(); + spin_lock(&pool->lock); + list_add(&newpage->lru, &pool->lru); + spin_unlock(&pool->lock); + __SetPageMovable(newpage, new_mapping); + z3fold_page_unlock(new_zhdr); + + queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); + + page_mapcount_reset(page); + unlock_page(page); + put_page(page); + return 0; +} + +static void z3fold_page_putback(struct page *page) +{ + struct z3fold_header *zhdr; + struct z3fold_pool *pool; + + zhdr = page_address(page); + pool = zhdr_to_pool(zhdr); + + z3fold_page_lock(zhdr); + if 
(!list_empty(&zhdr->buddy)) + list_del_init(&zhdr->buddy); + INIT_LIST_HEAD(&page->lru); + if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); + return; + } + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); +} + +static const struct address_space_operations z3fold_aops = { + .isolate_page = z3fold_page_isolate, + .migratepage = z3fold_page_migrate, + .putback_page = z3fold_page_putback, +}; + /***************** * zpool ****************/ @@ -1290,8 +1504,14 @@ MODULE_ALIAS("zpool-z3fold"); static int __init init_z3fold(void) { + int ret; + /* Make sure the z3fold header is not larger than the page size */ BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE); + ret = z3fold_mount(); + if (ret) + return ret; + zpool_register_driver(&z3fold_zpool_driver); return 0; @@ -1299,6 +1519,7 @@ static int __init init_z3fold(void) static void __exit exit_z3fold(void) { + z3fold_unmount(); zpool_unregister_driver(&z3fold_zpool_driver); } -- cgit v1.2.3 From f27a5136f70a8c90e8b30a983b6f54540742f849 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Mon, 13 May 2019 17:22:55 -0700 Subject: hugetlbfs: always use address space in inode for resv_map pointer Continuing discussion about 58b6e5e8f1ad ("hugetlbfs: fix memory leak for resv_map") brought up the issue that inode->i_mapping may not point to the address space embedded within the inode at inode eviction time. The hugetlbfs truncate routine handles this by explicitly using inode->i_data. However, code cleaning up the resv_map will still use the address space pointed to by inode->i_mapping. Luckily, private_data is NULL for address spaces in all such cases today but, there is no guarantee this will continue. Change all hugetlbfs code getting a resv_map pointer to explicitly get it from the address space embedded within the inode. In addition, add more comments in the code to indicate why this is being done. Link: http://lkml.kernel.org/r/20190419204435.16984-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reported-by: Yufen Yu Cc: Michal Hocko Cc: Naoya Horiguchi Cc: "Kirill A . Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 11 +++++++++-- mm/hugetlb.c | 19 ++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f23237135163..1dcc57189382 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -497,8 +497,15 @@ static void hugetlbfs_evict_inode(struct inode *inode) struct resv_map *resv_map; remove_inode_hugepages(inode, 0, LLONG_MAX); - resv_map = (struct resv_map *)inode->i_mapping->private_data; - /* root inode doesn't have the resv_map, so we should check it */ + + /* + * Get the resv_map from the address space embedded in the inode. + * This is the address space which points to any resv_map allocated + * at inode creation time. If this is a device special inode, + * i_mapping may not point to the original address space. 
+ */ + resv_map = (struct resv_map *)(&inode->i_data)->private_data; + /* Only regular and link inodes have associated reserve maps */ if (resv_map) resv_map_release(&resv_map->refs); clear_inode(inode); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cab38ef30238..81718c56b8f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -740,7 +740,15 @@ void resv_map_release(struct kref *ref) static inline struct resv_map *inode_resv_map(struct inode *inode) { - return inode->i_mapping->private_data; + /* + * At inode evict time, i_mapping may not point to the original + * address space within the inode. This original address space + * contains the pointer to the resv_map. So, always use the + * address space embedded within the inode. + * The VERY common case is inode->mapping == &inode->i_data but, + * this may not be true for device special inodes. + */ + return (struct resv_map *)(&inode->i_data)->private_data; } static struct resv_map *vma_resv_map(struct vm_area_struct *vma) @@ -4518,6 +4526,11 @@ int hugetlb_reserve_pages(struct inode *inode, * called to make the mapping read-write. Assume !vma is a shm mapping */ if (!vma || vma->vm_flags & VM_MAYSHARE) { + /* + * resv_map can not be NULL as hugetlb_reserve_pages is only + * called for inodes for which resv_maps were created (see + * hugetlbfs_get_inode). + */ resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to); @@ -4609,6 +4622,10 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; + /* + * Since this routine can be called in the evict inode path for all + * hugetlbfs inodes, resv_map could be NULL. + */ if (resv_map) { chg = region_del(resv_map, start, end); /* -- cgit v1.2.3 From 350e88bad4964da6feabee02a1a70381bcdb087e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:22:59 -0700 Subject: mm: memblock: make keeping memblock memory opt-in rather than opt-out Most architectures do not need the memblock memory after the page allocator is initialized, but only few enable ARCH_DISCARD_MEMBLOCK in the arch Kconfig. Replacing ARCH_DISCARD_MEMBLOCK with ARCH_KEEP_MEMBLOCK and inverting the logic makes it clear which architectures actually use memblock after system initialization and skips the necessity to add ARCH_DISCARD_MEMBLOCK to the architectures that are still missing that option. Link: http://lkml.kernel.org/r/1556102150-32517-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michael Ellerman (powerpc) Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Richard Kuo Cc: Tony Luck Cc: Fenghua Yu Cc: Geert Uytterhoeven Cc: Ralf Baechle Cc: Paul Burton Cc: James Hogan Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Yoshinori Sato Cc: Rich Felker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Eric Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 1 + arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 - arch/x86/Kconfig | 1 - include/linux/memblock.h | 3 ++- kernel/kexec_file.c | 16 ++++++++-------- mm/Kconfig | 2 +- mm/memblock.c | 6 +++--- mm/page_alloc.c | 3 +-- 16 files changed, 19 insertions(+), 23 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a11dfcc2a130..5fd344bd06b9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -4,7 +4,6 @@ config ARM default y select ARCH_32BIT_OFF_T select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK if !HAVE_ARCH_PFN_VALID && !KEXEC select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE @@ -22,6 +21,7 @@ config ARM select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAS_GCOV_PROFILE_ALL + select ARCH_KEEP_MEMBLOCK if HAVE_ARCH_PFN_VALID || KEXEC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7a1aa53d188d..69a59a5d1143 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -60,6 +60,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_BH if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPT select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_KEEP_MEMBLOCK select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 3e54a53208d5..b7d404bbaa0f 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -22,7 +22,6 @@ config HEXAGON select GENERIC_IRQ_SHOW select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select ARCH_DISCARD_MEMBLOCK select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 73a26f04644e..7468d8e50467 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -33,7 +33,6 @@ config IA64 select ARCH_HAS_DMA_COHERENT_TO_PFN if SWIOTLB select ARCH_HAS_SYNC_DMA_FOR_CPU if SWIOTLB select VIRT_TO_BUS - select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index fe5cc2da6d10..218e037ef901 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -26,7 +26,6 @@ config M68K select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select ARCH_DISCARD_MEMBLOCK select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ff8cff9fcf54..677e5bfeff47 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -5,7 +5,6 @@ config MIPS select ARCH_32BIT_OFF_T if !64BIT select ARCH_BINFMT_ELF_STATE if MIPS_FP_SUPPORT select ARCH_CLOCKSOURCE_DATA - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAS_UBSAN_SANITIZE_ALL diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index ea37394ff3ea..26a9c760a98b 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -23,7 +23,6 @@ config NIOS2 select SPARSE_IRQ select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS - select 
ARCH_DISCARD_MEMBLOCK select MMU_GATHER_NO_RANGE if MMU config GENERIC_CSUM diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d7996cfaceca..8c1c636308c8 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -137,6 +137,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64 select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_KEEP_MEMBLOCK select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d0c046af65fa..109243fdb6ec 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -100,6 +100,7 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_BH select ARCH_INLINE_WRITE_UNLOCK_IRQ select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + select ARCH_KEEP_MEMBLOCK select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2a77033e1e7c..b77f512bb176 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -10,7 +10,6 @@ config SUPERH select DMA_DECLARE_COHERENT select HAVE_IDE if HAS_IOPORT_MAP select HAVE_MEMBLOCK_NODE_MAP - select ARCH_DISCARD_MEMBLOCK select HAVE_OPROFILE select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f21bc56e5d7b..818b361094ed 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,7 +47,6 @@ config X86 select ARCH_32BIT_OFF_T if X86_32 select ARCH_CLOCKSOURCE_DATA select ARCH_CLOCKSOURCE_INIT - select ARCH_DISCARD_MEMBLOCK select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEVMEM_IS_ALLOWED diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 47e3c0612592..676d3900e1bd 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -96,13 +96,14 @@ struct memblock { extern struct memblock memblock; extern int memblock_debug; -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK #define __init_memblock __meminit #define __initdata_memblock __meminitdata void memblock_discard(void); #else #define __init_memblock #define __initdata_memblock +static inline void memblock_discard(void) {} #endif #define memblock_dbg(fmt, ...) 
\ diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index f7fb8f6a688f..072b6ee55e3f 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -500,13 +500,7 @@ static int locate_mem_hole_callback(struct resource *res, void *arg) return locate_mem_hole_bottom_up(start, end, kbuf); } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK -static int kexec_walk_memblock(struct kexec_buf *kbuf, - int (*func)(struct resource *, void *)) -{ - return 0; -} -#else +#ifdef CONFIG_ARCH_KEEP_MEMBLOCK static int kexec_walk_memblock(struct kexec_buf *kbuf, int (*func)(struct resource *, void *)) { @@ -550,6 +544,12 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf, return ret; } +#else +static int kexec_walk_memblock(struct kexec_buf *kbuf, + int (*func)(struct resource *, void *)) +{ + return 0; +} #endif /** @@ -589,7 +589,7 @@ int kexec_locate_mem_hole(struct kexec_buf *kbuf) if (kbuf->mem != KEXEC_BUF_MEM_UNKNOWN) return 0; - if (IS_ENABLED(CONFIG_ARCH_DISCARD_MEMBLOCK)) + if (!IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) ret = kexec_walk_resources(kbuf, locate_mem_hole_callback); else ret = kexec_walk_memblock(kbuf, locate_mem_hole_callback); diff --git a/mm/Kconfig b/mm/Kconfig index 71e697e693df..c5124c2cb0b2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -136,7 +136,7 @@ config HAVE_MEMBLOCK_PHYS_MAP config HAVE_GENERIC_GUP bool -config ARCH_DISCARD_MEMBLOCK +config ARCH_KEEP_MEMBLOCK bool config MEMORY_ISOLATION diff --git a/mm/memblock.c b/mm/memblock.c index f315eca9f4a1..6bbad46f4d2c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -94,7 +94,7 @@ * :c:func:`mem_init` function frees all the memory to the buddy page * allocator. * - * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the + * Unless an architecure enables %CONFIG_ARCH_KEEP_MEMBLOCK, the * memblock data structures will be discarded after the system * initialization compltes. */ @@ -375,7 +375,7 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#ifndef CONFIG_ARCH_KEEP_MEMBLOCK /** * memblock_discard - discard memory and reserved arrays if they were allocated */ @@ -1987,7 +1987,7 @@ unsigned long __init memblock_free_all(void) return pages; } -#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cbda9aea0bf5..f2f3fb4921d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1896,10 +1896,9 @@ void __init page_alloc_init_late(void) /* Reinit limits that are based on free pages after the kernel is up */ files_maxfiles_init(); #endif -#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK + /* Discard memblock private memory */ memblock_discard(); -#endif for_each_populated_zone(zone) set_zone_contiguous(zone); -- cgit v1.2.3 From 2fa2690ca6174dfd36df85918ce5eb2f83e4d1b1 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:23:02 -0700 Subject: mm/vmscan.c: don't disable irq again when count pgrefill for memcg We can use __count_memcg_events() directly because this callsite is alreay protected by spin_lock_irq(). 
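The one-line change rests on the usual kernel convention that a double-underscore helper skips the interrupt save/restore because the caller is already running with interrupts off, here by virtue of spin_lock_irq(). A toy userspace model of that convention, with a global flag standing in for the CPU interrupt state (none of these names are the real memcg helpers):

#include <assert.h>
#include <stdio.h>
#include <stdbool.h>

static bool irqs_disabled;              /* stand-in for the CPU interrupt flag */
static unsigned long counter;           /* stand-in for a per-cpu event counter */

/* "raw" variant: only legal when the caller already disabled interrupts */
static void __count_event(unsigned long nr)
{
        assert(irqs_disabled);
        counter += nr;
}

/* irq-safe variant: disables interrupts itself, so it is safe anywhere */
static void count_event(unsigned long nr)
{
        bool was_disabled = irqs_disabled;

        irqs_disabled = true;           /* local_irq_save() stand-in */
        __count_event(nr);
        irqs_disabled = was_disabled;   /* local_irq_restore() stand-in */
}

static void spin_lock_irq_stub(void)   { irqs_disabled = true; }
static void spin_unlock_irq_stub(void) { irqs_disabled = false; }

int main(void)
{
        count_event(1);                 /* fine from any context */

        spin_lock_irq_stub();
        __count_event(1);               /* fine: the lock already disabled irqs */
        spin_unlock_irq_stub();

        printf("events: %lu\n", counter);
        return 0;
}

In shrink_active_list() the pgdat->lru_lock is taken with spin_lock_irq(), so calling the raw variant drops a redundant save/restore pair without changing behaviour.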
Link: http://lkml.kernel.org/r/1556093494-30798-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Andrew Morton Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 888076701c33..d96c54703948 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2022,7 +2022,7 @@ static void shrink_active_list(unsigned long nr_to_scan, reclaim_stat->recent_scanned[file] += nr_taken; __count_vm_events(PGREFILL, nr_scanned); - count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); -- cgit v1.2.3 From d66d109d3c9c31f4c2140a5ac6ab486c53cb0156 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:23:05 -0700 Subject: mm/Kconfig: update "Memory Model" help text The help describing the memory model selection is outdated. It still says that SPARSEMEM is experimental and DISCONTIGMEM is a preferred over SPARSEMEM. Update the help text for the relevant options: * add a generic help for the "Memory Model" prompt * add description for FLATMEM * reduce the description of DISCONTIGMEM and add a deprecation note * prefer SPARSEMEM over DISCONTIGMEM Link: http://lkml.kernel.org/r/1556188531-20728-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/mm/Kconfig b/mm/Kconfig index c5124c2cb0b2..ee8d1f311858 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -11,23 +11,24 @@ choice default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT default FLATMEM_MANUAL + help + This option allows you to change some of the ways that + Linux manages its memory internally. Most users will + only have one option here selected by the architecture + configuration. This is normal. config FLATMEM_MANUAL bool "Flat Memory" depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE help - This option allows you to change some of the ways that - Linux manages its memory internally. Most users will - only have one option here: FLATMEM. This is normal - and a correct option. - - Some users of more advanced features like NUMA and - memory hotplug may have different options here. - DISCONTIGMEM is a more mature, better tested system, - but is incompatible with memory hotplug and may suffer - decreased performance over SPARSEMEM. If unsure between - "Sparse Memory" and "Discontiguous Memory", choose - "Discontiguous Memory". + This option is best suited for non-NUMA systems with + flat address space. The FLATMEM is the most efficient + system in terms of performance and resource consumption + and it is the best option for smaller systems. + + For systems that have holes in their physical address + spaces and for features like NUMA and memory hotplug, + choose "Sparse Memory" If unsure, choose this option (Flat Memory) over any other. @@ -38,29 +39,26 @@ config DISCONTIGMEM_MANUAL This option provides enhanced support for discontiguous memory systems, over FLATMEM. These systems have holes in their physical address spaces, and this option provides - more efficient handling of these holes. 
However, the vast - majority of hardware has quite flat address spaces, and - can have degraded performance from the extra overhead that - this option imposes. + more efficient handling of these holes. - Many NUMA configurations will have this as the only option. + Although "Discontiguous Memory" is still used by several + architectures, it is considered deprecated in favor of + "Sparse Memory". - If unsure, choose "Flat Memory" over this option. + If unsure, choose "Sparse Memory" over this option. config SPARSEMEM_MANUAL bool "Sparse Memory" depends on ARCH_SPARSEMEM_ENABLE help This will be the only option for some systems, including - memory hotplug systems. This is normal. + memory hot-plug systems. This is normal. - For many other systems, this will be an alternative to - "Discontiguous Memory". This option provides some potential - performance benefits, along with decreased code complexity, - but it is newer, and more experimental. + This option provides efficient support for systems with + holes is their physical address space and allows memory + hot-plug and hot-remove. - If unsure, choose "Discontiguous Memory" or "Flat Memory" - over this option. + If unsure, choose "Flat Memory" over this option. endchoice -- cgit v1.2.3 From 60b62ff7cc4217ac3de76535fa4c1510a798dbcb Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:23:08 -0700 Subject: mm/vmscan: simplify trace_reclaim_flags and trace_shrink_flags trace_reclaim_flags and trace_shrink_flags are almost the same. We can simplify them to avoid redundant code. Link: http://lkml.kernel.org/r/1556169203-5858-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 0aa882a4e870..a5ab2973e8dc 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -27,17 +27,11 @@ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" -#define trace_reclaim_flags(page) ( \ - (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ +#define trace_reclaim_flags(file) ( \ + (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ (RECLAIM_WB_ASYNC) \ ) -#define trace_shrink_flags(file) \ - ( \ - (file ? 
RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (RECLAIM_WB_ASYNC) \ - ) - TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), @@ -328,7 +322,8 @@ TRACE_EVENT(mm_vmscan_writepage, TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->reclaim_flags = trace_reclaim_flags(page); + __entry->reclaim_flags = trace_reclaim_flags( + page_is_file_cache(page)); ), TP_printk("page=%p pfn=%lu flags=%s", @@ -374,7 +369,7 @@ TRACE_EVENT(mm_vmscan_lru_shrink_inactive, __entry->nr_ref_keep = stat->nr_ref_keep; __entry->nr_unmap_fail = stat->nr_unmap_fail; __entry->priority = priority; - __entry->reclaim_flags = trace_shrink_flags(file); + __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld priority=%d flags=%s", @@ -413,7 +408,7 @@ TRACE_EVENT(mm_vmscan_lru_shrink_active, __entry->nr_deactivated = nr_deactivated; __entry->nr_referenced = nr_referenced; __entry->priority = priority; - __entry->reclaim_flags = trace_shrink_flags(file); + __entry->reclaim_flags = trace_reclaim_flags(file); ), TP_printk("nid=%d nr_taken=%ld nr_active=%ld nr_deactivated=%ld nr_referenced=%ld priority=%d flags=%s", @@ -452,7 +447,8 @@ TRACE_EVENT(mm_vmscan_inactive_list_is_low, __entry->total_active = total_active; __entry->active = active; __entry->ratio = ratio; - __entry->reclaim_flags = trace_shrink_flags(file) & RECLAIM_WB_LRU; + __entry->reclaim_flags = trace_reclaim_flags(file) & + RECLAIM_WB_LRU; ), TP_printk("nid=%d reclaim_idx=%d total_inactive=%ld inactive=%ld total_active=%ld active=%ld ratio=%ld flags=%s", -- cgit v1.2.3 From 19343b5bdd16ad4ae6b845ef829f68b683c4dfb5 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 13 May 2019 17:23:11 -0700 Subject: mm/page-writeback: introduce tracepoint for wait_on_page_writeback() Recently there have been some hung tasks on our server due to wait_on_page_writeback(), and we want to know the details of this PG_writeback, i.e. this page is writing back to which device. But it is not so convenient to get the details. I think it would be better to introduce a tracepoint for diagnosing the writeback details. 
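The patch below un-inlines wait_on_page_writeback() into mm/page-writeback.c and fires the new tracepoint just before the task blocks, so the backing mapping shows up in the trace. A userspace-only sketch of that "trace, then block" shape, with a printf stub in place of the real tracepoint and a condition variable in place of the PG_writeback bit wait (build with -pthread; every name here is made up for the sketch):

#include <stdio.h>
#include <stdbool.h>
#include <pthread.h>

struct fake_page {
        bool writeback;                 /* stand-in for PG_writeback */
        const char *backing_dev;        /* the detail the tracepoint wants to expose */
        pthread_mutex_t lock;
        pthread_cond_t done;
};

/* printf stub standing in for the tracepoint */
static void trace_wait_on_writeback(struct fake_page *page)
{
        printf("waiting for writeback to %s\n", page->backing_dev);
}

/* trace (and block) only when the page really is under writeback */
static void wait_on_writeback(struct fake_page *page)
{
        pthread_mutex_lock(&page->lock);
        if (page->writeback) {
                trace_wait_on_writeback(page);
                while (page->writeback)
                        pthread_cond_wait(&page->done, &page->lock);
        }
        pthread_mutex_unlock(&page->lock);
}

static void end_writeback(struct fake_page *page)
{
        pthread_mutex_lock(&page->lock);
        page->writeback = false;
        pthread_cond_broadcast(&page->done);
        pthread_mutex_unlock(&page->lock);
}

static void *writer(void *arg)
{
        end_writeback(arg);             /* pretend the I/O completed */
        return NULL;
}

int main(void)
{
        struct fake_page page = {
                .writeback = true,
                .backing_dev = "sda1",
                .lock = PTHREAD_MUTEX_INITIALIZER,
                .done = PTHREAD_COND_INITIALIZER,
        };
        pthread_t t;

        pthread_create(&t, NULL, writer, &page);
        wait_on_writeback(&page);
        pthread_join(t, NULL);
        return 0;
}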
Link: http://lkml.kernel.org/r/1556274402-19018-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Andrew Morton Cc: Jan Kara Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 10 +--------- include/trace/events/writeback.h | 16 +++++++++++++++- mm/page-writeback.c | 12 ++++++++++++ 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 2e8438a1216a..112f15bb5907 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -540,15 +540,7 @@ static inline int wait_on_page_locked_killable(struct page *page) extern void put_and_wait_on_page_locked(struct page *page); -/* - * Wait for a page to complete writeback - */ -static inline void wait_on_page_writeback(struct page *page) -{ - if (PageWriteback(page)) - wait_on_page_bit(page, PG_writeback); -} - +void wait_on_page_writeback(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 32db72c7c055..aa7f3aeac740 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -53,7 +53,7 @@ WB_WORK_REASON struct wb_writeback_work; -TRACE_EVENT(writeback_dirty_page, +DECLARE_EVENT_CLASS(writeback_page_template, TP_PROTO(struct page *page, struct address_space *mapping), @@ -79,6 +79,20 @@ TRACE_EVENT(writeback_dirty_page, ) ); +DEFINE_EVENT(writeback_page_template, writeback_dirty_page, + + TP_PROTO(struct page *page, struct address_space *mapping), + + TP_ARGS(page, mapping) +); + +DEFINE_EVENT(writeback_page_template, wait_on_page_writeback, + + TP_PROTO(struct page *page, struct address_space *mapping), + + TP_ARGS(page, mapping) +); + DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f61dfec6a1f..07656485c0e6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2808,6 +2808,18 @@ int __test_set_page_writeback(struct page *page, bool keep_write) } EXPORT_SYMBOL(__test_set_page_writeback); +/* + * Wait for a page to complete writeback + */ +void wait_on_page_writeback(struct page *page) +{ + if (PageWriteback(page)) { + trace_wait_on_page_writeback(page, page_mapping(page)); + wait_on_page_bit(page, PG_writeback); + } +} +EXPORT_SYMBOL_GPL(wait_on_page_writeback); + /** * wait_for_stable_page() - wait for writeback to finish, if necessary. * @page: The page to wait on. -- cgit v1.2.3 From 94393c78964c432917014e3a456fa15c3e78f741 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 13 May 2019 17:23:14 -0700 Subject: mm/mprotect.c: fix compilation warning because of unused 'mm' variable Since 0cbe3e26abe0 ("mm: update ptep_modify_prot_start/commit to take vm_area_struct as arg") the only place that uses the local 'mm' variable in change_pte_range() is the call to set_pte_at(). Many architectures define set_pte_at() as macro that does not use the 'mm' parameter, which generates the following compilation warning: CC mm/mprotect.o mm/mprotect.c: In function 'change_pte_range': mm/mprotect.c:42:20: warning: unused variable 'mm' [-Wunused-variable] struct mm_struct *mm = vma->vm_mm; ^~ Fix it by passing vma->mm to set_pte_at() and dropping the local 'mm' variable in change_pte_range(). 
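The warning is easy to reproduce outside the kernel: once set_pte_at() expands to something that never touches its mm argument, a local whose only purpose is to feed that argument really is unused. The macro below is a dummy written for this sketch, not any architecture's actual definition:

#include <stdio.h>

struct mm_struct { int id; };
struct vm_area_struct { struct mm_struct *vm_mm; };

/* dummy set_pte_at(): like several architectures' macros, it ignores 'mm' */
#define set_pte_at(mm, addr, ptep, pte) (*(ptep) = (pte))

/* before the fix: 'mm' never survives macro expansion, so with
 * -Wunused-variable the compiler reports it as unused */
static void change_pte_warns(struct vm_area_struct *vma, long *ptep, long pte)
{
        struct mm_struct *mm = vma->vm_mm;      /* warning: unused variable 'mm' */

        set_pte_at(mm, 0, ptep, pte);
}

/* after the fix: pass vma->vm_mm straight in and drop the local */
static void change_pte_clean(struct vm_area_struct *vma, long *ptep, long pte)
{
        set_pte_at(vma->vm_mm, 0, ptep, pte);
}

int main(void)
{
        struct mm_struct mm = { 1 };
        struct vm_area_struct vma = { &mm };
        long pte = 0;

        change_pte_warns(&vma, &pte, 42);
        change_pte_clean(&vma, &pte, 43);
        printf("%ld\n", pte);
        return 0;
}

Built with -Wall (or just -Wunused-variable), the first helper warns and the second does not, which is exactly the shape of the change in the patch below.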
[liu.song.a23@gmail.com: fix missed conversions] Link: http://lkml.kernel.org/r/CAPhsuW6wcQgYLHNdBdw6m0YiR4RWsS4XzfpSKU7wBLLeOCTbpw@mail.gmail.comLink: http://lkml.kernel.org/r/1557305432-4940-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 65242f1e4457..bf38dfbbb4b4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -39,7 +39,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) { - struct mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; @@ -136,7 +135,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - set_pte_at(mm, addr, pte, newpte); + set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } @@ -150,7 +149,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, */ make_device_private_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_pte_at(mm, addr, pte, newpte); + set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } -- cgit v1.2.3 From b3b07077b01ecbbd98efede778c195567de25b71 Mon Sep 17 00:00:00 2001 From: Bharath Vedartham Date: Mon, 13 May 2019 17:23:17 -0700 Subject: mm/huge_memory.c: make __thp_get_unmapped_area static __thp_get_unmapped_area is only used in mm/huge_memory.c. Make it static. Tested by building and booting the kernel. Link: http://lkml.kernel.org/r/20190504102353.GA22525@bharath12345-Inspiron-5559 Signed-off-by: Bharath Vedartham Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 61b1e05e86ee..9f8bce9a6b32 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -509,7 +509,7 @@ void prep_transhuge_page(struct page *page) set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); } -unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, +static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len, loff_t off, unsigned long flags, unsigned long size) { unsigned long addr; -- cgit v1.2.3 From a1b8e6abf35b9903807eced67a4c26e440663620 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 13 May 2019 17:23:20 -0700 Subject: mm: delete find_get_entries_tag I removed the only user of this and hadn't noticed it was now unused. 
Link: http://lkml.kernel.org/r/20190430152929.21813-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 --- mm/filemap.c | 61 ------------------------------------------------- 2 files changed, 64 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 112f15bb5907..9ec3544baee2 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -373,9 +373,6 @@ static inline unsigned find_get_pages_tag(struct address_space *mapping, return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, nr_pages, pages); } -unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - xa_mark_t tag, unsigned int nr_entries, - struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); diff --git a/mm/filemap.c b/mm/filemap.c index 3ad18fa56057..c5af80c43d36 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1941,67 +1941,6 @@ out: } EXPORT_SYMBOL(find_get_pages_range_tag); -/** - * find_get_entries_tag - find and return entries that match @tag - * @mapping: the address_space to search - * @start: the starting page cache index - * @tag: the tag index - * @nr_entries: the maximum number of entries - * @entries: where the resulting entries are placed - * @indices: the cache indices corresponding to the entries in @entries - * - * Like find_get_entries, except we only return entries which are tagged with - * @tag. - * - * Return: the number of entries which were found. - */ -unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, - xa_mark_t tag, unsigned int nr_entries, - struct page **entries, pgoff_t *indices) -{ - XA_STATE(xas, &mapping->i_pages, start); - struct page *page; - unsigned int ret = 0; - - if (!nr_entries) - return 0; - - rcu_read_lock(); - xas_for_each_marked(&xas, page, ULONG_MAX, tag) { - if (xas_retry(&xas, page)) - continue; - /* - * A shadow entry of a recently evicted page, a swap - * entry from shmem/tmpfs or a DAX entry. Return it - * without attempting to raise page count. - */ - if (xa_is_value(page)) - goto export; - - if (!page_cache_get_speculative(page)) - goto retry; - - /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) - goto put_page; - page = find_subpage(page, xas.xa_index); - -export: - indices[ret] = xas.xa_index; - entries[ret] = page; - if (++ret == nr_entries) - break; - continue; -put_page: - put_page(page); -retry: - xas_reset(&xas); - } - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL(find_get_entries_tag); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: -- cgit v1.2.3 From 640be2d1ffbc1946f1547eb89b5005ed7542de99 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 13 May 2019 17:23:23 -0700 Subject: kernel/memremap.c: remove the unused device_private_entry_fault() export This export has been entirely unused since it was added more than 1 1/2 years ago. 
Link: http://lkml.kernel.org/r/20190429115535.12793-1-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Dan Williams Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/memremap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/memremap.c b/kernel/memremap.c index 4e59d29245f4..1490e63f69a9 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -45,7 +45,6 @@ vm_fault_t device_private_entry_fault(struct vm_area_struct *vma, */ return devmem->page_fault(vma, addr, page, flags, pmdp); } -EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ static void pgmap_array_delete(struct resource *res) -- cgit v1.2.3