diff options
Diffstat (limited to 'fs/hugetlbfs/inode.c')
-rw-r--r-- | fs/hugetlbfs/inode.c | 236 |
1 files changed, 111 insertions, 125 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 316adb968b65..4ea71eba40a5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -4,11 +4,11 @@ * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. + * License: GPL */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ @@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - mutex_lock(&inode->i_mutex); + inode_lock(inode); file_accessed(file); ret = -ENOMEM; @@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -213,12 +213,12 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, int i, chunksize; /* Find which 4k chunk and offset with in that chunk */ - i = offset >> PAGE_CACHE_SHIFT; - offset = offset & ~PAGE_CACHE_MASK; + i = offset >> PAGE_SHIFT; + offset = offset & ~PAGE_MASK; while (size) { size_t n; - chunksize = PAGE_CACHE_SIZE; + chunksize = PAGE_SIZE; if (offset) chunksize -= offset; if (chunksize > size) @@ -237,7 +237,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, /* * Support for read() - Find the page attached to f_mapping and copy out the * data. Its *very* similar to do_generic_mapping_read(), we can't use that - * since it has PAGE_CACHE_SIZE assumptions. + * since it has PAGE_SIZE assumptions. */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -285,7 +285,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) * We have the page, copy it to user space buffer. */ copied = hugetlbfs_read_actor(page, offset, to, nr); - page_cache_release(page); + put_page(page); } offset += copied; retval += copied; @@ -324,20 +324,62 @@ static void remove_huge_page(struct page *page) delete_from_page_cache(page); } +static void +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) +{ + struct vm_area_struct *vma; + + /* + * end == 0 indicates that the entire range after + * start should be unmapped. + */ + vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + unsigned long v_offset; + unsigned long v_end; + + /* + * Can the expression below overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ + if (vma->vm_pgoff < start) + v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; + else + v_offset = 0; + + if (!end) + v_end = vma->vm_end; + else { + v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + + vma->vm_start; + if (v_end > vma->vm_end) + v_end = vma->vm_end; + } + + unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, + NULL); + } +} /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. - + * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv - * maps and global counts. + * maps and global counts. Page faults can not race with truncation + * in this routine. hugetlb_no_page() prevents page faults in the + * truncated range. It checks i_size before allocation, and again after + * with the page table lock for the page held. The same lock must be + * acquired to unmap a page. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserv map * deleted. The region/reserv map for ranges without associated - * pages are not modified. + * pages are not modified. Page faults can race with hole punch. + * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but * not LLONG_MAX this routine still performs a hole punch operation. */ @@ -361,77 +403,81 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, next = start; while (next < end) { /* - * Make sure to never grab more pages that we - * might possibly need. + * Don't grab more pages than the number left in the range. */ if (end - next < lookup_nr) lookup_nr = end - next; /* - * This pagevec_lookup() may return pages past 'end', - * so we must check for page->index > end. + * When no more pages are found, we are done. */ - if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) { - if (next == start) - break; - next = start; - continue; - } + if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) + break; for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + bool rsv_on_error; u32 hash; + /* + * The page (index) could be beyond end. This is + * only possible in the punch hole case as end is + * max page offset in the truncate case. + */ + next = page->index; + if (next >= end) + break; + hash = hugetlb_fault_mutex_hash(h, current->mm, &pseudo_vma, mapping, next, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); - lock_page(page); - if (page->index >= end) { - unlock_page(page); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - next = end; /* we are done */ - break; - } - /* * If page is mapped, it was faulted in after being - * unmapped. Do nothing in this race case. In the - * normal case page is not mapped. + * unmapped in caller. Unmap (again) now after taking + * the fault mutex. The mutex will prevent faults + * until we finish removing the page. + * + * This race can only happen in the hole punch case. + * Getting here in a truncate operation is a bug. */ - if (!page_mapped(page)) { - bool rsv_on_error = !PagePrivate(page); - /* - * We must free the huge page and remove - * from page cache (remove_huge_page) BEFORE - * removing the region/reserve map - * (hugetlb_unreserve_pages). In rare out - * of memory conditions, removal of the - * region/reserve map could fail. Before - * free'ing the page, note PagePrivate which - * is used in case of error. - */ - remove_huge_page(page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages( - inode, next, - next + 1, 1))) - hugetlb_fix_reserve_counts( - inode, rsv_on_error); - } + if (unlikely(page_mapped(page))) { + BUG_ON(truncate_op); + + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + next * pages_per_huge_page(h), + (next + 1) * pages_per_huge_page(h)); + i_mmap_unlock_write(mapping); } - if (page->index > next) - next = page->index; + lock_page(page); + /* + * We must free the huge page and remove from page + * cache (remove_huge_page) BEFORE removing the + * region/reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the + * region/reserve map could fail. Before free'ing + * the page, note PagePrivate which is used in case + * of error. + */ + rsv_on_error = !PagePrivate(page); + remove_huge_page(page); + freed++; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, + next, next + 1, 1))) + hugetlb_fix_reserve_counts(inode, + rsv_on_error); + } - ++next; unlock_page(page); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); } + ++next; huge_pagevec_release(&pvec); + cond_resched(); } if (truncate_op) @@ -450,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode) clear_inode(inode); } -static inline void -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) -{ - struct vm_area_struct *vma; - - /* - * end == 0 indicates that the entire range after - * start should be unmapped. - */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { - unsigned long v_offset; - - /* - * Can the expression below overflow on 32-bit arches? - * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. - */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (end) { - end = ((end - start) << PAGE_SHIFT) + - vma->vm_start + v_offset; - if (end > vma->vm_end) - end = vma->vm_end; - } else - end = vma->vm_end; - - unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL); - } -} - static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; @@ -519,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -527,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hole_end >> PAGE_SHIFT); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -561,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); @@ -647,11 +658,8 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); inode->i_ctime = CURRENT_TIME; - spin_lock(&inode->i_lock); - inode->i_private = NULL; - spin_unlock(&inode->i_lock); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -709,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, /* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an - * annotation because huge_pmd_share() does an allocation under + * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem. */ static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; @@ -739,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * The policy is initialized here even if we are creating a * private inode because initialization simply creates an - * an empty rb tree and calls spin_lock_init(), later when we + * an empty rb tree and calls rwlock_init(), later when we * call mpol_free_shared_policy() it will just return because * the rb tree will still be empty. */ @@ -761,6 +769,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; } lockdep_annotate_inode_mutex_key(inode); @@ -1202,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("hugetlbfs"); static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1322,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void) error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), - 0, 0, init_once); + 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out2; @@ -1356,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void) out2: return error; } - -static void __exit exit_hugetlbfs_fs(void) -{ - struct hstate *h; - int i; - - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(hugetlbfs_inode_cachep); - i = 0; - for_each_hstate(h) - kern_unmount(hugetlbfs_vfsmount[i++]); - unregister_filesystem(&hugetlbfs_fs_type); -} - -module_init(init_hugetlbfs_fs) -module_exit(exit_hugetlbfs_fs) - -MODULE_LICENSE("GPL"); +fs_initcall(init_hugetlbfs_fs) |