diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 3 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 141 | ||||
-rw-r--r-- | mm/memory-failure.c | 21 | ||||
-rw-r--r-- | mm/memory.c | 25 | ||||
-rw-r--r-- | mm/nommu.c | 12 | ||||
-rw-r--r-- | mm/oom_kill.c | 3 | ||||
-rw-r--r-- | mm/page_alloc.c | 54 | ||||
-rw-r--r-- | mm/rmap.c | 12 | ||||
-rw-r--r-- | mm/shmem.c | 74 | ||||
-rw-r--r-- | mm/slab.c | 17 | ||||
-rw-r--r-- | mm/slob.c | 6 | ||||
-rw-r--r-- | mm/slub.c | 105 | ||||
-rw-r--r-- | mm/swapfile.c | 31 | ||||
-rw-r--r-- | mm/truncate.c | 28 | ||||
-rw-r--r-- | mm/vmalloc.c | 18 | ||||
-rw-r--r-- | mm/vmscan.c | 160 |
17 files changed, 496 insertions, 216 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index a8251a8d3457..f820e600f1ad 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -78,9 +78,6 @@ * ->i_mutex (generic_file_buffered_write) * ->mmap_sem (fault_in_pages_readable->do_page_fault) * - * ->i_mutex - * ->i_alloc_sem (various) - * * inode_wb_list_lock * sb_lock (fs/fs-writeback.c) * ->mapping->tree_lock (__sync_single_inode) diff --git a/mm/madvise.c b/mm/madvise.c index 2221491ed503..74bf193eff04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -218,7 +218,7 @@ static long madvise_remove(struct vm_area_struct *vma, endoff = (loff_t)(end - vma->vm_start - 1) + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); - /* vmtruncate_range needs to take i_mutex and i_alloc_sem */ + /* vmtruncate_range needs to take i_mutex */ up_read(¤t->mm->mmap_sem); error = vmtruncate_range(mapping->host, offset, endoff); down_read(¤t->mm->mmap_sem); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf7d027a8844..e013b8e57d25 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,6 +35,7 @@ #include <linux/limits.h> #include <linux/mutex.h> #include <linux/rbtree.h> +#include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -107,10 +108,12 @@ enum mem_cgroup_events_index { enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, MEM_CGROUP_TARGET_SOFTLIMIT, + MEM_CGROUP_TARGET_NUMAINFO, MEM_CGROUP_NTARGETS, }; #define THRESHOLDS_EVENTS_TARGET (128) #define SOFTLIMIT_EVENTS_TARGET (1024) +#define NUMAINFO_EVENTS_TARGET (1024) struct mem_cgroup_stat_cpu { long count[MEM_CGROUP_STAT_NSTATS]; @@ -236,7 +239,8 @@ struct mem_cgroup { int last_scanned_node; #if MAX_NUMNODES > 1 nodemask_t scan_nodes; - unsigned long next_scan_node_update; + atomic_t numainfo_events; + atomic_t numainfo_updating; #endif /* * Should the accounting and control be hierarchical, per subtree? @@ -576,15 +580,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem, return val; } -static long mem_cgroup_local_usage(struct mem_cgroup *mem) -{ - long ret; - - ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); - ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); - return ret; -} - static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, bool charge) { @@ -688,6 +683,9 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) case MEM_CGROUP_TARGET_SOFTLIMIT: next = val + SOFTLIMIT_EVENTS_TARGET; break; + case MEM_CGROUP_TARGET_NUMAINFO: + next = val + NUMAINFO_EVENTS_TARGET; + break; default: return; } @@ -706,11 +704,19 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page) mem_cgroup_threshold(mem); __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); if (unlikely(__memcg_event_check(mem, - MEM_CGROUP_TARGET_SOFTLIMIT))){ + MEM_CGROUP_TARGET_SOFTLIMIT))) { mem_cgroup_update_tree(mem, page); __mem_cgroup_target_update(mem, - MEM_CGROUP_TARGET_SOFTLIMIT); + MEM_CGROUP_TARGET_SOFTLIMIT); + } +#if MAX_NUMNODES > 1 + if (unlikely(__memcg_event_check(mem, + MEM_CGROUP_TARGET_NUMAINFO))) { + atomic_inc(&mem->numainfo_events); + __mem_cgroup_target_update(mem, + MEM_CGROUP_TARGET_NUMAINFO); } +#endif } } @@ -1128,7 +1134,6 @@ unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, return MEM_CGROUP_ZSTAT(mz, lru); } -#ifdef CONFIG_NUMA static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, int nid) { @@ -1140,6 +1145,17 @@ static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg, return ret; } +static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, + int nid) +{ + unsigned long ret; + + ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + + mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); + return ret; +} + +#if MAX_NUMNODES > 1 static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) { u64 total = 0; @@ -1151,17 +1167,6 @@ static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg) return total; } -static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg, - int nid) -{ - unsigned long ret; - - ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) + - mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON); - - return ret; -} - static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg) { u64 total = 0; @@ -1558,6 +1563,28 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) return ret; } +/** + * test_mem_cgroup_node_reclaimable + * @mem: the target memcg + * @nid: the node ID to be checked. + * @noswap : specify true here if the user wants flle only information. + * + * This function returns whether the specified memcg contains any + * reclaimable pages on a node. Returns true if there are any reclaimable + * pages in the node. + */ +static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, + int nid, bool noswap) +{ + if (mem_cgroup_node_nr_file_lru_pages(mem, nid)) + return true; + if (noswap || !total_swap_pages) + return false; + if (mem_cgroup_node_nr_anon_lru_pages(mem, nid)) + return true; + return false; + +} #if MAX_NUMNODES > 1 /* @@ -1569,26 +1596,26 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) { int nid; - - if (time_after(mem->next_scan_node_update, jiffies)) + /* + * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET + * pagein/pageout changes since the last update. + */ + if (!atomic_read(&mem->numainfo_events)) + return; + if (atomic_inc_return(&mem->numainfo_updating) > 1) return; - mem->next_scan_node_update = jiffies + 10*HZ; /* make a nodemask where this memcg uses memory from */ mem->scan_nodes = node_states[N_HIGH_MEMORY]; for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { - if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || - mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) - continue; - - if (total_swap_pages && - (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) || - mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON))) - continue; - node_clear(nid, mem->scan_nodes); + if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) + node_clear(nid, mem->scan_nodes); } + + atomic_set(&mem->numainfo_events, 0); + atomic_set(&mem->numainfo_updating, 0); } /* @@ -1626,11 +1653,51 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) return node; } +/* + * Check all nodes whether it contains reclaimable pages or not. + * For quick scan, we make use of scan_nodes. This will allow us to skip + * unused nodes. But scan_nodes is lazily updated and may not cotain + * enough new information. We need to do double check. + */ +bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +{ + int nid; + + /* + * quick check...making use of scan_node. + * We can skip unused nodes. + */ + if (!nodes_empty(mem->scan_nodes)) { + for (nid = first_node(mem->scan_nodes); + nid < MAX_NUMNODES; + nid = next_node(nid, mem->scan_nodes)) { + + if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) + return true; + } + } + /* + * Check rest of nodes. + */ + for_each_node_state(nid, N_HIGH_MEMORY) { + if (node_isset(nid, mem->scan_nodes)) + continue; + if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) + return true; + } + return false; +} + #else int mem_cgroup_select_victim_node(struct mem_cgroup *mem) { return 0; } + +bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +{ + return test_mem_cgroup_node_reclaimable(mem, 0, noswap); +} #endif /* @@ -1701,7 +1768,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, } } } - if (!mem_cgroup_local_usage(victim)) { + if (!mem_cgroup_reclaimable(victim, noswap)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index eac0ba561491..740c4f52059c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -391,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ - goto out; + return; + + read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; @@ -408,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, add_to_kill(tsk, page, vma, to_kill, tkc); } } - page_unlock_anon_vma(av); -out: read_unlock(&tasklist_lock); + page_unlock_anon_vma(av); } /* @@ -424,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, struct prio_tree_iter iter; struct address_space *mapping = page->mapping; - /* - * A note on the locking order between the two locks. - * We don't rely on this particular order. - * If you have some other code that needs a different order - * feel free to switch them around. Or add a reverse link - * from mm_struct to task_struct, then this could be all - * done without taking tasklist_lock and looping over all tasks. - */ - - read_lock(&tasklist_lock); mutex_lock(&mapping->i_mmap_mutex); + read_lock(&tasklist_lock); for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -454,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, add_to_kill(tsk, page, vma, to_kill, tkc); } } - mutex_unlock(&mapping->i_mmap_mutex); read_unlock(&tasklist_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* diff --git a/mm/memory.c b/mm/memory.c index 87d935333f0d..9b8a01d941cb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -305,6 +305,7 @@ int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return 0; + batch = tlb->active; } VM_BUG_ON(batch->nr > batch->max); @@ -2798,30 +2799,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) -{ - struct address_space *mapping = inode->i_mapping; - - /* - * If the underlying filesystem is not going to provide - * a way to truncate a range of blocks (punch a hole) - - * we should return failure right now. - */ - if (!inode->i_op->truncate_range) - return -ENOSYS; - - mutex_lock(&inode->i_mutex); - down_write(&inode->i_alloc_sem); - unmap_mapping_range(mapping, offset, (end - offset), 1); - truncate_inode_pages_range(mapping, offset, end); - unmap_mapping_range(mapping, offset, (end - offset), 1); - inode->i_op->truncate_range(inode, offset, end); - up_write(&inode->i_alloc_sem); - mutex_unlock(&inode->i_mutex); - - return 0; -} - /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. diff --git a/mm/nommu.c b/mm/nommu.c index 1fd0c51b10a6..5c5c2d4b1807 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -22,7 +22,6 @@ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/tracehook.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/mount.h> @@ -1087,7 +1086,7 @@ static unsigned long determine_vm_flags(struct file *file, * it's being traced - otherwise breakpoints set in it may interfere * with another untraced process */ - if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) + if ((flags & MAP_PRIVATE) && current->ptrace) vm_flags &= ~VM_MAYSHARE; return vm_flags; @@ -1813,10 +1812,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, return NULL; } -int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, - unsigned long to, unsigned long size, pgprot_t prot) +int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t prot) { - vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; + if (addr != (pfn << PAGE_SHIFT)) + return -EINVAL; + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; return 0; } EXPORT_SYMBOL(remap_pfn_range); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e4b0991ca351..b0be989d4365 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -339,8 +339,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * then wait for it to finish before killing * some other task unnecessarily. */ - if (!(task_ptrace(p->group_leader) & - PT_TRACE_EXIT)) + if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) return ERR_PTR(-1UL); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8985acdab8..9119faae6e6a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4585,6 +4585,60 @@ void __init sort_node_map(void) cmp_node_active_region, NULL); } +/** + * node_map_pfn_alignment - determine the maximum internode alignment + * + * This function should be called after node map is populated and sorted. + * It calculates the maximum power of two alignment which can distinguish + * all the nodes. + * + * For example, if all nodes are 1GiB and aligned to 1GiB, the return value + * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)). If the + * nodes are shifted by 256MiB, 256MiB. Note that if only the last node is + * shifted, 1GiB is enough and this function will indicate so. + * + * This is used to test whether pfn -> nid mapping of the chosen memory + * model has fine enough granularity to avoid incorrect mapping for the + * populated node map. + * + * Returns the determined alignment in pfn's. 0 if there is no alignment + * requirement (single node). + */ +unsigned long __init node_map_pfn_alignment(void) +{ + unsigned long accl_mask = 0, last_end = 0; + int last_nid = -1; + int i; + + for_each_active_range_index_in_nid(i, MAX_NUMNODES) { + int nid = early_node_map[i].nid; + unsigned long start = early_node_map[i].start_pfn; + unsigned long end = early_node_map[i].end_pfn; + unsigned long mask; + + if (!start || last_nid < 0 || last_nid == nid) { + last_nid = nid; + last_end = end; + continue; + } + + /* + * Start with a mask granular enough to pin-point to the + * start pfn and tick off bits one-by-one until it becomes + * too coarse to separate the current node from the last. + */ + mask = ~((1 << __ffs(start)) - 1); + while (mask && last_end <= (start & (mask << 1))) + mask <<= 1; + + /* accumulate all internode masks */ + accl_mask |= mask; + } + + /* convert mask to number of pages */ + return ~accl_mask + 1; +} + /* Find the lowest pfn for a node */ static unsigned long __init find_min_pfn_for_node(int nid) { diff --git a/mm/rmap.c b/mm/rmap.c index 27dfd3b82b0f..9701574bb67a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -21,7 +21,6 @@ * Lock ordering in mm: * * inode->i_mutex (while writing or truncating, not reading or faulting) - * inode->i_alloc_sem (vmtruncate_range) * mm->mmap_sem * page->flags PG_locked (lock_page) * mapping->i_mmap_mutex @@ -38,9 +37,8 @@ * in arch-dependent flush_dcache_mmap_lock, * within inode_wb_list_lock in __sync_single_inode) * - * (code doesn't rely on that order so it could be switched around) - * ->tasklist_lock - * anon_vma->mutex (memory_failure, collect_procs_anon) + * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon) + * ->tasklist_lock * pte map lock */ @@ -871,11 +869,11 @@ int page_referenced(struct page *page, vm_flags); if (we_locked) unlock_page(page); + + if (page_test_and_clear_young(page_to_pfn(page))) + referenced++; } out: - if (page_test_and_clear_young(page_to_pfn(page))) - referenced++; - return referenced; } diff --git a/mm/shmem.c b/mm/shmem.c index d221a1cfd7b1..fcedf5464eb7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -539,7 +539,7 @@ static void shmem_free_pages(struct list_head *next) } while (next); } -static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) { struct shmem_inode_info *info = SHMEM_I(inode); unsigned long idx; @@ -562,6 +562,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) spinlock_t *punch_lock; unsigned long upper_limit; + truncate_inode_pages_range(inode->i_mapping, start, end); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (idx >= info->next_index) @@ -738,16 +740,8 @@ done2: * lowered next_index. Also, though shmem_getpage checks * i_size before adding to cache, no recheck after: so fix the * narrow window there too. - * - * Recalling truncate_inode_pages_range and unmap_mapping_range - * every time for punch_hole (which never got a chance to clear - * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive, - * yet hardly ever necessary: try to optimize them out later. */ truncate_inode_pages_range(inode->i_mapping, start, end); - if (punch_hole) - unmap_mapping_range(inode->i_mapping, start, - end - start, 1); } spin_lock(&info->lock); @@ -766,22 +760,23 @@ done2: shmem_free_pages(pages_to_free.next); } } +EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) +static int shmem_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - loff_t newsize = attr->ia_size; int error; error = inode_change_ok(inode, attr); if (error) return error; - if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) - && newsize != inode->i_size) { + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { + loff_t oldsize = inode->i_size; + loff_t newsize = attr->ia_size; struct page *page = NULL; - if (newsize < inode->i_size) { + if (newsize < oldsize) { /* * If truncating down to a partial page, then * if that page is already allocated, hold it @@ -810,12 +805,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) spin_unlock(&info->lock); } } - - /* XXX(truncate): truncate_setsize should be called last */ - truncate_setsize(inode, newsize); + if (newsize != oldsize) { + i_size_write(inode, newsize); + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + } + if (newsize < oldsize) { + loff_t holebegin = round_up(newsize, PAGE_SIZE); + unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + shmem_truncate_range(inode, newsize, (loff_t)-1); + /* unmap again to remove racily COWed private pages */ + unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); + } if (page) page_cache_release(page); - shmem_truncate_range(inode, newsize, (loff_t)-1); } setattr_copy(inode, attr); @@ -832,7 +834,6 @@ static void shmem_evict_inode(struct inode *inode) struct shmem_xattr *xattr, *nxattr; if (inode->i_mapping->a_ops == &shmem_aops) { - truncate_inode_pages(inode->i_mapping, 0); shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; shmem_truncate_range(inode, 0, (loff_t)-1); @@ -2706,7 +2707,7 @@ static const struct file_operations shmem_file_operations = { }; static const struct inode_operations shmem_inode_operations = { - .setattr = shmem_notify_change, + .setattr = shmem_setattr, .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, @@ -2739,7 +2740,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .removexattr = shmem_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL - .setattr = shmem_notify_change, + .setattr = shmem_setattr, .check_acl = generic_check_acl, #endif }; @@ -2752,7 +2753,7 @@ static const struct inode_operations shmem_special_inode_operations = { .removexattr = shmem_removexattr, #endif #ifdef CONFIG_TMPFS_POSIX_ACL - .setattr = shmem_notify_change, + .setattr = shmem_setattr, .check_acl = generic_check_acl, #endif }; @@ -2908,6 +2909,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } +void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +{ + truncate_inode_pages_range(inode->i_mapping, start, end); +} +EXPORT_SYMBOL_GPL(shmem_truncate_range); + #ifdef CONFIG_CGROUP_MEM_RES_CTLR /** * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file @@ -3028,3 +3035,26 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_flags |= VM_CAN_NONLINEAR; return 0; } + +/** + * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", + * with any new page allocations done using the specified allocation flags. + * But read_cache_page_gfp() uses the ->readpage() method: which does not + * suit tmpfs, since it may have pages in swapcache, and needs to find those + * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. + * + * Provide a stub for those callers to start using now, then later + * flesh it out to call shmem_getpage() with additional gfp mask, when + * shmem_file_splice_read() is added and shmem_readpage() is removed. + */ +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, + pgoff_t index, gfp_t gfp) +{ + return read_cache_page_gfp(mapping, index, gfp); +} +EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); diff --git a/mm/slab.c b/mm/slab.c index d96e223de775..1e523ed47c61 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -574,7 +574,9 @@ static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; /* internal cache of cache description objs */ +static struct kmem_list3 *cache_cache_nodelists[MAX_NUMNODES]; static struct kmem_cache cache_cache = { + .nodelists = cache_cache_nodelists, .batchcount = 1, .limit = BOOT_CPUCACHE_ENTRIES, .shared = 1, @@ -1492,11 +1494,10 @@ void __init kmem_cache_init(void) cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; /* - * struct kmem_cache size depends on nr_node_ids, which - * can be less than MAX_NUMNODES. + * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids */ - cache_cache.buffer_size = offsetof(struct kmem_cache, nodelists) + - nr_node_ids * sizeof(struct kmem_list3 *); + cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + + nr_node_ids * sizeof(struct kmem_list3 *); #if DEBUG cache_cache.obj_size = cache_cache.buffer_size; #endif @@ -2308,6 +2309,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (!cachep) goto oops; + cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; #if DEBUG cachep->obj_size = size; @@ -3153,12 +3155,11 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); -#if ARCH_SLAB_MINALIGN - if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { + if (ARCH_SLAB_MINALIGN && + ((unsigned long)objp & (ARCH_SLAB_MINALIGN-1))) { printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", - objp, ARCH_SLAB_MINALIGN); + objp, (int)ARCH_SLAB_MINALIGN); } -#endif return objp; } #else diff --git a/mm/slob.c b/mm/slob.c index 46e0aee33a23..0ae881831ae2 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -482,6 +482,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); void *ret; + gfp &= gfp_allowed_mask; + lockdep_trace_alloc(gfp); if (size < PAGE_SIZE - align) { @@ -608,6 +610,10 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) { void *b; + flags &= gfp_allowed_mask; + + lockdep_trace_alloc(flags); + if (c->size < PAGE_SIZE) { b = slob_alloc(c->size, flags, c->align, node); trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, diff --git a/mm/slub.c b/mm/slub.c index 35f351f26193..ba83f3fd0757 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -27,6 +27,7 @@ #include <linux/memory.h> #include <linux/math64.h> #include <linux/fault-inject.h> +#include <linux/stacktrace.h> #include <trace/events/kmem.h> @@ -191,8 +192,12 @@ static LIST_HEAD(slab_caches); /* * Tracking user of a slab. */ +#define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ +#ifdef CONFIG_STACKTRACE + unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ unsigned long when; /* When did the operation occur */ @@ -420,6 +425,24 @@ static void set_track(struct kmem_cache *s, void *object, struct track *p = get_track(s, object, alloc); if (addr) { +#ifdef CONFIG_STACKTRACE + struct stack_trace trace; + int i; + + trace.nr_entries = 0; + trace.max_entries = TRACK_ADDRS_COUNT; + trace.entries = p->addrs; + trace.skip = 3; + save_stack_trace(&trace); + + /* See rant in lockdep.c */ + if (trace.nr_entries != 0 && + trace.entries[trace.nr_entries - 1] == ULONG_MAX) + trace.nr_entries--; + + for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) + p->addrs[i] = 0; +#endif p->addr = addr; p->cpu = smp_processor_id(); p->pid = current->pid; @@ -444,6 +467,16 @@ static void print_track(const char *s, struct track *t) printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); +#ifdef CONFIG_STACKTRACE + { + int i; + for (i = 0; i < TRACK_ADDRS_COUNT; i++) + if (t->addrs[i]) + printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + else + break; + } +#endif } static void print_tracking(struct kmem_cache *s, void *object) @@ -557,10 +590,10 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) memset(p + s->objsize, val, s->inuse - s->objsize); } -static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) +static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) { while (bytes) { - if (*start != (u8)value) + if (*start != value) return start; start++; bytes--; @@ -568,6 +601,38 @@ static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) return NULL; } +static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) +{ + u64 value64; + unsigned int words, prefix; + + if (bytes <= 16) + return check_bytes8(start, value, bytes); + + value64 = value | value << 8 | value << 16 | value << 24; + value64 = value64 | value64 << 32; + prefix = 8 - ((unsigned long)start) % 8; + + if (prefix) { + u8 *r = check_bytes8(start, value, prefix); + if (r) + return r; + start += prefix; + bytes -= prefix; + } + + words = bytes / 8; + + while (words) { + if (*(u64 *)start != value64) + return check_bytes8(start, value, 8); + start += 8; + words--; + } + + return check_bytes8(start, value, bytes % 8); +} + static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { @@ -2928,6 +2993,42 @@ size_t ksize(const void *object) } EXPORT_SYMBOL(ksize); +#ifdef CONFIG_SLUB_DEBUG +bool verify_mem_not_deleted(const void *x) +{ + struct page *page; + void *object = (void *)x; + unsigned long flags; + bool rv; + + if (unlikely(ZERO_OR_NULL_PTR(x))) + return false; + + local_irq_save(flags); + + page = virt_to_head_page(x); + if (unlikely(!PageSlab(page))) { + /* maybe it was from stack? */ + rv = true; + goto out_unlock; + } + + slab_lock(page); + if (on_freelist(page->slab, page, object)) { + object_err(page->slab, page, object, "Object is on free-list"); + rv = false; + } else { + rv = true; + } + slab_unlock(page); + +out_unlock: + local_irq_restore(flags); + return rv; +} +EXPORT_SYMBOL(verify_mem_not_deleted); +#endif + void kfree(const void *x) { struct page *page; diff --git a/mm/swapfile.c b/mm/swapfile.c index d537d29e9b7b..1b8c33907242 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -14,7 +14,7 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/namei.h> -#include <linux/shm.h> +#include <linux/shmem_fs.h> #include <linux/blkdev.h> #include <linux/random.h> #include <linux/writeback.h> @@ -1681,19 +1681,14 @@ out: } #ifdef CONFIG_PROC_FS -struct proc_swaps { - struct seq_file seq; - int event; -}; - static unsigned swaps_poll(struct file *file, poll_table *wait) { - struct proc_swaps *s = file->private_data; + struct seq_file *seq = file->private_data; poll_wait(file, &proc_poll_wait, wait); - if (s->event != atomic_read(&proc_poll_event)) { - s->event = atomic_read(&proc_poll_event); + if (seq->poll_event != atomic_read(&proc_poll_event)) { + seq->poll_event = atomic_read(&proc_poll_event); return POLLIN | POLLRDNORM | POLLERR | POLLPRI; } @@ -1783,24 +1778,16 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - struct proc_swaps *s; + struct seq_file *seq; int ret; - s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); - if (!s) - return -ENOMEM; - - file->private_data = s; - ret = seq_open(file, &swaps_op); - if (ret) { - kfree(s); + if (ret) return ret; - } - s->seq.private = s; - s->event = atomic_read(&proc_poll_event); - return ret; + seq = file->private_data; + seq->poll_event = atomic_read(&proc_poll_event); + return 0; } static const struct file_operations proc_swaps_operations = { diff --git a/mm/truncate.c b/mm/truncate.c index 3a29a6180212..003c6c685fc8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -304,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range); * @lstart: offset from which to truncate * * Called under (and serialised by) inode->i_mutex. + * + * Note: When this function returns, there can be a page in the process of + * deletion (inside __delete_from_page_cache()) in the specified range. Thus + * mapping->nrpages can be non-zero when this function returns even after + * truncation of the whole mapping. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) { @@ -603,3 +608,26 @@ int vmtruncate(struct inode *inode, loff_t offset) return 0; } EXPORT_SYMBOL(vmtruncate); + +int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * If the underlying filesystem is not going to provide + * a way to truncate a range of blocks (punch a hole) - + * we should return failure right now. + */ + if (!inode->i_op->truncate_range) + return -ENOSYS; + + mutex_lock(&inode->i_mutex); + inode_dio_wait(inode); + unmap_mapping_range(mapping, offset, (end - offset), 1); + inode->i_op->truncate_range(inode, offset, end); + /* unmap again to remove racily COWed private pages */ + unmap_mapping_range(mapping, offset, (end - offset), 1); + mutex_unlock(&inode->i_mutex); + + return 0; +} diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d34d75366a7..ab8494cde007 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -452,13 +452,6 @@ overflow: return ERR_PTR(-EBUSY); } -static void rcu_free_va(struct rcu_head *head) -{ - struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); - - kfree(va); -} - static void __free_vmap_area(struct vmap_area *va) { BUG_ON(RB_EMPTY_NODE(&va->rb_node)); @@ -491,7 +484,7 @@ static void __free_vmap_area(struct vmap_area *va) if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); - call_rcu(&va->rcu_head, rcu_free_va); + kfree_rcu(va, rcu_head); } /* @@ -837,13 +830,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) return vb; } -static void rcu_free_vb(struct rcu_head *head) -{ - struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); - - kfree(vb); -} - static void free_vmap_block(struct vmap_block *vb) { struct vmap_block *tmp; @@ -856,7 +842,7 @@ static void free_vmap_block(struct vmap_block *vb) BUG_ON(tmp != vb); free_vmap_area_noflush(vb->va); - call_rcu(&vb->rcu_head, rcu_free_vb); + kfree_rcu(vb, rcu_head); } static void purge_fragmented_blocks(int cpu) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8ff834e19c24..febbc044e792 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -250,49 +250,90 @@ unsigned long shrink_slab(struct shrink_control *shrink, unsigned long long delta; unsigned long total_scan; unsigned long max_pass; + int shrink_ret = 0; + long nr; + long new_nr; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + do { + nr = shrinker->nr; + } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + + total_scan = nr; max_pass = do_shrinker_shrink(shrinker, shrink, 0); delta = (4 * nr_pages_scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) { + total_scan += delta; + if (total_scan < 0) { printk(KERN_ERR "shrink_slab: %pF negative objects to " "delete nr=%ld\n", - shrinker->shrink, shrinker->nr); - shrinker->nr = max_pass; + shrinker->shrink, total_scan); + total_scan = max_pass; } /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * max_pass. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < max_pass / 4) + total_scan = min(total_scan, max_pass / 2); + + /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ - if (shrinker->nr > max_pass * 2) - shrinker->nr = max_pass * 2; + if (total_scan > max_pass * 2) + total_scan = max_pass * 2; - total_scan = shrinker->nr; - shrinker->nr = 0; + trace_mm_shrink_slab_start(shrinker, shrink, nr, + nr_pages_scanned, lru_pages, + max_pass, delta, total_scan); - while (total_scan >= SHRINK_BATCH) { - long this_scan = SHRINK_BATCH; - int shrink_ret; + while (total_scan >= batch_size) { int nr_before; nr_before = do_shrinker_shrink(shrinker, shrink, 0); shrink_ret = do_shrinker_shrink(shrinker, shrink, - this_scan); + batch_size); if (shrink_ret == -1) break; if (shrink_ret < nr_before) ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, this_scan); - total_scan -= this_scan; + count_vm_events(SLABS_SCANNED, batch_size); + total_scan -= batch_size; cond_resched(); } - shrinker->nr += total_scan; + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + do { + nr = shrinker->nr; + new_nr = total_scan + nr; + if (total_scan <= 0) + break; + } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + + trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } up_read(&shrinker_rwsem); out: @@ -1995,14 +2036,13 @@ restart: * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static unsigned long shrink_zones(int priority, struct zonelist *zonelist, +static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - unsigned long total_scanned = 0; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2017,19 +2057,23 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + /* + * This steals pages from memory cgroups over softlimit + * and returns the number of reclaimed pages and + * scanned pages. This works for global memory pressure + * and balancing, not for a memcg's limit. + */ + nr_soft_scanned = 0; + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, + sc->order, sc->gfp_mask, + &nr_soft_scanned); + sc->nr_reclaimed += nr_soft_reclaimed; + sc->nr_scanned += nr_soft_scanned; + /* need some check for avoid more shrink_zone() */ } - nr_soft_scanned = 0; - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, - sc->order, sc->gfp_mask, - &nr_soft_scanned); - sc->nr_reclaimed += nr_soft_reclaimed; - total_scanned += nr_soft_scanned; - shrink_zone(priority, zone, sc); } - - return total_scanned; } static bool zone_reclaimable(struct zone *zone) @@ -2094,7 +2138,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->mem_cgroup); - total_scanned += shrink_zones(priority, zonelist, sc); + shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -2307,7 +2351,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, for (i = 0; i <= classzone_idx; i++) present_pages += pgdat->node_zones[i].present_pages; - return balanced_pages > (present_pages >> 2); + /* A special case here: if zone has no page, we think it's balanced */ + return balanced_pages >= (present_pages >> 2); } /* is kswapd sleeping prematurely? */ @@ -2323,7 +2368,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, return true; /* Check the watermark levels */ - for (i = 0; i < pgdat->nr_zones; i++) { + for (i = 0; i <= classzone_idx; i++) { struct zone *zone = pgdat->node_zones + i; if (!populated_zone(zone)) @@ -2341,7 +2386,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, } if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), - classzone_idx, 0)) + i, 0)) all_zones_ok = false; else balanced += zone->present_pages; @@ -2448,7 +2493,6 @@ loop_again: if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; - *classzone_idx = i; break; } } @@ -2507,18 +2551,18 @@ loop_again: KSWAPD_ZONE_BALANCE_GAP_RATIO); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + balance_gap, - end_zone, 0)) + end_zone, 0)) { shrink_zone(priority, zone, &sc); - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_scanned += sc.nr_scanned; - if (zone->all_unreclaimable) - continue; - if (nr_slab == 0 && - !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); + sc.nr_reclaimed += reclaim_state->reclaimed_slab; + total_scanned += sc.nr_scanned; + + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; + } + /* * If we've done a decent amount of scanning and * the reclaim ratio is low, start doing writepage @@ -2528,6 +2572,12 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + if (zone->all_unreclaimable) { + if (end_zone && end_zone == i) + end_zone--; + continue; + } + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; @@ -2706,8 +2756,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) */ static int kswapd(void *p) { - unsigned long order; - int classzone_idx; + unsigned long order, new_order; + int classzone_idx, new_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2737,17 +2787,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - order = 0; - classzone_idx = MAX_NR_ZONES - 1; + order = new_order = 0; + classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; for ( ; ; ) { - unsigned long new_order; - int new_classzone_idx; int ret; - new_order = pgdat->kswapd_max_order; - new_classzone_idx = pgdat->classzone_idx; - pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = MAX_NR_ZONES - 1; + /* + * If the last balance_pgdat was unsuccessful it's unlikely a + * new request of a similar or harder type will succeed soon + * so consider going to sleep on the basis we reclaimed at + */ + if (classzone_idx >= new_classzone_idx && order == new_order) { + new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = pgdat->nr_zones - 1; + } + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' @@ -2760,7 +2816,7 @@ static int kswapd(void *p) order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - pgdat->classzone_idx = MAX_NR_ZONES - 1; + pgdat->classzone_idx = pgdat->nr_zones - 1; } ret = try_to_freeze(); |