diff options
author | Mauro Carvalho Chehab <mchehab@redhat.com> | 2011-09-17 15:29:49 +0200 |
---|---|---|
committer | Mauro Carvalho Chehab <mchehab@redhat.com> | 2011-09-17 15:29:49 +0200 |
commit | 7577911244c437f4a4abac5e4b67b059c06dbe9d (patch) | |
tree | 4f0078ddacff226e26b03fa4f6cf185d48633124 /mm | |
parent | [media] mt9p031: Aptina (Micron) MT9P031 5MP sensor driver (diff) | |
parent | Linux 3.1-rc6 (diff) | |
download | linux-7577911244c437f4a4abac5e4b67b059c06dbe9d.tar.xz linux-7577911244c437f4a4abac5e4b67b059c06dbe9d.zip |
Merge tag 'v3.1-rc6' into staging/for_v3.2
* tag 'v3.1-rc6': (1902 commits)
Linux 3.1-rc6
ioctl: register LTTng ioctl
fuse: fix memory leak
fuse: fix flock breakage
Btrfs: add dummy extent if dst offset excceeds file end in
Btrfs: calc file extent num_bytes correctly in file clone
btrfs: xattr: fix attribute removal
Btrfs: fix wrong nbytes information of the inode
Btrfs: fix the file extent gap when doing direct IO
Btrfs: fix unclosed transaction handle in btrfs_cont_expand
Btrfs: fix misuse of trans block rsv
Btrfs: reset to appropriate block rsv after orphan operations
Btrfs: skip locking if searching the commit root in csum lookup
btrfs: fix warning in iput for bad-inode
Btrfs: fix an oops when deleting snapshots
[media] vp7045: fix buffer setup
[media] nuvoton-cir: simplify raw IR sample handling
[media] [Resend] viacam: Don't explode if pci_find_bus() returns NULL
[media] v4l2: Fix documentation of the codec device controls
[media] gspca - sonixj: Fix the darkness of sensor om6802 in 320x240
...
Diffstat (limited to 'mm')
-rw-r--r-- | mm/failslab.c | 14 | ||||
-rw-r--r-- | mm/filemap.c | 106 | ||||
-rw-r--r-- | mm/highmem.c | 4 | ||||
-rw-r--r-- | mm/memcontrol.c | 104 | ||||
-rw-r--r-- | mm/memory-failure.c | 92 | ||||
-rw-r--r-- | mm/mincore.c | 11 | ||||
-rw-r--r-- | mm/oom_kill.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 15 | ||||
-rw-r--r-- | mm/page_alloc.c | 13 | ||||
-rw-r--r-- | mm/shmem.c | 1493 | ||||
-rw-r--r-- | mm/slab.c | 99 | ||||
-rw-r--r-- | mm/slub.c | 772 | ||||
-rw-r--r-- | mm/swapfile.c | 20 | ||||
-rw-r--r-- | mm/truncate.c | 8 | ||||
-rw-r--r-- | mm/vmalloc.c | 7 | ||||
-rw-r--r-- | mm/vmscan.c | 19 |
16 files changed, 1324 insertions, 1457 deletions
diff --git a/mm/failslab.c b/mm/failslab.c index 1ce58c201dca..0dd7b8fec71c 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -34,23 +34,23 @@ __setup("failslab=", setup_failslab); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init failslab_debugfs_init(void) { + struct dentry *dir; mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; - int err; - err = init_fault_attr_dentries(&failslab.attr, "failslab"); - if (err) - return err; + dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); - if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir, + if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, &failslab.ignore_gfp_wait)) goto fail; - if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir, + if (!debugfs_create_bool("cache-filter", mode, dir, &failslab.cache_filter)) goto fail; return 0; fail: - cleanup_fault_attr_dentries(&failslab.attr); + debugfs_remove_recursive(dir); return -ENOMEM; } diff --git a/mm/filemap.c b/mm/filemap.c index 867d40222ec7..645a080ba4df 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,7 +33,6 @@ #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> -#include <linux/mm_inline.h> /* for page_is_file_cache() */ #include <linux/cleancache.h> #include "internal.h" @@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, int error; VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageSwapBacked(page)); error = mem_cgroup_cache_charge(page, current->mm, gfp_mask & GFP_RECLAIM_MASK); @@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); - if (PageSwapBacked(page)) - __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; @@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, { int ret; - /* - * Splice_read and readahead add shmem/tmpfs pages into the page cache - * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the anon lru below, and mem_cgroup_cache_charge - * (called in add_to_page_cache) needs to know where they're going too. - */ - if (mapping_cap_swap_backed(mapping)) - SetPageSwapBacked(page); - ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) { - if (page_is_file_cache(page)) - lru_cache_add_file(page); - else - lru_cache_add_anon(page); - } + if (ret == 0) + lru_cache_add_file(page); return ret; } EXPORT_SYMBOL_GPL(add_to_page_cache_lru); @@ -714,9 +699,16 @@ repeat: page = radix_tree_deref_slot(pagep); if (unlikely(!page)) goto out; - if (radix_tree_deref_retry(page)) - goto repeat; - + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto repeat; + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so return it without + * attempting to raise page count. + */ + goto out; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) repeat: page = find_get_page(mapping, offset); - if (page) { + if (page && !radix_tree_exception(page)) { lock_page(page); /* Has the page been truncated? */ if (unlikely(page->mapping != mapping)) { @@ -840,7 +832,7 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, start, nr_pages); + (void ***)pages, NULL, start, nr_pages); ret = 0; for (i = 0; i < nr_found; i++) { struct page *page; @@ -849,13 +841,22 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) { - WARN_ON(start | i); - goto restart; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + WARN_ON(start | i); + goto restart; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so skip over it - + * we only reach this from invalidate_mapping_pages(). + */ + continue; } if (!page_cache_get_speculative(page)) @@ -903,7 +904,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, index, nr_pages); + (void ***)pages, NULL, index, nr_pages); ret = 0; for (i = 0; i < nr_found; i++) { struct page *page; @@ -912,12 +913,22 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * Otherwise, shmem/tmpfs must be storing a swap entry + * here as an exceptional entry: so stop looking for + * contiguous pages. + */ + break; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -977,12 +988,21 @@ repeat: if (unlikely(!page)) continue; - /* - * This can only trigger when the entry at index 0 moves out - * of or back to the root: none yet gotten, safe to restart. - */ - if (radix_tree_deref_retry(page)) - goto restart; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + /* + * This function is never used on a shmem/tmpfs + * mapping, so a swap entry won't be found here. + */ + BUG(); + } if (!page_cache_get_speculative(page)) goto repeat; diff --git a/mm/highmem.c b/mm/highmem.c index 693394daa2ed..5ef672c07f75 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -326,7 +326,7 @@ static struct page_address_slot { spinlock_t lock; /* Protect this bucket's list */ } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; -static struct page_address_slot *page_slot(struct page *page) +static struct page_address_slot *page_slot(const struct page *page) { return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; } @@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page) * * Returns the page's virtual address. */ -void *page_address(struct page *page) +void *page_address(const struct page *page) { unsigned long flags; void *ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5f84d2351ddb..ebd1e86bef1c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -35,7 +35,6 @@ #include <linux/limits.h> #include <linux/mutex.h> #include <linux/rbtree.h> -#include <linux/shmem_fs.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -1842,29 +1841,23 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, */ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - int lock_count = -1; struct mem_cgroup *iter, *failed = NULL; bool cond = true; for_each_mem_cgroup_tree_cond(iter, mem, cond) { - bool locked = iter->oom_lock; - - iter->oom_lock = true; - if (lock_count == -1) - lock_count = iter->oom_lock; - else if (lock_count != locked) { + if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. */ - lock_count = 0; failed = iter; cond = false; - } + } else + iter->oom_lock = true; } if (!failed) - goto done; + return true; /* * OK, we failed to lock the whole subtree so we have to clean up @@ -1878,8 +1871,7 @@ static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) } iter->oom_lock = false; } -done: - return lock_count; + return false; } /* @@ -2092,6 +2084,7 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE (0) }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static DEFINE_MUTEX(percpu_charge_mutex); /* * Try to consume stocked charge on this cpu. If success, one page is consumed @@ -2169,13 +2162,7 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); - /* - * Get a hint for avoiding draining charges on the current cpu, - * which must be exhausted by our charging. It is not required that - * this be a precise check, so we use raw_smp_processor_id() instead of - * getcpu()/putcpu(). - */ - curcpu = raw_smp_processor_id(); + curcpu = get_cpu(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *mem; @@ -2192,14 +2179,14 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) schedule_work_on(cpu, &stock->work); } } + put_cpu(); if (!sync) goto out; for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && - test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) flush_work(&stock->work); } out: @@ -2214,14 +2201,22 @@ out: */ static void drain_all_stock_async(struct mem_cgroup *root_mem) { + /* + * If someone calls draining, avoid adding more kworker runs. + */ + if (!mutex_trylock(&percpu_charge_mutex)) + return; drain_all_stock(root_mem, false); + mutex_unlock(&percpu_charge_mutex); } /* This is a synchronous drain interface. */ static void drain_all_stock_sync(struct mem_cgroup *root_mem) { /* called when force_empty is called */ + mutex_lock(&percpu_charge_mutex); drain_all_stock(root_mem, true); + mutex_unlock(&percpu_charge_mutex); } /* @@ -2873,30 +2868,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return 0; if (PageCompound(page)) return 0; - /* - * Corner case handling. This is called from add_to_page_cache() - * in usual. But some FS (shmem) precharges this page before calling it - * and call add_to_page_cache() with GFP_NOWAIT. - * - * For GFP_NOWAIT case, the page may be pre-charged before calling - * add_to_page_cache(). (See shmem.c) check it here and avoid to call - * charge twice. (It works but has to pay a bit larger cost.) - * And when the page is SwapCache, it should take swap information - * into account. This is under lock_page() now. - */ - if (!(gfp_mask & __GFP_WAIT)) { - struct page_cgroup *pc; - - pc = lookup_page_cgroup(page); - if (!pc) - return 0; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - unlock_page_cgroup(pc); - return 0; - } - unlock_page_cgroup(pc); - } if (unlikely(!mm)) mm = &init_mm; @@ -3486,31 +3457,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, cgroup_release_and_wakeup_rmdir(&mem->css); } -/* - * A call to try to shrink memory usage on charge failure at shmem's swapin. - * Calling hierarchical_reclaim is not enough because we should update - * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. - * Moreover considering hierarchy, we should reclaim from the mem_over_limit, - * not from the memcg which this page would be charged to. - * try_charge_swapin does all of these works properly. - */ -int mem_cgroup_shmem_charge_fallback(struct page *page, - struct mm_struct *mm, - gfp_t gfp_mask) -{ - struct mem_cgroup *mem; - int ret; - - if (mem_cgroup_disabled()) - return 0; - - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); - if (!ret) - mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ - - return ret; -} - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -5330,15 +5276,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, pgoff = pte_to_pgoff(ptent); /* page is moved even if it's not RSS of this task(page-faulted). */ - if (!mapping_cap_swap_backed(mapping)) { /* normal file */ - page = find_get_page(mapping, pgoff); - } else { /* shmem/tmpfs file. we should take account of swap too. */ - swp_entry_t ent; - mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + page = find_get_page(mapping, pgoff); + +#ifdef CONFIG_SWAP + /* shmem/tmpfs may report page out on swap: account for that too. */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swap = radix_to_swp_entry(page); if (do_swap_account) - entry->val = ent.val; + *entry = swap; + page = find_get_page(&swapper_space, swap.val); } - +#endif return page; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 740c4f52059c..2b43ba051ac9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -53,6 +53,7 @@ #include <linux/hugetlb.h> #include <linux/memory_hotplug.h> #include <linux/mm_inline.h> +#include <linux/kfifo.h> #include "internal.h" int sysctl_memory_failure_early_kill __read_mostly = 0; @@ -1178,6 +1179,97 @@ void memory_failure(unsigned long pfn, int trapno) __memory_failure(pfn, trapno, 0); } +#define MEMORY_FAILURE_FIFO_ORDER 4 +#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER) + +struct memory_failure_entry { + unsigned long pfn; + int trapno; + int flags; +}; + +struct memory_failure_cpu { + DECLARE_KFIFO(fifo, struct memory_failure_entry, + MEMORY_FAILURE_FIFO_SIZE); + spinlock_t lock; + struct work_struct work; +}; + +static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu); + +/** + * memory_failure_queue - Schedule handling memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * @flags: Flags for memory failure handling + * + * This function is called by the low level hardware error handler + * when it detects hardware memory corruption of a page. It schedules + * the recovering of error page, including dropping pages, killing + * processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Can run in IRQ context. + */ +void memory_failure_queue(unsigned long pfn, int trapno, int flags) +{ + struct memory_failure_cpu *mf_cpu; + unsigned long proc_flags; + struct memory_failure_entry entry = { + .pfn = pfn, + .trapno = trapno, + .flags = flags, + }; + + mf_cpu = &get_cpu_var(memory_failure_cpu); + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + if (kfifo_put(&mf_cpu->fifo, &entry)) + schedule_work_on(smp_processor_id(), &mf_cpu->work); + else + pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n", + pfn); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + put_cpu_var(memory_failure_cpu); +} +EXPORT_SYMBOL_GPL(memory_failure_queue); + +static void memory_failure_work_func(struct work_struct *work) +{ + struct memory_failure_cpu *mf_cpu; + struct memory_failure_entry entry = { 0, }; + unsigned long proc_flags; + int gotten; + + mf_cpu = &__get_cpu_var(memory_failure_cpu); + for (;;) { + spin_lock_irqsave(&mf_cpu->lock, proc_flags); + gotten = kfifo_get(&mf_cpu->fifo, &entry); + spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + if (!gotten) + break; + __memory_failure(entry.pfn, entry.trapno, entry.flags); + } +} + +static int __init memory_failure_init(void) +{ + struct memory_failure_cpu *mf_cpu; + int cpu; + + for_each_possible_cpu(cpu) { + mf_cpu = &per_cpu(memory_failure_cpu, cpu); + spin_lock_init(&mf_cpu->lock); + INIT_KFIFO(mf_cpu->fifo); + INIT_WORK(&mf_cpu->work, memory_failure_work_func); + } + + return 0; +} +core_initcall(memory_failure_init); + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page diff --git a/mm/mincore.c b/mm/mincore.c index a4e6b9d75c76..636a86876ff2 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) * file will not get a swp_entry_t in its pte, but rather it is like * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. - * - * However when tmpfs moves the page from pagecache and into swapcache, - * it is still in core, but the find_get_page below won't find it. - * No big deal, but make a note of it. */ page = find_get_page(mapping, pgoff); +#ifdef CONFIG_SWAP + /* shmem/tmpfs may return swap: account for swapcache page too. */ + if (radix_tree_exceptional_entry(page)) { + swp_entry_t swap = radix_to_swp_entry(page); + page = find_get_page(&swapper_space, swap.val); + } +#endif if (page) { present = PageUptodate(page); page_cache_release(page); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eafff89b3dd6..626303b52f3c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -303,7 +303,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, do_each_thread(g, p) { unsigned int points; - if (!p->mm) + if (p->exit_state) continue; if (oom_unkillable_task(p, mem, nodemask)) continue; @@ -319,6 +319,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, */ if (test_tsk_thread_flag(p, TIF_MEMDIE)) return ERR_PTR(-1UL); + if (!p->mm) + continue; if (p->flags & PF_EXITING) { /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d1960744f881..0e309cd1b5b9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -754,21 +754,10 @@ static void balance_dirty_pages(struct address_space *mapping, * 200ms is typically more than enough to curb heavy dirtiers; * (b) the pause time limit makes the dirtiers more responsive. */ - if (nr_dirty < dirty_thresh + - dirty_thresh / DIRTY_MAXPAUSE_AREA && + if (nr_dirty < dirty_thresh && + bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 && time_after(jiffies, start_time + MAX_PAUSE)) break; - /* - * pass-good area. When some bdi gets blocked (eg. NFS server - * not responding), or write bandwidth dropped dramatically due - * to concurrent reads, or dirty threshold suddenly dropped and - * the dirty pages cannot be brought down anytime soon (eg. on - * slow USB stick), at least let go of the good bdi's. - */ - if (nr_dirty < dirty_thresh + - dirty_thresh / DIRTY_PASSGOOD_AREA && - bdi_dirty < bdi_thresh) - break; /* * Increase the delay for each loop, up to our previous diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1dbcf8888f14..6e8ecb6e021c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1409,14 +1409,11 @@ static int __init fail_page_alloc_debugfs(void) { mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; struct dentry *dir; - int err; - err = init_fault_attr_dentries(&fail_page_alloc.attr, - "fail_page_alloc"); - if (err) - return err; - - dir = fail_page_alloc.attr.dir; + dir = fault_create_debugfs_attr("fail_page_alloc", NULL, + &fail_page_alloc.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, &fail_page_alloc.ignore_gfp_wait)) @@ -1430,7 +1427,7 @@ static int __init fail_page_alloc_debugfs(void) return 0; fail: - cleanup_fault_attr_dentries(&fail_page_alloc.attr); + debugfs_remove_recursive(dir); return -ENOMEM; } diff --git a/mm/shmem.c b/mm/shmem.c index 5cc21f8b4cd3..32f6763f16fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -6,7 +6,8 @@ * 2000-2001 Christoph Rohland * 2000-2001 SAP AG * 2002 Red Hat Inc. - * Copyright (C) 2002-2005 Hugh Dickins. + * Copyright (C) 2002-2011 Hugh Dickins. + * Copyright (C) 2011 Google Inc. * Copyright (C) 2002-2005 VERITAS Software Corporation. * Copyright (C) 2004 Andi Kleen, SuSE Labs * @@ -28,7 +29,6 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/module.h> -#include <linux/percpu_counter.h> #include <linux/swap.h> static struct vfsmount *shm_mnt; @@ -51,6 +51,8 @@ static struct vfsmount *shm_mnt; #include <linux/shmem_fs.h> #include <linux/writeback.h> #include <linux/blkdev.h> +#include <linux/pagevec.h> +#include <linux/percpu_counter.h> #include <linux/splice.h> #include <linux/security.h> #include <linux/swapops.h> @@ -63,43 +65,17 @@ static struct vfsmount *shm_mnt; #include <linux/magic.h> #include <asm/uaccess.h> -#include <asm/div64.h> #include <asm/pgtable.h> -/* - * The maximum size of a shmem/tmpfs file is limited by the maximum size of - * its triple-indirect swap vector - see illustration at shmem_swp_entry(). - * - * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, - * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum - * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, - * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. - * - * We use / and * instead of shifts in the definitions below, so that the swap - * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. - */ -#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) -#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) - -#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) -#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) - -#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) -#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) - #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) -/* info->flags needs VM_flags to handle pagein/truncate races efficiently */ -#define SHMEM_PAGEIN VM_READ -#define SHMEM_TRUNCATE VM_WRITE - -/* Definition to limit shmem_truncate's steps between cond_rescheds */ -#define LATENCY_LIMIT 64 - /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */ +#define SHORT_SYMLINK_LEN 128 + struct shmem_xattr { struct list_head list; /* anchored by shmem_inode_info->xattr_list */ char *name; /* xattr name */ @@ -107,7 +83,7 @@ struct shmem_xattr { char value[0]; }; -/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ +/* Flag allocation requirements to shmem_getpage */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ @@ -137,56 +113,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index, mapping_gfp_mask(inode->i_mapping), fault_type); } -static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) -{ - /* - * The above definition of ENTRIES_PER_PAGE, and the use of - * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: - * might be reconsidered if it ever diverges from PAGE_SIZE. - * - * Mobility flags are masked out as swap vectors cannot move - */ - return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, - PAGE_CACHE_SHIFT-PAGE_SHIFT); -} - -static inline void shmem_dir_free(struct page *page) -{ - __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); -} - -static struct page **shmem_dir_map(struct page *page) -{ - return (struct page **)kmap_atomic(page, KM_USER0); -} - -static inline void shmem_dir_unmap(struct page **dir) -{ - kunmap_atomic(dir, KM_USER0); -} - -static swp_entry_t *shmem_swp_map(struct page *page) -{ - return (swp_entry_t *)kmap_atomic(page, KM_USER1); -} - -static inline void shmem_swp_balance_unmap(void) -{ - /* - * When passing a pointer to an i_direct entry, to code which - * also handles indirect entries and so will shmem_swp_unmap, - * we must arrange for the preempt count to remain in balance. - * What kmap_atomic of a lowmem page does depends on config - * and architecture, so pretend to kmap_atomic some lowmem page. - */ - (void) kmap_atomic(ZERO_PAGE(0), KM_USER1); -} - -static inline void shmem_swp_unmap(swp_entry_t *entry) -{ - kunmap_atomic(entry, KM_USER1); -} - static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { return sb->s_fs_info; @@ -244,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = { static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); -static void shmem_free_blocks(struct inode *inode, long pages) -{ - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (sbinfo->max_blocks) { - percpu_counter_add(&sbinfo->used_blocks, -pages); - inode->i_blocks -= pages*BLOCKS_PER_PAGE; - } -} - static int shmem_reserve_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); @@ -279,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb) } /** - * shmem_recalc_inode - recalculate the size of an inode + * shmem_recalc_inode - recalculate the block usage of an inode * @inode: inode to recalc * * We have to calculate the free blocks since the mm can drop @@ -297,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode) freed = info->alloced - info->swapped - inode->i_mapping->nrpages; if (freed > 0) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -freed); info->alloced -= freed; + inode->i_blocks -= freed * BLOCKS_PER_PAGE; shmem_unacct_blocks(info->flags, freed); - shmem_free_blocks(inode, freed); } } -/** - * shmem_swp_entry - find the swap vector position in the info structure - * @info: info structure for the inode - * @index: index of the page to find - * @page: optional page to add to the structure. Has to be preset to - * all zeros - * - * If there is no space allocated yet it will return NULL when - * page is NULL, else it will use the page for the needed block, - * setting it to NULL on return to indicate that it has been used. - * - * The swap vector is organized the following way: - * - * There are SHMEM_NR_DIRECT entries directly stored in the - * shmem_inode_info structure. So small files do not need an addional - * allocation. - * - * For pages with index > SHMEM_NR_DIRECT there is the pointer - * i_indirect which points to a page which holds in the first half - * doubly indirect blocks, in the second half triple indirect blocks: - * - * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the - * following layout (for SHMEM_NR_DIRECT == 16): - * - * i_indirect -> dir --> 16-19 - * | +-> 20-23 - * | - * +-->dir2 --> 24-27 - * | +-> 28-31 - * | +-> 32-35 - * | +-> 36-39 - * | - * +-->dir3 --> 40-43 - * +-> 44-47 - * +-> 48-51 - * +-> 52-55 +/* + * Replace item expected in radix tree by a new item, while holding tree lock. */ -static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) -{ - unsigned long offset; - struct page **dir; - struct page *subdir; - - if (index < SHMEM_NR_DIRECT) { - shmem_swp_balance_unmap(); - return info->i_direct+index; - } - if (!info->i_indirect) { - if (page) { - info->i_indirect = *page; - *page = NULL; - } - return NULL; /* need another page */ - } - - index -= SHMEM_NR_DIRECT; - offset = index % ENTRIES_PER_PAGE; - index /= ENTRIES_PER_PAGE; - dir = shmem_dir_map(info->i_indirect); - - if (index >= ENTRIES_PER_PAGE/2) { - index -= ENTRIES_PER_PAGE/2; - dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; - index %= ENTRIES_PER_PAGE; - subdir = *dir; - if (!subdir) { - if (page) { - *dir = *page; - *page = NULL; - } - shmem_dir_unmap(dir); - return NULL; /* need another page */ - } - shmem_dir_unmap(dir); - dir = shmem_dir_map(subdir); - } +static int shmem_radix_tree_replace(struct address_space *mapping, + pgoff_t index, void *expected, void *replacement) +{ + void **pslot; + void *item = NULL; + + VM_BUG_ON(!expected); + pslot = radix_tree_lookup_slot(&mapping->page_tree, index); + if (pslot) + item = radix_tree_deref_slot_protected(pslot, + &mapping->tree_lock); + if (item != expected) + return -ENOENT; + if (replacement) + radix_tree_replace_slot(pslot, replacement); + else + radix_tree_delete(&mapping->page_tree, index); + return 0; +} - dir += index; - subdir = *dir; - if (!subdir) { - if (!page || !(subdir = *page)) { - shmem_dir_unmap(dir); - return NULL; /* need a page */ +/* + * Like add_to_page_cache_locked, but error if expected item has gone. + */ +static int shmem_add_to_page_cache(struct page *page, + struct address_space *mapping, + pgoff_t index, gfp_t gfp, void *expected) +{ + int error = 0; + + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + if (!expected) + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + page_cache_get(page); + page->mapping = mapping; + page->index = index; + + spin_lock_irq(&mapping->tree_lock); + if (!expected) + error = radix_tree_insert(&mapping->page_tree, + index, page); + else + error = shmem_radix_tree_replace(mapping, index, + expected, page); + if (!error) { + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); } - *dir = subdir; - *page = NULL; + if (!expected) + radix_tree_preload_end(); } - shmem_dir_unmap(dir); - return shmem_swp_map(subdir) + offset; + if (error) + mem_cgroup_uncharge_cache_page(page); + return error; } -static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) +/* + * Like delete_from_page_cache, but substitutes swap for page. + */ +static void shmem_delete_from_page_cache(struct page *page, void *radswap) { - long incdec = value? 1: -1; + struct address_space *mapping = page->mapping; + int error; - entry->val = value; - info->swapped += incdec; - if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { - struct page *page = kmap_atomic_to_page(entry); - set_page_private(page, page_private(page) + incdec); - } + spin_lock_irq(&mapping->tree_lock); + error = shmem_radix_tree_replace(mapping, page->index, page, radswap); + page->mapping = NULL; + mapping->nrpages--; + __dec_zone_page_state(page, NR_FILE_PAGES); + __dec_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); + BUG_ON(error); } -/** - * shmem_swp_alloc - get the position of the swap entry for the page. - * @info: info structure for the inode - * @index: index of the page to find - * @sgp: check and recheck i_size? skip allocation? - * @gfp: gfp mask to use for any page allocation - * - * If the entry does not exist, allocate it. +/* + * Like find_get_pages, but collecting swap entries as well as pages. */ -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, - unsigned long index, enum sgp_type sgp, gfp_t gfp) -{ - struct inode *inode = &info->vfs_inode; - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - struct page *page = NULL; - swp_entry_t *entry; - - if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return ERR_PTR(-EINVAL); - - while (!(entry = shmem_swp_entry(info, index, &page))) { - if (sgp == SGP_READ) - return shmem_swp_map(ZERO_PAGE(0)); - /* - * Test used_blocks against 1 less max_blocks, since we have 1 data - * page (and perhaps indirect index pages) yet to allocate: - * a waste to allocate index if we cannot allocate data. - */ - if (sbinfo->max_blocks) { - if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks - 1) >= 0) - return ERR_PTR(-ENOSPC); - percpu_counter_inc(&sbinfo->used_blocks); - inode->i_blocks += BLOCKS_PER_PAGE; +static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping, + pgoff_t start, unsigned int nr_pages, + struct page **pages, pgoff_t *indices) +{ + unsigned int i; + unsigned int ret; + unsigned int nr_found; + + rcu_read_lock(); +restart: + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, indices, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; +repeat: + page = radix_tree_deref_slot((void **)pages[i]); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) + goto restart; + /* + * Otherwise, we must be storing a swap entry + * here as an exceptional entry: so return it + * without attempting to raise page count. + */ + goto export; } + if (!page_cache_get_speculative(page)) + goto repeat; - spin_unlock(&info->lock); - page = shmem_dir_alloc(gfp); - spin_lock(&info->lock); - - if (!page) { - shmem_free_blocks(inode, 1); - return ERR_PTR(-ENOMEM); - } - if (sgp != SGP_WRITE && - ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { - entry = ERR_PTR(-EINVAL); - break; + /* Has the page moved? */ + if (unlikely(page != *((void **)pages[i]))) { + page_cache_release(page); + goto repeat; } - if (info->next_index <= index) - info->next_index = index + 1; - } - if (page) { - /* another task gave its page, or truncated the file */ - shmem_free_blocks(inode, 1); - shmem_dir_free(page); - } - if (info->next_index <= index && !IS_ERR(entry)) - info->next_index = index + 1; - return entry; +export: + indices[ret] = indices[i]; + pages[ret] = page; + ret++; + } + if (unlikely(!ret && nr_found)) + goto restart; + rcu_read_unlock(); + return ret; } -/** - * shmem_free_swp - free some swap entries in a directory - * @dir: pointer to the directory - * @edir: pointer after last entry of the directory - * @punch_lock: pointer to spinlock when needed for the holepunch case +/* + * Remove swap entry from radix tree, free the swap and its page cache. */ -static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, - spinlock_t *punch_lock) -{ - spinlock_t *punch_unlock = NULL; - swp_entry_t *ptr; - int freed = 0; - - for (ptr = dir; ptr < edir; ptr++) { - if (ptr->val) { - if (unlikely(punch_lock)) { - punch_unlock = punch_lock; - punch_lock = NULL; - spin_lock(punch_unlock); - if (!ptr->val) - continue; - } - free_swap_and_cache(*ptr); - *ptr = (swp_entry_t){0}; - freed++; - } - } - if (punch_unlock) - spin_unlock(punch_unlock); - return freed; -} - -static int shmem_map_and_free_swp(struct page *subdir, int offset, - int limit, struct page ***dir, spinlock_t *punch_lock) -{ - swp_entry_t *ptr; - int freed = 0; - - ptr = shmem_swp_map(subdir); - for (; offset < limit; offset += LATENCY_LIMIT) { - int size = limit - offset; - if (size > LATENCY_LIMIT) - size = LATENCY_LIMIT; - freed += shmem_free_swp(ptr+offset, ptr+offset+size, - punch_lock); - if (need_resched()) { - shmem_swp_unmap(ptr); - if (*dir) { - shmem_dir_unmap(*dir); - *dir = NULL; - } - cond_resched(); - ptr = shmem_swp_map(subdir); - } - } - shmem_swp_unmap(ptr); - return freed; +static int shmem_free_swap(struct address_space *mapping, + pgoff_t index, void *radswap) +{ + int error; + + spin_lock_irq(&mapping->tree_lock); + error = shmem_radix_tree_replace(mapping, index, radswap, NULL); + spin_unlock_irq(&mapping->tree_lock); + if (!error) + free_swap_and_cache(radix_to_swp_entry(radswap)); + return error; } -static void shmem_free_pages(struct list_head *next) +/* + * Pagevec may contain swap entries, so shuffle up pages before releasing. + */ +static void shmem_pagevec_release(struct pagevec *pvec) { - struct page *page; - int freed = 0; - - do { - page = container_of(next, struct page, lru); - next = next->next; - shmem_dir_free(page); - freed++; - if (freed >= LATENCY_LIMIT) { - cond_resched(); - freed = 0; - } - } while (next); + int i, j; + + for (i = 0, j = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + if (!radix_tree_exceptional_entry(page)) + pvec->pages[j++] = page; + } + pvec->nr = j; + pagevec_release(pvec); } -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +/* + * Remove range of pages and swap entries from radix tree, and free them. + */ +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { + struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long idx; - unsigned long size; - unsigned long limit; - unsigned long stage; - unsigned long diroff; - struct page **dir; - struct page *topdir; - struct page *middir; - struct page *subdir; - swp_entry_t *ptr; - LIST_HEAD(pages_to_free); - long nr_pages_to_free = 0; + pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + pgoff_t end = (lend >> PAGE_CACHE_SHIFT); + struct pagevec pvec; + pgoff_t indices[PAGEVEC_SIZE]; long nr_swaps_freed = 0; - int offset; - int freed; - int punch_hole; - spinlock_t *needs_lock; - spinlock_t *punch_lock; - unsigned long upper_limit; + pgoff_t index; + int i; - truncate_inode_pages_range(inode->i_mapping, start, end); + BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - if (idx >= info->next_index) - return; + pagevec_init(&pvec, 0); + index = start; + while (index <= end) { + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + pvec.pages, indices); + if (!pvec.nr) + break; + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - spin_lock(&info->lock); - info->flags |= SHMEM_TRUNCATE; - if (likely(end == (loff_t) -1)) { - limit = info->next_index; - upper_limit = SHMEM_MAX_INDEX; - info->next_index = idx; - needs_lock = NULL; - punch_hole = 0; - } else { - if (end + 1 >= inode->i_size) { /* we may free a little more */ - limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; - upper_limit = SHMEM_MAX_INDEX; - } else { - limit = (end + 1) >> PAGE_CACHE_SHIFT; - upper_limit = limit; - } - needs_lock = &info->lock; - punch_hole = 1; - } + index = indices[i]; + if (index > end) + break; + + if (radix_tree_exceptional_entry(page)) { + nr_swaps_freed += !shmem_free_swap(mapping, + index, page); + continue; + } - topdir = info->i_indirect; - if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { - info->i_indirect = NULL; - nr_pages_to_free++; - list_add(&topdir->lru, &pages_to_free); + if (!trylock_page(page)) + continue; + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); + } + unlock_page(page); + } + shmem_pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + cond_resched(); + index++; } - spin_unlock(&info->lock); - if (info->swapped && idx < SHMEM_NR_DIRECT) { - ptr = info->i_direct; - size = limit; - if (size > SHMEM_NR_DIRECT) - size = SHMEM_NR_DIRECT; - nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); + if (partial) { + struct page *page = NULL; + shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); + if (page) { + zero_user_segment(page, partial, PAGE_CACHE_SIZE); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + } } - /* - * If there are no indirect blocks or we are punching a hole - * below indirect blocks, nothing to be done. - */ - if (!topdir || limit <= SHMEM_NR_DIRECT) - goto done2; + index = start; + for ( ; ; ) { + cond_resched(); + pvec.nr = shmem_find_get_pages_and_swap(mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, + pvec.pages, indices); + if (!pvec.nr) { + if (index == start) + break; + index = start; + continue; + } + if (index == start && indices[0] > end) { + shmem_pagevec_release(&pvec); + break; + } + mem_cgroup_uncharge_start(); + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - /* - * The truncation case has already dropped info->lock, and we're safe - * because i_size and next_index have already been lowered, preventing - * access beyond. But in the punch_hole case, we still need to take - * the lock when updating the swap directory, because there might be - * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or - * shmem_writepage. However, whenever we find we can remove a whole - * directory page (not at the misaligned start or end of the range), - * we first NULLify its pointer in the level above, and then have no - * need to take the lock when updating its contents: needs_lock and - * punch_lock (either pointing to info->lock or NULL) manage this. - */ + index = indices[i]; + if (index > end) + break; - upper_limit -= SHMEM_NR_DIRECT; - limit -= SHMEM_NR_DIRECT; - idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; - offset = idx % ENTRIES_PER_PAGE; - idx -= offset; - - dir = shmem_dir_map(topdir); - stage = ENTRIES_PER_PAGEPAGE/2; - if (idx < ENTRIES_PER_PAGEPAGE/2) { - middir = topdir; - diroff = idx/ENTRIES_PER_PAGE; - } else { - dir += ENTRIES_PER_PAGE/2; - dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; - while (stage <= idx) - stage += ENTRIES_PER_PAGEPAGE; - middir = *dir; - if (*dir) { - diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) % - ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; - if (!diroff && !offset && upper_limit >= stage) { - if (needs_lock) { - spin_lock(needs_lock); - *dir = NULL; - spin_unlock(needs_lock); - needs_lock = NULL; - } else - *dir = NULL; - nr_pages_to_free++; - list_add(&middir->lru, &pages_to_free); + if (radix_tree_exceptional_entry(page)) { + nr_swaps_freed += !shmem_free_swap(mapping, + index, page); + continue; } - shmem_dir_unmap(dir); - dir = shmem_dir_map(middir); - } else { - diroff = 0; - offset = 0; - idx = stage; - } - } - for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { - if (unlikely(idx == stage)) { - shmem_dir_unmap(dir); - dir = shmem_dir_map(topdir) + - ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; - while (!*dir) { - dir++; - idx += ENTRIES_PER_PAGEPAGE; - if (idx >= limit) - goto done1; - } - stage = idx + ENTRIES_PER_PAGEPAGE; - middir = *dir; - if (punch_hole) - needs_lock = &info->lock; - if (upper_limit >= stage) { - if (needs_lock) { - spin_lock(needs_lock); - *dir = NULL; - spin_unlock(needs_lock); - needs_lock = NULL; - } else - *dir = NULL; - nr_pages_to_free++; - list_add(&middir->lru, &pages_to_free); + lock_page(page); + if (page->mapping == mapping) { + VM_BUG_ON(PageWriteback(page)); + truncate_inode_page(mapping, page); } - shmem_dir_unmap(dir); - cond_resched(); - dir = shmem_dir_map(middir); - diroff = 0; - } - punch_lock = needs_lock; - subdir = dir[diroff]; - if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) { - if (needs_lock) { - spin_lock(needs_lock); - dir[diroff] = NULL; - spin_unlock(needs_lock); - punch_lock = NULL; - } else - dir[diroff] = NULL; - nr_pages_to_free++; - list_add(&subdir->lru, &pages_to_free); - } - if (subdir && page_private(subdir) /* has swap entries */) { - size = limit - idx; - if (size > ENTRIES_PER_PAGE) - size = ENTRIES_PER_PAGE; - freed = shmem_map_and_free_swp(subdir, - offset, size, &dir, punch_lock); - if (!dir) - dir = shmem_dir_map(middir); - nr_swaps_freed += freed; - if (offset || punch_lock) { - spin_lock(&info->lock); - set_page_private(subdir, - page_private(subdir) - freed); - spin_unlock(&info->lock); - } else - BUG_ON(page_private(subdir) != freed); + unlock_page(page); } - offset = 0; - } -done1: - shmem_dir_unmap(dir); -done2: - if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { - /* - * Call truncate_inode_pages again: racing shmem_unuse_inode - * may have swizzled a page in from swap since - * truncate_pagecache or generic_delete_inode did it, before we - * lowered next_index. Also, though shmem_getpage checks - * i_size before adding to cache, no recheck after: so fix the - * narrow window there too. - */ - truncate_inode_pages_range(inode->i_mapping, start, end); + shmem_pagevec_release(&pvec); + mem_cgroup_uncharge_end(); + index++; } spin_lock(&info->lock); - info->flags &= ~SHMEM_TRUNCATE; info->swapped -= nr_swaps_freed; - if (nr_pages_to_free) - shmem_free_blocks(inode, nr_pages_to_free); shmem_recalc_inode(inode); spin_unlock(&info->lock); - /* - * Empty swap vector directory pages to be freed? - */ - if (!list_empty(&pages_to_free)) { - pages_to_free.prev->next = NULL; - shmem_free_pages(pages_to_free.next); - } + inode->i_ctime = inode->i_mtime = CURRENT_TIME; } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -780,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; - struct page *page = NULL; - if (newsize < oldsize) { - /* - * If truncating down to a partial page, then - * if that page is already allocated, hold it - * in memory until the truncation is over, so - * truncate_partial_page cannot miss it were - * it assigned to swap. - */ - if (newsize & (PAGE_CACHE_SIZE-1)) { - (void) shmem_getpage(inode, - newsize >> PAGE_CACHE_SHIFT, - &page, SGP_READ, NULL); - if (page) - unlock_page(page); - } - /* - * Reset SHMEM_PAGEIN flag so that shmem_truncate can - * detect if any pages might have been added to cache - * after truncate_inode_pages. But we needn't bother - * if it's being fully truncated to zero-length: the - * nrpages check is efficient enough in that case. - */ - if (newsize) { - struct shmem_inode_info *info = SHMEM_I(inode); - spin_lock(&info->lock); - info->flags &= ~SHMEM_PAGEIN; - spin_unlock(&info->lock); - } - } if (newsize != oldsize) { i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -822,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) /* unmap again to remove racily COWed private pages */ unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); } - if (page) - page_cache_release(page); } setattr_copy(inode, attr); @@ -848,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } - } + } else + kfree(info->symlink); list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { kfree(xattr->name); @@ -859,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode) end_writeback(inode); } -static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) -{ - swp_entry_t *ptr; - - for (ptr = dir; ptr < edir; ptr++) { - if (ptr->val == entry.val) - return ptr - dir; - } - return -1; -} - -static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) +/* + * If swap found in inode, free it and move page from swapcache to filecache. + */ +static int shmem_unuse_inode(struct shmem_inode_info *info, + swp_entry_t swap, struct page *page) { - struct address_space *mapping; - unsigned long idx; - unsigned long size; - unsigned long limit; - unsigned long stage; - struct page **dir; - struct page *subdir; - swp_entry_t *ptr; - int offset; + struct address_space *mapping = info->vfs_inode.i_mapping; + void *radswap; + pgoff_t index; int error; - idx = 0; - ptr = info->i_direct; - spin_lock(&info->lock); - if (!info->swapped) { - list_del_init(&info->swaplist); - goto lost2; - } - limit = info->next_index; - size = limit; - if (size > SHMEM_NR_DIRECT) - size = SHMEM_NR_DIRECT; - offset = shmem_find_swp(entry, ptr, ptr+size); - if (offset >= 0) { - shmem_swp_balance_unmap(); - goto found; - } - if (!info->i_indirect) - goto lost2; - - dir = shmem_dir_map(info->i_indirect); - stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; - - for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { - if (unlikely(idx == stage)) { - shmem_dir_unmap(dir-1); - if (cond_resched_lock(&info->lock)) { - /* check it has not been truncated */ - if (limit > info->next_index) { - limit = info->next_index; - if (idx >= limit) - goto lost2; - } - } - dir = shmem_dir_map(info->i_indirect) + - ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; - while (!*dir) { - dir++; - idx += ENTRIES_PER_PAGEPAGE; - if (idx >= limit) - goto lost1; - } - stage = idx + ENTRIES_PER_PAGEPAGE; - subdir = *dir; - shmem_dir_unmap(dir); - dir = shmem_dir_map(subdir); - } - subdir = *dir; - if (subdir && page_private(subdir)) { - ptr = shmem_swp_map(subdir); - size = limit - idx; - if (size > ENTRIES_PER_PAGE) - size = ENTRIES_PER_PAGE; - offset = shmem_find_swp(entry, ptr, ptr+size); - shmem_swp_unmap(ptr); - if (offset >= 0) { - shmem_dir_unmap(dir); - ptr = shmem_swp_map(subdir); - goto found; - } - } - } -lost1: - shmem_dir_unmap(dir-1); -lost2: - spin_unlock(&info->lock); - return 0; -found: - idx += offset; - ptr += offset; + radswap = swp_to_radix_entry(swap); + index = radix_tree_locate_item(&mapping->page_tree, radswap); + if (index == -1) + return 0; /* * Move _head_ to start search for next from here. * But be careful: shmem_evict_inode checks list_empty without taking * mutex, and there's an instant in list_move_tail when info->swaplist - * would appear empty, if it were the only one on shmem_swaplist. We - * could avoid doing it if inode NULL; or use this minor optimization. + * would appear empty, if it were the only one on shmem_swaplist. */ if (shmem_swaplist.next != &info->swaplist) list_move_tail(&shmem_swaplist, &info->swaplist); @@ -968,29 +598,34 @@ found: * but also to hold up shmem_evict_inode(): so inode cannot be freed * beneath us (pagelock doesn't help until the page is in pagecache). */ - mapping = info->vfs_inode.i_mapping; - error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); + error = shmem_add_to_page_cache(page, mapping, index, + GFP_NOWAIT, radswap); /* which does mem_cgroup_uncharge_cache_page on error */ if (error != -ENOMEM) { + /* + * Truncation and eviction use free_swap_and_cache(), which + * only does trylock page: if we raced, best clean up here. + */ delete_from_swap_cache(page); set_page_dirty(page); - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, ptr, 0); - swap_free(entry); + if (!error) { + spin_lock(&info->lock); + info->swapped--; + spin_unlock(&info->lock); + swap_free(swap); + } error = 1; /* not an error, but entry was found */ } - shmem_swp_unmap(ptr); - spin_unlock(&info->lock); return error; } /* - * shmem_unuse() search for an eventually swapped out shmem page. + * Search through swapped inodes to find and replace swap by page. */ -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page) { - struct list_head *p, *next; + struct list_head *this, *next; struct shmem_inode_info *info; int found = 0; int error; @@ -999,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * Charge page using GFP_KERNEL while we can wait, before taking * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. - * add_to_page_cache() will be called with GFP_NOWAIT. */ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) goto out; - /* - * Try to preload while we can wait, to not make a habit of - * draining atomic reserves; but don't latch on to this cpu, - * it's okay if sometimes we get rescheduled after this. - */ - error = radix_tree_preload(GFP_KERNEL); - if (error) - goto uncharge; - radix_tree_preload_end(); + /* No radix_tree_preload: swap entry keeps a place for page in tree */ mutex_lock(&shmem_swaplist_mutex); - list_for_each_safe(p, next, &shmem_swaplist) { - info = list_entry(p, struct shmem_inode_info, swaplist); - found = shmem_unuse_inode(info, entry, page); + list_for_each_safe(this, next, &shmem_swaplist) { + info = list_entry(this, struct shmem_inode_info, swaplist); + if (info->swapped) + found = shmem_unuse_inode(info, swap, page); + else + list_del_init(&info->swaplist); cond_resched(); if (found) break; } mutex_unlock(&shmem_swaplist_mutex); -uncharge: if (!found) mem_cgroup_uncharge_cache_page(page); if (found < 0) @@ -1041,10 +669,10 @@ out: static int shmem_writepage(struct page *page, struct writeback_control *wbc) { struct shmem_inode_info *info; - swp_entry_t *entry, swap; struct address_space *mapping; - unsigned long index; struct inode *inode; + swp_entry_t swap; + pgoff_t index; BUG_ON(!PageLocked(page)); mapping = page->mapping; @@ -1073,50 +701,32 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now because we cannot take - * mutex while holding spinlock, and must do so before the page - * is moved to swap cache, when its pagelock no longer protects + * if it's not already there. Do it now before the page is + * moved to swap cache, when its pagelock no longer protects * the inode from eviction. But don't unlock the mutex until - * we've taken the spinlock, because shmem_unuse_inode() will - * prune a !swapped inode from the swaplist under both locks. + * we've incremented swapped, because shmem_unuse_inode() will + * prune a !swapped inode from the swaplist under this mutex. */ mutex_lock(&shmem_swaplist_mutex); if (list_empty(&info->swaplist)) list_add_tail(&info->swaplist, &shmem_swaplist); - spin_lock(&info->lock); - mutex_unlock(&shmem_swaplist_mutex); - - if (index >= info->next_index) { - BUG_ON(!(info->flags & SHMEM_TRUNCATE)); - goto unlock; - } - entry = shmem_swp_entry(info, index, NULL); - if (entry->val) { - WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ - free_swap_and_cache(*entry); - shmem_swp_set(info, entry, 0); - } - shmem_recalc_inode(inode); - if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { - delete_from_page_cache(page); - shmem_swp_set(info, entry, swap.val); - shmem_swp_unmap(entry); swap_shmem_alloc(swap); + shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + + spin_lock(&info->lock); + info->swapped++; + shmem_recalc_inode(inode); spin_unlock(&info->lock); + + mutex_unlock(&shmem_swaplist_mutex); BUG_ON(page_mapped(page)); swap_writepage(page, wbc); return 0; } - shmem_swp_unmap(entry); -unlock: - spin_unlock(&info->lock); - /* - * add_to_swap_cache() doesn't return -EEXIST, so we can safely - * clear SWAP_HAS_CACHE flag. - */ + mutex_unlock(&shmem_swaplist_mutex); swapcache_free(swap, NULL); redirty: set_page_dirty(page); @@ -1153,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } #endif /* CONFIG_TMPFS */ -static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) +static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) { struct mempolicy mpol, *spol; struct vm_area_struct pvma; - struct page *page; spol = mpol_cond_copy(&mpol, - mpol_shared_policy_lookup(&info->policy, idx)); + mpol_shared_policy_lookup(&info->policy, index)); /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = idx; + pvma.vm_pgoff = index; pvma.vm_ops = NULL; pvma.vm_policy = spol; - page = swapin_readahead(entry, gfp, &pvma, 0); - return page; + return swapin_readahead(swap, gfp, &pvma, 0); } static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) + struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = idx; + pvma.vm_pgoff = index; pvma.vm_ops = NULL; - pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); + pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); /* * alloc_page_vma() will drop the shared policy reference @@ -1190,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp, } #else /* !CONFIG_NUMA */ #ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { } #endif /* CONFIG_TMPFS */ -static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) +static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) { - return swapin_readahead(entry, gfp, NULL, 0); + return swapin_readahead(swap, gfp, NULL, 0); } static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, unsigned long idx) + struct shmem_inode_info *info, pgoff_t index) { return alloc_page(gfp); } @@ -1222,243 +830,190 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; struct page *page; - struct page *prealloc_page = NULL; - swp_entry_t *entry; swp_entry_t swap; int error; - int ret; + int once = 0; - if (idx >= SHMEM_MAX_INDEX) + if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) return -EFBIG; repeat: - page = find_lock_page(mapping, idx); - if (page) { + swap.val = 0; + page = find_lock_page(mapping, index); + if (radix_tree_exceptional_entry(page)) { + swap = radix_to_swp_entry(page); + page = NULL; + } + + if (sgp != SGP_WRITE && + ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + error = -EINVAL; + goto failed; + } + + if (page || (sgp == SGP_READ && !swap.val)) { /* * Once we can get the page lock, it must be uptodate: * if there were an error in reading back from swap, * the page would not be inserted into the filecache. */ - BUG_ON(!PageUptodate(page)); - goto done; + BUG_ON(page && !PageUptodate(page)); + *pagep = page; + return 0; } /* - * Try to preload while we can wait, to not make a habit of - * draining atomic reserves; but don't latch on to this cpu. + * Fast cache lookup did not find it: + * bring it back from swap or allocate. */ - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); - if (error) - goto out; - radix_tree_preload_end(); - - if (sgp != SGP_READ && !prealloc_page) { - prealloc_page = shmem_alloc_page(gfp, info, idx); - if (prealloc_page) { - SetPageSwapBacked(prealloc_page); - if (mem_cgroup_cache_charge(prealloc_page, - current->mm, GFP_KERNEL)) { - page_cache_release(prealloc_page); - prealloc_page = NULL; - } - } - } - - spin_lock(&info->lock); - shmem_recalc_inode(inode); - entry = shmem_swp_alloc(info, idx, sgp, gfp); - if (IS_ERR(entry)) { - spin_unlock(&info->lock); - error = PTR_ERR(entry); - goto out; - } - swap = *entry; + info = SHMEM_I(inode); + sbinfo = SHMEM_SB(inode->i_sb); if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); if (!page) { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); /* here we actually do the io */ if (fault_type) *fault_type |= VM_FAULT_MAJOR; - page = shmem_swapin(swap, gfp, info, idx); + page = shmem_swapin(swap, gfp, info, index); if (!page) { - spin_lock(&info->lock); - entry = shmem_swp_alloc(info, idx, sgp, gfp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - if (entry->val == swap.val) - error = -ENOMEM; - shmem_swp_unmap(entry); - } - spin_unlock(&info->lock); - if (error) - goto out; - goto repeat; + error = -ENOMEM; + goto failed; } - wait_on_page_locked(page); - page_cache_release(page); - goto repeat; } /* We have to do this with page locked to prevent races */ - if (!trylock_page(page)) { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - wait_on_page_locked(page); - page_cache_release(page); - goto repeat; - } - if (PageWriteback(page)) { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - wait_on_page_writeback(page); - unlock_page(page); - page_cache_release(page); - goto repeat; - } + lock_page(page); if (!PageUptodate(page)) { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - unlock_page(page); - page_cache_release(page); error = -EIO; - goto out; + goto failed; } - - error = add_to_page_cache_locked(page, mapping, - idx, GFP_NOWAIT); - if (error) { - shmem_swp_unmap(entry); - spin_unlock(&info->lock); - if (error == -ENOMEM) { - /* - * reclaim from proper memory cgroup and - * call memcg's OOM if needed. - */ - error = mem_cgroup_shmem_charge_fallback( - page, current->mm, gfp); - if (error) { - unlock_page(page); - page_cache_release(page); - goto out; - } - } - unlock_page(page); - page_cache_release(page); - goto repeat; + wait_on_page_writeback(page); + + /* Someone may have already done it for us */ + if (page->mapping) { + if (page->mapping == mapping && + page->index == index) + goto done; + error = -EEXIST; + goto failed; } - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(page); + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (!error) + error = shmem_add_to_page_cache(page, mapping, index, + gfp, swp_to_radix_entry(swap)); + if (error) + goto failed; + + spin_lock(&info->lock); + info->swapped--; + shmem_recalc_inode(inode); spin_unlock(&info->lock); + + delete_from_swap_cache(page); set_page_dirty(page); swap_free(swap); - } else if (sgp == SGP_READ) { - shmem_swp_unmap(entry); - page = find_get_page(mapping, idx); - if (page && !trylock_page(page)) { - spin_unlock(&info->lock); - wait_on_page_locked(page); - page_cache_release(page); - goto repeat; + } else { + if (shmem_acct_block(info->flags)) { + error = -ENOSPC; + goto failed; } - spin_unlock(&info->lock); - - } else if (prealloc_page) { - shmem_swp_unmap(entry); - sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { if (percpu_counter_compare(&sbinfo->used_blocks, - sbinfo->max_blocks) >= 0 || - shmem_acct_block(info->flags)) - goto nospace; + sbinfo->max_blocks) >= 0) { + error = -ENOSPC; + goto unacct; + } percpu_counter_inc(&sbinfo->used_blocks); - inode->i_blocks += BLOCKS_PER_PAGE; - } else if (shmem_acct_block(info->flags)) - goto nospace; - - page = prealloc_page; - prealloc_page = NULL; - - entry = shmem_swp_alloc(info, idx, sgp, gfp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - swap = *entry; - shmem_swp_unmap(entry); } - ret = error || swap.val; - if (ret) - mem_cgroup_uncharge_cache_page(page); - else - ret = add_to_page_cache_lru(page, mapping, - idx, GFP_NOWAIT); - /* - * At add_to_page_cache_lru() failure, - * uncharge will be done automatically. - */ - if (ret) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - spin_unlock(&info->lock); - page_cache_release(page); - if (error) - goto out; - goto repeat; + + page = shmem_alloc_page(gfp, info, index); + if (!page) { + error = -ENOMEM; + goto decused; } - info->flags |= SHMEM_PAGEIN; + SetPageSwapBacked(page); + __set_page_locked(page); + error = mem_cgroup_cache_charge(page, current->mm, + gfp & GFP_RECLAIM_MASK); + if (!error) + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); + if (error) + goto decused; + lru_cache_add_anon(page); + + spin_lock(&info->lock); info->alloced++; + inode->i_blocks += BLOCKS_PER_PAGE; + shmem_recalc_inode(inode); spin_unlock(&info->lock); + clear_highpage(page); flush_dcache_page(page); SetPageUptodate(page); if (sgp == SGP_DIRTY) set_page_dirty(page); - - } else { - spin_unlock(&info->lock); - error = -ENOMEM; - goto out; } done: - *pagep = page; - error = 0; -out: - if (prealloc_page) { - mem_cgroup_uncharge_cache_page(prealloc_page); - page_cache_release(prealloc_page); + /* Perhaps the file has been truncated since we checked */ + if (sgp != SGP_WRITE && + ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { + error = -EINVAL; + goto trunc; } - return error; + *pagep = page; + return 0; -nospace: /* - * Perhaps the page was brought in from swap between find_lock_page - * and taking info->lock? We allow for that at add_to_page_cache_lru, - * but must also avoid reporting a spurious ENOSPC while working on a - * full tmpfs. + * Error recovery. */ - page = find_get_page(mapping, idx); +trunc: + ClearPageDirty(page); + delete_from_page_cache(page); + spin_lock(&info->lock); + info->alloced--; + inode->i_blocks -= BLOCKS_PER_PAGE; spin_unlock(&info->lock); +decused: + if (sbinfo->max_blocks) + percpu_counter_add(&sbinfo->used_blocks, -1); +unacct: + shmem_unacct_blocks(info->flags, 1); +failed: + if (swap.val && error != -EINVAL) { + struct page *test = find_get_page(mapping, index); + if (test && !radix_tree_exceptional_entry(test)) + page_cache_release(test); + /* Have another try if the entry has changed */ + if (test != swp_to_radix_entry(swap)) + error = -EEXIST; + } if (page) { + unlock_page(page); page_cache_release(page); + } + if (error == -ENOSPC && !once++) { + info = SHMEM_I(inode); + spin_lock(&info->lock); + shmem_recalc_inode(inode); + spin_unlock(&info->lock); goto repeat; } - error = -ENOSPC; - goto out; + if (error == -EEXIST) + goto repeat; + return error; } static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -1467,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) int error; int ret = VM_FAULT_LOCKED; - if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - return VM_FAULT_SIGBUS; - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); @@ -1482,20 +1034,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } #ifdef CONFIG_NUMA -static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - unsigned long idx; + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + pgoff_t index; - idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); + index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; + return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } #endif @@ -1593,7 +1145,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; -static const struct inode_operations shmem_symlink_inline_operations; +static const struct inode_operations shmem_short_symlink_operations; static int shmem_write_begin(struct file *file, struct address_space *mapping, @@ -1626,7 +1178,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ { struct inode *inode = filp->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; - unsigned long index, offset; + pgoff_t index; + unsigned long offset; enum sgp_type sgp = SGP_READ; /* @@ -1642,7 +1195,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ for (;;) { struct page *page = NULL; - unsigned long end_index, nr, ret; + pgoff_t end_index; + unsigned long nr, ret; loff_t i_size = i_size_read(inode); end_index = i_size >> PAGE_CACHE_SHIFT; @@ -1880,8 +1434,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = NAME_MAX; if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = - sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); + buf->f_bavail = + buf->f_bfree = sbinfo->max_blocks - + percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; @@ -2055,10 +1610,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; - if (len <= SHMEM_SYMLINK_INLINE_LEN) { - /* do it inline */ - memcpy(info->inline_symlink, symname, len); - inode->i_op = &shmem_symlink_inline_operations; + if (len <= SHORT_SYMLINK_LEN) { + info->symlink = kmemdup(symname, len, GFP_KERNEL); + if (!info->symlink) { + iput(inode); + return -ENOMEM; + } + inode->i_op = &shmem_short_symlink_operations; } else { error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); if (error) { @@ -2081,17 +1639,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s return 0; } -static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) +static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) { - nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); + nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); return NULL; } static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) { struct page *page = NULL; - int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); - nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); + int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); + nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); if (page) unlock_page(page); return page; @@ -2202,7 +1760,6 @@ out: return err; } - static const struct xattr_handler *shmem_xattr_handlers[] = { #ifdef CONFIG_TMPFS_POSIX_ACL &generic_acl_access_handler, @@ -2332,9 +1889,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) } #endif /* CONFIG_TMPFS_XATTR */ -static const struct inode_operations shmem_symlink_inline_operations = { +static const struct inode_operations shmem_short_symlink_operations = { .readlink = generic_readlink, - .follow_link = shmem_follow_link_inline, + .follow_link = shmem_follow_short_symlink, #ifdef CONFIG_TMPFS_XATTR .setxattr = shmem_setxattr, .getxattr = shmem_getxattr, @@ -2534,8 +2091,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) if (config.max_inodes < inodes) goto out; /* - * Those tests also disallow limited->unlimited while any are in - * use, so i_blocks will always be zero when max_blocks is zero; + * Those tests disallow limited->unlimited while any are in use; * but we must separately disallow unlimited->limited, because * in that case we have no record of how much is already in use. */ @@ -2627,7 +2183,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) goto failed; sbinfo->free_inodes = sbinfo->max_inodes; - sb->s_maxbytes = SHMEM_MAX_BYTES; + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = TMPFS_MAGIC; @@ -2662,14 +2218,14 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { - struct shmem_inode_info *p; - p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); - if (!p) + struct shmem_inode_info *info; + info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + if (!info) return NULL; - return &p->vfs_inode; + return &info->vfs_inode; } -static void shmem_i_callback(struct rcu_head *head) +static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); INIT_LIST_HEAD(&inode->i_dentry); @@ -2678,29 +2234,26 @@ static void shmem_i_callback(struct rcu_head *head) static void shmem_destroy_inode(struct inode *inode) { - if ((inode->i_mode & S_IFMT) == S_IFREG) { - /* only struct inode is valid if it's an inline symlink */ + if ((inode->i_mode & S_IFMT) == S_IFREG) mpol_free_shared_policy(&SHMEM_I(inode)->policy); - } - call_rcu(&inode->i_rcu, shmem_i_callback); + call_rcu(&inode->i_rcu, shmem_destroy_callback); } -static void init_once(void *foo) +static void shmem_init_inode(void *foo) { - struct shmem_inode_info *p = (struct shmem_inode_info *) foo; - - inode_init_once(&p->vfs_inode); + struct shmem_inode_info *info = foo; + inode_init_once(&info->vfs_inode); } -static int init_inodecache(void) +static int shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), - 0, SLAB_PANIC, init_once); + 0, SLAB_PANIC, shmem_init_inode); return 0; } -static void destroy_inodecache(void) +static void shmem_destroy_inodecache(void) { kmem_cache_destroy(shmem_inode_cachep); } @@ -2797,21 +2350,20 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; - static struct dentry *shmem_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_nodev(fs_type, flags, data, shmem_fill_super); } -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", .mount = shmem_mount, .kill_sb = kill_litter_super, }; -int __init init_tmpfs(void) +int __init shmem_init(void) { int error; @@ -2819,18 +2371,18 @@ int __init init_tmpfs(void) if (error) goto out4; - error = init_inodecache(); + error = shmem_init_inodecache(); if (error) goto out3; - error = register_filesystem(&tmpfs_fs_type); + error = register_filesystem(&shmem_fs_type); if (error) { printk(KERN_ERR "Could not register tmpfs\n"); goto out2; } - shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, - tmpfs_fs_type.name, NULL); + shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER, + shmem_fs_type.name, NULL); if (IS_ERR(shm_mnt)) { error = PTR_ERR(shm_mnt); printk(KERN_ERR "Could not kern_mount tmpfs\n"); @@ -2839,9 +2391,9 @@ int __init init_tmpfs(void) return 0; out1: - unregister_filesystem(&tmpfs_fs_type); + unregister_filesystem(&shmem_fs_type); out2: - destroy_inodecache(); + shmem_destroy_inodecache(); out3: bdi_destroy(&shmem_backing_dev_info); out4: @@ -2849,45 +2401,6 @@ out4: return error; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file - * @inode: the inode to be searched - * @pgoff: the offset to be searched - * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent) -{ - swp_entry_t entry = { .val = 0 }, *ptr; - struct page *page = NULL; - struct shmem_inode_info *info = SHMEM_I(inode); - - if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - goto out; - - spin_lock(&info->lock); - ptr = shmem_swp_entry(info, pgoff, NULL); -#ifdef CONFIG_SWAP - if (ptr && ptr->val) { - entry.val = ptr->val; - page = find_get_page(&swapper_space, entry.val); - } else -#endif - page = find_get_page(inode->i_mapping, pgoff); - if (ptr) - shmem_swp_unmap(ptr); - spin_unlock(&info->lock); -out: - *pagep = page; - *ent = entry; -} -#endif - #else /* !CONFIG_SHMEM */ /* @@ -2901,23 +2414,23 @@ out: #include <linux/ramfs.h> -static struct file_system_type tmpfs_fs_type = { +static struct file_system_type shmem_fs_type = { .name = "tmpfs", .mount = ramfs_mount, .kill_sb = kill_litter_super, }; -int __init init_tmpfs(void) +int __init shmem_init(void) { - BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); + BUG_ON(register_filesystem(&shmem_fs_type) != 0); - shm_mnt = kern_mount(&tmpfs_fs_type); + shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); return 0; } -int shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t swap, struct page *page) { return 0; } @@ -2927,43 +2440,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) return 0; } -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { - truncate_inode_pages_range(inode->i_mapping, start, end); + truncate_inode_pages_range(inode->i_mapping, lstart, lend); } EXPORT_SYMBOL_GPL(shmem_truncate_range); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR -/** - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file - * @inode: the inode to be searched - * @pgoff: the offset to be searched - * @pagep: the pointer for the found page to be stored - * @ent: the pointer for the found swap entry to be stored - * - * If a page is found, refcount of it is incremented. Callers should handle - * these refcount. - */ -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, - struct page **pagep, swp_entry_t *ent) -{ - struct page *page = NULL; - - if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) - goto out; - page = find_get_page(inode->i_mapping, pgoff); -out: - *pagep = page; - *ent = (swp_entry_t){ .val = 0 }; -} -#endif - #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) -#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE #endif /* CONFIG_SHMEM */ @@ -2987,7 +2474,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags if (IS_ERR(shm_mnt)) return (void *)shm_mnt; - if (size < 0 || size > SHMEM_MAX_BYTES) + if (size < 0 || size > MAX_LFS_FILESIZE) return ERR_PTR(-EINVAL); if (shmem_acct_size(flags, size)) diff --git a/mm/slab.c b/mm/slab.c index 1e523ed47c61..6d90a091fdca 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -622,6 +622,51 @@ int slab_is_available(void) static struct lock_class_key on_slab_l3_key; static struct lock_class_key on_slab_alc_key; +static struct lock_class_key debugobj_l3_key; +static struct lock_class_key debugobj_alc_key; + +static void slab_set_lock_classes(struct kmem_cache *cachep, + struct lock_class_key *l3_key, struct lock_class_key *alc_key, + int q) +{ + struct array_cache **alc; + struct kmem_list3 *l3; + int r; + + l3 = cachep->nodelists[q]; + if (!l3) + return; + + lockdep_set_class(&l3->list_lock, l3_key); + alc = l3->alien; + /* + * FIXME: This check for BAD_ALIEN_MAGIC + * should go away when common slab code is taught to + * work even without alien caches. + * Currently, non NUMA code returns BAD_ALIEN_MAGIC + * for alloc_alien_cache, + */ + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + return; + for_each_node(r) { + if (alc[r]) + lockdep_set_class(&alc[r]->lock, alc_key); + } +} + +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +{ + slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); +} + +static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) +{ + int node; + + for_each_online_node(node) + slab_set_debugobj_lock_classes_node(cachep, node); +} + static void init_node_lock_keys(int q) { struct cache_sizes *s = malloc_sizes; @@ -630,29 +675,14 @@ static void init_node_lock_keys(int q) return; for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { - struct array_cache **alc; struct kmem_list3 *l3; - int r; l3 = s->cs_cachep->nodelists[q]; if (!l3 || OFF_SLAB(s->cs_cachep)) continue; - lockdep_set_class(&l3->list_lock, &on_slab_l3_key); - alc = l3->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - continue; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, - &on_slab_alc_key); - } + + slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, + &on_slab_alc_key, q); } } @@ -671,6 +701,14 @@ static void init_node_lock_keys(int q) static inline void init_lock_keys(void) { } + +static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) +{ +} + +static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) +{ +} #endif /* @@ -1264,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu) spin_unlock_irq(&l3->list_lock); kfree(shared); free_alien_cache(alien); + if (cachep->flags & SLAB_DEBUG_OBJECTS) + slab_set_debugobj_lock_classes_node(cachep, node); } init_node_lock_keys(node); @@ -1626,6 +1666,9 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* 6) resize the head arrays to their final sizes */ mutex_lock(&cache_chain_mutex); list_for_each_entry(cachep, &cache_chain, next) @@ -1636,9 +1679,6 @@ void __init kmem_cache_init_late(void) /* Done! */ g_cpucache_up = FULL; - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* * Register a cpu startup notifier callback that initializes * cpu_cache_get for all new cpus @@ -2426,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align, goto oops; } + if (flags & SLAB_DEBUG_OBJECTS) { + /* + * Would deadlock through slab_destroy()->call_rcu()-> + * debug_object_activate()->kmem_cache_alloc(). + */ + WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); + + slab_set_debugobj_lock_classes(cachep); + } + /* cache setup completed, link it into the list */ list_add(&cachep->next, &cache_chain); oops: @@ -3403,7 +3453,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - if (nodeid == -1) + if (nodeid == NUMA_NO_NODE) nodeid = slab_node; if (unlikely(!cachep->nodelists[nodeid])) { @@ -3934,7 +3984,7 @@ fail: struct ccupdate_struct { struct kmem_cache *cachep; - struct array_cache *new[NR_CPUS]; + struct array_cache *new[0]; }; static void do_ccupdate_local(void *info) @@ -3956,7 +4006,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct ccupdate_struct *new; int i; - new = kzalloc(sizeof(*new), gfp); + new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), + gfp); if (!new) return -ENOMEM; diff --git a/mm/slub.c b/mm/slub.c index f8f5e8efeb88..9f662d70eb47 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2,10 +2,11 @@ * SLUB: A slab allocator that limits cache line use instead of queuing * objects in per cpu and per node lists. * - * The allocator synchronizes using per slab locks and only - * uses a centralized lock to manage a pool of partial slabs. + * The allocator synchronizes using per slab locks or atomic operatios + * and only uses a centralized lock to manage a pool of partial slabs. * * (C) 2007 SGI, Christoph Lameter + * (C) 2011 Linux Foundation, Christoph Lameter */ #include <linux/mm.h> @@ -33,15 +34,27 @@ /* * Lock order: - * 1. slab_lock(page) - * 2. slab->list_lock + * 1. slub_lock (Global Semaphore) + * 2. node->list_lock + * 3. slab_lock(page) (Only on some arches and for debugging) * - * The slab_lock protects operations on the object of a particular - * slab and its metadata in the page struct. If the slab lock - * has been taken then no allocations nor frees can be performed - * on the objects in the slab nor can the slab be added or removed - * from the partial or full lists since this would mean modifying - * the page_struct of the slab. + * slub_lock + * + * The role of the slub_lock is to protect the list of all the slabs + * and to synchronize major metadata changes to slab cache structures. + * + * The slab_lock is only used for debugging and on arches that do not + * have the ability to do a cmpxchg_double. It only protects the second + * double word in the page struct. Meaning + * A. page->freelist -> List of object free in a page + * B. page->counters -> Counters of objects + * C. page->frozen -> frozen state + * + * If a slab is frozen then it is exempt from list management. It is not + * on any list. The processor that froze the slab is the one who can + * perform list operations on the page. Other processors may put objects + * onto the freelist but the processor that froze the slab is the only + * one that can retrieve the objects from the page's freelist. * * The list_lock protects the partial and full list on each node and * the partial slab counter. If taken then no new slabs may be added or @@ -54,20 +67,6 @@ * slabs, operations can continue without any centralized lock. F.e. * allocating a long series of objects that fill up slabs does not require * the list lock. - * - * The lock order is sometimes inverted when we are trying to get a slab - * off a list. We take the list_lock and then look for a page on the list - * to use. While we do that objects in the slabs may be freed. We can - * only operate on the slab if we have also taken the slab_lock. So we use - * a slab_trylock() on the slab. If trylock was successful then no frees - * can occur anymore and we can use the slab for allocations etc. If the - * slab_trylock() does not succeed then frees are in progress in the slab and - * we must stay away from it for a while since we may cause a bouncing - * cacheline if we try to acquire the lock. So go onto the next slab. - * If all pages are busy then we may allocate a new slab instead of reusing - * a partial slab. A new slab has no one operating on it and thus there is - * no danger of cacheline contention. - * * Interrupts are disabled during allocation and deallocation in order to * make the slab allocator safe to use in the context of an irq. In addition * interrupts are disabled to ensure that the processor does not change @@ -132,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +/* Enable to log cmpxchg failures */ +#undef SLUB_DEBUG_CMPXCHG + /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -167,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) -#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ +#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000UL /* Poison object */ +#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ static int kmem_size = sizeof(struct kmem_cache); @@ -343,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x) return x.x & OO_MASK; } +/* + * Per slab locking using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} + +static __always_inline void slab_unlock(struct page *page) +{ + __bit_spin_unlock(PG_locked, &page->flags); +} + +/* Interrupts must be disabled (for the fallback code to work right) */ +static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ + VM_BUG_ON(!irqs_disabled()); +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + slab_lock(page); + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + slab_unlock(page); + return 1; + } + slab_unlock(page); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + +static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, + void *freelist_old, unsigned long counters_old, + void *freelist_new, unsigned long counters_new, + const char *n) +{ +#ifdef CONFIG_CMPXCHG_DOUBLE + if (s->flags & __CMPXCHG_DOUBLE) { + if (cmpxchg_double(&page->freelist, + freelist_old, counters_old, + freelist_new, counters_new)) + return 1; + } else +#endif + { + unsigned long flags; + + local_irq_save(flags); + slab_lock(page); + if (page->freelist == freelist_old && page->counters == counters_old) { + page->freelist = freelist_new; + page->counters = counters_new; + slab_unlock(page); + local_irq_restore(flags); + return 1; + } + slab_unlock(page); + local_irq_restore(flags); + } + + cpu_relax(); + stat(s, CMPXCHG_DOUBLE_FAIL); + +#ifdef SLUB_DEBUG_CMPXCHG + printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); +#endif + + return 0; +} + #ifdef CONFIG_SLUB_DEBUG /* * Determine a map of object in use on a page. * - * Slab lock or node listlock must be held to guarantee that the page does + * Node listlock must be held to guarantee that the page does * not vanish from under us. */ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) @@ -610,7 +701,7 @@ static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) return check_bytes8(start, value, bytes); value64 = value | value << 8 | value << 16 | value << 24; - value64 = value64 | value64 << 32; + value64 = (value64 & 0xffffffff) | value64 << 32; prefix = 8 - ((unsigned long)start) % 8; if (prefix) { @@ -838,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { int nr = 0; - void *fp = page->freelist; + void *fp; void *object = NULL; unsigned long max_objects; + fp = page->freelist; while (fp && nr <= page->objects) { if (fp == search) return 1; @@ -946,26 +1038,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Tracking of fully allocated slabs for debugging purposes. + * + * list_lock must be held. */ -static void add_full(struct kmem_cache_node *n, struct page *page) +static void add_full(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { - spin_lock(&n->list_lock); + if (!(s->flags & SLAB_STORE_USER)) + return; + list_add(&page->lru, &n->full); - spin_unlock(&n->list_lock); } +/* + * list_lock must be held. + */ static void remove_full(struct kmem_cache *s, struct page *page) { - struct kmem_cache_node *n; - if (!(s->flags & SLAB_STORE_USER)) return; - n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); list_del(&page->lru); - spin_unlock(&n->list_lock); } /* Tracking of the number of slabs for debugging purposes */ @@ -1021,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa if (!check_slab(s, page)) goto bad; - if (!on_freelist(s, page, object)) { - object_err(s, page, object, "Object already allocated"); - goto bad; - } - if (!check_valid_pointer(s, page, object)) { object_err(s, page, object, "Freelist Pointer check fails"); goto bad; @@ -1058,6 +1146,12 @@ bad: static noinline int free_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { + unsigned long flags; + int rc = 0; + + local_irq_save(flags); + slab_lock(page); + if (!check_slab(s, page)) goto fail; @@ -1072,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, } if (!check_object(s, page, object, SLUB_RED_ACTIVE)) - return 0; + goto out; if (unlikely(s != page->slab)) { if (!PageSlab(page)) { @@ -1089,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, goto fail; } - /* Special debug activities for freeing objects */ - if (!PageSlubFrozen(page) && !page->freelist) - remove_full(s, page); if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); init_object(s, object, SLUB_RED_INACTIVE); - return 1; + rc = 1; +out: + slab_unlock(page); + local_irq_restore(flags); + return rc; fail: slab_fix(s, "Object at 0x%p not freed", object); - return 0; + goto out; } static int __init setup_slub_debug(char *str) @@ -1200,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } -static inline void add_full(struct kmem_cache_node *n, struct page *page) {} +static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +static inline void remove_full(struct kmem_cache *s, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, void (*ctor)(void *)) @@ -1252,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + flags &= gfp_allowed_mask; + + if (flags & __GFP_WAIT) + local_irq_enable(); + flags |= s->allocflags; /* @@ -1268,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Try a lower order alloc if possible */ page = alloc_slab_page(flags, node, oo); - if (!page) - return NULL; - stat(s, ORDER_FALLBACK); + if (page) + stat(s, ORDER_FALLBACK); } + if (flags & __GFP_WAIT) + local_irq_disable(); + + if (!page) + return NULL; + if (kmemcheck_enabled && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); @@ -1341,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) page->freelist = start; page->inuse = 0; + page->frozen = 1; out: return page; } @@ -1418,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page) } /* - * Per slab locking using the pagelock - */ -static __always_inline void slab_lock(struct page *page) -{ - bit_spin_lock(PG_locked, &page->flags); -} - -static __always_inline void slab_unlock(struct page *page) -{ - __bit_spin_unlock(PG_locked, &page->flags); -} - -static __always_inline int slab_trylock(struct page *page) -{ - int rc = 1; - - rc = bit_spin_trylock(PG_locked, &page->flags); - return rc; -} - -/* - * Management of partially allocated slabs + * Management of partially allocated slabs. + * + * list_lock must be held. */ -static void add_partial(struct kmem_cache_node *n, +static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { - spin_lock(&n->list_lock); n->nr_partial++; if (tail) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); - spin_unlock(&n->list_lock); } -static inline void __remove_partial(struct kmem_cache_node *n, +/* + * list_lock must be held. + */ +static inline void remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } -static void remove_partial(struct kmem_cache *s, struct page *page) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - spin_lock(&n->list_lock); - __remove_partial(n, page); - spin_unlock(&n->list_lock); -} - /* - * Lock slab and remove from the partial list. + * Lock slab, remove from the partial list and put the object into the + * per cpu freelist. * * Must hold list_lock. */ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, - struct page *page) +static inline int acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page) { - if (slab_trylock(page)) { - __remove_partial(n, page); - __SetPageSlubFrozen(page); + void *freelist; + unsigned long counters; + struct page new; + + /* + * Zap the freelist and set the frozen bit. + * The old freelist is the list of objects for the + * per cpu allocation list. + */ + do { + freelist = page->freelist; + counters = page->counters; + new.counters = counters; + new.inuse = page->objects; + + VM_BUG_ON(new.frozen); + new.frozen = 1; + + } while (!__cmpxchg_double_slab(s, page, + freelist, counters, + NULL, new.counters, + "lock and freeze")); + + remove_partial(n, page); + + if (freelist) { + /* Populate the per cpu freelist */ + this_cpu_write(s->cpu_slab->freelist, freelist); + this_cpu_write(s->cpu_slab->page, page); + this_cpu_write(s->cpu_slab->node, page_to_nid(page)); return 1; + } else { + /* + * Slab page came from the wrong list. No object to allocate + * from. Put it onto the correct list and continue partial + * scan. + */ + printk(KERN_ERR "SLUB: %s : Page without available objects on" + " partial list\n", s->name); + return 0; } - return 0; } /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache_node *n) +static struct page *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n) { struct page *page; @@ -1503,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n) spin_lock(&n->list_lock); list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(n, page)) + if (acquire_slab(s, n, page)) goto out; page = NULL; out: @@ -1554,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(n); + page = get_partial_node(s, n); if (page) { put_mems_allowed(); return page; @@ -1574,60 +1692,13 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) struct page *page; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(get_node(s, searchnode)); + page = get_partial_node(s, get_node(s, searchnode)); if (page || node != NUMA_NO_NODE) return page; return get_any_partial(s, flags); } -/* - * Move a page back to the lists. - * - * Must be called with the slab lock held. - * - * On exit the slab lock will have been dropped. - */ -static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) - __releases(bitlock) -{ - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - - __ClearPageSlubFrozen(page); - if (page->inuse) { - - if (page->freelist) { - add_partial(n, page, tail); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); - } else { - stat(s, DEACTIVATE_FULL); - if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); - } - slab_unlock(page); - } else { - stat(s, DEACTIVATE_EMPTY); - if (n->nr_partial < s->min_partial) { - /* - * Adding an empty slab to the partial slabs in order - * to avoid page allocator overhead. This slab needs - * to come after the other slabs with objects in - * so that the others get filled first. That way the - * size of the partial list stays small. - * - * kmem_cache_shrink can reclaim any empty slabs from - * the partial list. - */ - add_partial(n, page, 1); - slab_unlock(page); - } else { - slab_unlock(page); - stat(s, FREE_SLAB); - discard_slab(s, page); - } - } -} - #ifdef CONFIG_PREEMPT /* * Calculate the next globally unique transaction for disambiguiation @@ -1697,42 +1768,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s) /* * Remove the cpu slab */ + +/* + * Remove the cpu slab + */ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) - __releases(bitlock) { + enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct page *page = c->page; - int tail = 1; - - if (page->freelist) + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + int lock = 0; + enum slab_modes l = M_NONE, m = M_NONE; + void *freelist; + void *nextfree; + int tail = 0; + struct page new; + struct page old; + + if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); + tail = 1; + } + + c->tid = next_tid(c->tid); + c->page = NULL; + freelist = c->freelist; + c->freelist = NULL; + /* - * Merge cpu freelist into slab freelist. Typically we get here - * because both freelists are empty. So this is unlikely - * to occur. + * Stage one: Free all available per cpu objects back + * to the page freelist while it is still frozen. Leave the + * last one. + * + * There is no need to take the list->lock because the page + * is still frozen. + */ + while (freelist && (nextfree = get_freepointer(s, freelist))) { + void *prior; + unsigned long counters; + + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, freelist, prior); + new.counters = counters; + new.inuse--; + VM_BUG_ON(!new.frozen); + + } while (!__cmpxchg_double_slab(s, page, + prior, counters, + freelist, new.counters, + "drain percpu freelist")); + + freelist = nextfree; + } + + /* + * Stage two: Ensure that the page is unfrozen while the + * list presence reflects the actual number of objects + * during unfreeze. + * + * We setup the list membership and then perform a cmpxchg + * with the count. If there is a mismatch then the page + * is not unfrozen but the page is on the wrong list. + * + * Then we restart the process which may have to remove + * the page from the list that we just put it on again + * because the number of objects in the slab may have + * changed. */ - while (unlikely(c->freelist)) { - void **object; +redo: - tail = 0; /* Hot objects. Put the slab first */ + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); - /* Retrieve object from cpu_freelist */ - object = c->freelist; - c->freelist = get_freepointer(s, c->freelist); + /* Determine target state of the slab */ + new.counters = old.counters; + if (freelist) { + new.inuse--; + set_freepointer(s, freelist, old.freelist); + new.freelist = freelist; + } else + new.freelist = old.freelist; - /* And put onto the regular freelist */ - set_freepointer(s, object, page->freelist); - page->freelist = object; - page->inuse--; + new.frozen = 0; + + if (!new.inuse && n->nr_partial > s->min_partial) + m = M_FREE; + else if (new.freelist) { + m = M_PARTIAL; + if (!lock) { + lock = 1; + /* + * Taking the spinlock removes the possiblity + * that acquire_slab() will see a slab page that + * is frozen + */ + spin_lock(&n->list_lock); + } + } else { + m = M_FULL; + if (kmem_cache_debug(s) && !lock) { + lock = 1; + /* + * This also ensures that the scanning of full + * slabs from diagnostic functions will not see + * any frozen slabs. + */ + spin_lock(&n->list_lock); + } + } + + if (l != m) { + + if (l == M_PARTIAL) + + remove_partial(n, page); + + else if (l == M_FULL) + + remove_full(s, page); + + if (m == M_PARTIAL) { + + add_partial(n, page, tail); + stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + + } else if (m == M_FULL) { + + stat(s, DEACTIVATE_FULL); + add_full(s, n, page); + + } + } + + l = m; + if (!__cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")) + goto redo; + + if (lock) + spin_unlock(&n->list_lock); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); } - c->page = NULL; - c->tid = next_tid(c->tid); - unfreeze_slab(s, page, tail); } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); - slab_lock(c->page); deactivate_slab(s, c); } @@ -1861,6 +2051,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void **object; struct page *page; unsigned long flags; + struct page new; + unsigned long counters; local_irq_save(flags); #ifdef CONFIG_PREEMPT @@ -1879,72 +2071,97 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (!page) goto new_slab; - slab_lock(page); - if (unlikely(!node_match(c, node))) - goto another_slab; + if (unlikely(!node_match(c, node))) { + stat(s, ALLOC_NODE_MISMATCH); + deactivate_slab(s, c); + goto new_slab; + } + + stat(s, ALLOC_SLOWPATH); + + do { + object = page->freelist; + counters = page->counters; + new.counters = counters; + VM_BUG_ON(!new.frozen); + + /* + * If there is no object left then we use this loop to + * deactivate the slab which is simple since no objects + * are left in the slab and therefore we do not need to + * put the page back onto the partial list. + * + * If there are objects left then we retrieve them + * and use them to refill the per cpu queue. + */ + + new.inuse = page->objects; + new.frozen = object != NULL; + + } while (!__cmpxchg_double_slab(s, page, + object, counters, + NULL, new.counters, + "__slab_alloc")); + + if (unlikely(!object)) { + c->page = NULL; + stat(s, DEACTIVATE_BYPASS); + goto new_slab; + } stat(s, ALLOC_REFILL); load_freelist: - object = page->freelist; - if (unlikely(!object)) - goto another_slab; - if (kmem_cache_debug(s)) - goto debug; - + VM_BUG_ON(!page->frozen); c->freelist = get_freepointer(s, object); - page->inuse = page->objects; - page->freelist = NULL; - - slab_unlock(page); c->tid = next_tid(c->tid); local_irq_restore(flags); - stat(s, ALLOC_SLOWPATH); return object; -another_slab: - deactivate_slab(s, c); - new_slab: page = get_partial(s, gfpflags, node); if (page) { stat(s, ALLOC_FROM_PARTIAL); - c->node = page_to_nid(page); - c->page = page; + object = c->freelist; + + if (kmem_cache_debug(s)) + goto debug; goto load_freelist; } - gfpflags &= gfp_allowed_mask; - if (gfpflags & __GFP_WAIT) - local_irq_enable(); - page = new_slab(s, gfpflags, node); - if (gfpflags & __GFP_WAIT) - local_irq_disable(); - if (page) { c = __this_cpu_ptr(s->cpu_slab); - stat(s, ALLOC_SLAB); if (c->page) flush_slab(s, c); - slab_lock(page); - __SetPageSlubFrozen(page); + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + page->inuse = page->objects; + + stat(s, ALLOC_SLAB); c->node = page_to_nid(page); c->page = page; + + if (kmem_cache_debug(s)) + goto debug; goto load_freelist; } if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); local_irq_restore(flags); return NULL; + debug: - if (!alloc_debug_processing(s, page, object, addr)) - goto another_slab; + if (!object || !alloc_debug_processing(s, page, object, addr)) + goto new_slab; - page->inuse++; - page->freelist = get_freepointer(s, object); + c->freelist = get_freepointer(s, object); deactivate_slab(s, c); c->page = NULL; c->node = NUMA_NO_NODE; @@ -2096,52 +2313,89 @@ static void __slab_free(struct kmem_cache *s, struct page *page, { void *prior; void **object = (void *)x; - unsigned long flags; + int was_frozen; + int inuse; + struct page new; + unsigned long counters; + struct kmem_cache_node *n = NULL; + unsigned long uninitialized_var(flags); - local_irq_save(flags); - slab_lock(page); stat(s, FREE_SLOWPATH); if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) - goto out_unlock; + return; - prior = page->freelist; - set_freepointer(s, object, prior); - page->freelist = object; - page->inuse--; + do { + prior = page->freelist; + counters = page->counters; + set_freepointer(s, object, prior); + new.counters = counters; + was_frozen = new.frozen; + new.inuse--; + if ((!new.inuse || !prior) && !was_frozen && !n) { + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + } + inuse = new.inuse; - if (unlikely(PageSlubFrozen(page))) { - stat(s, FREE_FROZEN); - goto out_unlock; - } + } while (!cmpxchg_double_slab(s, page, + prior, counters, + object, new.counters, + "__slab_free")); - if (unlikely(!page->inuse)) - goto slab_empty; + if (likely(!n)) { + /* + * The list lock was not taken therefore no list + * activity can be necessary. + */ + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } /* - * Objects left in the slab. If it was not on the partial list before - * then add it. + * was_frozen may have been set after we acquired the list_lock in + * an earlier loop. So we need to check it here again. */ - if (unlikely(!prior)) { - add_partial(get_node(s, page_to_nid(page)), page, 1); - stat(s, FREE_ADD_PARTIAL); - } + if (was_frozen) + stat(s, FREE_FROZEN); + else { + if (unlikely(!inuse && n->nr_partial > s->min_partial)) + goto slab_empty; -out_unlock: - slab_unlock(page); - local_irq_restore(flags); + /* + * Objects left in the slab. If it was not on the partial list before + * then add it. + */ + if (unlikely(!prior)) { + remove_full(s, page); + add_partial(n, page, 0); + stat(s, FREE_ADD_PARTIAL); + } + } + spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: if (prior) { /* - * Slab still on the partial list. + * Slab on the partial list. */ - remove_partial(s, page); + remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } - slab_unlock(page); - local_irq_restore(flags); + } else + /* Slab must be on the full list */ + remove_full(s, page); + + spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } @@ -2415,7 +2669,6 @@ static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; - unsigned long flags; BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); @@ -2433,6 +2686,7 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); page->inuse++; + page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); @@ -2441,14 +2695,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - /* - * lockdep requires consistent irq usage for each lock - * so even though there cannot be a race this early in - * the boot sequence, we still disable irqs. - */ - local_irq_save(flags); add_partial(n, page, 0); - local_irq_restore(flags); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2654,6 +2901,12 @@ static int kmem_cache_open(struct kmem_cache *s, } } +#ifdef CONFIG_CMPXCHG_DOUBLE + if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) + /* Enable fast mode */ + s->flags |= __CMPXCHG_DOUBLE; +#endif + /* * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. @@ -2726,7 +2979,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - __remove_partial(n, page); + remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, @@ -3094,14 +3347,8 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse && slab_trylock(page)) { - /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. - */ - __remove_partial(n, page); - slab_unlock(page); + if (!page->inuse) { + remove_partial(n, page); discard_slab(s, page); } else { list_move(&page->lru, @@ -3689,12 +3936,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, static void validate_slab_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { - if (slab_trylock(page)) { - validate_slab(s, page, map); - slab_unlock(page); - } else - printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", - s->name, page); + slab_lock(page); + validate_slab(s, page, map); + slab_unlock(page); } static int validate_slab_node(struct kmem_cache *s, @@ -4342,8 +4586,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_DEBUG_FREE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_DEBUG_FREE; + } return length; } SLAB_ATTR(sanity_checks); @@ -4357,8 +4603,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { s->flags &= ~SLAB_TRACE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_TRACE; + } return length; } SLAB_ATTR(trace); @@ -4375,8 +4623,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_RED_ZONE; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_RED_ZONE; + } calculate_sizes(s, -1); return length; } @@ -4394,8 +4644,10 @@ static ssize_t poison_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_POISON; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_POISON; + } calculate_sizes(s, -1); return length; } @@ -4413,8 +4665,10 @@ static ssize_t store_user_store(struct kmem_cache *s, return -EBUSY; s->flags &= ~SLAB_STORE_USER; - if (buf[0] == '1') + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; s->flags |= SLAB_STORE_USER; + } calculate_sizes(s, -1); return length; } @@ -4579,6 +4833,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); STAT_ATTR(ALLOC_SLAB, alloc_slab); STAT_ATTR(ALLOC_REFILL, alloc_refill); +STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); STAT_ATTR(FREE_SLAB, free_slab); STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); STAT_ATTR(DEACTIVATE_FULL, deactivate_full); @@ -4586,7 +4841,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); +STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); +STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); +STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); #endif static struct attribute *slab_attrs[] = { @@ -4636,6 +4894,7 @@ static struct attribute *slab_attrs[] = { &alloc_from_partial_attr.attr, &alloc_slab_attr.attr, &alloc_refill_attr.attr, + &alloc_node_mismatch_attr.attr, &free_slab_attr.attr, &cpuslab_flush_attr.attr, &deactivate_full_attr.attr, @@ -4643,7 +4902,10 @@ static struct attribute *slab_attrs[] = { &deactivate_to_head_attr.attr, &deactivate_to_tail_attr.attr, &deactivate_remote_frees_attr.attr, + &deactivate_bypass_attr.attr, &order_fallback_attr.attr, + &cmpxchg_double_fail_attr.attr, + &cmpxchg_double_cpu_fail_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, diff --git a/mm/swapfile.c b/mm/swapfile.c index 1b8c33907242..17bc224bce68 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1924,20 +1924,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p, /* * Find out how many pages are allowed for a single swap - * device. There are two limiting factors: 1) the number of - * bits for the swap offset in the swp_entry_t type and - * 2) the number of bits in the a swap pte as defined by - * the different architectures. In order to find the - * largest possible bit mask a swap entry with swap type 0 + * device. There are three limiting factors: 1) the number + * of bits for the swap offset in the swp_entry_t type, and + * 2) the number of bits in the swap pte as defined by the + * the different architectures, and 3) the number of free bits + * in an exceptional radix_tree entry. In order to find the + * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, - * decoded to a swp_entry_t again and finally the swap + * decoded to a swp_entry_t again, and finally the swap * offset is extracted. This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a - * swap pte. + * swap pte. Then the same is done for a radix_tree entry. */ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + swp_entry_to_pte(swp_entry(0, ~0UL)))); + maxpages = swp_offset(radix_to_swp_entry( + swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; + if (maxpages > swap_header->info.last_page) { maxpages = swap_header->info.last_page + 1; /* p->max is an unsigned int: don't overflow it */ diff --git a/mm/truncate.c b/mm/truncate.c index 232eb2736a79..b40ac6d4e86e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; + /* + * Note: this function may get called on a shmem/tmpfs mapping: + * pagevec_lookup() might then return 0 prematurely (because it + * got a gangful of swap entries); but it's hardly worth worrying + * about - it can rarely have anything to free from such a mapping + * (most pages are dirty), and already skips over any difficulties. + */ + pagevec_init(&pvec, 0); while (index <= end && pagevec_lookup(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 464621d18eb2..7ef0903058ee 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ -#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ - VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ - VMALLOC_PAGES / NR_CPUS / 16)) +#define VMAP_BBMAP_BITS \ + VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ef69124fa3e..b7719ec10dc5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2283,7 +2283,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .mem_cgroup = mem, .memcg_record = rec, }; - unsigned long start, end; + ktime_t start, end; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2292,7 +2292,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); - start = sched_clock(); + start = ktime_get(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2301,10 +2301,10 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. */ shrink_zone(0, zone, &sc); - end = sched_clock(); + end = ktime_get(); if (rec) - rec->elapsed += end - start; + rec->elapsed += ktime_to_ns(ktime_sub(end, start)); *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2319,7 +2319,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, { struct zonelist *zonelist; unsigned long nr_reclaimed; - unsigned long start, end; + ktime_t start, end; int nid; struct scan_control sc = { .may_writepage = !laptop_mode, @@ -2337,7 +2337,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .gfp_mask = sc.gfp_mask, }; - start = sched_clock(); + start = ktime_get(); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -2352,9 +2352,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); - end = sched_clock(); + end = ktime_get(); if (rec) - rec->elapsed += end - start; + rec->elapsed += ktime_to_ns(ktime_sub(end, start)); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2529,6 +2529,9 @@ loop_again: high_wmark_pages(zone), 0, 0)) { end_zone = i; break; + } else { + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); } } if (i < 0) |