Diffstat (limited to 'mm')
 mm/hugetlb.c        |  14
 mm/madvise.c        |  21
 mm/memory.c         |  15
 mm/memory_hotplug.c |   1
 mm/mempolicy.c      | 198
 mm/nommu.c          |  10
 mm/oom_kill.c       | 124
 mm/page_alloc.c     |  73
 mm/rmap.c           |  21
 mm/shmem.c          |  81
 mm/slab.c           | 306
 mm/slob.c           |   2
 mm/swap.c           |  60
 mm/vmscan.c         | 137
 14 files changed, 762 insertions(+), 301 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b21d78c941b5..508707704d2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
BUG_ON(page_count(page));
INIT_LIST_HEAD(&page->lru);
- page[1].mapping = NULL;
+ page[1].lru.next = NULL; /* reset dtor */
spin_lock(&hugetlb_lock);
enqueue_huge_page(page);
@@ -105,9 +105,9 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
}
spin_unlock(&hugetlb_lock);
set_page_count(page, 1);
- page[1].mapping = (void *)free_huge_page;
+ page[1].lru.next = (void *)free_huge_page; /* set dtor */
for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
- clear_highpage(&page[i]);
+ clear_user_highpage(&page[i], addr);
return page;
}
@@ -391,12 +391,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
if (!new_page) {
page_cache_release(old_page);
-
- /* Logically this is OOM, not a SIGBUS, but an OOM
- * could cause the kernel to go killing other
- * processes which won't help the hugepage situation
- * at all (?) */
- return VM_FAULT_SIGBUS;
+ return VM_FAULT_OOM;
}
spin_unlock(&mm->page_table_lock);
@@ -444,6 +439,7 @@ retry:
page = alloc_huge_page(vma, address);
if (!page) {
hugetlb_put_quota(mapping);
+ ret = VM_FAULT_OOM;
goto out;
}
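
The destructor convention used here (stash a function pointer in the first tail page's ->lru.next) is the same one mm/page_alloc.c and mm/swap.c switch to further down. A minimal sketch of a caller, assuming only that page[1] is the first tail page of a compound allocation; the names are illustrative, not taken from the patch:

	/* Sketch: invoke the compound-page destructor installed by the
	 * allocator in the first tail page's lru.next (compare
	 * put_compound_page() in mm/swap.c below). Assumes <linux/mm.h>.
	 */
	static void call_compound_dtor(struct page *head)
	{
		void (*dtor)(struct page *);

		dtor = (void (*)(struct page *))head[1].lru.next;
		if (dtor)
			(*dtor)(head);
	}
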
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299a..af3d573b0141 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
struct mm_struct * mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
- int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+ int new_flags = vma->vm_flags;
switch (behavior) {
+ case MADV_NORMAL:
+ new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+ break;
case MADV_SEQUENTIAL:
- new_flags |= VM_SEQ_READ;
+ new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
break;
case MADV_RANDOM:
- new_flags |= VM_RAND_READ;
+ new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
break;
- default:
+ case MADV_DONTFORK:
+ new_flags |= VM_DONTCOPY;
+ break;
+ case MADV_DOFORK:
+ new_flags &= ~VM_DONTCOPY;
break;
}
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
long error;
switch (behavior) {
+ case MADV_DOFORK:
+ if (vma->vm_flags & VM_IO) {
+ error = -EINVAL;
+ break;
+ }
+ case MADV_DONTFORK:
case MADV_NORMAL:
case MADV_SEQUENTIAL:
case MADV_RANDOM:
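
From userspace the new hints are reached through the existing madvise(2) call. A minimal sketch, assuming the MADV_DONTFORK constant (value 10 in this series) in case the installed headers do not define it yet:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	#ifndef MADV_DONTFORK
	#define MADV_DONTFORK	10	/* assumed value; check <asm/mman.h> */
	#endif

	int main(void)
	{
		size_t len = 4 * 4096;
		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* Ask the kernel to leave this range out of children on fork(). */
		if (madvise(buf, len, MADV_DONTFORK) != 0)
			perror("madvise(MADV_DONTFORK)");
		munmap(buf, len);
		return 0;
	}

Per the madvise_vma() hunk above, MADV_DOFORK is refused with -EINVAL on VM_IO mappings, so copying cannot be re-enabled on device mappings.
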
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8a..85e80a57db29 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
EXPORT_SYMBOL(vmalloc_earlyreserve);
+int randomize_va_space __read_mostly = 1;
+
+static int __init disable_randmaps(char *s)
+{
+ randomize_va_space = 0;
+ return 0;
+}
+__setup("norandmaps", disable_randmaps);
+
+
/*
* If a p?d_bad entry is found while walking page tables, report
* the error, before resetting entry to p?d_none. Usually (but
@@ -613,11 +623,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
(*zap_work)--;
continue;
}
+
+ (*zap_work) -= PAGE_SIZE;
+
if (pte_present(ptent)) {
struct page *page;
- (*zap_work) -= PAGE_SIZE;
-
page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77f02f3..1fe76d963ac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
onlined_pages++;
}
zone->present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += onlined_pages;
setup_per_zone_pages_min();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 27da6d5c77ba..b21869a39f0b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
}
return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
+
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
- int num, max, nd;
+ int num, max, nd, k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
- zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+ zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return NULL;
num = 0;
- for_each_node_mask(nd, *nodes)
- zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+ /* First put in the highest zones from all nodes, then all the next
+ lower zones etc. Avoid empty zones because the memory allocator
+ doesn't like them. If you implement node hot removal you
+ have to fix that. */
+ for (k = policy_zone; k >= 0; k--) {
+ for_each_node_mask(nd, *nodes) {
+ struct zone *z = &NODE_DATA(nd)->node_zones[k];
+ if (z->present_pages > 0)
+ zl->zones[num++] = z;
+ }
+ }
zl->zones[num] = NULL;
return zl;
}
@@ -187,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
return policy;
}
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
@@ -229,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
continue;
if (flags & MPOL_MF_STATS)
- gather_stats(page, private);
+ gather_stats(page, private, pte_dirty(*pte));
else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, private, flags);
else
@@ -320,9 +330,19 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
int err;
struct vm_area_struct *first, *vma, *prev;
- /* Clear the LRU lists so pages can be isolated */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ /* Must have swap device for migration */
+ if (nr_swap_pages <= 0)
+ return ERR_PTR(-ENODEV);
+
+ /*
+ * Clear the LRU lists so pages can be isolated.
+ * Note that pages may be moved off the LRU after we have
+ * drained them. Those pages will fail to migrate like other
+ * pages that may be busy.
+ */
lru_add_drain_all();
+ }
first = find_vma(mm, start);
if (!first)
@@ -542,7 +562,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
*/
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (isolate_lru_page(page))
- list_add(&page->lru, pagelist);
+ list_add_tail(&page->lru, pagelist);
}
}
@@ -559,6 +579,7 @@ static int migrate_pages_to(struct list_head *pagelist,
LIST_HEAD(moved);
LIST_HEAD(failed);
int err = 0;
+ unsigned long offset = 0;
int nr_pages;
struct page *page;
struct list_head *p;
@@ -566,8 +587,21 @@ static int migrate_pages_to(struct list_head *pagelist,
redo:
nr_pages = 0;
list_for_each(p, pagelist) {
- if (vma)
- page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+ if (vma) {
+ /*
+ * The address passed to alloc_page_vma is used to
+ * generate the proper interleave behavior. We fake
+ * the address here by an increasing offset in order
+ * to get the proper distribution of pages.
+ *
+ * No decision has been made as to which page
+ * a certain old page is moved to so we cannot
+ * specify the correct address.
+ */
+ page = alloc_page_vma(GFP_HIGHUSER, vma,
+ offset + vma->vm_start);
+ offset += PAGE_SIZE;
+ }
else
page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
@@ -575,9 +609,9 @@ redo:
err = -ENOMEM;
goto out;
}
- list_add(&page->lru, &newlist);
+ list_add_tail(&page->lru, &newlist);
nr_pages++;
- if (nr_pages > MIGRATE_CHUNK_SIZE);
+ if (nr_pages > MIGRATE_CHUNK_SIZE)
break;
}
err = migrate_pages(pagelist, &newlist, &moved, &failed);
@@ -724,7 +758,7 @@ long do_mbind(unsigned long start, unsigned long len,
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
|| mode > MPOL_MAX)
return -EINVAL;
- if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (start & ~PAGE_MASK)
@@ -798,6 +832,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
+ if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+ return -EINVAL;
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
@@ -916,19 +952,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
*/
if ((current->euid != task->suid) && (current->euid != task->uid) &&
(current->uid != task->suid) && (current->uid != task->uid) &&
- !capable(CAP_SYS_ADMIN)) {
+ !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+ if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
- err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+ err = do_migrate_pages(mm, &old, &new,
+ capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
mmput(mm);
return err;
@@ -1159,6 +1196,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
return interleave_nodes(pol);
}
+#ifdef CONFIG_HUGETLBFS
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
@@ -1172,6 +1210,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
}
return zonelist_policy(GFP_HIGHUSER, pol);
}
+#endif
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
@@ -1724,66 +1763,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
struct numa_maps {
unsigned long pages;
unsigned long anon;
- unsigned long mapped;
+ unsigned long active;
+ unsigned long writeback;
unsigned long mapcount_max;
+ unsigned long dirty;
+ unsigned long swapcache;
unsigned long node[MAX_NUMNODES];
};
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
{
struct numa_maps *md = private;
int count = page_mapcount(page);
- if (count)
- md->mapped++;
+ md->pages++;
+ if (pte_dirty || PageDirty(page))
+ md->dirty++;
- if (count > md->mapcount_max)
- md->mapcount_max = count;
+ if (PageSwapCache(page))
+ md->swapcache++;
- md->pages++;
+ if (PageActive(page))
+ md->active++;
+
+ if (PageWriteback(page))
+ md->writeback++;
if (PageAnon(page))
md->anon++;
+ if (count > md->mapcount_max)
+ md->mapcount_max = count;
+
md->node[page_to_nid(page)]++;
cond_resched();
}
+#ifdef CONFIG_HUGETLB_PAGE
+static void check_huge_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct numa_maps *md)
+{
+ unsigned long addr;
+ struct page *page;
+
+ for (addr = start; addr < end; addr += HPAGE_SIZE) {
+ pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+ pte_t pte;
+
+ if (!ptep)
+ continue;
+
+ pte = *ptep;
+ if (pte_none(pte))
+ continue;
+
+ page = pte_page(pte);
+ if (!page)
+ continue;
+
+ gather_stats(page, md, pte_dirty(*ptep));
+ }
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct numa_maps *md)
+{
+}
+#endif
+
int show_numa_map(struct seq_file *m, void *v)
{
struct task_struct *task = m->private;
struct vm_area_struct *vma = v;
struct numa_maps *md;
+ struct file *file = vma->vm_file;
+ struct mm_struct *mm = vma->vm_mm;
int n;
char buffer[50];
- if (!vma->vm_mm)
+ if (!mm)
return 0;
md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
if (!md)
return 0;
- check_pgd_range(vma, vma->vm_start, vma->vm_end,
- &node_online_map, MPOL_MF_STATS, md);
+ mpol_to_str(buffer, sizeof(buffer),
+ get_vma_policy(task, vma, vma->vm_start));
- if (md->pages) {
- mpol_to_str(buffer, sizeof(buffer),
- get_vma_policy(task, vma, vma->vm_start));
+ seq_printf(m, "%08lx %s", vma->vm_start, buffer);
- seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
- vma->vm_start, buffer, md->pages,
- md->mapped, md->mapcount_max);
+ if (file) {
+ seq_printf(m, " file=");
+ seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+ } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_printf(m, " heap");
+ } else if (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack) {
+ seq_printf(m, " stack");
+ }
+
+ if (is_vm_hugetlb_page(vma)) {
+ check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+ seq_printf(m, " huge");
+ } else {
+ check_pgd_range(vma, vma->vm_start, vma->vm_end,
+ &node_online_map, MPOL_MF_STATS, md);
+ }
- if (md->anon)
- seq_printf(m," anon=%lu",md->anon);
+ if (!md->pages)
+ goto out;
- for_each_online_node(n)
- if (md->node[n])
- seq_printf(m, " N%d=%lu", n, md->node[n]);
+ if (md->anon)
+ seq_printf(m," anon=%lu",md->anon);
- seq_putc(m, '\n');
- }
+ if (md->dirty)
+ seq_printf(m," dirty=%lu",md->dirty);
+
+ if (md->pages != md->anon && md->pages != md->dirty)
+ seq_printf(m, " mapped=%lu", md->pages);
+
+ if (md->mapcount_max > 1)
+ seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+ if (md->swapcache)
+ seq_printf(m," swapcache=%lu", md->swapcache);
+
+ if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+ seq_printf(m," active=%lu", md->active);
+
+ if (md->writeback)
+ seq_printf(m," writeback=%lu", md->writeback);
+
+ for_each_online_node(n)
+ if (md->node[n])
+ seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+ seq_putc(m, '\n');
kfree(md);
if (m->count < m->size)
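
The page-migration entry points touched above (mbind() with MPOL_MF_MOVE, sys_migrate_pages) are driven from userspace roughly as in the sketch below, which assumes libnuma's <numaif.h> for the mbind(2) wrapper and the MPOL_* constants (link with -lnuma):

	#define _GNU_SOURCE
	#include <numaif.h>		/* mbind(), MPOL_*, MPOL_MF_* */
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		unsigned long nodemask = 1UL << 0;	/* node 0 only */
		size_t len = 1 << 20;
		void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;
		memset(buf, 0, len);	/* fault the pages in somewhere */
		/*
		 * Bind the range to node 0 and migrate pages mapped only by
		 * this process (MPOL_MF_MOVE). MPOL_MF_MOVE_ALL, like
		 * sys_migrate_pages on another user's task, now wants
		 * CAP_SYS_NICE rather than CAP_SYS_ADMIN/CAP_SYS_RESOURCE.
		 */
		if (mbind(buf, len, MPOL_BIND, &nodemask,
			  sizeof(nodemask) * 8, MPOL_MF_MOVE) != 0)
			perror("mbind");
		return 0;
	}
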
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d68232..4951f4786f28 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -53,10 +53,11 @@ DECLARE_RWSEM(nommu_vma_sem);
struct vm_operations_struct generic_file_vm_ops = {
};
-EXPORT_SYMBOL(vmalloc);
EXPORT_SYMBOL(vfree);
EXPORT_SYMBOL(vmalloc_to_page);
EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
/*
* Handle all mappings that got truncated by a "truncate()"
@@ -203,6 +204,13 @@ void *vmalloc(unsigned long size)
{
return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
}
+EXPORT_SYMBOL(vmalloc);
+
+void *vmalloc_node(unsigned long size, int node)
+{
+ return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
/*
* vmalloc_32 - allocate virtually continguos memory (32bit addressable)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b05ab8f2a562..78747afad6b0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
/*
* Processes which fork a lot of child processes are likely
- * a good choice. We add the vmsize of the children if they
+ * a good choice. We add half the vmsize of the children if they
* have an own mm. This prevents forking servers to flood the
- * machine with an endless amount of children
+ * machine with an endless amount of children. In case a single
+ * child is eating the vast majority of memory, adding only half
+ * to the parents will make the child our kill candidate of choice.
*/
list_for_each(tsk, &p->children) {
struct task_struct *chld;
chld = list_entry(tsk, struct task_struct, sibling);
if (chld->mm != p->mm && chld->mm)
- points += chld->mm->total_vm;
+ points += chld->mm->total_vm/2 + 1;
}
/*
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
}
/*
+ * Types of limitations to the nodes from which allocations may occur
+ */
+#define CONSTRAINT_NONE 1
+#define CONSTRAINT_MEMORY_POLICY 2
+#define CONSTRAINT_CPUSET 3
+
+/*
+ * Determine the type of allocation constraint.
+ */
+static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NUMA
+ struct zone **z;
+ nodemask_t nodes = node_online_map;
+
+ for (z = zonelist->zones; *z; z++)
+ if (cpuset_zone_allowed(*z, gfp_mask))
+ node_clear((*z)->zone_pgdat->node_id,
+ nodes);
+ else
+ return CONSTRAINT_CPUSET;
+
+ if (!nodes_empty(nodes))
+ return CONSTRAINT_MEMORY_POLICY;
+#endif
+
+ return CONSTRAINT_NONE;
+}
+
+/*
* Simple selection loop. We chose the process with the highest
* number of 'points'. We expect the caller will lock the tasklist.
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct * select_bad_process(void)
+static struct task_struct *select_bad_process(unsigned long *ppoints)
{
- unsigned long maxpoints = 0;
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
struct timespec uptime;
+ *ppoints = 0;
do_posix_clock_monotonic_gettime(&uptime);
do_each_thread(g, p) {
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void)
return p;
points = badness(p, uptime.tv_sec);
- if (points > maxpoints || !chosen) {
+ if (points > *ppoints || !chosen) {
chosen = p;
- maxpoints = points;
+ *ppoints = points;
}
} while_each_thread(g, p);
return chosen;
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void)
* CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
* we select a process with CAP_SYS_RAW_IO set).
*/
-static void __oom_kill_task(task_t *p)
+static void __oom_kill_task(task_t *p, const char *message)
{
if (p->pid == 1) {
WARN_ON(1);
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p)
return;
}
task_unlock(p);
- printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
- p->pid, p->comm);
+ printk(KERN_ERR "%s: Killed process %d (%s).\n",
+ message, p->pid, p->comm);
/*
* We give our sacrificial lamb high priority and access to
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p)
force_sig(SIGKILL, p);
}
-static struct mm_struct *oom_kill_task(task_t *p)
+static struct mm_struct *oom_kill_task(task_t *p, const char *message)
{
struct mm_struct *mm = get_task_mm(p);
task_t * g, * q;
@@ -224,35 +256,38 @@ static struct mm_struct *oom_kill_task(task_t *p)
return NULL;
}
- __oom_kill_task(p);
+ __oom_kill_task(p, message);
/*
* kill all processes that share the ->mm (i.e. all threads),
* but are in a different thread group
*/
do_each_thread(g, q)
if (q->mm == mm && q->tgid != p->tgid)
- __oom_kill_task(q);
+ __oom_kill_task(q, message);
while_each_thread(g, q);
return mm;
}
-static struct mm_struct *oom_kill_process(struct task_struct *p)
+static struct mm_struct *oom_kill_process(struct task_struct *p,
+ unsigned long points, const char *message)
{
struct mm_struct *mm;
struct task_struct *c;
struct list_head *tsk;
+ printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
+ "children.\n", p->pid, p->comm, points);
/* Try to kill a child first */
list_for_each(tsk, &p->children) {
c = list_entry(tsk, struct task_struct, sibling);
if (c->mm == p->mm)
continue;
- mm = oom_kill_task(c);
+ mm = oom_kill_task(c, message);
if (mm)
return mm;
}
- return oom_kill_task(p);
+ return oom_kill_task(p, message);
}
/**
@@ -263,10 +298,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
{
struct mm_struct *mm = NULL;
- task_t * p;
+ task_t *p;
+ unsigned long points = 0;
if (printk_ratelimit()) {
printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -277,24 +313,48 @@ void out_of_memory(gfp_t gfp_mask, int order)
cpuset_lock();
read_lock(&tasklist_lock);
+
+ /*
+ * Check if there were limitations on the allocation (only relevant for
+ * NUMA) that may require different handling.
+ */
+ switch (constrained_alloc(zonelist, gfp_mask)) {
+ case CONSTRAINT_MEMORY_POLICY:
+ mm = oom_kill_process(current, points,
+ "No available memory (MPOL_BIND)");
+ break;
+
+ case CONSTRAINT_CPUSET:
+ mm = oom_kill_process(current, points,
+ "No available memory in cpuset");
+ break;
+
+ case CONSTRAINT_NONE:
retry:
- p = select_bad_process();
+ /*
+ * Rambo mode: Shoot down a process and hope it solves whatever
+ * issues we may have.
+ */
+ p = select_bad_process(&points);
- if (PTR_ERR(p) == -1UL)
- goto out;
+ if (PTR_ERR(p) == -1UL)
+ goto out;
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- read_unlock(&tasklist_lock);
- cpuset_unlock();
- panic("Out of memory and no killable processes...\n");
- }
+ /* Found nothing?!?! Either we hang forever, or we panic. */
+ if (!p) {
+ read_unlock(&tasklist_lock);
+ cpuset_unlock();
+ panic("Out of memory and no killable processes...\n");
+ }
- mm = oom_kill_process(p);
- if (!mm)
- goto retry;
+ mm = oom_kill_process(p, points, "Out of memory");
+ if (!mm)
+ goto retry;
+
+ break;
+ }
- out:
+out:
read_unlock(&tasklist_lock);
cpuset_unlock();
if (mm)
@@ -305,5 +365,5 @@ retry:
* retry to allocate memory unless "p" is current
*/
if (!test_thread_flag(TIF_MEMDIE))
- schedule_timeout_interruptible(1);
+ schedule_timeout_uninterruptible(1);
}
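
To see why halving the children's contribution in badness() changes the victim: with a parent whose own total_vm is 10000 pages and a single runaway child at 50000 pages, the old scoring gave the parent 10000 + 50000 = 60000 points against the child's 50000, so the parent was selected; with points += total_vm/2 + 1 the parent scores 10000 + 25001 = 35001, below the child's 50000, and the memory hog itself becomes the kill candidate (runtime and nice adjustments ignored for the arithmetic).
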
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44b4eb4202d9..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
int percpu_pagelist_fraction;
static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
* All pages have PG_compound set. All pages have their ->private pointing at
* the head page (even the head page has this).
*
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present. This usage means that zero-order pages
- * may not be compound.
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function. Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
*/
+
+static void free_compound_page(struct page *page)
+{
+ __free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
+
static void prep_compound_page(struct page *page, unsigned long order)
{
int i;
int nr_pages = 1 << order;
- page[1].mapping = NULL;
- page[1].index = order;
+ page[1].lru.next = (void *)free_compound_page; /* set dtor */
+ page[1].lru.prev = (void *)order;
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
int i;
int nr_pages = 1 << order;
- if (unlikely(page[1].index != order))
+ if (unlikely((unsigned long)page[1].lru.prev != order))
bad_page(page);
for (i = 0; i < nr_pages; i++) {
@@ -586,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
}
#ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
-void drain_remote_pages(void)
+/*
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
{
- struct zone *zone;
- int i;
+ int i, z;
unsigned long flags;
local_irq_save(flags);
- for_each_zone(zone) {
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
struct per_cpu_pageset *pset;
- /* Do not drain local pagesets */
- if (zone->zone_pgdat->node_id == numa_node_id())
- continue;
-
pset = zone_pcp(zone, smp_processor_id());
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
@@ -1011,7 +1014,7 @@ rebalance:
if (page)
goto got_pg;
- out_of_memory(gfp_mask, order);
+ out_of_memory(zonelist, gfp_mask, order);
goto restart;
}
@@ -1213,18 +1216,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
{
int cpu = 0;
- memset(ret, 0, sizeof(*ret));
+ memset(ret, 0, nr * sizeof(unsigned long));
cpus_and(*cpumask, *cpumask, cpu_online_map);
cpu = first_cpu(*cpumask);
while (cpu < NR_CPUS) {
unsigned long *in, *out, off;
+ if (!cpu_isset(cpu, *cpumask))
+ continue;
+
in = (unsigned long *)&per_cpu(page_states, cpu);
cpu = next_cpu(cpu, *cpumask);
- if (cpu < NR_CPUS)
+ if (likely(cpu < NR_CPUS))
prefetch(&per_cpu(page_states, cpu));
out = (unsigned long *)ret;
@@ -1534,29 +1540,29 @@ static int __initdata node_load[MAX_NUMNODES];
*/
static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
{
- int i, n, val;
+ int n, val;
int min_val = INT_MAX;
int best_node = -1;
- for_each_online_node(i) {
- cpumask_t tmp;
+ /* Use the local node if we haven't already */
+ if (!node_isset(node, *used_node_mask)) {
+ node_set(node, *used_node_mask);
+ return node;
+ }
- /* Start from local node */
- n = (node+i) % num_online_nodes();
+ for_each_online_node(n) {
+ cpumask_t tmp;
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
continue;
- /* Use the local node if we haven't already */
- if (!node_isset(node, *used_node_mask)) {
- best_node = node;
- break;
- }
-
/* Use the distance array to find the distance */
val = node_distance(node, n);
+ /* Penalize nodes under us ("prefer the next node") */
+ val += (n < node);
+
/* Give preference to headless and unused nodes */
tmp = node_to_cpumask(n);
if (!cpus_empty(tmp))
@@ -1886,8 +1892,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static struct per_cpu_pageset
- boot_pageset[NR_CPUS];
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
/*
* Dynamically allocate memory for the
diff --git a/mm/rmap.c b/mm/rmap.c
index df2c41c2a9a2..67f0e20b101f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -212,25 +212,33 @@ out:
* through real pte's pointing to valid pages and then releasing
* the page from the swap cache.
*
- * Must hold page lock on page.
+ * Must hold page lock on page and mmap_sem of one vma that contains
+ * the page.
*/
void remove_from_swap(struct page *page)
{
struct anon_vma *anon_vma;
struct vm_area_struct *vma;
+ unsigned long mapping;
- if (!PageAnon(page) || !PageSwapCache(page))
+ if (!PageSwapCache(page))
return;
- anon_vma = page_lock_anon_vma(page);
- if (!anon_vma)
+ mapping = (unsigned long)page->mapping;
+
+ if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
return;
+ /*
+ * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+ */
+ anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+ spin_lock(&anon_vma->lock);
+
list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
remove_vma_swap(vma, page);
spin_unlock(&anon_vma->lock);
-
delete_from_swap_cache(page);
}
EXPORT_SYMBOL(remove_from_swap);
@@ -529,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page,
*/
void page_add_file_rmap(struct page *page)
{
- BUG_ON(PageAnon(page));
- BUG_ON(!pfn_valid(page_to_pfn(page)));
-
if (atomic_inc_and_test(&page->_mapcount))
__inc_page_state(nr_mapped);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index f7ac7b812f92..7c455fbaff7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
+#include <linux/ctype.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
#include <asm/pgtable.h>
@@ -874,6 +875,51 @@ redirty:
}
#ifdef CONFIG_NUMA
+static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+ char *nodelist = strchr(value, ':');
+ int err = 1;
+
+ if (nodelist) {
+ /* NUL-terminate policy string */
+ *nodelist++ = '\0';
+ if (nodelist_parse(nodelist, *policy_nodes))
+ goto out;
+ }
+ if (!strcmp(value, "default")) {
+ *policy = MPOL_DEFAULT;
+ /* Don't allow a nodelist */
+ if (!nodelist)
+ err = 0;
+ } else if (!strcmp(value, "prefer")) {
+ *policy = MPOL_PREFERRED;
+ /* Insist on a nodelist of one node only */
+ if (nodelist) {
+ char *rest = nodelist;
+ while (isdigit(*rest))
+ rest++;
+ if (!*rest)
+ err = 0;
+ }
+ } else if (!strcmp(value, "bind")) {
+ *policy = MPOL_BIND;
+ /* Insist on a nodelist */
+ if (nodelist)
+ err = 0;
+ } else if (!strcmp(value, "interleave")) {
+ *policy = MPOL_INTERLEAVE;
+ /* Default to nodes online if no nodelist */
+ if (!nodelist)
+ *policy_nodes = node_online_map;
+ err = 0;
+ }
+out:
+ /* Restore string for error message */
+ if (nodelist)
+ *--nodelist = ':';
+ return err;
+}
+
static struct page *shmem_swapin_async(struct shared_policy *p,
swp_entry_t entry, unsigned long idx)
{
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
return page;
}
#else
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+ return 1;
+}
+
static inline struct page *
shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
{
@@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
{
char *this_char, *value, *rest;
- while ((this_char = strsep(&options, ",")) != NULL) {
+ while (options != NULL) {
+ this_char = options;
+ for (;;) {
+ /*
+ * NUL-terminate this option: unfortunately,
+ * mount options form a comma-separated list,
+ * but mpol's nodelist may also contain commas.
+ */
+ options = strchr(options, ',');
+ if (options == NULL)
+ break;
+ options++;
+ if (!isdigit(*options)) {
+ options[-1] = '\0';
+ break;
+ }
+ }
if (!*this_char)
continue;
if ((value = strchr(this_char,'=')) != NULL) {
@@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
- if (!strcmp(value,"default"))
- *policy = MPOL_DEFAULT;
- else if (!strcmp(value,"preferred"))
- *policy = MPOL_PREFERRED;
- else if (!strcmp(value,"bind"))
- *policy = MPOL_BIND;
- else if (!strcmp(value,"interleave"))
- *policy = MPOL_INTERLEAVE;
- else
+ if (shmem_parse_mpol(value,policy,policy_nodes))
goto bad_val;
- } else if (!strcmp(this_char,"mpol_nodelist")) {
- nodelist_parse(value, *policy_nodes);
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
this_char);
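
In practice the combined option replaces the old mpol=/mpol_nodelist= pair, so a tmpfs mount interleaving across nodes 0-2 would now be requested with something like "mount -t tmpfs -o size=64m,mpol=interleave:0-2 tmpfs /mnt". The comma-aware loop above exists precisely so that a nodelist's own commas (e.g. mpol=bind:0,2) are not split off as separate mount options.
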
diff --git a/mm/slab.c b/mm/slab.c
index 71370256a7eb..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -294,6 +294,7 @@ struct kmem_list3 {
unsigned long next_reap;
int free_touched;
unsigned int free_limit;
+ unsigned int colour_next; /* Per-node cache coloring */
spinlock_t list_lock;
struct array_cache *shared; /* shared per node */
struct array_cache **alien; /* on other nodes */
@@ -344,6 +345,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
INIT_LIST_HEAD(&parent->slabs_free);
parent->shared = NULL;
parent->alien = NULL;
+ parent->colour_next = 0;
spin_lock_init(&parent->list_lock);
parent->free_objects = 0;
parent->free_touched = 0;
@@ -390,7 +392,6 @@ struct kmem_cache {
size_t colour; /* cache colouring range */
unsigned int colour_off; /* colour offset */
- unsigned int colour_next; /* cache colouring */
struct kmem_cache *slabp_cache;
unsigned int slab_size;
unsigned int dflags; /* dynamic flags */
@@ -788,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
dump_stack();
}
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+ int node;
+
+ node = next_node(cpu_to_node(cpu), node_online_map);
+ if (node == MAX_NUMNODES)
+ node = 0;
+
+ __get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+ int node = __get_cpu_var(reap_node);
+
+ /*
+ * Also drain per cpu pages on remote zones
+ */
+ if (node != numa_node_id())
+ drain_node_pages(node);
+
+ node = next_node(node, node_online_map);
+ if (unlikely(node >= MAX_NUMNODES))
+ node = first_node(node_online_map);
+ __get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
/*
* Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
* via the workqueue/eventd.
@@ -805,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
* at that time.
*/
if (keventd_up() && reap_work->func == NULL) {
+ init_reap_node(cpu);
INIT_WORK(reap_work, cache_reap, NULL);
schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
}
@@ -883,14 +926,31 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
}
}
-static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+ int node = __get_cpu_var(reap_node);
+
+ if (l3->alien) {
+ struct array_cache *ac = l3->alien[node];
+ if (ac && ac->avail) {
+ spin_lock_irq(&ac->lock);
+ __drain_alien_cache(cachep, ac, node);
+ spin_unlock_irq(&ac->lock);
+ }
+ }
+}
+
+static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
{
int i = 0;
struct array_cache *ac;
unsigned long flags;
for_each_online_node(i) {
- ac = l3->alien[i];
+ ac = alien[i];
if (ac) {
spin_lock_irqsave(&ac->lock, flags);
__drain_alien_cache(cachep, ac, i);
@@ -899,9 +959,19 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct kmem_list3 *l3)
}
}
#else
-#define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
+
+#define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
+
+static inline struct array_cache **alloc_alien_cache(int node, int limit)
+{
+ return (struct array_cache **) 0x01020304ul;
+}
+
+static inline void free_alien_cache(struct array_cache **ac_ptr)
+{
+}
+
#endif
static int __devinit cpuup_callback(struct notifier_block *nfb,
@@ -935,6 +1005,11 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+ /*
+ * The l3s don't come and go as CPUs come and
+ * go. cache_chain_mutex is sufficient
+ * protection here.
+ */
cachep->nodelists[node] = l3;
}
@@ -949,26 +1024,46 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
& array cache's */
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ struct array_cache *shared;
+ struct array_cache **alien;
nc = alloc_arraycache(node, cachep->limit,
- cachep->batchcount);
+ cachep->batchcount);
if (!nc)
goto bad;
+ shared = alloc_arraycache(node,
+ cachep->shared * cachep->batchcount,
+ 0xbaadf00d);
+ if (!shared)
+ goto bad;
+
+ alien = alloc_alien_cache(node, cachep->limit);
+ if (!alien)
+ goto bad;
cachep->array[cpu] = nc;
l3 = cachep->nodelists[node];
BUG_ON(!l3);
- if (!l3->shared) {
- if (!(nc = alloc_arraycache(node,
- cachep->shared *
- cachep->batchcount,
- 0xbaadf00d)))
- goto bad;
- /* we are serialised from CPU_DEAD or
- CPU_UP_CANCELLED by the cpucontrol lock */
- l3->shared = nc;
+ spin_lock_irq(&l3->list_lock);
+ if (!l3->shared) {
+ /*
+ * We are serialised from CPU_DEAD or
+ * CPU_UP_CANCELLED by the cpucontrol lock
+ */
+ l3->shared = shared;
+ shared = NULL;
}
+#ifdef CONFIG_NUMA
+ if (!l3->alien) {
+ l3->alien = alien;
+ alien = NULL;
+ }
+#endif
+ spin_unlock_irq(&l3->list_lock);
+
+ kfree(shared);
+ free_alien_cache(alien);
}
mutex_unlock(&cache_chain_mutex);
break;
@@ -977,25 +1072,34 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DEAD:
+ /*
+ * Even if all the cpus of a node are down, we don't free the
+ * kmem_list3 of any cache. This to avoid a race between
+ * cpu_down, and a kmalloc allocation from another cpu for
+ * memory from the node of the cpu going down. The list3
+ * structure is usually allocated from kmem_cache_create() and
+ * gets destroyed at kmem_cache_destroy().
+ */
/* fall thru */
case CPU_UP_CANCELED:
mutex_lock(&cache_chain_mutex);
list_for_each_entry(cachep, &cache_chain, next) {
struct array_cache *nc;
+ struct array_cache *shared;
+ struct array_cache **alien;
cpumask_t mask;
mask = node_to_cpumask(node);
- spin_lock_irq(&cachep->spinlock);
/* cpu is dead; no one can alloc from it. */
nc = cachep->array[cpu];
cachep->array[cpu] = NULL;
l3 = cachep->nodelists[node];
if (!l3)
- goto unlock_cache;
+ goto free_array_cache;
- spin_lock(&l3->list_lock);
+ spin_lock_irq(&l3->list_lock);
/* Free limit for this kmem_list3 */
l3->free_limit -= cachep->batchcount;
@@ -1003,34 +1107,44 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
free_block(cachep, nc->entry, nc->avail, node);
if (!cpus_empty(mask)) {
- spin_unlock(&l3->list_lock);
- goto unlock_cache;
+ spin_unlock_irq(&l3->list_lock);
+ goto free_array_cache;
}
- if (l3->shared) {
+ shared = l3->shared;
+ if (shared) {
free_block(cachep, l3->shared->entry,
l3->shared->avail, node);
- kfree(l3->shared);
l3->shared = NULL;
}
- if (l3->alien) {
- drain_alien_cache(cachep, l3);
- free_alien_cache(l3->alien);
- l3->alien = NULL;
- }
- /* free slabs belonging to this node */
- if (__node_shrink(cachep, node)) {
- cachep->nodelists[node] = NULL;
- spin_unlock(&l3->list_lock);
- kfree(l3);
- } else {
- spin_unlock(&l3->list_lock);
+ alien = l3->alien;
+ l3->alien = NULL;
+
+ spin_unlock_irq(&l3->list_lock);
+
+ kfree(shared);
+ if (alien) {
+ drain_alien_cache(cachep, alien);
+ free_alien_cache(alien);
}
- unlock_cache:
- spin_unlock_irq(&cachep->spinlock);
+free_array_cache:
kfree(nc);
}
+ /*
+ * In the previous loop, all the objects were freed to
+ * the respective cache's slabs, now we can go ahead and
+ * shrink each nodelist to its limit.
+ */
+ list_for_each_entry(cachep, &cache_chain, next) {
+ l3 = cachep->nodelists[node];
+ if (!l3)
+ continue;
+ spin_lock_irq(&l3->list_lock);
+ /* free slabs belonging to this node */
+ __node_shrink(cachep, node);
+ spin_unlock_irq(&l3->list_lock);
+ }
mutex_unlock(&cache_chain_mutex);
break;
#endif
@@ -1070,6 +1184,7 @@ void __init kmem_cache_init(void)
struct cache_sizes *sizes;
struct cache_names *names;
int i;
+ int order;
for (i = 0; i < NUM_INIT_LISTS; i++) {
kmem_list3_init(&initkmem_list3[i]);
@@ -1113,13 +1228,16 @@ void __init kmem_cache_init(void)
cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
- cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
- &left_over, &cache_cache.num);
+ for (order = 0; order < MAX_ORDER; order++) {
+ cache_estimate(order, cache_cache.buffer_size,
+ cache_line_size(), 0, &left_over, &cache_cache.num);
+ if (cache_cache.num)
+ break;
+ }
if (!cache_cache.num)
BUG();
-
+ cache_cache.gfporder = order;
cache_cache.colour = left_over / cache_cache.colour_off;
- cache_cache.colour_next = 0;
cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
sizeof(struct slab), cache_line_size());
@@ -1575,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, size_t align, unsigned long flags)
{
size_t left_over = 0;
+ int gfporder;
- for (;; cachep->gfporder++) {
+ for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
unsigned int num;
size_t remainder;
- if (cachep->gfporder > MAX_GFP_ORDER) {
- cachep->num = 0;
- break;
- }
-
- cache_estimate(cachep->gfporder, size, align, flags,
- &remainder, &num);
+ cache_estimate(gfporder, size, align, flags, &remainder, &num);
if (!num)
continue;
+
/* More than offslab_limit objects will cause problems */
- if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+ if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
break;
+ /* Found something acceptable - save it away */
cachep->num = num;
+ cachep->gfporder = gfporder;
left_over = remainder;
/*
+ * A VFS-reclaimable slab tends to have most allocations
+ * as GFP_NOFS and we really don't want to have to be allocating
+ * higher-order pages when we are unable to shrink dcache.
+ */
+ if (flags & SLAB_RECLAIM_ACCOUNT)
+ break;
+
+ /*
* Large number of objects is good, but very large slabs are
* currently bad for the gfp()s.
*/
- if (cachep->gfporder >= slab_break_gfp_order)
+ if (gfporder >= slab_break_gfp_order)
break;
- if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
- /* Acceptable internal fragmentation */
+ /*
+ * Acceptable internal fragmentation?
+ */
+ if ((left_over * 8) <= (PAGE_SIZE << gfporder))
break;
}
return left_over;
@@ -1664,6 +1790,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
BUG();
}
+ /*
+ * Prevent CPUs from coming and going.
+ * lock_cpu_hotplug() nests outside cache_chain_mutex
+ */
+ lock_cpu_hotplug();
+
mutex_lock(&cache_chain_mutex);
list_for_each(p, &cache_chain) {
@@ -1810,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
size = ALIGN(size, align);
- if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
- /*
- * A VFS-reclaimable slab tends to have most allocations
- * as GFP_NOFS and we really don't want to have to be allocating
- * higher-order pages when we are unable to shrink dcache.
- */
- cachep->gfporder = 0;
- cache_estimate(cachep->gfporder, size, align, flags,
- &left_over, &cachep->num);
- } else
- left_over = calculate_slab_order(cachep, size, align, flags);
+ left_over = calculate_slab_order(cachep, size, align, flags);
if (!cachep->num) {
printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1865,8 +1987,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
cachep->dtor = dtor;
cachep->name = name;
- /* Don't let CPUs to come and go */
- lock_cpu_hotplug();
if (g_cpucache_up == FULL) {
enable_cpucache(cachep);
@@ -1925,12 +2045,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
/* cache setup completed, link it into the list */
list_add(&cachep->next, &cache_chain);
- unlock_cpu_hotplug();
oops:
if (!cachep && (flags & SLAB_PANIC))
panic("kmem_cache_create(): failed to create slab `%s'\n",
name);
mutex_unlock(&cache_chain_mutex);
+ unlock_cpu_hotplug();
return cachep;
}
EXPORT_SYMBOL(kmem_cache_create);
@@ -2011,18 +2131,16 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
smp_call_function_all_cpus(do_drain, cachep);
check_irq_on();
- spin_lock_irq(&cachep->spinlock);
for_each_online_node(node) {
l3 = cachep->nodelists[node];
if (l3) {
- spin_lock(&l3->list_lock);
+ spin_lock_irq(&l3->list_lock);
drain_array_locked(cachep, l3->shared, 1, node);
- spin_unlock(&l3->list_lock);
+ spin_unlock_irq(&l3->list_lock);
if (l3->alien)
- drain_alien_cache(cachep, l3);
+ drain_alien_cache(cachep, l3->alien);
}
}
- spin_unlock_irq(&cachep->spinlock);
}
static int __node_shrink(struct kmem_cache *cachep, int node)
@@ -2324,20 +2442,20 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
*/
ctor_flags |= SLAB_CTOR_ATOMIC;
- /* About to mess with non-constant members - lock. */
+ /* Take the l3 list lock to change the colour_next on this node */
check_irq_off();
- spin_lock(&cachep->spinlock);
+ l3 = cachep->nodelists[nodeid];
+ spin_lock(&l3->list_lock);
/* Get colour for the slab, and cal the next value. */
- offset = cachep->colour_next;
- cachep->colour_next++;
- if (cachep->colour_next >= cachep->colour)
- cachep->colour_next = 0;
- offset *= cachep->colour_off;
+ offset = l3->colour_next;
+ l3->colour_next++;
+ if (l3->colour_next >= cachep->colour)
+ l3->colour_next = 0;
+ spin_unlock(&l3->list_lock);
- spin_unlock(&cachep->spinlock);
+ offset *= cachep->colour_off;
- check_irq_off();
if (local_flags & __GFP_WAIT)
local_irq_enable();
@@ -2367,7 +2485,6 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
if (local_flags & __GFP_WAIT)
local_irq_disable();
check_irq_off();
- l3 = cachep->nodelists[nodeid];
spin_lock(&l3->list_lock);
/* Make slab active. */
@@ -2500,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
"slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
cachep->name, cachep->num, slabp, slabp->inuse);
for (i = 0;
- i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+ i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
i++) {
if ((i % 16) == 0)
printk("\n%03x:", i);
@@ -2725,6 +2842,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
BUG_ON(!l3);
retry:
+ check_irq_off();
spin_lock(&l3->list_lock);
entry = l3->slabs_partial.next;
if (entry == &l3->slabs_partial) {
@@ -3304,11 +3422,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
check_irq_on();
- spin_lock_irq(&cachep->spinlock);
+ spin_lock(&cachep->spinlock);
cachep->batchcount = batchcount;
cachep->limit = limit;
cachep->shared = shared;
- spin_unlock_irq(&cachep->spinlock);
+ spin_unlock(&cachep->spinlock);
for_each_online_cpu(i) {
struct array_cache *ccold = new.new[i];
@@ -3439,8 +3557,7 @@ static void cache_reap(void *unused)
check_irq_on();
l3 = searchp->nodelists[numa_node_id()];
- if (l3->alien)
- drain_alien_cache(searchp, l3);
+ reap_alien(searchp, l3);
spin_lock_irq(&l3->list_lock);
drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3490,7 +3607,7 @@ static void cache_reap(void *unused)
}
check_irq_on();
mutex_unlock(&cache_chain_mutex);
- drain_remote_pages();
+ next_reap_node();
/* Setup the next iteration */
schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
}
@@ -3564,8 +3681,7 @@ static int s_show(struct seq_file *m, void *p)
int node;
struct kmem_list3 *l3;
- check_irq_on();
- spin_lock_irq(&cachep->spinlock);
+ spin_lock(&cachep->spinlock);
active_objs = 0;
num_slabs = 0;
for_each_online_node(node) {
@@ -3573,7 +3689,8 @@ static int s_show(struct seq_file *m, void *p)
if (!l3)
continue;
- spin_lock(&l3->list_lock);
+ check_irq_on();
+ spin_lock_irq(&l3->list_lock);
list_for_each(q, &l3->slabs_full) {
slabp = list_entry(q, struct slab, list);
@@ -3598,9 +3715,10 @@ static int s_show(struct seq_file *m, void *p)
num_slabs++;
}
free_objects += l3->free_objects;
- shared_avail += l3->shared->avail;
+ if (l3->shared)
+ shared_avail += l3->shared->avail;
- spin_unlock(&l3->list_lock);
+ spin_unlock_irq(&l3->list_lock);
}
num_slabs += active_slabs;
num_objs = num_slabs * cachep->num;
@@ -3644,7 +3762,7 @@ static int s_show(struct seq_file *m, void *p)
}
#endif
seq_putc(m, '\n');
- spin_unlock_irq(&cachep->spinlock);
+ spin_unlock(&cachep->spinlock);
return 0;
}
diff --git a/mm/slob.c b/mm/slob.c
index 1c240c4b71d9..a1f42bdc0245 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(slab_reclaim_pages);
#ifdef CONFIG_SMP
-void *__alloc_percpu(size_t size, size_t align)
+void *__alloc_percpu(size_t size)
{
int i;
struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
diff --git a/mm/swap.c b/mm/swap.c
index bc2442a7b0ee..b524ea90bddb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,19 +34,22 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-void put_page(struct page *page)
+static void put_compound_page(struct page *page)
{
- if (unlikely(PageCompound(page))) {
- page = (struct page *)page_private(page);
- if (put_page_testzero(page)) {
- void (*dtor)(struct page *page);
+ page = (struct page *)page_private(page);
+ if (put_page_testzero(page)) {
+ void (*dtor)(struct page *page);
- dtor = (void (*)(struct page *))page[1].mapping;
- (*dtor)(page);
- }
- return;
+ dtor = (void (*)(struct page *))page[1].lru.next;
+ (*dtor)(page);
}
- if (put_page_testzero(page))
+}
+
+void put_page(struct page *page)
+{
+ if (unlikely(PageCompound(page)))
+ put_compound_page(page);
+ else if (put_page_testzero(page))
__page_cache_release(page);
}
EXPORT_SYMBOL(put_page);
@@ -244,6 +247,15 @@ void release_pages(struct page **pages, int nr, int cold)
struct page *page = pages[i];
struct zone *pagezone;
+ if (unlikely(PageCompound(page))) {
+ if (zone) {
+ spin_unlock_irq(&zone->lru_lock);
+ zone = NULL;
+ }
+ put_compound_page(page);
+ continue;
+ }
+
if (!put_page_testzero(page))
continue;
@@ -381,7 +393,8 @@ void pagevec_strip(struct pagevec *pvec)
struct page *page = pvec->pages[i];
if (PagePrivate(page) && !TestSetPageLocked(page)) {
- try_to_release_page(page, 0);
+ if (PagePrivate(page))
+ try_to_release_page(page, 0);
unlock_page(page);
}
}
@@ -477,13 +490,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount)
if (count >= FBC_BATCH || count <= -FBC_BATCH) {
spin_lock(&fbc->lock);
fbc->count += count;
+ *pcount = 0;
spin_unlock(&fbc->lock);
- count = 0;
+ } else {
+ *pcount = count;
}
- *pcount = count;
put_cpu();
}
EXPORT_SYMBOL(percpu_counter_mod);
+
+/*
+ * Add up all the per-cpu counts, return the result. This is a more accurate
+ * but much slower version of percpu_counter_read_positive()
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+ long ret;
+ int cpu;
+
+ spin_lock(&fbc->lock);
+ ret = fbc->count;
+ for_each_cpu(cpu) {
+ long *pcount = per_cpu_ptr(fbc->counters, cpu);
+ ret += *pcount;
+ }
+ spin_unlock(&fbc->lock);
+ return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
#endif
/*
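
A sketch of how a kernel user drives these counters, assuming the percpu_counter interface of this tree (init takes no arguments beyond the counter, mod takes a long delta); the exact-but-slow sum is the new addition:

	#include <linux/kernel.h>
	#include <linux/percpu_counter.h>

	static struct percpu_counter demo_counter;

	static void demo_counter_use(void)
	{
		percpu_counter_init(&demo_counter);
		percpu_counter_mod(&demo_counter, 16);	/* cheap, batched per cpu */
		percpu_counter_mod(&demo_counter, -4);

		/* percpu_counter_read_positive() may lag by up to FBC_BATCH per
		 * cpu; percpu_counter_sum() folds every cpu's residue in under
		 * fbc->lock for an exact (but slower) answer.
		 */
		printk(KERN_DEBUG "approx %ld exact %ld\n",
		       percpu_counter_read_positive(&demo_counter),
		       percpu_counter_sum(&demo_counter));
	}
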
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd06..4fe7e3aa02e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
BUG_ON(PageActive(page));
sc->nr_scanned++;
+
+ if (!sc->may_swap && page_mapped(page))
+ goto keep_locked;
+
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
struct address_space *mapping = page_mapping(page);
if (page_mapped(page) && mapping)
- if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+ if (try_to_unmap(page, 1) != SWAP_SUCCESS)
goto unlock_retry;
if (PageDirty(page)) {
@@ -696,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage,
* the page.
*/
if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
- return 1;
+ return -EAGAIN;
/*
* Establish swap ptes for anonymous pages or destroy pte
@@ -717,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage,
* If the page was not migrated then the PageSwapCache bit
* is still set and the operation may continue.
*/
- try_to_unmap(page, 1);
+ if (try_to_unmap(page, 1) == SWAP_FAIL)
+ /* A vma has VM_LOCKED set -> Permanent failure */
+ return -EPERM;
/*
* Give up if we were unable to remove all mappings.
*/
if (page_mapcount(page))
- return 1;
+ return -EAGAIN;
write_lock_irq(&mapping->tree_lock);
@@ -734,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage,
if (!page_mapping(page) || page_count(page) != nr_refs ||
*radix_pointer != page) {
write_unlock_irq(&mapping->tree_lock);
- return 1;
+ return -EAGAIN;
}
/*
@@ -809,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy);
*/
int migrate_page(struct page *newpage, struct page *page)
{
+ int rc;
+
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- if (migrate_page_remove_references(newpage, page, 2))
- return -EAGAIN;
+ rc = migrate_page_remove_references(newpage, page, 2);
+
+ if (rc)
+ return rc;
migrate_page_copy(newpage, page);
@@ -839,7 +849,7 @@ EXPORT_SYMBOL(migrate_page);
* pages are swapped out.
*
* The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because to has become empty
* or no retryable pages exist anymore.
*
* Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +938,21 @@ redo:
goto unlock_both;
if (mapping->a_ops->migratepage) {
+ /*
+ * Most pages have a mapping and most filesystems
+ * should provide a migration function. Anonymous
+ * pages are part of swap space which also has its
+ * own migration function. This is the most common
+ * path for page migration.
+ */
rc = mapping->a_ops->migratepage(newpage, page);
goto unlock_both;
}
/*
- * Trigger writeout if page is dirty
+ * Default handling if a filesystem does not provide
+ * a migration function. We can only migrate clean
+ * pages so try to write out any dirty pages first.
*/
if (PageDirty(page)) {
switch (pageout(page, mapping)) {
@@ -949,9 +968,10 @@ redo:
; /* try to migrate the page below */
}
}
+
/*
- * If we have no buffer or can release the buffer
- * then do a simple migration.
+ * Buffers are managed in a filesystem specific way.
+ * We must have no buffers or drop them.
*/
if (!page_has_buffers(page) ||
try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +986,11 @@ redo:
* swap them out.
*/
if (pass > 4) {
+ /*
+ * Persistently unable to drop buffers..... As a
+ * measure of last resort we fall back to
+ * swap_page().
+ */
unlock_page(newpage);
newpage = NULL;
rc = swap_page(page);
@@ -1176,9 +1201,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
+
+ if (unlikely(sc->may_swap)) {
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
+
+ /*
+ * `distress' is a measure of how much trouble we're having
+ * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+ */
+ distress = 100 >> zone->prev_priority;
+
+ /*
+ * The point of this algorithm is to decide when to start
+ * reclaiming mapped memory instead of just pagecache. Work out
+ * how much memory
+ * is mapped.
+ */
+ mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+ /*
+ * Now decide how much we really want to unmap some pages. The
+ * mapped ratio is downgraded - just because there's a lot of
+ * mapped memory doesn't necessarily mean that page reclaim
+ * isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start
+ * going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm
+ * altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+ /*
+ * Now use this metric to decide whether to start moving mapped
+ * memory onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+ reclaim_mapped = 1;
+ }
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
@@ -1188,37 +1251,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
zone->nr_active -= pgmoved;
spin_unlock_irq(&zone->lru_lock);
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
-
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
-
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
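
Plugging numbers into the relocated heuristic: at zone->prev_priority 3 the distress term is 100 >> 3 = 12; with 40% of memory mapped (mapped_ratio = 40) and the default vm_swappiness of 60, swap_tendency = 40/2 + 12 + 60 = 92, which stays under 100 and leaves mapped pages alone. Raising vm_swappiness to 100 or letting prev_priority fall to 2 (distress 25) pushes the sum past 100 and sets reclaim_mapped; with sc->may_swap clear the whole computation is now skipped.
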
@@ -1595,9 +1627,7 @@ scan:
sc.nr_reclaimed = 0;
sc.priority = priority;
sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
- atomic_inc(&zone->reclaim_in_progress);
shrink_zone(zone, &sc);
- atomic_dec(&zone->reclaim_in_progress);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
@@ -1859,7 +1889,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
if (!(gfp_mask & __GFP_WAIT) ||
zone->all_unreclaimable ||
- atomic_read(&zone->reclaim_in_progress) > 0)
+ atomic_read(&zone->reclaim_in_progress) > 0 ||
+ (p->flags & PF_MEMALLOC))
return 0;
node_id = zone->zone_pgdat->node_id;
@@ -1884,7 +1915,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
sc.swap_cluster_max = SWAP_CLUSTER_MAX;
cond_resched();
- p->flags |= PF_MEMALLOC;
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_SWAP.
+ */
+ p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
reclaim_state.reclaimed_slab = 0;
p->reclaim_state = &reclaim_state;
@@ -1908,11 +1944,10 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
* a long time.
*/
shrink_slab(sc.nr_scanned, gfp_mask, order);
- sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */
}
p->reclaim_state = NULL;
- current->flags &= ~PF_MEMALLOC;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
if (sc.nr_reclaimed == 0)
zone->last_unsuccessful_zone_reclaim = jiffies;