Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile      |    1
-rw-r--r--  mm/allocpercpu.c |    2
-rw-r--r--  mm/bootmem.c     |   27
-rw-r--r--  mm/filemap.c     |   38
-rw-r--r--  mm/filemap_xip.c |    3
-rw-r--r--  mm/hugetlb.c     |   14
-rw-r--r--  mm/memcontrol.c  | 1192
-rw-r--r--  mm/memory.c      |  103
-rw-r--r--  mm/mempolicy.c   |   63
-rw-r--r--  mm/migrate.c     |   19
-rw-r--r--  mm/mmap.c        |   27
-rw-r--r--  mm/oom_kill.c    |   85
-rw-r--r--  mm/page_alloc.c  |    5
-rw-r--r--  mm/rmap.c        |   47
-rw-r--r--  mm/shmem.c       |  226
-rw-r--r--  mm/slab.c        |    3
-rw-r--r--  mm/slub.c        |  410
-rw-r--r--  mm/swap.c        |    2
-rw-r--r--  mm/swapfile.c    |   49
-rw-r--r--  mm/vmalloc.c     |    2
-rw-r--r--  mm/vmscan.c      |  495
21 files changed, 2378 insertions, 435 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 4af5dff37277..9f117bab5322 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -32,4 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_SMP) += allocpercpu.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_CGROUP_MEM_CONT) += memcontrol.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 00b02623f008..7e58322b7134 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask);
*/
void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
{
- void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
+ void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp);
void *__pdata = __percpu_disguise(pdata);
if (unlikely(!pdata))
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 00a96970b237..f6ff4337b424 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -111,11 +111,12 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
* might be used for boot-time allocations - or it might get added
* to the free page pool later on.
*/
-static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
- unsigned long size)
+static int __init reserve_bootmem_core(bootmem_data_t *bdata,
+ unsigned long addr, unsigned long size, int flags)
{
unsigned long sidx, eidx;
unsigned long i;
+ int ret;
/*
* round up, partially reserved pages are considered
@@ -133,7 +134,20 @@ static void __init reserve_bootmem_core(bootmem_data_t *bdata, unsigned long add
#ifdef CONFIG_DEBUG_BOOTMEM
printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
#endif
+ if (flags & BOOTMEM_EXCLUSIVE) {
+ ret = -EBUSY;
+ goto err;
+ }
}
+
+ return 0;
+
+err:
+ /* unreserve memory we accidentally reserved */
+ for (i--; i >= sidx; i--)
+ clear_bit(i, bdata->node_bootmem_map);
+
+ return ret;
}
static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
@@ -374,9 +388,9 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
}
void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
- unsigned long size)
+ unsigned long size, int flags)
{
- reserve_bootmem_core(pgdat->bdata, physaddr, size);
+ reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
}
void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
@@ -398,9 +412,10 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
}
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
-void __init reserve_bootmem(unsigned long addr, unsigned long size)
+int __init reserve_bootmem(unsigned long addr, unsigned long size,
+ int flags)
{
- reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size);
+ return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags);
}
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
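[Editor's note, not part of the patch: the bootmem hunks above turn reserve_bootmem() and reserve_bootmem_core() into functions that can fail -- with BOOTMEM_EXCLUSIVE set, an already-reserved page in the range makes the call back out and return -EBUSY. A minimal sketch of a caller under that assumption; the function name and message are illustrative only:

/* Illustrative caller only -- example_reserve_region() is hypothetical. */
static int __init example_reserve_region(unsigned long base, unsigned long size)
{
	int ret;

	/* With BOOTMEM_EXCLUSIVE the reservation fails instead of silently
	 * sharing pages that someone else already reserved. */
	ret = reserve_bootmem(base, size, BOOTMEM_EXCLUSIVE);
	if (ret < 0) {
		printk(KERN_WARNING "example: %lu bytes at %#lx already "
		       "reserved, skipping\n", size, base);
		return ret;
	}

	return 0;
}
]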
diff --git a/mm/filemap.c b/mm/filemap.c
index 81fb9bff0d4f..5c74b68935ac 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
#include <linux/syscalls.h>
#include <linux/cpuset.h>
#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
#include "internal.h"
/*
@@ -118,6 +119,7 @@ void __remove_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
+ mem_cgroup_uncharge_page(page);
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
mapping->nrpages--;
@@ -458,8 +460,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
int add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+ int error = mem_cgroup_cache_charge(page, current->mm,
+ gfp_mask & ~__GFP_HIGHMEM);
+ if (error)
+ goto out;
+ error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
write_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
@@ -470,10 +476,14 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
page->index = offset;
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
- }
+ } else
+ mem_cgroup_uncharge_page(page);
+
write_unlock_irq(&mapping->tree_lock);
radix_tree_preload_end();
- }
+ } else
+ mem_cgroup_uncharge_page(page);
+out:
return error;
}
EXPORT_SYMBOL(add_to_page_cache);
@@ -594,7 +604,7 @@ void __lock_page(struct page *page)
}
EXPORT_SYMBOL(__lock_page);
-int fastcall __lock_page_killable(struct page *page)
+int __lock_page_killable(struct page *page)
{
DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
@@ -865,9 +875,7 @@ static void shrink_readahead_size_eio(struct file *filp,
}
/**
- * do_generic_mapping_read - generic file read routine
- * @mapping: address_space to be read
- * @ra: file's readahead state
+ * do_generic_file_read - generic file read routine
* @filp: the file to read
* @ppos: current file position
* @desc: read_descriptor
@@ -878,18 +886,13 @@ static void shrink_readahead_size_eio(struct file *filp,
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
- *
- * Note the struct file* is only passed for the use of readpage.
- * It may be NULL.
*/
-void do_generic_mapping_read(struct address_space *mapping,
- struct file_ra_state *ra,
- struct file *filp,
- loff_t *ppos,
- read_descriptor_t *desc,
- read_actor_t actor)
+static void do_generic_file_read(struct file *filp, loff_t *ppos,
+ read_descriptor_t *desc, read_actor_t actor)
{
+ struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
+ struct file_ra_state *ra = &filp->f_ra;
pgoff_t index;
pgoff_t last_index;
pgoff_t prev_index;
@@ -1081,7 +1084,6 @@ out:
if (filp)
file_accessed(filp);
}
-EXPORT_SYMBOL(do_generic_mapping_read);
int file_read_actor(read_descriptor_t *desc, struct page *page,
unsigned long offset, unsigned long size)
@@ -1322,7 +1324,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
struct page *page;
- unsigned long size;
+ pgoff_t size;
int did_readaround = 0;
int ret = 0;
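[Editor's note, not part of the patch: the add_to_page_cache() hunk above establishes the pairing rule the rest of the series relies on -- charge the page to the memory cgroup before inserting it, and call mem_cgroup_uncharge_page() on every failure path so the counter never leaks. A rough sketch of that pattern, with a hypothetical my_insert_into_cache() standing in for the radix-tree insertion:

/* Sketch only: my_insert_into_cache() is a placeholder, not a real API. */
static int my_add_page(struct page *page, struct address_space *mapping,
		       pgoff_t offset, gfp_t gfp_mask)
{
	int error;

	error = mem_cgroup_cache_charge(page, current->mm,
					gfp_mask & ~__GFP_HIGHMEM);
	if (error)
		return error;			/* nothing charged, nothing to undo */

	error = my_insert_into_cache(mapping, page, offset);
	if (error)
		mem_cgroup_uncharge_page(page);	/* undo the charge on failure */

	return error;
}
]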
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0420a0292b03..5e598c42afd7 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -56,7 +56,8 @@ do_xip_mapping_read(struct address_space *mapping,
read_actor_t actor)
{
struct inode *inode = mapping->host;
- unsigned long index, end_index, offset;
+ pgoff_t index, end_index;
+ unsigned long offset;
loff_t isize;
BUG_ON(!mapping->a_ops->get_xip_page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1a5642074e34..cb1b3a7ecdfc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,14 +24,15 @@
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
static unsigned long surplus_huge_pages;
+static unsigned long nr_overcommit_huge_pages;
unsigned long max_huge_pages;
+unsigned long sysctl_overcommit_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
static unsigned int free_huge_pages_node[MAX_NUMNODES];
static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
unsigned long hugepages_treat_as_movable;
-unsigned long nr_overcommit_huge_pages;
static int hugetlb_next_nid;
/*
@@ -605,6 +606,17 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
return 0;
}
+int hugetlb_overcommit_handler(struct ctl_table *table, int write,
+ struct file *file, void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+ spin_lock(&hugetlb_lock);
+ nr_overcommit_huge_pages = sysctl_overcommit_huge_pages;
+ spin_unlock(&hugetlb_lock);
+ return 0;
+}
+
#endif /* CONFIG_SYSCTL */
int hugetlb_report_meminfo(char *buf)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
new file mode 100644
index 000000000000..6bded84c20c8
--- /dev/null
+++ b/mm/memcontrol.c
@@ -0,0 +1,1192 @@
+/* memcontrol.c - Memory Controller
+ *
+ * Copyright IBM Corporation, 2007
+ * Author Balbir Singh <balbir@linux.vnet.ibm.com>
+ *
+ * Copyright 2007 OpenVZ SWsoft Inc
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/res_counter.h>
+#include <linux/memcontrol.h>
+#include <linux/cgroup.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/page-flags.h>
+#include <linux/backing-dev.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/swap.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+struct cgroup_subsys mem_cgroup_subsys;
+static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
+
+/*
+ * Statistics for memory cgroup.
+ */
+enum mem_cgroup_stat_index {
+ /*
+ * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
+ */
+ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
+ MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */
+
+ MEM_CGROUP_STAT_NSTATS,
+};
+
+struct mem_cgroup_stat_cpu {
+ s64 count[MEM_CGROUP_STAT_NSTATS];
+} ____cacheline_aligned_in_smp;
+
+struct mem_cgroup_stat {
+ struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+};
+
+/*
+ * For accounting in irq-disabled sections, there is no need to increment the preempt count.
+ */
+static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat,
+ enum mem_cgroup_stat_index idx, int val)
+{
+ int cpu = smp_processor_id();
+ stat->cpustat[cpu].count[idx] += val;
+}
+
+static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
+ enum mem_cgroup_stat_index idx)
+{
+ int cpu;
+ s64 ret = 0;
+ for_each_possible_cpu(cpu)
+ ret += stat->cpustat[cpu].count[idx];
+ return ret;
+}
+
+/*
+ * per-zone information in memory controller.
+ */
+
+enum mem_cgroup_zstat_index {
+ MEM_CGROUP_ZSTAT_ACTIVE,
+ MEM_CGROUP_ZSTAT_INACTIVE,
+
+ NR_MEM_CGROUP_ZSTAT,
+};
+
+struct mem_cgroup_per_zone {
+ /*
+ * spin_lock to protect the per cgroup LRU
+ */
+ spinlock_t lru_lock;
+ struct list_head active_list;
+ struct list_head inactive_list;
+ unsigned long count[NR_MEM_CGROUP_ZSTAT];
+};
+/* Macro for accessing counter */
+#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
+
+struct mem_cgroup_per_node {
+ struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_lru_info {
+ struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
+};
+
+/*
+ * The memory controller data structure. The memory controller controls both
+ * page cache and RSS per cgroup. We would eventually like to provide
+ * statistics based on the statistics developed by Rik Van Riel for clock-pro,
+ * to help the administrator determine what knobs to tune.
+ *
+ * TODO: Add a water mark for the memory controller. Reclaim will begin when
+ * we hit the water mark. Maybe even add a low water mark, such that
+ * no reclaim occurs from a cgroup at its low water mark; this is
+ * a feature that will be implemented much later in the future.
+ */
+struct mem_cgroup {
+ struct cgroup_subsys_state css;
+ /*
+ * the counter to account for memory usage
+ */
+ struct res_counter res;
+ /*
+ * Per cgroup active and inactive list, similar to the
+ * per zone LRU lists.
+ */
+ struct mem_cgroup_lru_info info;
+
+ int prev_priority; /* for recording reclaim priority */
+ /*
+ * statistics.
+ */
+ struct mem_cgroup_stat stat;
+};
+
+/*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is at least two-byte
+ * aligned (based on comments from Nick Piggin).
+ */
+#define PAGE_CGROUP_LOCK_BIT 0x0
+#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
+
+/*
+ * A page_cgroup is associated with every page descriptor. The
+ * page_cgroup helps us identify information about the cgroup it belongs to.
+ */
+struct page_cgroup {
+ struct list_head lru; /* per cgroup LRU list */
+ struct page *page;
+ struct mem_cgroup *mem_cgroup;
+ atomic_t ref_cnt; /* Helpful when pages move b/w */
+ /* mapped and cached states */
+ int flags;
+};
+#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
+#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
+
+static inline int page_cgroup_nid(struct page_cgroup *pc)
+{
+ return page_to_nid(pc->page);
+}
+
+static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc)
+{
+ return page_zonenum(pc->page);
+}
+
+enum {
+ MEM_CGROUP_TYPE_UNSPEC = 0,
+ MEM_CGROUP_TYPE_MAPPED,
+ MEM_CGROUP_TYPE_CACHED,
+ MEM_CGROUP_TYPE_ALL,
+ MEM_CGROUP_TYPE_MAX,
+};
+
+enum charge_type {
+ MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
+ MEM_CGROUP_CHARGE_TYPE_MAPPED,
+};
+
+
+/*
+ * Always modified under the lru lock, so preempt_disable() is not necessary.
+ */
+static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
+ bool charge)
+{
+ int val = (charge)? 1 : -1;
+ struct mem_cgroup_stat *stat = &mem->stat;
+ VM_BUG_ON(!irqs_disabled());
+
+ if (flags & PAGE_CGROUP_FLAG_CACHE)
+ __mem_cgroup_stat_add_safe(stat,
+ MEM_CGROUP_STAT_CACHE, val);
+ else
+ __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
+}
+
+static inline struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+ BUG_ON(!mem->info.nodeinfo[nid]);
+ return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static inline struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+ struct mem_cgroup *mem = pc->mem_cgroup;
+ int nid = page_cgroup_nid(pc);
+ int zid = page_cgroup_zid(pc);
+
+ return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
+ enum mem_cgroup_zstat_index idx)
+{
+ int nid, zid;
+ struct mem_cgroup_per_zone *mz;
+ u64 total = 0;
+
+ for_each_online_node(nid)
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ mz = mem_cgroup_zoneinfo(mem, nid, zid);
+ total += MEM_CGROUP_ZSTAT(mz, idx);
+ }
+ return total;
+}
+
+static struct mem_cgroup init_mem_cgroup;
+
+static inline
+struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
+{
+ return container_of(cgroup_subsys_state(cont,
+ mem_cgroup_subsys_id), struct mem_cgroup,
+ css);
+}
+
+static inline
+struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
+{
+ return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
+ struct mem_cgroup, css);
+}
+
+void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
+{
+ struct mem_cgroup *mem;
+
+ mem = mem_cgroup_from_task(p);
+ css_get(&mem->css);
+ mm->mem_cgroup = mem;
+}
+
+void mm_free_cgroup(struct mm_struct *mm)
+{
+ css_put(&mm->mem_cgroup->css);
+}
+
+static inline int page_cgroup_locked(struct page *page)
+{
+ return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+ &page->page_cgroup);
+}
+
+void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
+{
+ int locked;
+
+ /*
+ * While resetting the page_cgroup we might not hold the
+ * page_cgroup lock. free_hot_cold_page() is an example
+ * of such a scenario
+ */
+ if (pc)
+ VM_BUG_ON(!page_cgroup_locked(page));
+ locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+ page->page_cgroup = ((unsigned long)pc | locked);
+}
+
+struct page_cgroup *page_get_page_cgroup(struct page *page)
+{
+ return (struct page_cgroup *)
+ (page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+static void __always_inline lock_page_cgroup(struct page *page)
+{
+ bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+ VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+static void __always_inline unlock_page_cgroup(struct page *page)
+{
+ bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Tie a new page_cgroup to the struct page under lock_page_cgroup().
+ * This can fail if the page has already been tied to a page_cgroup.
+ * Returns 0 on success.
+ */
+static int page_cgroup_assign_new_page_cgroup(struct page *page,
+ struct page_cgroup *pc)
+{
+ int ret = 0;
+
+ lock_page_cgroup(page);
+ if (!page_get_page_cgroup(page))
+ page_assign_page_cgroup(page, pc);
+ else /* A page is tied to other pc. */
+ ret = 1;
+ unlock_page_cgroup(page);
+ return ret;
+}
+
+/*
+ * Clear page->page_cgroup member under lock_page_cgroup().
+ * If given "pc" value is different from one page->page_cgroup,
+ * page->cgroup is not cleared.
+ * Returns a value of page->page_cgroup at lock taken.
+ * A can can detect failure of clearing by following
+ * clear_page_cgroup(page, pc) == pc
+ */
+
+static struct page_cgroup *clear_page_cgroup(struct page *page,
+ struct page_cgroup *pc)
+{
+ struct page_cgroup *ret;
+ /* lock and clear */
+ lock_page_cgroup(page);
+ ret = page_get_page_cgroup(page);
+ if (likely(ret == pc))
+ page_assign_page_cgroup(page, NULL);
+ unlock_page_cgroup(page);
+ return ret;
+}
+
+static void __mem_cgroup_remove_list(struct page_cgroup *pc)
+{
+ int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+ struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+ if (from)
+ MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
+ else
+ MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+
+ mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
+ list_del_init(&pc->lru);
+}
+
+static void __mem_cgroup_add_list(struct page_cgroup *pc)
+{
+ int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+ struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+ if (!to) {
+ MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
+ list_add(&pc->lru, &mz->inactive_list);
+ } else {
+ MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
+ list_add(&pc->lru, &mz->active_list);
+ }
+ mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true);
+}
+
+static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+ int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
+ struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
+
+ if (from)
+ MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
+ else
+ MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
+
+ if (active) {
+ MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1;
+ pc->flags |= PAGE_CGROUP_FLAG_ACTIVE;
+ list_move(&pc->lru, &mz->active_list);
+ } else {
+ MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1;
+ pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE;
+ list_move(&pc->lru, &mz->inactive_list);
+ }
+}
+
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
+{
+ int ret;
+
+ task_lock(task);
+ ret = task->mm && vm_match_cgroup(task->mm, mem);
+ task_unlock(task);
+ return ret;
+}
+
+/*
+ * This routine assumes that the appropriate zone's lru lock is already held
+ */
+void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
+{
+ struct mem_cgroup_per_zone *mz;
+ unsigned long flags;
+
+ if (!pc)
+ return;
+
+ mz = page_cgroup_zoneinfo(pc);
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ __mem_cgroup_move_lists(pc, active);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+}
+
+/*
+ * Calculate mapped_ratio under the memory controller. This will be used in
+ * vmscan.c for determining whether we have to reclaim mapped pages.
+ */
+int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
+{
+ long total, rss;
+
+ /*
+ * usage is recorded in bytes. But, here, we assume the number of
+ * physical pages can be represented by "long" on any arch.
+ */
+ total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
+ rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
+ return (int)((rss * 100L) / total);
+}
+/*
+ * This function is called from vmscan.c. In the page reclaim loop, the
+ * balance between the active and inactive lists is calculated. For memory
+ * controller page reclaim, we should use the mem_cgroup's imbalance rather
+ * than the zone's global LRU imbalance.
+ */
+long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
+{
+ unsigned long active, inactive;
+ /* active and inactive are the number of pages. 'long' is ok.*/
+ active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
+ inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
+ return (long) (active / (inactive + 1));
+}
+
+/*
+ * prev_priority control...this will be used in memory reclaim path.
+ */
+int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
+{
+ return mem->prev_priority;
+}
+
+void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
+{
+ if (priority < mem->prev_priority)
+ mem->prev_priority = priority;
+}
+
+void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
+{
+ mem->prev_priority = priority;
+}
+
+/*
+ * Calculate # of pages to be scanned in this priority/zone.
+ * See also vmscan.c
+ *
+ * priority starts from "DEF_PRIORITY" and decremented in each loop.
+ * (see include/linux/mmzone.h)
+ */
+
+long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem,
+ struct zone *zone, int priority)
+{
+ long nr_active;
+ int nid = zone->zone_pgdat->node_id;
+ int zid = zone_idx(zone);
+ struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
+
+ nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE);
+ return (nr_active >> priority);
+}
+
+long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
+ struct zone *zone, int priority)
+{
+ long nr_inactive;
+ int nid = zone->zone_pgdat->node_id;
+ int zid = zone_idx(zone);
+ struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
+
+ nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
+
+ return (nr_inactive >> priority);
+}
+
+unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
+ struct list_head *dst,
+ unsigned long *scanned, int order,
+ int mode, struct zone *z,
+ struct mem_cgroup *mem_cont,
+ int active)
+{
+ unsigned long nr_taken = 0;
+ struct page *page;
+ unsigned long scan;
+ LIST_HEAD(pc_list);
+ struct list_head *src;
+ struct page_cgroup *pc, *tmp;
+ int nid = z->zone_pgdat->node_id;
+ int zid = zone_idx(z);
+ struct mem_cgroup_per_zone *mz;
+
+ mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
+ if (active)
+ src = &mz->active_list;
+ else
+ src = &mz->inactive_list;
+
+
+ spin_lock(&mz->lru_lock);
+ scan = 0;
+ list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
+ if (scan >= nr_to_scan)
+ break;
+ page = pc->page;
+ VM_BUG_ON(!pc);
+
+ if (unlikely(!PageLRU(page)))
+ continue;
+
+ if (PageActive(page) && !active) {
+ __mem_cgroup_move_lists(pc, true);
+ continue;
+ }
+ if (!PageActive(page) && active) {
+ __mem_cgroup_move_lists(pc, false);
+ continue;
+ }
+
+ scan++;
+ list_move(&pc->lru, &pc_list);
+
+ if (__isolate_lru_page(page, mode) == 0) {
+ list_move(&page->lru, dst);
+ nr_taken++;
+ }
+ }
+
+ list_splice(&pc_list, src);
+ spin_unlock(&mz->lru_lock);
+
+ *scanned = scan;
+ return nr_taken;
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, enum charge_type ctype)
+{
+ struct mem_cgroup *mem;
+ struct page_cgroup *pc;
+ unsigned long flags;
+ unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup_per_zone *mz;
+
+ /*
+ * Should page_cgroups go to their own slab?
+ * One could optimize the performance of the charging routine
+ * by saving a bit in the page_flags and using it as a lock
+ * to see if the cgroup page already has a page_cgroup associated
+ * with it.
+ */
+retry:
+ if (page) {
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ /*
+ * The page_cgroup exists and
+ * the page has already been accounted.
+ */
+ if (pc) {
+ if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
+ /* is this page currently being uncharged? */
+ unlock_page_cgroup(page);
+ cpu_relax();
+ goto retry;
+ } else {
+ unlock_page_cgroup(page);
+ goto done;
+ }
+ }
+ unlock_page_cgroup(page);
+ }
+
+ pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
+ if (pc == NULL)
+ goto err;
+
+ /*
+ * We always charge the cgroup the mm_struct belongs to.
+ * The mm_struct's mem_cgroup changes on task migration if the
+ * thread group leader migrates. It's possible that mm is not
+ * set, if so charge the init_mm (happens for pagecache usage).
+ */
+ if (!mm)
+ mm = &init_mm;
+
+ rcu_read_lock();
+ mem = rcu_dereference(mm->mem_cgroup);
+ /*
+ * For every charge from the cgroup, increment reference
+ * count
+ */
+ css_get(&mem->css);
+ rcu_read_unlock();
+
+ /*
+ * If we created the page_cgroup, we should free it on exceeding
+ * the cgroup limit.
+ */
+ while (res_counter_charge(&mem->res, PAGE_SIZE)) {
+ if (!(gfp_mask & __GFP_WAIT))
+ goto out;
+
+ if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
+ continue;
+
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (res_counter_check_under_limit(&mem->res))
+ continue;
+
+ if (!nr_retries--) {
+ mem_cgroup_out_of_memory(mem, gfp_mask);
+ goto out;
+ }
+ congestion_wait(WRITE, HZ/10);
+ }
+
+ atomic_set(&pc->ref_cnt, 1);
+ pc->mem_cgroup = mem;
+ pc->page = page;
+ pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
+ pc->flags |= PAGE_CGROUP_FLAG_CACHE;
+
+ if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) {
+ /*
+ * Another charge has been added to this page already.
+ * We take lock_page_cgroup(page) again and read
+ * page->page_cgroup, increment the refcnt.... just retrying is OK.
+ */
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+ kfree(pc);
+ if (!page)
+ goto done;
+ goto retry;
+ }
+
+ mz = page_cgroup_zoneinfo(pc);
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ /* Update statistics vector */
+ __mem_cgroup_add_list(pc);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+
+done:
+ return 0;
+out:
+ css_put(&mem->css);
+ kfree(pc);
+err:
+ return -ENOMEM;
+}
+
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
+{
+ return mem_cgroup_charge_common(page, mm, gfp_mask,
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
+}
+
+/*
+ * See if the cached pages should be charged at all?
+ */
+int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
+{
+ int ret = 0;
+ if (!mm)
+ mm = &init_mm;
+
+ ret = mem_cgroup_charge_common(page, mm, gfp_mask,
+ MEM_CGROUP_CHARGE_TYPE_CACHE);
+ return ret;
+}
+
+/*
+ * Uncharging is always a welcome operation; we never complain, we simply
+ * uncharge. This routine should be called with lock_page_cgroup() held.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+ struct mem_cgroup *mem;
+ struct mem_cgroup_per_zone *mz;
+ struct page *page;
+ unsigned long flags;
+
+ /*
+ * Check if our page_cgroup is valid
+ */
+ if (!pc)
+ return;
+
+ if (atomic_dec_and_test(&pc->ref_cnt)) {
+ page = pc->page;
+ mz = page_cgroup_zoneinfo(pc);
+ /*
+ * get page->page_cgroup and clear it under the lock.
+ * force_empty can drop page->page_cgroup without checking the refcnt.
+ */
+ unlock_page_cgroup(page);
+ if (clear_page_cgroup(page, pc) == pc) {
+ mem = pc->mem_cgroup;
+ css_put(&mem->css);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ __mem_cgroup_remove_list(pc);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ kfree(pc);
+ }
+ lock_page_cgroup(page);
+ }
+}
+
+void mem_cgroup_uncharge_page(struct page *page)
+{
+ lock_page_cgroup(page);
+ mem_cgroup_uncharge(page_get_page_cgroup(page));
+ unlock_page_cgroup(page);
+}
+
+/*
+ * Returns non-zero if a page (under migration) has valid page_cgroup member.
+ * Refcnt of page_cgroup is incremented.
+ */
+
+int mem_cgroup_prepare_migration(struct page *page)
+{
+ struct page_cgroup *pc;
+ int ret = 0;
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ if (pc && atomic_inc_not_zero(&pc->ref_cnt))
+ ret = 1;
+ unlock_page_cgroup(page);
+ return ret;
+}
+
+void mem_cgroup_end_migration(struct page *page)
+{
+ struct page_cgroup *pc;
+
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ mem_cgroup_uncharge(pc);
+ unlock_page_cgroup(page);
+}
+/*
+ * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
+ * There is no race with the uncharge() routines because the page_cgroup
+ * for *page* has an extra reference taken by mem_cgroup_prepare_migration.
+ */
+
+void mem_cgroup_page_migration(struct page *page, struct page *newpage)
+{
+ struct page_cgroup *pc;
+ struct mem_cgroup *mem;
+ unsigned long flags;
+ struct mem_cgroup_per_zone *mz;
+retry:
+ pc = page_get_page_cgroup(page);
+ if (!pc)
+ return;
+ mem = pc->mem_cgroup;
+ mz = page_cgroup_zoneinfo(pc);
+ if (clear_page_cgroup(page, pc) != pc)
+ goto retry;
+ spin_lock_irqsave(&mz->lru_lock, flags);
+
+ __mem_cgroup_remove_list(pc);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+
+ pc->page = newpage;
+ lock_page_cgroup(newpage);
+ page_assign_page_cgroup(newpage, pc);
+ unlock_page_cgroup(newpage);
+
+ mz = page_cgroup_zoneinfo(pc);
+ spin_lock_irqsave(&mz->lru_lock, flags);
+ __mem_cgroup_add_list(pc);
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ return;
+}
+
+/*
+ * This routine traverses the page_cgroups on the given list and drops them all.
+ * This routine ignores page_cgroup->ref_cnt.
+ * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
+ */
+#define FORCE_UNCHARGE_BATCH (128)
+static void
+mem_cgroup_force_empty_list(struct mem_cgroup *mem,
+ struct mem_cgroup_per_zone *mz,
+ int active)
+{
+ struct page_cgroup *pc;
+ struct page *page;
+ int count;
+ unsigned long flags;
+ struct list_head *list;
+
+ if (active)
+ list = &mz->active_list;
+ else
+ list = &mz->inactive_list;
+
+ if (list_empty(list))
+ return;
+retry:
+ count = FORCE_UNCHARGE_BATCH;
+ spin_lock_irqsave(&mz->lru_lock, flags);
+
+ while (--count && !list_empty(list)) {
+ pc = list_entry(list->prev, struct page_cgroup, lru);
+ page = pc->page;
+ /* Avoid race with charge */
+ atomic_set(&pc->ref_cnt, 0);
+ if (clear_page_cgroup(page, pc) == pc) {
+ css_put(&mem->css);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ __mem_cgroup_remove_list(pc);
+ kfree(pc);
+ } else /* being uncharged ? ...do relax */
+ break;
+ }
+ spin_unlock_irqrestore(&mz->lru_lock, flags);
+ if (!list_empty(list)) {
+ cond_resched();
+ goto retry;
+ }
+ return;
+}
+
+/*
+ * Force the mem_cgroup's charge to 0 if there is no task attached to it.
+ * This makes it possible to delete the mem_cgroup.
+ */
+
+int mem_cgroup_force_empty(struct mem_cgroup *mem)
+{
+ int ret = -EBUSY;
+ int node, zid;
+ css_get(&mem->css);
+ /*
+ * page reclaim code (kswapd etc..) will move pages between
+ * active_list <-> inactive_list while we don't take a lock.
+ * So, we have to loop here until all the lists are empty.
+ */
+ while (mem->res.usage > 0) {
+ if (atomic_read(&mem->css.cgroup->count) > 0)
+ goto out;
+ for_each_node_state(node, N_POSSIBLE)
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct mem_cgroup_per_zone *mz;
+ mz = mem_cgroup_zoneinfo(mem, node, zid);
+ /* drop all page_cgroup in active_list */
+ mem_cgroup_force_empty_list(mem, mz, 1);
+ /* drop all page_cgroup in inactive_list */
+ mem_cgroup_force_empty_list(mem, mz, 0);
+ }
+ }
+ ret = 0;
+out:
+ css_put(&mem->css);
+ return ret;
+}
+
+
+
+int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
+{
+ *tmp = memparse(buf, &buf);
+ if (*buf != '\0')
+ return -EINVAL;
+
+ /*
+ * Round up the value to the closest page size
+ */
+ *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
+ return 0;
+}
+
+static ssize_t mem_cgroup_read(struct cgroup *cont,
+ struct cftype *cft, struct file *file,
+ char __user *userbuf, size_t nbytes, loff_t *ppos)
+{
+ return res_counter_read(&mem_cgroup_from_cont(cont)->res,
+ cft->private, userbuf, nbytes, ppos,
+ NULL);
+}
+
+static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
+ struct file *file, const char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return res_counter_write(&mem_cgroup_from_cont(cont)->res,
+ cft->private, userbuf, nbytes, ppos,
+ mem_cgroup_write_strategy);
+}
+
+static ssize_t mem_force_empty_write(struct cgroup *cont,
+ struct cftype *cft, struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ int ret;
+ ret = mem_cgroup_force_empty(mem);
+ if (!ret)
+ ret = nbytes;
+ return ret;
+}
+
+/*
+ * Note: This should be removed if cgroup supports write-only file.
+ */
+
+static ssize_t mem_force_empty_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct file *file, char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return -EINVAL;
+}
+
+
+static const struct mem_cgroup_stat_desc {
+ const char *msg;
+ u64 unit;
+} mem_cgroup_stat_desc[] = {
+ [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, },
+ [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
+};
+
+static int mem_control_stat_show(struct seq_file *m, void *arg)
+{
+ struct cgroup *cont = m->private;
+ struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
+ struct mem_cgroup_stat *stat = &mem_cont->stat;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
+ s64 val;
+
+ val = mem_cgroup_read_stat(stat, i);
+ val *= mem_cgroup_stat_desc[i].unit;
+ seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
+ (long long)val);
+ }
+ /* showing # of active pages */
+ {
+ unsigned long active, inactive;
+
+ inactive = mem_cgroup_get_all_zonestat(mem_cont,
+ MEM_CGROUP_ZSTAT_INACTIVE);
+ active = mem_cgroup_get_all_zonestat(mem_cont,
+ MEM_CGROUP_ZSTAT_ACTIVE);
+ seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
+ seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+ }
+ return 0;
+}
+
+static const struct file_operations mem_control_stat_file_operations = {
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int mem_control_stat_open(struct inode *unused, struct file *file)
+{
+ /* XXX __d_cont */
+ struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
+
+ file->f_op = &mem_control_stat_file_operations;
+ return single_open(file, mem_control_stat_show, cont);
+}
+
+
+
+static struct cftype mem_cgroup_files[] = {
+ {
+ .name = "usage_in_bytes",
+ .private = RES_USAGE,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "limit_in_bytes",
+ .private = RES_LIMIT,
+ .write = mem_cgroup_write,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "failcnt",
+ .private = RES_FAILCNT,
+ .read = mem_cgroup_read,
+ },
+ {
+ .name = "force_empty",
+ .write = mem_force_empty_write,
+ .read = mem_force_empty_read,
+ },
+ {
+ .name = "stat",
+ .open = mem_control_stat_open,
+ },
+};
+
+static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+{
+ struct mem_cgroup_per_node *pn;
+ struct mem_cgroup_per_zone *mz;
+ int zone;
+ /*
+ * This routine is called against possible nodes.
+ * But it is a BUG to call kmalloc() against an offline node.
+ *
+ * TODO: this routine can waste much memory for nodes which will
+ * never be onlined. It's better to use memory hotplug callback
+ * function.
+ */
+ if (node_state(node, N_HIGH_MEMORY))
+ pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, node);
+ else
+ pn = kmalloc(sizeof(*pn), GFP_KERNEL);
+ if (!pn)
+ return 1;
+
+ mem->info.nodeinfo[node] = pn;
+ memset(pn, 0, sizeof(*pn));
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ mz = &pn->zoneinfo[zone];
+ INIT_LIST_HEAD(&mz->active_list);
+ INIT_LIST_HEAD(&mz->inactive_list);
+ spin_lock_init(&mz->lru_lock);
+ }
+ return 0;
+}
+
+static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
+{
+ kfree(mem->info.nodeinfo[node]);
+}
+
+
+static struct mem_cgroup init_mem_cgroup;
+
+static struct cgroup_subsys_state *
+mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct mem_cgroup *mem;
+ int node;
+
+ if (unlikely((cont->parent) == NULL)) {
+ mem = &init_mem_cgroup;
+ init_mm.mem_cgroup = mem;
+ } else
+ mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
+
+ if (mem == NULL)
+ return NULL;
+
+ res_counter_init(&mem->res);
+
+ memset(&mem->info, 0, sizeof(mem->info));
+
+ for_each_node_state(node, N_POSSIBLE)
+ if (alloc_mem_cgroup_per_zone_info(mem, node))
+ goto free_out;
+
+ return &mem->css;
+free_out:
+ for_each_node_state(node, N_POSSIBLE)
+ free_mem_cgroup_per_zone_info(mem, node);
+ if (cont->parent != NULL)
+ kfree(mem);
+ return NULL;
+}
+
+static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ mem_cgroup_force_empty(mem);
+}
+
+static void mem_cgroup_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ int node;
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+
+ for_each_node_state(node, N_POSSIBLE)
+ free_mem_cgroup_per_zone_info(mem, node);
+
+ kfree(mem_cgroup_from_cont(cont));
+}
+
+static int mem_cgroup_populate(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ return cgroup_add_files(cont, ss, mem_cgroup_files,
+ ARRAY_SIZE(mem_cgroup_files));
+}
+
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+ struct cgroup *cont,
+ struct cgroup *old_cont,
+ struct task_struct *p)
+{
+ struct mm_struct *mm;
+ struct mem_cgroup *mem, *old_mem;
+
+ mm = get_task_mm(p);
+ if (mm == NULL)
+ return;
+
+ mem = mem_cgroup_from_cont(cont);
+ old_mem = mem_cgroup_from_cont(old_cont);
+
+ if (mem == old_mem)
+ goto out;
+
+ /*
+ * Only thread group leaders are allowed to migrate; the mm_struct is
+ * in effect owned by the leader.
+ */
+ if (p->tgid != p->pid)
+ goto out;
+
+ css_get(&mem->css);
+ rcu_assign_pointer(mm->mem_cgroup, mem);
+ css_put(&old_mem->css);
+
+out:
+ mmput(mm);
+ return;
+}
+
+struct cgroup_subsys mem_cgroup_subsys = {
+ .name = "memory",
+ .subsys_id = mem_cgroup_subsys_id,
+ .create = mem_cgroup_create,
+ .pre_destroy = mem_cgroup_pre_destroy,
+ .destroy = mem_cgroup_destroy,
+ .populate = mem_cgroup_populate,
+ .attach = mem_cgroup_move_task,
+ .early_init = 0,
+};
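[Editor's note, not part of the patch: the page_cgroup lookup above leans on one trick worth spelling out -- because a page_cgroup is at least two-byte aligned, bit 0 of page->page_cgroup is free and doubles as the bit spinlock, so page_assign_page_cgroup() must preserve that bit while swapping the pointer. A standalone sketch of the encoding; the names are illustrative, and the real code takes the lock with bit_spin_lock() on PAGE_CGROUP_LOCK_BIT:

/* Illustration only -- mirrors page_assign_page_cgroup()/page_get_page_cgroup(). */
#define EXAMPLE_LOCK	0x1UL		/* same role as PAGE_CGROUP_LOCK */

struct page_cgroup;			/* only the pointer value matters here */

static unsigned long example_assign(unsigned long word, struct page_cgroup *pc)
{
	/* keep the lock bit, replace the pointer bits */
	return (unsigned long)pc | (word & EXAMPLE_LOCK);
}

static struct page_cgroup *example_get(unsigned long word)
{
	/* mask off the lock bit to recover the pointer */
	return (struct page_cgroup *)(word & ~EXAMPLE_LOCK);
}
]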
diff --git a/mm/memory.c b/mm/memory.c
index 7bb70728bb52..ce3c9e4492d8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/writeback.h>
+#include <linux/memcontrol.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
@@ -82,7 +83,18 @@ void * high_memory;
EXPORT_SYMBOL(num_physpages);
EXPORT_SYMBOL(high_memory);
-int randomize_va_space __read_mostly = 1;
+/*
+ * Randomize the address space (stacks, mmaps, brk, etc.).
+ *
+ * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
+ * as ancient (libc5 based) binaries can segfault. )
+ */
+int randomize_va_space __read_mostly =
+#ifdef CONFIG_COMPAT_BRK
+ 1;
+#else
+ 2;
+#endif
static int __init disable_randmaps(char *s)
{
@@ -122,11 +134,9 @@ void pmd_clear_bad(pmd_t *pmd)
*/
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
- struct page *page = pmd_page(*pmd);
+ pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
- pte_lock_deinit(page);
- pte_free_tlb(tlb, page);
- dec_zone_page_state(page, NR_PAGETABLE);
+ pte_free_tlb(tlb, token);
tlb->mm->nr_ptes--;
}
@@ -297,21 +307,19 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
{
- struct page *new = pte_alloc_one(mm, address);
+ pgtable_t new = pte_alloc_one(mm, address);
if (!new)
return -ENOMEM;
- pte_lock_init(new);
spin_lock(&mm->page_table_lock);
- if (pmd_present(*pmd)) { /* Another has populated it */
- pte_lock_deinit(new);
- pte_free(mm, new);
- } else {
+ if (!pmd_present(*pmd)) { /* Has another populated it ? */
mm->nr_ptes++;
- inc_zone_page_state(new, NR_PAGETABLE);
pmd_populate(mm, pmd, new);
+ new = NULL;
}
spin_unlock(&mm->page_table_lock);
+ if (new)
+ pte_free(mm, new);
return 0;
}
@@ -322,11 +330,13 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
return -ENOMEM;
spin_lock(&init_mm.page_table_lock);
- if (pmd_present(*pmd)) /* Another has populated it */
- pte_free_kernel(&init_mm, new);
- else
+ if (!pmd_present(*pmd)) { /* Has another populated it ? */
pmd_populate_kernel(&init_mm, pmd, new);
+ new = NULL;
+ }
spin_unlock(&init_mm.page_table_lock);
+ if (new)
+ pte_free_kernel(&init_mm, new);
return 0;
}
@@ -979,6 +989,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
int i;
unsigned int vm_flags;
+ if (len <= 0)
+ return 0;
/*
* Require read or write permissions.
* If 'force' is set, we only require the "MAY" flags.
@@ -1133,16 +1145,20 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
{
int retval;
pte_t *pte;
- spinlock_t *ptl;
+ spinlock_t *ptl;
+
+ retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
+ if (retval)
+ goto out;
retval = -EINVAL;
if (PageAnon(page))
- goto out;
+ goto out_uncharge;
retval = -ENOMEM;
flush_dcache_page(page);
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto out;
+ goto out_uncharge;
retval = -EBUSY;
if (!pte_none(*pte))
goto out_unlock;
@@ -1154,8 +1170,12 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
set_pte_at(mm, addr, pte, mk_pte(page, prot));
retval = 0;
+ pte_unmap_unlock(pte, ptl);
+ return retval;
out_unlock:
pte_unmap_unlock(pte, ptl);
+out_uncharge:
+ mem_cgroup_uncharge_page(page);
out:
return retval;
}
@@ -1370,7 +1390,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
{
pte_t *pte;
int err;
- struct page *pmd_page;
+ pgtable_t token;
spinlock_t *uninitialized_var(ptl);
pte = (mm == &init_mm) ?
@@ -1381,10 +1401,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
BUG_ON(pmd_huge(*pmd));
- pmd_page = pmd_page(*pmd);
+ token = pmd_pgtable(*pmd);
do {
- err = fn(pte, pmd_page, addr, data);
+ err = fn(pte, token, addr, data);
if (err)
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1630,6 +1650,9 @@ gotten:
cow_user_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
+ if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+ goto oom_free_new;
+
/*
* Re-check the pte - we dropped the lock
*/
@@ -1661,7 +1684,9 @@ gotten:
/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
- }
+ } else
+ mem_cgroup_uncharge_page(new_page);
+
if (new_page)
page_cache_release(new_page);
if (old_page)
@@ -1685,6 +1710,8 @@ unlock:
put_page(dirty_page);
}
return ret;
+oom_free_new:
+ __free_page(new_page);
oom:
if (old_page)
page_cache_release(old_page);
@@ -2025,6 +2052,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(PGMAJFAULT);
}
+ if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+ delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
mark_page_accessed(page);
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2062,8 +2095,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (write_access) {
/* XXX: We could OR the do_wp_page code with this one? */
if (do_wp_page(mm, vma, address,
- page_table, pmd, ptl, pte) & VM_FAULT_OOM)
+ page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
+ mem_cgroup_uncharge_page(page);
ret = VM_FAULT_OOM;
+ }
goto out;
}
@@ -2074,6 +2109,7 @@ unlock:
out:
return ret;
out_nomap:
+ mem_cgroup_uncharge_page(page);
pte_unmap_unlock(page_table, ptl);
unlock_page(page);
page_cache_release(page);
@@ -2103,6 +2139,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto oom;
__SetPageUptodate(page);
+ if (mem_cgroup_charge(page, mm, GFP_KERNEL))
+ goto oom_free_page;
+
entry = mk_pte(page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2120,8 +2159,11 @@ unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
+ mem_cgroup_uncharge_page(page);
page_cache_release(page);
goto unlock;
+oom_free_page:
+ __free_page(page);
oom:
return VM_FAULT_OOM;
}
@@ -2235,6 +2277,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
+ if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
+
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
/*
@@ -2270,6 +2317,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/* no need to invalidate: a not-present page won't be cached */
update_mmu_cache(vma, address, entry);
} else {
+ mem_cgroup_uncharge_page(page);
if (anon)
page_cache_release(page);
else
@@ -2663,6 +2711,13 @@ void print_vma_addr(char *prefix, unsigned long ip)
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+ /*
+ * Do not print if we are in atomic
+ * contexts (in exception stacks, etc.):
+ */
+ if (preempt_count())
+ return;
+
down_read(&mm->mmap_sem);
vma = find_vma(mm, ip);
if (vma && vma->vm_file) {
@@ -2671,7 +2726,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
if (buf) {
char *p, *s;
- p = d_path(f->f_dentry, f->f_vfsmnt, buf, PAGE_SIZE);
+ p = d_path(&f->f_path, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
s = strrchr(p, '/');
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 83c69f8a64c2..6c7ba1a63d23 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -116,22 +116,51 @@ static void mpol_rebind_policy(struct mempolicy *pol,
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
- int empty = nodes_empty(*nodes);
+ int was_empty, is_empty;
+
+ if (!nodes)
+ return 0;
+
+ /*
+ * "Contextualize" the in-coming nodemast for cpusets:
+ * Remember whether in-coming nodemask was empty, If not,
+ * restrict the nodes to the allowed nodes in the cpuset.
+ * This is guaranteed to be a subset of nodes with memory.
+ */
+ cpuset_update_task_memory_state();
+ is_empty = was_empty = nodes_empty(*nodes);
+ if (!was_empty) {
+ nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
+ is_empty = nodes_empty(*nodes); /* after "contextualization" */
+ }
switch (mode) {
case MPOL_DEFAULT:
- if (!empty)
+ /*
+ * require caller to specify an empty nodemask
+ * before "contextualization"
+ */
+ if (!was_empty)
return -EINVAL;
break;
case MPOL_BIND:
case MPOL_INTERLEAVE:
- /* Preferred will only use the first bit, but allow
- more for now. */
- if (empty)
+ /*
+ * require at least 1 valid node after "contextualization"
+ */
+ if (is_empty)
+ return -EINVAL;
+ break;
+ case MPOL_PREFERRED:
+ /*
+ * Did caller specify invalid nodes?
+ * Don't silently accept this as "local allocation".
+ */
+ if (!was_empty && is_empty)
return -EINVAL;
break;
}
- return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
+ return 0;
}
/* Generate a custom zonelist for the BIND policy. */
@@ -188,8 +217,6 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
switch (mode) {
case MPOL_INTERLEAVE:
policy->v.nodes = *nodes;
- nodes_and(policy->v.nodes, policy->v.nodes,
- node_states[N_HIGH_MEMORY]);
if (nodes_weight(policy->v.nodes) == 0) {
kmem_cache_free(policy_cache, policy);
return ERR_PTR(-EINVAL);
@@ -421,18 +448,6 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
return err;
}
-static int contextualize_policy(int mode, nodemask_t *nodes)
-{
- if (!nodes)
- return 0;
-
- cpuset_update_task_memory_state();
- if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
- return -EINVAL;
- return mpol_check_policy(mode, nodes);
-}
-
-
/*
* Update task->flags PF_MEMPOLICY bit: set iff non-default
* mempolicy. Allows more rapid checking of this (combined perhaps
@@ -468,7 +483,7 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
{
struct mempolicy *new;
- if (contextualize_policy(mode, nodes))
+ if (mpol_check_policy(mode, nodes))
return -EINVAL;
new = mpol_new(mode, nodes);
if (IS_ERR(new))
@@ -915,10 +930,6 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
-#ifdef CONFIG_CPUSETS
- /* Restrict the nodes to the allowed nodes in the cpuset */
- nodes_and(nodes, nodes, current->mems_allowed);
-#endif
return do_mbind(start, len, mode, &nodes, flags);
}
@@ -1985,7 +1996,7 @@ int show_numa_map(struct seq_file *m, void *v)
if (file) {
seq_printf(m, " file=");
- seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
+ seq_path(m, &file->f_path, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_printf(m, " heap");
} else if (vma->vm_start <= mm->start_stack &&
diff --git a/mm/migrate.c b/mm/migrate.c
index 857a987e3690..a73504ff5ab9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -29,6 +29,7 @@
#include <linux/mempolicy.h>
#include <linux/vmalloc.h>
#include <linux/security.h>
+#include <linux/memcontrol.h>
#include "internal.h"
@@ -152,6 +153,11 @@ static void remove_migration_pte(struct vm_area_struct *vma,
return;
}
+ if (mem_cgroup_charge(new, mm, GFP_KERNEL)) {
+ pte_unmap(ptep);
+ return;
+ }
+
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
pte = *ptep;
@@ -587,9 +593,10 @@ static int move_to_new_page(struct page *newpage, struct page *page)
else
rc = fallback_migrate_page(mapping, newpage, page);
- if (!rc)
+ if (!rc) {
+ mem_cgroup_page_migration(page, newpage);
remove_migration_ptes(page, newpage);
- else
+ } else
newpage->mapping = NULL;
unlock_page(newpage);
@@ -608,6 +615,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
int *result = NULL;
struct page *newpage = get_new_page(page, private, &result);
int rcu_locked = 0;
+ int charge = 0;
if (!newpage)
return -ENOMEM;
@@ -667,14 +675,19 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
goto rcu_unlock;
}
+ charge = mem_cgroup_prepare_migration(page);
/* Establish migration ptes or remove ptes */
try_to_unmap(page, 1);
if (!page_mapped(page))
rc = move_to_new_page(newpage, page);
- if (rc)
+ if (rc) {
remove_migration_ptes(page, page);
+ if (charge)
+ mem_cgroup_end_migration(page);
+ } else if (charge)
+ mem_cgroup_end_migration(newpage);
rcu_unlock:
if (rcu_locked)
rcu_read_unlock();
diff --git a/mm/mmap.c b/mm/mmap.c
index bb4c963cc534..a32d28ce31cd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -245,7 +245,7 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
down_write(&mm->mmap_sem);
- if (brk < mm->end_code)
+ if (brk < mm->start_brk)
goto out;
/*
@@ -2165,24 +2165,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages)
}
-static struct page *special_mapping_nopage(struct vm_area_struct *vma,
- unsigned long address, int *type)
+static int special_mapping_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
{
+ pgoff_t pgoff;
struct page **pages;
- BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ /*
+ * special mappings have no vm_file, and in that case, the mm
+ * uses vm_pgoff internally. So we have to subtract it from here.
+ * We are allowed to do this because we are the mm; do not copy
+ * this code into drivers!
+ */
+ pgoff = vmf->pgoff - vma->vm_pgoff;
- address -= vma->vm_start;
- for (pages = vma->vm_private_data; address > 0 && *pages; ++pages)
- address -= PAGE_SIZE;
+ for (pages = vma->vm_private_data; pgoff && *pages; ++pages)
+ pgoff--;
if (*pages) {
struct page *page = *pages;
get_page(page);
- return page;
+ vmf->page = page;
+ return 0;
}
- return NOPAGE_SIGBUS;
+ return VM_FAULT_SIGBUS;
}
/*
@@ -2194,7 +2201,7 @@ static void special_mapping_close(struct vm_area_struct *vma)
static struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
- .nopage = special_mapping_nopage,
+ .fault = special_mapping_fault,
};
/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c1850bf991cd..4194b9db0104 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -25,9 +25,11 @@
#include <linux/cpuset.h>
#include <linux/module.h>
#include <linux/notifier.h>
+#include <linux/memcontrol.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
+int sysctl_oom_dump_tasks;
static DEFINE_SPINLOCK(zone_scan_mutex);
/* #define DEBUG */
@@ -50,7 +52,8 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
* of least surprise ... (be careful when you change it)
*/
-unsigned long badness(struct task_struct *p, unsigned long uptime)
+unsigned long badness(struct task_struct *p, unsigned long uptime,
+ struct mem_cgroup *mem)
{
unsigned long points, cpu_time, run_time, s;
struct mm_struct *mm;
@@ -193,7 +196,8 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
*
* (not docbooked, we don't want this one cluttering up the manual)
*/
-static struct task_struct *select_bad_process(unsigned long *ppoints)
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+ struct mem_cgroup *mem)
{
struct task_struct *g, *p;
struct task_struct *chosen = NULL;
@@ -213,6 +217,8 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
/* skip the init task */
if (is_global_init(p))
continue;
+ if (mem && !task_in_mem_cgroup(p, mem))
+ continue;
/*
* This task already has access to memory reserves and is
@@ -247,7 +253,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
if (p->oomkilladj == OOM_DISABLE)
continue;
- points = badness(p, uptime.tv_sec);
+ points = badness(p, uptime.tv_sec, mem);
if (points > *ppoints || !chosen) {
chosen = p;
*ppoints = points;
@@ -258,6 +264,41 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
}
/**
+ * Dumps the current memory state of all system tasks, excluding kernel threads.
+ * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
+ * score, and name.
+ *
+ * If the mem_cgroup argument is non-NULL, only tasks that are members of that
+ * mem_cgroup are shown.
+ *
+ * Call with tasklist_lock read-locked.
+ */
+static void dump_tasks(const struct mem_cgroup *mem)
+{
+ struct task_struct *g, *p;
+
+ printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
+ "name\n");
+ do_each_thread(g, p) {
+ /*
+ * total_vm and rss sizes do not exist for tasks with a
+ * detached mm so there's no need to report them.
+ */
+ if (!p->mm)
+ continue;
+ if (mem && !task_in_mem_cgroup(p, mem))
+ continue;
+
+ task_lock(p);
+ printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
+ p->pid, p->uid, p->tgid, p->mm->total_vm,
+ get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj,
+ p->comm);
+ task_unlock(p);
+ } while_each_thread(g, p);
+}
+
+/**
* Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
* flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
* set.
@@ -334,7 +375,8 @@ static int oom_kill_task(struct task_struct *p)
}
static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
- unsigned long points, const char *message)
+ unsigned long points, struct mem_cgroup *mem,
+ const char *message)
{
struct task_struct *c;
@@ -344,6 +386,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
current->comm, gfp_mask, order, current->oomkilladj);
dump_stack();
show_mem();
+ if (sysctl_oom_dump_tasks)
+ dump_tasks(mem);
}
/*
@@ -368,6 +412,31 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
return oom_kill_task(p);
}
+#ifdef CONFIG_CGROUP_MEM_CONT
+void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
+{
+ unsigned long points = 0;
+ struct task_struct *p;
+
+ cgroup_lock();
+ rcu_read_lock();
+retry:
+ p = select_bad_process(&points, mem);
+ if (PTR_ERR(p) == -1UL)
+ goto out;
+
+ if (!p)
+ p = current;
+
+ if (oom_kill_process(p, gfp_mask, 0, points, mem,
+ "Memory cgroup out of memory"))
+ goto retry;
+out:
+ rcu_read_unlock();
+ cgroup_unlock();
+}
+#endif
+
static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
int register_oom_notifier(struct notifier_block *nb)
@@ -465,7 +534,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
switch (constraint) {
case CONSTRAINT_MEMORY_POLICY:
- oom_kill_process(current, gfp_mask, order, points,
+ oom_kill_process(current, gfp_mask, order, points, NULL,
"No available memory (MPOL_BIND)");
break;
@@ -475,7 +544,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
/* Fall-through */
case CONSTRAINT_CPUSET:
if (sysctl_oom_kill_allocating_task) {
- oom_kill_process(current, gfp_mask, order, points,
+ oom_kill_process(current, gfp_mask, order, points, NULL,
"Out of memory (oom_kill_allocating_task)");
break;
}
@@ -484,7 +553,7 @@ retry:
* Rambo mode: Shoot down a process and hope it solves whatever
* issues we may have.
*/
- p = select_bad_process(&points);
+ p = select_bad_process(&points, NULL);
if (PTR_ERR(p) == -1UL)
goto out;
@@ -495,7 +564,7 @@ retry:
panic("Out of memory and no killable processes...\n");
}
- if (oom_kill_process(p, gfp_mask, order, points,
+ if (oom_kill_process(p, gfp_mask, order, points, NULL,
"Out of memory"))
goto retry;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 37576b822f06..75b979313346 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -987,6 +988,7 @@ static void free_hot_cold_page(struct page *page, int cold)
if (!PageHighMem(page))
debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+ VM_BUG_ON(page_get_page_cgroup(page));
arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
@@ -1449,7 +1451,7 @@ try_next_zone:
/*
* This is the 'heart' of the zoned buddy allocator.
*/
-struct page * fastcall
+struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
@@ -2525,6 +2527,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
set_page_links(page, zone, nid, pfn);
init_page_count(page);
reset_page_mapcount(page);
+ page_assign_page_cgroup(page, NULL);
SetPageReserved(page);
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 57ad276900c9..8fd527c4e2bf 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -48,6 +48,7 @@
#include <linux/rcupdate.h>
#include <linux/module.h>
#include <linux/kallsyms.h>
+#include <linux/memcontrol.h>
#include <asm/tlbflush.h>
@@ -301,7 +302,8 @@ out:
return referenced;
}
-static int page_referenced_anon(struct page *page)
+static int page_referenced_anon(struct page *page,
+ struct mem_cgroup *mem_cont)
{
unsigned int mapcount;
struct anon_vma *anon_vma;
@@ -314,6 +316,13 @@ static int page_referenced_anon(struct page *page)
mapcount = page_mapcount(page);
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+ /*
+		 * If we are reclaiming on behalf of a cgroup, skip
+		 * counting references coming from other cgroups
+ */
+ if (mem_cont && !vm_match_cgroup(vma->vm_mm, mem_cont))
+ continue;
referenced += page_referenced_one(page, vma, &mapcount);
if (!mapcount)
break;
@@ -334,7 +343,8 @@ static int page_referenced_anon(struct page *page)
*
* This function is only called from page_referenced for object-based pages.
*/
-static int page_referenced_file(struct page *page)
+static int page_referenced_file(struct page *page,
+ struct mem_cgroup *mem_cont)
{
unsigned int mapcount;
struct address_space *mapping = page->mapping;
@@ -367,6 +377,13 @@ static int page_referenced_file(struct page *page)
mapcount = page_mapcount(page);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+ /*
+		 * If we are reclaiming on behalf of a cgroup, skip
+		 * counting references coming from other cgroups
+ */
+ if (mem_cont && !vm_match_cgroup(vma->vm_mm, mem_cont))
+ continue;
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
== (VM_LOCKED|VM_MAYSHARE)) {
referenced++;
@@ -389,7 +406,8 @@ static int page_referenced_file(struct page *page)
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
*/
-int page_referenced(struct page *page, int is_locked)
+int page_referenced(struct page *page, int is_locked,
+ struct mem_cgroup *mem_cont)
{
int referenced = 0;
@@ -401,14 +419,15 @@ int page_referenced(struct page *page, int is_locked)
if (page_mapped(page) && page->mapping) {
if (PageAnon(page))
- referenced += page_referenced_anon(page);
+ referenced += page_referenced_anon(page, mem_cont);
else if (is_locked)
- referenced += page_referenced_file(page);
+ referenced += page_referenced_file(page, mem_cont);
else if (TestSetPageLocked(page))
referenced++;
else {
if (page->mapping)
- referenced += page_referenced_file(page);
+ referenced +=
+ page_referenced_file(page, mem_cont);
unlock_page(page);
}
}
@@ -554,8 +573,14 @@ void page_add_anon_rmap(struct page *page,
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
if (atomic_inc_and_test(&page->_mapcount))
__page_set_anon_rmap(page, vma, address);
- else
+ else {
__page_check_anon_rmap(page, vma, address);
+ /*
+		 * We unconditionally charged during prepare; we uncharge here.
+		 * This takes care of balancing the reference counts.
+ */
+ mem_cgroup_uncharge_page(page);
+ }
}
/*
@@ -586,6 +611,12 @@ void page_add_file_rmap(struct page *page)
{
if (atomic_inc_and_test(&page->_mapcount))
__inc_zone_page_state(page, NR_FILE_MAPPED);
+ else
+ /*
+		 * We unconditionally charged during prepare; we uncharge here.
+		 * This takes care of balancing the reference counts.
+ */
+ mem_cgroup_uncharge_page(page);
}
#ifdef CONFIG_DEBUG_VM
@@ -646,6 +677,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
page_clear_dirty(page);
set_page_dirty(page);
}
+ mem_cgroup_uncharge_page(page);
+
__dec_zone_page_state(page,
PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 0f246c44a574..90b576cbc06e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,6 +49,7 @@
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
+#include <linux/seq_file.h>
#include <asm/uaccess.h>
#include <asm/div64.h>
@@ -84,6 +85,18 @@ enum sgp_type {
SGP_WRITE, /* may exceed i_size, may allocate page */
};
+#ifdef CONFIG_TMPFS
+static unsigned long shmem_default_max_blocks(void)
+{
+ return totalram_pages / 2;
+}
+
+static unsigned long shmem_default_max_inodes(void)
+{
+ return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+}
+#endif
+
static int shmem_getpage(struct inode *inode, unsigned long idx,
struct page **pagep, enum sgp_type sgp, int *type);
@@ -912,9 +925,13 @@ found:
error = 1;
if (!inode)
goto out;
- error = radix_tree_preload(GFP_KERNEL);
+ /* Precharge page while we can wait, compensate afterwards */
+ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
if (error)
goto out;
+ error = radix_tree_preload(GFP_KERNEL);
+ if (error)
+ goto uncharge;
error = 1;
spin_lock(&info->lock);
@@ -947,6 +964,8 @@ found:
shmem_swp_unmap(ptr);
spin_unlock(&info->lock);
radix_tree_preload_end();
+uncharge:
+ mem_cgroup_uncharge_page(page);
out:
unlock_page(page);
page_cache_release(page);
@@ -1062,7 +1081,8 @@ redirty:
}
#ifdef CONFIG_NUMA
-static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+#ifdef CONFIG_TMPFS
+static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
{
char *nodelist = strchr(value, ':');
int err = 1;
@@ -1111,6 +1131,42 @@ out:
return err;
}
+static void shmem_show_mpol(struct seq_file *seq, int policy,
+ const nodemask_t policy_nodes)
+{
+ char *policy_string;
+
+ switch (policy) {
+ case MPOL_PREFERRED:
+ policy_string = "prefer";
+ break;
+ case MPOL_BIND:
+ policy_string = "bind";
+ break;
+ case MPOL_INTERLEAVE:
+ policy_string = "interleave";
+ break;
+ default:
+ /* MPOL_DEFAULT */
+ return;
+ }
+
+ seq_printf(seq, ",mpol=%s", policy_string);
+
+ if (policy != MPOL_INTERLEAVE ||
+ !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) {
+ char buffer[64];
+ int len;
+
+ len = nodelist_scnprintf(buffer, sizeof(buffer), policy_nodes);
+ if (len < sizeof(buffer))
+ seq_printf(seq, ":%s", buffer);
+ else
+ seq_printf(seq, ":?");
+ }
+}
+#endif /* CONFIG_TMPFS */
+
static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
struct shmem_inode_info *info, unsigned long idx)
{
@@ -1142,13 +1198,20 @@ static struct page *shmem_alloc_page(gfp_t gfp,
mpol_free(pvma.vm_policy);
return page;
}
-#else
+#else /* !CONFIG_NUMA */
+#ifdef CONFIG_TMPFS
static inline int shmem_parse_mpol(char *value, int *policy,
nodemask_t *policy_nodes)
{
return 1;
}
+static inline void shmem_show_mpol(struct seq_file *seq, int policy,
+ const nodemask_t policy_nodes)
+{
+}
+#endif /* CONFIG_TMPFS */
+
static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
struct shmem_inode_info *info, unsigned long idx)
{
@@ -1160,7 +1223,7 @@ static inline struct page *shmem_alloc_page(gfp_t gfp,
{
return alloc_page(gfp);
}
-#endif
+#endif /* CONFIG_NUMA */
/*
* shmem_getpage - either get the page from swap or allocate a new one
@@ -1308,6 +1371,13 @@ repeat:
spin_unlock(&info->lock);
unlock_page(swappage);
page_cache_release(swappage);
+ if (error == -ENOMEM) {
+ /* allow reclaim from this memory cgroup */
+ error = mem_cgroup_cache_charge(NULL,
+ current->mm, gfp & ~__GFP_HIGHMEM);
+ if (error)
+ goto failed;
+ }
goto repeat;
}
} else if (sgp == SGP_READ && !filepage) {
@@ -1353,6 +1423,17 @@ repeat:
goto failed;
}
+ /* Precharge page while we can wait, compensate after */
+ error = mem_cgroup_cache_charge(filepage, current->mm,
+ gfp & ~__GFP_HIGHMEM);
+ if (error) {
+ page_cache_release(filepage);
+ shmem_unacct_blocks(info->flags, 1);
+ shmem_free_blocks(inode, 1);
+ filepage = NULL;
+ goto failed;
+ }
+
spin_lock(&info->lock);
entry = shmem_swp_alloc(info, idx, sgp);
if (IS_ERR(entry))
@@ -1364,6 +1445,7 @@ repeat:
if (error || swap.val || 0 != add_to_page_cache_lru(
filepage, mapping, idx, GFP_NOWAIT)) {
spin_unlock(&info->lock);
+ mem_cgroup_uncharge_page(filepage);
page_cache_release(filepage);
shmem_unacct_blocks(info->flags, 1);
shmem_free_blocks(inode, 1);
@@ -1372,6 +1454,7 @@ repeat:
goto failed;
goto repeat;
}
+ mem_cgroup_uncharge_page(filepage);
info->flags |= SHMEM_PAGEIN;
}
@@ -2051,9 +2134,8 @@ static const struct export_operations shmem_export_ops = {
.fh_to_dentry = shmem_fh_to_dentry,
};
-static int shmem_parse_options(char *options, int *mode, uid_t *uid,
- gid_t *gid, unsigned long *blocks, unsigned long *inodes,
- int *policy, nodemask_t *policy_nodes)
+static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
+ bool remount)
{
char *this_char, *value, *rest;
@@ -2096,35 +2178,37 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
}
if (*rest)
goto bad_val;
- *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
+ sbinfo->max_blocks =
+ DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
} else if (!strcmp(this_char,"nr_blocks")) {
- *blocks = memparse(value,&rest);
+ sbinfo->max_blocks = memparse(value, &rest);
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"nr_inodes")) {
- *inodes = memparse(value,&rest);
+ sbinfo->max_inodes = memparse(value, &rest);
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"mode")) {
- if (!mode)
+ if (remount)
continue;
- *mode = simple_strtoul(value,&rest,8);
+ sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"uid")) {
- if (!uid)
+ if (remount)
continue;
- *uid = simple_strtoul(value,&rest,0);
+ sbinfo->uid = simple_strtoul(value, &rest, 0);
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"gid")) {
- if (!gid)
+ if (remount)
continue;
- *gid = simple_strtoul(value,&rest,0);
+ sbinfo->gid = simple_strtoul(value, &rest, 0);
if (*rest)
goto bad_val;
} else if (!strcmp(this_char,"mpol")) {
- if (shmem_parse_mpol(value,policy,policy_nodes))
+ if (shmem_parse_mpol(value, &sbinfo->policy,
+ &sbinfo->policy_nodes))
goto bad_val;
} else {
printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2144,24 +2228,20 @@ bad_val:
static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- unsigned long max_blocks = sbinfo->max_blocks;
- unsigned long max_inodes = sbinfo->max_inodes;
- int policy = sbinfo->policy;
- nodemask_t policy_nodes = sbinfo->policy_nodes;
+ struct shmem_sb_info config = *sbinfo;
unsigned long blocks;
unsigned long inodes;
int error = -EINVAL;
- if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
- &max_inodes, &policy, &policy_nodes))
+ if (shmem_parse_options(data, &config, true))
return error;
spin_lock(&sbinfo->stat_lock);
blocks = sbinfo->max_blocks - sbinfo->free_blocks;
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (max_blocks < blocks)
+ if (config.max_blocks < blocks)
goto out;
- if (max_inodes < inodes)
+ if (config.max_inodes < inodes)
goto out;
/*
* Those tests also disallow limited->unlimited while any are in
@@ -2169,23 +2249,42 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
* but we must separately disallow unlimited->limited, because
* in that case we have no record of how much is already in use.
*/
- if (max_blocks && !sbinfo->max_blocks)
+ if (config.max_blocks && !sbinfo->max_blocks)
goto out;
- if (max_inodes && !sbinfo->max_inodes)
+ if (config.max_inodes && !sbinfo->max_inodes)
goto out;
error = 0;
- sbinfo->max_blocks = max_blocks;
- sbinfo->free_blocks = max_blocks - blocks;
- sbinfo->max_inodes = max_inodes;
- sbinfo->free_inodes = max_inodes - inodes;
- sbinfo->policy = policy;
- sbinfo->policy_nodes = policy_nodes;
+ sbinfo->max_blocks = config.max_blocks;
+ sbinfo->free_blocks = config.max_blocks - blocks;
+ sbinfo->max_inodes = config.max_inodes;
+ sbinfo->free_inodes = config.max_inodes - inodes;
+ sbinfo->policy = config.policy;
+ sbinfo->policy_nodes = config.policy_nodes;
out:
spin_unlock(&sbinfo->stat_lock);
return error;
}
-#endif
+
+static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+ struct shmem_sb_info *sbinfo = SHMEM_SB(vfs->mnt_sb);
+
+ if (sbinfo->max_blocks != shmem_default_max_blocks())
+ seq_printf(seq, ",size=%luk",
+ sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
+ if (sbinfo->max_inodes != shmem_default_max_inodes())
+ seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
+ if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
+ seq_printf(seq, ",mode=%03o", sbinfo->mode);
+ if (sbinfo->uid != 0)
+ seq_printf(seq, ",uid=%u", sbinfo->uid);
+ if (sbinfo->gid != 0)
+ seq_printf(seq, ",gid=%u", sbinfo->gid);
+ shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes);
+ return 0;
+}
+#endif /* CONFIG_TMPFS */
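
A brief usage illustration (option names come from shmem_parse_options() above; the mount point and sizes are made up): shmem_show_options() only emits options that differ from the defaults, so a tmpfs mounted with size=512m,nr_inodes=10k,uid=1000 would, assuming 4 KB pages, appear in /proc/mounts roughly as:

	tmpfs /mnt/scratch tmpfs rw,size=524288k,nr_inodes=10240,uid=1000 0 0
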
static void shmem_put_super(struct super_block *sb)
{
@@ -2198,15 +2297,23 @@ static int shmem_fill_super(struct super_block *sb,
{
struct inode *inode;
struct dentry *root;
- int mode = S_IRWXUGO | S_ISVTX;
- uid_t uid = current->fsuid;
- gid_t gid = current->fsgid;
- int err = -ENOMEM;
struct shmem_sb_info *sbinfo;
- unsigned long blocks = 0;
- unsigned long inodes = 0;
- int policy = MPOL_DEFAULT;
- nodemask_t policy_nodes = node_states[N_HIGH_MEMORY];
+ int err = -ENOMEM;
+
+ /* Round up to L1_CACHE_BYTES to resist false sharing */
+ sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
+ L1_CACHE_BYTES), GFP_KERNEL);
+ if (!sbinfo)
+ return -ENOMEM;
+
+ sbinfo->max_blocks = 0;
+ sbinfo->max_inodes = 0;
+ sbinfo->mode = S_IRWXUGO | S_ISVTX;
+ sbinfo->uid = current->fsuid;
+ sbinfo->gid = current->fsgid;
+ sbinfo->policy = MPOL_DEFAULT;
+ sbinfo->policy_nodes = node_states[N_HIGH_MEMORY];
+ sb->s_fs_info = sbinfo;
#ifdef CONFIG_TMPFS
/*
@@ -2215,34 +2322,22 @@ static int shmem_fill_super(struct super_block *sb,
* but the internal instance is left unlimited.
*/
if (!(sb->s_flags & MS_NOUSER)) {
- blocks = totalram_pages / 2;
- inodes = totalram_pages - totalhigh_pages;
- if (inodes > blocks)
- inodes = blocks;
- if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
- &inodes, &policy, &policy_nodes))
- return -EINVAL;
+ sbinfo->max_blocks = shmem_default_max_blocks();
+ sbinfo->max_inodes = shmem_default_max_inodes();
+ if (shmem_parse_options(data, sbinfo, false)) {
+ err = -EINVAL;
+ goto failed;
+ }
}
sb->s_export_op = &shmem_export_ops;
#else
sb->s_flags |= MS_NOUSER;
#endif
- /* Round up to L1_CACHE_BYTES to resist false sharing */
- sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info),
- L1_CACHE_BYTES), GFP_KERNEL);
- if (!sbinfo)
- return -ENOMEM;
-
spin_lock_init(&sbinfo->stat_lock);
- sbinfo->max_blocks = blocks;
- sbinfo->free_blocks = blocks;
- sbinfo->max_inodes = inodes;
- sbinfo->free_inodes = inodes;
- sbinfo->policy = policy;
- sbinfo->policy_nodes = policy_nodes;
+ sbinfo->free_blocks = sbinfo->max_blocks;
+ sbinfo->free_inodes = sbinfo->max_inodes;
- sb->s_fs_info = sbinfo;
sb->s_maxbytes = SHMEM_MAX_BYTES;
sb->s_blocksize = PAGE_CACHE_SIZE;
sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
@@ -2254,11 +2349,11 @@ static int shmem_fill_super(struct super_block *sb,
sb->s_flags |= MS_POSIXACL;
#endif
- inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
+ inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0);
if (!inode)
goto failed;
- inode->i_uid = uid;
- inode->i_gid = gid;
+ inode->i_uid = sbinfo->uid;
+ inode->i_gid = sbinfo->gid;
root = d_alloc_root(inode);
if (!root)
goto failed_iput;
@@ -2394,6 +2489,7 @@ static const struct super_operations shmem_ops = {
#ifdef CONFIG_TMPFS
.statfs = shmem_statfs,
.remount_fs = shmem_remount_fs,
+ .show_options = shmem_show_options,
#endif
.delete_inode = shmem_delete_inode,
.drop_inode = generic_delete_inode,
diff --git a/mm/slab.c b/mm/slab.c
index 40c00dacbe4b..473e6c2eaefb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2630,6 +2630,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
slabp->colouroff = colour_off;
slabp->s_mem = objp + colour_off;
slabp->nodeid = nodeid;
+ slabp->free = 0;
return slabp;
}
@@ -2683,7 +2684,6 @@ static void cache_init_objs(struct kmem_cache *cachep,
slab_bufctl(slabp)[i] = i + 1;
}
slab_bufctl(slabp)[i - 1] = BUFCTL_END;
- slabp->free = 0;
}
static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
@@ -2816,7 +2816,6 @@ static int cache_grow(struct kmem_cache *cachep,
if (!slabp)
goto opps1;
- slabp->nodeid = nodeid;
slab_map_pages(cachep, slabp, objp);
cache_init_objs(cachep, slabp);
diff --git a/mm/slub.c b/mm/slub.c
index 3f056677fa8f..4b3895cb90ee 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -149,6 +149,13 @@ static inline void ClearSlabDebug(struct page *page)
/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST
+/*
+ * Currently fastpath is not supported if preemption is enabled.
+ */
+#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT)
+#define SLUB_FASTPATH
+#endif
+
#if PAGE_SHIFT <= 12
/*
@@ -204,6 +211,8 @@ static inline void ClearSlabDebug(struct page *page)
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000 /* Poison object */
#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
+#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */
+#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */
/* Not all arches define cache_line_size */
#ifndef cache_line_size
@@ -243,6 +252,7 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
+
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
@@ -251,8 +261,16 @@ static inline void sysfs_slab_remove(struct kmem_cache *s)
{
kfree(s);
}
+
#endif
+static inline void stat(struct kmem_cache_cpu *c, enum stat_item si)
+{
+#ifdef CONFIG_SLUB_STATS
+ c->stat[si]++;
+#endif
+}
+
/********************************************************************
* Core slab cache functions
*******************************************************************/
@@ -280,15 +298,32 @@ static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
#endif
}
+/*
+ * The end pointer in a slab is special. It points to the first object in the
+ * slab but has bit 0 set to mark it.
+ *
+ * Note that SLUB relies on page_mapping returning NULL for pages that have
+ * bit 0 set in their mapping.
+ */
+static inline int is_end(void *addr)
+{
+ return (unsigned long)addr & PAGE_MAPPING_ANON;
+}
+
+static void *slab_address(struct page *page)
+{
+ return page->end - PAGE_MAPPING_ANON;
+}
+
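
A minimal sketch of how the tagged end pointer round-trips, assuming PAGE_MAPPING_ANON is bit 0 (value 1) as the comment above relies on; page->end itself is initialized in new_slab() further down in this patch (illustration only, not part of the patch):

	void *base = page_address(page);	/* real start of the slab */

	page->end = base + 1;			/* bit 0 tags the sentinel */
	BUG_ON(slab_address(page) != base);	/* strips the tag again */
	BUG_ON(!is_end(page->end));		/* recognized as the list terminator */
	BUG_ON(is_end(base));			/* ordinary, aligned objects are untagged */
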
static inline int check_valid_pointer(struct kmem_cache *s,
struct page *page, const void *object)
{
void *base;
- if (!object)
+ if (object == page->end)
return 1;
- base = page_address(page);
+ base = slab_address(page);
if (object < base || object >= base + s->objects * s->size ||
(object - base) % s->size) {
return 0;
@@ -321,7 +356,8 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
/* Scan freelist */
#define for_each_free_object(__p, __s, __free) \
- for (__p = (__free); __p; __p = get_freepointer((__s), __p))
+ for (__p = (__free); (__p) != page->end; __p = get_freepointer((__s),\
+ __p))
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
@@ -473,7 +509,7 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
- u8 *addr = page_address(page);
+ u8 *addr = slab_address(page);
print_tracking(s, p);
@@ -651,7 +687,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!(s->flags & SLAB_POISON))
return 1;
- start = page_address(page);
+ start = slab_address(page);
end = start + (PAGE_SIZE << s->order);
length = s->objects * s->size;
remainder = end - (start + length);
@@ -685,9 +721,10 @@ static int check_object(struct kmem_cache *s, struct page *page,
endobject, red, s->inuse - s->objsize))
return 0;
} else {
- if ((s->flags & SLAB_POISON) && s->objsize < s->inuse)
- check_bytes_and_report(s, page, p, "Alignment padding", endobject,
- POISON_INUSE, s->inuse - s->objsize);
+ if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
+ check_bytes_and_report(s, page, p, "Alignment padding",
+ endobject, POISON_INUSE, s->inuse - s->objsize);
+ }
}
if (s->flags & SLAB_POISON) {
@@ -718,7 +755,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
* of the free objects in this slab. May cause
* another error because the object count is now wrong.
*/
- set_freepointer(s, p, NULL);
+ set_freepointer(s, p, page->end);
return 0;
}
return 1;
@@ -752,18 +789,18 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
void *fp = page->freelist;
void *object = NULL;
- while (fp && nr <= s->objects) {
+ while (fp != page->end && nr <= s->objects) {
if (fp == search)
return 1;
if (!check_valid_pointer(s, page, fp)) {
if (object) {
object_err(s, page, object,
"Freechain corrupt");
- set_freepointer(s, object, NULL);
+ set_freepointer(s, object, page->end);
break;
} else {
slab_err(s, page, "Freepointer corrupt");
- page->freelist = NULL;
+ page->freelist = page->end;
page->inuse = s->objects;
slab_fix(s, "Freelist cleared");
return 0;
@@ -869,7 +906,7 @@ bad:
*/
slab_fix(s, "Marking all objects used");
page->inuse = s->objects;
- page->freelist = NULL;
+ page->freelist = page->end;
}
return 0;
}
@@ -894,11 +931,10 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
return 0;
if (unlikely(s != page->slab)) {
- if (!PageSlab(page))
+ if (!PageSlab(page)) {
slab_err(s, page, "Attempt to free object(0x%p) "
"outside of slab", object);
- else
- if (!page->slab) {
+ } else if (!page->slab) {
printk(KERN_ERR
"SLUB <none>: no slab for object 0x%p.\n",
object);
@@ -910,7 +946,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
}
/* Special debug activities for freeing objects */
- if (!SlabFrozen(page) && !page->freelist)
+ if (!SlabFrozen(page) && page->freelist == page->end)
remove_full(s, page);
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
@@ -1007,7 +1043,7 @@ static unsigned long kmem_cache_flags(unsigned long objsize,
*/
if (slub_debug && (!slub_debug_slabs ||
strncmp(slub_debug_slabs, name,
- strlen(slub_debug_slabs)) == 0))
+ strlen(slub_debug_slabs)) == 0))
flags |= slub_debug;
}
@@ -1044,14 +1080,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
struct page *page;
int pages = 1 << s->order;
- if (s->order)
- flags |= __GFP_COMP;
-
- if (s->flags & SLAB_CACHE_DMA)
- flags |= SLUB_DMA;
-
- if (s->flags & SLAB_RECLAIM_ACCOUNT)
- flags |= __GFP_RECLAIMABLE;
+ flags |= s->allocflags;
if (node == -1)
page = alloc_pages(flags, s->order);
@@ -1102,6 +1131,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
SetSlabDebug(page);
start = page_address(page);
+ page->end = start + 1;
if (unlikely(s->flags & SLAB_POISON))
memset(start, POISON_INUSE, PAGE_SIZE << s->order);
@@ -1113,7 +1143,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
last = p;
}
setup_object(s, page, last);
- set_freepointer(s, last, NULL);
+ set_freepointer(s, last, page->end);
page->freelist = start;
page->inuse = 0;
@@ -1129,7 +1159,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
void *p;
slab_pad_check(s, page);
- for_each_object(p, s, page_address(page))
+ for_each_object(p, s, slab_address(page))
check_object(s, page, p, 0);
ClearSlabDebug(page);
}
@@ -1139,6 +1169,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-pages);
+ page->mapping = NULL;
__free_pages(page, s->order);
}
@@ -1183,7 +1214,7 @@ static __always_inline void slab_lock(struct page *page)
static __always_inline void slab_unlock(struct page *page)
{
- bit_spin_unlock(PG_locked, &page->flags);
+ __bit_spin_unlock(PG_locked, &page->flags);
}
static __always_inline int slab_trylock(struct page *page)
@@ -1294,8 +1325,8 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
get_cycles() % 1024 > s->remote_node_defrag_ratio)
return NULL;
- zonelist = &NODE_DATA(slab_node(current->mempolicy))
- ->node_zonelists[gfp_zone(flags)];
+ zonelist = &NODE_DATA(
+ slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)];
for (z = zonelist->zones; *z; z++) {
struct kmem_cache_node *n;
@@ -1337,17 +1368,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+ struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
ClearSlabFrozen(page);
if (page->inuse) {
- if (page->freelist)
+ if (page->freelist != page->end) {
add_partial(n, page, tail);
- else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
- add_full(n, page);
+ stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
+ } else {
+ stat(c, DEACTIVATE_FULL);
+ if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
+ add_full(n, page);
+ }
slab_unlock(page);
-
} else {
+ stat(c, DEACTIVATE_EMPTY);
if (n->nr_partial < MIN_PARTIAL) {
/*
* Adding an empty slab to the partial slabs in order
@@ -1361,6 +1397,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
slab_unlock(page);
} else {
slab_unlock(page);
+ stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB);
discard_slab(s, page);
}
}
@@ -1373,12 +1410,19 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
struct page *page = c->page;
int tail = 1;
+
+ if (c->freelist)
+ stat(c, DEACTIVATE_REMOTE_FREES);
/*
* Merge cpu freelist into freelist. Typically we get here
* because both freelists are empty. So this is unlikely
* to occur.
+ *
+	 * We need to use is_end() here because deactivate_slab() may
+ * be called for a debug slab. Then c->freelist may contain
+ * a dummy pointer.
*/
- while (unlikely(c->freelist)) {
+ while (unlikely(!is_end(c->freelist))) {
void **object;
tail = 0; /* Hot objects. Put the slab first */
@@ -1398,6 +1442,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
+ stat(c, CPUSLAB_FLUSH);
slab_lock(c->page);
deactivate_slab(s, c);
}
@@ -1469,16 +1514,21 @@ static void *__slab_alloc(struct kmem_cache *s,
{
void **object;
struct page *new;
+#ifdef SLUB_FASTPATH
+ unsigned long flags;
+ local_irq_save(flags);
+#endif
if (!c->page)
goto new_slab;
slab_lock(c->page);
if (unlikely(!node_match(c, node)))
goto another_slab;
+ stat(c, ALLOC_REFILL);
load_freelist:
object = c->page->freelist;
- if (unlikely(!object))
+ if (unlikely(object == c->page->end))
goto another_slab;
if (unlikely(SlabDebug(c->page)))
goto debug;
@@ -1486,9 +1536,14 @@ load_freelist:
object = c->page->freelist;
c->freelist = object[c->offset];
c->page->inuse = s->objects;
- c->page->freelist = NULL;
+ c->page->freelist = c->page->end;
c->node = page_to_nid(c->page);
+unlock_out:
slab_unlock(c->page);
+ stat(c, ALLOC_SLOWPATH);
+#ifdef SLUB_FASTPATH
+ local_irq_restore(flags);
+#endif
return object;
another_slab:
@@ -1498,6 +1553,7 @@ new_slab:
new = get_partial(s, gfpflags, node);
if (new) {
c->page = new;
+ stat(c, ALLOC_FROM_PARTIAL);
goto load_freelist;
}
@@ -1511,6 +1567,7 @@ new_slab:
if (new) {
c = get_cpu_slab(s, smp_processor_id());
+ stat(c, ALLOC_SLAB);
if (c->page)
flush_slab(s, c);
slab_lock(new);
@@ -1518,6 +1575,23 @@ new_slab:
c->page = new;
goto load_freelist;
}
+#ifdef SLUB_FASTPATH
+ local_irq_restore(flags);
+#endif
+ /*
+ * No memory available.
+ *
+	 * If the slab uses higher order allocs but the object is
+	 * smaller than a page size then we can fall back in emergencies
+	 * to the page allocator via kmalloc_large. The page allocator may
+	 * have failed to obtain a higher order page and we can try to
+	 * allocate a single page if the object fits into a single page.
+	 * That is only possible if certain conditions are met; those are
+	 * checked when the slab is created.
+ */
+ if (!(gfpflags & __GFP_NORETRY) && (s->flags & __PAGE_ALLOC_FALLBACK))
+ return kmalloc_large(s->objsize, gfpflags);
+
return NULL;
debug:
object = c->page->freelist;
@@ -1527,8 +1601,7 @@ debug:
c->page->inuse++;
c->page->freelist = object[c->offset];
c->node = -1;
- slab_unlock(c->page);
- return object;
+ goto unlock_out;
}
/*
@@ -1545,20 +1618,50 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, int node, void *addr)
{
void **object;
- unsigned long flags;
struct kmem_cache_cpu *c;
+/*
+ * The SLUB_FASTPATH path is provisional and is currently disabled if the
+ * kernel is compiled with preemption or if the arch does not support
+ * fast cmpxchg operations. There are a couple of coming changes that will
+ * simplify matters and allow preemption. Ultimately we may end up making
+ * SLUB_FASTPATH the default.
+ *
+ * 1. The introduction of the per cpu allocator will avoid array lookups
+ * through get_cpu_slab(). A special register can be used instead.
+ *
+ * 2. The introduction of per cpu atomic operations (cpu_ops) means that
+ * we can realize the logic here entirely with per cpu atomics. The
+ * per cpu atomic ops will take care of the preemption issues.
+ */
+
+#ifdef SLUB_FASTPATH
+ c = get_cpu_slab(s, raw_smp_processor_id());
+ do {
+ object = c->freelist;
+ if (unlikely(is_end(object) || !node_match(c, node))) {
+ object = __slab_alloc(s, gfpflags, node, addr, c);
+ break;
+ }
+ stat(c, ALLOC_FASTPATH);
+ } while (cmpxchg_local(&c->freelist, object, object[c->offset])
+ != object);
+#else
+ unsigned long flags;
+
local_irq_save(flags);
c = get_cpu_slab(s, smp_processor_id());
- if (unlikely(!c->freelist || !node_match(c, node)))
+ if (unlikely(is_end(c->freelist) || !node_match(c, node)))
object = __slab_alloc(s, gfpflags, node, addr, c);
else {
object = c->freelist;
c->freelist = object[c->offset];
+ stat(c, ALLOC_FASTPATH);
}
local_irq_restore(flags);
+#endif
if (unlikely((gfpflags & __GFP_ZERO) && object))
memset(object, 0, c->objsize);
@@ -1593,7 +1696,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
{
void *prior;
void **object = (void *)x;
+ struct kmem_cache_cpu *c;
+
+#ifdef SLUB_FASTPATH
+ unsigned long flags;
+ local_irq_save(flags);
+#endif
+ c = get_cpu_slab(s, raw_smp_processor_id());
+ stat(c, FREE_SLOWPATH);
slab_lock(page);
if (unlikely(SlabDebug(page)))
@@ -1603,8 +1714,10 @@ checks_ok:
page->freelist = object;
page->inuse--;
- if (unlikely(SlabFrozen(page)))
+ if (unlikely(SlabFrozen(page))) {
+ stat(c, FREE_FROZEN);
goto out_unlock;
+ }
if (unlikely(!page->inuse))
goto slab_empty;
@@ -1614,21 +1727,31 @@ checks_ok:
* was not on the partial list before
* then add it.
*/
- if (unlikely(!prior))
+ if (unlikely(prior == page->end)) {
add_partial(get_node(s, page_to_nid(page)), page, 1);
+ stat(c, FREE_ADD_PARTIAL);
+ }
out_unlock:
slab_unlock(page);
+#ifdef SLUB_FASTPATH
+ local_irq_restore(flags);
+#endif
return;
slab_empty:
- if (prior)
+ if (prior != page->end) {
/*
* Slab still on the partial list.
*/
remove_partial(s, page);
-
+ stat(c, FREE_REMOVE_PARTIAL);
+ }
slab_unlock(page);
+ stat(c, FREE_SLAB);
+#ifdef SLUB_FASTPATH
+ local_irq_restore(flags);
+#endif
discard_slab(s, page);
return;
@@ -1653,19 +1776,49 @@ static __always_inline void slab_free(struct kmem_cache *s,
struct page *page, void *x, void *addr)
{
void **object = (void *)x;
- unsigned long flags;
struct kmem_cache_cpu *c;
+#ifdef SLUB_FASTPATH
+ void **freelist;
+
+ c = get_cpu_slab(s, raw_smp_processor_id());
+ debug_check_no_locks_freed(object, s->objsize);
+ do {
+ freelist = c->freelist;
+ barrier();
+ /*
+ * If the compiler would reorder the retrieval of c->page to
+ * come before c->freelist then an interrupt could
+ * change the cpu slab before we retrieve c->freelist. We
+ * could be matching on a page no longer active and put the
+ * object onto the freelist of the wrong slab.
+ *
+ * On the other hand: If we already have the freelist pointer
+ * then any change of cpu_slab will cause the cmpxchg to fail
+ * since the freelist pointers are unique per slab.
+ */
+ if (unlikely(page != c->page || c->node < 0)) {
+ __slab_free(s, page, x, addr, c->offset);
+ break;
+ }
+ object[c->offset] = freelist;
+ stat(c, FREE_FASTPATH);
+ } while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
+#else
+ unsigned long flags;
+
local_irq_save(flags);
debug_check_no_locks_freed(object, s->objsize);
c = get_cpu_slab(s, smp_processor_id());
if (likely(page == c->page && c->node >= 0)) {
object[c->offset] = c->freelist;
c->freelist = object;
+ stat(c, FREE_FASTPATH);
} else
__slab_free(s, page, x, addr, c->offset);
local_irq_restore(flags);
+#endif
}
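
To make the comment in the SLUB_FASTPATH branch above concrete, this is the interleaving it guards against, with a hypothetical interrupt thrown in (illustration only):

	/*
	 *	read c->page		(still matches the page being freed)
	 *	<interrupt: the cpu slab is flushed, a different slab installed>
	 *	read c->freelist	(now belongs to the NEW cpu slab)
	 *	object[c->offset] = freelist; cmpxchg_local() succeeds against
	 *	the new slab's freelist and chains an object from the old page
	 *	into the wrong slab.
	 *
	 * Reading c->freelist first, and pinning that order with barrier(),
	 * makes the stale case harmless: if the cpu slab changes afterwards,
	 * cmpxchg_local() simply fails and the loop retries, because freelist
	 * pointers are unique per slab.
	 */
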
void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1842,7 +1995,7 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
struct kmem_cache_cpu *c)
{
c->page = NULL;
- c->freelist = NULL;
+ c->freelist = (void *)PAGE_MAPPING_ANON;
c->node = 0;
c->offset = s->offset / sizeof(void *);
c->objsize = s->objsize;
@@ -2186,10 +2339,33 @@ static int calculate_sizes(struct kmem_cache *s)
size = ALIGN(size, align);
s->size = size;
- s->order = calculate_order(size);
+ if ((flags & __KMALLOC_CACHE) &&
+ PAGE_SIZE / size < slub_min_objects) {
+ /*
+ * Kmalloc cache that would not have enough objects in
+		 * an order 0 page. Kmalloc slabs can fall back to
+		 * page allocator order 0 allocs, so take a reasonably large
+		 * order that will allow us a good number of objects.
+ */
+ s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER);
+ s->flags |= __PAGE_ALLOC_FALLBACK;
+ s->allocflags |= __GFP_NOWARN;
+ } else
+ s->order = calculate_order(size);
+
if (s->order < 0)
return 0;
+ s->allocflags = 0;
+ if (s->order)
+ s->allocflags |= __GFP_COMP;
+
+ if (s->flags & SLAB_CACHE_DMA)
+ s->allocflags |= SLUB_DMA;
+
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
+ s->allocflags |= __GFP_RECLAIMABLE;
+
/*
* Determine the number of objects per slab
*/
@@ -2341,11 +2517,11 @@ EXPORT_SYMBOL(kmem_cache_destroy);
* Kmalloc subsystem
*******************************************************************/
-struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
+struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
EXPORT_SYMBOL(kmalloc_caches);
#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
#endif
static int __init setup_slub_min_order(char *str)
@@ -2393,7 +2569,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
down_write(&slub_lock);
if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
- flags, NULL))
+ flags | __KMALLOC_CACHE, NULL))
goto panic;
list_add(&s->list, &slab_caches);
@@ -2446,7 +2622,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
goto unlock_out;
realsize = kmalloc_caches[index].objsize;
- text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", (unsigned int)realsize),
+ text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
+ (unsigned int)realsize);
s = kmalloc(kmem_size, flags & ~SLUB_DMA);
if (!s || !text || !kmem_cache_open(s, flags, text,
@@ -2526,9 +2703,8 @@ void *__kmalloc(size_t size, gfp_t flags)
{
struct kmem_cache *s;
- if (unlikely(size > PAGE_SIZE / 2))
- return (void *)__get_free_pages(flags | __GFP_COMP,
- get_order(size));
+ if (unlikely(size > PAGE_SIZE))
+ return kmalloc_large(size, flags);
s = get_slab(size, flags);
@@ -2544,9 +2720,8 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s;
- if (unlikely(size > PAGE_SIZE / 2))
- return (void *)__get_free_pages(flags | __GFP_COMP,
- get_order(size));
+ if (unlikely(size > PAGE_SIZE))
+ return kmalloc_large(size, flags);
s = get_slab(size, flags);
@@ -2601,6 +2776,7 @@ EXPORT_SYMBOL(ksize);
void kfree(const void *x)
{
struct page *page;
+ void *object = (void *)x;
if (unlikely(ZERO_OR_NULL_PTR(x)))
return;
@@ -2610,7 +2786,7 @@ void kfree(const void *x)
put_page(page);
return;
}
- slab_free(page->slab, page, (void *)x, __builtin_return_address(0));
+ slab_free(page->slab, page, object, __builtin_return_address(0));
}
EXPORT_SYMBOL(kfree);
@@ -2856,7 +3032,7 @@ void __init kmem_cache_init(void)
caches++;
}
- for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
+ for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++) {
create_kmalloc_cache(&kmalloc_caches[i],
"kmalloc", 1 << i, GFP_KERNEL);
caches++;
@@ -2883,7 +3059,7 @@ void __init kmem_cache_init(void)
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
- for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
+ for (i = KMALLOC_SHIFT_LOW; i <= PAGE_SHIFT; i++)
kmalloc_caches[i]. name =
kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
@@ -2896,7 +3072,8 @@ void __init kmem_cache_init(void)
#endif
- printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
+ printk(KERN_INFO
+ "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
" CPUs=%d, Nodes=%d\n",
caches, cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
@@ -2911,6 +3088,9 @@ static int slab_unmergeable(struct kmem_cache *s)
if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
return 1;
+ if ((s->flags & __PAGE_ALLOC_FALLBACK))
+ return 1;
+
if (s->ctor)
return 1;
@@ -3063,7 +3243,7 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
}
static struct notifier_block __cpuinitdata slab_notifier = {
- &slab_cpuup_callback, NULL, 0
+ .notifier_call = slab_cpuup_callback
};
#endif
@@ -3072,9 +3252,9 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
{
struct kmem_cache *s;
- if (unlikely(size > PAGE_SIZE / 2))
- return (void *)__get_free_pages(gfpflags | __GFP_COMP,
- get_order(size));
+ if (unlikely(size > PAGE_SIZE))
+ return kmalloc_large(size, gfpflags);
+
s = get_slab(size, gfpflags);
if (unlikely(ZERO_OR_NULL_PTR(s)))
@@ -3088,9 +3268,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
{
struct kmem_cache *s;
- if (unlikely(size > PAGE_SIZE / 2))
- return (void *)__get_free_pages(gfpflags | __GFP_COMP,
- get_order(size));
+ if (unlikely(size > PAGE_SIZE))
+ return kmalloc_large(size, gfpflags);
+
s = get_slab(size, gfpflags);
if (unlikely(ZERO_OR_NULL_PTR(s)))
@@ -3104,7 +3284,7 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
void *p;
- void *addr = page_address(page);
+ void *addr = slab_address(page);
if (!check_slab(s, page) ||
!on_freelist(s, page, NULL))
@@ -3221,8 +3401,9 @@ static void resiliency_test(void)
p = kzalloc(32, GFP_KERNEL);
p[32 + sizeof(void *)] = 0x34;
printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
- " 0x34 -> -0x%p\n", p);
- printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
+ " 0x34 -> -0x%p\n", p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
validate_slab_cache(kmalloc_caches + 5);
p = kzalloc(64, GFP_KERNEL);
@@ -3230,7 +3411,8 @@ static void resiliency_test(void)
*p = 0x56;
printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
p);
- printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n");
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
validate_slab_cache(kmalloc_caches + 6);
printk(KERN_ERR "\nB. Corruption after free\n");
@@ -3243,7 +3425,8 @@ static void resiliency_test(void)
p = kzalloc(256, GFP_KERNEL);
kfree(p);
p[50] = 0x9a;
- printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
+ printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
+ p);
validate_slab_cache(kmalloc_caches + 8);
p = kzalloc(512, GFP_KERNEL);
@@ -3384,7 +3567,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc)
{
- void *addr = page_address(page);
+ void *addr = slab_address(page);
DECLARE_BITMAP(map, s->objects);
void *p;
@@ -3872,6 +4055,62 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
SLAB_ATTR(remote_node_defrag_ratio);
#endif
+#ifdef CONFIG_SLUB_STATS
+
+static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
+{
+ unsigned long sum = 0;
+ int cpu;
+ int len;
+ int *data = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
+
+ if (!data)
+ return -ENOMEM;
+
+ for_each_online_cpu(cpu) {
+ unsigned x = get_cpu_slab(s, cpu)->stat[si];
+
+ data[cpu] = x;
+ sum += x;
+ }
+
+ len = sprintf(buf, "%lu", sum);
+
+ for_each_online_cpu(cpu) {
+ if (data[cpu] && len < PAGE_SIZE - 20)
+ len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]);
+ }
+ kfree(data);
+ return len + sprintf(buf + len, "\n");
+}
+
+#define STAT_ATTR(si, text) \
+static ssize_t text##_show(struct kmem_cache *s, char *buf) \
+{ \
+ return show_stat(s, buf, si); \
+} \
+SLAB_ATTR_RO(text); \
+
+STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
+STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
+STAT_ATTR(FREE_FASTPATH, free_fastpath);
+STAT_ATTR(FREE_SLOWPATH, free_slowpath);
+STAT_ATTR(FREE_FROZEN, free_frozen);
+STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial);
+STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
+STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
+STAT_ATTR(ALLOC_SLAB, alloc_slab);
+STAT_ATTR(ALLOC_REFILL, alloc_refill);
+STAT_ATTR(FREE_SLAB, free_slab);
+STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
+STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
+STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
+STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
+STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
+STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
+
+#endif
+
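
Each STAT_ATTR() above becomes a read-only per-cache sysfs file. Assuming the usual SLUB sysfs location and made-up counter values, reading one of them yields the total followed by the non-zero per-cpu counts, exactly as show_stat() formats them:

	$ cat /sys/kernel/slab/kmalloc-64/alloc_fastpath
	4083956 c0=1023847 c1=1019221 c2=1024512 c3=1016376
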
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
&object_size_attr.attr,
@@ -3902,6 +4141,25 @@ static struct attribute *slab_attrs[] = {
#ifdef CONFIG_NUMA
&remote_node_defrag_ratio_attr.attr,
#endif
+#ifdef CONFIG_SLUB_STATS
+ &alloc_fastpath_attr.attr,
+ &alloc_slowpath_attr.attr,
+ &free_fastpath_attr.attr,
+ &free_slowpath_attr.attr,
+ &free_frozen_attr.attr,
+ &free_add_partial_attr.attr,
+ &free_remove_partial_attr.attr,
+ &alloc_from_partial_attr.attr,
+ &alloc_slab_attr.attr,
+ &alloc_refill_attr.attr,
+ &free_slab_attr.attr,
+ &cpuslab_flush_attr.attr,
+ &deactivate_full_attr.attr,
+ &deactivate_empty_attr.attr,
+ &deactivate_to_head_attr.attr,
+ &deactivate_to_tail_attr.attr,
+ &deactivate_remote_frees_attr.attr,
+#endif
NULL
};
diff --git a/mm/swap.c b/mm/swap.c
index 57b7e25a939c..710a20bb9749 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -29,6 +29,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
+#include <linux/memcontrol.h>
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
@@ -175,6 +176,7 @@ void activate_page(struct page *page)
SetPageActive(page);
add_page_to_active_list(zone, page);
__count_vm_event(PGACTIVATE);
+ mem_cgroup_move_lists(page_get_page_cgroup(page), true);
}
spin_unlock_irq(&zone->lru_lock);
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index eade24da9310..2da149cfc9ac 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
+#include <linux/memcontrol.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
@@ -511,11 +512,16 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
{
spinlock_t *ptl;
pte_t *pte;
- int found = 1;
+ int ret = 1;
+
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ ret = -ENOMEM;
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
- found = 0;
+ if (ret > 0)
+ mem_cgroup_uncharge_page(page);
+ ret = 0;
goto out;
}
@@ -532,7 +538,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
activate_page(page);
out:
pte_unmap_unlock(pte, ptl);
- return found;
+ return ret;
}
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -541,7 +547,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
{
pte_t swp_pte = swp_entry_to_pte(entry);
pte_t *pte;
- int found = 0;
+ int ret = 0;
/*
* We don't actually need pte lock while scanning for swp_pte: since
@@ -560,15 +566,15 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*/
if (unlikely(pte_same(*pte, swp_pte))) {
pte_unmap(pte);
- found = unuse_pte(vma, pmd, addr, entry, page);
- if (found)
+ ret = unuse_pte(vma, pmd, addr, entry, page);
+ if (ret)
goto out;
pte = pte_offset_map(pmd, addr);
}
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1);
out:
- return found;
+ return ret;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -577,14 +583,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
{
pmd_t *pmd;
unsigned long next;
+ int ret;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- if (unuse_pte_range(vma, pmd, addr, next, entry, page))
- return 1;
+ ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+ if (ret)
+ return ret;
} while (pmd++, addr = next, addr != end);
return 0;
}
@@ -595,14 +603,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
{
pud_t *pud;
unsigned long next;
+ int ret;
pud = pud_offset(pgd, addr);
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- if (unuse_pmd_range(vma, pud, addr, next, entry, page))
- return 1;
+ ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+ if (ret)
+ return ret;
} while (pud++, addr = next, addr != end);
return 0;
}
@@ -612,6 +622,7 @@ static int unuse_vma(struct vm_area_struct *vma,
{
pgd_t *pgd;
unsigned long addr, end, next;
+ int ret;
if (page->mapping) {
addr = page_address_in_vma(page, vma);
@@ -629,8 +640,9 @@ static int unuse_vma(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- if (unuse_pud_range(vma, pgd, addr, next, entry, page))
- return 1;
+ ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+ if (ret)
+ return ret;
} while (pgd++, addr = next, addr != end);
return 0;
}
@@ -639,6 +651,7 @@ static int unuse_mm(struct mm_struct *mm,
swp_entry_t entry, struct page *page)
{
struct vm_area_struct *vma;
+ int ret = 0;
if (!down_read_trylock(&mm->mmap_sem)) {
/*
@@ -651,15 +664,11 @@ static int unuse_mm(struct mm_struct *mm,
lock_page(page);
}
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->anon_vma && unuse_vma(vma, entry, page))
+ if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
break;
}
up_read(&mm->mmap_sem);
- /*
- * Currently unuse_mm cannot fail, but leave error handling
- * at call sites for now, since we change it from time to time.
- */
- return 0;
+ return (ret < 0)? ret: 0;
}
/*
@@ -1385,7 +1394,7 @@ static int swap_show(struct seq_file *swap, void *v)
}
file = ptr->swap_file;
- len = seq_path(swap, file->f_path.mnt, file->f_path.dentry, " \t\n\\");
+ len = seq_path(swap, &file->f_path, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0536dde139d1..950c0be9ca81 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -820,7 +820,7 @@ void __attribute__((weak)) vmalloc_sync_all(void)
}
-static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data)
+static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
{
/* apply_to_page_range() does all the hard work. */
return 0;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e5a9597e3bbc..a26dabd62fed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,7 @@
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
+#include <linux/memcontrol.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -68,6 +69,22 @@ struct scan_control {
int all_unreclaimable;
int order;
+
+ /*
+ * Pages that have (or should have) IO pending. If we run into
+ * a lot of these, we're better off waiting a little for IO to
+ * finish rather than scanning more pages in the VM.
+ */
+ int nr_io_pages;
+
+ /* Which cgroup do we reclaim from */
+ struct mem_cgroup *mem_cgroup;
+
+ /* Pluggable isolate pages callback */
+ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
+ unsigned long *scanned, int order, int mode,
+ struct zone *z, struct mem_cgroup *mem_cont,
+ int active);
};
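
As a rough illustration of how the two new fields are meant to be wired up (the real initializers live in the reclaim entry points, which are outside this excerpt; only fields visible in this diff are shown):

	struct scan_control sc = {
		.may_swap	= 1,
		.swappiness	= vm_swappiness,
		.mem_cgroup	= NULL,			/* global reclaim: scan_global_lru(&sc) is true */
		.isolate_pages	= isolate_pages_global,	/* helper added further down in this diff */
	};

	/*
	 * A memory-controller reclaimer would instead set .mem_cgroup to the
	 * group being shrunk and .isolate_pages to a callback that walks that
	 * group's own LRU lists rather than the zone-wide ones.
	 */
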
#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -109,6 +126,12 @@ long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
+#ifdef CONFIG_CGROUP_MEM_CONT
+#define scan_global_lru(sc) (!(sc)->mem_cgroup)
+#else
+#define scan_global_lru(sc) (1)
+#endif
+
/*
* Add a shrinker callback to be called from the vm
*/
@@ -489,11 +512,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
wait_on_page_writeback(page);
- else
+ else {
+ sc->nr_io_pages++;
goto keep_locked;
+ }
}
- referenced = page_referenced(page, 1);
+ referenced = page_referenced(page, 1, sc->mem_cgroup);
/* In active use or really unfreeable? Activate it. */
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
referenced && page_mapping_inuse(page))
@@ -529,8 +554,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageDirty(page)) {
if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
goto keep_locked;
- if (!may_enter_fs)
+ if (!may_enter_fs) {
+ sc->nr_io_pages++;
goto keep_locked;
+ }
if (!sc->may_writepage)
goto keep_locked;
@@ -541,8 +568,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
- if (PageWriteback(page) || PageDirty(page))
+ if (PageWriteback(page) || PageDirty(page)) {
+ sc->nr_io_pages++;
goto keep;
+ }
/*
* A synchronous write - probably a ramdisk. Go
* ahead and try to reclaim the page.
@@ -626,7 +655,7 @@ keep:
*
* returns 0 on success, -ve errno on failure.
*/
-static int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode)
{
int ret = -EINVAL;
@@ -760,6 +789,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
return nr_taken;
}
+static unsigned long isolate_pages_global(unsigned long nr,
+ struct list_head *dst,
+ unsigned long *scanned, int order,
+ int mode, struct zone *z,
+ struct mem_cgroup *mem_cont,
+ int active)
+{
+ if (active)
+ return isolate_lru_pages(nr, &z->active_list, dst,
+ scanned, order, mode);
+ else
+ return isolate_lru_pages(nr, &z->inactive_list, dst,
+ scanned, order, mode);
+}
+
/*
* clear_active_flags() is a helper for shrink_active_list(), clearing
* any active bits from the pages in the list.
@@ -801,18 +845,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
unsigned long nr_freed;
unsigned long nr_active;
- nr_taken = isolate_lru_pages(sc->swap_cluster_max,
- &zone->inactive_list,
+ nr_taken = sc->isolate_pages(sc->swap_cluster_max,
&page_list, &nr_scan, sc->order,
(sc->order > PAGE_ALLOC_COSTLY_ORDER)?
- ISOLATE_BOTH : ISOLATE_INACTIVE);
+ ISOLATE_BOTH : ISOLATE_INACTIVE,
+ zone, sc->mem_cgroup, 0);
nr_active = clear_active_flags(&page_list);
__count_vm_events(PGDEACTIVATE, nr_active);
__mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
__mod_zone_page_state(zone, NR_INACTIVE,
-(nr_taken - nr_active));
- zone->pages_scanned += nr_scan;
+ if (scan_global_lru(sc))
+ zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
nr_scanned += nr_scan;
@@ -844,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
if (current_is_kswapd()) {
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
__count_vm_events(KSWAPD_STEAL, nr_freed);
- } else
+ } else if (scan_global_lru(sc))
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+
__count_zone_vm_events(PGSTEAL, zone, nr_freed);
if (nr_taken == 0)
@@ -899,6 +945,113 @@ static inline int zone_is_near_oom(struct zone *zone)
}
/*
+ * Determine whether we should try to reclaim mapped pages.
+ * Handles both global reclaim and per-cgroup reclaim (sc->mem_cgroup set);
+ * scan_global_lru() selects the right set of statistics below.
+ */
+static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
+ int priority)
+{
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
+ long imbalance;
+ int reclaim_mapped = 0;
+ int prev_priority;
+
+ if (scan_global_lru(sc) && zone_is_near_oom(zone))
+ return 1;
+ /*
+ * `distress' is a measure of how much trouble we're having
+ * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+ */
+ if (scan_global_lru(sc))
+ prev_priority = zone->prev_priority;
+ else
+ prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
+
+ distress = 100 >> min(prev_priority, priority);
+
+ /*
+ * The point of this algorithm is to decide when to start
+ * reclaiming mapped memory instead of just pagecache. Work out
+ * how much memory
+ * is mapped.
+ */
+ if (scan_global_lru(sc))
+ mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES)) * 100) /
+ vm_total_pages;
+ else
+ mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
+
+ /*
+ * Now decide how much we really want to unmap some pages. The
+ * mapped ratio is downgraded - just because there's a lot of
+ * mapped memory doesn't necessarily mean that page reclaim
+ * isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start
+ * going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm
+ * altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+
+ /*
+ * If there's huge imbalance between active and inactive
+ * (think active 100 times larger than inactive) we should
+ * become more permissive, or the system will take too much
+ * cpu before it start swapping during memory pressure.
+ * Distress is about avoiding early-oom, this is about
+ * making swappiness graceful despite setting it to low
+ * values.
+ *
+ * Avoid div by zero with nr_inactive+1, and max resulting
+ * value is vm_total_pages.
+ */
+ if (scan_global_lru(sc)) {
+ imbalance = zone_page_state(zone, NR_ACTIVE);
+ imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
+ } else
+ imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
+
+ /*
+ * Reduce the effect of imbalance if swappiness is low,
+ * this means for a swappiness very low, the imbalance
+ * must be much higher than 100 for this logic to make
+ * the difference.
+ *
+ * Max temporary value is vm_total_pages*100.
+ */
+ imbalance *= (vm_swappiness + 1);
+ imbalance /= 100;
+
+ /*
+ * If not much of the ram is mapped, makes the imbalance
+ * less relevant, it's high priority we refill the inactive
+ * list with mapped pages only in presence of high ratio of
+ * mapped pages.
+ *
+ * Max temporary value is vm_total_pages*100.
+ */
+ imbalance *= mapped_ratio;
+ imbalance /= 100;
+
+ /* apply imbalance feedback to swap_tendency */
+ swap_tendency += imbalance;
+
+ /*
+ * Now use this metric to decide whether to start moving mapped
+ * memory onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+ reclaim_mapped = 1;
+
+ return reclaim_mapped;
+}
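
Taken together, the arithmetic in calc_reclaim_mapped() boils down to a small formula. A self-contained sketch (illustration only: the zone/cgroup statistics are replaced by plain parameters, and the single swappiness argument stands in for both sc->swappiness and vm_swappiness):

/*
 * Illustration only.  In the kernel, mapped_ratio, prev_priority and the
 * active/inactive sizes come from zone or cgroup statistics; here they
 * are plain parameters so the heuristic can be read in isolation.
 */
static int sketch_reclaim_mapped(long mapped_ratio, int prev_priority,
				 int priority, long nr_active,
				 long nr_inactive, int swappiness)
{
	int min_prio = prev_priority < priority ? prev_priority : priority;
	long distress = 100 >> min_prio;   /* 0 = no trouble, 100 = great trouble */
	long swap_tendency = mapped_ratio / 2 + distress + swappiness;
	long imbalance = nr_active / (nr_inactive + 1);

	imbalance *= (swappiness + 1);     /* damp imbalance when swappiness is low */
	imbalance /= 100;
	imbalance *= mapped_ratio;         /* imbalance only matters if much RAM is mapped */
	imbalance /= 100;

	swap_tendency += imbalance;
	return swap_tendency >= 100;       /* 1: start reclaiming mapped pages */
}
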
+
+/*
* This moves pages from the active list to the inactive list.
*
* We move them the other way if the page is referenced by one or more
@@ -915,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone)
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/
+
+
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct scan_control *sc, int priority)
{
@@ -928,99 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct pagevec pvec;
int reclaim_mapped = 0;
- if (sc->may_swap) {
- long mapped_ratio;
- long distress;
- long swap_tendency;
- long imbalance;
-
- if (zone_is_near_oom(zone))
- goto force_reclaim_mapped;
-
- /*
- * `distress' is a measure of how much trouble we're having
- * reclaiming pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> min(zone->prev_priority, priority);
-
- /*
- * The point of this algorithm is to decide when to start
- * reclaiming mapped memory instead of just pagecache. Work out
- * how much memory
- * is mapped.
- */
- mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
- global_page_state(NR_ANON_PAGES)) * 100) /
- vm_total_pages;
-
- /*
- * Now decide how much we really want to unmap some pages. The
- * mapped ratio is downgraded - just because there's a lot of
- * mapped memory doesn't necessarily mean that page reclaim
- * isn't succeeding.
- *
- * The distress ratio is important - we don't want to start
- * going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm
- * altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
-
- /*
- * If there's huge imbalance between active and inactive
- * (think active 100 times larger than inactive) we should
- * become more permissive, or the system will take too much
- * cpu before it start swapping during memory pressure.
- * Distress is about avoiding early-oom, this is about
- * making swappiness graceful despite setting it to low
- * values.
- *
- * Avoid div by zero with nr_inactive+1, and max resulting
- * value is vm_total_pages.
- */
- imbalance = zone_page_state(zone, NR_ACTIVE);
- imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
-
- /*
- * Reduce the effect of imbalance if swappiness is low,
- * this means for a swappiness very low, the imbalance
- * must be much higher than 100 for this logic to make
- * the difference.
- *
- * Max temporary value is vm_total_pages*100.
- */
- imbalance *= (vm_swappiness + 1);
- imbalance /= 100;
-
- /*
- * If not much of the ram is mapped, makes the imbalance
- * less relevant, it's high priority we refill the inactive
- * list with mapped pages only in presence of high ratio of
- * mapped pages.
- *
- * Max temporary value is vm_total_pages*100.
- */
- imbalance *= mapped_ratio;
- imbalance /= 100;
-
- /* apply imbalance feedback to swap_tendency */
- swap_tendency += imbalance;
-
- /*
- * Now use this metric to decide whether to start moving mapped
- * memory onto the inactive list.
- */
- if (swap_tendency >= 100)
-force_reclaim_mapped:
- reclaim_mapped = 1;
- }
+ if (sc->may_swap)
+ reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
- &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
- zone->pages_scanned += pgscanned;
+ pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+ ISOLATE_ACTIVE, zone,
+ sc->mem_cgroup, 1);
+ /*
+ * zone->pages_scanned is used to detect whether the zone is
+ * near OOM; mem_cgroup tracks nr_scan by itself.
+ */
+ if (scan_global_lru(sc))
+ zone->pages_scanned += pgscanned;
+
__mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
spin_unlock_irq(&zone->lru_lock);
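
The sc->isolate_pages() callback above replaces the direct call to isolate_lru_pages(), so that memory-cgroup reclaim can isolate pages from its own per-cgroup lists. The member's declaration is not visible in this excerpt; inferred from the call site alone, it is presumably a function pointer shaped roughly like:

/* Assumed shape, inferred from the eight-argument call site above. */
struct scan_control;	/* fields elided in this sketch */
typedef unsigned long (*isolate_pages_fn)(unsigned long nr,
					   struct list_head *dst,
					   unsigned long *scanned, int order,
					   int mode, struct zone *zone,
					   struct mem_cgroup *mem_cont,
					   int active);
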
@@ -1031,7 +1108,7 @@ force_reclaim_mapped:
if (page_mapped(page)) {
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
- page_referenced(page, 0)) {
+ page_referenced(page, 0, sc->mem_cgroup)) {
list_add(&page->lru, &l_active);
continue;
}
@@ -1051,6 +1128,7 @@ force_reclaim_mapped:
ClearPageActive(page);
list_move(&page->lru, &zone->inactive_list);
+ mem_cgroup_move_lists(page_get_page_cgroup(page), false);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
__mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -1079,6 +1157,7 @@ force_reclaim_mapped:
SetPageLRU(page);
VM_BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
+ mem_cgroup_move_lists(page_get_page_cgroup(page), true);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
__mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1108,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
unsigned long nr_to_scan;
unsigned long nr_reclaimed = 0;
- /*
- * Add one to `nr_to_scan' just to make sure that the kernel will
- * slowly sift through the active list.
- */
- zone->nr_scan_active +=
- (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
- nr_active = zone->nr_scan_active;
- if (nr_active >= sc->swap_cluster_max)
- zone->nr_scan_active = 0;
- else
- nr_active = 0;
+ if (scan_global_lru(sc)) {
+ /*
+ * Add one to nr_to_scan just to make sure that the kernel
+ * will slowly sift through the active list.
+ */
+ zone->nr_scan_active +=
+ (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
+ nr_active = zone->nr_scan_active;
+ zone->nr_scan_inactive +=
+ (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
+ nr_inactive = zone->nr_scan_inactive;
+ if (nr_inactive >= sc->swap_cluster_max)
+ zone->nr_scan_inactive = 0;
+ else
+ nr_inactive = 0;
+
+ if (nr_active >= sc->swap_cluster_max)
+ zone->nr_scan_active = 0;
+ else
+ nr_active = 0;
+ } else {
+ /*
+ * This reclaim occurs not because of a zone memory shortage
+ * but because the memory controller has hit its limit,
+ * so don't modify the zone's reclaim-related data.
+ */
+ nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
+ zone, priority);
+
+ nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
+ zone, priority);
+ }
- zone->nr_scan_inactive +=
- (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
- nr_inactive = zone->nr_scan_inactive;
- if (nr_inactive >= sc->swap_cluster_max)
- zone->nr_scan_inactive = 0;
- else
- nr_inactive = 0;
while (nr_active || nr_inactive) {
if (nr_active) {
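
For the global case, shrink_zone() releases work in batches: the per-zone scan counter accumulates a priority-scaled increment on every call and is only handed to the shrinkers once a full batch is available. A stripped-down sketch of that accumulator, with the zone counter replaced by a plain variable:

/*
 * Sketch of the scan batching used for the global LRU above: small
 * per-call increments accumulate until a full batch (swap_cluster_max)
 * is available, so low-priority scans sift through the list slowly.
 */
static unsigned long sketch_batch_scan(unsigned long *accumulated,
				       unsigned long lru_size,
				       int priority,
				       unsigned long swap_cluster_max)
{
	*accumulated += (lru_size >> priority) + 1;
	if (*accumulated >= swap_cluster_max) {
		unsigned long batch = *accumulated;
		*accumulated = 0;	/* consume the whole batch */
		return batch;
	}
	return 0;			/* not enough accumulated yet */
}
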
@@ -1171,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
unsigned long nr_reclaimed = 0;
int i;
+
sc->all_unreclaimable = 1;
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
if (!populated_zone(zone))
continue;
+ /*
+ * Take care that memory-controller reclaim has little
+ * influence on the global LRU.
+ */
+ if (scan_global_lru(sc)) {
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+ note_zone_scanning_priority(zone, priority);
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
-
- note_zone_scanning_priority(zone, priority);
-
- if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
- continue; /* Let kswapd poll it */
-
- sc->all_unreclaimable = 0;
+ if (zone_is_all_unreclaimable(zone) &&
+ priority != DEF_PRIORITY)
+ continue; /* Let kswapd poll it */
+ sc->all_unreclaimable = 0;
+ } else {
+ /*
+ * Ignore cpuset limitations here; we just want to reduce
+ * the number of pages used by this cgroup, regardless of
+ * any zone memory shortage.
+ */
+ sc->all_unreclaimable = 0;
+ mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
+ priority);
+ }
nr_reclaimed += shrink_zone(priority, zone, sc);
}
+
return nr_reclaimed;
}
@@ -1206,7 +1313,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
+ struct scan_control *sc)
{
int priority;
int ret = 0;
@@ -1215,39 +1323,43 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long lru_pages = 0;
int i;
- struct scan_control sc = {
- .gfp_mask = gfp_mask,
- .may_writepage = !laptop_mode,
- .swap_cluster_max = SWAP_CLUSTER_MAX,
- .may_swap = 1,
- .swappiness = vm_swappiness,
- .order = order,
- };
-
- count_vm_event(ALLOCSTALL);
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *zone = zones[i];
+ if (scan_global_lru(sc))
+ count_vm_event(ALLOCSTALL);
+ /*
+ * mem_cgroup reclaim does not call shrink_slab(), so lru_pages
+ * is only needed for global reclaim.
+ */
+ if (scan_global_lru(sc)) {
+ for (i = 0; zones[i] != NULL; i++) {
+ struct zone *zone = zones[i];
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
- lru_pages += zone_page_state(zone, NR_ACTIVE)
- + zone_page_state(zone, NR_INACTIVE);
+ lru_pages += zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE);
+ }
}
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- sc.nr_scanned = 0;
+ sc->nr_scanned = 0;
+ sc->nr_io_pages = 0;
if (!priority)
disable_swap_token();
- nr_reclaimed += shrink_zones(priority, zones, &sc);
- shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
- if (reclaim_state) {
- nr_reclaimed += reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
+ nr_reclaimed += shrink_zones(priority, zones, sc);
+ /*
+ * Don't shrink slabs when reclaiming memory from
+ * over-limit cgroups.
+ */
+ if (scan_global_lru(sc)) {
+ shrink_slab(sc->nr_scanned, gfp_mask, lru_pages);
+ if (reclaim_state) {
+ nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
+ }
}
- total_scanned += sc.nr_scanned;
- if (nr_reclaimed >= sc.swap_cluster_max) {
+ total_scanned += sc->nr_scanned;
+ if (nr_reclaimed >= sc->swap_cluster_max) {
ret = 1;
goto out;
}
@@ -1259,18 +1371,19 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
- if (total_scanned > sc.swap_cluster_max +
- sc.swap_cluster_max / 2) {
+ if (total_scanned > sc->swap_cluster_max +
+ sc->swap_cluster_max / 2) {
wakeup_pdflush(laptop_mode ? 0 : total_scanned);
- sc.may_writepage = 1;
+ sc->may_writepage = 1;
}
/* Take a nap, wait for some writeback to complete */
- if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+ if (sc->nr_scanned && priority < DEF_PRIORITY - 2 &&
+ sc->nr_io_pages > sc->swap_cluster_max)
congestion_wait(WRITE, HZ/10);
}
/* top priority shrink_caches still had more to do? don't OOM, then */
- if (!sc.all_unreclaimable)
+ if (!sc->all_unreclaimable && scan_global_lru(sc))
ret = 1;
out:
/*
@@ -1282,17 +1395,63 @@ out:
*/
if (priority < 0)
priority = 0;
- for (i = 0; zones[i] != NULL; i++) {
- struct zone *zone = zones[i];
- if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
- continue;
+ if (scan_global_lru(sc)) {
+ for (i = 0; zones[i] != NULL; i++) {
+ struct zone *zone = zones[i];
+
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+
+ zone->prev_priority = priority;
+ }
+ } else
+ mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
- zone->prev_priority = priority;
- }
return ret;
}
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+{
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = !laptop_mode,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .may_swap = 1,
+ .swappiness = vm_swappiness,
+ .order = order,
+ .mem_cgroup = NULL,
+ .isolate_pages = isolate_pages_global,
+ };
+
+ return do_try_to_free_pages(zones, gfp_mask, &sc);
+}
+
+#ifdef CONFIG_CGROUP_MEM_CONT
+
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+ gfp_t gfp_mask)
+{
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = !laptop_mode,
+ .may_swap = 1,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .swappiness = vm_swappiness,
+ .order = 0,
+ .mem_cgroup = mem_cont,
+ .isolate_pages = mem_cgroup_isolate_pages,
+ };
+ struct zone **zones;
+ int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
+
+ zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones;
+ if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
+ return 1;
+ return 0;
+}
+#endif
+
/*
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at pages_high.
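
try_to_free_mem_cgroup_pages() above is the entry point the memory controller is expected to call when a charge pushes a cgroup over its limit. A hypothetical caller sketch (the retry loop and function name are illustrative, not the actual mm/memcontrol.c code):

/*
 * Hypothetical caller, for illustration only: retry reclaim a few times
 * before failing the charge.  The retry count and function name are
 * made up; only try_to_free_mem_cgroup_pages() comes from the patch.
 */
static int sketch_reclaim_on_charge(struct mem_cgroup *mem, gfp_t gfp_mask)
{
	int retries = 5;

	while (retries--) {
		if (try_to_free_mem_cgroup_pages(mem, gfp_mask))
			return 0;	/* freed something; retry the charge */
	}
	return -ENOMEM;			/* give up and fail the charge */
}
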
@@ -1328,6 +1487,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
.order = order,
+ .mem_cgroup = NULL,
+ .isolate_pages = isolate_pages_global,
};
/*
* temp_priority is used to remember the scanning priority at which
@@ -1352,6 +1513,7 @@ loop_again:
if (!priority)
disable_swap_token();
+ sc.nr_io_pages = 0;
all_zones_ok = 1;
/*
@@ -1444,7 +1606,8 @@ loop_again:
* OK, kswapd is getting into trouble. Take a nap, then take
* another pass across the zones.
*/
- if (total_scanned && priority < DEF_PRIORITY - 2)
+ if (total_scanned && priority < DEF_PRIORITY - 2 &&
+ sc.nr_io_pages > sc.swap_cluster_max)
congestion_wait(WRITE, HZ/10);
/*
@@ -1649,6 +1812,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
.swap_cluster_max = nr_pages,
.may_writepage = 1,
.swappiness = vm_swappiness,
+ .isolate_pages = isolate_pages_global,
};
current->reclaim_state = &reclaim_state;
@@ -1834,6 +1998,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
SWAP_CLUSTER_MAX),
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
+ .isolate_pages = isolate_pages_global,
};
unsigned long slab_reclaimable;