Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--  mm/mempolicy.c | 189
1 file changed, 104 insertions, 85 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d1b315e98627..74310017296e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -26,7 +26,7 @@
* the allocation to memory nodes instead
*
* preferred Try a specific node first before normal fallback.
- * As a special case node -1 here means do the allocation
+ * As a special case NUMA_NO_NODE here means do the allocation
* on the local CPU. This is normally identical to default,
* but useful to set in a VMA when you have a non default
* process policy.
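
A hedged userspace illustration of the special case described in the comment above (a sketch, not part of this patch, assuming libnuma's <numaif.h> wrapper for the set_mempolicy(2) syscall): MPOL_PREFERRED with an empty nodemask requests the same "allocate on the local node" behaviour that NUMA_NO_NODE now denotes in-kernel.

#include <errno.h>
#include <stdio.h>
#include <numaif.h>

/* Ask the kernel to prefer the local node for this task's allocations.
 * MPOL_PREFERRED with a NULL/empty nodemask is the userspace counterpart
 * of the NUMA_NO_NODE special case noted above.
 */
static int prefer_local_node(void)
{
	if (set_mempolicy(MPOL_PREFERRED, NULL, 0) != 0) {
		perror("set_mempolicy");
		return -errno;
	}
	return 0;
}
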
@@ -127,7 +127,7 @@ static struct mempolicy *get_task_policy(struct task_struct *p)
if (!pol) {
node = numa_node_id();
- if (node != -1)
+ if (node != NUMA_NO_NODE)
pol = &preferred_node_policy[node];
/* preferred_node_policy is not initialised early in boot */
@@ -161,19 +161,7 @@ static const struct mempolicy_operations {
/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(const nodemask_t *nodemask)
{
- int nd, k;
-
- for_each_node_mask(nd, *nodemask) {
- struct zone *z;
-
- for (k = 0; k <= policy_zone; k++) {
- z = &NODE_DATA(nd)->node_zones[k];
- if (z->present_pages > 0)
- return 1;
- }
- }
-
- return 0;
+ return nodes_intersects(*nodemask, node_states[N_MEMORY]);
}
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -270,7 +258,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
struct mempolicy *policy;
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
- mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
+ mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
@@ -508,9 +496,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/*
* vm_normal_page() filters out zero pages, but there might
* still be PageReserved pages to skip, perhaps in a VDSO.
- * And we cannot move PageKsm pages sensibly or safely yet.
*/
- if (PageReserved(page) || PageKsm(page))
+ if (PageReserved(page))
continue;
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -1027,8 +1014,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
if (!list_empty(&pagelist)) {
err = migrate_pages(&pagelist, new_node_page, dest,
- false, MIGRATE_SYNC,
- MR_SYSCALL);
+ MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_lru_pages(&pagelist);
}
@@ -1235,7 +1221,7 @@ static long do_mbind(unsigned long start, unsigned long len,
pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
start, start + len, mode, mode_flags,
- nmask ? nodes_addr(*nmask)[0] : -1);
+ nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE);
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
@@ -1272,9 +1258,8 @@ static long do_mbind(unsigned long start, unsigned long len,
if (!list_empty(&pagelist)) {
WARN_ON_ONCE(flags & MPOL_MF_LAZY);
nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma,
- false, MIGRATE_SYNC,
- MR_MEMPOLICY_MBIND);
+ (unsigned long)vma,
+ MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
if (nr_failed)
putback_lru_pages(&pagelist);
}
@@ -1644,6 +1629,26 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
return pol;
}
+static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+{
+ enum zone_type dynamic_policy_zone = policy_zone;
+
+ BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
+
+ /*
+ * If policy->v.nodes has movable memory only,
+ * we apply the policy only when gfp_zone(gfp) is ZONE_MOVABLE.
+ *
+ * policy->v.nodes is intersected with node_states[N_MEMORY],
+ * so if the following test fails, it implies
+ * policy->v.nodes has movable memory only.
+ */
+ if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY]))
+ dynamic_policy_zone = ZONE_MOVABLE;
+
+ return zone >= dynamic_policy_zone;
+}
+
/*
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
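
A worked illustration of the apply_policy_zone() comment above (illustrative only, not from the patch): assume an MPOL_BIND policy whose nodemask covers only a movable-only node, i.e. a node present in N_MEMORY but not in N_HIGH_MEMORY. Then dynamic_policy_zone becomes ZONE_MOVABLE and the bind nodemask is applied only to movable allocations.

/* Assumed: bind nodemask contains movable-only nodes.
 *
 *   request                 gfp_zone()      apply_policy_zone()   bind nodemask applied?
 *   GFP_KERNEL              ZONE_NORMAL     false                 no (normal fallback)
 *   GFP_HIGHUSER_MOVABLE    ZONE_MOVABLE    true                  yes
 */
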
@@ -1652,7 +1657,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
- gfp_zone(gfp) >= policy_zone &&
+ apply_policy_zone(policy, gfp_zone(gfp)) &&
cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
return &policy->v.nodes;
@@ -2132,7 +2137,7 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
*/
/* lookup first element intersecting start-end */
-/* Caller holds sp->mutex */
+/* Caller holds sp->lock */
static struct sp_node *
sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
{
@@ -2196,13 +2201,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
if (!sp->root.rb_node)
return NULL;
- mutex_lock(&sp->mutex);
+ spin_lock(&sp->lock);
sn = sp_lookup(sp, idx, idx+1);
if (sn) {
mpol_get(sn->policy);
pol = sn->policy;
}
- mutex_unlock(&sp->mutex);
+ spin_unlock(&sp->lock);
return pol;
}
@@ -2308,7 +2313,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
* it less likely we act on an unlikely task<->page
* relation.
*/
- last_nid = page_xchg_last_nid(page, polnid);
+ last_nid = page_nid_xchg_last(page, polnid);
if (last_nid != polnid)
goto out;
}
@@ -2328,6 +2333,14 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
sp_free(n);
}
+static void sp_node_init(struct sp_node *node, unsigned long start,
+ unsigned long end, struct mempolicy *pol)
+{
+ node->start = start;
+ node->end = end;
+ node->policy = pol;
+}
+
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
struct mempolicy *pol)
{
@@ -2344,10 +2357,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
return NULL;
}
newpol->flags |= MPOL_F_SHARED;
-
- n->start = start;
- n->end = end;
- n->policy = newpol;
+ sp_node_init(n, start, end, newpol);
return n;
}
@@ -2357,9 +2367,12 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
unsigned long end, struct sp_node *new)
{
struct sp_node *n;
+ struct sp_node *n_new = NULL;
+ struct mempolicy *mpol_new = NULL;
int ret = 0;
- mutex_lock(&sp->mutex);
+restart:
+ spin_lock(&sp->lock);
n = sp_lookup(sp, start, end);
/* Take care of old policies in the same range. */
while (n && n->start < end) {
@@ -2372,14 +2385,16 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
} else {
/* Old policy spanning whole new range. */
if (n->end > end) {
- struct sp_node *new2;
- new2 = sp_alloc(end, n->end, n->policy);
- if (!new2) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!n_new)
+ goto alloc_new;
+
+ *mpol_new = *n->policy;
+ atomic_set(&mpol_new->refcnt, 1);
+ sp_node_init(n_new, end, n->end, mpol_new);
n->end = start;
- sp_insert(sp, new2);
+ sp_insert(sp, n_new);
+ n_new = NULL;
+ mpol_new = NULL;
break;
} else
n->end = start;
@@ -2390,9 +2405,27 @@ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
}
if (new)
sp_insert(sp, new);
-out:
- mutex_unlock(&sp->mutex);
+ spin_unlock(&sp->lock);
+ ret = 0;
+
+err_out:
+ if (mpol_new)
+ mpol_put(mpol_new);
+ if (n_new)
+ kmem_cache_free(sn_cache, n_new);
+
return ret;
+
+alloc_new:
+ spin_unlock(&sp->lock);
+ ret = -ENOMEM;
+ n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ if (!n_new)
+ goto err_out;
+ mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
+ if (!mpol_new)
+ goto err_out;
+ goto restart;
}
/**
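
shared_policy_replace() above now takes sp->lock as a spinlock, so it can no longer allocate with GFP_KERNEL while holding it; when an old policy has to be split, it drops the lock, preallocates the new sp_node and mempolicy, and restarts the lookup. A minimal sketch of that drop-lock/allocate/retry pattern follows; all names (my_lock, my_cache, my_node, need_split, my_tree_insert) are hypothetical, not from this file.

static int insert_with_retry(void)
{
	struct my_node *n_new = NULL;

restart:
	spin_lock(&my_lock);
	if (need_split() && !n_new) {
		/* GFP_KERNEL may sleep, so allocate with the lock dropped. */
		spin_unlock(&my_lock);
		n_new = kmem_cache_alloc(my_cache, GFP_KERNEL);
		if (!n_new)
			return -ENOMEM;
		goto restart;		/* the tree may have changed meanwhile */
	}
	if (need_split()) {
		my_tree_insert(n_new);
		n_new = NULL;		/* ownership passed to the tree */
	}
	spin_unlock(&my_lock);

	if (n_new)			/* preallocation turned out to be unneeded */
		kmem_cache_free(my_cache, n_new);
	return 0;
}
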
@@ -2410,7 +2443,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
int ret;
sp->root = RB_ROOT; /* empty tree == default mempolicy */
- mutex_init(&sp->mutex);
+ spin_lock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
@@ -2455,7 +2488,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
vma->vm_pgoff,
sz, npol ? npol->mode : -1,
npol ? npol->flags : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : -1);
+ npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -2476,14 +2509,14 @@ void mpol_free_shared_policy(struct shared_policy *p)
if (!p->root.rb_node)
return;
- mutex_lock(&p->mutex);
+ spin_lock(&p->lock);
next = rb_first(&p->root);
while (next) {
n = rb_entry(next, struct sp_node, nd);
next = rb_next(&n->nd);
sp_delete(p, n);
}
- mutex_unlock(&p->mutex);
+ spin_unlock(&p->lock);
}
#ifdef CONFIG_NUMA_BALANCING
@@ -2595,8 +2628,7 @@ void numa_default_policy(void)
*/
/*
- * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
- * Used only for mpol_parse_str() and mpol_to_str()
+ * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag.
*/
static const char * const policy_modes[] =
{
@@ -2610,28 +2642,20 @@ static const char * const policy_modes[] =
#ifdef CONFIG_TMPFS
/**
- * mpol_parse_str - parse string to mempolicy
+ * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
* @str: string containing mempolicy to parse
* @mpol: pointer to struct mempolicy pointer, returned on success.
- * @no_context: flag whether to "contextualize" the mempolicy
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
- * if @no_context is true, save the input nodemask in w.user_nodemask in
- * the returned mempolicy. This will be used to "clone" the mempolicy in
- * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
- * mount option. Note that if 'static' or 'relative' mode flags were
- * specified, the input nodemask will already have been saved. Saving
- * it again is redundant, but safe.
- *
* On success, returns 0, else 1
*/
-int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
+int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
unsigned short mode;
- unsigned short uninitialized_var(mode_flags);
+ unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
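
A hedged sketch of how a tmpfs-style caller might use the simplified mpol_parse_str() (the helper name and error handling are illustrative, not from the kernel): the option string has the form <mode>[=<flags>][:<nodelist>], e.g. "interleave:0-3", "prefer=static:1" or "local".

/* Hypothetical caller sketch; shmem_parse_mpol_option() is an illustrative
 * name.  mpol_parse_str() parses the string in place and, per the comment
 * above, returns 0 on success and 1 on failure.
 */
static int shmem_parse_mpol_option(char *value, struct mempolicy **mpol)
{
	if (mpol_parse_str(value, mpol))
		return -EINVAL;	/* e.g. unknown mode or malformed nodelist */
	return 0;
}
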
@@ -2719,24 +2743,23 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
if (IS_ERR(new))
goto out;
- if (no_context) {
- /* save for contextualization */
- new->w.user_nodemask = nodes;
- } else {
- int ret;
- NODEMASK_SCRATCH(scratch);
- if (scratch) {
- task_lock(current);
- ret = mpol_set_nodemask(new, &nodes, scratch);
- task_unlock(current);
- } else
- ret = -ENOMEM;
- NODEMASK_SCRATCH_FREE(scratch);
- if (ret) {
- mpol_put(new);
- goto out;
- }
- }
+ /*
+ * Save nodes for mpol_to_str() to show the tmpfs mount options
+ * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
+ */
+ if (mode != MPOL_PREFERRED)
+ new->v.nodes = nodes;
+ else if (nodelist)
+ new->v.preferred_node = first_node(nodes);
+ else
+ new->flags |= MPOL_F_LOCAL;
+
+ /*
+ * Save nodes for contextualization: this will be used to "clone"
+ * the mempolicy in a specific context [cpuset] at a later time.
+ */
+ new->w.user_nodemask = nodes;
+
err = 0;
out:
@@ -2756,13 +2779,12 @@ out:
* @buffer: to contain formatted mempolicy string
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
- * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
*
* Convert a mempolicy into a string.
* Returns the number of characters in buffer (if positive)
* or an error (negative)
*/
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
int l;
@@ -2788,7 +2810,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_PREFERRED:
nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
- mode = MPOL_LOCAL; /* pseudo-policy */
+ mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
@@ -2796,10 +2818,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
- if (no_context)
- nodes = pol->w.user_nodemask;
- else
- nodes = pol->v.nodes;
+ nodes = pol->v.nodes;
break;
default: