summaryrefslogtreecommitdiffstats
path: root/mm/mempolicy.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2013-11-12 02:20:12 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2013-11-12 02:20:12 +0100
commit39cf275a1a18ba3c7eb9b986c5c9b35b57332798 (patch)
tree40b119ca9d2fbaf8128d3fa25f4c64669002b0c0 /mm/mempolicy.c
parentMerge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kern... (diff)
parentftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED (diff)
downloadlinux-39cf275a1a18ba3c7eb9b986c5c9b35b57332798.tar.xz
linux-39cf275a1a18ba3c7eb9b986c5c9b35b57332798.zip
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar: "The main changes in this cycle are: - (much) improved CONFIG_NUMA_BALANCING support from Mel Gorman, Rik van Riel, Peter Zijlstra et al. Yay! - optimize preemption counter handling: merge the NEED_RESCHED flag into the preempt_count variable, by Peter Zijlstra. - wait.h fixes and code reorganization from Peter Zijlstra - cfs_bandwidth fixes from Ben Segall - SMP load-balancer cleanups from Peter Zijstra - idle balancer improvements from Jason Low - other fixes and cleanups" * 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (129 commits) ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED stop_machine: Fix race between stop_two_cpus() and stop_cpus() sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus sched: Fix asymmetric scheduling for POWER7 sched: Move completion code from core.c to completion.c sched: Move wait code from core.c to wait.c sched: Move wait.c into kernel/sched/ sched/wait: Fix __wait_event_interruptible_lock_irq_timeout() sched: Avoid throttle_cfs_rq() racing with period_timer stopping sched: Guarantee new group-entities always have weight sched: Fix hrtimer_cancel()/rq->lock deadlock sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining sched: Fix race on toggling cfs_bandwidth_used sched: Remove extra put_online_cpus() inside sched_setaffinity() sched/rt: Fix task_tick_rt() comment sched/wait: Fix build breakage sched/wait: Introduce prepare_to_wait_event() sched/wait: Add ___wait_cond_timeout() to wait_event*_timeout() too sched: Remove get_online_cpus() usage sched: Fix race in migrate_swap_stop() ...
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--mm/mempolicy.c82
1 files changed, 78 insertions, 4 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04729647f359..71cb253368cb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
return pol;
}
+bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
+{
+ struct mempolicy *pol = get_task_policy(task);
+ if (vma) {
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
+ bool ret = false;
+
+ pol = vma->vm_ops->get_policy(vma, vma->vm_start);
+ if (pol && (pol->flags & MPOL_F_MOF))
+ ret = true;
+ mpol_cond_put(pol);
+
+ return ret;
+ } else if (vma->vm_policy) {
+ pol = vma->vm_policy;
+ }
+ }
+
+ if (!pol)
+ return default_policy.flags & MPOL_F_MOF;
+
+ return pol->flags & MPOL_F_MOF;
+}
+
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
@@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+ /* Never defer a private fault */
+ if (cpupid_match_pid(p, last_cpupid))
+ return false;
+
+ if (p->numa_migrate_deferred) {
+ p->numa_migrate_deferred--;
+ return true;
+ }
+ return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+ p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+ return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* mpol_misplaced - check whether current page node is valid in policy
*
@@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
+ int thiscpu = raw_smp_processor_id();
+ int thisnid = cpu_to_node(thiscpu);
int polnid = -1;
int ret = -1;
@@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
- int last_nid;
+ int last_cpupid;
+ int this_cpupid;
- polnid = numa_node_id();
+ polnid = thisnid;
+ this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
/*
* Multi-stage node selection is used in conjunction
@@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
* it less likely we act on an unlikely task<->page
* relation.
*/
- last_nid = page_nid_xchg_last(page, polnid);
- if (last_nid != polnid)
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+ /* See sysctl_numa_balancing_migrate_deferred comment */
+ if (!cpupid_match_pid(current, last_cpupid))
+ defer_numa_migrate(current);
+
+ goto out;
+ }
+
+ /*
+ * The quadratic filter above reduces extraneous migration
+ * of shared pages somewhat. This code reduces it even more,
+ * reducing the overhead of page migrations of shared pages.
+ * This makes workloads with shared pages rely more on
+ * "move task near its memory", and less on "move memory
+ * towards its task", which is exactly what we want.
+ */
+ if (numa_migrate_deferred(current, last_cpupid))
goto out;
}