author     Linus Torvalds <torvalds@linux-foundation.org>  2013-11-12 02:20:12 +0100
committer  Linus Torvalds <torvalds@linux-foundation.org>  2013-11-12 02:20:12 +0100
commit     39cf275a1a18ba3c7eb9b986c5c9b35b57332798
tree       40b119ca9d2fbaf8128d3fa25f4c64669002b0c0 /mm/mempolicy.c
parent     Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kern...
parent     ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler changes from Ingo Molnar:
"The main changes in this cycle are:
- (much) improved CONFIG_NUMA_BALANCING support from Mel Gorman, Rik
van Riel, Peter Zijlstra et al. Yay!
- optimize preemption counter handling: merge the NEED_RESCHED flag
into the preempt_count variable, by Peter Zijlstra.
- wait.h fixes and code reorganization from Peter Zijlstra
- cfs_bandwidth fixes from Ben Segall
- SMP load-balancer cleanups from Peter Zijstra
- idle balancer improvements from Jason Low
- other fixes and cleanups"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (129 commits)
ftrace, sched: Add TRACE_FLAG_PREEMPT_RESCHED
stop_machine: Fix race between stop_two_cpus() and stop_cpus()
sched: Remove unnecessary iteration over sched domains to update nr_busy_cpus
sched: Fix asymmetric scheduling for POWER7
sched: Move completion code from core.c to completion.c
sched: Move wait code from core.c to wait.c
sched: Move wait.c into kernel/sched/
sched/wait: Fix __wait_event_interruptible_lock_irq_timeout()
sched: Avoid throttle_cfs_rq() racing with period_timer stopping
sched: Guarantee new group-entities always have weight
sched: Fix hrtimer_cancel()/rq->lock deadlock
sched: Fix cfs_bandwidth misuse of hrtimer_expires_remaining
sched: Fix race on toggling cfs_bandwidth_used
sched: Remove extra put_online_cpus() inside sched_setaffinity()
sched/rt: Fix task_tick_rt() comment
sched/wait: Fix build breakage
sched/wait: Introduce prepare_to_wait_event()
sched/wait: Add ___wait_cond_timeout() to wait_event*_timeout() too
sched: Remove get_online_cpus() usage
sched: Fix race in migrate_swap_stop()
...
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r-- | mm/mempolicy.c | 82 |
1 files changed, 78 insertions, 4 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 04729647f359..71cb253368cb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1679,6 +1679,30 @@ struct mempolicy *get_vma_policy(struct task_struct *task, return pol; } +bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) +{ + struct mempolicy *pol = get_task_policy(task); + if (vma) { + if (vma->vm_ops && vma->vm_ops->get_policy) { + bool ret = false; + + pol = vma->vm_ops->get_policy(vma, vma->vm_start); + if (pol && (pol->flags & MPOL_F_MOF)) + ret = true; + mpol_cond_put(pol); + + return ret; + } else if (vma->vm_policy) { + pol = vma->vm_policy; + } + } + + if (!pol) + return default_policy.flags & MPOL_F_MOF; + + return pol->flags & MPOL_F_MOF; +} + static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; @@ -2277,6 +2301,35 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } +#ifdef CONFIG_NUMA_BALANCING +static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + /* Never defer a private fault */ + if (cpupid_match_pid(p, last_cpupid)) + return false; + + if (p->numa_migrate_deferred) { + p->numa_migrate_deferred--; + return true; + } + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ + p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; +} +#else +static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) +{ + return false; +} + +static inline void defer_numa_migrate(struct task_struct *p) +{ +} +#endif /* CONFIG_NUMA_BALANCING */ + /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2300,6 +2353,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long struct zone *zone; int curnid = page_to_nid(page); unsigned long pgoff; + int thiscpu = raw_smp_processor_id(); + int thisnid = cpu_to_node(thiscpu); int polnid = -1; int ret = -1; @@ -2348,9 +2403,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_nid; + int last_cpupid; + int this_cpupid; - polnid = numa_node_id(); + polnid = thisnid; + this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); /* * Multi-stage node selection is used in conjunction @@ -2373,8 +2430,25 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_nid = page_nid_xchg_last(page, polnid); - if (last_nid != polnid) + last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { + + /* See sysctl_numa_balancing_migrate_deferred comment */ + if (!cpupid_match_pid(current, last_cpupid)) + defer_numa_migrate(current); + + goto out; + } + + /* + * The quadratic filter above reduces extraneous migration + * of shared pages somewhat. This code reduces it even more, + * reducing the overhead of page migrations of shared pages. + * This makes workloads with shared pages rely more on + * "move task near its memory", and less on "move memory + * towards its task", which is exactly what we want. + */ + if (numa_migrate_deferred(current, last_cpupid)) goto out; } |