diff options
Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r-- | kernel/cgroup/cpuset.c | 79 |
1 files changed, 52 insertions, 27 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..4657e2924ecb 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -56,6 +56,7 @@ #include <linux/time64.h> #include <linux/backing-dev.h> #include <linux/sort.h> +#include <linux/oom.h> #include <linux/uaccess.h> #include <linux/atomic.h> @@ -63,6 +64,7 @@ #include <linux/cgroup.h> #include <linux/wait.h> +DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -299,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); /* + * Cgroup v2 behavior is used when on default hierarchy or the + * cgroup_v2_mode flag is set. + */ +static inline bool is_in_v2_mode(void) +{ + return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || + (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); +} + +/* * This is ugly, but preserves the userspace API for existing cpuset * users. If someone tries to mount the "cpuset" filesystem, we * silently switch it to mount "cgroup" instead @@ -488,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* On legacy hiearchy, we must be a subset of our parent cpuset. */ ret = -EACCES; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - !is_cpuset_subset(trial, par)) + if (!is_in_v2_mode() && !is_cpuset_subset(trial, par)) goto out; /* @@ -576,6 +587,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } +/* Must be called with cpuset_mutex held. */ +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key.key) + 1; +} + /* * generate_sched_domains() * @@ -861,7 +879,7 @@ static void update_tasks_cpumask(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) set_cpus_allowed_ptr(task, cs->effective_cpus); css_task_iter_end(&it); @@ -895,8 +913,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - cpumask_empty(new_cpus)) + if (is_in_v2_mode() && cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -913,7 +930,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) cpumask_copy(cp->effective_cpus, new_cpus); spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); update_tasks_cpumask(cp); @@ -1091,7 +1108,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) { struct mm_struct *mm; bool migrate; @@ -1149,8 +1166,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && - nodes_empty(*new_mems)) + if (is_in_v2_mode() && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ @@ -1167,7 +1183,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) cp->effective_mems = *new_mems; spin_unlock_irq(&callback_lock); - WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); update_tasks_nodemask(cp); @@ -1284,7 +1300,7 @@ static void update_tasks_flags(struct cpuset *cs) struct css_task_iter it; struct task_struct *task; - css_task_iter_start(&cs->css, &it); + css_task_iter_start(&cs->css, 0, &it); while ((task = css_task_iter_next(&it))) cpuset_update_task_spread_flag(cs, task); css_task_iter_end(&it); @@ -1459,7 +1475,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) /* allow moving tasks into an empty cpuset if on default hierarchy */ ret = -ENOSPC; - if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && + if (!is_in_v2_mode() && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; @@ -1891,6 +1907,7 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, }, { @@ -1977,7 +1994,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; } @@ -2054,7 +2071,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_lock(&cpuset_mutex); spin_lock_irq(&callback_lock); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); top_cpuset.mems_allowed = node_possible_map; } else { @@ -2248,7 +2265,7 @@ retry: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); mems_updated = !nodes_equal(new_mems, cs->effective_mems); - if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) + if (is_in_v2_mode()) hotplug_update_tasks(cs, &new_cpus, &new_mems, cpus_updated, mems_updated); else @@ -2258,6 +2275,13 @@ retry: mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2279,7 +2303,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) static cpumask_t new_cpus; static nodemask_t new_mems; bool cpus_updated, mems_updated; - bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys); + bool on_dfl = is_in_v2_mode(); mutex_lock(&cpuset_mutex); @@ -2332,8 +2356,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(void) @@ -2342,16 +2368,15 @@ void cpuset_update_active_cpus(void) * We're inside cpu hotplug critical region which usually nests * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. - * - * We still need to do partition_sched_domains() synchronously; - * otherwise, the scheduler will get confused and put tasks to the - * dead CPU. Fall back to the default single domain. - * cpuset_hotplug_workfn() will rebuild it as necessary. */ - partition_sched_domains(1, NULL, NULL); schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. @@ -2497,12 +2522,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * If we're in interrupt, yes, we can always allocate. If @node is set in * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this * node is set in the nearest hardwalled cpuset ancestor to current's cpuset, - * yes. If current has access to memory reserves due to TIF_MEMDIE, yes. + * yes. If current has access to memory reserves as an oom victim, yes. * Otherwise, no. * * GFP_USER allocations are marked with the __GFP_HARDWALL bit, * and do not allow allocations outside the current tasks cpuset - * unless the task has been OOM killed as is marked TIF_MEMDIE. + * unless the task has been OOM killed. * GFP_KERNEL allocations are not so marked, so can escape to the * nearest enclosing hardwalled ancestor cpuset. * @@ -2525,7 +2550,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * affect that: * in_interrupt - any node ok (current task context irrelevant) * GFP_ATOMIC - any node ok - * TIF_MEMDIE - any node ok + * tsk_is_oom_victim - any node ok * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ @@ -2543,7 +2568,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ - if (unlikely(test_thread_flag(TIF_MEMDIE))) + if (unlikely(tsk_is_oom_victim(current))) return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ return false; |