diff options
author | Chris Down <chris@chrisdown.name> | 2018-12-03 15:38:06 +0100 |
---|---|---|
committer | Chris Down <chris@chrisdown.name> | 2018-12-03 16:40:31 +0100 |
commit | c72703e26d21cb4994f21ae50c4e18675b02ded3 (patch) | |
tree | c2f6cf0b7772548594323b2c98aa6bea387b303e | |
parent | cgroup: Traverse leaves to realised cgroup to release controllers (diff) | |
download | systemd-c72703e26d21cb4994f21ae50c4e18675b02ded3.tar.xz systemd-c72703e26d21cb4994f21ae50c4e18675b02ded3.zip |
cgroup: Add DisableControllers= directive to disable controller in subtree
Some controllers (like the CPU controller) have a performance cost that
is non-trivial on certain workloads. While this can be mitigated and
improved to an extent, there will for some controllers always be some
overheads associated with the benefits gained from the controller.
Inside Facebook, the fix applied has been to disable the CPU controller
forcibly with `cgroup_disable=cpu` on the kernel command line.
This presents a problem: to disable or reenable the controller, a reboot
is required, but this is quite cumbersome and slow to do for many
thousands of machines, especially machines where disabling/enabling a
stateful service on a machine is a matter of several minutes.
Currently systemd provides some configuration knobs for these in the
form of `[Default]CPUAccounting`, `[Default]MemoryAccounting`, and the
like. The limitation of these is that Default*Accounting is overrideable
by individual services, of which any one could decide to reenable a
controller within the hierarchy at any point just by using a controller
feature implicitly (eg. `CPUWeight`), even if the use of that CPU
feature could just be opportunistic. Since many services are provided by
the distribution, or by upstream teams at a particular organisation,
it's not a sustainable solution to simply try to find and remove
offending directives from these units.
This commit presents a more direct solution -- a DisableControllers=
directive that forcibly disallows a controller from being enabled within
a subtree.
-rw-r--r-- | man/systemd.resource-control.xml | 21 | ||||
-rw-r--r-- | src/core/cgroup.c | 16 | ||||
-rw-r--r-- | src/core/cgroup.h | 5 | ||||
-rw-r--r-- | src/core/dbus-cgroup.c | 38 | ||||
-rw-r--r-- | src/core/load-fragment-gperf.gperf.m4 | 1 | ||||
-rw-r--r-- | src/core/load-fragment.c | 35 | ||||
-rw-r--r-- | src/core/load-fragment.h | 1 | ||||
-rw-r--r-- | src/test/test-cgroup-mask.c | 14 | ||||
-rw-r--r-- | test/meson.build | 2 | ||||
-rw-r--r-- | test/nomem.slice | 5 | ||||
-rw-r--r-- | test/nomemleaf.service | 9 |
11 files changed, 127 insertions, 20 deletions
diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index 4e282dad3d..aa7d9bcd59 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -736,6 +736,27 @@ </listitem> </varlistentry> + <varlistentry> + <term><varname>DisableControllers=</varname></term> + + <listitem> + <para>Disables controllers from being enabled for a unit's children. If a controller listed is already in use + in its subtree, the controller will be removed from the subtree. This can be used to avoid child units being + able to implicitly or explicitly enable a controller. Defaults to not disabling any controllers.</para> + + <para>It may not be possible to successfully disable a controller if the unit or any child of the unit in + question delegates controllers to its children, as any delegated subtree of the cgroup hierarchy is unmanaged + by systemd.</para> + + <para>Multiple controllers may be specified, separated by spaces. You may also pass + <varname>DisableControllers=</varname> multiple times, in which case each new instance adds another controller + to disable. Passing <varname>DisableControllers=</varname> by itself with no controller name present resets + the disabled controller list.</para> + + <para>Valid controllers are <option>cpu</option>, <option>cpuacct</option>, <option>io</option>, + <option>blkio</option>, <option>memory</option>, <option>devices</option>, and <option>pids</option>.</para> + </listitem> + </varlistentry> </variablelist> </refsect1> diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 13c595bbcd..b585e4bd2b 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -1346,7 +1346,7 @@ CGroupMask unit_get_own_mask(Unit *u) { if (!c) return 0; - return cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u); + return (cgroup_context_get_mask(c) | unit_get_bpf_mask(u) | unit_get_delegate_mask(u)) & ~unit_get_ancestor_disable_mask(u); } CGroupMask unit_get_delegate_mask(Unit *u) { @@ -1461,6 +1461,7 @@ CGroupMask unit_get_target_mask(Unit *u) { mask = unit_get_own_mask(u) | unit_get_members_mask(u) | unit_get_siblings_mask(u); mask &= u->manager->cgroup_supported; + mask &= ~unit_get_ancestor_disable_mask(u); return mask; } @@ -1475,6 +1476,7 @@ CGroupMask unit_get_enable_mask(Unit *u) { mask = unit_get_members_mask(u); mask &= u->manager->cgroup_supported; + mask &= ~unit_get_ancestor_disable_mask(u); return mask; } @@ -1914,8 +1916,8 @@ static bool unit_has_mask_enables_realized( * we want to add is already added. */ return u->cgroup_realized && - ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == u->cgroup_realized_mask && - ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == u->cgroup_enabled_mask; + ((u->cgroup_realized_mask | target_mask) & CGROUP_MASK_V1) == (u->cgroup_realized_mask & CGROUP_MASK_V1) && + ((u->cgroup_enabled_mask | enable_mask) & CGROUP_MASK_V2) == (u->cgroup_enabled_mask & CGROUP_MASK_V2); } void unit_add_to_cgroup_realize_queue(Unit *u) { @@ -1965,11 +1967,7 @@ static int unit_realize_cgroup_now_enable(Unit *u, ManagerState state) { new_target_mask = u->cgroup_realized_mask | target_mask; new_enable_mask = u->cgroup_enabled_mask | enable_mask; - r = unit_create_cgroup(u, new_target_mask, new_enable_mask, state); - if (r < 0) - return r; - - return 0; + return unit_create_cgroup(u, new_target_mask, new_enable_mask, state); } /* Controllers can only be disabled depth-first, from the leaves of the @@ -2043,7 +2041,7 @@ static int unit_realize_cgroup_now_disable(Unit *u, ManagerState state) { * h i j k l m n o * * 1. We want to realise cgroup "d" now. - * 2. cgroup "a" has DisableController=cpu in the associated unit. + * 2. cgroup "a" has DisableControllers=cpu in the associated unit. * 3. cgroup "k" just started requesting the memory controller. * * To make this work we must do the following in order: diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 828b6f0795..6d094e9ecd 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -118,6 +118,8 @@ struct CGroupContext { bool delegate; CGroupMask delegate_controllers; + + CGroupMask disable_controllers; }; /* Used when querying IP accounting data */ @@ -151,6 +153,9 @@ CGroupMask unit_get_delegate_mask(Unit *u); CGroupMask unit_get_members_mask(Unit *u); CGroupMask unit_get_siblings_mask(Unit *u); CGroupMask unit_get_subtree_mask(Unit *u); +CGroupMask unit_get_disable_mask(Unit *u); +CGroupMask unit_get_ancestor_disable_mask(Unit *u); + CGroupMask unit_get_target_mask(Unit *u); CGroupMask unit_get_enable_mask(Unit *u); diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index a2f532076d..53890bcafb 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -17,7 +17,7 @@ static BUS_DEFINE_PROPERTY_GET_ENUM(property_get_cgroup_device_policy, cgroup_device_policy, CGroupDevicePolicy); -static int property_get_delegate_controllers( +static int property_get_cgroup_mask( sd_bus *bus, const char *path, const char *interface, @@ -26,26 +26,22 @@ static int property_get_delegate_controllers( void *userdata, sd_bus_error *error) { - CGroupContext *c = userdata; - CGroupController cc; + CGroupMask *mask = userdata; + CGroupController ctrl; int r; assert(bus); assert(reply); - assert(c); - - if (!c->delegate) - return sd_bus_message_append(reply, "as", 0); r = sd_bus_message_open_container(reply, 'a', "s"); if (r < 0) return r; - for (cc = 0; cc < _CGROUP_CONTROLLER_MAX; cc++) { - if ((c->delegate_controllers & CGROUP_CONTROLLER_TO_MASK(cc)) == 0) + for (ctrl = 0; ctrl < _CGROUP_CONTROLLER_MAX; ctrl++) { + if ((*mask & CGROUP_CONTROLLER_TO_MASK(ctrl)) == 0) continue; - r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(cc)); + r = sd_bus_message_append(reply, "s", cgroup_controller_to_string(ctrl)); if (r < 0) return r; } @@ -53,6 +49,27 @@ static int property_get_delegate_controllers( return sd_bus_message_close_container(reply); } +static int property_get_delegate_controllers( + sd_bus *bus, + const char *path, + const char *interface, + const char *property, + sd_bus_message *reply, + void *userdata, + sd_bus_error *error) { + + CGroupContext *c = userdata; + + assert(bus); + assert(reply); + assert(c); + + if (!c->delegate) + return sd_bus_message_append(reply, "as", 0); + + return property_get_cgroup_mask(bus, path, interface, property, reply, &c->delegate_controllers, error); +} + static int property_get_io_device_weight( sd_bus *bus, const char *path, @@ -342,6 +359,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("IPAccounting", "b", bus_property_get_bool, offsetof(CGroupContext, ip_accounting), 0), SD_BUS_PROPERTY("IPAddressAllow", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_allow), 0), SD_BUS_PROPERTY("IPAddressDeny", "a(iayu)", property_get_ip_address_access, offsetof(CGroupContext, ip_address_deny), 0), + SD_BUS_PROPERTY("DisableControllers", "as", property_get_cgroup_mask, offsetof(CGroupContext, disable_controllers), 0), SD_BUS_VTABLE_END }; diff --git a/src/core/load-fragment-gperf.gperf.m4 b/src/core/load-fragment-gperf.gperf.m4 index 97a707c144..cdbc67f885 100644 --- a/src/core/load-fragment-gperf.gperf.m4 +++ b/src/core/load-fragment-gperf.gperf.m4 @@ -192,6 +192,7 @@ $1.BlockIOWriteBandwidth, config_parse_blockio_bandwidth, 0, $1.TasksAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.tasks_accounting) $1.TasksMax, config_parse_tasks_max, 0, offsetof($1, cgroup_context.tasks_max) $1.Delegate, config_parse_delegate, 0, offsetof($1, cgroup_context) +$1.DisableControllers, config_parse_disable_controllers, 0, offsetof($1, cgroup_context) $1.IPAccounting, config_parse_bool, 0, offsetof($1, cgroup_context.ip_accounting) $1.IPAddressAllow, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_allow) $1.IPAddressDeny, config_parse_ip_address_access, 0, offsetof($1, cgroup_context.ip_address_deny) diff --git a/src/core/load-fragment.c b/src/core/load-fragment.c index d1988190c2..041b622314 100644 --- a/src/core/load-fragment.c +++ b/src/core/load-fragment.c @@ -4323,6 +4323,41 @@ int config_parse_exit_status( return 0; } +int config_parse_disable_controllers( + const char *unit, + const char *filename, + unsigned line, + const char *section, + unsigned section_line, + const char *lvalue, + int ltype, + const char *rvalue, + void *data, + void *userdata) { + + int r; + CGroupContext *c = data; + CGroupMask disabled_mask; + + /* 1. If empty, make all controllers eligible for use again. + * 2. If non-empty, merge all listed controllers, space separated. */ + + if (isempty(rvalue)) { + c->disable_controllers = 0; + return 0; + } + + r = cg_mask_from_string(rvalue, &disabled_mask); + if (r < 0 || disabled_mask <= 0) { + log_syntax(unit, LOG_ERR, filename, line, r, "Invalid cgroup string: %s, ignoring", rvalue); + return 0; + } + + c->disable_controllers |= disabled_mask; + + return 0; +} + #define FOLLOW_MAX 8 static int open_follow(char **filename, FILE **_f, Set *names, char **_final) { diff --git a/src/core/load-fragment.h b/src/core/load-fragment.h index 71d94a7762..e0d3b4ec3b 100644 --- a/src/core/load-fragment.h +++ b/src/core/load-fragment.h @@ -105,6 +105,7 @@ CONFIG_PARSER_PROTOTYPE(config_parse_log_extra_fields); CONFIG_PARSER_PROTOTYPE(config_parse_collect_mode); CONFIG_PARSER_PROTOTYPE(config_parse_pid_file); CONFIG_PARSER_PROTOTYPE(config_parse_exit_status); +CONFIG_PARSER_PROTOTYPE(config_parse_disable_controllers); /* gperf prototypes */ const struct ConfigPerfItem* load_fragment_gperf_lookup(const char *key, GPERF_LEN_TYPE length); diff --git a/src/test/test-cgroup-mask.c b/src/test/test-cgroup-mask.c index 143e7aa424..7f6c0c2772 100644 --- a/src/test/test-cgroup-mask.c +++ b/src/test/test-cgroup-mask.c @@ -30,7 +30,7 @@ static void log_cgroup_mask(CGroupMask got, CGroupMask expected) { static int test_cgroup_mask(void) { _cleanup_(rm_rf_physical_and_freep) char *runtime_dir = NULL; _cleanup_(manager_freep) Manager *m = NULL; - Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep; + Unit *son, *daughter, *parent, *root, *grandchild, *parent_deep, *nomem_parent, *nomem_leaf; int r; CGroupMask cpu_accounting_mask = get_cpu_accounting_mask(); @@ -68,11 +68,15 @@ static int test_cgroup_mask(void) { assert_se(manager_load_startable_unit_or_warn(m, "daughter.service", NULL, &daughter) >= 0); assert_se(manager_load_startable_unit_or_warn(m, "grandchild.service", NULL, &grandchild) >= 0); assert_se(manager_load_startable_unit_or_warn(m, "parent-deep.slice", NULL, &parent_deep) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "nomem.slice", NULL, &nomem_parent) >= 0); + assert_se(manager_load_startable_unit_or_warn(m, "nomemleaf.service", NULL, &nomem_leaf) >= 0); assert_se(UNIT_DEREF(son->slice) == parent); assert_se(UNIT_DEREF(daughter->slice) == parent); assert_se(UNIT_DEREF(parent_deep->slice) == parent); assert_se(UNIT_DEREF(grandchild->slice) == parent_deep); + assert_se(UNIT_DEREF(nomem_leaf->slice) == nomem_parent); root = UNIT_DEREF(parent->slice); + assert_se(UNIT_DEREF(nomem_parent->slice) == root); /* Verify per-unit cgroups settings. */ ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(son), CGROUP_MASK_CPU); @@ -80,6 +84,8 @@ static int test_cgroup_mask(void) { ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(grandchild), 0); ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent_deep), CGROUP_MASK_MEMORY); ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_parent), 0); + ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)); ASSERT_CGROUP_MASK_JOINED(unit_get_own_mask(root), 0); /* Verify aggregation of member masks */ @@ -88,6 +94,8 @@ static int test_cgroup_mask(void) { ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(grandchild), 0); ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent_deep), 0); ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_parent), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)); + ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(nomem_leaf), 0); ASSERT_CGROUP_MASK_JOINED(unit_get_members_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); /* Verify aggregation of sibling masks. */ @@ -96,6 +104,8 @@ static int test_cgroup_mask(void) { ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(grandchild), 0); ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent_deep), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY)); ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(parent), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_parent), (CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); + ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(nomem_leaf), (CGROUP_MASK_IO | CGROUP_MASK_BLKIO)); ASSERT_CGROUP_MASK_JOINED(unit_get_siblings_mask(root), (CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY)); /* Verify aggregation of target masks. */ @@ -104,6 +114,8 @@ static int test_cgroup_mask(void) { ASSERT_CGROUP_MASK(unit_get_target_mask(grandchild), 0); ASSERT_CGROUP_MASK(unit_get_target_mask(parent_deep), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_MEMORY) & m->cgroup_supported)); ASSERT_CGROUP_MASK(unit_get_target_mask(parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_parent), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | CGROUP_MASK_CPUACCT | CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported)); + ASSERT_CGROUP_MASK(unit_get_target_mask(nomem_leaf), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_IO | CGROUP_MASK_BLKIO) & m->cgroup_supported)); ASSERT_CGROUP_MASK(unit_get_target_mask(root), (CGROUP_MASK_EXTEND_JOINED(CGROUP_MASK_CPU | cpu_accounting_mask | CGROUP_MASK_IO | CGROUP_MASK_BLKIO | CGROUP_MASK_MEMORY) & m->cgroup_supported)); return 0; diff --git a/test/meson.build b/test/meson.build index bf02e39f43..d98bfb80d2 100644 --- a/test/meson.build +++ b/test/meson.build @@ -18,6 +18,8 @@ test_data_files = ''' hwdb/10-bad.hwdb journal-data/journal-1.txt journal-data/journal-2.txt + nomem.slice + nomemleaf.service parent-deep.slice parent.slice sched_idle_bad.service diff --git a/test/nomem.slice b/test/nomem.slice new file mode 100644 index 0000000000..9c5d208cb4 --- /dev/null +++ b/test/nomem.slice @@ -0,0 +1,5 @@ +[Unit] +Description=Nomem Parent Slice + +[Slice] +DisableControllers=memory diff --git a/test/nomemleaf.service b/test/nomemleaf.service new file mode 100644 index 0000000000..3cbaccb82f --- /dev/null +++ b/test/nomemleaf.service @@ -0,0 +1,9 @@ +[Unit] +Description=Nomem Leaf Service + +[Service] +Slice=nomem.slice +Type=oneshot +ExecStart=/bin/true +IOWeight=200 +MemoryAccounting=true |