diff options
author | Lennart Poettering <lennart@poettering.net> | 2023-10-16 16:13:16 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-10-16 16:13:16 +0200 |
commit | cde8cc946be9183c8381a5d30be0a862abd6b389 (patch) | |
tree | b6f5d178333e3d68f75bef09463fb39b157be916 | |
parent | mount-util: use mount beneath to replace previous namespace mount (diff) | |
parent | nspawn: set CoredumpReceive=yes on container's scope when --boot is set (diff) | |
download | systemd-cde8cc946be9183c8381a5d30be0a862abd6b389.tar.xz systemd-cde8cc946be9183c8381a5d30be0a862abd6b389.zip |
Merge pull request #29272 from enr0n/coredump-container
coredump: support forwarding coredumps to containers
-rw-r--r-- | docs/TRANSIENT-SETTINGS.md | 1 | ||||
-rw-r--r-- | man/org.freedesktop.systemd1.xml | 62 | ||||
-rw-r--r-- | man/systemd.resource-control.xml | 177 | ||||
-rw-r--r-- | src/basic/cgroup-util.c | 31 | ||||
-rw-r--r-- | src/basic/cgroup-util.h | 4 | ||||
-rw-r--r-- | src/basic/process-util.c | 27 | ||||
-rw-r--r-- | src/basic/process-util.h | 3 | ||||
-rw-r--r-- | src/core/cgroup.c | 22 | ||||
-rw-r--r-- | src/core/cgroup.h | 4 | ||||
-rw-r--r-- | src/core/dbus-cgroup.c | 18 | ||||
-rw-r--r-- | src/core/load-fragment-gperf.gperf.in | 1 | ||||
-rw-r--r-- | src/coredump/coredump.c | 336 | ||||
-rw-r--r-- | src/nspawn/nspawn-register.c | 24 | ||||
-rw-r--r-- | src/nspawn/nspawn-register.h | 5 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 6 | ||||
-rw-r--r-- | src/shared/bus-unit-util.c | 3 | ||||
-rw-r--r-- | src/shared/cgroup-show.c | 32 | ||||
-rwxr-xr-x | test/units/testsuite-74.coredump.sh | 27 | ||||
-rw-r--r-- | units/systemd-nspawn@.service.in | 1 |
19 files changed, 581 insertions, 203 deletions
diff --git a/docs/TRANSIENT-SETTINGS.md b/docs/TRANSIENT-SETTINGS.md index 07e248f8d5..15f1cbc47c 100644 --- a/docs/TRANSIENT-SETTINGS.md +++ b/docs/TRANSIENT-SETTINGS.md @@ -282,6 +282,7 @@ All cgroup/resource control settings are available for transient units ✓ ManagedOOMMemoryPressure= ✓ ManagedOOMMemoryPressureLimit= ✓ ManagedOOMPreference= +✓ CoredumpReceive= ``` ## Process Killing Settings diff --git a/man/org.freedesktop.systemd1.xml b/man/org.freedesktop.systemd1.xml index d8319318a9..199ce4f14c 100644 --- a/man/org.freedesktop.systemd1.xml +++ b/man/org.freedesktop.systemd1.xml @@ -2933,6 +2933,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly b CoredumpReceive = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -3555,6 +3557,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { <!--property NFTSet is not documented!--> + <!--property CoredumpReceive is not documented!--> + <!--property EnvironmentFiles is not documented!--> <!--property PassEnvironment is not documented!--> @@ -4189,6 +4193,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2eservice { <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/> + <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/> + <variablelist class="dbus-property" generated="True" extra-ref="Environment"/> <variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/> @@ -4982,6 +4988,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; + 
@org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly b CoredumpReceive = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -5614,6 +5622,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { <!--property NFTSet is not documented!--> + <!--property CoredumpReceive is not documented!--> + <!--property EnvironmentFiles is not documented!--> <!--property PassEnvironment is not documented!--> @@ -6230,6 +6240,8 @@ node /org/freedesktop/systemd1/unit/avahi_2ddaemon_2esocket { <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/> + <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/> + <variablelist class="dbus-property" generated="True" extra-ref="Environment"/> <variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/> @@ -6897,6 +6909,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly b CoredumpReceive = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -7457,6 +7471,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { <!--property NFTSet is not documented!--> + <!--property CoredumpReceive is not documented!--> + <!--property EnvironmentFiles is not documented!--> <!--property PassEnvironment is not documented!--> @@ -7987,6 +8003,8 @@ node /org/freedesktop/systemd1/unit/home_2emount { <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/> + <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/> + <variablelist class="dbus-property" generated="True" extra-ref="Environment"/> 
<variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/> @@ -8777,6 +8795,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly b CoredumpReceive = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly as Environment = ['...', ...]; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -9323,6 +9343,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { <!--property NFTSet is not documented!--> + <!--property CoredumpReceive is not documented!--> + <!--property EnvironmentFiles is not documented!--> <!--property PassEnvironment is not documented!--> @@ -9839,6 +9861,8 @@ node /org/freedesktop/systemd1/unit/dev_2dsda3_2eswap { <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/> + <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/> + <variablelist class="dbus-property" generated="True" extra-ref="Environment"/> <variablelist class="dbus-property" generated="True" extra-ref="EnvironmentFiles"/> @@ -10488,6 +10512,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly b CoredumpReceive = ...; }; interface org.freedesktop.DBus.Peer { ... }; interface org.freedesktop.DBus.Introspectable { ... 
}; @@ -10660,6 +10686,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { <!--property NFTSet is not documented!--> + <!--property CoredumpReceive is not documented!--> + <!--Autogenerated cross-references for systemd.directives, do not edit--> <variablelist class="dbus-interface" generated="True" extra-ref="org.freedesktop.systemd1.Unit"/> @@ -10840,6 +10868,8 @@ node /org/freedesktop/systemd1/unit/system_2eslice { <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/> + <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/> + <!--End of Autogenerated section--> <refsect2> @@ -11043,6 +11073,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { readonly t MemoryPressureThresholdUSec = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("false") readonly a(iiss) NFTSet = [...]; + @org.freedesktop.DBus.Property.EmitsChangedSignal("false") + readonly b CoredumpReceive = ...; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") readonly s KillMode = '...'; @org.freedesktop.DBus.Property.EmitsChangedSignal("const") @@ -11235,6 +11267,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { <!--property NFTSet is not documented!--> + <!--property CoredumpReceive is not documented!--> + <!--property KillMode is not documented!--> <!--property KillSignal is not documented!--> @@ -11445,6 +11479,8 @@ node /org/freedesktop/systemd1/unit/session_2d1_2escope { <variablelist class="dbus-property" generated="True" extra-ref="NFTSet"/> + <variablelist class="dbus-property" generated="True" extra-ref="CoredumpReceive"/> + <variablelist class="dbus-property" generated="True" extra-ref="KillMode"/> <variablelist class="dbus-property" generated="True" extra-ref="KillSignal"/> @@ -11679,8 +11715,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>RootImagePolicy</varname>, <varname>MountImagePolicy</varname>, and <varname>ExtensionImagePolicy</varname> were added in version 
254.</para> - <para><varname>NFTSet</varname> and - <varname>SetLoginEnvironment</varname> were added in version 255.</para> + <para><varname>NFTSet</varname>, + <varname>SetLoginEnvironment</varname> and + <varname>CoredumpReceive</varname> were added in version 255.</para> </refsect2> <refsect2> <title>Socket Unit Objects</title> @@ -11705,8 +11742,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>ExtensionImagePolicy</varname> were added in version 254.</para> <para><varname>PollLimitIntervalUSec</varname>, <varname>PollLimitBurst</varname>, - <varname>NFTSet</varname>, and - <varname>SetLoginEnvironment</varname> were added in version 255.</para> + <varname>NFTSet</varname>, + <varname>SetLoginEnvironment</varname> and + <varname>CoredumpReceive</varname> were added in version 255.</para> </refsect2> <refsect2> <title>Mount Unit Objects</title> @@ -11729,8 +11767,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>RootImagePolicy</varname>, <varname>MountImagePolicy</varname>, and <varname>ExtensionImagePolicy</varname> were added in version 254.</para> - <para><varname>NFTSet</varname> and - <varname>SetLoginEnvironment</varname> were added in version 255.</para> + <para><varname>NFTSet</varname>, + <varname>SetLoginEnvironment</varname> and + <varname>CoredumpReceive</varname> were added in version 255.</para> </refsect2> <refsect2> <title>Swap Unit Objects</title> @@ -11753,8 +11792,9 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>RootImagePolicy</varname>, <varname>MountImagePolicy</varname>, and <varname>ExtensionImagePolicy</varname> were added in version 254.</para> - <para><varname>NFTSet</varname> and - <varname>SetLoginEnvironment</varname> were added in version 255.</para> + <para><varname>NFTSet</varname>, + <varname>SetLoginEnvironment</varname> and + <varname>CoredumpReceive</varname> were added in version 255.</para> </refsect2> <refsect2> <title>Slice Unit 
Objects</title> @@ -11769,7 +11809,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>StartupMemoryZSwapMax</varname>, <varname>MemoryPressureWatch</varname>, and <varname>MemoryPressureThresholdUSec</varname> were added in version 254.</para> - <para><varname>NFTSet</varname> was added in version 255.</para> + <para><varname>NFTSet</varname> and + <varname>CoredumpReceive</varname> were added in version 255.</para> </refsect2> <refsect2> <title>Scope Unit Objects</title> @@ -11785,7 +11826,8 @@ $ gdbus introspect --system --dest org.freedesktop.systemd1 \ <varname>StartupMemoryZSwapMax</varname>, <varname>MemoryPressureWatch</varname>, and <varname>MemoryPressureThresholdUSec</varname> were added in version 254.</para> - <para><varname>NFTSet</varname> was added in version 255.</para> + <para><varname>NFTSet</varname> and + <varname>CoredumpReceive</varname> were added in version 255.</para> </refsect2> <refsect2> <title>Job Objects</title> diff --git a/man/systemd.resource-control.xml b/man/systemd.resource-control.xml index e9747cefbb..c3581e78b3 100644 --- a/man/systemd.resource-control.xml +++ b/man/systemd.resource-control.xml @@ -1005,6 +1005,92 @@ RestrictNetworkInterfaces=~eth1</programlisting> </listitem> </varlistentry> + <varlistentry> + <term><varname>NFTSet=</varname><replaceable>family</replaceable>:<replaceable>table</replaceable>:<replaceable>set</replaceable></term> + <listitem> + <para>This setting provides a method for integrating dynamic cgroup, user and group IDs into + firewall rules with <ulink url="https://netfilter.org/projects/nftables/index.html">NFT</ulink> + sets. The benefit of using this setting is to be able to use the IDs as selectors in firewall rules + easily and this in turn allows more fine grained filtering. NFT rules for cgroup matching use + numeric cgroup IDs, which change every time a service is restarted, making them hard to use in + systemd environment otherwise. 
Dynamic and random IDs used by <varname>DynamicUser=</varname> can + also be integrated with this setting.</para> + + <para>This option expects a whitespace-separated list of NFT set definitions. Each definition + consists of a colon-separated tuple of source type (one of <literal>cgroup</literal>, + <literal>user</literal> or <literal>group</literal>), NFT address family (one of + <literal>arp</literal>, <literal>bridge</literal>, <literal>inet</literal>, <literal>ip</literal>, + <literal>ip6</literal>, or <literal>netdev</literal>), table name and set name. The names of tables + and sets must conform to lexical restrictions of NFT table names. The type of the element used in + the NFT filter must match the type implied by the directive (<literal>cgroup</literal>, + <literal>user</literal> or <literal>group</literal>) as shown in the table below. When a control + group or a unit is realized, the corresponding ID will be appended to the NFT sets and it will be + removed when the control group or unit is removed. <command>systemd</command> only inserts + elements to (or removes from) the sets, so the related NFT rules, tables and sets must be prepared + elsewhere in advance. 
Failures to manage the sets will be ignored.</para> + + <table> + <title>Defined <varname>source type</varname> values</title> + <tgroup cols='3'> + <colspec colname='source type'/> + <colspec colname='description'/> + <colspec colname='NFT type name'/> + <thead> + <row> + <entry>Source type</entry> + <entry>Description</entry> + <entry>Corresponding NFT type name</entry> + </row> + </thead> + + <tbody> + <row> + <entry><literal>cgroup</literal></entry> + <entry>control group ID</entry> + <entry><literal>cgroupsv2</literal></entry> + </row> + <row> + <entry><literal>user</literal></entry> + <entry>user ID</entry> + <entry><literal>meta skuid</literal></entry> + </row> + <row> + <entry><literal>group</literal></entry> + <entry>group ID</entry> + <entry><literal>meta skgid</literal></entry> + </row> + </tbody> + </tgroup> + </table> + + <para>If the firewall rules are reinstalled so that the contents of NFT sets are destroyed, command + <command>systemctl daemon-reload</command> can be used to refill the sets.</para> + + <para>Example: + <programlisting>[Unit] +NFTSet=cgroup:inet:filter:my_service user:inet:filter:serviceuser +</programlisting> + Corresponding NFT rules: + <programlisting>table inet filter { + set my_service { + type cgroupsv2 + } + set serviceuser { + typeof meta skuid + } + chain x { + socket cgroupv2 level 2 @my_service accept + drop + } + chain y { + meta skuid @serviceuser accept + drop + } +}</programlisting> + </para> + <xi:include href="version-info.xml" xpointer="v255"/></listitem> + </varlistentry> + </variablelist> </refsect2><refsect2><title>BPF Programs</title> @@ -1500,92 +1586,27 @@ DeviceAllow=/dev/loop-control <xi:include href="version-info.xml" xpointer="v254"/></listitem> </varlistentry> + </variablelist> - <varlistentry> - <term><varname>NFTSet=</varname><replaceable>family</replaceable>:<replaceable>table</replaceable>:<replaceable>set</replaceable></term> - <listitem> - <para>This setting provides a method for integrating 
dynamic cgroup, user and group IDs into - firewall rules with <ulink url="https://netfilter.org/projects/nftables/index.html">NFT</ulink> - sets. The benefit of using this setting is to be able to use the IDs as selectors in firewall rules - easily and this in turn allows more fine grained filtering. NFT rules for cgroup matching use - numeric cgroup IDs, which change every time a service is restarted, making them hard to use in - systemd environment otherwise. Dynamic and random IDs used by <varname>DynamicUser=</varname> can - be also integrated with this setting.</para> + </refsect2><refsect2><title>Coredump Control</title> - <para>This option expects a whitespace separated list of NFT set definitions. Each definition - consists of a colon-separated tuple of source type (one of <literal>cgroup</literal>, - <literal>user</literal> or <literal>group</literal>), NFT address family (one of - <literal>arp</literal>, <literal>bridge</literal>, <literal>inet</literal>, <literal>ip</literal>, - <literal>ip6</literal>, or <literal>netdev</literal>), table name and set name. The names of tables - and sets must conform to lexical restrictions of NFT table names. The type of the element used in - the NFT filter must match the type implied by the directive (<literal>cgroup</literal>, - <literal>user</literal> or <literal>group</literal>) as shown in the table below. When a control - group or a unit is realized, the corresponding ID will be appended to the NFT sets and it will be - be removed when the control group or unit is removed. <command>systemd</command> only inserts - elements to (or removes from) the sets, so the related NFT rules, tables and sets must be prepared - elsewhere in advance. 
Failures to manage the sets will be ignored.</para> + <variablelist class='unit-directives'> - <table> - <title>Defined <varname>source type</varname> values</title> - <tgroup cols='3'> - <colspec colname='source type'/> - <colspec colname='description'/> - <colspec colname='NFT type name'/> - <thead> - <row> - <entry>Source type</entry> - <entry>Description</entry> - <entry>Corresponding NFT type name</entry> - </row> - </thead> + <varlistentry> + <term><varname>CoredumpReceive=</varname></term> - <tbody> - <row> - <entry><literal>cgroup</literal></entry> - <entry>control group ID</entry> - <entry><literal>cgroupsv2</literal></entry> - </row> - <row> - <entry><literal>user</literal></entry> - <entry>user ID</entry> - <entry><literal>meta skuid</literal></entry> - </row> - <row> - <entry><literal>group</literal></entry> - <entry>group ID</entry> - <entry><literal>meta skgid</literal></entry> - </row> - </tbody> - </tgroup> - </table> + <listitem><para>Takes a boolean argument. This setting is used to enable coredump forwarding for containers + that belong to this unit's cgroup. Units with <varname>CoredumpReceive=yes</varname> must also be configured + with <varname>Delegate=yes</varname>. 
Defaults to false.</para> - <para>If the firewall rules are reinstalled so that the contents of NFT sets are destroyed, command - <command>systemctl daemon-reload</command> can be used to refill the sets.</para> + <para>When <command>systemd-coredump</command> is handling a coredump for a process from a container, + if the container's leader process is a descendant of a cgroup with <varname>CoredumpReceive=yes</varname> + and <varname>Delegate=yes</varname>, then <command>systemd-coredump</command> will attempt to forward + the coredump to <command>systemd-coredump</command> within the container.</para> - <para>Example: - <programlisting>[Unit] -NFTSet=cgroup:inet:filter:my_service user:inet:filter:serviceuser -</programlisting> - Corresponding NFT rules: - <programlisting>table inet filter { - set my_service { - type cgroupsv2 - } - set serviceuser { - typeof meta skuid - } - chain x { - socket cgroupv2 level 2 @my_service accept - drop - } - chain y { - meta skuid @serviceuser accept - drop - } -}</programlisting> - </para> <xi:include href="version-info.xml" xpointer="v255"/></listitem> </varlistentry> + </variablelist> </refsect2> </refsect1> diff --git a/src/basic/cgroup-util.c b/src/basic/cgroup-util.c index db803084ae..f4012f1c61 100644 --- a/src/basic/cgroup-util.c +++ b/src/basic/cgroup-util.c @@ -2268,6 +2268,37 @@ int cg_hybrid_unified(void) { return r == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232; } +int cg_is_delegated(const char *path) { + int r; + + assert(path); + + r = cg_get_xattr_bool(path, "trusted.delegate"); + if (ERRNO_IS_NEG_XATTR_ABSENT(r)) { + /* If the trusted xattr isn't set (preferred), then check the + * untrusted one. Under the assumption that whoever is trusted + * enough to own the cgroup, is also trusted enough to decide + * if it is delegated or not this should be safe. 
*/ + r = cg_get_xattr_bool(path, "user.delegate"); + if (ERRNO_IS_NEG_XATTR_ABSENT(r)) + return false; + } + + return r; +} + +int cg_has_coredump_receive(const char *path) { + int r; + + assert(path); + + r = cg_get_xattr_bool(path, "user.coredump_receive"); + if (ERRNO_IS_NEG_XATTR_ABSENT(r)) + return false; + + return r; +} + const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = { [CGROUP_IO_RBPS_MAX] = CGROUP_LIMIT_MAX, [CGROUP_IO_WBPS_MAX] = CGROUP_LIMIT_MAX, diff --git a/src/basic/cgroup-util.h b/src/basic/cgroup-util.h index 80ea7e7ffa..a64bdcf7a9 100644 --- a/src/basic/cgroup-util.h +++ b/src/basic/cgroup-util.h @@ -210,6 +210,10 @@ int cg_rmdir(const char *controller, const char *path); int cg_is_threaded(const char *path); +int cg_is_delegated(const char *path); + +int cg_has_coredump_receive(const char *path); + typedef enum { CG_KEY_MODE_GRACEFUL = 1 << 0, } CGroupKeyMode; diff --git a/src/basic/process-util.c b/src/basic/process-util.c index d0ffb2d614..ed096900ac 100644 --- a/src/basic/process-util.c +++ b/src/basic/process-util.c @@ -319,6 +319,33 @@ int container_get_leader(const char *machine, pid_t *pid) { return 0; } +int namespace_get_leader(pid_t pid, NamespaceType type, pid_t *ret) { + int r; + + assert(ret); + + for (;;) { + pid_t ppid; + + r = get_process_ppid(pid, &ppid); + if (r < 0) + return r; + + r = in_same_namespace(pid, ppid, type); + if (r < 0) + return r; + if (r == 0) { + /* If the parent and the child are not in the same + * namespace, then the child is the leader we are + * looking for. 
*/ + *ret = pid; + return 0; + } + + pid = ppid; + } +} + int is_kernel_thread(pid_t pid) { _cleanup_free_ char *line = NULL; unsigned long long flags; diff --git a/src/basic/process-util.h b/src/basic/process-util.h index 0d50e797e5..7dfa510b93 100644 --- a/src/basic/process-util.h +++ b/src/basic/process-util.h @@ -14,6 +14,7 @@ #include "alloc-util.h" #include "format-util.h" #include "macro.h" +#include "namespace-util.h" #include "time-util.h" #define procfs_file_alloca(pid, field) \ @@ -53,6 +54,8 @@ int get_process_umask(pid_t pid, mode_t *ret); int container_get_leader(const char *machine, pid_t *pid); +int namespace_get_leader(pid_t pid, NamespaceType type, pid_t *ret); + int wait_for_terminate(pid_t pid, siginfo_t *status); typedef enum WaitFlags { diff --git a/src/core/cgroup.c b/src/core/cgroup.c index e217eab7ee..c2bf3af20c 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -533,7 +533,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { "%sManagedOOMMemoryPressure: %s\n" "%sManagedOOMMemoryPressureLimit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n" "%sManagedOOMPreference: %s\n" - "%sMemoryPressureWatch: %s\n", + "%sMemoryPressureWatch: %s\n" + "%sCoredumpReceive: %s\n", prefix, yes_no(c->cpu_accounting), prefix, yes_no(c->io_accounting), prefix, yes_no(c->blockio_accounting), @@ -576,7 +577,8 @@ void cgroup_context_dump(Unit *u, FILE* f, const char *prefix) { prefix, managed_oom_mode_to_string(c->moom_mem_pressure), prefix, PERMYRIAD_AS_PERCENT_FORMAT_VAL(UINT32_SCALE_TO_PERMYRIAD(c->moom_mem_pressure_limit)), prefix, managed_oom_preference_to_string(c->moom_preference), - prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch)); + prefix, cgroup_pressure_watch_to_string(c->memory_pressure_watch), + prefix, yes_no(c->coredump_receive)); if (c->delegate_subgroup) fprintf(f, "%sDelegateSubgroup: %s\n", @@ -916,6 +918,21 @@ static void cgroup_invocation_id_xattr_apply(Unit *u) { } } +static void 
cgroup_coredump_xattr_apply(Unit *u) { + CGroupContext *c; + + assert(u); + + c = unit_get_cgroup_context(u); + if (!c) + return; + + if (unit_cgroup_delegate(u) && c->coredump_receive) + unit_set_xattr_graceful(u, "user.coredump_receive", "1", 1); + else + unit_remove_xattr_graceful(u, "user.coredump_receive"); +} + static void cgroup_delegate_xattr_apply(Unit *u) { bool b; @@ -976,6 +993,7 @@ static void cgroup_xattr_apply(Unit *u) { /* The 'user.*' xattrs can be set from a user manager. */ cgroup_oomd_xattr_apply(u); cgroup_log_xattr_apply(u); + cgroup_coredump_xattr_apply(u); if (!MANAGER_IS_SYSTEM(u->manager)) return; diff --git a/src/core/cgroup.h b/src/core/cgroup.h index 80d353fac6..5b5e868a58 100644 --- a/src/core/cgroup.h +++ b/src/core/cgroup.h @@ -227,6 +227,10 @@ struct CGroupContext { * triggers, nor triggers for non-memory pressure. We might add that later. */ NFTSetContext nft_set_context; + + /* Forward coredumps for processes that crash within this cgroup. + * Requires 'delegate' to also be true. 
*/ + bool coredump_receive; }; /* Used when querying IP accounting data */ diff --git a/src/core/dbus-cgroup.c b/src/core/dbus-cgroup.c index 820b43ee1f..681b86dd87 100644 --- a/src/core/dbus-cgroup.c +++ b/src/core/dbus-cgroup.c @@ -521,6 +521,7 @@ const sd_bus_vtable bus_cgroup_vtable[] = { SD_BUS_PROPERTY("MemoryPressureWatch", "s", bus_property_get_cgroup_pressure_watch, offsetof(CGroupContext, memory_pressure_watch), 0), SD_BUS_PROPERTY("MemoryPressureThresholdUSec", "t", bus_property_get_usec, offsetof(CGroupContext, memory_pressure_threshold_usec), 0), SD_BUS_PROPERTY("NFTSet", "a(iiss)", property_get_cgroup_nft_set, 0, 0), + SD_BUS_PROPERTY("CoredumpReceive", "b", bus_property_get_bool, offsetof(CGroupContext, coredump_receive), 0), SD_BUS_VTABLE_END }; @@ -840,6 +841,23 @@ static int bus_cgroup_set_transient_property( } return 1; + } else if (streq(name, "CoredumpReceive")) { + int b; + + if (!UNIT_VTABLE(u)->can_delegate) + return sd_bus_error_set(error, SD_BUS_ERROR_INVALID_ARGS, "Delegation not available for unit type"); + + r = sd_bus_message_read(message, "b", &b); + if (r < 0) + return r; + + if (!UNIT_WRITE_FLAGS_NOOP(flags)) { + c->coredump_receive = b; + + unit_write_settingf(u, flags, name, "CoredumpReceive=%s", yes_no(b)); + } + + return 1; } return 0; diff --git a/src/core/load-fragment-gperf.gperf.in b/src/core/load-fragment-gperf.gperf.in index 0ab468ba17..45f9ab03c4 100644 --- a/src/core/load-fragment-gperf.gperf.in +++ b/src/core/load-fragment-gperf.gperf.in @@ -260,6 +260,7 @@ {{type}}.MemoryPressureThresholdSec, config_parse_sec, 0, offsetof({{type}}, cgroup_context.memory_pressure_threshold_usec) {{type}}.MemoryPressureWatch, config_parse_memory_pressure_watch, 0, offsetof({{type}}, cgroup_context.memory_pressure_watch) {{type}}.NFTSet, config_parse_cgroup_nft_set, NFT_SET_PARSE_CGROUP, offsetof({{type}}, cgroup_context) +{{type}}.CoredumpReceive, config_parse_bool, 0, offsetof({{type}}, cgroup_context.coredump_receive) {%- endmacro -%} 
%{ diff --git a/src/coredump/coredump.c b/src/coredump/coredump.c index bbe0f1387e..12f84ef965 100644 --- a/src/coredump/coredump.c +++ b/src/coredump/coredump.c @@ -38,7 +38,9 @@ #include "memory-util.h" #include "memstream-util.h" #include "mkdir-label.h" +#include "namespace-util.h" #include "parse-util.h" +#include "path-util.h" #include "process-util.h" #include "signal-util.h" #include "socket-util.h" @@ -131,6 +133,8 @@ typedef struct Context { const char *meta[_META_MAX]; size_t meta_size[_META_MAX]; pid_t pid; + uid_t uid; + gid_t gid; bool is_pid1; bool is_journald; } Context; @@ -271,7 +275,6 @@ static int fix_permissions( const char *filename, const char *target, const Context *context, - uid_t uid, bool allow_user) { int r; @@ -282,7 +285,7 @@ static int fix_permissions( /* Ignore errors on these */ (void) fchmod(fd, 0640); - (void) fix_acl(fd, uid, allow_user); + (void) fix_acl(fd, context->uid, allow_user); (void) fix_xattr(fd, context); r = link_tmpfile(fd, filename, target, LINK_TMPFILE_SYNC); @@ -418,7 +421,6 @@ static int save_external_coredump( uint64_t rlimit, process_limit, max_size; bool truncated, storage_on_tmpfs; struct stat st; - uid_t uid; int r; assert(context); @@ -429,10 +431,6 @@ static int save_external_coredump( assert(ret_compressed_size); assert(ret_truncated); - r = parse_uid(context->meta[META_ARGV_UID], &uid); - if (r < 0) - return log_error_errno(r, "Failed to parse UID: %m"); - r = safe_atou64(context->meta[META_ARGV_RLIMIT], &rlimit); if (r < 0) return log_error_errno(r, "Failed to parse resource limit '%s': %m", @@ -563,7 +561,7 @@ static int save_external_coredump( uncompressed_size += partial_uncompressed_size; } - r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, uid, allow_user); + r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, allow_user); if (r < 0) return r; @@ -590,7 +588,7 @@ static int save_external_coredump( "SIZE_LIMIT=%"PRIu64, max_size, "MESSAGE_ID=" 
SD_MESSAGE_TRUNCATED_CORE_STR); - r = fix_permissions(fd, tmp, fn, context, uid, allow_user); + r = fix_permissions(fd, tmp, fn, context, allow_user); if (r < 0) return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn); @@ -717,56 +715,6 @@ static int compose_open_fds(pid_t pid, char **ret) { return memstream_finalize(&m, ret, NULL); } -static int get_process_ns(pid_t pid, const char *namespace, ino_t *ns) { - const char *p; - struct stat stbuf; - _cleanup_close_ int proc_ns_dir_fd = -EBADF; - - p = procfs_file_alloca(pid, "ns"); - - proc_ns_dir_fd = open(p, O_DIRECTORY | O_CLOEXEC | O_RDONLY); - if (proc_ns_dir_fd < 0) - return -errno; - - if (fstatat(proc_ns_dir_fd, namespace, &stbuf, /* flags */0) < 0) - return -errno; - - *ns = stbuf.st_ino; - return 0; -} - -static int get_mount_namespace_leader(pid_t pid, pid_t *ret) { - ino_t proc_mntns; - int r; - - r = get_process_ns(pid, "mnt", &proc_mntns); - if (r < 0) - return r; - - for (;;) { - ino_t parent_mntns; - pid_t ppid; - - r = get_process_ppid(pid, &ppid); - if (r == -EADDRNOTAVAIL) /* Reached the top (i.e. typically PID 1, but could also be a process - * whose parent is not in our pidns) */ - return -ENOENT; - if (r < 0) - return r; - - r = get_process_ns(ppid, "mnt", &parent_mntns); - if (r < 0) - return r; - - if (proc_mntns != parent_mntns) { - *ret = ppid; - return 0; - } - - pid = ppid; - } -} - /* Returns 1 if the parent was found. * Returns 0 if there is not a process we can call the pid's * container parent (the pid's process isn't 'containerized'). 
@@ -792,7 +740,7 @@ static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) { return 0; } - r = get_mount_namespace_leader(pid, &container_pid); + r = namespace_get_leader(pid, NAMESPACE_MOUNT, &container_pid); if (r < 0) return r; @@ -804,14 +752,10 @@ static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) { } static int change_uid_gid(const Context *context) { - uid_t uid; - gid_t gid; + uid_t uid = context->uid; + gid_t gid = context->gid; int r; - r = parse_uid(context->meta[META_ARGV_UID], &uid); - if (r < 0) - return r; - if (uid_is_system(uid)) { const char *user = "systemd-coredump"; @@ -820,10 +764,6 @@ static int change_uid_gid(const Context *context) { log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user); uid = gid = 0; } - } else { - r = parse_gid(context->meta[META_ARGV_GID], &gid); - if (r < 0) - return r; } return drop_privileges(uid, gid, 0); @@ -1031,6 +971,14 @@ static int save_context(Context *context, const struct iovec_wrapper *iovw) { if (r < 0) return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]); + r = parse_uid(context->meta[META_ARGV_UID], &context->uid); + if (r < 0) + return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]); + + r = parse_gid(context->meta[META_ARGV_GID], &context->gid); + if (r < 0) + return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]); + unit = context->meta[META_UNIT]; context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE); context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE); @@ -1368,6 +1316,237 @@ static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context * return save_context(context, iovw); } +static int send_ucred(int transport_fd, struct ucred *ucred) { + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {}; + struct msghdr mh = { + .msg_control 
= &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg; + + assert(transport_fd >= 0); + + cmsg = CMSG_FIRSTHDR(&mh); + *cmsg = (struct cmsghdr) { + .cmsg_level = SOL_SOCKET, + .cmsg_type = SCM_CREDENTIALS, + .cmsg_len = CMSG_LEN(sizeof(struct ucred)), + }; + memcpy(CMSG_DATA(cmsg), ucred, sizeof(struct ucred)); + + return RET_NERRNO(sendmsg(transport_fd, &mh, MSG_NOSIGNAL)); +} + +static int receive_ucred(int transport_fd, struct ucred *ret_ucred) { + CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {}; + struct msghdr mh = { + .msg_control = &control, + .msg_controllen = sizeof(control), + }; + struct cmsghdr *cmsg = NULL; + struct ucred *ucred = NULL; + ssize_t n; + + assert(ret_ucred); + + n = recvmsg_safe(transport_fd, &mh, 0); + if (n < 0) + return n; + + CMSG_FOREACH(cmsg, &mh) + if (cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_CREDENTIALS && + cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) { + + assert(!ucred); + ucred = CMSG_TYPED_DATA(cmsg, struct ucred); + } + + if (!ucred) + return -EIO; + + *ret_ucred = *ucred; + + return 0; +} + +static int can_forward_coredump(pid_t pid) { + _cleanup_free_ char *cgroup = NULL, *path = NULL, *unit = NULL; + int r; + + r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup); + if (r < 0) + return r; + + r = path_extract_directory(cgroup, &path); + if (r < 0) + return r; + + r = cg_path_get_unit_path(path, &unit); + if (r == -ENOMEM) + return log_oom(); + if (r == -ENXIO) + /* No valid units in this path. */ + return false; + if (r < 0) + return r; + + /* We require that this process belongs to a delegated cgroup + * (i.e. Delegate=yes), with CoredumpReceive=yes also. 
*/ + r = cg_is_delegated(unit); + if (r <= 0) + return r; + + return cg_has_coredump_receive(unit); +} + +static int forward_coredump_to_container(Context *context) { + _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF; + _cleanup_close_pair_ int pair[2] = PIPE_EBADF; + pid_t pid, child; + struct ucred ucred = { + .pid = context->pid, + .uid = context->uid, + .gid = context->gid, + }; + int r; + + r = namespace_get_leader(context->pid, NAMESPACE_PID, &pid); + if (r < 0) + return log_debug_errno(r, "Failed to get namespace leader: %m"); + + r = can_forward_coredump(pid); + if (r < 0) + return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m"); + if (r == 0) + return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), + "Coredump will not be forwarded because no target cgroup was found."); + + r = RET_NERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair)); + if (r < 0) + return log_debug_errno(r, "Failed to create socket pair: %m"); + + r = setsockopt_int(pair[1], SOL_SOCKET, SO_PASSCRED, true); + if (r < 0) + return log_debug_errno(r, "Failed to set SO_PASSCRED: %m"); + + r = namespace_open(pid, &pidnsfd, &mntnsfd, &netnsfd, &usernsfd, &rootfd); + if (r < 0) + return log_debug_errno(r, "Failed to join namespaces of PID " PID_FMT ": %m", pid); + + r = namespace_fork("(sd-coredumpns)", "(sd-coredump)", NULL, 0, + FORK_RESET_SIGNALS|FORK_DEATHSIG, + pidnsfd, mntnsfd, netnsfd, usernsfd, rootfd, &child); + if (r < 0) + return log_debug_errno(r, "Failed to fork into namespaces of PID " PID_FMT ": %m", pid); + if (r == 0) { + _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL; + Context child_context = {}; + + pair[0] = safe_close(pair[0]); + + if (laccess("/run/systemd/coredump", W_OK) < 0) { + log_debug_errno(errno, "Cannot find coredump socket, exiting: %m"); + _exit(EXIT_FAILURE); + } + + r = receive_ucred(pair[1], &ucred); + if (r < 0) { + log_debug_errno(r, "Failed to receive ucred 
and fd: %m"); + _exit(EXIT_FAILURE); + } + + iovw = iovw_new(); + if (!iovw) { + log_oom(); + _exit(EXIT_FAILURE); + } + + (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR); + (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT)); + (void) iovw_put_string_field(iovw, "COREDUMP_FORWARDED=", "1"); + + for (int i = 0; i < _META_ARGV_MAX; i++) { + int signo; + char buf[DECIMAL_STR_MAX(pid_t)]; + const char *t = context->meta[i]; + + switch(i) { + + case META_ARGV_PID: + xsprintf(buf, PID_FMT, ucred.pid); + t = buf; + + break; + + case META_ARGV_UID: + xsprintf(buf, UID_FMT, ucred.uid); + t = buf; + break; + + case META_ARGV_GID: + xsprintf(buf, GID_FMT, ucred.gid); + t = buf; + break; + + case META_ARGV_SIGNAL: + if (safe_atoi(t, &signo) >= 0 && SIGNAL_VALID(signo)) + (void) iovw_put_string_field(iovw, + "COREDUMP_SIGNAL_NAME=SIG", + signal_to_string(signo)); + break; + + default: + break; + } + + r = iovw_put_string_field(iovw, meta_field_names[i], t); + if (r < 0) { + log_debug_errno(r, "Failed to construct iovec: %m"); + _exit(EXIT_FAILURE); + } + } + + r = save_context(&child_context, iovw); + if (r < 0) { + log_debug_errno(r, "Failed to save context: %m"); + _exit(EXIT_FAILURE); + } + + r = gather_pid_metadata_from_procfs(iovw, &child_context); + if (r < 0) { + log_debug_errno(r, "Failed to gather metadata from procfs: %m"); + _exit(EXIT_FAILURE); + } + + r = send_iovec(iovw, STDIN_FILENO); + if (r < 0) { + log_debug_errno(r, "Failed to send iovec to coredump socket: %m"); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + pair[1] = safe_close(pair[1]); + + /* We need to translate the PID, UID, and GID of the crashing process + * to the container's namespaces. Do this by sending an SCM_CREDENTIALS + * message on a socket pair, and read the result when we join the + * container. The kernel will perform the translation for us. 
*/ + r = send_ucred(pair[0], &ucred); + if (r < 0) + return log_debug_errno(r, "Failed to send metadata to container: %m"); + + r = wait_for_terminate_and_check("(sd-coredumpns)", child, 0); + if (r < 0) + return log_debug_errno(r, "Failed to wait for child to terminate: %m"); + if (r != EXIT_SUCCESS) + return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Failed to process coredump in container: %m"); + + return 0; +} + static int process_kernel(int argc, char* argv[]) { _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL; Context context = {}; @@ -1386,9 +1565,6 @@ static int process_kernel(int argc, char* argv[]) { if (!iovw) return log_oom(); - (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR); - (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT)); - /* Collect all process metadata passed by the kernel through argv[] */ r = gather_pid_metadata_from_argv(iovw, &context, argc - 1, argv + 1); if (r < 0) @@ -1403,6 +1579,17 @@ static int process_kernel(int argc, char* argv[]) { /* OK, now we know it's not the journal, hence we can make use of it now. */ log_set_target_and_open(LOG_TARGET_JOURNAL_OR_KMSG); + r = in_same_namespace(getpid_cached(), context.pid, NAMESPACE_PID); + if (r < 0) + log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m"); + if (r == 0) { + /* If this fails, fallback to the old behavior so that + * there is still some record of the crash. */ + r = forward_coredump_to_container(&context); + if (r >= 0) + return 0; + } + /* If this is PID 1 disable coredump collection, we'll unlikely be able to process * it later on. 
* @@ -1414,6 +1601,9 @@ static int process_kernel(int argc, char* argv[]) { disable_coredumps(); } + (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR); + (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT)); + if (context.is_journald || context.is_pid1) return submit_coredump(&context, iovw, STDIN_FILENO); diff --git a/src/nspawn/nspawn-register.c b/src/nspawn/nspawn-register.c index 2c9ebda61a..f8f82e91b1 100644 --- a/src/nspawn/nspawn-register.c +++ b/src/nspawn/nspawn-register.c @@ -8,6 +8,7 @@ #include "bus-util.h" #include "bus-wait-for-jobs.h" #include "nspawn-register.h" +#include "nspawn-settings.h" #include "special.h" #include "stat-util.h" #include "strv.h" @@ -16,7 +17,8 @@ static int append_machine_properties( sd_bus_message *m, CustomMount *mounts, unsigned n_mounts, - int kill_signal) { + int kill_signal, + bool coredump_receive) { unsigned j; int r; @@ -79,6 +81,12 @@ static int append_machine_properties( return bus_log_create_error(r); } + if (coredump_receive) { + r = sd_bus_message_append(m, "(sv)", "CoredumpReceive", "b", true); + if (r < 0) + return bus_log_create_error(r); + } + return 0; } @@ -114,7 +122,8 @@ int register_machine( char **properties, sd_bus_message *properties_message, bool keep_unit, - const char *service) { + const char *service, + StartMode start_mode) { _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; int r; @@ -174,7 +183,8 @@ int register_machine( m, mounts, n_mounts, - kill_signal); + kill_signal, + start_mode == START_BOOT); if (r < 0) return r; @@ -226,7 +236,8 @@ int allocate_scope( int kill_signal, char **properties, sd_bus_message *properties_message, - bool allow_pidfd) { + bool allow_pidfd, + StartMode start_mode) { _cleanup_(sd_bus_message_unrefp) sd_bus_message *m = NULL, *reply = NULL; _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL; @@ -295,7 +306,8 @@ int allocate_scope( m, mounts, n_mounts, - kill_signal); + kill_signal, + 
start_mode == START_BOOT); if (r < 0) return r; @@ -321,7 +333,7 @@ int allocate_scope( * doesn't support PIDFDs yet, let's try without. */ if (allow_pidfd && sd_bus_error_has_names(&error, SD_BUS_ERROR_UNKNOWN_PROPERTY, SD_BUS_ERROR_PROPERTY_READ_ONLY)) - return allocate_scope(bus, machine_name, pid, slice, mounts, n_mounts, kill_signal, properties, properties_message, /* allow_pidfd= */ false); + return allocate_scope(bus, machine_name, pid, slice, mounts, n_mounts, kill_signal, properties, properties_message, /* allow_pidfd= */ false, start_mode); return log_error_errno(r, "Failed to allocate scope: %s", bus_error_message(&error, r)); } diff --git a/src/nspawn/nspawn-register.h b/src/nspawn/nspawn-register.h index be65d2b230..4d16ac20e2 100644 --- a/src/nspawn/nspawn-register.h +++ b/src/nspawn/nspawn-register.h @@ -6,9 +6,10 @@ #include "sd-id128.h" #include "nspawn-mount.h" +#include "nspawn-settings.h" -int register_machine(sd_bus *bus, const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message, bool keep_unit, const char *service); +int register_machine(sd_bus *bus, const char *machine_name, pid_t pid, const char *directory, sd_id128_t uuid, int local_ifindex, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message, bool keep_unit, const char *service, StartMode start_mode); int unregister_machine(sd_bus *bus, const char *machine_name); -int allocate_scope(sd_bus *bus, const char *machine_name, pid_t pid, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, sd_bus_message *properties_message, bool allow_pidfds); +int allocate_scope(sd_bus *bus, const char *machine_name, pid_t pid, const char *slice, CustomMount *mounts, unsigned n_mounts, int kill_signal, char **properties, 
sd_bus_message *properties_message, bool allow_pidfds, StartMode start_mode); int terminate_scope(sd_bus *bus, const char *machine_name); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index 07234df85b..de76b88c27 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -5061,7 +5061,8 @@ static int run_container( arg_property, arg_property_message, arg_keep_unit, - arg_container_service_name); + arg_container_service_name, + arg_start_mode); if (r < 0) return r; @@ -5075,7 +5076,8 @@ static int run_container( arg_kill_signal, arg_property, arg_property_message, - /* allow_pidfds= */ true); + /* allow_pidfds= */ true, + arg_start_mode); if (r < 0) return r; diff --git a/src/shared/bus-unit-util.c b/src/shared/bus-unit-util.c index 634a8f08c2..4ee9706847 100644 --- a/src/shared/bus-unit-util.c +++ b/src/shared/bus-unit-util.c @@ -565,7 +565,8 @@ static int bus_append_cgroup_property(sd_bus_message *m, const char *field, cons "IOAccounting", "BlockIOAccounting", "TasksAccounting", - "IPAccounting")) + "IPAccounting", + "CoredumpReceive")) return bus_append_parse_boolean(m, field, eq); if (STR_IN_SET(field, "CPUWeight", diff --git a/src/shared/cgroup-show.c b/src/shared/cgroup-show.c index d2d0339910..32e1176a57 100644 --- a/src/shared/cgroup-show.c +++ b/src/shared/cgroup-show.c @@ -128,33 +128,6 @@ static int show_cgroup_one_by_path( return 0; } -static int is_delegated(int cgfd, const char *path) { - _cleanup_free_ char *b = NULL; - int r; - - assert(cgfd >= 0 || path); - - const char *t = cgfd >= 0 ? FORMAT_PROC_FD_PATH(cgfd) : path; - - r = getxattr_malloc(t, "trusted.delegate", &b); - if (ERRNO_IS_NEG_XATTR_ABSENT(r)) { - /* If the trusted xattr isn't set (preferred), then check the untrusted one. Under the - * assumption that whoever is trusted enough to own the cgroup, is also trusted enough to - * decide if it is delegated or not this should be safe. 
*/ - r = getxattr_malloc(t, "user.delegate", &b); - if (ERRNO_IS_NEG_XATTR_ABSENT(r)) - return false; - } - if (r < 0) - return log_debug_errno(r, "Failed to read delegate xattr from %s, ignoring: %m", t); - - r = parse_boolean(b); - if (r < 0) - return log_debug_errno(r, "Failed to parse delegate xattr from %s, ignoring: %m", t); - - return r; -} - static int show_cgroup_name( const char *path, const char *prefix, @@ -173,7 +146,10 @@ static int show_cgroup_name( log_debug_errno(errno, "Failed to open cgroup '%s', ignoring: %m", path); } - delegate = is_delegated(fd, path) > 0; + r = cg_is_delegated(fd >= 0 ? FORMAT_PROC_FD_PATH(fd) : path); + if (r < 0) + log_debug_errno(r, "Failed to check if cgroup is delegated, ignoring: %m"); + delegate = r > 0; if (FLAGS_SET(flags, OUTPUT_CGROUP_ID)) { cg_file_handle fh = CG_FILE_HANDLE_INIT; diff --git a/test/units/testsuite-74.coredump.sh b/test/units/testsuite-74.coredump.sh index d30fd73717..534232a9fb 100755 --- a/test/units/testsuite-74.coredump.sh +++ b/test/units/testsuite-74.coredump.sh @@ -74,6 +74,31 @@ rm -fv /run/systemd/coredump.conf.d/99-external.conf # Wait a bit for the coredumps to get processed timeout 30 bash -c "while [[ \$(coredumpctl list -q --no-legend $CORE_TEST_BIN | wc -l) -lt 4 ]]; do sleep 1; done" +# Make sure we can forward crashes back to containers +CONTAINER="testsuite-74-container" + +mkdir -p "/var/lib/machines/$CONTAINER" +mkdir -p "/run/systemd/system/systemd-nspawn@$CONTAINER.service.d" +# Bind-mounting /etc into the container kinda defeats the purpose of --volatile=, +# but we need the ASan-related overrides scattered across /etc +cat > "/run/systemd/system/systemd-nspawn@$CONTAINER.service.d/override.conf" << EOF +[Service] +ExecStart= +ExecStart=systemd-nspawn --quiet --link-journal=try-guest --keep-unit --machine=%i --boot \ + --volatile=yes --directory=/ --bind-ro=/etc --inaccessible=/etc/machine-id +EOF +systemctl daemon-reload + +machinectl start "$CONTAINER" +timeout 60 bash 
-xec "until systemd-run -M '$CONTAINER' -q --wait --pipe true; do sleep .5; done" + +[[ "$(systemd-run -M "$CONTAINER" -q --wait --pipe coredumpctl list -q --no-legend /usr/bin/sleep | wc -l)" -eq 0 ]] +machinectl copy-to "$CONTAINER" "$MAKE_DUMP_SCRIPT" +systemd-run -M "$CONTAINER" -q --wait --pipe "$MAKE_DUMP_SCRIPT" "/usr/bin/sleep" "SIGABRT" +systemd-run -M "$CONTAINER" -q --wait --pipe "$MAKE_DUMP_SCRIPT" "/usr/bin/sleep" "SIGTRAP" +# Wait a bit for the coredumps to get processed +timeout 30 bash -c "while [[ \$(systemd-run -M $CONTAINER -q --wait --pipe coredumpctl list -q --no-legend /usr/bin/sleep | wc -l) -lt 2 ]]; do sleep 1; done" + coredumpctl SYSTEMD_LOG_LEVEL=debug coredumpctl coredumpctl --help @@ -89,7 +114,7 @@ coredumpctl --json=pretty | jq coredumpctl --json=off coredumpctl --root=/ coredumpctl --directory=/var/log/journal -coredumpctl --file="/var/log/journal/$(</etc/machine-id)/system.journal" +coredumpctl --file="/var/log/journal/$(</etc/machine-id)"/*.journal coredumpctl --since=@0 coredumpctl --since=yesterday --until=tomorrow # We should have a couple of externally stored coredumps diff --git a/units/systemd-nspawn@.service.in b/units/systemd-nspawn@.service.in index 079d6064f6..ff66d4090a 100644 --- a/units/systemd-nspawn@.service.in +++ b/units/systemd-nspawn@.service.in @@ -26,6 +26,7 @@ SuccessExitStatus=133 Slice=machine.slice Delegate=yes DelegateSubgroup=supervisor +CoredumpReceive=yes TasksMax=16384 {{SERVICE_WATCHDOG}} |