summary refs log tree commit diff stats
path: root/src
diff options
context:
space:
mode:
author Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> 2019-03-01 16:08:55 +0100
committer GitHub <noreply@github.com> 2019-03-01 16:08:55 +0100
commit cc5fc36aec85b706294aec568b62cb24c05c04a3 (patch)
tree 7dab782e57cf37ff292a05bd666760293b5d5f8d /src
parent Merge pull request #11701 from poettering/discover-bls (diff)
parent man: document new systemd.volatile=overlay kernel command line option (diff)
download systemd-cc5fc36aec85b706294aec568b62cb24c05c04a3.tar.xz
systemd-cc5fc36aec85b706294aec568b62cb24c05c04a3.zip
Merge pull request #11243 from poettering/nspawn-root-overlay
add systemd-nspawn --volatile=overlay support, as well as the same for host systems
Diffstat (limited to 'src')
-rw-r--r-- src/basic/copy.c 13
-rw-r--r-- src/basic/copy.h 3
-rw-r--r-- src/boot/bootctl.c 2
-rw-r--r-- src/fstab-generator/fstab-generator.c 5
-rw-r--r-- src/gpt-auto-generator/gpt-auto-generator.c 26
-rw-r--r-- src/import/export-raw.c 2
-rw-r--r-- src/import/import-raw.c 2
-rw-r--r-- src/import/pull-raw.c 2
-rw-r--r-- src/nspawn/nspawn-mount.c 115
-rw-r--r-- src/nspawn/nspawn-mount.h 3
-rw-r--r-- src/nspawn/nspawn.c 88
-rw-r--r-- src/shared/machine-image.c 2
-rw-r--r-- src/shared/volatile-util.c 20
-rw-r--r-- src/shared/volatile-util.h 1
-rw-r--r-- src/volatile-root/volatile-root.c 118
15 files changed, 293 insertions, 109 deletions
diff --git a/src/basic/copy.c b/src/basic/copy.c
index 46e02a3759..2f36c8eb87 100644
--- a/src/basic/copy.c
+++ b/src/basic/copy.c
@@ -743,7 +743,7 @@ int copy_file_fd_full(
r = copy_bytes_full(fdf, fdt, (uint64_t) -1, copy_flags, NULL, NULL, progress_bytes, userdata);
- (void) copy_times(fdf, fdt);
+ (void) copy_times(fdf, fdt, copy_flags);
(void) copy_xattr(fdf, fdt);
return r;
@@ -849,10 +849,9 @@ int copy_file_atomic_full(
return 0;
}
-int copy_times(int fdf, int fdt) {
+int copy_times(int fdf, int fdt, CopyFlags flags) {
struct timespec ut[2];
struct stat st;
- usec_t crtime = 0;
assert(fdf >= 0);
assert(fdt >= 0);
@@ -866,8 +865,12 @@ int copy_times(int fdf, int fdt) {
if (futimens(fdt, ut) < 0)
return -errno;
- if (fd_getcrtime(fdf, &crtime) >= 0)
- (void) fd_setcrtime(fdt, crtime);
+ if (FLAGS_SET(flags, COPY_CRTIME)) {
+ usec_t crtime;
+
+ if (fd_getcrtime(fdf, &crtime) >= 0)
+ (void) fd_setcrtime(fdt, crtime);
+ }
return 0;
}
diff --git a/src/basic/copy.h b/src/basic/copy.h
index f677021881..a33546d3ab 100644
--- a/src/basic/copy.h
+++ b/src/basic/copy.h
@@ -14,6 +14,7 @@ typedef enum CopyFlags {
COPY_REPLACE = 1 << 2, /* Replace an existing file if there's one */
COPY_SAME_MOUNT = 1 << 3, /* Don't descend recursively into other file systems, across mount point boundaries */
COPY_MERGE_EMPTY = 1 << 4, /* Merge an existing, empty directory with our new tree to copy */
+ COPY_CRTIME = 1 << 5, /* Generate a user.crtime_usec xattr off the source crtime if there is one, on copying */
} CopyFlags;
typedef int (*copy_progress_bytes_t)(uint64_t n_bytes, void *userdata);
@@ -57,5 +58,5 @@ static inline int copy_bytes(int fdf, int fdt, uint64_t max_bytes, CopyFlags cop
return copy_bytes_full(fdf, fdt, max_bytes, copy_flags, NULL, NULL, NULL, NULL);
}
-int copy_times(int fdf, int fdt);
+int copy_times(int fdf, int fdt, CopyFlags flags);
int copy_xattr(int fdf, int fdt);
diff --git a/src/boot/bootctl.c b/src/boot/bootctl.c
index a529989ea0..1e0d115fe3 100644
--- a/src/boot/bootctl.c
+++ b/src/boot/bootctl.c
@@ -494,7 +494,7 @@ static int copy_file_with_version_check(const char *from, const char *to, bool f
return log_error_errno(r, "Failed to copy data from \"%s\" to \"%s\": %m", from, t);
}
- (void) copy_times(fd_from, fd_to);
+ (void) copy_times(fd_from, fd_to, 0);
if (fsync(fd_to) < 0) {
(void) unlink_noerrno(t);
diff --git a/src/fstab-generator/fstab-generator.c b/src/fstab-generator/fstab-generator.c
index 30a6d356d0..d1bfa775e4 100644
--- a/src/fstab-generator/fstab-generator.c
+++ b/src/fstab-generator/fstab-generator.c
@@ -722,10 +722,11 @@ static int add_sysroot_usr_mount(void) {
}
static int add_volatile_root(void) {
+
/* Let's add in systemd-remount-volatile.service which will remount the root device to tmpfs if this is
- * requested, leaving only /usr from the root mount inside. */
+ * requested (or as an overlayfs), leaving only /usr from the root mount inside. */
- if (arg_volatile_mode != VOLATILE_YES)
+ if (!IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY))
return 0;
return generator_add_symlink(arg_dest, SPECIAL_INITRD_ROOT_FS_TARGET, "requires",
diff --git a/src/gpt-auto-generator/gpt-auto-generator.c b/src/gpt-auto-generator/gpt-auto-generator.c
index 2f8ccd025b..0f1e184eea 100644
--- a/src/gpt-auto-generator/gpt-auto-generator.c
+++ b/src/gpt-auto-generator/gpt-auto-generator.c
@@ -18,6 +18,7 @@
#include "efivars.h"
#include "fd-util.h"
#include "fileio.h"
+#include "fs-util.h"
#include "fstab-util.h"
#include "generator.h"
#include "gpt.h"
@@ -533,7 +534,7 @@ static int add_root_rw(DissectedPartition *p) {
return 0;
}
-static int open_parent(dev_t devnum, int *ret) {
+static int open_parent_devno(dev_t devnum, int *ret) {
_cleanup_(sd_device_unrefp) sd_device *d = NULL;
const char *name, *devtype, *node;
sd_device *parent;
@@ -601,7 +602,7 @@ static int enumerate_partitions(dev_t devnum) {
_cleanup_(dissected_image_unrefp) DissectedImage *m = NULL;
int r, k;
- r = open_parent(devnum, &fd);
+ r = open_parent_devno(devnum, &fd);
if (r <= 0)
return r;
@@ -763,8 +764,25 @@ static int add_mounts(void) {
if (r < 0)
return log_error_errno(r, "Failed to determine block device of /usr file system: %m");
if (r == 0) {
- log_debug("Neither root nor /usr file system are on a (single) block device.");
- return 0;
+ _cleanup_free_ char *p = NULL;
+ mode_t m;
+
+ /* If the root mount has been replaced by some form of volatile file system (overlayfs), the
+ * original root block device node is symlinked in /run/systemd/volatile-root. Let's read that
+ * here. */
+ r = readlink_malloc("/run/systemd/volatile-root", &p);
+ if (r == -ENOENT) {
+ log_debug("Neither root nor /usr file system are on a (single) block device.");
+ return 0;
+ }
+ if (r < 0)
+ return log_error_errno(r, "Failed to read symlink /run/systemd/volatile-root: %m");
+
+ r = device_path_parse_major_minor(p, &m, &devno);
+ if (r < 0)
+ return log_error_errno(r, "Failed to parse major/minor device node: %m");
+ if (!S_ISBLK(m))
+ return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Volatile root device is of wrong type.");
}
}
diff --git a/src/import/export-raw.c b/src/import/export-raw.c
index 6a02b47a17..c1c946cd2b 100644
--- a/src/import/export-raw.c
+++ b/src/import/export-raw.c
@@ -223,7 +223,7 @@ static int raw_export_process(RawExport *e) {
finish:
if (r >= 0) {
- (void) copy_times(e->input_fd, e->output_fd);
+ (void) copy_times(e->input_fd, e->output_fd, COPY_CRTIME);
(void) copy_xattr(e->input_fd, e->output_fd);
}
diff --git a/src/import/import-raw.c b/src/import/import-raw.c
index 4b1161557d..56f3431a08 100644
--- a/src/import/import-raw.c
+++ b/src/import/import-raw.c
@@ -215,7 +215,7 @@ static int raw_import_finish(RawImport *i) {
return r;
if (S_ISREG(i->st.st_mode)) {
- (void) copy_times(i->input_fd, i->output_fd);
+ (void) copy_times(i->input_fd, i->output_fd, COPY_CRTIME);
(void) copy_xattr(i->input_fd, i->output_fd);
}
diff --git a/src/import/pull-raw.c b/src/import/pull-raw.c
index 3a3e015df8..72b9054e49 100644
--- a/src/import/pull-raw.c
+++ b/src/import/pull-raw.c
@@ -368,7 +368,7 @@ static int raw_pull_make_local_copy(RawPull *i) {
return log_error_errno(r, "Failed to make writable copy of image: %m");
}
- (void) copy_times(i->raw_job->disk_fd, dfd);
+ (void) copy_times(i->raw_job->disk_fd, dfd, COPY_CRTIME);
(void) copy_xattr(i->raw_job->disk_fd, dfd);
dfd = safe_close(dfd);
diff --git a/src/nspawn/nspawn-mount.c b/src/nspawn/nspawn-mount.c
index a9af889747..eb0a26ef35 100644
--- a/src/nspawn/nspawn-mount.c
+++ b/src/nspawn/nspawn-mount.c
@@ -212,6 +212,8 @@ int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only)
if (!path_is_absolute(destination))
return -EINVAL;
+ if (empty_or_root(destination))
+ return -EINVAL;
m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
if (!m)
@@ -251,6 +253,8 @@ int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
if (!path_is_absolute(path))
return -EINVAL;
+ if (empty_or_root(path))
+ return -EINVAL;
m = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
if (!m)
@@ -310,6 +314,9 @@ int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_onl
return -EINVAL;
}
+ if (empty_or_root(destination))
+ return -EINVAL;
+
m = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
if (!m)
return -ENOMEM;
@@ -849,9 +856,8 @@ int mount_custom(
return 0;
}
-int setup_volatile_state(
+static int setup_volatile_state(
const char *directory,
- VolatileMode mode,
bool userns, uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
@@ -861,11 +867,7 @@ int setup_volatile_state(
assert(directory);
- if (mode != VOLATILE_STATE)
- return 0;
-
- /* --volatile=state means we simply overmount /var
- with a tmpfs, and the rest read-only. */
+ /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */
r = bind_remount_recursive(directory, true, NULL);
if (r < 0)
@@ -886,9 +888,8 @@ int setup_volatile_state(
return mount_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
}
-int setup_volatile(
+static int setup_volatile_yes(
const char *directory,
- VolatileMode mode,
bool userns, uid_t uid_shift, uid_t uid_range,
const char *selinux_apifs_context) {
@@ -900,11 +901,8 @@ int setup_volatile(
assert(directory);
- if (mode != VOLATILE_YES)
- return 0;
-
- /* --volatile=yes means we mount a tmpfs to the root dir, and
- the original /usr to use inside it, and that read-only. */
+ /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and that
+ read-only. */
if (!mkdtemp(template))
return log_error_errno(errno, "Failed to create temporary directory: %m");
@@ -912,7 +910,7 @@ int setup_volatile(
options = "mode=755";
r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
if (r < 0)
- return log_oom();
+ goto fail;
if (r > 0)
options = buf;
@@ -961,6 +959,93 @@ fail:
return r;
}
+static int setup_volatile_overlay(
+ const char *directory,
+ bool userns, uid_t uid_shift, uid_t uid_range,
+ const char *selinux_apifs_context) {
+
+ _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL;
+ char template[] = "/tmp/nspawn-volatile-XXXXXX";
+ const char *upper, *work, *options;
+ bool tmpfs_mounted = false;
+ int r;
+
+ assert(directory);
+
+ /* --volatile=overlay means we mount an overlayfs to the root dir. */
+
+ if (!mkdtemp(template))
+ return log_error_errno(errno, "Failed to create temporary directory: %m");
+
+ options = "mode=755";
+ r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
+ if (r < 0)
+ goto finish;
+ if (r > 0)
+ options = buf;
+
+ r = mount_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
+ if (r < 0)
+ goto finish;
+
+ tmpfs_mounted = true;
+
+ upper = strjoina(template, "/upper");
+ work = strjoina(template, "/work");
+
+ if (mkdir(upper, 0755) < 0) {
+ r = log_error_errno(errno, "Failed to create %s: %m", upper);
+ goto finish;
+ }
+ if (mkdir(work, 0755) < 0) {
+ r = log_error_errno(errno, "Failed to create %s: %m", work);
+ goto finish;
+ }
+
+ /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice
+ * that the kernel allows us to do that without going through some mount point rearrangements. */
+
+ escaped_directory = shell_escape(directory, ",:");
+ escaped_upper = shell_escape(upper, ",:");
+ escaped_work = shell_escape(work, ",:");
+ if (!escaped_directory || !escaped_upper || !escaped_work) {
+ r = -ENOMEM;
+ goto finish;
+ }
+
+ options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work);
+ r = mount_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options);
+
+finish:
+ if (tmpfs_mounted)
+ (void) umount_verbose(template);
+
+ (void) rmdir(template);
+ return r;
+}
+
+int setup_volatile_mode(
+ const char *directory,
+ VolatileMode mode,
+ bool userns, uid_t uid_shift, uid_t uid_range,
+ const char *selinux_apifs_context) {
+
+ switch (mode) {
+
+ case VOLATILE_YES:
+ return setup_volatile_yes(directory, userns, uid_shift, uid_range, selinux_apifs_context);
+
+ case VOLATILE_STATE:
+ return setup_volatile_state(directory, userns, uid_shift, uid_range, selinux_apifs_context);
+
+ case VOLATILE_OVERLAY:
+ return setup_volatile_overlay(directory, userns, uid_shift, uid_range, selinux_apifs_context);
+
+ default:
+ return 0;
+ }
+}
+
/* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
_cleanup_free_ char *root_new = NULL, *root_old = NULL;
diff --git a/src/nspawn/nspawn-mount.h b/src/nspawn/nspawn-mount.h
index 8051a7d9d9..e060ca0e4d 100644
--- a/src/nspawn/nspawn-mount.h
+++ b/src/nspawn/nspawn-mount.h
@@ -49,8 +49,7 @@ int mount_sysfs(const char *dest, MountSettingsMask mount_settings);
int mount_custom(const char *dest, CustomMount *mounts, size_t n, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
-int setup_volatile(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
-int setup_volatile_state(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
+int setup_volatile_mode(const char *directory, VolatileMode mode, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context);
int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s);
int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index e0c2d711e6..5cb049e5f7 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -1308,6 +1308,9 @@ static int verify_arguments(void) {
if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
arg_kill_signal = SIGRTMIN+3;
+ if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
+ arg_read_only = true;
+
if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
/* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
* The latter is not technically a user session, but we don't need to labour the point. */
@@ -1334,6 +1337,12 @@ static int verify_arguments(void) {
if (arg_userns_chown && arg_read_only)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--read-only and --private-users-chown may not be combined.");
+ /* We don't support --private-users-chown together with any of the volatile modes since we couldn't
+ * change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a massive
+ * copy-up (in case of overlay) making the entire exercise pointless. */
+ if (arg_userns_chown && arg_volatile_mode != VOLATILE_NO)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-chown may not be combined.");
+
/* If --network-namespace-path is given with any other network-related option,
* we need to error out, to avoid conflicts between different network options. */
if (arg_network_namespace_path &&
@@ -1352,9 +1361,6 @@ static int verify_arguments(void) {
if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
- if (arg_volatile_mode != VOLATILE_NO && arg_read_only)
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --read-only with --volatile. Note that --volatile already implies a read-only base hierarchy.");
-
if (arg_expose_ports && !arg_private_network)
return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
@@ -1420,6 +1426,10 @@ static const char *timezone_from_path(const char *path) {
"/usr/share/zoneinfo/");
}
+static bool etc_writable(void) {
+ return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
+}
+
static int setup_timezone(const char *dest) {
_cleanup_free_ char *p = NULL, *etc = NULL;
const char *where, *check;
@@ -1431,9 +1441,9 @@ static int setup_timezone(const char *dest) {
if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
r = readlink_malloc("/etc/localtime", &p);
if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_OFF : TIMEZONE_DELETE;
+ m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_COPY;
+ m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
else if (r < 0) {
log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
/* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
@@ -1444,7 +1454,7 @@ static int setup_timezone(const char *dest) {
*/
return 0;
} else if (arg_timezone == TIMEZONE_AUTO)
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? TIMEZONE_BIND : TIMEZONE_SYMLINK;
+ m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
else
m = arg_timezone;
} else
@@ -1606,11 +1616,11 @@ static int setup_resolv_conf(const char *dest) {
if (arg_private_network)
m = RESOLV_CONF_OFF;
else if (have_resolv_conf(STATIC_RESOLV_CONF) > 0 && resolved_listening() > 0)
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_STATIC : RESOLV_CONF_COPY_STATIC;
+ m = etc_writable() ? RESOLV_CONF_COPY_STATIC : RESOLV_CONF_BIND_STATIC;
else if (have_resolv_conf("/etc/resolv.conf") > 0)
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_BIND_HOST : RESOLV_CONF_COPY_HOST;
+ m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
else
- m = arg_read_only && arg_volatile_mode != VOLATILE_YES ? RESOLV_CONF_OFF : RESOLV_CONF_DELETE;
+ m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
} else
m = arg_resolv_conf;
@@ -2896,6 +2906,30 @@ static int outer_child(
"Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
}
+ if (!dissected_image) {
+ /* Turn directory into bind mount */
+ r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
+ if (r < 0)
+ return r;
+ }
+
+ r = setup_pivot_root(
+ directory,
+ arg_pivot_root_new,
+ arg_pivot_root_old);
+ if (r < 0)
+ return r;
+
+ r = setup_volatile_mode(
+ directory,
+ arg_volatile_mode,
+ arg_userns_mode != USER_NAMESPACE_NO,
+ arg_uid_shift,
+ arg_uid_range,
+ arg_selinux_context);
+ if (r < 0)
+ return r;
+
if (dissected_image) {
/* Now we know the uid shift, let's now mount everything else that might be in the image. */
r = dissected_image_mount(dissected_image, directory, arg_uid_shift,
@@ -2921,38 +2955,6 @@ static int outer_child(
unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
}
- /* Turn directory into bind mount */
- r = mount_verbose(LOG_ERR, directory, directory, NULL, MS_BIND|MS_REC, NULL);
- if (r < 0)
- return r;
-
- r = setup_pivot_root(
- directory,
- arg_pivot_root_new,
- arg_pivot_root_old);
- if (r < 0)
- return r;
-
- r = setup_volatile(
- directory,
- arg_volatile_mode,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_context);
- if (r < 0)
- return r;
-
- r = setup_volatile_state(
- directory,
- arg_volatile_mode,
- arg_userns_mode != USER_NAMESPACE_NO,
- arg_uid_shift,
- arg_uid_range,
- arg_selinux_context);
- if (r < 0)
- return r;
-
/* Mark everything as shared so our mounts get propagated down. This is
* required to make new bind mounts available in systemd services
* inside the containter that create a new mount namespace.
@@ -2971,7 +2973,7 @@ static int outer_child(
if (r < 0)
return r;
- if (arg_read_only) {
+ if (arg_read_only && arg_volatile_mode == VOLATILE_NO) {
r = bind_remount_recursive(directory, true, NULL);
if (r < 0)
return log_error_errno(r, "Failed to make tree read-only: %m");
@@ -4398,7 +4400,7 @@ int main(int argc, char *argv[]) {
goto finish;
}
- r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK);
+ r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
if (r < 0) {
r = log_error_errno(r, "Failed to copy image file: %m");
goto finish;
diff --git a/src/shared/machine-image.c b/src/shared/machine-image.c
index af06ab22e8..3d61221056 100644
--- a/src/shared/machine-image.c
+++ b/src/shared/machine-image.c
@@ -870,7 +870,7 @@ int image_clone(Image *i, const char *new_name, bool read_only) {
case IMAGE_RAW:
new_path = strjoina("/var/lib/machines/", new_name, ".raw");
- r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, COPY_REFLINK);
+ r = copy_file_atomic(i->path, new_path, read_only ? 0444 : 0644, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME);
break;
case IMAGE_BLOCK:
diff --git a/src/shared/volatile-util.c b/src/shared/volatile-util.c
index 4d75bc0e96..5ca6ab3376 100644
--- a/src/shared/volatile-util.c
+++ b/src/shared/volatile-util.c
@@ -12,33 +12,35 @@
int query_volatile_mode(VolatileMode *ret) {
_cleanup_free_ char *mode = NULL;
- VolatileMode m = VOLATILE_NO;
int r;
r = proc_cmdline_get_key("systemd.volatile", PROC_CMDLINE_VALUE_OPTIONAL, &mode);
if (r < 0)
return r;
- if (r == 0)
- goto finish;
+ if (r == 0) {
+ *ret = VOLATILE_NO;
+ return 0;
+ }
if (mode) {
+ VolatileMode m;
+
m = volatile_mode_from_string(mode);
if (m < 0)
return -EINVAL;
- } else
- m = VOLATILE_YES;
- r = 1;
+ *ret = m;
+ } else
+ *ret = VOLATILE_YES;
-finish:
- *ret = m;
- return r;
+ return 1;
}
static const char* const volatile_mode_table[_VOLATILE_MODE_MAX] = {
[VOLATILE_NO] = "no",
[VOLATILE_YES] = "yes",
[VOLATILE_STATE] = "state",
+ [VOLATILE_OVERLAY] = "overlay",
};
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(volatile_mode, VolatileMode, VOLATILE_YES);
diff --git a/src/shared/volatile-util.h b/src/shared/volatile-util.h
index 8761c44ab8..2d31bb1174 100644
--- a/src/shared/volatile-util.h
+++ b/src/shared/volatile-util.h
@@ -5,6 +5,7 @@ typedef enum VolatileMode {
VOLATILE_NO,
VOLATILE_YES,
VOLATILE_STATE,
+ VOLATILE_OVERLAY,
_VOLATILE_MODE_MAX,
_VOLATILE_MODE_INVALID = -1
} VolatileMode;
diff --git a/src/volatile-root/volatile-root.c b/src/volatile-root/volatile-root.c
index 5da9ce1681..701f5a2832 100644
--- a/src/volatile-root/volatile-root.c
+++ b/src/volatile-root/volatile-root.c
@@ -3,6 +3,8 @@
#include <sys/mount.h>
#include "alloc-util.h"
+#include "blockdev-util.h"
+#include "escape.h"
#include "fs-util.h"
#include "main-func.h"
#include "mkdir.h"
@@ -17,20 +19,7 @@ static int make_volatile(const char *path) {
_cleanup_free_ char *old_usr = NULL;
int r;
- r = path_is_mount_point(path, NULL, AT_SYMLINK_FOLLOW);
- if (r < 0)
- return log_error_errno(r, "Couldn't determine whether %s is a mount point: %m", path);
- if (r == 0)
- return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
- "%s is not a mount point.", path);
-
- r = path_is_temporary_fs(path);
- if (r < 0)
- return log_error_errno(r, "Couldn't determine whether %s is a temporary file system: %m", path);
- if (r > 0) {
- log_info("%s already is a temporary file system.", path);
- return 0;
- }
+ assert(path);
r = chase_symlinks("/usr", path, CHASE_PREFIX_ROOT, &old_usr);
if (r < 0)
@@ -45,7 +34,7 @@ static int make_volatile(const char *path) {
goto finish_rmdir;
if (mkdir("/run/systemd/volatile-sysroot/usr", 0755) < 0) {
- r = -errno;
+ r = log_error_errno(errno, "Failed to create /usr directory: %m");
goto finish_umount;
}
@@ -54,8 +43,10 @@ static int make_volatile(const char *path) {
goto finish_umount;
r = bind_remount_recursive("/run/systemd/volatile-sysroot/usr", true, NULL);
- if (r < 0)
+ if (r < 0) {
+ log_error_errno(r, "Failed to remount /usr read-only: %m");
goto finish_umount;
+ }
r = umount_recursive(path, 0);
if (r < 0) {
@@ -64,7 +55,7 @@ static int make_volatile(const char *path) {
}
if (mount(NULL, "/", NULL, MS_SLAVE|MS_REC, NULL) < 0)
- log_warning_errno(errno, "Failed to remount %s MS_SLAVE|MS_REC: %m", path);
+ log_warning_errno(errno, "Failed to remount %s MS_SLAVE|MS_REC, ignoring: %m", path);
r = mount_verbose(LOG_ERR, "/run/systemd/volatile-sysroot", path, NULL, MS_MOVE, NULL);
@@ -77,9 +68,55 @@ finish_rmdir:
return r;
}
+static int make_overlay(const char *path) {
+ _cleanup_free_ char *escaped_path = NULL;
+ bool tmpfs_mounted = false;
+ const char *options = NULL;
+ int r;
+
+ assert(path);
+
+ r = mkdir_p("/run/systemd/overlay-sysroot", 0700);
+ if (r < 0)
+ return log_error_errno(r, "Couldn't create overlay sysroot directory: %m");
+
+ r = mount_verbose(LOG_ERR, "tmpfs", "/run/systemd/overlay-sysroot", "tmpfs", MS_STRICTATIME, "mode=755");
+ if (r < 0)
+ goto finish;
+
+ tmpfs_mounted = true;
+
+ if (mkdir("/run/systemd/overlay-sysroot/upper", 0755) < 0) {
+ r = log_error_errno(errno, "Failed to create /run/systemd/overlay-sysroot/upper: %m");
+ goto finish;
+ }
+
+ if (mkdir("/run/systemd/overlay-sysroot/work", 0755) < 0) {
+ r = log_error_errno(errno, "Failed to create /run/systemd/overlay-sysroot/work: %m");
+ goto finish;
+ }
+
+ escaped_path = shell_escape(path, ",:");
+ if (!escaped_path) {
+ r = log_oom();
+ goto finish;
+ }
+
+ options = strjoina("lowerdir=", escaped_path, ",upperdir=/run/systemd/overlay-sysroot/upper,workdir=/run/systemd/overlay-sysroot/work");
+ r = mount_verbose(LOG_ERR, "overlay", path, "overlay", 0, options);
+
+finish:
+ if (tmpfs_mounted)
+ (void) umount_verbose("/run/systemd/overlay-sysroot");
+
+ (void) rmdir("/run/systemd/overlay-sysroot");
+ return r;
+}
+
static int run(int argc, char *argv[]) {
VolatileMode m = _VOLATILE_MODE_INVALID;
const char *path;
+ dev_t devt;
int r;
log_setup_service();
@@ -94,10 +131,8 @@ static int run(int argc, char *argv[]) {
if (r == 0 && argc >= 2) {
/* The kernel command line always wins. However if nothing was set there, the argument passed here wins instead. */
m = volatile_mode_from_string(argv[1]);
- if (m < 0) {
- log_error("Couldn't parse volatile mode: %s", argv[1]);
- r = -EINVAL;
- }
+ if (m < 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Couldn't parse volatile mode: %s", argv[1]);
}
if (argc < 3)
@@ -116,10 +151,47 @@ static int run(int argc, char *argv[]) {
"Directory cannot be the root directory.");
}
- if (m != VOLATILE_YES)
+ if (!IN_SET(m, VOLATILE_YES, VOLATILE_OVERLAY))
return 0;
- return make_volatile(path);
+ r = path_is_mount_point(path, NULL, AT_SYMLINK_FOLLOW);
+ if (r < 0)
+ return log_error_errno(r, "Couldn't determine whether %s is a mount point: %m", path);
+ if (r == 0)
+ return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "%s is not a mount point.", path);
+
+ r = path_is_temporary_fs(path);
+ if (r < 0)
+ return log_error_errno(r, "Couldn't determine whether %s is a temporary file system: %m", path);
+ if (r > 0) {
+ log_info("%s already is a temporary file system.", path);
+ return 0;
+ }
+
+ /* We are about to replace the root directory with something else. Later code might want to know what we
+ * replaced here, hence let's save that information as a symlink we can later use. (This is particularly
+ * relevant for the overlayfs case where we'll fully obstruct the view onto the underlying device, hence
+ * querying the backing device node from the file system directly is no longer possible.) */
+ r = get_block_device_harder(path, &devt);
+ if (r < 0)
+ return log_error_errno(r, "Failed to determine device major/minor of %s: %m", path);
+ else if (r > 0) {
+ _cleanup_free_ char *dn = NULL;
+
+ r = device_path_make_major_minor(S_IFBLK, devt, &dn);
+ if (r < 0)
+ return log_error_errno(r, "Failed to format device node path: %m");
+
+ if (symlink(dn, "/run/systemd/volatile-root") < 0)
+ log_warning_errno(errno, "Failed to create symlink /run/systemd/volatile-root: %m");
+ }
+
+ if (m == VOLATILE_YES)
+ return make_volatile(path);
+ else {
+ assert(m == VOLATILE_OVERLAY);
+ return make_overlay(path);
+ }
}
DEFINE_MAIN_FUNCTION(run);