From 74c48bf5a8005f20dc4ef7b7d05b96572d880b25 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 29 Jun 2018 12:03:33 +0200 Subject: core: add special handling for devices cgroup allow lists for /dev/block/* and /dev/char/* device nodes This adds some code to handle /dev/block/* and /dev/char/* device node paths specially: instead of actually stat()ing them we'll just parse the major/minor numbers from the name. This is a useful 'hack' to allow clients to install whitelists for devices that don't actually have to exist. Also, let's similarly handle /run/systemd/inaccessible/{blk|chr}. This allows us to simplify our built-in default whitelist to not require a "ignore_enoent" mode for these nodes. In general we should be careful with hardcoding major/minor numbers, but in this case this should be safe. --- src/core/cgroup.c | 82 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 18 deletions(-) (limited to 'src/core') diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 598b396186..8f3e646ad6 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -19,6 +19,7 @@ #include "process-util.h" #include "procfs-util.h" #include "special.h" +#include "stat-util.h" #include "stdio-util.h" #include "string-table.h" #include "string-util.h" @@ -407,31 +408,76 @@ static int lookup_block_device(const char *p, dev_t *ret) { return 0; } +static int shortcut_special_device_path(const char *p, struct stat *ret) { + const char *w; + mode_t mode; + dev_t devt; + int r; + + assert(p); + assert(ret); + + if (path_equal(p, "/run/systemd/inaccessible/chr")) { + *ret = (struct stat) { + .st_mode = S_IFCHR, + .st_rdev = makedev(0, 0), + }; + return 0; + } + + if (path_equal(p, "/run/systemd/inaccessible/blk")) { + *ret = (struct stat) { + .st_mode = S_IFBLK, + .st_rdev = makedev(0, 0), + }; + return 0; + } + + w = path_startswith(p, "/dev/block/"); + if (w) + mode = S_IFBLK; + else { + w = path_startswith(p, "/dev/char/"); + if (!w) + return
-ENODEV; + + mode = S_IFCHR; + } + + r = parse_dev(w, &devt); + if (r < 0) + return r; + + *ret = (struct stat) { + .st_mode = mode, + .st_rdev = devt, + }; + + return 0; +} + static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) { struct stat st; - bool ignore_notfound; int r; assert(path); assert(acc); - if (node[0] == '-') { - /* Non-existent paths starting with "-" must be silently ignored */ - node++; - ignore_notfound = true; - } else - ignore_notfound = false; - - if (stat(node, &st) < 0) { - if (errno == ENOENT && ignore_notfound) - return 0; + /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and + * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. This + * means clients can use these path without the device node actually around */ + r = shortcut_special_device_path(node, &st); + if (r < 0) { + if (r != -ENODEV) + return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node); - return log_warning_errno(errno, "Couldn't stat device %s: %m", node); - } + if (stat(node, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device %s: %m", node); - if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) { - log_warning("%s is not a device.", node); - return -ENODEV; + if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) { + log_warning("%s is not a device.", node); + return -ENODEV; + } } if (cg_all_unified() > 0) { @@ -1098,8 +1144,8 @@ static void cgroup_context_apply( "/dev/tty\0" "rwm\0" "/dev/ptmx\0" "rwm\0" /* Allow /run/systemd/inaccessible/{chr,blk} devices for mapping InaccessiblePaths */ - "-/run/systemd/inaccessible/chr\0" "rwm\0" - "-/run/systemd/inaccessible/blk\0" "rwm\0"; + "/run/systemd/inaccessible/chr\0" "rwm\0" + "/run/systemd/inaccessible/blk\0" "rwm\0"; const char *x, *y; -- cgit v1.2.3 From 8e8b5d2e6d91180a57844b09cdbdcbc1fa466bfa Mon Sep 17 00:00:00 2001 From: Lennart 
Poettering Date: Fri, 29 Jun 2018 12:09:29 +0200 Subject: cgroups: beef up DeviceAllow= syntax a bit Previously we'd allow pattern expressions such as "char-input" to match all input devices. Internally, this would look up the right major to test in /proc/devices. With this commit the syntax is slightly extended: - "char-*" can be used to match any kind of character device, and similarly "block-*" for block devices. This expression would work previously already, but instead of actually installing a wildcard match it would install many individual matches for everything listed in /proc/devices. - "char-<MAJOR>" with "<MAJOR>" being a numerical parameter works now too. This allows clients to install whitelist items by specifying the major directly. The main reason to add these is to provide limited compat support for clients that for some reason contain whitelists with major/minor numbers (such as OCI containers). --- src/core/bpf-devices.c | 26 +++++++++++++++++++++++++ src/core/bpf-devices.h | 1 + src/core/cgroup.c | 51 +++++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 73 insertions(+), 5 deletions(-) (limited to 'src/core') diff --git a/src/core/bpf-devices.c b/src/core/bpf-devices.c index d8915244a7..dade7f0490 100644 --- a/src/core/bpf-devices.c +++ b/src/core/bpf-devices.c @@ -84,6 +84,32 @@ int cgroup_bpf_whitelist_major(BPFProgram *prog, int type, int major, const char return r; } +int cgroup_bpf_whitelist_class(BPFProgram *prog, int type, const char *acc) { + struct bpf_insn insn[] = { + BPF_JMP_IMM(BPF_JNE, BPF_REG_2, type, 5), /* compare device type */ + BPF_MOV32_REG(BPF_REG_1, BPF_REG_3), /* calculate access type */ + BPF_ALU32_IMM(BPF_AND, BPF_REG_1, 0), + BPF_JMP_REG(BPF_JNE, BPF_REG_1, BPF_REG_3, 1), /* compare access type */ + BPF_JMP_A(PASS_JUMP_OFF), /* jump to PASS */ + }; + int r, access; + + assert(prog); + assert(acc); + + access = bpf_access_type(acc); + if (access <= 0) + return -EINVAL; + + insn[2].imm = access; + + r = bpf_program_add_instructions(prog,
insn, ELEMENTSOF(insn)); + if (r < 0) + log_error_errno(r, "Extending device control BPF program failed: %m"); + + return r; +} + int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist) { struct bpf_insn pre_insn[] = { /* load device type to r2 */ diff --git a/src/core/bpf-devices.h b/src/core/bpf-devices.h index f9a6eec028..8d3de3bd94 100644 --- a/src/core/bpf-devices.h +++ b/src/core/bpf-devices.h @@ -11,6 +11,7 @@ int bpf_devices_supported(void); int cgroup_bpf_whitelist_device(BPFProgram *p, int type, int major, int minor, const char *acc); int cgroup_bpf_whitelist_major(BPFProgram *p, int type, int major, const char *acc); +int cgroup_bpf_whitelist_class(BPFProgram *prog, int type, const char *acc); int cgroup_init_device_bpf(BPFProgram **ret, CGroupDevicePolicy policy, bool whitelist); int cgroup_apply_device_bpf(Unit *u, BPFProgram *p, CGroupDevicePolicy policy, bool whitelist); diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 8f3e646ad6..dd9b992ef1 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -509,21 +509,64 @@ static int whitelist_device(BPFProgram *prog, const char *path, const char *node static int whitelist_major(BPFProgram *prog, const char *path, const char *name, char type, const char *acc) { _cleanup_fclose_ FILE *f = NULL; - char *p, *w; + char buf[2+DECIMAL_STR_MAX(unsigned)+3+4]; bool good = false; + unsigned maj; int r; assert(path); assert(acc); assert(IN_SET(type, 'b', 'c')); + if (streq(name, "*")) { + /* If the name is a wildcard, then apply this list to all devices of this type */ + + if (cg_all_unified() > 0) { + if (!prog) + return 0; + + (void) cgroup_bpf_whitelist_class(prog, type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, acc); + } else { + xsprintf(buf, "%c *:* %s", type, acc); + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? 
LOG_DEBUG : LOG_WARNING, r, + "Failed to set devices.allow on %s: %m", path); + return 0; + } + } + + if (safe_atou(name, &maj) >= 0 && DEVICE_MAJOR_VALID(maj)) { + /* The name is numeric and suitable as major. In that case, let's take is major, and create the entry + * directly */ + + if (cg_all_unified() > 0) { + if (!prog) + return 0; + + (void) cgroup_bpf_whitelist_major(prog, + type == 'c' ? BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, + maj, acc); + } else { + xsprintf(buf, "%c %u:* %s", type, maj, acc); + + r = cg_set_attribute("devices", path, "devices.allow", buf); + if (r < 0) + log_full_errno(IN_SET(r, -ENOENT, -EROFS, -EINVAL, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, + "Failed to set devices.allow on %s: %m", path); + } + + return 0; + } + f = fopen("/proc/devices", "re"); if (!f) return log_warning_errno(errno, "Cannot open /proc/devices to resolve %s (%c): %m", name, type); for (;;) { _cleanup_free_ char *line = NULL; - unsigned maj; + char *w, *p; r = read_line(f, LONG_LINE_MAX, &line); if (r < 0) @@ -576,8 +619,6 @@ static int whitelist_major(BPFProgram *prog, const char *path, const char *name, type == 'c' ? 
BPF_DEVCG_DEV_CHAR : BPF_DEVCG_DEV_BLOCK, maj, acc); } else { - char buf[2+DECIMAL_STR_MAX(unsigned)+3+4]; - sprintf(buf, "%c %u:* %s", type, @@ -1179,7 +1220,7 @@ static void cgroup_context_apply( else if ((val = startswith(a->path, "char-"))) (void) whitelist_major(prog, path, val, 'c', acc); else - log_unit_debug(u, "Ignoring device %s while writing cgroup attribute.", a->path); + log_unit_debug(u, "Ignoring device '%s' while writing cgroup attribute.", a->path); } r = cgroup_apply_device_bpf(u, prog, c->device_policy, c->device_allow); -- cgit v1.2.3 From 846b3bd61e1d575b0b28f73c4d15385f94bb1662 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 29 Jun 2018 15:57:49 +0200 Subject: stat-util: add new APIs device_path_make_{major_minor|canonical}() and device_path_parse_major_minor() device_path_make_{major_minor|canonical}() generate device node paths given a mode_t and a dev_t. We have similar code all over the place, let's unify this in one place. The former will generate a "/dev/char/MAJOR:MINOR" or "/dev/block/MAJOR:MINOR" path, and never go to disk. The latter then goes to disk and resolves that path to the actual path of the device node. device_path_parse_major_minor() reverses device_path_make_major_minor(), also without going to disk. We have similar code doing something like this at various places, let's unify this in a single set of functions.
This also allows us to teach them special tricks, for example handling of the /run/systemd/inaccessible/{blk|chr} device nodes, which we use for masking device nodes, and which do not exist in /dev/char/* and /dev/block/* --- src/basic/stat-util.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++ src/basic/stat-util.h | 4 ++ src/core/cgroup.c | 52 +------------------------ src/test/test-stat-util.c | 40 +++++++++++++++++++ 4 files changed, 144 insertions(+), 50 deletions(-) (limited to 'src/core') diff --git a/src/basic/stat-util.c b/src/basic/stat-util.c index 8b63eb360b..57700e2388 100644 --- a/src/basic/stat-util.c +++ b/src/basic/stat-util.c @@ -10,11 +10,13 @@ #include #include +#include "alloc-util.h" #include "dirent-util.h" #include "fd-util.h" #include "fs-util.h" #include "macro.h" #include "missing.h" +#include "parse-util.h" #include "stat-util.h" #include "string-util.h" @@ -319,3 +321,99 @@ int fd_verify_directory(int fd) { return stat_verify_directory(&st); } + +int device_path_make_major_minor(mode_t mode, dev_t devno, char **ret) { + const char *t; + + /* Generates the /dev/{char|block}/MAJOR:MINOR path for a dev_t */ + + if (S_ISCHR(mode)) + t = "char"; + else if (S_ISBLK(mode)) + t = "block"; + else + return -ENODEV; + + if (asprintf(ret, "/dev/%s/%u:%u", t, major(devno), minor(devno)) < 0) + return -ENOMEM; + + return 0; + +} + +int device_path_make_canonical(mode_t mode, dev_t devno, char **ret) { + _cleanup_free_ char *p = NULL; + int r; + + /* Finds the canonical path for a device, i.e. resolves the /dev/{char|block}/MAJOR:MINOR path to the end. */ + + assert(ret); + + if (major(devno) == 0 && minor(devno) == 0) { + char *s; + + /* A special hack to make sure our 'inaccessible' device nodes work. They won't have symlinks in + * /dev/block/ and /dev/char/, hence we handle them specially here. 
*/ + + if (S_ISCHR(mode)) + s = strdup("/run/systemd/inaccessible/chr"); + else if (S_ISBLK(mode)) + s = strdup("/run/systemd/inaccessible/blk"); + else + return -ENODEV; + + if (!s) + return -ENOMEM; + + *ret = s; + return 0; + } + + r = device_path_make_major_minor(mode, devno, &p); + if (r < 0) + return r; + + return chase_symlinks(p, NULL, 0, ret); +} + +int device_path_parse_major_minor(const char *path, mode_t *ret_mode, dev_t *ret_devno) { + mode_t mode; + dev_t devno; + int r; + + /* Tries to extract the major/minor directly from the device path if we can. Handles /dev/block/ and /dev/char/ + * paths, as well out synthetic inaccessible device nodes. Never goes to disk. Returns -ENODEV if the device + * path cannot be parsed like this. */ + + if (path_equal(path, "/run/systemd/inaccessible/chr")) { + mode = S_IFCHR; + devno = makedev(0, 0); + } else if (path_equal(path, "/run/systemd/inaccessible/blk")) { + mode = S_IFBLK; + devno = makedev(0, 0); + } else { + const char *w; + + w = path_startswith(path, "/dev/block/"); + if (w) + mode = S_IFBLK; + else { + w = path_startswith(path, "/dev/char/"); + if (!w) + return -ENODEV; + + mode = S_IFCHR; + } + + r = parse_dev(w, &devno); + if (r < 0) + return r; + } + + if (ret_mode) + *ret_mode = mode; + if (ret_devno) + *ret_devno = devno; + + return 0; +} diff --git a/src/basic/stat-util.h b/src/basic/stat-util.h index fe4a4bb717..0a08e642b5 100644 --- a/src/basic/stat-util.h +++ b/src/basic/stat-util.h @@ -81,3 +81,7 @@ int fd_verify_directory(int fd); typeof(x) _x = (x), _y = 0; \ _x >= _y && _x < (UINT32_C(1) << 20); \ }) + +int device_path_make_major_minor(mode_t mode, dev_t devno, char **ret); +int device_path_make_canonical(mode_t mode, dev_t devno, char **ret); +int device_path_parse_major_minor(const char *path, mode_t *ret_mode, dev_t *ret_devno); diff --git a/src/core/cgroup.c b/src/core/cgroup.c index dd9b992ef1..72af5e855f 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -408,56 +408,8 @@ 
static int lookup_block_device(const char *p, dev_t *ret) { return 0; } -static int shortcut_special_device_path(const char *p, struct stat *ret) { - const char *w; - mode_t mode; - dev_t devt; - int r; - - assert(p); - assert(ret); - - if (path_equal(p, "/run/systemd/inaccessible/chr")) { - *ret = (struct stat) { - .st_mode = S_IFCHR, - .st_rdev = makedev(0, 0), - }; - return 0; - } - - if (path_equal(p, "/run/systemd/inaccessible/blk")) { - *ret = (struct stat) { - .st_mode = S_IFBLK, - .st_rdev = makedev(0, 0), - }; - return 0; - } - - w = path_startswith(p, "/dev/block/"); - if (w) - mode = S_IFBLK; - else { - w = path_startswith(p, "/dev/char/"); - if (!w) - return -ENODEV; - - mode = S_IFCHR; - } - - r = parse_dev(w, &devt); - if (r < 0) - return r; - - *ret = (struct stat) { - .st_mode = mode, - .st_rdev = devt, - }; - - return 0; -} - static int whitelist_device(BPFProgram *prog, const char *path, const char *node, const char *acc) { - struct stat st; + struct stat st = {}; int r; assert(path); @@ -466,7 +418,7 @@ static int whitelist_device(BPFProgram *prog, const char *path, const char *node /* Some special handling for /dev/block/%u:%u, /dev/char/%u:%u, /run/systemd/inaccessible/chr and * /run/systemd/inaccessible/blk paths. Instead of stat()ing these we parse out the major/minor directly. 
This * means clients can use these path without the device node actually around */ - r = shortcut_special_device_path(node, &st); + r = device_path_parse_major_minor(node, &st.st_mode, &st.st_rdev); if (r < 0) { if (r != -ENODEV) return log_warning_errno(r, "Couldn't parse major/minor from device path '%s': %m", node); diff --git a/src/test/test-stat-util.c b/src/test/test-stat-util.c index 713fbc9a08..4201edac97 100644 --- a/src/test/test-stat-util.c +++ b/src/test/test-stat-util.c @@ -11,6 +11,7 @@ #include "missing.h" #include "mount-util.h" #include "stat-util.h" +#include "path-util.h" static void test_files_same(void) { _cleanup_close_ int fd = -1; @@ -116,6 +117,44 @@ static void test_device_major_minor_valid(void) { assert_se(DEVICE_MINOR_VALID(minor(0))); } +static void test_device_path_make_canonical_one(const char *path) { + _cleanup_free_ char *resolved = NULL, *raw = NULL; + struct stat st; + dev_t devno; + mode_t mode; + int r; + + assert_se(stat(path, &st) >= 0); + r = device_path_make_canonical(st.st_mode, st.st_rdev, &resolved); + if (r == -ENOENT) /* maybe /dev/char/x:y and /dev/block/x:y are missing in this test environment, because we + * run in a container or so? 
*/ + return; + + assert_se(r >= 0); + assert_se(path_equal(path, resolved)); + + assert_se(device_path_make_major_minor(st.st_mode, st.st_rdev, &raw) >= 0); + assert_se(device_path_parse_major_minor(raw, &mode, &devno) >= 0); + + assert_se(st.st_rdev == devno); + assert_se((st.st_mode & S_IFMT) == (mode & S_IFMT)); +} + +static void test_device_path_make_canonical(void) { + + test_device_path_make_canonical_one("/dev/null"); + test_device_path_make_canonical_one("/dev/zero"); + test_device_path_make_canonical_one("/dev/full"); + test_device_path_make_canonical_one("/dev/random"); + test_device_path_make_canonical_one("/dev/urandom"); + test_device_path_make_canonical_one("/dev/tty"); + + if (is_device_node("/run/systemd/inaccessible/chr") > 0) { + test_device_path_make_canonical_one("/run/systemd/inaccessible/chr"); + test_device_path_make_canonical_one("/run/systemd/inaccessible/blk"); + } +} + int main(int argc, char *argv[]) { test_files_same(); test_is_symlink(); @@ -123,6 +162,7 @@ int main(int argc, char *argv[]) { test_path_is_temporary_fs(); test_fd_is_network_ns(); test_device_major_minor_valid(); + test_device_path_make_canonical(); return 0; } -- cgit v1.2.3 From d5aecba6e0b7c73657c4cf544ce57289115098e7 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 2 Jul 2018 18:20:03 +0200 Subject: cgroup: use device_path_parse_major_minor() also for block device paths Not only when we populate the "devices" cgroup controller we need major/minor numbers, but for the io/blkio one it's the same, hence let's use the same logic for both. 
--- src/core/cgroup.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'src/core') diff --git a/src/core/cgroup.c b/src/core/cgroup.c index 72af5e855f..11f9611b71 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -376,16 +376,23 @@ int cgroup_add_device_allow(CGroupContext *c, const char *dev, const char *mode) } static int lookup_block_device(const char *p, dev_t *ret) { - struct stat st; + struct stat st = {}; int r; assert(p); assert(ret); - if (stat(p, &st) < 0) - return log_warning_errno(errno, "Couldn't stat device '%s': %m", p); - - if (S_ISBLK(st.st_mode)) + r = device_path_parse_major_minor(p, &st.st_mode, &st.st_rdev); + if (r == -ENODEV) { /* not a parsable device node, need to go to disk */ + if (stat(p, &st) < 0) + return log_warning_errno(errno, "Couldn't stat device '%s': %m", p); + } else if (r < 0) + return log_warning_errno(r, "Failed to parse major/minor from path '%s': %m", p); + + if (S_ISCHR(st.st_mode)) { + log_warning("Device node '%s' is a character device, but block device needed.", p); + return -ENOTBLK; + } else if (S_ISBLK(st.st_mode)) *ret = st.st_rdev; else if (major(st.st_dev) != 0) *ret = st.st_dev; /* If this is not a device node then use the block device this file is stored on */ -- cgit v1.2.3 From 30874dda3a66c0639773dd23079662fc4bf53afd Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 27 Jul 2018 18:04:11 +0200 Subject: dev-setup: generalize logic we use to create "inaccessible" device nodes Let's generalize this, so that we can use this in nspawn later on, which is pretty useful as we need to be able to mask files from the inner child of nspawn too, where the host's /run/systemd/inaccessible directory is not visible anymore. Moreover, if nspawn can create these nodes on its own before the payload this means the payload can run with fewer privileges. 
--- src/core/mount-setup.c | 17 +++---------- src/shared/dev-setup.c | 59 ++++++++++++++++++++++++++++++++++++++++++++ src/shared/dev-setup.h | 2 ++ src/test/meson.build | 4 +++ src/test/test-dev-setup.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 src/test/test-dev-setup.c (limited to 'src/core') diff --git a/src/core/mount-setup.c b/src/core/mount-setup.c index e15d94d98a..4c17395774 100644 --- a/src/core/mount-setup.c +++ b/src/core/mount-setup.c @@ -460,20 +460,9 @@ int mount_setup(bool loaded_policy) { (void) mkdir_label("/run/systemd", 0755); (void) mkdir_label("/run/systemd/system", 0755); - /* Set up inaccessible (and empty) file nodes of all types */ - (void) mkdir_label("/run/systemd/inaccessible", 0000); - (void) mknod("/run/systemd/inaccessible/reg", S_IFREG | 0000, 0); - (void) mkdir_label("/run/systemd/inaccessible/dir", 0000); - (void) mkfifo("/run/systemd/inaccessible/fifo", 0000); - (void) mknod("/run/systemd/inaccessible/sock", S_IFSOCK | 0000, 0); - - /* The following two are likely to fail if we lack the privs for it (for example in an userns environment, if - * CAP_SYS_MKNOD is missing, or if a device node policy prohibit major/minor of 0 device nodes to be - * created). But that's entirely fine. Consumers of these files should carry fallback to use a different node - * then, for example /run/systemd/inaccessible/sock, which is close enough in behaviour and semantics for most - * uses. */ - (void) mknod("/run/systemd/inaccessible/chr", S_IFCHR | 0000, makedev(0, 0)); - (void) mknod("/run/systemd/inaccessible/blk", S_IFBLK | 0000, makedev(0, 0)); + /* Also create /run/systemd/inaccessible nodes, so that we always have something to mount inaccessible nodes + * from. 
*/ + (void) make_inaccessible_nodes(NULL, UID_INVALID, GID_INVALID); return 0; } diff --git a/src/shared/dev-setup.c b/src/shared/dev-setup.c index d117fbfda0..b545c2a1c0 100644 --- a/src/shared/dev-setup.c +++ b/src/shared/dev-setup.c @@ -9,6 +9,7 @@ #include "label.h" #include "log.h" #include "path-util.h" +#include "umask-util.h" #include "user-util.h" #include "util.h" @@ -54,3 +55,61 @@ int dev_setup(const char *prefix, uid_t uid, gid_t gid) { return 0; } + +int make_inaccessible_nodes(const char *root, uid_t uid, gid_t gid) { + static const struct { + const char *name; + mode_t mode; + } table[] = { + { "/run/systemd", S_IFDIR | 0755 }, + { "/run/systemd/inaccessible", S_IFDIR | 0000 }, + { "/run/systemd/inaccessible/reg", S_IFREG | 0000 }, + { "/run/systemd/inaccessible/dir", S_IFDIR | 0000 }, + { "/run/systemd/inaccessible/fifo", S_IFIFO | 0000 }, + { "/run/systemd/inaccessible/sock", S_IFSOCK | 0000 }, + + /* The following two are likely to fail if we lack the privs for it (for example in an userns + * environment, if CAP_SYS_MKNOD is missing, or if a device node policy prohibit major/minor of 0 + * device nodes to be created). But that's entirely fine. Consumers of these files should carry + * fallback to use a different node then, for example /run/systemd/inaccessible/sock, which is close + * enough in behaviour and semantics for most uses. */ + { "/run/systemd/inaccessible/chr", S_IFCHR | 0000 }, + { "/run/systemd/inaccessible/blk", S_IFBLK | 0000 }, + }; + + _cleanup_umask_ mode_t u; + size_t i; + int r; + + u = umask(0000); + + /* Set up inaccessible (and empty) file nodes of all types. This are used to as mount sources for over-mounting + * ("masking") file nodes that shall become inaccessible and empty for specific containers or services. We try + * to lock down these nodes as much as we can, but otherwise try to match them as closely as possible with the + * underlying file, i.e. in the best case we offer the same node type as the underlying node. 
*/ + + for (i = 0; i < ELEMENTSOF(table); i++) { + _cleanup_free_ char *path = NULL; + + path = prefix_root(root, table[i].name); + if (!path) + return log_oom(); + + if (S_ISDIR(table[i].mode)) + r = mkdir(path, table[i].mode & 07777); + else + r = mknod(path, table[i].mode, makedev(0, 0)); + if (r < 0) { + if (errno != EEXIST) + log_debug_errno(errno, "Failed to create '%s', ignoring: %m", path); + continue; + } + + if (uid != UID_INVALID || gid != GID_INVALID) { + if (lchown(path, uid, gid) < 0) + log_debug_errno(errno, "Failed to chown '%s': %m", path); + } + } + + return 0; +} diff --git a/src/shared/dev-setup.h b/src/shared/dev-setup.h index f105f2f20f..72b90ec4de 100644 --- a/src/shared/dev-setup.h +++ b/src/shared/dev-setup.h @@ -4,3 +4,5 @@ #include int dev_setup(const char *prefix, uid_t uid, gid_t gid); + +int make_inaccessible_nodes(const char *root, uid_t uid, gid_t gid); diff --git a/src/test/meson.build b/src/test/meson.build index ade905733e..2635456a4f 100644 --- a/src/test/meson.build +++ b/src/test/meson.build @@ -156,6 +156,10 @@ tests += [ [], []], + [['src/test/test-dev-setup.c'], + [], + []], + [['src/test/test-capability.c'], [], [libcap]], diff --git a/src/test/test-dev-setup.c b/src/test/test-dev-setup.c new file mode 100644 index 0000000000..523cfe43b1 --- /dev/null +++ b/src/test/test-dev-setup.c @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ + +#include "capability-util.h" +#include "dev-setup.h" +#include "fileio.h" +#include "fs-util.h" +#include "path-util.h" +#include "rm-rf.h" + +int main(int argc, char *argv[]) { + _cleanup_(rm_rf_physical_and_freep) char *p = NULL; + const char *f; + struct stat st; + + if (have_effective_cap(CAP_DAC_OVERRIDE) <= 0) + return EXIT_TEST_SKIP; + + assert_se(mkdtemp_malloc("/tmp/test-dev-setupXXXXXX", &p) >= 0); + + f = prefix_roota(p, "/run"); + assert_se(mkdir(f, 0755) >= 0); + + assert_se(make_inaccessible_nodes(p, 1, 1) >= 0); + + f = prefix_roota(p, "/run/systemd/inaccessible/reg"); 
+ assert_se(stat(f, &st) >= 0); + assert_se(S_ISREG(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + + f = prefix_roota(p, "/run/systemd/inaccessible/dir"); + assert_se(stat(f, &st) >= 0); + assert_se(S_ISDIR(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + + f = prefix_roota(p, "/run/systemd/inaccessible/fifo"); + assert_se(stat(f, &st) >= 0); + assert_se(S_ISFIFO(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + + f = prefix_roota(p, "/run/systemd/inaccessible/sock"); + assert_se(stat(f, &st) >= 0); + assert_se(S_ISSOCK(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + + f = prefix_roota(p, "/run/systemd/inaccessible/chr"); + if (stat(f, &st) < 0) + assert_se(errno == ENOENT); + else { + assert_se(S_ISCHR(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + } + + f = prefix_roota(p, "/run/systemd/inaccessible/blk"); + if (stat(f, &st) < 0) + assert_se(errno == ENOENT); + else { + assert_se(S_ISBLK(st.st_mode)); + assert_se((st.st_mode & 07777) == 0000); + } + + return EXIT_SUCCESS; +} -- cgit v1.2.3