diff options
-rw-r--r-- | docs/BLOCK_DEVICE_LOCKING.md | 19 | ||||
-rw-r--r-- | man/udevadm.xml | 116 | ||||
-rw-r--r-- | src/basic/hash-funcs.c | 10 | ||||
-rw-r--r-- | src/basic/hash-funcs.h | 7 | ||||
-rw-r--r-- | src/udev/meson.build | 1 | ||||
-rw-r--r-- | src/udev/udevadm-lock.c | 399 | ||||
-rw-r--r-- | src/udev/udevadm.c | 2 | ||||
-rw-r--r-- | src/udev/udevadm.h | 1 |
8 files changed, 549 insertions, 6 deletions
diff --git a/docs/BLOCK_DEVICE_LOCKING.md b/docs/BLOCK_DEVICE_LOCKING.md index 13ae3f6e04..40fc61c671 100644 --- a/docs/BLOCK_DEVICE_LOCKING.md +++ b/docs/BLOCK_DEVICE_LOCKING.md @@ -75,6 +75,25 @@ And please keep in mind: BSD file locks (`flock()`) and POSIX file locks orthogonal. The scheme discussed above uses the former and not the latter, because these types of locks more closely match the required semantics. +If multiple devices are to be locked at the same time (for example in order to +format a RAID file system), the devices should be locked in the order of +the device nodes' major numbers (primary ordering key, ascending) and minor +numbers (secondary ordering key, ditto), in order to avoid ABBA locking issues +between subsystems. + +Note that the locks should only be taken while the device is repartitioned, +file systems formatted or `dd`'ed in, and similar cases that +apply/remove/change superblocks/partition information. They should not be held +during normal operation, i.e. while file systems on it are mounted for +application use. + +The [`udevadm +lock`](https://www.freedesktop.org/software/systemd/man/udevadm.html) command +is provided to lock block devices following this scheme from the command line, +for use in scripts and similar. (Note though that it's typically preferable +to use native support for block device locking in tools where that's +available.) 
+ Summarizing: it is recommended to take `LOCK_EX` BSD file locks when manipulating block devices in all tools that change file system block devices (`mkfs`, `fsck`, …) or partition tables (`fdisk`, `parted`, …), right after diff --git a/man/udevadm.xml b/man/udevadm.xml index e299a75879..3248cfd256 100644 --- a/man/udevadm.xml +++ b/man/udevadm.xml @@ -51,6 +51,9 @@ <cmdsynopsis> <command>udevadm wait <optional>options</optional> <replaceable>device|syspath</replaceable></command> </cmdsynopsis> + <cmdsynopsis> + <command>udevadm lock <optional>options</optional> <replaceable>command</replaceable></command> + </cmdsynopsis> </refsynopsisdiv> <refsect1><title>Description</title> @@ -747,6 +750,87 @@ <xi:include href="standard-options.xml" xpointer="help" /> </variablelist> </refsect2> + + <refsect2> + <title>udevadm lock + <arg choice="opt"><replaceable>options</replaceable></arg> + <arg choice="opt"><replaceable>command</replaceable></arg> + … + </title> + + <para><command>udevadm lock</command> takes (advisory) exclusive lock(s) on a block device (or + multiple thereof), as per <ulink url="https://systemd.io/BLOCK_DEVICE_LOCKING">Locking Block Device + Access</ulink> and invokes a program with the lock(s) taken. When the invoked program exits the lock(s) + are automatically released.</para> + + <para>This tool is in particular useful to ensure that + <citerefentry><refentrytitle>systemd-udevd.service</refentrytitle><manvolnum>8</manvolnum></citerefentry> + does not probe a block device while changes are made to it, for example partitions created or file + systems formatted. 
Note that many tools that interface with block devices natively support taking + relevant locks, see for example + <citerefentry><refentrytitle>sfdisk</refentrytitle><manvolnum>8</manvolnum></citerefentry>'s + <option>--lock</option> switch.</para> + + <para>The command expects at least one block device specified via <option>--device=</option> or + <option>--backing=</option>, and a command line to execute as arguments.</para> + + <variablelist> + <varlistentry> + <term><option>--device=<replaceable>DEVICE</replaceable></option></term> + <term><option>-d <replaceable>DEVICE</replaceable></option></term> + + <listitem><para>Takes a path to a device node of the device to lock. This switch may be used + multiple times (and in combination with <option>--backing=</option>) in order to lock multiple + devices. If a partition block device node is specified the containing "whole" block device is + automatically determined and used for the lock, as per the specification. If multiple devices are + specified, they are deduplicated, sorted by the major/minor of their device nodes and then locked + in order.</para> + + <para>This switch must be used at least once, to specify at least one device to + lock. (Alternatively, use <option>--backing=</option>, see below.)</para></listitem> + </varlistentry> + + <varlistentry> + <term><option>--backing=<replaceable>PATH</replaceable></option></term> + <term><option>-b <replaceable>PATH</replaceable></option></term> + + <listitem><para>If a path to a device node is specified, identical to + <option>--device=</option>. 
However, this switch alternatively accepts a path to a regular file or + directory, in which case the block device of the file system the file/directory resides on is + automatically determined and used as if it was specified with + <option>--device=</option>.</para></listitem> + </varlistentry> + + <varlistentry> + <term><option>--timeout=<replaceable>SECS</replaceable></option></term> + <term><option>-t <replaceable>SECS</replaceable></option></term> + + <listitem><para>Specifies how long to wait at most until all locks can be taken. Takes a value in + seconds, or in the usual supported time units, see + <citerefentry><refentrytitle>systemd.time</refentrytitle><manvolnum>7</manvolnum></citerefentry>. If + specified as zero the lock is attempted and if not successful the invocation will immediately + fail. If passed as <literal>infinity</literal> (the default) the invocation will wait indefinitely + until the lock can be acquired. If the lock cannot be taken in the specified time the specified + command will not be executed and the invocation will fail.</para></listitem> + </varlistentry> + + <varlistentry> + <term><option>--print</option></term> + <term><option>-p</option></term> + + <listitem><para>Instead of locking the specified device(s) and executing a command, just print the + device path(s) that would be locked, and execute no command. This command is useful to determine + the "whole" block device in case a partition block device is specified. The devices will be sorted + by their device node major number as primary ordering key and the minor number as secondary + ordering key (i.e. they are shown in the order they'd be locked). 
Note that the number of lines + printed here can be less than the number of <option>--device=</option> and + <option>--backing=</option> switches specified in case these resolve to the same "whole" + devices.</para></listitem> + </varlistentry> + + <xi:include href="standard-options.xml" xpointer="help" /> + </variablelist> + </refsect2> </refsect1> <refsect1> @@ -761,6 +845,38 @@ </refsect1> <refsect1> + <title>Example</title> + + <example> + <title>Format a File System</title> + + <para>Take a lock on the backing block device while creating a file system, to ensure that + <command>systemd-udevd</command> doesn't probe or announce the new superblock before it is + comprehensively written:</para> + + <programlisting># udevadm lock --device=/dev/sda1 mkfs.ext4 /dev/sda1</programlisting> + </example> + + <example> + <title>Format a RAID File System</title> + + <para>Similar, but take locks on multiple devices at once:</para> + + <programlisting># udevadm lock --device=/dev/sda1 --device=/dev/sdb1 mkfs.btrfs /dev/sda1 /dev/sdb1</programlisting> + </example> + + <example> + <title>Copy in a File System</title> + + <para>Take a lock on the backing block device while copying in a prepared file system image, to ensure + that <command>systemd-udevd</command> doesn't probe or announce the new superblock before it is fully + written:</para> + + <programlisting># udevadm lock -d /dev/sda1 dd if=fs.raw of=/dev/sda1</programlisting> + </example> + </refsect1> + + <refsect1> <title>See Also</title> <para><citerefentry> <refentrytitle>udev</refentrytitle><manvolnum>7</manvolnum> diff --git a/src/basic/hash-funcs.c b/src/basic/hash-funcs.c index 084ed0c0a2..6addb76f1b 100644 --- a/src/basic/hash-funcs.c +++ b/src/basic/hash-funcs.c @@ -102,10 +102,16 @@ DEFINE_HASH_OPS(uint64_hash_ops, uint64_t, uint64_hash_func, uint64_compare_func void devt_hash_func(const dev_t *p, struct siphash *state) { siphash24_compress(p, sizeof(dev_t), state); } +#endif int devt_compare_func(const
dev_t *a, const dev_t *b) { - return CMP(*a, *b); + int r; + + r = CMP(major(*a), major(*b)); + if (r != 0) + return r; + + return CMP(minor(*a), minor(*b)); } DEFINE_HASH_OPS(devt_hash_ops, dev_t, devt_hash_func, devt_compare_func); -#endif diff --git a/src/basic/hash-funcs.h b/src/basic/hash-funcs.h index 023cfdf530..c537c6af7e 100644 --- a/src/basic/hash-funcs.h +++ b/src/basic/hash-funcs.h @@ -102,10 +102,9 @@ extern const struct hash_ops uint64_hash_ops; * 64bit archs. Yuck! */ #if SIZEOF_DEV_T != 8 void devt_hash_func(const dev_t *p, struct siphash *state) _pure_; -int devt_compare_func(const dev_t *a, const dev_t *b) _pure_; -extern const struct hash_ops devt_hash_ops; #else #define devt_hash_func uint64_hash_func -#define devt_compare_func uint64_compare_func -#define devt_hash_ops uint64_hash_ops #endif + +int devt_compare_func(const dev_t *a, const dev_t *b) _pure_; +extern const struct hash_ops devt_hash_ops; diff --git a/src/udev/meson.build b/src/udev/meson.build index 8a2926db30..354b923291 100644 --- a/src/udev/meson.build +++ b/src/udev/meson.build @@ -6,6 +6,7 @@ udevadm_sources = files( 'udevadm-control.c', 'udevadm-hwdb.c', 'udevadm-info.c', + 'udevadm-lock.c', 'udevadm-monitor.c', 'udevadm-settle.c', 'udevadm-test.c', diff --git a/src/udev/udevadm-lock.c b/src/udev/udevadm-lock.c new file mode 100644 index 0000000000..fae464704f --- /dev/null +++ b/src/udev/udevadm-lock.c @@ -0,0 +1,399 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#include <getopt.h> +#include <stdlib.h> +#include <sys/file.h> +#include <unistd.h> + +#include "blockdev-util.h" +#include "btrfs-util.h" +#include "fd-util.h" +#include "fdset.h" +#include "main-func.h" +#include "parse-util.h" +#include "path-util.h" +#include "pretty-print.h" +#include "process-util.h" +#include "signal-util.h" +#include "sort-util.h" +#include "stat-util.h" +#include "strv.h" +#include "time-util.h" +#include "udevadm.h" + +static usec_t arg_timeout_usec = USEC_INFINITY; +static char
**arg_devices = NULL; +static char **arg_backing = NULL; +static char **arg_cmdline = NULL; +static bool arg_print = false; + +STATIC_DESTRUCTOR_REGISTER(arg_devices, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_backing, strv_freep); +STATIC_DESTRUCTOR_REGISTER(arg_cmdline, strv_freep); + +static int help(void) { + _cleanup_free_ char *link = NULL; + int r; + + r = terminal_urlify_man("udevadm", "8", &link); + if (r < 0) + return log_oom(); + + printf("%s [OPTIONS...] COMMAND\n" + "%s [OPTIONS...] --print\n" + "\n%sLock a block device and run a command.%s\n\n" + " -h --help Print this message\n" + " -V --version Print version of the program\n" + " -d --device=DEVICE Block device to lock\n" + " -b --backing=FILE File whose backing block device to lock\n" + " -t --timeout=SECS Block at most the specified time waiting for lock\n" + " -p --print Only show which block device the lock would be taken on\n" + "\nSee the %s for details.\n", + program_invocation_short_name, + program_invocation_short_name, + ansi_highlight(), + ansi_normal(), + link); + + return 0; +} + +static int parse_argv(int argc, char *argv[]) { + + static const struct option options[] = { + { "help", no_argument, NULL, 'h' }, + { "version", no_argument, NULL, 'V' }, + { "device", required_argument, NULL, 'd' }, + { "backing", required_argument, NULL, 'b' }, + { "timeout", required_argument, NULL, 't' }, + { "print", no_argument, NULL, 'p' }, + {} + }; + + int c, r; + + assert(argc >= 0); + assert(argv); + + while ((c = getopt_long(argc, argv, arg_print ? "hVd:b:t:p" : "+hVd:b:t:p", options, NULL)) >= 0) + + switch (c) { + + case 'h': + return help(); + + case 'V': + return print_version(); + + case 'd': + case 'b': { + _cleanup_free_ char *s = NULL; + char ***l = c == 'd' ?
&arg_devices : &arg_backing; + + r = path_make_absolute_cwd(optarg, &s); + if (r < 0) + return log_error_errno(r, "Failed to make path '%s' absolute: %m", optarg); + + path_simplify(s); + + if (strv_consume(l, TAKE_PTR(s)) < 0) + return log_oom(); + + strv_uniq(*l); + break; + } + + case 't': + r = parse_sec(optarg, &arg_timeout_usec); + if (r < 0) + return log_error_errno(r, "Failed to parse --timeout= parameter: %s", optarg); + break; + + case 'p': + arg_print = true; + break; + + case '?': + return -EINVAL; + + default: + assert_not_reached(); + } + + if (arg_print) { + if (optind != argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No arguments expected"); + } else { + if (optind + 1 > argc) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Too few arguments, expected command to execute."); + + arg_cmdline = strv_copy(argv + optind); + if (!arg_cmdline) + return log_oom(); + } + + if (strv_isempty(arg_devices) && strv_isempty(arg_backing)) + return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "No devices to lock specified, refusing."); + + return 1; +} + +static int find_devno( + dev_t **devnos, + size_t *n_devnos, + const char *device, + bool backing) { + + dev_t devt, whole_devt; + struct stat st; + int r; + + assert(devnos); + assert(n_devnos); + assert(*devnos || *n_devnos == 0); + assert(device); + + if (stat(device, &st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", device); + + if (S_ISBLK(st.st_mode)) + devt = st.st_rdev; + else if (!backing) + return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Not a block device: %s", device); + else if (!S_ISREG(st.st_mode) && !S_ISDIR(st.st_mode)) + return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Not a block device, regular file or directory: %s", device); + else if (major(st.st_dev) != 0) + devt = st.st_dev; + else { + _cleanup_close_ int regfd = -1; + struct stat st2; + + /* If major(st.st_dev) is zero, this might mean we are backed by btrfs, which needs special + * handling, to get the backing device
node. */ + + regfd = open(device, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (regfd < 0) + return log_error_errno(errno, "Failed to open '%s': %m", device); + + /* Extra safety: let's check we are still looking at the same file */ + if (fstat(regfd, &st2) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", device); + if (!stat_inode_same(&st, &st2)) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "File '%s' was replaced while we were looking at it.", device); + + r = btrfs_get_block_device_fd(regfd, &devt); + if (r == -ENOTTY) + return log_error_errno(SYNTHETIC_ERRNO(ENOTBLK), "Path '%s' not backed by block device.", device); + if (r < 0) + return log_error_errno(r, "Failed to acquire btrfs backing device of '%s': %m", device); + } + + r = block_get_whole_disk(devt, &whole_devt); + if (r < 0) + return log_error_errno(r, "Failed to find whole block device for '%s': %m", device); + + if (typesafe_bsearch(&whole_devt, *devnos, *n_devnos, devt_compare_func)) { + log_debug("Device %u:%u already listed for locking, ignoring.", major(whole_devt), minor(whole_devt)); + return 0; + } + + if (!GREEDY_REALLOC(*devnos, *n_devnos + 1)) + return log_oom(); + + (*devnos)[(*n_devnos)++] = whole_devt; + + /* Immediately sort again, to ensure the binary search above will work for the next device we add */ + typesafe_qsort(*devnos, *n_devnos, devt_compare_func); + return 1; +} + +static int lock_device( + const char *path, + dev_t devno, + usec_t deadline) { + + _cleanup_close_ int fd = -1; + struct stat st; + int r; + + fd = open(path, O_RDONLY|O_CLOEXEC|O_NONBLOCK); + if (fd < 0) + return log_error_errno(errno, "Failed to open '%s': %m", path); + + if (fstat(fd, &st) < 0) + return log_error_errno(errno, "Failed to stat '%s': %m", path); + + /* Extra safety: check that the device still refers to what we think it refers to */ + if (!S_ISBLK(st.st_mode) || st.st_rdev != devno) + return log_error_errno(SYNTHETIC_ERRNO(ENXIO), "Path '%s' no longer refers to specified block device
%u:%u: %m", path, major(devno), minor(devno)); + + if (flock(fd, LOCK_EX|LOCK_NB) < 0) { + + if (errno != EAGAIN) + return log_error_errno(errno, "Failed to lock device '%s': %m", path); + + if (deadline == 0) + return log_error_errno(SYNTHETIC_ERRNO(EBUSY), "Device '%s' is currently locked.", path); + + if (deadline == USEC_INFINITY) { + + log_info("Device '%s' is currently locked, waiting…", path); + + if (flock(fd, LOCK_EX) < 0) + return log_error_errno(errno, "Failed to lock device '%s': %m", path); + + } else { + _cleanup_(sigkill_waitp) pid_t flock_pid = 0; + + /* flock() doesn't support a time-out. Let's fake one then. The traditional way to do + * this is via alarm()/setitimer()/timer_create(), but that's racy, given that the + * SIGALRM might already fire between the alarm() and the flock() in which case the + * flock() is never cancelled and we lock up (this is a short time window, but with + * short timeouts on a loaded machine we might run into it, who knows?). Let's + * instead do the lock out-of-process: fork off a child that does the locking, and + * that we'll wait on and kill if it takes too long.
*/ + + log_info("Device '%s' is currently locked, waiting %s…", + path, FORMAT_TIMESPAN(usec_sub_unsigned(deadline, now(CLOCK_MONOTONIC)), 0)); + + BLOCK_SIGNALS(SIGCHLD); + + r = safe_fork("(timed-flock)", FORK_DEATHSIG|FORK_LOG, &flock_pid); + if (r < 0) + return r; + if (r == 0) { + /* Child: block until the lock is ours. flock() locks belong to the open file description, which we share with the parent, so the lock stays held through the parent's fd after we exit. */ + + if (flock(fd, LOCK_EX) < 0) { + log_error_errno(errno, "Failed to lock device '%s': %m", path); + _exit(EXIT_FAILURE); + } + + _exit(EXIT_SUCCESS); + } + + for (;;) { + siginfo_t si; + sigset_t ss; + usec_t n; + + assert_se(sigemptyset(&ss) >= 0); /* assert_se(), not assert(): these calls must still run when assertions are compiled out (NDEBUG) */ + assert_se(sigaddset(&ss, SIGCHLD) >= 0); + + n = now(CLOCK_MONOTONIC); + if (n >= deadline) + return log_error_errno(SYNTHETIC_ERRNO(ETIMEDOUT), "Timeout reached."); + + r = sigtimedwait(&ss, NULL, TIMESPEC_STORE(deadline - n)); + if (r < 0) { + if (errno != EAGAIN) + return log_error_errno(errno, "Failed to wait for SIGCHLD: %m"); + + return log_error_errno(SYNTHETIC_ERRNO(ETIMEDOUT), "Timeout reached."); + } + + assert(r == SIGCHLD); + + zero(si); + + if (waitid(P_PID, flock_pid, &si, WEXITED|WNOHANG|WNOWAIT) < 0) + return log_error_errno(errno, "Failed to wait for child: %m"); + + if (si.si_pid != 0) { + assert(si.si_pid == flock_pid); + + if (si.si_code != CLD_EXITED || si.si_status != EXIT_SUCCESS) + return log_error_errno(SYNTHETIC_ERRNO(EPROTO), "Unexpected exit status of file lock child."); + + break; + } + + log_debug("Got SIGCHLD for other child, continuing."); + } + } + } + + log_debug("Successfully locked %s (%u:%u)…", path, major(devno), minor(devno)); + + return TAKE_FD(fd); +} + +int lock_main(int argc, char *argv[], void *userdata) { + _cleanup_(fdset_freep) FDSet *fds = NULL; + _cleanup_free_ dev_t *devnos = NULL; + size_t n_devnos = 0; + usec_t deadline; + pid_t pid; + int r; + + r = parse_argv(argc, argv); + if (r <= 0) + return r; + + STRV_FOREACH(i, arg_devices) { + r = find_devno(&devnos, &n_devnos, *i, /* backing= */ false); + if (r < 0) + return r; + } + + STRV_FOREACH(i, arg_backing) { + r =
find_devno(&devnos, &n_devnos, *i, /* backing= */ true); + if (r < 0) + return r; + } + + assert(n_devnos > 0); + + fds = fdset_new(); + if (!fds) + return log_oom(); + + if (IN_SET(arg_timeout_usec, 0, USEC_INFINITY)) + deadline = arg_timeout_usec; + else + deadline = usec_add(now(CLOCK_MONOTONIC), arg_timeout_usec); + + for (size_t i = 0; i < n_devnos; i++) { + _cleanup_free_ char *node = NULL; + + r = device_path_make_canonical(S_IFBLK, devnos[i], &node); + if (r < 0) + return log_error_errno(r, "Failed to format block device path: %m"); + + if (arg_print) + printf("%s\n", node); + else { + _cleanup_close_ int fd = -1; + + fd = lock_device(node, devnos[i], deadline); + if (fd < 0) + return fd; + + r = fdset_put(fds, fd); + if (r < 0) + return log_oom(); + + TAKE_FD(fd); + } + } + + if (arg_print) + return EXIT_SUCCESS; + + /* Ignore SIGINT and allow the forked process to receive it */ + (void) ignore_signals(SIGINT); + + r = safe_fork("(lock)", FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_CLOSE_ALL_FDS|FORK_RLIMIT_NOFILE_SAFE|FORK_LOG, &pid); + if (r < 0) + return r; + if (r == 0) { + /* Child */ + + execvp(arg_cmdline[0], arg_cmdline); + log_open(); + log_error_errno(errno, "Failed to execute %s: %m", arg_cmdline[0]); + _exit(EXIT_FAILURE); + } + + return wait_for_terminate_and_check(arg_cmdline[0], pid, 0); +} diff --git a/src/udev/udevadm.c b/src/udev/udevadm.c index df23a60c1a..cd3786b416 100644 --- a/src/udev/udevadm.c +++ b/src/udev/udevadm.c @@ -27,6 +27,7 @@ static int help(void) { { "test", "Test an event run" }, { "test-builtin", "Test a built-in command" }, { "wait", "Wait for device or device symlink" }, + { "lock", "Lock a block device" }, }; _cleanup_free_ char *link = NULL; @@ -103,6 +104,7 @@ static int udevadm_main(int argc, char *argv[]) { { "test", VERB_ANY, VERB_ANY, 0, test_main }, { "test-builtin", VERB_ANY, VERB_ANY, 0, builtin_main }, { "wait", VERB_ANY, VERB_ANY, 0, wait_main }, + { "lock", VERB_ANY, VERB_ANY, 0, lock_main }, { "version",
VERB_ANY, VERB_ANY, 0, version_main }, { "help", VERB_ANY, VERB_ANY, 0, help_main }, {} diff --git a/src/udev/udevadm.h b/src/udev/udevadm.h index 808294ec9d..417611affe 100644 --- a/src/udev/udevadm.h +++ b/src/udev/udevadm.h @@ -14,6 +14,7 @@ int hwdb_main(int argc, char *argv[], void *userdata); int test_main(int argc, char *argv[], void *userdata); int builtin_main(int argc, char *argv[], void *userdata); int wait_main(int argc, char *argv[], void *userdata); +int lock_main(int argc, char *argv[], void *userdata); static inline int print_version(void) { /* Dracut relies on the version being a single integer */ |