diff options
author | Lennart Poettering <lennart@poettering.net> | 2022-04-07 16:09:45 +0200 |
---|---|---|
committer | Yu Watanabe <watanabe.yu+github@gmail.com> | 2022-04-10 15:52:29 +0200 |
commit | 41bc484906e973f6c91f8398a3ba309b7ee16f5a (patch) | |
tree | 57fad1132c0a90f01e527ac948f898ddea9863a4 /src | |
parent | test-loop-block: reenable test on CI (diff) | |
download | systemd-41bc484906e973f6c91f8398a3ba309b7ee16f5a.tar.xz systemd-41bc484906e973f6c91f8398a3ba309b7ee16f5a.zip |
tree-wide: take BSD lock on loopback devices we dissect/mount/operate on
So here's something we should always keep in mind:
systemd-udevd actually does *two* things with BSD file locks on block
devices:
1. While it probes a device it takes a LOCK_SH lock. Thus everyone else
taking a LOCK_EX lock will temporarily block udev from probing
devices, which is good when making changes to it.
2. Whenever a device is closed after write (detected via inotify), udevd
will issue BLKRRPART (requesting the kernel to reread the partition
table). It does this while holding a LOCK_EX lock on the block
device. Thus anyone else taking LOCK_SH or LOCK_EX will temporarily
block udevd from issuing that ioctl. And that's quite relevant, since
the kernel will temporarily flush out all partitions while re-reading
the partition table and then create them anew. Thus it is smart to
take LOCK_SH when dissecting a block device to ensure that no
BLKRRPART is issued in the background, until we mounted the devices.
Diffstat (limited to 'src')
-rw-r--r-- | src/core/namespace.c | 14 | ||||
-rw-r--r-- | src/dissect/dissect.c | 14 | ||||
-rw-r--r-- | src/gpt-auto-generator/gpt-auto-generator.c | 7 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 7 | ||||
-rw-r--r-- | src/portable/portable.c | 4 | ||||
-rw-r--r-- | src/shared/discover-image.c | 6 | ||||
-rw-r--r-- | src/shared/dissect-image.c | 17 | ||||
-rw-r--r-- | src/sysext/sysext.c | 4 | ||||
-rw-r--r-- | src/test/test-loop-block.c | 24 |
9 files changed, 97 insertions, 0 deletions
diff --git a/src/core/namespace.c b/src/core/namespace.c index e6aa7b6473..5fa7a59bc5 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -2055,6 +2055,12 @@ int setup_namespace( if (r < 0) return log_debug_errno(r, "Failed to create loop device for root image: %m"); + /* Make sure udevd won't issue BLKRRPART (which might flush out the loaded partition table) + * while we are still trying to mount things */ + r = loop_device_flock(loop_device, LOCK_SH); + if (r < 0) + return log_debug_errno(r, "Failed to lock loopback device with LOCK_SH: %m"); + r = dissect_image( loop_device->fd, &verity, @@ -2403,6 +2409,14 @@ int setup_namespace( goto finish; } + /* Now release the block device lock, so that udevd is free to call BLKRRPART on the device + * if it likes. */ + r = loop_device_flock(loop_device, LOCK_UN); + if (r < 0) { + log_debug_errno(r, "Failed to release lock on loopback block device: %m"); + goto finish; + } + if (decrypted_image) { r = decrypted_image_relinquish(decrypted_image); if (r < 0) { diff --git a/src/dissect/dissect.c b/src/dissect/dissect.c index 1d78dc5085..51e3c4b837 100644 --- a/src/dissect/dissect.c +++ b/src/dissect/dissect.c @@ -639,6 +639,10 @@ static int action_mount(DissectedImage *m, LoopDevice *d) { if (r < 0) return r; + r = loop_device_flock(d, LOCK_UN); + if (r < 0) + return log_error_errno(r, "Failed to unlock loopback block device: %m"); + if (di) { r = decrypted_image_relinquish(di); if (r < 0) @@ -687,6 +691,10 @@ static int action_copy(DissectedImage *m, LoopDevice *d) { mounted_dir = TAKE_PTR(created_dir); + r = loop_device_flock(d, LOCK_UN); + if (r < 0) + return log_error_errno(r, "Failed to unlock loopback block device: %m"); + if (di) { r = decrypted_image_relinquish(di); if (r < 0) @@ -845,6 +853,12 @@ static int run(int argc, char *argv[]) { if (r < 0) return log_error_errno(r, "Failed to set up loopback device for %s: %m", arg_image); + /* Make sure udevd doesn't issue BLKRRPART underneath us thus making devices disappear in the middle, + * that we assume already are there. */ + r = loop_device_flock(d, LOCK_SH); + if (r < 0) + return log_error_errno(r, "Failed to lock loopback device: %m"); + r = dissect_image_and_warn( d->fd, arg_image, diff --git a/src/gpt-auto-generator/gpt-auto-generator.c b/src/gpt-auto-generator/gpt-auto-generator.c index 08bb9ac724..4731687b7f 100644 --- a/src/gpt-auto-generator/gpt-auto-generator.c +++ b/src/gpt-auto-generator/gpt-auto-generator.c @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ #include <stdlib.h> +#include <sys/file.h> #include <unistd.h> #include "sd-device.h" @@ -696,6 +697,12 @@ static int enumerate_partitions(dev_t devnum) { if (r <= 0) return r; + /* Let's take a LOCK_SH lock on the block device, in case udevd is already running. If we don't take + * the lock, udevd might end up issuing BLKRRPART in the middle, and we don't want that, since that + * might remove all partitions while we are operating on them. */ + if (flock(fd, LOCK_SH) < 0) + return log_error_errno(errno, "Failed to lock root block device: %m"); + r = dissect_image( fd, NULL, NULL, diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index a5b8265303..807c666980 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -5737,6 +5737,13 @@ static int run(int argc, char *argv[]) { goto finish; } + /* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART in our back */ + r = loop_device_flock(loop, LOCK_SH); + if (r < 0) { + log_error_errno(r, "Failed to take lock on loopback block device: %m"); + goto finish; + } + r = dissect_image_and_warn( loop->fd, arg_image, diff --git a/src/portable/portable.c b/src/portable/portable.c index 0c10b161f0..7bba7b47e4 100644 --- a/src/portable/portable.c +++ b/src/portable/portable.c @@ -359,6 +359,10 @@ static int portable_extract_by_path( /* We now have a loopback block device, let's fork off a child in its own mount namespace, mount it * there, and extract the metadata we need. The metadata is sent from the child back to us. */ + r = loop_device_flock(d, LOCK_SH); + if (r < 0) + return log_debug_errno(r, "Failed to acquire lock on loopback block device: %m"); + BLOCK_SIGNALS(SIGCHLD); r = mkdtemp_malloc("/tmp/inspect-XXXXXX", &tmpdir); diff --git a/src/shared/discover-image.c b/src/shared/discover-image.c index 3f1644ea85..ea0afb81a5 100644 --- a/src/shared/discover-image.c +++ b/src/shared/discover-image.c @@ -1196,6 +1196,12 @@ int image_read_metadata(Image *i) { if (r < 0) return r; + /* Make sure udevd doesn't issue BLKRRPART in the background which might make our partitions + * disappear temporarily. */ + r = loop_device_flock(d, LOCK_SH); + if (r < 0) + return r; + r = dissect_image( d->fd, NULL, NULL, diff --git a/src/shared/dissect-image.c b/src/shared/dissect-image.c index a84df05e3f..25fed4cf50 100644 --- a/src/shared/dissect-image.c +++ b/src/shared/dissect-image.c @@ -2980,6 +2980,11 @@ int mount_image_privately_interactively( if (r < 0) return log_error_errno(r, "Failed to set up loopback device for %s: %m", image); + /* Make sure udevd doesn't issue BLKRRPART behind our backs */ + r = loop_device_flock(d, LOCK_SH); + if (r < 0) + return r; + r = dissect_image_and_warn(d->fd, image, &verity, NULL, d->diskseq, d->uevent_seqnum_not_before, d->timestamp_not_before, flags, &dissected_image); if (r < 0) return r; @@ -3006,6 +3011,10 @@ int mount_image_privately_interactively( if (r < 0) return r; + r = loop_device_flock(d, LOCK_UN); + if (r < 0) + return r; + if (decrypted_image) { r = decrypted_image_relinquish(decrypted_image); if (r < 0) @@ -3086,6 +3095,10 @@ int verity_dissect_and_mount( if (r < 0) return log_debug_errno(r, "Failed to create loop device for image: %m"); + r = loop_device_flock(loop_device, LOCK_SH); + if (r < 0) + return log_debug_errno(r, "Failed to lock loop device: %m"); + r = dissect_image( loop_device->fd, &verity, @@ -3133,6 +3146,10 @@ int verity_dissect_and_mount( if (r < 0) return log_debug_errno(r, "Failed to mount image: %m"); + r = loop_device_flock(loop_device, LOCK_UN); + if (r < 0) + return log_debug_errno(r, "Failed to unlock loopback device: %m"); + /* If we got os-release values from the caller, then we need to match them with the image's * extension-release.d/ content. Return -EINVAL if there's any mismatch. * First, check the distro ID. If that matches, then check the new SYSEXT_LEVEL value if diff --git a/src/sysext/sysext.c b/src/sysext/sysext.c index 6d4df0afd2..ccc0bd2687 100644 --- a/src/sysext/sysext.c +++ b/src/sysext/sysext.c @@ -533,6 +533,10 @@ static int merge_subprocess(Hashmap *images, const char *workspace) { if (r < 0) return log_error_errno(r, "Failed to set up loopback device for %s: %m", img->path); + r = loop_device_flock(d, LOCK_SH); + if (r < 0) + return log_error_errno(r, "Failed to lock loopback device: %m"); + r = dissect_image_and_warn( d->fd, img->path, diff --git a/src/test/test-loop-block.c b/src/test/test-loop-block.c index 714d51cfc2..d1793222f0 100644 --- a/src/test/test-loop-block.c +++ b/src/test/test-loop-block.c @@ -55,6 +55,9 @@ static void* thread_func(void *ptr) { log_notice("Acquired loop device %s, will mount on %s", loop->node, mounted); + /* Let's make sure udev doesn't call BLKRRPART in the background, while we try to mount the device. */ + assert_se(loop_device_flock(loop, LOCK_SH) >= 0); + r = dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, DISSECT_IMAGE_READ_ONLY, &dissected); if (r < 0) log_error_errno(r, "Failed dissect loopback device %s: %m", loop->node); @@ -85,6 +88,10 @@ static void* thread_func(void *ptr) { log_notice_errno(r, "Mounted %s → %s: %m", loop->node, mounted); assert_se(r >= 0); + /* Now the block device is mounted, we don't need no manual lock anymore, the devices are now + * pinned by the mounts. */ + assert_se(loop_device_flock(loop, LOCK_UN) >= 0); + log_notice("Unmounting %s", mounted); mounted = umount_and_rmdir_and_free(mounted); @@ -215,6 +222,11 @@ static int run(int argc, char *argv[]) { pthread_t threads[arg_n_threads]; sd_id128_t id; + /* Take an explicit lock while we format the file systems, in accordance with + * https://systemd.io/BLOCK_DEVICE_LOCKING/. We don't want udev to interfere and probe while we write + * or even issue BLKRRPART or similar while we are working on this. */ + assert_se(loop_device_flock(loop, LOCK_EX) >= 0); + assert_se(dissect_image(loop->fd, NULL, NULL, loop->diskseq, loop->uevent_seqnum_not_before, loop->timestamp_not_before, 0, &dissected) >= 0); assert_se(dissected->partitions[PARTITION_ESP].found); @@ -243,9 +255,21 @@ static int run(int argc, char *argv[]) { assert_se(mkdtemp_malloc(NULL, &mounted) >= 0); + /* We are particularly correct here, and now downgrade LOCK → LOCK_SH. That's because we are done + * with formatting the file systems, so we don't need the exclusive lock anymore. From now on a + * shared one is fine. This way udev can now probe the device if it wants, but still won't call + * BLKRRPART on it, and that's good, because that would destroy our partition table while we are at + * it. */ + assert_se(loop_device_flock(loop, LOCK_SH) >= 0); + /* This first (writable) mount will initialize the mount point dirs, so that the subsequent read-only ones can work */ assert_se(dissected_image_mount(dissected, mounted, UID_INVALID, UID_INVALID, 0) >= 0); + /* Now we mounted everything, the partitions are pinned. Now it's fine to release the lock + * fully. This means udev could now issue BLKRRPART again, but that's OK given this will fail because + * we now mounted the device. */ + assert_se(loop_device_flock(loop, LOCK_UN) >= 0); + assert_se(umount_recursive(mounted, 0) >= 0); loop = loop_device_unref(loop); |