summaryrefslogtreecommitdiffstats
path: root/Create.c
diff options
context:
space:
mode:
authorLogan Gunthorpe <logang@deltatee.com>2024-06-04 18:38:36 +0200
committerMariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>2024-06-13 15:19:34 +0200
commit1a5c0e60308651a20d25ff52511230a20d830330 (patch)
tree571e67b8dd22062f749a8da9afcc0ddd35674ed0 /Create.c
parentimsm: make freesize required to volume autolayout (diff)
downloadmdadm-1a5c0e60308651a20d25ff52511230a20d830330.tar.xz
mdadm-1a5c0e60308651a20d25ff52511230a20d830330.zip
mdadm: Fix hang race condition in wait_for_zero_forks()
Running a create operation with --write-zeros can randomly hang forever waiting for child processes. This happens roughly one in ten runs when running with small (20MB) loop devices. The bug is caused by the fact that signals can be coalesced into one if they are not read by signalfd quickly enough. So if two children finish at exactly the same time, only one SIGCHLD will be received by the parent. To fix this, wait on all processes with WNOHANG every time a SIGCHLD is received and exit when all processes have been waited on. Reported-by: Xiao Ni <xni@redhat.com> Signed-off-by: Logan Gunthorpe <logang@deltatee.com> Signed-off-by: Mariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
Diffstat (limited to 'Create.c')
-rw-r--r--Create.c28
1 files changed, 15 insertions, 13 deletions
diff --git a/Create.c b/Create.c
index d033eb68..4f992a22 100644
--- a/Create.c
+++ b/Create.c
@@ -178,6 +178,7 @@ static int wait_for_zero_forks(int *zero_pids, int count)
bool interrupted = false;
sigset_t sigset;
ssize_t s;
+ pid_t pid;
for (i = 0; i < count; i++)
if (zero_pids[i])
@@ -196,7 +197,7 @@ static int wait_for_zero_forks(int *zero_pids, int count)
return 1;
}
- while (1) {
+ while (wait_count) {
s = read(sfd, &fdsi, sizeof(fdsi));
if (s != sizeof(fdsi)) {
pr_err("Invalid signalfd read: %s\n", strerror(errno));
@@ -209,23 +210,24 @@ static int wait_for_zero_forks(int *zero_pids, int count)
pr_info("Interrupting zeroing processes, please wait...\n");
interrupted = true;
} else if (fdsi.ssi_signo == SIGCHLD) {
- if (!--wait_count)
- break;
+ for (i = 0; i < count; i++) {
+ if (!zero_pids[i])
+ continue;
+
+ pid = waitpid(zero_pids[i], &wstatus, WNOHANG);
+ if (pid <= 0)
+ continue;
+
+ zero_pids[i] = 0;
+ if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
+ ret = 1;
+ wait_count--;
+ }
}
}
close(sfd);
- for (i = 0; i < count; i++) {
- if (!zero_pids[i])
- continue;
-
- waitpid(zero_pids[i], &wstatus, 0);
- zero_pids[i] = 0;
- if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
- ret = 1;
- }
-
if (interrupted) {
pr_err("zeroing interrupted!\n");
return 1;