src/shared/barrier.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395

/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/eventfd.h>
#include <sys/types.h>
#include <unistd.h>

#include "barrier.h"
#include "fd-util.h"
#include "io-util.h"
#include "macro.h"

/**
 * Barriers
 * This barrier implementation provides a simple synchronization method based
 * on file-descriptors that can safely be used between threads and processes. A
 * barrier object contains 2 shared counters based on eventfd. Both processes
 * can now place barriers and wait for the other end to reach a random or
 * specific barrier.
 * Barriers are numbered, so you can either wait for the other end to reach any
 * barrier or the last barrier that you placed. This way, you can use barriers
 * for one-way *and* full synchronization. Note that even though barriers are
 * numbered, these numbers are internal and recycled once both sides reached the
 * same barrier (implemented as a simple signed counter). It is thus not
 * possible to address barriers by their ID.
 *
 * Barrier-API: Both ends can place as many barriers via barrier_place() as
 * they want and each pair of barriers on both sides will be implicitly linked.
 * Each side can use the barrier_wait/sync_*() family of calls to wait for the
 * other side to place a specific barrier. barrier_wait_next() waits until the
 * other side calls barrier_place(). No links between the barriers are
 * considered and this simply serves as most basic asynchronous barrier.
 * barrier_sync_next() is like barrier_wait_next() and waits for the other side
 * to place their next barrier via barrier_place(). However, it only waits for
 * barriers that are linked to a barrier we already placed. If the other side
 * already placed more barriers than we did, barrier_sync_next() returns
 * immediately.
 * barrier_sync() extends barrier_sync_next() and waits until the other end
 * placed as many barriers via barrier_place() as we did. If they already placed
 * as many as we did (or more), it returns immediately.
 *
 * Additionally to basic barriers, an abortion event is available.
 * barrier_abort() places an abortion event that cannot be undone. An abortion
 * immediately cancels all placed barriers and replaces them. Any running and
 * following wait/sync call besides barrier_wait_abortion() will immediately
 * return false on both sides (otherwise, they always return true).
 * barrier_abort() can be called multiple times on both ends and will be a
 * no-op if already called on this side.
 * barrier_wait_abortion() can be used to wait for the other side to call
 * barrier_abort() and is the only wait/sync call that does not return
 * immediately if we aborted ourselves. It only returns once the other side
 * called barrier_abort().
 *
 * Barriers can be used for in-process and inter-process synchronization.
 * However, for in-process synchronization you could just use mutexes.
 * Therefore, main target is IPC and we require both sides to *not* share the FD
 * table. If that's given, barriers provide target tracking: If the remote side
 * exit()s, an abortion event is implicitly queued on the other side. This way,
 * a sync/wait call will be woken up if the remote side crashed or exited
 * unexpectedly. However, note that these abortion events are only queued if the
 * barrier-queue has been drained. Therefore, it is safe to place a barrier and
 * exit. The other side can safely wait on the barrier even though the exit
 * queued an abortion event. Usually, the abortion event would overwrite the
 * barrier, however, that's not true for exit-abortion events. Those are only
 * queued if the barrier-queue is drained (thus, the receiving side has placed
 * more barriers than the remote side).
 */

/**
 * barrier_create() - Initialize a barrier object
 * @b: barrier to initialize
 *
 * This initializes a barrier object. The caller is responsible for allocating
 * the memory and keeping it valid. The memory does not have to be zeroed or
 * otherwise initialized beforehand: every field is reset before any resource
 * is allocated.
 * Two eventfd objects and one pipe are allocated for each barrier. If any of
 * these allocations fails, an error is returned.
 *
 * If this function fails, the barrier is reset to an invalid state so it is
 * safe to call barrier_destroy() on the object regardless whether the
 * initialization succeeded or not.
 *
 * The caller is responsible to destroy the object via barrier_destroy() before
 * releasing the underlying memory.
 *
 * Returns: 0 on success, negative error code on failure.
 */
int barrier_create(Barrier *b) {
        _cleanup_(barrier_destroyp) Barrier *staging = b;
        int r;

        assert(b);

        /* Invalidate all descriptors and reset the counter before allocating
         * anything. On a failure below, the cleanup attached to @staging tears
         * the object down again (presumably via barrier_destroy(), which
         * closes @me, @them and both pipe ends). Without this reset it would
         * operate on whatever stale values the caller's memory contained. */
        b->me = -1;
        b->them = -1;
        b->pipe[0] = -1;
        b->pipe[1] = -1;
        b->barriers = 0;

        b->me = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
        if (b->me < 0)
                return -errno;

        b->them = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
        if (b->them < 0)
                return -errno;

        r = pipe2(b->pipe, O_CLOEXEC | O_NONBLOCK);
        if (r < 0)
                return -errno;

        /* success: disarm the cleanup so the object stays initialized */
        staging = NULL;
        return 0;
}

/**
 * barrier_destroy() - Destroy a barrier object
 * @b: barrier to destroy or NULL
 *
 * Releases everything held by a barrier that was previously handed to
 * barrier_create(): both eventfd descriptors and the pipe pair are closed and
 * the barrier counter is reset to 0. The descriptor fields are overwritten
 * with the value returned by safe_close(), so calling this function again on
 * the same object (or after a failed barrier_create()) is harmless. The
 * barrier must, however, always have been initialized with BARRIER_NULL.
 *
 * Passing NULL for @b does nothing.
 *
 * Returns: always NULL.
 */
Barrier* barrier_destroy(Barrier *b) {
        if (b) {
                b->me = safe_close(b->me);
                b->them = safe_close(b->them);
                safe_close_pair(b->pipe);
                b->barriers = 0;
        }

        return NULL;
}

/**
 * barrier_set_role() - Set the local role of the barrier
 * @b: barrier to operate on
 * @role: role to set on the barrier, BARRIER_PARENT or BARRIER_CHILD
 *
 * Tells the barrier object which side of the barrier the caller is on.
 * Usually, the parent creates the barrier via barrier_create() and then calls
 * fork() or clone(), so the FDs are duplicated and the child retains the same
 * barrier object.
 *
 * Both sides need to call barrier_set_role() once fork() or clone() is done,
 * otherwise barriers will not work correctly. The parent closes pipe[1] and
 * keeps pipe[0]; the child closes pipe[0] and additionally exchanges its @me
 * and @them descriptors.
 *
 * Note that barriers could be supported without fork() or clone(). However,
 * this is currently not needed so it hasn't been implemented.
 */
void barrier_set_role(Barrier *b, unsigned role) {
        assert(b);
        assert(IN_SET(role, BARRIER_PARENT, BARRIER_CHILD));
        /* both pipe ends are only still open before the first call */
        assert(b->pipe[0] >= 0 && b->pipe[1] >= 0);

        if (role != BARRIER_PARENT) {
                /* child side: give up pipe[0] ... */
                b->pipe[0] = safe_close(b->pipe[0]);

                /* ... and flip the eventfds, the parent's "me" is our "them" */
                SWAP_TWO(b->me, b->them);
                return;
        }

        /* parent side: give up pipe[1] */
        b->pipe[1] = safe_close(b->pipe[1]);
}

/* Adds @buf to our eventfd counter and updates the local state accordingly.
 * Returns false if we are in an aborted state afterwards (or the write failed),
 * otherwise true. */
static bool barrier_write(Barrier *b, uint64_t buf) {
        ssize_t n;

        /* once we aborted, no further sync-points may be placed */
        if (barrier_i_aborted(b))
                return false;

        assert(b->me >= 0);

        /* retry for as long as the failure is EAGAIN or EINTR */
        for (;;) {
                n = write(b->me, &buf, sizeof(buf));
                if (n >= 0 || !IN_SET(errno, EAGAIN, EINTR))
                        break;
        }

        if (n != sizeof(buf)) {
                /* An unexpected error is fatal, there is no way to recover
                 * from sync-errors. Close the pipe-ends and treat this as
                 * abortion; the other end will notice the pipe-close and
                 * treat it as abortion, too. */
                safe_close_pair(b->pipe);
                b->barriers = BARRIER_WE_ABORTED;
                return false;
        }

        if (buf < (uint64_t)BARRIER_ABORTION) {
                /* regular barrier: count it unless somebody aborted already */
                if (!barrier_is_aborted(b))
                        b->barriers += buf;
        } else
                /* abortion: lock the state, remembering whether they aborted as well */
                b->barriers = barrier_they_aborted(b) ? BARRIER_WE_ABORTED : BARRIER_I_ABORTED;

        return !barrier_i_aborted(b);
}

/* Waits until the local barrier counter is no longer greater than @comp,
 * consuming values the other side wrote to its eventfd (our @them).
 *
 * Returns false if the other side is in an aborted state (checked on entry and
 * again on exit) or if polling/reading failed unexpectedly; otherwise true. */
static bool barrier_read(Barrier *b, int64_t comp) {
        if (barrier_they_aborted(b))
                return false;

        while (b->barriers > comp) {
                /* pfd[0]: whichever pipe end is still open, watched for hang-up only.
                 * pfd[1]: the other side's eventfd, watched for readable data. */
                struct pollfd pfd[2] = {
                        { .fd = b->pipe[0] >= 0 ? b->pipe[0] : b->pipe[1],
                          .events = POLLHUP },
                        { .fd = b->them,
                          .events = POLLIN }};
                uint64_t buf;
                int r;

                /* NOTE(review): USEC_INFINITY presumably means "no timeout" and errors
                 * come back as negative errno values -- confirm against io-util. */
                r = ppoll_usec(pfd, ELEMENTSOF(pfd), USEC_INFINITY);
                if (r == -EINTR)
                        continue;
                if (r < 0)
                        goto error;

                /* The eventfd is checked first, so pending data always wins over a
                 * hang-up reported on the pipe. */
                if (pfd[1].revents) {
                        ssize_t len;

                        /* events on @them signal new data for us */
                        len = read(b->them, &buf, sizeof(buf));
                        if (len < 0 && IN_SET(errno, EAGAIN, EINTR))
                                continue;

                        /* anything but a full 8-byte counter value is fatal */
                        if (len != sizeof(buf))
                                goto error;
                } else if (pfd[0].revents & (POLLHUP | POLLERR | POLLNVAL))
                        /* POLLHUP on the pipe tells us the other side exited.
                         * We treat this as implicit abortion. But we only
                         * handle it if there's no event on the eventfd. This
                         * guarantees that exit-abortions do not overwrite real
                         * barriers. */
                        buf = BARRIER_ABORTION;
                else
                        /* nothing of interest happened, poll again */
                        continue;

                /* lock if they aborted */
                if (buf >= (uint64_t)BARRIER_ABORTION) {
                        /* NOTE(review): the abort sentinels presumably compare below any
                         * @comp passed by callers, so the loop ends here -- confirm
                         * against the constants in barrier.h. */
                        if (barrier_i_aborted(b))
                                b->barriers = BARRIER_WE_ABORTED;
                        else
                                b->barriers = BARRIER_THEY_ABORTED;
                } else if (!barrier_is_aborted(b))
                        /* regular barriers placed by the other side reduce our count */
                        b->barriers -= buf;
        }

        return !barrier_they_aborted(b);

error:
        /* If there is an unexpected error, we have to make this fatal. There
         * is no way we can recover from sync-errors. Therefore, we close the
         * pipe-ends and treat this as abortion. The other end will notice the
         * pipe-close and treat it as abortion, too. */

        safe_close_pair(b->pipe);
        b->barriers = BARRIER_WE_ABORTED;
        return false;
}

/**
 * barrier_place() - Place a new barrier
 * @b: barrier object
 *
 * This places a new barrier on the barrier object. If either side already
 * aborted, this is a no-op and returns "false". Otherwise, the barrier is
 * written to our eventfd. If that write fails unexpectedly, the barrier object
 * is put into the aborted state and "false" is returned as well, so callers
 * never see "true" for a barrier that was not actually placed.
 *
 * Returns: true if barrier was placed, false if either side aborted or the
 *          barrier could not be written.
 */
bool barrier_place(Barrier *b) {
        assert(b);

        if (barrier_is_aborted(b))
                return false;

        /* barrier_write() returns false if it had to abort due to a write
         * error; propagate that instead of claiming success */
        return barrier_write(b, BARRIER_SINGLE);
}

/**
 * barrier_abort() - Abort the synchronization
 * @b: barrier object to abort
 *
 * Aborts the barrier-synchronization by writing an abortion event for the
 * other side. If barrier_abort() was already called on this side, this is a
 * no-op. Otherwise, the barrier enters the ABORT-state and stays there. Any
 * following attempt to place normal barriers or to wait on normal barriers
 * will return immediately as "false".
 *
 * Use barrier_wait_abortion() to wait for the other side to call
 * barrier_abort(), too.
 *
 * Returns: false if the other side already aborted, true otherwise.
 */
bool barrier_abort(Barrier *b) {
        bool peer_aborted;

        assert(b);

        /* the outcome is derived from the resulting state, not from the write */
        (void) barrier_write(b, BARRIER_ABORTION);

        peer_aborted = barrier_they_aborted(b);
        return !peer_aborted;
}

/**
 * barrier_wait_next() - Wait for the next barrier of the other side
 * @b: barrier to operate on
 *
 * Blocks until the other side places its next barrier, i.e. until the local
 * counter has dropped by one compared to its value on entry. Barrier-links are
 * not taken into account; any next barrier of the other side will do.
 *
 * Returns: false if either side aborted, true otherwise.
 */
bool barrier_wait_next(Barrier *b) {
        int64_t target;

        assert(b);

        if (barrier_is_aborted(b))
                return false;

        /* one below the current count == exactly one more barrier from them */
        target = b->barriers - 1;
        (void) barrier_read(b, target);

        return !barrier_is_aborted(b);
}

/**
 * barrier_wait_abortion() - Wait for the other side to abort
 * @b: barrier to operate on
 *
 * Blocks until the other side called barrier_abort(). Unlike the other
 * wait/sync calls, this may be used no matter whether the local side already
 * called barrier_abort() itself.
 *
 * If the other side has already aborted, this returns immediately.
 *
 * Returns: false if the local side aborted, true otherwise.
 */
bool barrier_wait_abortion(Barrier *b) {
        bool self_aborted;

        assert(b);

        /* the result depends on our own state only, so the read result is unused */
        (void) barrier_read(b, BARRIER_THEY_ABORTED);

        self_aborted = barrier_i_aborted(b);
        return !self_aborted;
}

/**
 * barrier_sync_next() - Wait for the other side to place a next linked barrier
 * @b: barrier to operate on
 *
 * Works like barrier_wait_next() in that it waits for the other side to call
 * barrier_place(), but only linked barriers are waited for: the wait target is
 * clamped to 0, so if the other side already placed more barriers than (or as
 * many as) we did, this returns immediately instead of waiting.
 *
 * Returns: false if either side aborted, true otherwise.
 */
bool barrier_sync_next(Barrier *b) {
        int64_t target;

        assert(b);

        if (barrier_is_aborted(b))
                return false;

        /* wait for one more barrier, but never beyond the balanced state */
        target = b->barriers - 1;
        if (target < 0)
                target = 0;

        (void) barrier_read(b, target);

        return !barrier_is_aborted(b);
}

/**
 * barrier_sync() - Wait for the other side to place as many barriers as we did
 * @b: barrier to operate on
 *
 * Works like barrier_sync_next(), but waits until the other side called
 * barrier_place() as often as we did in total, i.e. until the local counter
 * is no longer above 0. If they already placed as many barriers as we did (or
 * more), this returns immediately instead of waiting.
 *
 * Returns: false if either side aborted, true otherwise.
 */
bool barrier_sync(Barrier *b) {
        const int64_t balanced = 0;

        assert(b);

        if (barrier_is_aborted(b))
                return false;

        /* the final state, not the read result, decides the return value */
        (void) barrier_read(b, balanced);

        return !barrier_is_aborted(b);
}