core: store timestamps of unit load attempts

When the system is under heavy load, it can happen that the unit cache is refreshed for an unrelated reason (in the test I simulate this by attempting to start a non-existing unit). The new unit is found and accounted for in the cache, but it's ignored since we are loading something else. When we actually look for it, by attempting to start it, the cache is up to date so no refresh happens, and starting fails although we have it loaded in the cache. When the unit state is set to UNIT_NOT_FOUND, mark the timestamp in u->fragment_loadtime. Then when attempting to load again we can check both if the cache itself needs a refresh, OR if it was refreshed AFTER the last failed attempt that resulted in the state being UNIT_NOT_FOUND. Update the test so that this issue reproduces more often.
author: Luca Boccassi <luca.boccassi@microsoft.com> 2020-06-16 19:46:55 +0200
committer: Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl> 2020-06-30 16:50:00 +0200
commit: 7233e91af000b539c1724996758528af6d6dcc36 (patch)
tree: a62b8ffc410cf0c3afc2a979077b745f3592d134
parent: Merge pull request #16282 from poettering/repart-copy-blocks (diff)
download: systemd-7233e91af000b539c1724996758528af6d6dcc36.tar.xz
systemd-7233e91af000b539c1724996758528af6d6dcc36.zip
5 files changed, 37 insertions, 6 deletions
diff --git a/src/core/manager.c b/src/core/manager.c
index ab3d0b1192..7b199f5175 100644
--- a/src/core/manager.c
+++ b/src/core/manager.c
@@ -1932,10 +1932,11 @@ unsigned manager_dispatch_load_queue(Manager *m) {
         return n;
 }
 
-static bool manager_unit_cache_needs_refresh(Manager *m) {
+static bool manager_unit_cache_needs_refresh(Manager *m, Unit *u) {
         assert(m);
 
-        return m->unit_cache_mtime > 0 && !lookup_paths_mtime_good(&m->lookup_paths, m->unit_cache_mtime);
+        return m->unit_cache_mtime > 0 &&
+                (m->unit_cache_mtime > u->fragment_loadtime || !lookup_paths_mtime_good(&m->lookup_paths, m->unit_cache_mtime));
 }
 
 int manager_load_unit_prepare(
@@ -1982,8 +1983,12 @@ int manager_load_unit_prepare(
                  * but if they are already referenced (because of dependencies or ordering)
                  * then we have to force a load of the fragment. As an optimization, check
                  * first if anything in the usual paths was modified since the last time
-                 * the cache was loaded. */
-                if (ret->load_state == UNIT_NOT_FOUND && manager_unit_cache_needs_refresh(m))
+                 * the cache was loaded. Also check if the last time an attempt to load the
+                 * unit was made was before the most recent cache refresh, so that we know
+                 * we need to try again - even if the cache is current, it might have been
+                 * updated in a different context before we had a chance to retry loading
+                 * this particular unit. */
+                if (ret->load_state == UNIT_NOT_FOUND && manager_unit_cache_needs_refresh(m, ret))
                         ret->load_state = UNIT_STUB;
                 else {
                         *_ret = ret;
diff --git a/src/core/unit.c b/src/core/unit.c
index 2e4482da19..18bf0cd52a 100644
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -1682,6 +1682,11 @@ fail:
                                                      UNIT_ERROR;
         u->load_error = r;
 
+        /* Record the last time we tried to load the unit, so that if the cache gets updated between now
+         * and the next time an attempt is made to load this unit, we know we need to check again */
+        if (u->load_state == UNIT_NOT_FOUND)
+                u->fragment_loadtime = now(CLOCK_REALTIME);
+
         unit_add_to_dbus_queue(u);
         unit_add_to_gc_queue(u);
 
diff --git a/src/core/unit.h b/src/core/unit.h
index 6a90daa7ce..d5e4c65989 100644
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -136,6 +136,7 @@ typedef struct Unit {
         char *source_path; /* if converted, the source file */
         char **dropin_paths;
 
+        usec_t fragment_loadtime;
         usec_t fragment_mtime;
         usec_t source_mtime;
         usec_t dropin_mtime;
diff --git a/test/TEST-48-START-STOP-NO-RELOAD/blacklist-ubuntu-ci b/test/TEST-48-START-STOP-NO-RELOAD/blacklist-ubuntu-ci
deleted file mode 100644
index e69de29bb2..0000000000
--- a/test/TEST-48-START-STOP-NO-RELOAD/blacklist-ubuntu-ci
+++ /dev/null
diff --git a/test/units/testsuite-48.sh b/test/units/testsuite-48.sh
index a811134d77..93e5e98e42 100755
--- a/test/units/testsuite-48.sh
+++ b/test/units/testsuite-48.sh
@@ -21,18 +21,38 @@ systemctl start testservice-48.target
 # May 07 23:12:20 systemd-testsuite testsuite-48.sh[52]: -rw-r--r-- 1 root root 50 2020-05-07 23:12:20.000000000 +0100 /
 # May 07 23:12:20 systemd-testsuite testsuite-48.sh[30]: + stat -f --format=%t /etc/systemd/system/testservice-48.servic
 # May 07 23:12:20 systemd-testsuite testsuite-48.sh[53]: ef53
-sleep 1.1
+sleep 3.1
 
 cat > /run/systemd/system/testservice-48.service <<EOF
 [Service]
 ExecStart=/bin/sleep infinity
-Type=exec
 EOF
 
 systemctl start testservice-48.service
 
 systemctl is-active testservice-48.service
 
+# Stop and remove, and try again to exercise https://github.com/systemd/systemd/issues/15992
+systemctl stop testservice-48.service
+rm -f /run/systemd/system/testservice-48.service
+systemctl daemon-reload
+
+sleep 3.1
+
+cat > /run/systemd/system/testservice-48.service <<EOF
+[Service]
+ExecStart=/bin/sleep infinity
+EOF
+
+# Start a non-existing unit first, so that the cache is reloaded for an unrelated
+# reason. Starting the existing unit later should still work thanks to the check
+# for the last load attempt vs cache timestamp.
+systemctl start testservice-48-nonexistent.service || true
+
+systemctl start testservice-48.service
+
+systemctl is-active testservice-48.service
+
 echo OK > /testok
 
 exit 0
author	Luca Boccassi <luca.boccassi@microsoft.com>	2020-06-16 19:46:55 +0200
committer	Zbigniew Jędrzejewski-Szmek <zbyszek@in.waw.pl>	2020-06-30 16:50:00 +0200
commit	7233e91af000b539c1724996758528af6d6dcc36 (patch)
tree	a62b8ffc410cf0c3afc2a979077b745f3592d134
parent	Merge pull request #16282 from poettering/repart-copy-blocks (diff)
download	systemd-7233e91af000b539c1724996758528af6d6dcc36.tar.xz systemd-7233e91af000b539c1724996758528af6d6dcc36.zip