summaryrefslogtreecommitdiffstats
path: root/fs/afs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-01-10 19:11:01 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2024-01-10 19:11:01 +0100
commit0c59ae1290741854b6cf597ef05bfa9bc811389f (patch)
treec0752e5f713ee966c10669cc95a5e630c73f3217 /fs/afs
parentMerge tag 'jfs-6.8' of github.com:kleikamp/linux-shaggy (diff)
parentafs: trace: Log afs_make_call(), including server address (diff)
downloadlinux-0c59ae1290741854b6cf597ef05bfa9bc811389f.tar.xz
linux-0c59ae1290741854b6cf597ef05bfa9bc811389f.zip
Merge tag 'afs-fix-rotation-20240105' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs
Pull afs updates from David Howells: "The majority of the patches are aimed at fixing and improving the AFS filesystem's rotation over server IP addresses, but there are also some fixes from Oleg Nesterov for the use of read_seqbegin_or_lock(). - Fix fileserver probe handling so that the next round of probes doesn't break ongoing server/address rotation by clearing all the probe result tracking. This could occasionally cause the rotation algorithm to drop straight through, give a 'successful' result without actually emitting any RPC calls, leaving the reply buffer in an undefined state. Instead, detach the probe results into a separate struct and allocate a new one each time we start probing and update the pointer to it. Probes are also sent in order of address preference to try and improve the chance that the preferred one will complete first. - Fix server rotation so that it uses configurable address preferences across the probes that have completed so far rather than ranking them by RTT as the latter doesn't necessarily give the best route. The preference list can be altered by writing into /proc/net/afs/addr_prefs. - Fix the handling of Read-Only (and Backup) volume callbacks as there is one per volume, not one per file, so if someone performs a command that, say, offlines the volume but doesn't change it, when it comes back online we don't spam the server with a status fetch for every vnode we're using. Instead, check the Creation timestamp in the VolSync record when prompted by a callback break. - Handle volume regression (ie. a RW volume being restored from a backup) by scrubbing all cache data for that volume. This is detected from the VolSync creation timestamp. - Adjust abort handling and abort -> error mapping to match better with what other AFS clients do. - Fix offline and busy volume state handling as they only apply to individual server instances and not entire volumes and the rotation algorithm should go and look at other servers if available. 
Also make it sleep briefly before each retry if all the volume instances are unavailable" * tag 'afs-fix-rotation-20240105' of git://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs: (40 commits) afs: trace: Log afs_make_call(), including server address afs: Fix offline and busy message emission afs: Fix fileserver rotation afs: Overhaul invalidation handling to better support RO volumes afs: Parse the VolSync record in the reply of a number of RPC ops afs: Don't leave DONTUSE/NEWREPSITE servers out of server list afs: Fix comment in afs_do_lookup() afs: Apply server breaks to mmap'd files in the call processor afs: Move the vnode/volume validity checking code into its own file afs: Defer volume record destruction to a workqueue afs: Make it possible to find the volumes that are using a server afs: Combine the endpoint state bools into a bitmask afs: Keep a record of the current fileserver endpoint state afs: Dispatch vlserver probes in priority order afs: Dispatch fileserver probes in priority order afs: Mark address lists with configured priorities afs: Provide a way to configure address priorities afs: Remove the unimplemented afs_cmp_addr_list() afs: Add some more info to /proc/net/afs/servers rxrpc: Create a procfile to display outstanding client conn bundles ...
Diffstat (limited to 'fs/afs')
-rw-r--r--fs/afs/Makefile2
-rw-r--r--fs/afs/addr_list.c224
-rw-r--r--fs/afs/addr_prefs.c531
-rw-r--r--fs/afs/afs.h3
-rw-r--r--fs/afs/callback.c141
-rw-r--r--fs/afs/cell.c5
-rw-r--r--fs/afs/cmservice.c5
-rw-r--r--fs/afs/dir.c66
-rw-r--r--fs/afs/dir_silly.c2
-rw-r--r--fs/afs/file.c20
-rw-r--r--fs/afs/fs_operation.c85
-rw-r--r--fs/afs/fs_probe.c323
-rw-r--r--fs/afs/fsclient.c74
-rw-r--r--fs/afs/inode.c204
-rw-r--r--fs/afs/internal.h370
-rw-r--r--fs/afs/main.c1
-rw-r--r--fs/afs/misc.c10
-rw-r--r--fs/afs/proc.c102
-rw-r--r--fs/afs/rotate.c520
-rw-r--r--fs/afs/rxrpc.c107
-rw-r--r--fs/afs/server.c135
-rw-r--r--fs/afs/server_list.c174
-rw-r--r--fs/afs/super.c7
-rw-r--r--fs/afs/validation.c473
-rw-r--r--fs/afs/vl_alias.c69
-rw-r--r--fs/afs/vl_list.c29
-rw-r--r--fs/afs/vl_probe.c60
-rw-r--r--fs/afs/vl_rotate.c215
-rw-r--r--fs/afs/vlclient.c143
-rw-r--r--fs/afs/volume.c61
-rw-r--r--fs/afs/write.c6
-rw-r--r--fs/afs/yfsclient.c25
32 files changed, 2882 insertions, 1310 deletions
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index e8956b65d7ff..dcdc0f1bb76f 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -5,6 +5,7 @@
kafs-y := \
addr_list.o \
+ addr_prefs.o \
callback.o \
cell.o \
cmservice.o \
@@ -27,6 +28,7 @@ kafs-y := \
server.o \
server_list.o \
super.o \
+ validation.o \
vlclient.o \
vl_alias.o \
vl_list.o \
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index de1ae0bead3b..6d42f85c6be5 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -13,26 +13,55 @@
#include "internal.h"
#include "afs_fs.h"
+/*
+ * RCU callback used to dispose of an address list: drop the peer ref held by
+ * each populated address slot, log the free and release the memory.
+ */
+static void afs_free_addrlist(struct rcu_head *rcu)
+{
+	struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu);
+	unsigned int slot = 0;
+
+	while (slot < alist->nr_addrs) {
+		rxrpc_kernel_put_peer(alist->addrs[slot].peer);
+		slot++;
+	}
+
+	trace_afs_alist(alist->debug_id, refcount_read(&alist->usage), afs_alist_trace_free);
+	kfree(alist);
+}
+
/*
* Release an address list.
*/
-void afs_put_addrlist(struct afs_addr_list *alist)
+void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason)
+{
+ unsigned int debug_id;
+ bool dead;
+ int r;
+
+ if (!alist)
+ return;
+ debug_id = alist->debug_id;
+ dead = __refcount_dec_and_test(&alist->usage, &r);
+ trace_afs_alist(debug_id, r - 1, reason);
+ if (dead)
+ call_rcu(&alist->rcu, afs_free_addrlist);
+}
+
+struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason)
{
- if (alist && refcount_dec_and_test(&alist->usage))
- kfree_rcu(alist, rcu);
+ int r;
+
+ if (alist) {
+ __refcount_inc(&alist->usage, &r);
+ trace_afs_alist(alist->debug_id, r + 1, reason);
+ }
+ return alist;
}
/*
* Allocate an address list.
*/
-struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
- unsigned short service,
- unsigned short port)
+struct afs_addr_list *afs_alloc_addrlist(unsigned int nr)
{
struct afs_addr_list *alist;
- unsigned int i;
+ static atomic_t debug_id;
- _enter("%u,%u,%u", nr, service, port);
+ _enter("%u", nr);
if (nr > AFS_MAX_ADDRESSES)
nr = AFS_MAX_ADDRESSES;
@@ -43,17 +72,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
refcount_set(&alist->usage, 1);
alist->max_addrs = nr;
-
- for (i = 0; i < nr; i++) {
- struct sockaddr_rxrpc *srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->srx_service = service;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin6);
- srx->transport.sin6.sin6_family = AF_INET6;
- srx->transport.sin6.sin6_port = htons(port);
- }
-
+ alist->debug_id = atomic_inc_return(&debug_id);
+ trace_afs_alist(alist->debug_id, 1, afs_alist_trace_alloc);
return alist;
}
@@ -126,7 +146,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
if (!vllist->servers[0].server)
goto error_vl;
- alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT);
+ alist = afs_alloc_addrlist(nr);
if (!alist)
goto error;
@@ -197,9 +217,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
}
if (family == AF_INET)
- afs_merge_fs_addr4(alist, x[0], xport);
+ ret = afs_merge_fs_addr4(net, alist, x[0], xport);
else
- afs_merge_fs_addr6(alist, x, xport);
+ ret = afs_merge_fs_addr6(net, alist, x, xport);
+ if (ret < 0)
+ goto error;
} while (p < end);
@@ -216,26 +238,13 @@ bad_address:
problem, p - text, (int)len, (int)len, text);
ret = -EINVAL;
error:
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_parse_error);
error_vl:
afs_put_vlserverlist(net, vllist);
return ERR_PTR(ret);
}
/*
- * Compare old and new address lists to see if there's been any change.
- * - How to do this in better than O(Nlog(N)) time?
- * - We don't really want to sort the address list, but would rather take the
- * list as we got it so as not to undo record rotation by the DNS server.
- */
-#if 0
-static int afs_cmp_addr_list(const struct afs_addr_list *a1,
- const struct afs_addr_list *a2)
-{
-}
-#endif
-
-/*
* Perform a DNS query for VL servers and build a up an address list.
*/
struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
@@ -271,25 +280,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry
/*
* Merge an IPv4 entry into a fileserver address list.
*/
-void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist,
+ __be32 xdr, u16 port)
{
- struct sockaddr_rxrpc *srx;
- u32 addr = ntohl(xdr);
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_peer *peer;
int i;
if (alist->nr_addrs >= alist->max_addrs)
- return;
+ return 0;
- for (i = 0; i < alist->nr_ipv4; i++) {
- struct sockaddr_in *a = &alist->addrs[i].transport.sin;
- u32 a_addr = ntohl(a->sin_addr.s_addr);
- u16 a_port = ntohs(a->sin_port);
+ srx.srx_family = AF_RXRPC;
+ srx.transport_type = SOCK_DGRAM;
+ srx.transport_len = sizeof(srx.transport.sin);
+ srx.transport.sin.sin_family = AF_INET;
+ srx.transport.sin.sin_port = htons(port);
+ srx.transport.sin.sin_addr.s_addr = xdr;
- if (addr == a_addr && port == a_port)
- return;
- if (addr == a_addr && port < a_port)
- break;
- if (addr < a_addr)
+ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
+ if (!peer)
+ return -ENOMEM;
+
+ for (i = 0; i < alist->nr_ipv4; i++) {
+ if (peer == alist->addrs[i].peer) {
+ rxrpc_kernel_put_peer(peer);
+ return 0;
+ }
+ if (peer <= alist->addrs[i].peer)
break;
}
@@ -298,38 +315,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
- srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin);
- srx->transport.sin.sin_family = AF_INET;
- srx->transport.sin.sin_port = htons(port);
- srx->transport.sin.sin_addr.s_addr = xdr;
+ alist->addrs[i].peer = peer;
alist->nr_ipv4++;
alist->nr_addrs++;
+ return 0;
}
/*
* Merge an IPv6 entry into a fileserver address list.
*/
-void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
+ __be32 *xdr, u16 port)
{
- struct sockaddr_rxrpc *srx;
- int i, diff;
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_peer *peer;
+ int i;
if (alist->nr_addrs >= alist->max_addrs)
- return;
+ return 0;
- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
- struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6;
- u16 a_port = ntohs(a->sin6_port);
+ srx.srx_family = AF_RXRPC;
+ srx.transport_type = SOCK_DGRAM;
+ srx.transport_len = sizeof(srx.transport.sin6);
+ srx.transport.sin6.sin6_family = AF_INET6;
+ srx.transport.sin6.sin6_port = htons(port);
+ memcpy(&srx.transport.sin6.sin6_addr, xdr, 16);
- diff = memcmp(xdr, &a->sin6_addr, 16);
- if (diff == 0 && port == a_port)
- return;
- if (diff == 0 && port < a_port)
- break;
- if (diff < 0)
+ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
+ if (!peer)
+ return -ENOMEM;
+
+ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+ if (peer == alist->addrs[i].peer) {
+ rxrpc_kernel_put_peer(peer);
+ return 0;
+ }
+ if (peer <= alist->addrs[i].peer)
break;
}
@@ -337,68 +358,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
memmove(alist->addrs + i + 1,
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
-
- srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin6);
- srx->transport.sin6.sin6_family = AF_INET6;
- srx->transport.sin6.sin6_port = htons(port);
- memcpy(&srx->transport.sin6.sin6_addr, xdr, 16);
+ alist->addrs[i].peer = peer;
alist->nr_addrs++;
-}
-
-/*
- * Get an address to try.
- */
-bool afs_iterate_addresses(struct afs_addr_cursor *ac)
-{
- unsigned long set, failed;
- int index;
-
- if (!ac->alist)
- return false;
-
- set = ac->alist->responded;
- failed = ac->alist->failed;
- _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index);
-
- ac->nr_iterations++;
-
- set &= ~(failed | ac->tried);
-
- if (!set)
- return false;
-
- index = READ_ONCE(ac->alist->preferred);
- if (test_bit(index, &set))
- goto selected;
-
- index = __ffs(set);
-
-selected:
- ac->index = index;
- set_bit(index, &ac->tried);
- ac->responded = false;
- return true;
-}
-
-/*
- * Release an address list cursor.
- */
-int afs_end_cursor(struct afs_addr_cursor *ac)
-{
- struct afs_addr_list *alist;
-
- alist = ac->alist;
- if (alist) {
- if (ac->responded &&
- ac->index != alist->preferred &&
- test_bit(ac->alist->preferred, &ac->tried))
- WRITE_ONCE(alist->preferred, ac->index);
- afs_put_addrlist(alist);
- ac->alist = NULL;
- }
-
- return ac->error;
+ return 0;
}
diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c
new file mode 100644
index 000000000000..a189ff8a5034
--- /dev/null
+++ b/fs/afs/addr_prefs.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Address preferences management
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": addr_prefs: " fmt
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <linux/seq_file.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+
+/* Convert a seq_file into its afs_net by way of seq_file_single_net(). */
+static inline struct afs_net *afs_seq2net_single(struct seq_file *m)
+{
+	struct net *ns = seq_file_single_net(m);
+
+	return afs_net(ns);
+}
+
+/*
+ * Split a NUL-terminated string up to the first newline around spaces.  The
+ * source string will be modified to have NUL-terminations inserted.
+ *
+ * @pbuf:    In: where to start scanning.  Out (on success): where the scan
+ *           stopped, i.e. just past the consumed newline or at the final NUL.
+ * @strv:    Filled in with a pointer to the start of each word, followed by a
+ *           NULL pointer.
+ * @maxstrv: Number of slots in @strv, including the one for the final NULL.
+ *
+ * Returns the number of words stored, or -EINVAL (after a warning) if there
+ * are more words than @strv can hold.
+ *
+ * Note that a newline met whilst skipping leading whitespace only ends that
+ * skip; the split is ended by a newline that follows a word or by the NUL.
+ */
+static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv)
+{
+	unsigned int count = 0;
+	char *p = *pbuf;
+
+	maxstrv--; /* Allow for terminal NULL */
+	for (;;) {
+		/* Skip over spaces */
+		while (isspace(*p)) {
+			if (*p == '\n') {
+				p++;
+				break;
+			}
+			p++;
+		}
+		if (!*p)
+			break;
+
+		/* Mark start of word */
+		if (count >= maxstrv) {
+			pr_warn("Too many elements in string\n");
+			return -EINVAL;
+		}
+		strv[count++] = p;
+
+		/* Skip over word.  NUL doesn't count as whitespace, so it must
+		 * be tested for explicitly or a final word with nothing after
+		 * it would send the scan off the end of the buffer.
+		 */
+		while (*p && !isspace(*p))
+			p++;
+		if (!*p)
+			break;
+
+		/* Mark end of word */
+		if (*p == '\n') {
+			*p++ = 0;
+			break;
+		}
+		*p++ = 0;
+	}
+
+	*pbuf = p;
+	strv[count] = NULL;
+	return count;
+}
+
+/*
+ * Parse an address with an optional subnet mask.
+ *
+ * @p is a NUL-terminated string of the form "<addr>", "<addr>/<bits>",
+ * "[<addr>]" or "[<addr>]/<bits>", where <addr> is anything in4_pton() or
+ * in6_pton() will accept (IPv4 is tried first).
+ *
+ * On success, the family, address and subnet_mask members of @pref are set
+ * and 0 is returned.  If no "/<bits>" is given, the mask defaults to the full
+ * width of the address (32 for IPv4, 128 for IPv6).
+ *
+ * -EINVAL is returned, after logging a warning, if the address can't be
+ * parsed, an opening '[' isn't matched, <bits> is zero or wider than the
+ * address, or there's trailing junk.
+ */
+static int afs_parse_address(char *p, struct afs_addr_preference *pref)
+{
+	const char *stop;
+	unsigned long mask, tmp;
+	char *end = p + strlen(p);
+	bool bracket = false;
+
+	if (*p == '[') {
+		p++;
+		bracket = true;
+	}
+
+	if (in4_pton(p, end - p, (u8 *)&pref->ipv4_addr, -1, &stop)) {
+		pref->family = AF_INET;
+		mask = 32;
+	} else if (in6_pton(p, end - p, (u8 *)&pref->ipv6_addr, -1, &stop)) {
+		pref->family = AF_INET6;
+		mask = 128;
+	} else {
+		pr_warn("Can't determine address family\n");
+		return -EINVAL;
+	}
+
+	/* Carry on from where the address parser stopped */
+	p = (char *)stop;
+	if (bracket) {
+		if (*p != ']') {
+			pr_warn("Can't find closing ']'\n");
+			return -EINVAL;
+		}
+		p++;
+	}
+
+	if (*p == '/') {
+		p++;
+		/* A '/' with no digits after it yields 0 and is rejected below */
+		tmp = simple_strtoul(p, &p, 10);
+		if (tmp > mask) {
+			pr_warn("Subnet mask too large\n");
+			return -EINVAL;
+		}
+		if (tmp == 0) {
+			pr_warn("Subnet mask too small\n");
+			return -EINVAL;
+		}
+		mask = tmp;
+	}
+
+	/* Nothing may follow the address and optional mask */
+	if (*p) {
+		pr_warn("Invalid address\n");
+		return -EINVAL;
+	}
+
+	pref->subnet_mask = mask;
+	return 0;
+}
+
+/*
+ * Outcome of comparing a candidate address (a) with a listed address (b) in
+ * afs_cmp_address_pref().
+ */
+enum cmp_ret {
+ CONTINUE_SEARCH, /* a sorts after b, or b binds tighter than a */
+ INSERT_HERE, /* a sorts before b, or the address families differ */
+ EXACT_MATCH, /* Masked addresses and subnet masks are both equal */
+ SUBNET_MATCH, /* Masked addresses are equal, but a binds tighter than b */
+};
+
+/*
+ * See if a candidate address matches a listed address.
+ *
+ * The two addresses are compared, most significant bits first, over the
+ * shorter of the two subnet masks.  The result says how the candidate (@a)
+ * relates to the listed entry (@b):
+ *
+ *	INSERT_HERE	- the families differ or @a sorts before @b
+ *	CONTINUE_SEARCH	- @a sorts after @b, or they agree over the common
+ *			  mask and @b has the longer mask
+ *	EXACT_MATCH	- they agree and the masks are the same length
+ *	SUBNET_MATCH	- they agree and @a has the longer mask
+ */
+static enum cmp_ret afs_cmp_address_pref(const struct afs_addr_preference *a,
+					 const struct afs_addr_preference *b)
+{
+	int subnet = min(a->subnet_mask, b->subnet_mask);
+	const __be32 *pa, *pb;
+	u32 mask, na, nb;
+
+	if (a->family != b->family)
+		return INSERT_HERE;
+
+	switch (a->family) {
+	case AF_INET6:
+		pa = a->ipv6_addr.s6_addr32;
+		pb = b->ipv6_addr.s6_addr32;
+		break;
+	case AF_INET:
+		pa = &a->ipv4_addr.s_addr;
+		pb = &b->ipv4_addr.s_addr;
+		break;
+	default:
+		/* Nothing in this file sets any other family, but don't walk
+		 * uninitialised pointers if one ever turns up.
+		 */
+		return CONTINUE_SEARCH;
+	}
+
+	/* Compare the whole 32-bit words covered by the mask.  The values are
+	 * compared directly rather than by subtracting them as the difference
+	 * between two u32s doesn't fit in an int and would give the wrong
+	 * sign for values that are 2^31 or more apart.
+	 */
+	while (subnet > 32) {
+		na = ntohl(*pa++);
+		nb = ntohl(*pb++);
+		if (na < nb)
+			return INSERT_HERE; /* a<b */
+		if (na > nb)
+			return CONTINUE_SEARCH; /* a>b */
+		subnet -= 32;
+	}
+
+	if (subnet == 0)
+		return EXACT_MATCH;
+
+	/* Compare the remaining 1-32 bits, masking off the host part */
+	mask = 0xffffffffU << (32 - subnet);
+	na = ntohl(*pa) & mask;
+	nb = ntohl(*pb) & mask;
+	if (na < nb)
+		return INSERT_HERE; /* a<b */
+	if (na > nb)
+		return CONTINUE_SEARCH; /* a>b */
+	if (a->subnet_mask == b->subnet_mask)
+		return EXACT_MATCH;
+	if (a->subnet_mask > b->subnet_mask)
+		return SUBNET_MATCH; /* a binds tighter than b */
+	return CONTINUE_SEARCH; /* b binds tighter than a */
+}
+
+/*
+ * Insert an address preference.
+ *
+ * A copy of @pref is placed at slot @index of the list at *@_preflist, with
+ * the entries from @index onwards being shuffled up by one.  If the list has
+ * no spare capacity, a larger list is allocated, *@_preflist is switched to
+ * point to it and the list it replaces is freed, so the caller must not go on
+ * using a pointer it loaded from *@_preflist beforehand.
+ *
+ * Returns 0 on success, -ENOSPC if the list already holds 255 entries or
+ * -ENOMEM if a larger list can't be allocated (in which case the list is left
+ * as it was).
+ */
+static int afs_insert_address_pref(struct afs_addr_preference_list **_preflist,
+				   struct afs_addr_preference *pref,
+				   int index)
+{
+	struct afs_addr_preference_list *preflist = *_preflist, *old = preflist;
+	size_t size, max_prefs;
+
+	_enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index);
+
+	if (preflist->nr == 255)
+		return -ENOSPC;
+	if (preflist->nr >= preflist->max_prefs) {
+		/* Grow to the next power-of-two allocation size and work out
+		 * how many entries that can hold, capped at 255.
+		 */
+		max_prefs = preflist->max_prefs + 1;
+		size = struct_size(preflist, prefs, max_prefs);
+		size = roundup_pow_of_two(size);
+		max_prefs = min_t(size_t, (size - sizeof(*preflist)) / sizeof(*pref), 255);
+		preflist = kmalloc(size, GFP_KERNEL);
+		if (!preflist)
+			return -ENOMEM;
+		*preflist = *old;
+		preflist->max_prefs = max_prefs;
+
+		/* Copy the entries across, leaving a gap at @index */
+		if (index < preflist->nr)
+			memcpy(preflist->prefs + index + 1, old->prefs + index,
+			       sizeof(*pref) * (preflist->nr - index));
+		if (index > 0)
+			memcpy(preflist->prefs, old->prefs, sizeof(*pref) * index);
+
+		/* The list being replaced is the writer's private candidate
+		 * (afs_proc_addr_prefs_write() only passes in a copy it has
+		 * yet to publish), so it can be freed directly rather than
+		 * being leaked.
+		 */
+		*_preflist = preflist;
+		kfree(old);
+	} else {
+		if (index < preflist->nr)
+			memmove(preflist->prefs + index + 1, preflist->prefs + index,
+				sizeof(*pref) * (preflist->nr - index));
+	}
+
+	preflist->prefs[index] = *pref;
+	preflist->nr++;
+	/* IPv4 entries precede IPv6 ones, so adding one moves the boundary */
+	if (pref->family == AF_INET)
+		preflist->ipv6_off++;
+	return 0;
+}
+
+/*
+ * Add an address preference.
+ * echo "add <proto> <IP>[/<mask>] <prior>" >/proc/fs/afs/addr_prefs
+ *
+ * @argv holds the three words following "add".  The part of the list that
+ * holds the address's family is searched; an entry that matches exactly just
+ * has its priority replaced, otherwise a new entry is inserted at the point
+ * the search stopped (or at the end of that part of the list).
+ */
+static int afs_add_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist,
+				int argc, char **argv)
+{
+	struct afs_addr_preference_list *preflist = *_preflist;
+	struct afs_addr_preference pref;
+	enum cmp_ret cmp;
+	int ret, i, stop;
+
+	if (argc != 3) {
+		pr_warn("Wrong number of params\n");
+		return -EINVAL;
+	}
+
+	if (strcmp(argv[0], "udp") != 0) {
+		pr_warn("Unsupported protocol\n");
+		return -EINVAL;
+	}
+
+	ret = afs_parse_address(argv[1], &pref);
+	if (ret < 0)
+		return ret;
+
+	ret = kstrtou16(argv[2], 10, &pref.prio);
+	if (ret < 0) {
+		pr_warn("Invalid priority\n");
+		return ret;
+	}
+
+	/* IPv4 entries live in [0, ipv6_off) and IPv6 ones in [ipv6_off, nr) */
+	i = (pref.family == AF_INET) ? 0 : preflist->ipv6_off;
+	stop = (pref.family == AF_INET) ? preflist->ipv6_off : preflist->nr;
+
+	while (i < stop) {
+		cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]);
+		if (cmp == EXACT_MATCH) {
+			preflist->prefs[i].prio = pref.prio;
+			return 0;
+		}
+		if (cmp != CONTINUE_SEARCH)
+			break; /* INSERT_HERE or SUBNET_MATCH */
+		i++;
+	}
+
+	return afs_insert_address_pref(_preflist, &pref, i);
+}
+
+/*
+ * Delete an address preference.
+ *
+ * Removes the entry at slot @index by closing the gap over it.  Returns
+ * -ENOENT if the list is empty, otherwise 0.
+ */
+static int afs_delete_address_pref(struct afs_addr_preference_list **_preflist,
+				   int index)
+{
+	struct afs_addr_preference_list *list = *_preflist;
+
+	_enter("{%u/%u/%u},%u", list->ipv6_off, list->nr, list->max_prefs, index);
+
+	if (!list->nr)
+		return -ENOENT;
+
+	/* Nothing needs moving if it's the last entry that's going */
+	if (index < list->nr - 1)
+		memmove(&list->prefs[index], &list->prefs[index + 1],
+			sizeof(list->prefs[0]) * (list->nr - index - 1));
+
+	/* Removing an entry from the IPv4 part pulls the IPv6 part down */
+	if (index < list->ipv6_off)
+		list->ipv6_off--;
+	list->nr--;
+	return 0;
+}
+
+/*
+ * Delete an address preference.
+ * echo "del <proto> <IP>[/<mask>]" >/proc/fs/afs/addr_prefs
+ *
+ * @argv holds the two words following "del".  The part of the list that holds
+ * the address's family is searched and an entry that matches exactly is
+ * removed.  If the search stops short without an exact match, 0 is returned
+ * and nothing is changed; if it runs off the end of that part of the list,
+ * -ENOANO is returned.
+ */
+static int afs_del_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist,
+				int argc, char **argv)
+{
+	struct afs_addr_preference_list *preflist = *_preflist;
+	struct afs_addr_preference pref;
+	enum cmp_ret cmp;
+	int ret, i, stop;
+
+	if (argc != 2) {
+		pr_warn("Wrong number of params\n");
+		return -EINVAL;
+	}
+
+	if (strcmp(argv[0], "udp") != 0) {
+		pr_warn("Unsupported protocol\n");
+		return -EINVAL;
+	}
+
+	ret = afs_parse_address(argv[1], &pref);
+	if (ret < 0)
+		return ret;
+
+	/* IPv4 entries live in [0, ipv6_off) and IPv6 ones in [ipv6_off, nr) */
+	i = (pref.family == AF_INET) ? 0 : preflist->ipv6_off;
+	stop = (pref.family == AF_INET) ? preflist->ipv6_off : preflist->nr;
+
+	while (i < stop) {
+		cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]);
+		if (cmp == EXACT_MATCH)
+			return afs_delete_address_pref(_preflist, i);
+		if (cmp != CONTINUE_SEARCH)
+			return 0; /* INSERT_HERE or SUBNET_MATCH */
+		i++;
+	}
+
+	return -ENOANO;
+}
+
+/*
+ * Handle writes to /proc/fs/afs/addr_prefs
+ *
+ * @buf is handed to afs_split_string() a command at a time, each command
+ * being "add ..." or "del ...".  The commands are applied to a private copy
+ * of the current preference list and the copy only replaces
+ * net->address_prefs (with its version bumped) if every command succeeds;
+ * otherwise the copy is discarded and the installed list is left alone.
+ *
+ * The update is serialised with the inode lock of @file.  Returns 0 on
+ * success or a negative error code.
+ */
+int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size)
+{
+	struct afs_addr_preference_list *preflist, *old;
+	struct seq_file *m = file->private_data;
+	struct afs_net *net = afs_seq2net_single(m);
+	size_t psize;
+	char *argv[5];
+	int ret, argc, max_prefs;
+
+	inode_lock(file_inode(file));
+
+	/* Allocate a candidate new list and initialise it from the old. */
+	old = rcu_dereference_protected(net->address_prefs,
+					lockdep_is_held(&file_inode(file)->i_rwsem));
+
+	if (old)
+		max_prefs = old->nr + 1;
+	else
+		max_prefs = 1;
+
+	/* Round the allocation up to a power of two and use the slack */
+	psize = struct_size(old, prefs, max_prefs);
+	psize = roundup_pow_of_two(psize);
+	max_prefs = min_t(size_t, (psize - sizeof(*old)) / sizeof(old->prefs[0]), 255);
+
+	ret = -ENOMEM;
+	preflist = kmalloc(struct_size(preflist, prefs, max_prefs), GFP_KERNEL);
+	if (!preflist)
+		goto done;
+
+	if (old)
+		memcpy(preflist, old, struct_size(preflist, prefs, old->nr));
+	else
+		memset(preflist, 0, sizeof(*preflist));
+	preflist->max_prefs = max_prefs;
+
+	do {
+		argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv));
+		if (argc < 0) {
+			/* Must go through the cleanup path so that the inode
+			 * lock is dropped and the candidate list is freed.
+			 */
+			ret = argc;
+			goto done;
+		}
+		if (argc < 2)
+			goto inval;
+
+		if (strcmp(argv[0], "add") == 0)
+			ret = afs_add_address_pref(net, &preflist, argc - 1, argv + 1);
+		else if (strcmp(argv[0], "del") == 0)
+			ret = afs_del_address_pref(net, &preflist, argc - 1, argv + 1);
+		else
+			goto inval;
+		if (ret < 0)
+			goto done;
+	} while (*buf);
+
+	preflist->version++;
+	rcu_assign_pointer(net->address_prefs, preflist);
+	/* Store prefs before version */
+	smp_store_release(&net->address_pref_version, preflist->version);
+	kfree_rcu(old, rcu);
+	preflist = NULL;
+	ret = 0;
+
+done:
+	/* preflist is NULL here if it got published */
+	kfree(preflist);
+	inode_unlock(file_inode(file));
+	_leave(" = %d", ret);
+	return ret;
+
+inval:
+	pr_warn("Invalid Command\n");
+	ret = -EINVAL;
+	goto done;
+}
+
+/*
+ * Mark the priorities on an address list if the address preferences table has
+ * changed. The caller must hold the RCU read lock.
+ *
+ * Each address in @alist is compared, as a full-width (/32 or /128) address,
+ * against the entries for its family in the table at net->address_prefs and
+ * the priority of a matching entry is copied to the address. The table's
+ * version is then recorded on the list so that the work isn't repeated until
+ * the table changes. An address that matches nothing keeps the priority it
+ * already had.
+ */
+void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist)
+{
+ const struct afs_addr_preference_list *preflist =
+ rcu_dereference(net->address_prefs);
+ const struct sockaddr_in6 *sin6;
+ const struct sockaddr_in *sin;
+ const struct sockaddr *sa;
+ struct afs_addr_preference test;
+ enum cmp_ret cmp;
+ int i, j;
+
+ /* Nothing to do if there's no table, the table or the address list is
+ * empty or the list was already marked from this version of the table.
+ */
+ if (!preflist || !preflist->nr || !alist->nr_addrs ||
+ smp_load_acquire(&alist->addr_pref_version) == preflist->version)
+ return;
+
+ /* The first nr_ipv4 addresses are treated as IPv4 and checked against
+ * the table entries below ipv6_off.
+ */
+ test.family = AF_INET;
+ test.subnet_mask = 32;
+ test.prio = 0;
+ for (i = 0; i < alist->nr_ipv4; i++) {
+ sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer);
+ sin = (const struct sockaddr_in *)sa;
+ test.ipv4_addr = sin->sin_addr;
+ for (j = 0; j < preflist->ipv6_off; j++) {
+ cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]);
+ /* NOTE(review): the break statements below leave only the
+ * switch, not the loop over j, so the scan carries on to
+ * the end of the table and a later matching entry will
+ * overwrite the priority set by an earlier one - confirm
+ * that this is intended.
+ */
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ break;
+ case EXACT_MATCH:
+ case SUBNET_MATCH:
+ WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio);
+ break;
+ }
+ }
+ }
+
+ /* The remaining addresses are treated as IPv6 and checked against the
+ * table entries from ipv6_off onwards. i carries on from where the
+ * IPv4 loop left off.
+ */
+ test.family = AF_INET6;
+ test.subnet_mask = 128;
+ test.prio = 0;
+ for (; i < alist->nr_addrs; i++) {
+ sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer);
+ sin6 = (const struct sockaddr_in6 *)sa;
+ test.ipv6_addr = sin6->sin6_addr;
+ for (j = preflist->ipv6_off; j < preflist->nr; j++) {
+ cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]);
+ /* NOTE(review): as for IPv4 above, break only leaves the
+ * switch, so the last matching entry wins.
+ */
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ break;
+ case EXACT_MATCH:
+ case SUBNET_MATCH:
+ WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio);
+ break;
+ }
+ }
+ }
+
+ /* Record the table version last, after the priorities have been set */
+ smp_store_release(&alist->addr_pref_version, preflist->version);
+}
+
+/*
+ * Mark the priorities on an address list if the address preferences table has
+ * changed. Avoid taking the RCU read lock if we can.
+ *
+ * This is a wrapper around afs_get_address_preferences_rcu() for callers that
+ * don't already hold the RCU read lock: it takes and drops the lock itself,
+ * but only if the quick check below says there may be something to do.
+ */
+void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist)
+{
+ /* Skip the locked path if no table has been installed or if the list
+ * already carries the version number currently recorded on the net.
+ */
+ if (!net->address_prefs ||
+ /* Load version before prefs */
+ smp_load_acquire(&net->address_pref_version) == alist->addr_pref_version)
+ return;
+
+ rcu_read_lock();
+ afs_get_address_preferences_rcu(net, alist);
+ rcu_read_unlock();
+}
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 81815724db6c..b488072aee87 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -165,7 +165,8 @@ struct afs_status_cb {
* AFS volume synchronisation information
*/
struct afs_volsync {
- time64_t creation; /* volume creation time */
+ time64_t creation; /* Volume creation time (or TIME64_MIN) */
+ time64_t update; /* Volume update time (or TIME64_MIN) */
};
/*
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index a484fa642808..99b2c8172021 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -33,22 +33,20 @@ void afs_invalidate_mmap_work(struct work_struct *work)
unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
}
-void afs_server_init_callback_work(struct work_struct *work)
+static void afs_volume_init_callback(struct afs_volume *volume)
{
- struct afs_server *server = container_of(work, struct afs_server, initcb_work);
struct afs_vnode *vnode;
- struct afs_cell *cell = server->cell;
- down_read(&cell->fs_open_mmaps_lock);
+ down_read(&volume->open_mmaps_lock);
- list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) {
- if (vnode->cb_server == server) {
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
+ if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
queue_work(system_unbound_wq, &vnode->cb_work);
}
}
- up_read(&cell->fs_open_mmaps_lock);
+ up_read(&volume->open_mmaps_lock);
}
/*
@@ -57,15 +55,20 @@ void afs_server_init_callback_work(struct work_struct *work)
*/
void afs_init_callback_state(struct afs_server *server)
{
- rcu_read_lock();
- do {
- server->cb_s_break++;
- atomic_inc(&server->cell->fs_s_break);
- if (!list_empty(&server->cell->fs_open_mmaps))
- queue_work(system_unbound_wq, &server->initcb_work);
+ struct afs_server_entry *se;
- } while ((server = rcu_dereference(server->uuid_next)));
- rcu_read_unlock();
+ down_read(&server->cell->vs_lock);
+
+ list_for_each_entry(se, &server->volumes, slink) {
+ se->cb_expires_at = AFS_NO_CB_PROMISE;
+ se->volume->cb_expires_at = AFS_NO_CB_PROMISE;
+ trace_afs_cb_v_break(se->volume->vid, atomic_read(&se->volume->cb_v_break),
+ afs_cb_break_for_s_reinit);
+ if (!list_empty(&se->volume->open_mmaps))
+ afs_volume_init_callback(se->volume);
+ }
+
+ up_read(&server->cell->vs_lock);
}
/*
@@ -76,9 +79,9 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
_enter("");
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) {
vnode->cb_break++;
- vnode->cb_v_break = vnode->volume->cb_v_break;
+ vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
afs_clear_permits(vnode);
if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
@@ -110,13 +113,14 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell,
{
struct afs_volume *volume = NULL;
struct rb_node *p;
- int seq = 0;
+ int seq = 1;
- do {
+ for (;;) {
/* Unfortunately, rbtree walking doesn't give reliable results
* under just the RCU read lock, so we have to check for
* changes.
*/
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&cell->volume_lock, &seq);
p = rcu_dereference_raw(cell->volumes.rb_node);
@@ -132,35 +136,63 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell,
volume = NULL;
}
- } while (need_seqretry(&cell->volume_lock, seq));
+ if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback))
+ break;
+ if (!need_seqretry(&cell->volume_lock, seq))
+ break;
+ seq |= 1; /* Want a lock next time */
+ }
done_seqretry(&cell->volume_lock, seq);
return volume;
}
/*
+ * Allow the fileserver to break callbacks at the volume-level. This is
+ * typically done when, for example, a R/W volume is snapshotted to a R/O
+ * volume (the only way to change an R/O volume). It may also, however, happen
+ * when a volserver takes control of a volume (offlining it, moving it, etc.).
+ *
+ * Every file in that volume will need to be reevaluated.
+ */
+static void afs_break_volume_callback(struct afs_server *server,
+				      struct afs_volume *volume)
+	__releases(RCU)
+{
+	struct afs_server_list *slist;
+	unsigned int i, cb_v_break;
+
+	/* The caller passes in whatever afs_lookup_volume_rcu() returned,
+	 * which is NULL if the volume isn't known here.  There's nothing to
+	 * break in that case, but the RCU read lock must still be dropped as
+	 * that is part of this function's contract.
+	 */
+	if (!volume) {
+		rcu_read_unlock();
+		return;
+	}
+
+	slist = rcu_dereference(volume->servers);
+
+	write_lock(&volume->cb_v_break_lock);
+
+	/* Cancel the promise recorded against this server's entry in the
+	 * volume's server list and against the volume as a whole.
+	 */
+	for (i = 0; i < slist->nr_servers; i++)
+		if (slist->servers[i].server == server)
+			slist->servers[i].cb_expires_at = AFS_NO_CB_PROMISE;
+	volume->cb_expires_at = AFS_NO_CB_PROMISE;
+
+	cb_v_break = atomic_inc_return_release(&volume->cb_v_break);
+	trace_afs_cb_v_break(volume->vid, cb_v_break, afs_cb_break_for_volume_callback);
+
+	write_unlock(&volume->cb_v_break_lock);
+	rcu_read_unlock();
+
+	if (!list_empty(&volume->open_mmaps))
+		afs_volume_init_callback(volume);
+}
+
+/*
* allow the fileserver to explicitly break one callback
* - happens when
* - the backing file is changed
* - a lock is released
*/
-static void afs_break_one_callback(struct afs_volume *volume,
+static void afs_break_one_callback(struct afs_server *server,
+ struct afs_volume *volume,
struct afs_fid *fid)
{
struct super_block *sb;
struct afs_vnode *vnode;
struct inode *inode;
- if (fid->vnode == 0 && fid->unique == 0) {
- /* The callback break applies to an entire volume. */
- write_lock(&volume->cb_v_break_lock);
- volume->cb_v_break++;
- trace_afs_cb_break(fid, volume->cb_v_break,
- afs_cb_break_for_volume_callback, false);
- write_unlock(&volume->cb_v_break_lock);
- return;
- }
-
/* See if we can find a matching inode - even an I_NEW inode needs to
* be marked as it can have its callback broken before we finish
* setting up the local inode.
@@ -187,25 +219,35 @@ static void afs_break_some_callbacks(struct afs_server *server,
afs_volid_t vid = cbb->fid.vid;
size_t i;
+ rcu_read_lock();
volume = afs_lookup_volume_rcu(server->cell, vid);
+ if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) {
+ afs_break_volume_callback(server, volume);
+ *_count -= 1;
+ if (*_count)
+ memmove(cbb, cbb + 1, sizeof(*cbb) * *_count);
+ } else {
+ /* TODO: Find all matching volumes if we couldn't match the server and
+ * break them anyway.
+ */
- /* TODO: Find all matching volumes if we couldn't match the server and
- * break them anyway.
- */
-
- for (i = *_count; i > 0; cbb++, i--) {
- if (cbb->fid.vid == vid) {
- _debug("- Fid { vl=%08llx n=%llu u=%u }",
- cbb->fid.vid,
- cbb->fid.vnode,
- cbb->fid.unique);
- --*_count;
- if (volume)
- afs_break_one_callback(volume, &cbb->fid);
- } else {
- *residue++ = *cbb;
+ for (i = *_count; i > 0; cbb++, i--) {
+ if (cbb->fid.vid == vid) {
+ _debug("- Fid { vl=%08llx n=%llu u=%u }",
+ cbb->fid.vid,
+ cbb->fid.vnode,
+ cbb->fid.unique);
+ --*_count;
+ if (volume)
+ afs_break_one_callback(server, volume, &cbb->fid);
+ } else {
+ *residue++ = *cbb;
+ }
}
+ rcu_read_unlock();
}
+
+ afs_put_volume(volume, afs_volume_trace_put_callback);
}
/*
@@ -218,11 +260,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
ASSERT(server != NULL);
- rcu_read_lock();
-
while (count > 0)
afs_break_some_callbacks(server, callbacks, &count);
-
- rcu_read_unlock();
- return;
}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 926cb1188eba..caa09875f520 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -161,13 +161,12 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
INIT_WORK(&cell->manager, afs_manage_cell_work);
+ init_rwsem(&cell->vs_lock);
cell->volumes = RB_ROOT;
INIT_HLIST_HEAD(&cell->proc_volumes);
seqlock_init(&cell->volume_lock);
cell->fs_servers = RB_ROOT;
seqlock_init(&cell->fs_lock);
- INIT_LIST_HEAD(&cell->fs_open_mmaps);
- init_rwsem(&cell->fs_open_mmaps_lock);
rwlock_init(&cell->vl_servers_lock);
cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
@@ -817,7 +816,7 @@ done:
final_destruction:
/* The root volume is pinning the cell */
- afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root);
+ afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
cell->root_volume = NULL;
afs_put_cell(cell, afs_cell_trace_put_destroy);
}
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index d4ddb20d6732..99a3f20bc786 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -146,10 +146,11 @@ static int afs_find_cm_server_by_peer(struct afs_call *call)
{
struct sockaddr_rxrpc srx;
struct afs_server *server;
+ struct rxrpc_peer *peer;
- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
+ peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
- server = afs_find_server(call->net, &srx);
+ server = afs_find_server(call->net, peer);
if (!server) {
trace_afs_cm_no_server(call, &srx);
return 0;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5219182e52e1..c14533ef108f 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -693,8 +693,9 @@ static void afs_do_lookup_success(struct afs_operation *op)
vp = &op->file[0];
abort_code = vp->scb.status.abort_code;
if (abort_code != 0) {
- op->ac.abort_code = abort_code;
- op->error = afs_abort_to_error(abort_code);
+ op->call_abort_code = abort_code;
+ afs_op_set_error(op, afs_abort_to_error(abort_code));
+ op->cumul_error.abort_code = abort_code;
}
break;
@@ -806,8 +807,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
cookie->fids[i].vid = dvnode->fid.vid;
cookie->ctx.actor = afs_lookup_filldir;
cookie->name = dentry->d_name;
- cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want
- * and slot 1 for the directory */
+ cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want
+ * and slot 0 for the directory */
if (!afs_server_supports_ibulk(dvnode))
cookie->one_only = true;
@@ -846,13 +847,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
_debug("nr_files %u", op->nr_files);
/* Need space for examining all the selected files */
- op->error = -ENOMEM;
if (op->nr_files > 2) {
op->more_files = kvcalloc(op->nr_files - 2,
sizeof(struct afs_vnode_param),
GFP_KERNEL);
- if (!op->more_files)
+ if (!op->more_files) {
+ afs_op_nomem(op);
goto out_op;
+ }
for (i = 2; i < op->nr_files; i++) {
vp = &op->more_files[i - 2];
@@ -878,14 +880,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
* lookups contained therein are stored in the reply without aborting
* the whole operation.
*/
- op->error = -ENOTSUPP;
+ afs_op_set_error(op, -ENOTSUPP);
if (!cookie->one_only) {
op->ops = &afs_inline_bulk_status_operation;
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- if (op->error == -ENOTSUPP) {
+ if (afs_op_error(op) == -ENOTSUPP) {
/* We could try FS.BulkStatus next, but this aborts the entire
* op if any of the lookups fails - so, for the moment, revert
* to FS.FetchStatus for op->file[1].
@@ -895,10 +897,10 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- inode = ERR_PTR(op->error);
+ inode = ERR_PTR(afs_op_error(op));
out_op:
- if (op->error == 0) {
+ if (!afs_op_error(op)) {
inode = &op->file[1].vnode->netfs.inode;
op->file[1].vnode = NULL;
}
@@ -1116,7 +1118,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
dir = AFS_FS_I(d_inode(parent));
/* validate the parent directory */
- afs_validate(dir, key);
+ ret = afs_validate(dir, key);
+ if (ret == -ERESTARTSYS) {
+ dput(parent);
+ key_put(key);
+ return ret;
+ }
if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
_debug("%pd: parent dir deleted", dentry);
@@ -1255,9 +1262,10 @@ void afs_check_for_remote_deletion(struct afs_operation *op)
{
struct afs_vnode *vnode = op->file[0].vnode;
- switch (op->ac.abort_code) {
+ switch (afs_op_abort_code(op)) {
case VNOVNODE:
set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ clear_nlink(&vnode->netfs.inode);
afs_break_callback(vnode, afs_cb_break_for_deleted);
}
}
@@ -1273,20 +1281,20 @@ static void afs_vnode_new_inode(struct afs_operation *op)
_enter("");
- ASSERTCMP(op->error, ==, 0);
+ ASSERTCMP(afs_op_error(op), ==, 0);
inode = afs_iget(op, vp);
if (IS_ERR(inode)) {
/* ENOMEM or EINTR at a really inconvenient time - just abandon
* the new directory on the server.
*/
- op->error = PTR_ERR(inode);
+ afs_op_accumulate_error(op, PTR_ERR(inode), 0);
return;
}
vnode = AFS_FS_I(inode);
set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- if (!op->error)
+ if (!afs_op_error(op))
afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb);
d_instantiate(op->dentry, inode);
}
@@ -1320,7 +1328,7 @@ static void afs_create_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->error)
+ if (afs_op_error(op))
d_drop(op->dentry);
}
@@ -1373,7 +1381,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
clear_nlink(&vnode->netfs.inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
}
}
@@ -1480,7 +1488,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
struct dentry *dentry = op->dentry;
int ret;
- if (op->error != 0 ||
+ if (afs_op_error(op) ||
(op->file[1].scb.have_status && op->file[1].scb.have_error))
return;
if (d_really_is_positive(dentry))
@@ -1504,10 +1512,10 @@ static void afs_dir_remove_link(struct afs_operation *op)
ret = afs_validate(vnode, op->key);
if (ret != -ESTALE)
- op->error = ret;
+ afs_op_set_error(op, ret);
}
- _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
+ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op));
}
static void afs_unlink_success(struct afs_operation *op)
@@ -1538,7 +1546,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op)
static void afs_unlink_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT)
+ if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT)
d_rehash(op->dentry);
}
@@ -1579,7 +1587,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
/* Try to make sure we have a callback promise on the victim. */
ret = afs_validate(vnode, op->key);
if (ret < 0) {
- op->error = ret;
+ afs_op_set_error(op, ret);
goto error;
}
@@ -1588,7 +1596,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(d_inode(dentry), 0);
- op->error = afs_sillyrename(dvnode, vnode, dentry, op->key);
+ afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key));
goto error;
}
if (!d_unhashed(dentry)) {
@@ -1609,7 +1617,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
/* If there was a conflict with a third party, check the status of the
* unlinked vnode.
*/
- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
op->file[1].update_ctime = false;
op->fetch_status.which = 1;
op->ops = &afs_fetch_status_operation;
@@ -1691,7 +1699,7 @@ static void afs_link_success(struct afs_operation *op)
static void afs_link_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->error)
+ if (afs_op_error(op))
d_drop(op->dentry);
}
@@ -1889,7 +1897,7 @@ static void afs_rename_put(struct afs_operation *op)
if (op->rename.rehash)
d_rehash(op->rename.rehash);
dput(op->rename.tmp);
- if (op->error)
+ if (afs_op_error(op))
d_rehash(op->dentry);
}
@@ -1934,7 +1942,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
return PTR_ERR(op);
ret = afs_validate(vnode, op->key);
- op->error = ret;
+ afs_op_set_error(op, ret);
if (ret < 0)
goto error;
@@ -1971,7 +1979,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
op->rename.tmp = d_alloc(new_dentry->d_parent,
&new_dentry->d_name);
if (!op->rename.tmp) {
- op->error = -ENOMEM;
+ afs_op_nomem(op);
goto error;
}
@@ -1979,7 +1987,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
AFS_FS_I(d_inode(new_dentry)),
new_dentry, op->key);
if (ret) {
- op->error = ret;
+ afs_op_set_error(op, ret);
goto error;
}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index bb5807e87fa4..a1e581946b93 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -218,7 +218,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
/* If there was a conflict with a third party, check the status of the
* unlinked vnode.
*/
- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
op->file[1].update_ctime = false;
op->fetch_status.which = 1;
op->ops = &afs_fetch_status_operation;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d37dd201752b..30914e0d9cb2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -243,12 +243,9 @@ static void afs_fetch_data_notify(struct afs_operation *op)
{
struct afs_read *req = op->fetch.req;
struct netfs_io_subrequest *subreq = req->subreq;
- int error = op->error;
+ int error = afs_op_error(op);
- if (error == -ECONNABORTED)
- error = afs_abort_to_error(op->ac.abort_code);
req->error = error;
-
if (subreq) {
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
@@ -271,7 +268,7 @@ static void afs_fetch_data_success(struct afs_operation *op)
static void afs_fetch_data_put(struct afs_operation *op)
{
- op->fetch.req->error = op->error;
+ op->fetch.req->error = afs_op_error(op);
afs_put_read(op->fetch.req);
}
@@ -517,13 +514,12 @@ static bool afs_release_folio(struct folio *folio, gfp_t gfp)
static void afs_add_open_mmap(struct afs_vnode *vnode)
{
if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
- down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ down_write(&vnode->volume->open_mmaps_lock);
if (list_empty(&vnode->cb_mmap_link))
- list_add_tail(&vnode->cb_mmap_link,
- &vnode->volume->cell->fs_open_mmaps);
+ list_add_tail(&vnode->cb_mmap_link, &vnode->volume->open_mmaps);
- up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ up_write(&vnode->volume->open_mmaps_lock);
}
}
@@ -532,12 +528,12 @@ static void afs_drop_open_mmap(struct afs_vnode *vnode)
if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
return;
- down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ down_write(&vnode->volume->open_mmaps_lock);
if (atomic_read(&vnode->cb_nr_mmap) == 0)
list_del_init(&vnode->cb_mmap_link);
- up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ up_write(&vnode->volume->open_mmaps_lock);
flush_work(&vnode->cb_work);
}
@@ -573,7 +569,7 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
- if (afs_pagecache_valid(vnode))
+ if (afs_check_validity(vnode))
return filemap_map_pages(vmf, start_pgoff, end_pgoff);
return 0;
}
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 7a3803ce3a22..3546b087e791 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -35,13 +35,15 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
key_get(key);
}
- op->key = key;
- op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
- op->net = volume->cell->net;
- op->cb_v_break = volume->cb_v_break;
- op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
- op->error = -EDESTADDRREQ;
- op->ac.error = SHRT_MAX;
+ op->key = key;
+ op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
+ op->net = volume->cell->net;
+ op->cb_v_break = atomic_read(&volume->cb_v_break);
+ op->pre_volsync.creation = volume->creation_time;
+ op->pre_volsync.update = volume->update_time;
+ op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
+ op->nr_iterations = -1;
+ afs_op_set_error(op, -EDESTADDRREQ);
_leave(" = [op=%08x]", op->debug_id);
return op;
@@ -71,7 +73,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
swap(vnode, vnode2);
if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP;
_leave(" = f [I 0]");
return false;
@@ -80,7 +82,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
if (vnode2) {
if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP;
mutex_unlock(&vnode->io_lock);
op->flags &= ~AFS_OPERATION_LOCK_0;
@@ -147,7 +149,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op)
afs_prepare_vnode(op, &op->file[0], 0);
afs_prepare_vnode(op, &op->file[1], 1);
- op->cb_v_break = op->volume->cb_v_break;
+ op->cb_v_break = atomic_read(&op->volume->cb_v_break);
_leave(" = true");
return true;
}
@@ -159,16 +161,16 @@ static void afs_end_vnode_operation(struct afs_operation *op)
{
_enter("");
- if (op->error == -EDESTADDRREQ ||
- op->error == -EADDRNOTAVAIL ||
- op->error == -ENETUNREACH ||
- op->error == -EHOSTUNREACH)
+ switch (afs_op_error(op)) {
+ case -EDESTADDRREQ:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
afs_dump_edestaddrreq(op);
+ break;
+ }
afs_drop_io_locks(op);
-
- if (op->error == -ECONNABORTED)
- op->error = afs_abort_to_error(op->ac.abort_code);
}
/*
@@ -179,37 +181,43 @@ void afs_wait_for_operation(struct afs_operation *op)
_enter("");
while (afs_select_fileserver(op)) {
- op->cb_s_break = op->server->cb_s_break;
+ op->call_responded = false;
+ op->call_error = 0;
+ op->call_abort_code = 0;
if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
op->ops->issue_yfs_rpc)
op->ops->issue_yfs_rpc(op);
else if (op->ops->issue_afs_rpc)
op->ops->issue_afs_rpc(op);
else
- op->ac.error = -ENOTSUPP;
-
- if (op->call)
- op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
+ op->call_error = -ENOTSUPP;
+
+ if (op->call) {
+ afs_wait_for_call_to_complete(op->call);
+ op->call_abort_code = op->call->abort_code;
+ op->call_error = op->call->error;
+ op->call_responded = op->call->responded;
+ afs_put_call(op->call);
+ }
}
- switch (op->error) {
- case 0:
+ if (op->call_responded)
+ set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
+
+ if (!afs_op_error(op)) {
_debug("success");
op->ops->success(op);
- break;
- case -ECONNABORTED:
+ } else if (op->cumul_error.aborted) {
if (op->ops->aborted)
op->ops->aborted(op);
- fallthrough;
- default:
+ } else {
if (op->ops->failed)
op->ops->failed(op);
- break;
}
afs_end_vnode_operation(op);
- if (op->error == 0 && op->ops->edit_dir) {
+ if (!afs_op_error(op) && op->ops->edit_dir) {
_debug("edit_dir");
op->ops->edit_dir(op);
}
@@ -221,7 +229,8 @@ void afs_wait_for_operation(struct afs_operation *op)
*/
int afs_put_operation(struct afs_operation *op)
{
- int i, ret = op->error;
+ struct afs_addr_list *alist;
+ int i, ret = afs_op_error(op);
_enter("op=%08x,%d", op->debug_id, ret);
@@ -243,9 +252,19 @@ int afs_put_operation(struct afs_operation *op)
kfree(op->more_files);
}
- afs_end_cursor(&op->ac);
+ if (op->estate) {
+ alist = op->estate->addresses;
+ if (alist) {
+ if (op->call_responded &&
+ op->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ }
+ }
+
+ afs_clear_server_states(op);
afs_put_serverlist(op->net, op->server_list);
- afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op);
+ afs_put_volume(op->volume, afs_volume_trace_put_put_op);
key_put(op->key);
kfree(op);
return ret;
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index daaf3810cc92..580de4adaaf6 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -15,6 +15,42 @@
static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ;
static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ;
+struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
+ enum afs_estate_trace where)
+{
+ if (estate) {
+ int r;
+
+ __refcount_inc(&estate->ref, &r);
+ trace_afs_estate(estate->server_id, estate->probe_seq, r, where);
+ }
+ return estate;
+}
+
+static void afs_endpoint_state_rcu(struct rcu_head *rcu)
+{
+ struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu);
+
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_free);
+ afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate);
+ kfree(estate);
+}
+
+void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where)
+{
+ if (estate) {
+ unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq;
+ bool dead;
+ int r;
+
+ dead = __refcount_dec_and_test(&estate->ref, &r);
+ trace_afs_estate(server_id, probe_seq, r, where);
+ if (dead)
+ call_rcu(&estate->rcu, afs_endpoint_state_rcu);
+ }
+}
+
/*
* Start the probe polling timer. We have to supply it with an inc on the
* outstanding server count.
@@ -38,9 +74,10 @@ static void afs_schedule_fs_probe(struct afs_net *net,
/*
* Handle the completion of a set of probes.
*/
-static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server)
+static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate)
{
- bool responded = server->probe.responded;
+ bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags);
write_seqlock(&net->fs_lock);
if (responded) {
@@ -50,6 +87,7 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server
clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
list_add_tail(&server->probe_link, &net->fs_probe_fast);
}
+
write_sequnlock(&net->fs_lock);
afs_schedule_fs_probe(net, server, !responded);
@@ -58,12 +96,13 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server
/*
* Handle the completion of a probe.
*/
-static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server)
+static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate)
{
_enter("");
- if (atomic_dec_and_test(&server->probe_outstanding))
- afs_finished_fs_probe(net, server);
+ if (atomic_dec_and_test(&estate->nr_probing))
+ afs_finished_fs_probe(net, server, estate);
wake_up_all(&server->probe_wq);
}
@@ -74,24 +113,22 @@ static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server
*/
static void afs_fs_probe_not_done(struct afs_net *net,
struct afs_server *server,
- struct afs_addr_cursor *ac)
+ struct afs_endpoint_state *estate,
+ int index)
{
- struct afs_addr_list *alist = ac->alist;
- unsigned int index = ac->index;
-
_enter("");
trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail);
spin_lock(&server->probe_lock);
- server->probe.local_failure = true;
- if (server->probe.error == 0)
- server->probe.error = -ENOMEM;
+ set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
+ if (estate->error == 0)
+ estate->error = -ENOMEM;
- set_bit(index, &alist->failed);
+ set_bit(index, &estate->failed_set);
spin_unlock(&server->probe_lock);
- return afs_done_one_fs_probe(net, server);
+ return afs_done_one_fs_probe(net, server, estate);
}
/*
@@ -100,30 +137,34 @@ static void afs_fs_probe_not_done(struct afs_net *net,
*/
void afs_fileserver_probe_result(struct afs_call *call)
{
- struct afs_addr_list *alist = call->alist;
+ struct afs_endpoint_state *estate = call->probe;
+ struct afs_addr_list *alist = estate->addresses;
+ struct afs_address *addr = &alist->addrs[call->probe_index];
struct afs_server *server = call->server;
- unsigned int index = call->addr_ix;
- unsigned int rtt_us = 0, cap0;
+ unsigned int index = call->probe_index;
+ unsigned int rtt_us = -1, cap0;
int ret = call->error;
_enter("%pU,%u", &server->uuid, index);
+ WRITE_ONCE(addr->last_error, ret);
+
spin_lock(&server->probe_lock);
switch (ret) {
case 0:
- server->probe.error = 0;
+ estate->error = 0;
goto responded;
case -ECONNABORTED:
- if (!server->probe.responded) {
- server->probe.abort_code = call->abort_code;
- server->probe.error = ret;
+ if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) {
+ estate->abort_code = call->abort_code;
+ estate->error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
- clear_bit(index, &alist->responded);
- server->probe.local_failure = true;
+ clear_bit(index, &estate->responsive_set);
+ set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
@@ -136,29 +177,29 @@ void afs_fileserver_probe_result(struct afs_call *call)
case -ETIMEDOUT:
case -ETIME:
default:
- clear_bit(index, &alist->responded);
- set_bit(index, &alist->failed);
- if (!server->probe.responded &&
- (server->probe.error == 0 ||
- server->probe.error == -ETIMEDOUT ||
- server->probe.error == -ETIME))
- server->probe.error = ret;
+ clear_bit(index, &estate->responsive_set);
+ set_bit(index, &estate->failed_set);
+ if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) &&
+ (estate->error == 0 ||
+ estate->error == -ETIMEDOUT ||
+ estate->error == -ETIME))
+ estate->error = ret;
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
}
responded:
- clear_bit(index, &alist->failed);
+ clear_bit(index, &estate->failed_set);
if (call->service_id == YFS_FS_SERVICE) {
- server->probe.is_yfs = true;
+ set_bit(AFS_ESTATE_IS_YFS, &estate->flags);
set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
} else {
- server->probe.not_yfs = true;
- if (!server->probe.is_yfs) {
+ set_bit(AFS_ESTATE_NOT_YFS, &estate->flags);
+ if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) {
clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
}
cap0 = ntohl(call->tmp);
if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
@@ -167,116 +208,136 @@ responded:
clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
}
- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
- if (rtt_us < server->probe.rtt) {
- server->probe.rtt = rtt_us;
+ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
+ if (rtt_us < estate->rtt) {
+ estate->rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
}
smp_wmb(); /* Set rtt before responded. */
- server->probe.responded = true;
- set_bit(index, &alist->responded);
+ set_bit(AFS_ESTATE_RESPONDED, &estate->flags);
+ set_bit(index, &estate->responsive_set);
set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
out:
spin_unlock(&server->probe_lock);
- _debug("probe %pU [%u] %pISpc rtt=%u ret=%d",
- &server->uuid, index, &alist->addrs[index].transport,
+ trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us);
+ _debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d",
+ estate->probe_seq, &server->uuid, index,
+ rxrpc_kernel_remote_addr(alist->addrs[index].peer),
rtt_us, ret);
- return afs_done_one_fs_probe(call->net, server);
+ return afs_done_one_fs_probe(call->net, server, estate);
}
/*
- * Probe one or all of a fileserver's addresses to find out the best route and
- * to query its capabilities.
+ * Probe all of a fileserver's addresses to find out the best route and to
+ * query its capabilities.
*/
void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct key *key, bool all)
+ struct afs_addr_list *new_alist, struct key *key)
{
- struct afs_addr_cursor ac = {
- .index = 0,
- };
+ struct afs_endpoint_state *estate, *old;
+ struct afs_addr_list *alist;
+ unsigned long unprobed;
_enter("%pU", &server->uuid);
- read_lock(&server->fs_lock);
- ac.alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- afs_get_addrlist(ac.alist);
- read_unlock(&server->fs_lock);
+ estate = kzalloc(sizeof(*estate), GFP_KERNEL);
+ if (!estate)
+ return;
+
+ refcount_set(&estate->ref, 1);
+ estate->server_id = server->debug_id;
+ estate->rtt = UINT_MAX;
+
+ write_lock(&server->fs_lock);
+
+ old = rcu_dereference_protected(server->endpoint_state,
+ lockdep_is_held(&server->fs_lock));
+ estate->responsive_set = old->responsive_set;
+ estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
+ afs_alist_trace_get_estate);
+ alist = estate->addresses;
+ estate->probe_seq = ++server->probe_counter;
+ atomic_set(&estate->nr_probing, alist->nr_addrs);
+
+ rcu_assign_pointer(server->endpoint_state, estate);
+ set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
+ write_unlock(&server->fs_lock);
+
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_alloc_probe);
+
+ afs_get_address_preferences(net, alist);
server->probed_at = jiffies;
- atomic_set(&server->probe_outstanding, all ? ac.alist->nr_addrs : 1);
- memset(&server->probe, 0, sizeof(server->probe));
- server->probe.rtt = UINT_MAX;
-
- ac.index = ac.alist->preferred;
- if (ac.index < 0 || ac.index >= ac.alist->nr_addrs)
- all = true;
-
- if (all) {
- for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++)
- if (!afs_fs_get_capabilities(net, server, &ac, key))
- afs_fs_probe_not_done(net, server, &ac);
- } else {
- if (!afs_fs_get_capabilities(net, server, &ac, key))
- afs_fs_probe_not_done(net, server, &ac);
+ unprobed = (1UL << alist->nr_addrs) - 1;
+ while (unprobed) {
+ unsigned int index = 0, i;
+ int best_prio = -1;
+
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (test_bit(i, &unprobed) &&
+ alist->addrs[i].prio > best_prio) {
+ index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
+ __clear_bit(index, &unprobed);
+
+ trace_afs_fs_probe(server, true, estate, index, 0, 0, 0);
+ if (!afs_fs_get_capabilities(net, server, estate, index, key))
+ afs_fs_probe_not_done(net, server, estate, index);
}
- afs_put_addrlist(ac.alist);
+ afs_put_endpoint_state(old, afs_estate_trace_put_probe);
}
/*
- * Wait for the first as-yet untried fileserver to respond.
+ * Wait for the first as-yet untried fileserver to respond, for the probe state
+ * to be superseded or for all probes to finish.
*/
-int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
+int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr)
{
- struct wait_queue_entry *waits;
- struct afs_server *server;
- unsigned int rtt = UINT_MAX, rtt_s;
- bool have_responders = false;
- int pref = -1, i;
+ struct afs_endpoint_state *estate;
+ struct afs_server_list *slist = op->server_list;
+ bool still_probing = true;
+ int ret = 0, i;
- _enter("%u,%lx", slist->nr_servers, untried);
+ _enter("%u", slist->nr_servers);
- /* Only wait for servers that have a probe outstanding. */
for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- if (!atomic_read(&server->probe_outstanding))
- __clear_bit(i, &untried);
- if (server->probe.responded)
- have_responders = true;
- }
+ estate = states[i].endpoint_state;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
+ return 2;
+ if (atomic_read(&estate->nr_probing))
+ still_probing = true;
+ if (estate->responsive_set & states[i].untried_addrs)
+ return 1;
}
- if (have_responders || !untried)
+ if (!still_probing)
return 0;
- waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL);
- if (!waits)
- return -ENOMEM;
-
- for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- init_waitqueue_entry(&waits[i], current);
- add_wait_queue(&server->probe_wq, &waits[i]);
- }
- }
+ for (i = 0; i < slist->nr_servers; i++)
+ add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
for (;;) {
- bool still_probing = false;
+ still_probing = false;
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- if (server->probe.responded)
- goto stop;
- if (atomic_read(&server->probe_outstanding))
- still_probing = true;
+ estate = states[i].endpoint_state;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) {
+ ret = 2;
+ goto stop;
+ }
+ if (atomic_read(&estate->nr_probing))
+ still_probing = true;
+ if (estate->responsive_set & states[i].untried_addrs) {
+ ret = 1;
+ goto stop;
}
}
@@ -288,28 +349,12 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
stop:
set_current_state(TASK_RUNNING);
- for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- rtt_s = READ_ONCE(server->rtt);
- if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) &&
- rtt_s < rtt) {
- pref = i;
- rtt = rtt_s;
- }
-
- remove_wait_queue(&server->probe_wq, &waits[i]);
- }
- }
-
- kfree(waits);
-
- if (pref == -1 && signal_pending(current))
- return -ERESTARTSYS;
+ for (i = 0; i < slist->nr_servers; i++)
+ remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
- if (pref >= 0)
- slist->preferred = pref;
- return 0;
+ if (!ret && signal_pending(current))
+ ret = -ERESTARTSYS;
+ return ret;
}
/*
@@ -327,7 +372,7 @@ void afs_fs_probe_timer(struct timer_list *timer)
/*
* Dispatch a probe to a server.
*/
-static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all)
+static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server)
__releases(&net->fs_lock)
{
struct key *key = NULL;
@@ -340,7 +385,7 @@ static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server
afs_get_server(server, afs_server_trace_get_probe);
write_sequnlock(&net->fs_lock);
- afs_fs_probe_fileserver(net, server, key, all);
+ afs_fs_probe_fileserver(net, server, NULL, key);
afs_put_server(net, server, afs_server_trace_put_probe);
}
@@ -352,7 +397,7 @@ void afs_probe_fileserver(struct afs_net *net, struct afs_server *server)
{
write_seqlock(&net->fs_lock);
if (!list_empty(&server->probe_link))
- return afs_dispatch_fs_probe(net, server, true);
+ return afs_dispatch_fs_probe(net, server);
write_sequnlock(&net->fs_lock);
}
@@ -412,7 +457,7 @@ again:
_debug("probe %pU", &server->uuid);
if (server && (first_pass || !need_resched())) {
- afs_dispatch_fs_probe(net, server, server == fast);
+ afs_dispatch_fs_probe(net, server);
first_pass = false;
goto again;
}
@@ -436,12 +481,13 @@ again:
/*
* Wait for a probe on a particular fileserver to complete for 2s.
*/
-int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
+int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
+ unsigned long exclude, bool is_intr)
{
struct wait_queue_entry wait;
unsigned long timo = 2 * HZ;
- if (atomic_read(&server->probe_outstanding) == 0)
+ if (atomic_read(&estate->nr_probing) == 0)
goto dont_wait;
init_wait_entry(&wait, 0);
@@ -449,8 +495,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
prepare_to_wait_event(&server->probe_wq, &wait,
is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (timo == 0 ||
- server->probe.responded ||
- atomic_read(&server->probe_outstanding) == 0 ||
+ test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) ||
+ (estate->responsive_set & ~exclude) ||
+ atomic_read(&estate->nr_probing) == 0 ||
(is_intr && signal_pending(current)))
break;
timo = schedule_timeout(timo);
@@ -459,7 +506,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
finish_wait(&server->probe_wq, &wait);
dont_wait:
- if (server->probe.responded)
+ if (estate->responsive_set & ~exclude)
+ return 1;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
return 0;
if (is_intr && signal_pending(current))
return -ERESTARTSYS;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 7d37f63ef0f0..79cd30775b7a 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -290,6 +290,7 @@ void afs_fs_fetch_status(struct afs_operation *op)
bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -442,6 +443,7 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
bp[6] = 0;
bp[7] = htonl(lower_32_bits(req->len));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -476,6 +478,7 @@ void afs_fs_fetch_data(struct afs_operation *op)
bp[4] = htonl(lower_32_bits(req->pos));
bp[5] = htonl(lower_32_bits(req->len));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -559,6 +562,7 @@ void afs_fs_create_file(struct afs_operation *op)
*bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -612,6 +616,7 @@ void afs_fs_make_dir(struct afs_operation *op)
*bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -685,6 +690,7 @@ void afs_fs_remove_file(struct afs_operation *op)
bp = (void *) bp + padsz;
}
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -732,6 +738,7 @@ void afs_fs_remove_dir(struct afs_operation *op)
bp = (void *) bp + padsz;
}
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -812,6 +819,7 @@ void afs_fs_link(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call1(call, &vp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -907,6 +915,7 @@ void afs_fs_symlink(struct afs_operation *op)
*bp++ = htonl(S_IRWXUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1003,6 +1012,7 @@ void afs_fs_rename(struct afs_operation *op)
bp = (void *) bp + n_padsz;
}
+ call->fid = orig_dvp->fid;
trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1090,6 +1100,7 @@ static void afs_fs_store_data64(struct afs_operation *op)
*bp++ = htonl(upper_32_bits(op->store.i_size));
*bp++ = htonl(lower_32_bits(op->store.i_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1140,6 +1151,7 @@ void afs_fs_store_data(struct afs_operation *op)
*bp++ = htonl(lower_32_bits(op->store.size));
*bp++ = htonl(lower_32_bits(op->store.i_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1206,6 +1218,7 @@ static void afs_fs_setattr_size64(struct afs_operation *op)
*bp++ = htonl(upper_32_bits(attr->ia_size)); /* new file length */
*bp++ = htonl(lower_32_bits(attr->ia_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1247,6 +1260,7 @@ static void afs_fs_setattr_size(struct afs_operation *op)
*bp++ = 0; /* size of write */
*bp++ = htonl(attr->ia_size); /* new file length */
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1283,6 +1297,7 @@ void afs_fs_setattr(struct afs_operation *op)
xdr_encode_AFS_StoreStatus(&bp, op->setattr.attr);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1446,6 +1461,7 @@ void afs_fs_get_volume_status(struct afs_operation *op)
bp[0] = htonl(FSGETVOLUMESTATUS);
bp[1] = htonl(vp->fid.vid);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1528,6 +1544,7 @@ void afs_fs_set_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.unique);
*bp++ = htonl(op->lock.type);
+ call->fid = vp->fid;
trace_afs_make_fs_calli(call, &vp->fid, op->lock.type);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1554,6 +1571,7 @@ void afs_fs_extend_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1580,6 +1598,7 @@ void afs_fs_release_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1605,13 +1624,12 @@ static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = {
/*
* Flush all the callbacks we have on a server.
*/
-int afs_fs_give_up_all_callbacks(struct afs_net *net,
- struct afs_server *server,
- struct afs_addr_cursor *ac,
- struct key *key)
+int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
+ struct afs_address *addr, struct key *key)
{
struct afs_call *call;
__be32 *bp;
+ int ret;
_enter("");
@@ -1619,15 +1637,22 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
if (!call)
return -ENOMEM;
- call->key = key;
+ call->key = key;
+ call->peer = rxrpc_kernel_get_peer(addr->peer);
+ call->service_id = server->service_id;
/* marshall the parameters */
bp = call->request;
*bp++ = htonl(FSGIVEUPALLCALLBACKS);
call->server = afs_use_server(server, afs_server_trace_give_up_cb);
- afs_make_call(ac, call, GFP_NOFS);
- return afs_wait_for_call_to_complete(call, ac);
+ afs_make_call(call, GFP_NOFS);
+ afs_wait_for_call_to_complete(call);
+ ret = call->error;
+ if (call->responded)
+ set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
+ afs_put_call(call);
+ return ret;
}
/*
@@ -1689,6 +1714,12 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
return 0;
}
+static void afs_fs_get_capabilities_destructor(struct afs_call *call)
+{
+ afs_put_endpoint_state(call->probe, afs_estate_trace_put_getcaps);
+ afs_flat_call_destructor(call);
+}
+
/*
* FS.GetCapabilities operation type
*/
@@ -1697,7 +1728,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
.op = afs_FS_GetCapabilities,
.deliver = afs_deliver_fs_get_capabilities,
.done = afs_fileserver_probe_result,
- .destructor = afs_flat_call_destructor,
+ .destructor = afs_fs_get_capabilities_destructor,
};
/*
@@ -1707,7 +1738,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
* ->done() - otherwise we return false to indicate we didn't even try.
*/
bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
- struct afs_addr_cursor *ac, struct key *key)
+ struct afs_endpoint_state *estate, unsigned int addr_index,
+ struct key *key)
{
struct afs_call *call;
__be32 *bp;
@@ -1718,10 +1750,14 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
if (!call)
return false;
- call->key = key;
- call->server = afs_use_server(server, afs_server_trace_get_caps);
- call->upgrade = true;
- call->async = true;
+ call->key = key;
+ call->server = afs_use_server(server, afs_server_trace_get_caps);
+ call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer);
+ call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps);
+ call->probe_index = addr_index;
+ call->service_id = server->service_id;
+ call->upgrade = true;
+ call->async = true;
call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
/* marshall the parameters */
@@ -1729,7 +1765,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
*bp++ = htonl(FSGETCAPABILITIES);
trace_afs_make_fs_call(call, NULL);
- afs_make_call(ac, call, GFP_NOFS);
+ afs_make_call(call, GFP_NOFS);
afs_put_call(call);
return true;
}
@@ -1853,7 +1889,10 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
return ret;
bp = call->buffer;
- xdr_decode_AFSVolSync(&bp, &op->volsync);
+ /* Unfortunately, prior to OpenAFS-1.6, volsync here is filled
+ * with rubbish.
+ */
+ xdr_decode_AFSVolSync(&bp, NULL);
call->unmarshall++;
fallthrough;
@@ -1899,7 +1938,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op)
int i;
if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) {
- op->error = -ENOTSUPP;
+ afs_op_set_error(op, -ENOTSUPP);
return;
}
@@ -1928,6 +1967,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op)
*bp++ = htonl(op->more_files[i].fid.unique);
}
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -2033,6 +2073,7 @@ void afs_fs_fetch_acl(struct afs_operation *op)
bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
@@ -2078,6 +2119,7 @@ void afs_fs_store_acl(struct afs_operation *op)
if (acl->size != size)
memset((void *)&bp[5] + acl->size, 0, size - acl->size);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 78efc9719349..4f04f6f33f46 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -85,8 +85,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
write_seqlock(&vnode->cb_lock);
- vnode->cb_v_break = op->cb_v_break;
- vnode->cb_s_break = op->cb_s_break;
+ vnode->cb_v_check = op->cb_v_break;
vnode->status = *status;
t = status->mtime_client;
@@ -146,11 +145,10 @@ static int afs_inode_init_from_status(struct afs_operation *op,
if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver
* didn't give us a callback) */
- vnode->cb_expires_at = ktime_get_real_seconds();
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
} else {
- vnode->cb_expires_at = vp->scb.callback.expires_at;
vnode->cb_server = op->server;
- set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at);
}
write_sequnlock(&vnode->cb_lock);
@@ -214,7 +212,8 @@ static void afs_apply_status(struct afs_operation *op,
vnode->status = *status;
if (vp->dv_before + vp->dv_delta != status->data_version) {
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+ if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) &&
+ atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE)
pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
vnode->fid.vid, vnode->fid.vnode,
(unsigned long long)vp->dv_before + vp->dv_delta,
@@ -268,9 +267,9 @@ static void afs_apply_callback(struct afs_operation *op,
struct afs_vnode *vnode = vp->vnode;
if (!afs_cb_is_broken(vp->cb_break_before, vnode)) {
- vnode->cb_expires_at = cb->expires_at;
- vnode->cb_server = op->server;
- set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ if (op->volume->type == AFSVL_RWVOL)
+ vnode->cb_server = op->server;
+ atomic64_set(&vnode->cb_expires_at, cb->expires_at);
}
}
@@ -331,7 +330,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
if (vnode->netfs.inode.i_state & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
- op->error = ret;
+ afs_op_set_error(op, ret);
if (ret == 0)
afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb);
} else {
@@ -542,7 +541,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
BUG_ON(!(inode->i_state & I_NEW));
vnode = AFS_FS_I(inode);
- vnode->cb_v_break = as->volume->cb_v_break,
+ vnode->cb_v_check = atomic_read(&as->volume->cb_v_break),
afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
@@ -573,180 +572,6 @@ error:
}
/*
- * mark the data attached to an inode as obsolete due to a write on the server
- * - might also want to ditch all the outstanding writes and dirty pages
- */
-static void afs_zap_data(struct afs_vnode *vnode)
-{
- _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
-
- afs_invalidate_cache(vnode, 0);
-
- /* nuke all the non-dirty pages that aren't locked, mapped or being
- * written back in a regular file and completely discard the pages in a
- * directory or symlink */
- if (S_ISREG(vnode->netfs.inode.i_mode))
- invalidate_remote_inode(&vnode->netfs.inode);
- else
- invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
-}
-
-/*
- * Check to see if we have a server currently serving this volume and that it
- * hasn't been reinitialised or dropped from the list.
- */
-static bool afs_check_server_good(struct afs_vnode *vnode)
-{
- struct afs_server_list *slist;
- struct afs_server *server;
- bool good;
- int i;
-
- if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break))
- return true;
-
- rcu_read_lock();
-
- slist = rcu_dereference(vnode->volume->servers);
- for (i = 0; i < slist->nr_servers; i++) {
- server = slist->servers[i].server;
- if (server == vnode->cb_server) {
- good = (vnode->cb_s_break == server->cb_s_break);
- rcu_read_unlock();
- return good;
- }
- }
-
- rcu_read_unlock();
- return false;
-}
-
-/*
- * Check the validity of a vnode/inode.
- */
-bool afs_check_validity(struct afs_vnode *vnode)
-{
- enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
- time64_t now = ktime_get_real_seconds();
- unsigned int cb_break;
- int seq = 0;
-
- do {
- read_seqbegin_or_lock(&vnode->cb_lock, &seq);
- cb_break = vnode->cb_break;
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- if (vnode->cb_v_break != vnode->volume->cb_v_break)
- need_clear = afs_cb_break_for_v_break;
- else if (!afs_check_server_good(vnode))
- need_clear = afs_cb_break_for_s_reinit;
- else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
- need_clear = afs_cb_break_for_zap;
- else if (vnode->cb_expires_at - 10 <= now)
- need_clear = afs_cb_break_for_lapsed;
- } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- ;
- } else {
- need_clear = afs_cb_break_no_promise;
- }
-
- } while (need_seqretry(&vnode->cb_lock, seq));
-
- done_seqretry(&vnode->cb_lock, seq);
-
- if (need_clear == afs_cb_break_no_break)
- return true;
-
- write_seqlock(&vnode->cb_lock);
- if (need_clear == afs_cb_break_no_promise)
- vnode->cb_v_break = vnode->volume->cb_v_break;
- else if (cb_break == vnode->cb_break)
- __afs_break_callback(vnode, need_clear);
- else
- trace_afs_cb_miss(&vnode->fid, need_clear);
- write_sequnlock(&vnode->cb_lock);
- return false;
-}
-
-/*
- * Returns true if the pagecache is still valid. Does not sleep.
- */
-bool afs_pagecache_valid(struct afs_vnode *vnode)
-{
- if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->netfs.inode.i_nlink)
- clear_nlink(&vnode->netfs.inode);
- return true;
- }
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
- afs_check_validity(vnode))
- return true;
-
- return false;
-}
-
-/*
- * validate a vnode/inode
- * - there are several things we need to check
- * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
- * symlink)
- * - parent dir metadata changed (security changes)
- * - dentry data changed (write, truncate)
- * - dentry metadata changed (security changes)
- */
-int afs_validate(struct afs_vnode *vnode, struct key *key)
-{
- int ret;
-
- _enter("{v={%llx:%llu} fl=%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, vnode->flags,
- key_serial(key));
-
- if (afs_pagecache_valid(vnode))
- goto valid;
-
- down_write(&vnode->validate_lock);
-
- /* if the promise has expired, we need to check the server again to get
- * a new promise - note that if the (parent) directory's metadata was
- * changed then the security may be different and we may no longer have
- * access */
- if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- _debug("not promised");
- ret = afs_fetch_status(vnode, key, false, NULL);
- if (ret < 0) {
- if (ret == -ENOENT) {
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
- goto error_unlock;
- }
- _debug("new promise [fl=%lx]", vnode->flags);
- }
-
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- _debug("file already deleted");
- ret = -ESTALE;
- goto error_unlock;
- }
-
- /* if the vnode's data version number changed then its contents are
- * different */
- if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
- afs_zap_data(vnode);
- up_write(&vnode->validate_lock);
-valid:
- _leave(" = 0");
- return 0;
-
-error_unlock:
- up_write(&vnode->validate_lock);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
* read the attributes of an inode
*/
int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
@@ -755,13 +580,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
struct key *key;
- int ret, seq = 0;
+ int ret, seq;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
if (vnode->volume &&
!(query_flags & AT_STATX_DONT_SYNC) &&
- !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE) {
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key))
return PTR_ERR(key);
@@ -772,7 +597,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
}
do {
- read_seqbegin_or_lock(&vnode->cb_lock, &seq);
+ seq = read_seqbegin(&vnode->cb_lock);
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
stat->nlink > 0)
@@ -784,9 +609,8 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
*/
if (S_ISDIR(inode->i_mode))
stat->size = vnode->netfs.remote_i_size;
- } while (need_seqretry(&vnode->cb_lock, seq));
+ } while (read_seqretry(&vnode->cb_lock, seq));
- done_seqretry(&vnode->cb_lock, seq);
return 0;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7385d62c8cf5..e33ace259cc6 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -33,6 +33,7 @@
struct pagevec;
struct afs_call;
struct afs_vnode;
+struct afs_server_probe;
/*
* Partial file-locking emulation mode. (The problem being that AFS3 only
@@ -73,21 +74,51 @@ enum afs_call_state {
};
/*
+ * Address preferences.
+ */
+struct afs_addr_preference {
+ union {
+ struct in_addr ipv4_addr; /* AF_INET address to compare against */
+ struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */
+ };
+ sa_family_t family; /* Which address to use */
+ u16 prio; /* Priority */
+ u8 subnet_mask; /* How many bits to compare */
+};
+
+struct afs_addr_preference_list {
+ struct rcu_head rcu;
+ u16 version; /* Incremented when prefs list changes */
+ u8 ipv6_off; /* Offset of IPv6 addresses */
+ u8 nr; /* Number of addresses in total */
+ u8 max_prefs; /* Number of prefs allocated */
+ struct afs_addr_preference prefs[] __counted_by(max_prefs);
+};
+
+struct afs_address {
+ struct rxrpc_peer *peer;
+ short last_error; /* Last error from this address */
+ u16 prio; /* Address priority */
+};
+
+/*
* List of server addresses.
*/
struct afs_addr_list {
struct rcu_head rcu;
refcount_t usage;
u32 version; /* Version */
+ unsigned int debug_id;
+ unsigned int addr_pref_version; /* Version of address preference list */
unsigned char max_addrs;
unsigned char nr_addrs;
unsigned char preferred; /* Preferred address */
unsigned char nr_ipv4; /* Number of IPv4 addresses */
enum dns_record_source source:8;
enum dns_lookup_status status:8;
- unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */
unsigned long responded; /* Mask of addrs that responded */
- struct sockaddr_rxrpc addrs[] __counted_by(max_addrs);
+ struct afs_address addrs[] __counted_by(max_addrs);
#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};
@@ -96,11 +127,11 @@ struct afs_addr_list {
*/
struct afs_call {
const struct afs_call_type *type; /* type of call */
- struct afs_addr_list *alist; /* Address is alist[addr_ix] */
wait_queue_head_t waitq; /* processes awaiting completion */
struct work_struct async_work; /* async I/O processor */
struct work_struct work; /* actual work processor */
struct rxrpc_call *rxcall; /* RxRPC call handle */
+ struct rxrpc_peer *peer; /* Remote endpoint */
struct key *key; /* security for this call */
struct afs_net *net; /* The network namespace */
struct afs_server *server; /* The fileserver record if fs op (pins ref) */
@@ -116,11 +147,14 @@ struct afs_call {
};
void *buffer; /* reply receive buffer */
union {
- long ret0; /* Value to reply with instead of 0 */
+ struct afs_endpoint_state *probe;
+ struct afs_addr_list *vl_probe;
struct afs_addr_list *ret_alist;
struct afs_vldb_entry *ret_vldb;
char *ret_str;
};
+ struct afs_fid fid; /* Primary vnode ID (or all zeroes) */
+ unsigned char probe_index; /* Index of the probed address (in ->probe or ->vl_probe) */
struct afs_operation *op;
unsigned int server_index;
refcount_t ref;
@@ -133,13 +167,13 @@ struct afs_call {
unsigned reply_max; /* maximum size of reply */
unsigned count2; /* count used in unmarshalling */
unsigned char unmarshall; /* unmarshalling phase */
- unsigned char addr_ix; /* Address in ->alist */
bool drop_ref; /* T if need to drop ref for incoming call */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
+ bool responded; /* Got a response from the call (may be abort) */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
u32 operation_ID; /* operation ID for an incoming call */
@@ -306,6 +340,8 @@ struct afs_net {
struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */
struct afs_sysnames *sysnames;
rwlock_t sysnames_lock;
+ struct afs_addr_preference_list __rcu *address_prefs;
+ u16 address_pref_version;
/* Statistics counters */
atomic_t n_lookup; /* Number of lookups done */
@@ -379,6 +415,7 @@ struct afs_cell {
unsigned int debug_id;
/* The volumes belonging to this cell */
+ struct rw_semaphore vs_lock; /* Lock for server->volumes */
struct rb_root volumes; /* Tree of volumes on this server */
struct hlist_head proc_volumes; /* procfs volume list */
seqlock_t volume_lock; /* For volumes */
@@ -386,9 +423,6 @@ struct afs_cell {
/* Active fileserver interaction state. */
struct rb_root fs_servers; /* afs_server (by server UUID) */
seqlock_t fs_lock; /* For fs_servers */
- struct rw_semaphore fs_open_mmaps_lock;
- struct list_head fs_open_mmaps; /* List of vnodes that are mmapped */
- atomic_t fs_s_break; /* Counter of CB.InitCallBackState messages */
/* VL server list. */
rwlock_t vl_servers_lock; /* Lock on vl_servers */
@@ -412,13 +446,14 @@ struct afs_vlserver {
rwlock_t lock; /* Lock on addresses */
refcount_t ref;
unsigned int rtt; /* Server's current RTT in uS */
+ unsigned int debug_id;
/* Probe state */
wait_queue_head_t probe_wq;
atomic_t probe_outstanding;
spinlock_t probe_lock;
struct {
- unsigned int rtt; /* RTT in uS */
+ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
u32 abort_code;
short error;
unsigned short flags;
@@ -428,6 +463,7 @@ struct afs_vlserver {
#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */
} probe;
+ u16 service_id; /* Service ID we're using */
u16 port;
u16 name_len; /* Length of name */
char name[]; /* Server name, case-flattened */
@@ -477,6 +513,7 @@ struct afs_vldb_entry {
#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
+ u8 vlsf_flags[AFS_NMAXNSERVERS];
short error;
u8 nr_servers; /* Number of server records */
u8 name_len;
@@ -484,6 +521,32 @@ struct afs_vldb_entry {
};
/*
+ * Fileserver endpoint state. This records the addresses of a fileserver's
+ * endpoints and the state and result of a round of probing on them. This
+ * allows the rotation algorithm to access those results without them being
+ * erased by a subsequent round of probing.
+ */
+struct afs_endpoint_state {
+ struct rcu_head rcu;
+ struct afs_addr_list *addresses; /* The addresses being probed */
+ unsigned long responsive_set; /* Bitset of responsive endpoints */
+ unsigned long failed_set; /* Bitset of endpoints we failed to probe */
+ refcount_t ref;
+ unsigned int server_id; /* Debug ID of server */
+ unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */
+ atomic_t nr_probing; /* Number of outstanding probes */
+ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
+ s32 abort_code;
+ short error;
+ unsigned long flags;
+#define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */
+#define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */
+#define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */
+#define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */
+#define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */
+};
+
+/*
* Record of fileserver with which we're actively communicating.
*/
struct afs_server {
@@ -493,7 +556,6 @@ struct afs_server {
struct afs_uuid _uuid;
};
- struct afs_addr_list __rcu *addresses;
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node uuid_rb; /* Link in net->fs_servers */
struct afs_server __rcu *uuid_next; /* Next server with same UUID */
@@ -502,7 +564,7 @@ struct afs_server {
struct hlist_node addr4_link; /* Link in net->fs_addresses4 */
struct hlist_node addr6_link; /* Link in net->fs_addresses6 */
struct hlist_node proc_link; /* Link in net->fs_proc */
- struct work_struct initcb_work; /* Work for CB.InitCallBackState* */
+ struct list_head volumes; /* RCU list of afs_server_entry objects */
struct afs_server *gc_next; /* Next server in manager's list */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
@@ -520,44 +582,47 @@ struct afs_server {
refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
+ u16 service_id; /* Service ID we're using. */
unsigned int rtt; /* Server's current RTT in uS */
unsigned int debug_id; /* Debugging ID for traces */
/* file service access */
rwlock_t fs_lock; /* access lock */
- /* callback promise management */
- unsigned cb_s_break; /* Break-everything counter. */
-
/* Probe state */
+ struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */
unsigned long probed_at; /* Time last probe was dispatched (jiffies) */
wait_queue_head_t probe_wq;
- atomic_t probe_outstanding;
+ unsigned int probe_counter; /* Number of probes issued */
spinlock_t probe_lock;
- struct {
- unsigned int rtt; /* RTT in uS */
- u32 abort_code;
- short error;
- bool responded:1;
- bool is_yfs:1;
- bool not_yfs:1;
- bool local_failure:1;
- } probe;
};
+enum afs_ro_replicating {
+ AFS_RO_NOT_REPLICATING, /* Not doing replication */
+ AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */
+ AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */
+} __mode(byte);
+
/*
* Replaceable volume server list.
*/
struct afs_server_entry {
struct afs_server *server;
+ struct afs_volume *volume;
+ struct list_head slink; /* Link in server->volumes */
+ time64_t cb_expires_at; /* Time at which volume-level callback expires */
+ unsigned long flags;
+#define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */
+#define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */
+#define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */
};
struct afs_server_list {
struct rcu_head rcu;
- afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */
refcount_t usage;
+ bool attached; /* T if attached to servers */
+ enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */
unsigned char nr_servers;
- unsigned char preferred; /* Preferred server */
unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */
unsigned int seq; /* Set to ->servers_seq when installed */
rwlock_t lock;
@@ -568,25 +633,23 @@ struct afs_server_list {
* Live AFS volume management.
*/
struct afs_volume {
- union {
- struct rcu_head rcu;
- afs_volid_t vid; /* volume ID */
- };
+ struct rcu_head rcu;
+ afs_volid_t vid; /* The volume ID of this volume */
+ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */
refcount_t ref;
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
struct hlist_node proc_link; /* Link in cell->proc_volumes */
struct super_block __rcu *sb; /* Superblock on which inodes reside */
+ struct work_struct destructor; /* Deferred destructor */
unsigned long flags;
#define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */
#define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */
#define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */
#define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */
-#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */
-#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */
-#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */
-#define AFS_VOLUME_RM_TREE 7 /* - Set if volume removed from cell->volumes */
+#define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */
+#define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_volume *cache; /* Caching cookie */
#endif
@@ -594,8 +657,21 @@ struct afs_volume {
rwlock_t servers_lock; /* Lock for ->servers */
unsigned int servers_seq; /* Incremented each time ->servers changes */
- unsigned cb_v_break; /* Break-everything counter. */
+ /* RO release tracking */
+ struct mutex volsync_lock; /* Time/state evaluation lock */
+ time64_t creation_time; /* Volume creation time (or TIME64_MIN) */
+ time64_t update_time; /* Volume update time (or TIME64_MIN) */
+
+ /* Callback management */
+ struct mutex cb_check_lock; /* Lock to control race to check after v_break */
+ time64_t cb_expires_at; /* Earliest volume callback expiry time */
+ atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */
+ atomic_t cb_v_break; /* Volume-break event counter. */
+ atomic_t cb_v_check; /* Volume-break has-been-checked counter. */
+ atomic_t cb_scrub; /* Scrub-all-data event counter. */
rwlock_t cb_v_break_lock;
+ struct rw_semaphore open_mmaps_lock;
+ struct list_head open_mmaps; /* List of vnodes that are mmapped */
afs_voltype_t type; /* type of volume */
char type_force; /* force volume type (suppress R/O -> R/W) */
@@ -634,7 +710,6 @@ struct afs_vnode {
spinlock_t wb_lock; /* lock for wb_keys */
spinlock_t lock; /* waitqueue/flags lock */
unsigned long flags;
-#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */
#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
@@ -660,13 +735,14 @@ struct afs_vnode {
struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */
void *cb_server; /* Server with callback/filelock */
atomic_t cb_nr_mmap; /* Number of mmaps */
- unsigned int cb_fs_s_break; /* Mass server break counter (cell->fs_s_break) */
- unsigned int cb_s_break; /* Mass break counter on ->server */
- unsigned int cb_v_break; /* Mass break counter on ->volume */
+ unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */
+ unsigned int cb_scrub; /* Scrub counter on ->volume */
unsigned int cb_break; /* Break counter on vnode */
+ unsigned int cb_v_check; /* Break check counter on ->volume */
seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */
- time64_t cb_expires_at; /* time at which callback expires */
+ atomic64_t cb_expires_at; /* time at which callback expires */
+#define AFS_NO_CB_PROMISE TIME64_MIN
};
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
@@ -714,40 +790,49 @@ struct afs_permits {
* Error prioritisation and accumulation.
*/
struct afs_error {
- short error; /* Accumulated error */
+ s32 abort_code; /* Cumulative abort code */
+ short error; /* Cumulative error */
bool responded; /* T if server responded */
-};
-
-/*
- * Cursor for iterating over a server's address list.
- */
-struct afs_addr_cursor {
- struct afs_addr_list *alist; /* Current address list (pins ref) */
- unsigned long tried; /* Tried addresses */
- signed char index; /* Current address */
- bool responded; /* T if the current address responded */
- unsigned short nr_iterations; /* Number of address iterations */
- short error;
- u32 abort_code;
+ bool aborted; /* T if ->error is from an abort */
};
/*
* Cursor for iterating over a set of volume location servers.
*/
struct afs_vl_cursor {
- struct afs_addr_cursor ac;
struct afs_cell *cell; /* The cell we're querying */
struct afs_vlserver_list *server_list; /* Current server list (pins ref) */
struct afs_vlserver *server; /* Server on which this resides */
+ struct afs_addr_list *alist; /* Current address list (pins ref) */
struct key *key; /* Key for the server */
- unsigned long untried; /* Bitmask of untried servers */
- short index; /* Current server */
- short error;
+ unsigned long untried_servers; /* Bitmask of untried servers */
+ unsigned long addr_tried; /* Tried addresses */
+ struct afs_error cumul_error; /* Cumulative error */
+ unsigned int debug_id;
+ s32 call_abort_code;
+ short call_error; /* Error from single call */
+ short server_index; /* Current server */
+ signed char addr_index; /* Current address */
unsigned short flags;
#define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */
#define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */
#define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */
- unsigned short nr_iterations; /* Number of server iterations */
+ short nr_iterations; /* Number of server iterations */
+ bool call_responded; /* T if the current address responded */
+};
+
+/*
+ * Fileserver state tracking for an operation. An array of these is kept,
+ * indexed by server index.
+ */
+struct afs_server_state {
+ /* Tracking of fileserver probe state. Other operations may interfere
+ * by probing a fileserver when accessing other volumes.
+ */
+ unsigned int probe_seq;
+ unsigned long untried_addrs; /* Addresses we haven't tried yet */
+ struct wait_queue_entry probe_waiter;
+ struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */
};
/*
@@ -768,7 +853,7 @@ struct afs_vnode_param {
struct afs_fid fid; /* Fid to access */
struct afs_status_cb scb; /* Returned status and callback promise */
afs_dataversion_t dv_before; /* Data version before the call */
- unsigned int cb_break_before; /* cb_break + cb_s_break before the call */
+ unsigned int cb_break_before; /* cb_break before the call */
u8 dv_delta; /* Expected change in data version */
bool put_vnode:1; /* T if we have a ref on the vnode */
bool need_io_lock:1; /* T if we need the I/O lock on this */
@@ -793,17 +878,17 @@ struct afs_operation {
struct afs_volume *volume; /* Volume being accessed */
struct afs_vnode_param file[2];
struct afs_vnode_param *more_files;
- struct afs_volsync volsync;
+ struct afs_volsync pre_volsync; /* Volsync before op */
+ struct afs_volsync volsync; /* Volsync returned by op */
struct dentry *dentry; /* Dentry to be altered */
struct dentry *dentry_2; /* Second dentry to be altered */
struct timespec64 mtime; /* Modification time to record */
struct timespec64 ctime; /* Change time to set */
+ struct afs_error cumul_error; /* Cumulative error */
short nr_files; /* Number of entries in file[], more_files */
- short error;
unsigned int debug_id;
unsigned int cb_v_break; /* Volume break counter before op */
- unsigned int cb_s_break; /* Server break counter before op */
union {
struct {
@@ -848,13 +933,19 @@ struct afs_operation {
};
/* Fileserver iteration state */
- struct afs_addr_cursor ac;
struct afs_server_list *server_list; /* Current server list (pins ref) */
struct afs_server *server; /* Server we're using (ref pinned by server_list) */
+ struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */
+ struct afs_server_state *server_states; /* States of the servers involved */
struct afs_call *call;
- unsigned long untried; /* Bitmask of untried servers */
- short index; /* Current server */
- unsigned short nr_iterations; /* Number of server iterations */
+ unsigned long untried_servers; /* Bitmask of untried servers */
+ unsigned long addr_tried; /* Tried addresses */
+ s32 call_abort_code; /* Abort code from single call */
+ short call_error; /* Error from single call */
+ short server_index; /* Current server */
+ short nr_iterations; /* Number of server iterations */
+ signed char addr_index; /* Current address */
+ bool call_responded; /* T if the current address responded */
unsigned int flags;
#define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */
@@ -956,31 +1047,32 @@ static inline bool afs_is_folio_dirty_mmapped(unsigned long priv)
/*
* addr_list.c
*/
-static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist)
-{
- if (alist)
- refcount_inc(&alist->usage);
- return alist;
-}
-extern struct afs_addr_list *afs_alloc_addrlist(unsigned int,
- unsigned short,
- unsigned short);
-extern void afs_put_addrlist(struct afs_addr_list *);
+struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason);
+extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr);
+extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason);
extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
const char *, size_t, char,
unsigned short, unsigned short);
+bool afs_addr_list_same(const struct afs_addr_list *a,
+ const struct afs_addr_list *b);
extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
-extern bool afs_iterate_addresses(struct afs_addr_cursor *);
-extern int afs_end_cursor(struct afs_addr_cursor *);
-extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
-extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
+extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
+ __be32 xdr, u16 port);
+extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
+ __be32 *xdr, u16 port);
+
+/*
+ * addr_prefs.c
+ */
+int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size);
+void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist);
+void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist);
/*
* callback.c
*/
extern void afs_invalidate_mmap_work(struct work_struct *);
-extern void afs_server_init_callback_work(struct work_struct *work);
extern void afs_init_callback_state(struct afs_server *);
extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
@@ -988,13 +1080,15 @@ extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback
static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
{
- return vnode->cb_break + vnode->cb_v_break;
+ return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub;
}
static inline bool afs_cb_is_broken(unsigned int cb_break,
const struct afs_vnode *vnode)
{
- return cb_break != (vnode->cb_break + vnode->volume->cb_v_break);
+ return cb_break != (vnode->cb_break +
+ atomic_read(&vnode->volume->cb_ro_snapshot) +
+ atomic_read(&vnode->volume->cb_scrub));
}
/*
@@ -1110,10 +1204,11 @@ extern void afs_fs_get_volume_status(struct afs_operation *);
extern void afs_fs_set_lock(struct afs_operation *);
extern void afs_fs_extend_lock(struct afs_operation *);
extern void afs_fs_release_lock(struct afs_operation *);
-extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
- struct afs_addr_cursor *, struct key *);
-extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
- struct afs_addr_cursor *, struct key *);
+int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
+ struct afs_address *addr, struct key *key);
+bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate, unsigned int addr_index,
+ struct key *key);
extern void afs_fs_inline_bulk_status(struct afs_operation *);
struct afs_acl {
@@ -1133,11 +1228,6 @@ extern bool afs_begin_vnode_operation(struct afs_operation *);
extern void afs_wait_for_operation(struct afs_operation *);
extern int afs_do_sync_operation(struct afs_operation *);
-static inline void afs_op_nomem(struct afs_operation *op)
-{
- op->error = -ENOMEM;
-}
-
static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n,
struct afs_vnode *vnode)
{
@@ -1154,12 +1244,17 @@ static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n,
/*
* fs_probe.c
*/
+struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
+ enum afs_estate_trace where);
+void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where);
extern void afs_fileserver_probe_result(struct afs_call *);
-extern void afs_fs_probe_fileserver(struct afs_net *, struct afs_server *, struct key *, bool);
-extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
+void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_addrs, struct key *key);
+int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
-extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
+int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
+ unsigned long exclude, bool is_intr);
extern void afs_fs_probe_cleanup(struct afs_net *);
/*
@@ -1173,9 +1268,6 @@ extern int afs_ilookup5_test_by_fid(struct inode *, void *);
extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
-extern bool afs_check_validity(struct afs_vnode *);
-extern int afs_validate(struct afs_vnode *, struct key *);
-bool afs_pagecache_valid(struct afs_vnode *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
struct kstat *, u32, unsigned int);
extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *);
@@ -1231,6 +1323,31 @@ static inline void __afs_stat(atomic_t *s)
extern int afs_abort_to_error(u32);
extern void afs_prioritise_error(struct afs_error *, int, u32);
+static inline void afs_op_nomem(struct afs_operation *op)
+{
+ op->cumul_error.error = -ENOMEM;
+}
+
+static inline int afs_op_error(const struct afs_operation *op)
+{
+ return op->cumul_error.error;
+}
+
+static inline s32 afs_op_abort_code(const struct afs_operation *op)
+{
+ return op->cumul_error.abort_code;
+}
+
+static inline int afs_op_set_error(struct afs_operation *op, int error)
+{
+ return op->cumul_error.error = error;
+}
+
+static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code)
+{
+ afs_prioritise_error(&op->cumul_error, error, abort_code);
+}
+
/*
* mntpt.c
*/
@@ -1261,6 +1378,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {}
/*
* rotate.c
*/
+void afs_clear_server_states(struct afs_operation *op);
extern bool afs_select_fileserver(struct afs_operation *);
extern void afs_dump_edestaddrreq(const struct afs_operation *);
@@ -1273,8 +1391,8 @@ extern int __net_init afs_open_socket(struct afs_net *);
extern void __net_exit afs_close_socket(struct afs_net *);
extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
-extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t);
-extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
+void afs_make_call(struct afs_call *call, gfp_t gfp);
+void afs_wait_for_call_to_complete(struct afs_call *call);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
const struct afs_call_type *,
size_t, size_t);
@@ -1287,12 +1405,16 @@ extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause);
static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call,
gfp_t gfp)
{
- op->call = call;
- op->type = call->type;
- call->op = op;
- call->key = op->key;
- call->intr = !(op->flags & AFS_OPERATION_UNINTR);
- afs_make_call(&op->ac, call, gfp);
+ struct afs_addr_list *alist = op->estate->addresses;
+
+ op->call = call;
+ op->type = call->type;
+ call->op = op;
+ call->key = op->key;
+ call->intr = !(op->flags & AFS_OPERATION_UNINTR);
+ call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer);
+ call->service_id = op->server->service_id;
+ afs_make_call(call, gfp);
}
static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
@@ -1401,8 +1523,7 @@ extern void __exit afs_clean_up_permit_cache(void);
*/
extern spinlock_t afs_server_peer_lock;
-extern struct afs_server *afs_find_server(struct afs_net *,
- const struct sockaddr_rxrpc *);
+extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
@@ -1414,7 +1535,7 @@ extern void afs_manage_servers(struct work_struct *);
extern void afs_servers_timer(struct timer_list *);
extern void afs_fs_probe_timer(struct timer_list *);
extern void __net_exit afs_purge_servers(struct afs_net *);
-extern bool afs_check_server_record(struct afs_operation *, struct afs_server *);
+bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key);
static inline void afs_inc_servers_outstanding(struct afs_net *net)
{
@@ -1442,10 +1563,14 @@ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list
}
extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *);
-extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *,
- struct afs_vldb_entry *,
- u8);
+struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
+ struct key *key,
+ struct afs_vldb_entry *vldb);
extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *);
+void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist);
+void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist,
+ struct afs_server_list *old);
+void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist);
/*
* super.c
@@ -1454,13 +1579,24 @@ extern int __init afs_fs_init(void);
extern void afs_fs_exit(void);
/*
+ * validation.c
+ */
+bool afs_check_validity(const struct afs_vnode *vnode);
+int afs_update_volume_state(struct afs_operation *op);
+int afs_validate(struct afs_vnode *vnode, struct key *key);
+
+/*
* vlclient.c
*/
extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *,
const char *, int);
extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *);
-extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *,
- struct key *, struct afs_vlserver *, unsigned int);
+struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
+ struct afs_addr_list *alist,
+ unsigned int addr_index,
+ struct key *key,
+ struct afs_vlserver *server,
+ unsigned int server_index);
extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *);
extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *);
@@ -1516,7 +1652,7 @@ extern int afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason);
extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace);
-extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace);
+void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason);
extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
@@ -1603,7 +1739,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
struct afs_vnode_param *dir_vp,
struct dentry *dentry)
{
- if (!op->error)
+ if (!op->cumul_error.error)
dentry->d_fsdata =
(void *)(unsigned long)dir_vp->scb.status.data_version;
}
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 6425c81d07de..1b3bd21c168a 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -156,6 +156,7 @@ static void __net_exit afs_net_exit(struct net *net_ns)
afs_close_socket(net);
afs_proc_cleanup(net);
afs_put_sysnames(net->sysnames);
+ kfree_rcu(rcu_access_pointer(net->address_prefs), rcu);
}
static struct pernet_operations afs_net_ops = {
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 805328ca5428..b8180bf2281f 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -116,6 +116,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
{
switch (error) {
case 0:
+ e->aborted = false;
+ e->error = 0;
return;
default:
if (e->error == -ETIMEDOUT ||
@@ -161,12 +163,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
if (e->responded)
return;
e->error = error;
+ e->aborted = false;
return;
case -ECONNABORTED:
- error = afs_abort_to_error(abort_code);
- fallthrough;
+ e->error = afs_abort_to_error(abort_code);
+ e->aborted = true;
+ e->responded = true;
+ return;
case -ENETRESET: /* Responded, but we seem to have changed address */
+ e->aborted = false;
e->responded = true;
e->error = error;
return;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 2a0c83d71565..3bd02571f30d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -147,6 +147,55 @@ inval:
}
/*
+ * Display the list of addr_prefs known to the namespace.
+ */
+static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
+{
+	struct afs_addr_preference_list *preflist;
+	struct afs_addr_preference *pref;
+	struct afs_net *net = afs_seq2net_single(m);
+	union {
+		struct sockaddr_in sin;
+		struct sockaddr_in6 sin6;
+	} addr;
+	unsigned int i;
+	char buf[44]; /* Maximum ipv6 + max subnet is 43 */
+
+	rcu_read_lock(); /* Held across the whole dump; dropped at "out" */
+	preflist = rcu_dereference(net->address_prefs);
+
+	if (!preflist) {
+		seq_puts(m, "NO PREFS\n");
+		goto out; /* Must not return with the RCU read lock held */
+	}
+
+	seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n",
+		   preflist->version, preflist->ipv6_off, preflist->nr, preflist->max_prefs);
+
+	memset(&addr, 0, sizeof(addr));
+
+	for (i = 0; i < preflist->nr; i++) {
+		pref = &preflist->prefs[i];
+
+		addr.sin.sin_family = pref->family;
+		if (pref->family == AF_INET) {
+			memcpy(&addr.sin.sin_addr, &pref->ipv4_addr,
+			       sizeof(addr.sin.sin_addr));
+			snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin, pref->subnet_mask);
+			seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio);
+		} else { /* Any family other than AF_INET is formatted as IPv6 */
+			memcpy(&addr.sin6.sin6_addr, &pref->ipv6_addr,
+			       sizeof(addr.sin6.sin6_addr));
+			snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin6, pref->subnet_mask);
+			seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio);
+		}
+	}
+out:
+	rcu_read_unlock(); /* Was a second rcu_read_lock(), leaking the read-side section */
+	return 0;
+}
+
+/*
* Display the name of the current workstation cell.
*/
static int afs_proc_rootcell_show(struct seq_file *m, void *v)
@@ -307,7 +356,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
for (i = 0; i < alist->nr_addrs; i++)
seq_printf(m, " %c %pISpc\n",
alist->preferred == i ? '>' : '-',
- &alist->addrs[i].transport);
+ rxrpc_kernel_remote_addr(alist->addrs[i].peer));
}
seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
@@ -375,32 +424,45 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = {
*/
static int afs_proc_servers_show(struct seq_file *m, void *v)
{
- struct afs_server *server;
+ struct afs_endpoint_state *estate;
struct afs_addr_list *alist;
+ struct afs_server *server;
+ unsigned long failed;
int i;
if (v == SEQ_START_TOKEN) {
- seq_puts(m, "UUID REF ACT\n");
+ seq_puts(m, "UUID REF ACT CELL\n");
return 0;
}
server = list_entry(v, struct afs_server, proc_link);
- alist = rcu_dereference(server->addresses);
- seq_printf(m, "%pU %3d %3d\n",
+ estate = rcu_dereference(server->endpoint_state);
+ alist = estate->addresses;
+ seq_printf(m, "%pU %3d %3d %s\n",
&server->uuid,
refcount_read(&server->ref),
- atomic_read(&server->active));
- seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n",
- server->flags, server->rtt, server->cb_s_break);
- seq_printf(m, " - probe: last=%d out=%d\n",
- (int)(jiffies - server->probed_at) / HZ,
- atomic_read(&server->probe_outstanding));
- seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n",
- alist->version, alist->responded, alist->failed);
- for (i = 0; i < alist->nr_addrs; i++)
- seq_printf(m, " [%x] %pISpc%s\n",
- i, &alist->addrs[i].transport,
- alist->preferred == i ? "*" : "");
+ atomic_read(&server->active),
+ server->cell->name);
+ seq_printf(m, " - info: fl=%lx rtt=%u\n",
+ server->flags, server->rtt);
+ seq_printf(m, " - probe: last=%d\n",
+ (int)(jiffies - server->probed_at) / HZ);
+ failed = estate->failed_set;
+ seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n",
+ estate->probe_seq, atomic_read(&estate->nr_probing),
+ estate->responsive_set, estate->failed_set);
+ seq_printf(m, " - ALIST v=%u ap=%u\n",
+ alist->version, alist->addr_pref_version);
+ for (i = 0; i < alist->nr_addrs; i++) {
+ const struct afs_address *addr = &alist->addrs[i];
+
+ seq_printf(m, " [%x] %pISpc%s rtt=%d err=%d p=%u\n",
+ i, rxrpc_kernel_remote_addr(addr->peer),
+ alist->preferred == i ? "*" :
+ test_bit(i, &failed) ? "!" : "",
+ rxrpc_kernel_get_srtt(addr->peer),
+ addr->last_error, addr->prio);
+ }
return 0;
}
@@ -681,7 +743,11 @@ int afs_proc_init(struct afs_net *net)
&afs_proc_sysname_ops,
afs_proc_sysname_write,
sizeof(struct seq_net_private),
- NULL))
+ NULL) ||
+ !proc_create_net_single_write("addr_prefs", 0644, p,
+ afs_proc_addr_prefs_show,
+ afs_proc_addr_prefs_write,
+ NULL))
goto error_tree;
net->proc_afs = p;
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index a840c3588ebb..700a27bc8c25 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -13,6 +13,19 @@
#include <linux/sched/signal.h>
#include "internal.h"
#include "afs_fs.h"
+#include "protocol_uae.h"
+
+void afs_clear_server_states(struct afs_operation *op)
+{
+ unsigned int i;
+
+ if (op->server_states) {
+ for (i = 0; i < op->server_list->nr_servers; i++)
+ afs_put_endpoint_state(op->server_states[i].endpoint_state,
+ afs_estate_trace_put_server_state);
+ kfree(op->server_states);
+ }
+}
/*
* Begin iteration through a server list, starting with the vnode's last used
@@ -25,14 +38,41 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
void *cb_server;
int i;
+ trace_afs_rotate(op, afs_rotate_trace_start, 0);
+
read_lock(&op->volume->servers_lock);
op->server_list = afs_get_serverlist(
rcu_dereference_protected(op->volume->servers,
lockdep_is_held(&op->volume->servers_lock)));
read_unlock(&op->volume->servers_lock);
- op->untried = (1UL << op->server_list->nr_servers) - 1;
- op->index = READ_ONCE(op->server_list->preferred);
+ op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]),
+ GFP_KERNEL);
+ if (!op->server_states) {
+ afs_op_nomem(op);
+ trace_afs_rotate(op, afs_rotate_trace_nomem, 0);
+ return false;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < op->server_list->nr_servers; i++) {
+ struct afs_endpoint_state *estate;
+ struct afs_server_state *s = &op->server_states[i];
+
+ server = op->server_list->servers[i].server;
+ estate = rcu_dereference(server->endpoint_state);
+ s->endpoint_state = afs_get_endpoint_state(estate,
+ afs_estate_trace_get_server_state);
+ s->probe_seq = estate->probe_seq;
+ s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1;
+ init_waitqueue_entry(&s->probe_waiter, current);
+ afs_get_address_preferences(op->net, estate->addresses);
+ }
+ rcu_read_unlock();
+
+
+ op->untried_servers = (1UL << op->server_list->nr_servers) - 1;
+ op->server_index = -1;
cb_server = vnode->cb_server;
if (cb_server) {
@@ -40,7 +80,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
for (i = 0; i < op->server_list->nr_servers; i++) {
server = op->server_list->servers[i].server;
if (server == cb_server) {
- op->index = i;
+ op->server_index = i;
goto found_interest;
}
}
@@ -50,7 +90,8 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
* and have to return an error.
*/
if (op->flags & AFS_OPERATION_CUR_ONLY) {
- op->error = -ESTALE;
+ afs_op_set_error(op, -ESTALE);
+ trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0);
return false;
}
@@ -58,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
write_seqlock(&vnode->cb_lock);
ASSERTCMP(cb_server, ==, vnode->cb_server);
vnode->cb_server = NULL;
- if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+ if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE)
vnode->cb_break++;
write_sequnlock(&vnode->cb_lock);
}
@@ -70,7 +111,7 @@ found_interest:
/*
* Post volume busy note.
*/
-static void afs_busy(struct afs_volume *volume, u32 abort_code)
+static void afs_busy(struct afs_operation *op, u32 abort_code)
{
const char *m;
@@ -81,7 +122,8 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
default: m = "busy"; break;
}
- pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
+ pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n",
+ op->volume->vid, op->volume->name, &op->server->uuid, m);
}
/*
@@ -89,10 +131,11 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
*/
static bool afs_sleep_and_retry(struct afs_operation *op)
{
+ trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0);
if (!(op->flags & AFS_OPERATION_UNINTR)) {
msleep_interruptible(1000);
if (signal_pending(current)) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
return false;
}
} else {
@@ -111,62 +154,105 @@ bool afs_select_fileserver(struct afs_operation *op)
struct afs_addr_list *alist;
struct afs_server *server;
struct afs_vnode *vnode = op->file[0].vnode;
- struct afs_error e;
- u32 rtt;
- int error = op->ac.error, i;
+ unsigned long set, failed;
+ s32 abort_code = op->call_abort_code;
+ int best_prio = 0;
+ int error = op->call_error, addr_index, i, j;
+
+ op->nr_iterations++;
- _enter("%lx[%d],%lx[%d],%d,%d",
- op->untried, op->index,
- op->ac.tried, op->ac.index,
- error, op->ac.abort_code);
+ _enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d",
+ op->debug_id, op->nr_iterations, op->volume->vid,
+ op->server_index, op->untried_servers,
+ op->addr_index, op->addr_tried,
+ error, abort_code);
if (op->flags & AFS_OPERATION_STOP) {
+ trace_afs_rotate(op, afs_rotate_trace_stopped, 0);
_leave(" = f [stopped]");
return false;
}
- op->nr_iterations++;
-
- /* Evaluate the result of the previous operation, if there was one. */
- switch (error) {
- case SHRT_MAX:
+ if (op->nr_iterations == 0)
goto start;
+ WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error);
+ trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error);
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (op->call_error) {
case 0:
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
+ op->cumul_error.responded = true;
+
+ /* We succeeded, but we may need to redo the op from another
+ * server if we're looking at a set of RO volumes where some of
+ * the servers have not yet been brought up to date lest we
+ * regress the data. We only switch to the new version once
+ * >=50% of the servers are updated.
+ */
+ error = afs_update_volume_state(op);
+ if (error != 0) {
+ if (error == 1) {
+ afs_sleep_and_retry(op);
+ goto restart_from_beginning;
+ }
+ afs_op_set_error(op, error);
+ goto failed;
+ }
+ fallthrough;
default:
/* Success or local failure. Stop. */
- op->error = error;
+ afs_op_set_error(op, error);
op->flags |= AFS_OPERATION_STOP;
+ trace_afs_rotate(op, afs_rotate_trace_stop, error);
_leave(" = f [okay/local %d]", error);
return false;
case -ECONNABORTED:
/* The far side rejected the operation on some grounds. This
* might involve the server being busy or the volume having been moved.
+ *
+ * Note that various V* errors should not be sent to a cache manager
+ * by a fileserver as they should be translated to more modern UAE*
+ * errors instead. IBM AFS and OpenAFS fileservers, however, do leak
+ * these abort codes.
*/
- switch (op->ac.abort_code) {
+ trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code);
+ op->cumul_error.responded = true;
+ switch (abort_code) {
case VNOVOL:
/* This fileserver doesn't know about the volume.
* - May indicate that the VL is wrong - retry once and compare
* the results.
* - May indicate that the fileserver couldn't attach to the vol.
+ * - The volume might have been temporarily removed so that it can
+ * be replaced by a volume restore. "vos" might have ended one
+ * transaction and has yet to create the next.
+ * - The volume might not be blessed or might not be in-service
+ * (administrative action).
*/
if (op->flags & AFS_OPERATION_VNOVOL) {
- op->error = -EREMOTEIO;
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
}
write_lock(&op->volume->servers_lock);
- op->server_list->vnovol_mask |= 1 << op->index;
+ op->server_list->vnovol_mask |= 1 << op->server_index;
write_unlock(&op->volume->servers_lock);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
- op->error = -ENOMEDIUM;
+ afs_op_set_error(op, -ENOMEDIUM);
goto failed;
}
@@ -174,7 +260,7 @@ bool afs_select_fileserver(struct afs_operation *op)
* it's the fileserver having trouble.
*/
if (rcu_access_pointer(op->volume->servers) == op->server_list) {
- op->error = -EREMOTEIO;
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
}
@@ -183,50 +269,99 @@ bool afs_select_fileserver(struct afs_operation *op)
_leave(" = t [vnovol]");
return true;
- case VSALVAGE: /* TODO: Should this return an error or iterate? */
case VVOLEXISTS:
- case VNOSERVICE:
case VONLINE:
- case VDISKFULL:
- case VOVERQUOTA:
- op->error = afs_abort_to_error(op->ac.abort_code);
+ /* These should not be returned from the fileserver. */
+ pr_warn("Fileserver returned unexpected abort %d\n",
+ abort_code);
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
+ case VNOSERVICE:
+ /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
+ * if the volume was neither in-service nor administratively
+ * blessed. All usage was replaced by VNOVOL because AFS 3.1 and
+ * earlier cache managers did not handle VNOSERVICE and assumed
+ * it was the client OS's errno 105.
+ *
+ * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
+ * fileserver idle dead time error which was sent in place of
+ * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the
+ * fileserver took too long to send a reply to the client.
+ * RX_CALL_TIMEOUT would have caused the cache manager to mark the
+ * server down whereas VNOSERVICE since AFS 3.2 would cause the cache
+ * manager to temporarily (up to 15 minutes) mark the volume
+ * instance as unusable.
+ *
+ * The idle dead logic resulted in cache inconsistency since a
+ * state changing call that the cache manager assumed was dead
+ * could still be processed to completion by the fileserver. This
+ * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
+ * returned. However, many 1.4.8 through 1.6.24 fileservers are
+ * still in existence.
+ *
+ * AuriStorFS fileservers have never returned VNOSERVICE.
+ *
+ * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
+ */
+ case RX_CALL_TIMEOUT:
+ afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
+ goto next_server;
+
+ case VSALVAGING: /* This error should not be leaked to cache managers
+ * but is from OpenAFS demand attach fileservers.
+ * It should be treated as an alias for VOFFLINE.
+ */
+ case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
case VOFFLINE:
- if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
- afs_busy(op->volume, op->ac.abort_code);
- clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ /* The volume is in use by the volserver or another volume utility
+ * for an operation that might alter the contents. The volume is
+ * expected to come back but it might take a long time (could be
+ * days).
+ */
+ if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags)) {
+ afs_busy(op, abort_code);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
}
if (op->flags & AFS_OPERATION_NO_VSLEEP) {
- op->error = -EADV;
- goto failed;
- }
- if (op->flags & AFS_OPERATION_CUR_ONLY) {
- op->error = -ESTALE;
+ afs_op_set_error(op, -EADV);
goto failed;
}
goto busy;
- case VSALVAGING:
- case VRESTARTING:
+ case VRESTARTING: /* The fileserver is either shutting down or starting up. */
case VBUSY:
- /* Retry after going round all the servers unless we
- * have a file lock we need to maintain.
+ /* The volume is in use by the volserver or another volume
+ * utility for an operation that is not expected to alter the
+ * contents of the volume. VBUSY does not need to be returned
+ * for a ROVOL or BACKVOL bound to an ITBusy volserver
+ * transaction. The fileserver is permitted to continue serving
+ * content from ROVOLs and BACKVOLs during an ITBusy transaction
+ * because the content will not change. However, many fileserver
+ * releases do return VBUSY for ROVOL and BACKVOL instances under
+ * many circumstances.
+ *
+ * Retry after going round all the servers unless we have a file
+ * lock we need to maintain.
*/
if (op->flags & AFS_OPERATION_NO_VSLEEP) {
- op->error = -EBUSY;
+ afs_op_set_error(op, -EBUSY);
goto failed;
}
- if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
- afs_busy(op->volume, op->ac.abort_code);
- clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ if (!test_and_set_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags)) {
+ afs_busy(op, abort_code);
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
}
busy:
if (op->flags & AFS_OPERATION_CUR_ONLY) {
if (!afs_sleep_and_retry(op))
goto failed;
- /* Retry with same server & address */
+ /* Retry with same server & address */
_leave(" = t [vbusy]");
return true;
}
@@ -243,7 +378,7 @@ bool afs_select_fileserver(struct afs_operation *op)
* honour, just in case someone sets up a loop.
*/
if (op->flags & AFS_OPERATION_VMOVED) {
- op->error = -EREMOTEIO;
+ afs_op_set_error(op, -EREMOTEIO);
goto failed;
}
op->flags |= AFS_OPERATION_VMOVED;
@@ -251,8 +386,10 @@ bool afs_select_fileserver(struct afs_operation *op)
set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
/* If the server list didn't change, then the VLDB is
* out of sync with the fileservers. This is hopefully
@@ -264,22 +401,50 @@ bool afs_select_fileserver(struct afs_operation *op)
* TODO: Retry a few times with sleeps.
*/
if (rcu_access_pointer(op->volume->servers) == op->server_list) {
- op->error = -ENOMEDIUM;
+ afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
goto failed;
}
goto restart_from_beginning;
+ case UAEIO:
+ case VIO:
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ if (op->volume->type != AFSVL_RWVOL)
+ goto next_server;
+ goto failed;
+
+ case VDISKFULL:
+ case UAENOSPC:
+ /* The partition is full. Only applies to RWVOLs.
+ * Translate locally and return ENOSPC.
+ * No replicas to failover to.
+ */
+ afs_op_set_error(op, -ENOSPC);
+ goto failed_but_online;
+
+ case VOVERQUOTA:
+ case UAEDQUOT:
+ /* Volume is full. Only applies to RWVOLs.
+ * Translate locally and return EDQUOT.
+ * No replicas to failover to.
+ */
+ afs_op_set_error(op, -EDQUOT);
+ goto failed_but_online;
+
default:
- clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
- clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
- op->error = afs_abort_to_error(op->ac.abort_code);
+ afs_op_accumulate_error(op, error, abort_code);
+ failed_but_online:
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
goto failed;
}
case -ETIMEDOUT:
case -ETIME:
- if (op->error != -EDESTADDRREQ)
+ if (afs_op_error(op) != -EDESTADDRREQ)
goto iterate_address;
fallthrough;
case -ERFKILL:
@@ -289,7 +454,7 @@ bool afs_select_fileserver(struct afs_operation *op)
case -EHOSTDOWN:
case -ECONNREFUSED:
_debug("no conn");
- op->error = error;
+ afs_op_accumulate_error(op, error, 0);
goto iterate_address;
case -ENETRESET:
@@ -298,24 +463,31 @@ bool afs_select_fileserver(struct afs_operation *op)
fallthrough;
case -ECONNRESET:
_debug("call reset");
- op->error = error;
+ afs_op_set_error(op, error);
goto failed;
}
restart_from_beginning:
+ trace_afs_rotate(op, afs_rotate_trace_restart, 0);
_debug("restart");
- afs_end_cursor(&op->ac);
+ op->estate = NULL;
op->server = NULL;
+ afs_clear_server_states(op);
+ op->server_states = NULL;
afs_put_serverlist(op->net, op->server_list);
op->server_list = NULL;
start:
_debug("start");
+ ASSERTCMP(op->estate, ==, NULL);
/* See if we need to do an update of the volume record. Note that the
* volume may have moved or even have been deleted.
*/
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error);
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
if (!afs_start_fs_iteration(op, vnode))
goto failed;
@@ -323,52 +495,83 @@ start:
_debug("__ VOL %llx __", op->volume->vid);
pick_server:
- _debug("pick [%lx]", op->untried);
+ _debug("pick [%lx]", op->untried_servers);
+ ASSERTCMP(op->estate, ==, NULL);
- error = afs_wait_for_fs_probes(op->server_list, op->untried);
- if (error < 0)
- goto failed_set_error;
+ error = afs_wait_for_fs_probes(op, op->server_states,
+ !(op->flags & AFS_OPERATION_UNINTR));
+ switch (error) {
+ case 0: /* No untried responsive servers and no outstanding probes */
+ trace_afs_rotate(op, afs_rotate_trace_probe_none, 0);
+ goto no_more_servers;
+ case 1: /* Got a response */
+ trace_afs_rotate(op, afs_rotate_trace_probe_response, 0);
+ break;
+ case 2: /* Probe data superseded */
+ trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0);
+ goto restart_from_beginning;
+ default:
+ trace_afs_rotate(op, afs_rotate_trace_probe_error, error);
+ afs_op_set_error(op, error);
+ goto failed;
+ }
- /* Pick the untried server with the lowest RTT. If we have outstanding
- * callbacks, we stick with the server we're already using if we can.
+ /* Pick the untried server with the highest priority untried endpoint.
+ * If we have outstanding callbacks, we stick with the server we're
+ * already using if we can.
*/
if (op->server) {
- _debug("server %u", op->index);
- if (test_bit(op->index, &op->untried))
+ _debug("server %u", op->server_index);
+ if (test_bit(op->server_index, &op->untried_servers))
goto selected_server;
op->server = NULL;
_debug("no server");
}
- op->index = -1;
- rtt = U32_MAX;
+ rcu_read_lock();
+ op->server_index = -1;
+ best_prio = -1;
for (i = 0; i < op->server_list->nr_servers; i++) {
- struct afs_server *s = op->server_list->servers[i].server;
+ struct afs_endpoint_state *es;
+ struct afs_server_entry *se = &op->server_list->servers[i];
+ struct afs_addr_list *sal;
+ struct afs_server *s = se->server;
- if (!test_bit(i, &op->untried) ||
+ if (!test_bit(i, &op->untried_servers) ||
+ test_bit(AFS_SE_EXCLUDED, &se->flags) ||
!test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
continue;
- if (s->probe.rtt < rtt) {
- op->index = i;
- rtt = s->probe.rtt;
+ es = op->server_states->endpoint_state;
+ sal = es->addresses;
+
+ afs_get_address_preferences_rcu(op->net, sal);
+ for (j = 0; j < sal->nr_addrs; j++) {
+ if (!sal->addrs[j].peer)
+ continue;
+ if (sal->addrs[j].prio > best_prio) {
+ op->server_index = i;
+ best_prio = sal->addrs[j].prio;
+ }
}
}
+ rcu_read_unlock();
- if (op->index == -1)
+ if (op->server_index == -1)
goto no_more_servers;
selected_server:
- _debug("use %d", op->index);
- __clear_bit(op->index, &op->untried);
+ trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio);
+ _debug("use %d prio %u", op->server_index, best_prio);
+ __clear_bit(op->server_index, &op->untried_servers);
/* We're starting on a different fileserver from the list. We need to
* check it, create a callback intercept, find its address list and
* probe its capabilities before we use it.
*/
- ASSERTCMP(op->ac.alist, ==, NULL);
- server = op->server_list->servers[op->index].server;
+ ASSERTCMP(op->estate, ==, NULL);
+ server = op->server_list->servers[op->server_index].server;
- if (!afs_check_server_record(op, server))
+ if (!afs_check_server_record(op, server, op->key))
goto failed;
_debug("USING SERVER: %pU", &server->uuid);
@@ -377,58 +580,73 @@ selected_server:
op->server = server;
if (vnode->cb_server != server) {
vnode->cb_server = server;
- vnode->cb_s_break = server->cb_s_break;
- vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
- vnode->cb_v_break = vnode->volume->cb_v_break;
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
}
- read_lock(&server->fs_lock);
- alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- afs_get_addrlist(alist);
- read_unlock(&server->fs_lock);
-
retry_server:
- memset(&op->ac, 0, sizeof(op->ac));
-
- if (!op->ac.alist)
- op->ac.alist = alist;
- else
- afs_put_addrlist(alist);
-
- op->ac.index = -1;
+ op->addr_tried = 0;
+ op->addr_index = -1;
iterate_address:
- ASSERT(op->ac.alist);
/* Iterate over the current server's address list to try and find an
* address on which it will respond to us.
*/
- if (!afs_iterate_addresses(&op->ac))
- goto out_of_addresses;
+ op->estate = op->server_states[op->server_index].endpoint_state;
+ set = READ_ONCE(op->estate->responsive_set);
+ failed = READ_ONCE(op->estate->failed_set);
+ _debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed);
+ set &= ~(failed | op->addr_tried);
+ trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set);
+ if (!set)
+ goto wait_for_more_probe_results;
+
+ alist = op->estate->addresses;
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (alist->addrs[i].prio > best_prio) {
+ addr_index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
- _debug("address [%u] %u/%u %pISp",
- op->index, op->ac.index, op->ac.alist->nr_addrs,
- &op->ac.alist->addrs[op->ac.index].transport);
+ addr_index = READ_ONCE(alist->preferred);
+ if (!test_bit(addr_index, &set))
+ addr_index = __ffs(set);
+ op->addr_index = addr_index;
+ set_bit(addr_index, &op->addr_tried);
+
+ op->volsync.creation = TIME64_MIN;
+ op->volsync.update = TIME64_MIN;
+ op->call_responded = false;
+ _debug("address [%u] %u/%u %pISp",
+ op->server_index, addr_index, alist->nr_addrs,
+ rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
_leave(" = t");
return true;
-out_of_addresses:
+wait_for_more_probe_results:
+ error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
+ !(op->flags & AFS_OPERATION_UNINTR));
+ if (!error)
+ goto iterate_address;
+
/* We've now had a failure to respond on all of a server's addresses -
* immediately probe them again and consider retrying the server.
*/
+ trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0);
afs_probe_fileserver(op->net, op->server);
if (op->flags & AFS_OPERATION_RETRY_SERVER) {
- alist = op->ac.alist;
- error = afs_wait_for_one_fs_probe(
- op->server, !(op->flags & AFS_OPERATION_UNINTR));
+ error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
+ !(op->flags & AFS_OPERATION_UNINTR));
switch (error) {
case 0:
op->flags &= ~AFS_OPERATION_RETRY_SERVER;
+ trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
goto retry_server;
case -ERESTARTSYS:
- goto failed_set_error;
+ afs_op_set_error(op, error);
+ goto failed;
case -ETIME:
case -EDESTADDRREQ:
goto next_server;
@@ -436,34 +654,51 @@ out_of_addresses:
}
next_server:
+ trace_afs_rotate(op, afs_rotate_trace_next_server, 0);
_debug("next");
- afs_end_cursor(&op->ac);
+ ASSERT(op->estate);
+ alist = op->estate->addresses;
+ if (op->call_responded &&
+ op->addr_index != READ_ONCE(alist->preferred) &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ op->estate = NULL;
goto pick_server;
no_more_servers:
/* That's all the servers poked to no good effect. Try again if some
* of them were busy.
*/
- if (op->flags & AFS_OPERATION_VBUSY)
+ trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0);
+ if (op->flags & AFS_OPERATION_VBUSY) {
+ afs_sleep_and_retry(op);
+ op->flags &= ~AFS_OPERATION_VBUSY;
goto restart_from_beginning;
+ }
- e.error = -EDESTADDRREQ;
- e.responded = false;
+ rcu_read_lock();
for (i = 0; i < op->server_list->nr_servers; i++) {
- struct afs_server *s = op->server_list->servers[i].server;
+ struct afs_endpoint_state *estate;
- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
- s->probe.abort_code);
+ estate = op->server_states->endpoint_state;
+ error = READ_ONCE(estate->error);
+ if (error < 0)
+ afs_op_accumulate_error(op, error, estate->abort_code);
}
+ rcu_read_unlock();
- error = e.error;
-
-failed_set_error:
- op->error = error;
failed:
+ trace_afs_rotate(op, afs_rotate_trace_failed, 0);
op->flags |= AFS_OPERATION_STOP;
- afs_end_cursor(&op->ac);
- _leave(" = f [failed %d]", op->error);
+ if (op->estate) {
+ alist = op->estate->addresses;
+ if (op->call_responded &&
+ op->addr_index != READ_ONCE(alist->preferred) &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ op->estate = NULL;
+ }
+ _leave(" = f [failed %d]", afs_op_error(op));
return false;
}
@@ -482,37 +717,40 @@ void afs_dump_edestaddrreq(const struct afs_operation *op)
rcu_read_lock();
pr_notice("EDESTADDR occurred\n");
- pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n",
+ pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
op->file[0].cb_break_before,
- op->file[1].cb_break_before, op->flags, op->error);
- pr_notice("FC: ut=%lx ix=%d ni=%u\n",
- op->untried, op->index, op->nr_iterations);
+ op->file[1].cb_break_before, op->flags, op->cumul_error.error);
+ pr_notice("OP: ut=%lx ix=%d ni=%u\n",
+ op->untried_servers, op->server_index, op->nr_iterations);
+ pr_notice("OP: call er=%d ac=%d r=%u\n",
+ op->call_error, op->call_abort_code, op->call_responded);
if (op->server_list) {
const struct afs_server_list *sl = op->server_list;
- pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
- sl->nr_servers, sl->preferred, sl->vnovol_mask);
+
+ pr_notice("FC: SL nr=%u vnov=%hx\n",
+ sl->nr_servers, sl->vnovol_mask);
for (i = 0; i < sl->nr_servers; i++) {
const struct afs_server *s = sl->servers[i].server;
+ const struct afs_endpoint_state *e =
+ rcu_dereference(s->endpoint_state);
+ const struct afs_addr_list *a = e->addresses;
+
pr_notice("FC: server fl=%lx av=%u %pU\n",
s->flags, s->addr_version, &s->uuid);
- if (s->addresses) {
- const struct afs_addr_list *a =
- rcu_dereference(s->addresses);
+ pr_notice("FC: - pq=%x R=%lx F=%lx\n",
+ e->probe_seq, e->responsive_set, e->failed_set);
+ if (a) {
pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
a->version,
a->nr_ipv4, a->nr_addrs, a->max_addrs,
a->preferred);
- pr_notice("FC: - R=%lx F=%lx\n",
- a->responded, a->failed);
- if (a == op->ac.alist)
+ if (a == e->addresses)
pr_notice("FC: - current\n");
}
}
}
- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
- op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error,
- op->ac.responded, op->ac.nr_iterations);
+ pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index);
rcu_read_unlock();
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d642d06a453b..c453428f3c8b 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -178,6 +178,8 @@ void afs_put_call(struct afs_call *call)
ASSERT(!work_pending(&call->async_work));
ASSERT(call->type->name != NULL);
+ rxrpc_kernel_put_peer(call->peer);
+
if (call->rxcall) {
rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
rxrpc_kernel_put_call(net->socket, call->rxcall);
@@ -187,7 +189,6 @@ void afs_put_call(struct afs_call *call)
call->type->destructor(call);
afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
- afs_put_addrlist(call->alist);
kfree(call->request);
trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
@@ -294,9 +295,8 @@ static void afs_notify_end_request_tx(struct sock *sock,
* Initiate a call and synchronously queue up the parameters for dispatch. Any
* error is stored into the call struct, which the caller must check for.
*/
-void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+void afs_make_call(struct afs_call *call, gfp_t gfp)
{
- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index];
struct rxrpc_call *rxcall;
struct msghdr msg;
struct kvec iov[1];
@@ -304,7 +304,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
s64 tx_total_len;
int ret;
- _enter(",{%pISp},", &srx->transport);
+ _enter(",{%pISp+%u},", rxrpc_kernel_remote_addr(call->peer), call->service_id);
ASSERT(call->type != NULL);
ASSERT(call->type->name != NULL);
@@ -313,8 +313,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
call, call->type->name, key_serial(call->key),
atomic_read(&call->net->nr_outstanding_calls));
- call->addr_ix = ac->index;
- call->alist = afs_get_addrlist(ac->alist);
+ trace_afs_make_call(call);
/* Work out the length we're going to transmit. This is awkward for
* calls such as FS.StoreData where there's an extra injection of data
@@ -333,7 +332,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
}
/* create a call */
- rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
+ rxcall = rxrpc_kernel_begin_call(call->net->socket, call->peer, call->key,
(unsigned long)call,
tx_total_len,
call->max_lifespan,
@@ -341,6 +340,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
(call->async ?
afs_wake_up_async_call :
afs_wake_up_call_waiter),
+ call->service_id,
call->upgrade,
(call->intr ? RXRPC_PREINTERRUPTIBLE :
RXRPC_UNINTERRUPTIBLE),
@@ -390,7 +390,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
/* Note that at this point, we may have received the reply or an abort
* - and an asynchronous call may already have completed.
*
- * afs_wait_for_call_to_complete(call, ac)
+ * afs_wait_for_call_to_complete(call)
* must be called to synchronously clean up.
*/
return;
@@ -406,8 +406,7 @@ error_do_abort:
rxrpc_kernel_recv_data(call->net->socket, rxcall,
&msg.msg_iter, &len, false,
&call->abort_code, &call->service_id);
- ac->abort_code = call->abort_code;
- ac->responded = true;
+ call->responded = true;
}
call->error = ret;
trace_afs_call_done(call);
@@ -427,7 +426,7 @@ error_kill_call:
afs_set_call_complete(call, ret, 0);
}
- ac->error = ret;
+ call->error = ret;
call->state = AFS_CALL_COMPLETE;
_leave(" = %d", ret);
}
@@ -461,7 +460,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
max = m + 1;
pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
msg, call->type->name,
- &call->alist->addrs[call->addr_ix].transport);
+ rxrpc_kernel_remote_addr(call->peer));
}
}
@@ -508,6 +507,7 @@ static void afs_deliver_to_call(struct afs_call *call)
ret = -EBADMSG;
switch (ret) {
case 0:
+ call->responded = true;
afs_queue_call_work(call);
if (state == AFS_CALL_CL_PROC_REPLY) {
if (call->op)
@@ -522,9 +522,11 @@ static void afs_deliver_to_call(struct afs_call *call)
goto out;
case -ECONNABORTED:
ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
+ call->responded = true;
afs_log_error(call, call->abort_code);
goto done;
case -ENOTSUPP:
+ call->responded = true;
abort_code = RXGEN_OPCODE;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret,
@@ -571,50 +573,46 @@ call_complete:
}
/*
- * Wait synchronously for a call to complete and clean up the call struct.
+ * Wait synchronously for a call to complete.
*/
-long afs_wait_for_call_to_complete(struct afs_call *call,
- struct afs_addr_cursor *ac)
+void afs_wait_for_call_to_complete(struct afs_call *call)
{
- long ret;
bool rxrpc_complete = false;
- DECLARE_WAITQUEUE(myself, current);
-
_enter("");
- ret = call->error;
- if (ret < 0)
- goto out;
+ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
+ DECLARE_WAITQUEUE(myself, current);
+
+ add_wait_queue(&call->waitq, &myself);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /* deliver any messages that are in the queue */
+ if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
+ call->need_attention) {
+ call->need_attention = false;
+ __set_current_state(TASK_RUNNING);
+ afs_deliver_to_call(call);
+ continue;
+ }
- add_wait_queue(&call->waitq, &myself);
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- /* deliver any messages that are in the queue */
- if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
- call->need_attention) {
- call->need_attention = false;
- __set_current_state(TASK_RUNNING);
- afs_deliver_to_call(call);
- continue;
- }
+ if (afs_check_call_state(call, AFS_CALL_COMPLETE))
+ break;
- if (afs_check_call_state(call, AFS_CALL_COMPLETE))
- break;
+ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
+ /* rxrpc terminated the call. */
+ rxrpc_complete = true;
+ break;
+ }
- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
- /* rxrpc terminated the call. */
- rxrpc_complete = true;
- break;
+ schedule();
}
- schedule();
+ remove_wait_queue(&call->waitq, &myself);
+ __set_current_state(TASK_RUNNING);
}
- remove_wait_queue(&call->waitq, &myself);
- __set_current_state(TASK_RUNNING);
-
if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
if (rxrpc_complete) {
afs_set_call_complete(call, call->error, call->abort_code);
@@ -627,29 +625,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
afs_set_call_complete(call, -EINTR, 0);
}
}
-
- spin_lock_bh(&call->state_lock);
- ac->abort_code = call->abort_code;
- ac->error = call->error;
- spin_unlock_bh(&call->state_lock);
-
- ret = ac->error;
- switch (ret) {
- case 0:
- ret = call->ret0;
- call->ret0 = 0;
-
- fallthrough;
- case -ECONNABORTED:
- ac->responded = true;
- break;
- }
-
-out:
- _debug("call complete");
- afs_put_call(call);
- _leave(" = %p", (void *)ret);
- return ret;
}
/*
diff --git a/fs/afs/server.c b/fs/afs/server.c
index b5237206eac3..e169121f603e 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -21,13 +21,13 @@ static void __afs_put_server(struct afs_net *, struct afs_server *);
/*
* Find a server by one of its addresses.
*/
-struct afs_server *afs_find_server(struct afs_net *net,
- const struct sockaddr_rxrpc *srx)
+struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
{
+ const struct afs_endpoint_state *estate;
const struct afs_addr_list *alist;
struct afs_server *server = NULL;
unsigned int i;
- int seq = 0, diff;
+ int seq = 1;
rcu_read_lock();
@@ -35,39 +35,15 @@ struct afs_server *afs_find_server(struct afs_net *net,
if (server)
afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
server = NULL;
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
- if (srx->transport.family == AF_INET6) {
- const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
- hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
- alist = rcu_dereference(server->addresses);
- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
- b = &alist->addrs[i].transport.sin6;
- diff = ((u16 __force)a->sin6_port -
- (u16 __force)b->sin6_port);
- if (diff == 0)
- diff = memcmp(&a->sin6_addr,
- &b->sin6_addr,
- sizeof(struct in6_addr));
- if (diff == 0)
- goto found;
- }
- }
- } else {
- const struct sockaddr_in *a = &srx->transport.sin, *b;
- hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
- alist = rcu_dereference(server->addresses);
- for (i = 0; i < alist->nr_ipv4; i++) {
- b = &alist->addrs[i].transport.sin;
- diff = ((u16 __force)a->sin_port -
- (u16 __force)b->sin_port);
- if (diff == 0)
- diff = ((u32 __force)a->sin_addr.s_addr -
- (u32 __force)b->sin_addr.s_addr);
- if (diff == 0)
- goto found;
- }
- }
+ hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+ estate = rcu_dereference(server->endpoint_state);
+ alist = estate->addresses;
+ for (i = 0; i < alist->nr_addrs; i++)
+ if (alist->addrs[i].peer == peer)
+ goto found;
}
server = NULL;
@@ -90,7 +66,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
{
struct afs_server *server = NULL;
struct rb_node *p;
- int diff, seq = 0;
+ int diff, seq = 1;
_enter("%pU", uuid);
@@ -102,7 +78,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
if (server)
afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
server = NULL;
-
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_lock, &seq);
p = net->fs_servers.rb_node;
@@ -137,6 +113,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
static struct afs_server *afs_install_server(struct afs_cell *cell,
struct afs_server *candidate)
{
+ const struct afs_endpoint_state *estate;
const struct afs_addr_list *alist;
struct afs_server *server, *next;
struct afs_net *net = cell->net;
@@ -188,8 +165,9 @@ static struct afs_server *afs_install_server(struct afs_cell *cell,
added_dup:
write_seqlock(&net->fs_addr_lock);
- alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&net->fs_addr_lock.lock));
+ estate = rcu_dereference_protected(server->endpoint_state,
+ lockdep_is_held(&net->fs_addr_lock.lock));
+ alist = estate->addresses;
/* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
* it in the IPv4 and/or IPv6 reverse-map lists.
@@ -219,6 +197,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
const uuid_t *uuid,
struct afs_addr_list *alist)
{
+ struct afs_endpoint_state *estate;
struct afs_server *server;
struct afs_net *net = cell->net;
@@ -228,25 +207,41 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
if (!server)
goto enomem;
+ estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL);
+ if (!estate)
+ goto enomem_server;
+
refcount_set(&server->ref, 1);
atomic_set(&server->active, 1);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
- RCU_INIT_POINTER(server->addresses, alist);
server->addr_version = alist->version;
server->uuid = *uuid;
rwlock_init(&server->fs_lock);
- INIT_WORK(&server->initcb_work, afs_server_init_callback_work);
+ INIT_LIST_HEAD(&server->volumes);
init_waitqueue_head(&server->probe_wq);
INIT_LIST_HEAD(&server->probe_link);
spin_lock_init(&server->probe_lock);
server->cell = cell;
server->rtt = UINT_MAX;
+ server->service_id = FS_SERVICE;
+
+ server->probe_counter = 1;
+ server->probed_at = jiffies - LONG_MAX / 2;
+ refcount_set(&estate->ref, 1);
+ estate->addresses = alist;
+ estate->server_id = server->debug_id;
+ estate->probe_seq = 1;
+ rcu_assign_pointer(server->endpoint_state, estate);
afs_inc_servers_outstanding(net);
trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_alloc_server);
_leave(" = %p", server);
return server;
+enomem_server:
+ kfree(server);
enomem:
_leave(" = NULL [nomem]");
return NULL;
@@ -301,20 +296,20 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
candidate = afs_alloc_server(cell, uuid, alist);
if (!candidate) {
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_oom);
return ERR_PTR(-ENOMEM);
}
server = afs_install_server(cell, candidate);
if (server != candidate) {
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_dup);
kfree(candidate);
} else {
/* Immediately dispatch an asynchronous probe to each interface
* on the fileserver. This will make sure the repeat-probing
* service is started.
*/
- afs_fs_probe_fileserver(cell->net, server, key, true);
+ afs_fs_probe_fileserver(cell->net, server, alist, key);
}
return server;
@@ -447,7 +442,8 @@ static void afs_server_rcu(struct rcu_head *rcu)
trace_afs_server(server->debug_id, refcount_read(&server->ref),
atomic_read(&server->active), afs_server_trace_free);
- afs_put_addrlist(rcu_access_pointer(server->addresses));
+ afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state),
+ afs_estate_trace_put_server);
kfree(server);
}
@@ -459,14 +455,10 @@ static void __afs_put_server(struct afs_net *net, struct afs_server *server)
static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server)
{
- struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
- struct afs_addr_cursor ac = {
- .alist = alist,
- .index = alist->preferred,
- .error = 0,
- };
-
- afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+ struct afs_endpoint_state *estate = rcu_access_pointer(server->endpoint_state);
+ struct afs_addr_list *alist = estate->addresses;
+
+ afs_fs_give_up_all_callbacks(net, server, &alist->addrs[alist->preferred], NULL);
}
/*
@@ -477,7 +469,6 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
afs_give_up_callbacks(net, server);
- flush_work(&server->initcb_work);
afs_put_server(net, server, afs_server_trace_destroy);
}
@@ -636,9 +627,12 @@ void afs_purge_servers(struct afs_net *net)
* Get an update for a server's address list.
*/
static noinline bool afs_update_server_record(struct afs_operation *op,
- struct afs_server *server)
+ struct afs_server *server,
+ struct key *key)
{
- struct afs_addr_list *alist, *discard;
+ struct afs_endpoint_state *estate;
+ struct afs_addr_list *alist;
+ bool has_addrs;
_enter("");
@@ -648,29 +642,27 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
if (IS_ERR(alist)) {
+ rcu_read_lock();
+ estate = rcu_dereference(server->endpoint_state);
+ has_addrs = estate->addresses;
+ rcu_read_unlock();
+
if ((PTR_ERR(alist) == -ERESTARTSYS ||
PTR_ERR(alist) == -EINTR) &&
(op->flags & AFS_OPERATION_UNINTR) &&
- server->addresses) {
+ has_addrs) {
_leave(" = t [intr]");
return true;
}
- op->error = PTR_ERR(alist);
- _leave(" = f [%d]", op->error);
+ afs_op_set_error(op, PTR_ERR(alist));
+ _leave(" = f [%d]", afs_op_error(op));
return false;
}
- discard = alist;
- if (server->addr_version != alist->version) {
- write_lock(&server->fs_lock);
- discard = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- rcu_assign_pointer(server->addresses, alist);
- server->addr_version = alist->version;
- write_unlock(&server->fs_lock);
- }
+ if (server->addr_version != alist->version)
+ afs_fs_probe_fileserver(op->net, server, alist, key);
- afs_put_addrlist(discard);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_update);
_leave(" = t");
return true;
}
@@ -678,7 +670,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
/*
* See if a server's address list needs updating.
*/
-bool afs_check_server_record(struct afs_operation *op, struct afs_server *server)
+bool afs_check_server_record(struct afs_operation *op, struct afs_server *server,
+ struct key *key)
{
bool success;
int ret, retries = 0;
@@ -698,7 +691,7 @@ retry:
update:
if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) {
clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags);
- success = afs_update_server_record(op, server);
+ success = afs_update_server_record(op, server, key);
clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags);
wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING);
_leave(" = %d", success);
@@ -710,7 +703,7 @@ wait:
(op->flags & AFS_OPERATION_UNINTR) ?
TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE);
if (ret == -ERESTARTSYS) {
- op->error = ret;
+ afs_op_set_error(op, ret);
_leave(" = f [intr]");
return false;
}
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index b59896b1de0a..7e7e567a7f8a 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -24,35 +24,62 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
/*
* Build a server list from a VLDB record.
*/
-struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
+struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
struct key *key,
- struct afs_vldb_entry *vldb,
- u8 type_mask)
+ struct afs_vldb_entry *vldb)
{
struct afs_server_list *slist;
struct afs_server *server;
- int ret = -ENOMEM, nr_servers = 0, i, j;
-
- for (i = 0; i < vldb->nr_servers; i++)
- if (vldb->fs_mask[i] & type_mask)
- nr_servers++;
+ unsigned int type_mask = 1 << volume->type;
+ bool use_newrepsites = false;
+ int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0;
+
+ /* Work out if we're going to restrict to NEWREPSITE-marked servers or
+ * not. If at least one site is marked as NEWREPSITE, then it's likely
+ * that "vos release" is busy updating RO sites. We cut over from one
+ * to the other when >=50% of the sites have been updated. Sites that
+ * are in the process of being updated are marked DONTUSE.
+ */
+ for (i = 0; i < vldb->nr_servers; i++) {
+ if (!(vldb->fs_mask[i] & type_mask))
+ continue;
+ nr_servers++;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+ continue;
+ usable++;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE)
+ newrep++;
+ }
slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
if (!slist)
goto error;
+ if (newrep) {
+ if (newrep < usable / 2) {
+ slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD;
+ } else {
+ slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW;
+ use_newrepsites = true;
+ }
+ }
+
refcount_set(&slist->usage, 1);
rwlock_init(&slist->lock);
- for (i = 0; i < AFS_MAXTYPES; i++)
- slist->vids[i] = vldb->vid[i];
-
/* Make sure a records exists for each server in the list. */
for (i = 0; i < vldb->nr_servers; i++) {
+ unsigned long se_flags = 0;
+ bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE;
+
if (!(vldb->fs_mask[i] & type_mask))
continue;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+ __set_bit(AFS_SE_EXCLUDED, &se_flags);
+ if (newrep && (newrepsite ^ use_newrepsites))
+ __set_bit(AFS_SE_EXCLUDED, &se_flags);
- server = afs_lookup_server(cell, key, &vldb->fs_server[i],
+ server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i],
vldb->addr_version[i]);
if (IS_ERR(server)) {
ret = PTR_ERR(server);
@@ -70,7 +97,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
break;
if (j < slist->nr_servers) {
if (slist->servers[j].server == server) {
- afs_put_server(cell->net, server,
+ afs_put_server(volume->cell->net, server,
afs_server_trace_put_slist_isort);
continue;
}
@@ -81,6 +108,9 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
}
slist->servers[j].server = server;
+ slist->servers[j].volume = volume;
+ slist->servers[j].flags = se_flags;
+ slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE;
slist->nr_servers++;
}
@@ -92,7 +122,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
return slist;
error_2:
- afs_put_serverlist(cell->net, slist);
+ afs_put_serverlist(volume->cell->net, slist);
error:
return ERR_PTR(ret);
}
@@ -103,27 +133,117 @@ error:
bool afs_annotate_server_list(struct afs_server_list *new,
struct afs_server_list *old)
{
- struct afs_server *cur;
- int i, j;
+ unsigned long mask = 1UL << AFS_SE_EXCLUDED;
+ int i;
- if (old->nr_servers != new->nr_servers)
+ if (old->nr_servers != new->nr_servers ||
+ old->ro_replicating != new->ro_replicating)
goto changed;
- for (i = 0; i < old->nr_servers; i++)
+ for (i = 0; i < old->nr_servers; i++) {
if (old->servers[i].server != new->servers[i].server)
goto changed;
-
+ if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask))
+ goto changed;
+ }
return false;
-
changed:
- /* Maintain the same preferred server as before if possible. */
- cur = old->servers[old->preferred].server;
- for (j = 0; j < new->nr_servers; j++) {
- if (new->servers[j].server == cur) {
- new->preferred = j;
- break;
+ return true;
+}
+
+/*
+ * Attach a volume to the servers it is going to use.
+ */
+void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist)
+{
+ struct afs_server_entry *se, *pe;
+ struct afs_server *server;
+ struct list_head *p;
+ unsigned int i;
+
+ down_write(&volume->cell->vs_lock);
+
+ for (i = 0; i < slist->nr_servers; i++) {
+ se = &slist->servers[i];
+ server = se->server;
+
+ list_for_each(p, &server->volumes) {
+ pe = list_entry(p, struct afs_server_entry, slink);
+ if (volume->vid <= pe->volume->vid)
+ break;
}
+ list_add_tail(&se->slink, p);
}
- return true;
+ slist->attached = true;
+ up_write(&volume->cell->vs_lock);
+}
+
+/*
+ * Reattach a volume to the servers it is going to use when the server list is
+ * replaced. We try to switch the attachment points to avoid rewalking the
+ * lists.
+ */
+void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *new,
+ struct afs_server_list *old)
+{
+ unsigned int n = 0, o = 0;
+
+ down_write(&volume->cell->vs_lock);
+
+ while (n < new->nr_servers || o < old->nr_servers) {
+ struct afs_server_entry *pn = n < new->nr_servers ? &new->servers[n] : NULL;
+ struct afs_server_entry *po = o < old->nr_servers ? &old->servers[o] : NULL;
+ struct afs_server_entry *s;
+ struct list_head *p;
+ int diff;
+
+ if (pn && po && pn->server == po->server) {
+ pn->cb_expires_at = po->cb_expires_at;
+ list_replace(&po->slink, &pn->slink);
+ n++;
+ o++;
+ continue;
+ }
+
+ if (pn && po)
+ diff = memcmp(&pn->server->uuid, &po->server->uuid,
+ sizeof(pn->server->uuid));
+ else
+ diff = pn ? -1 : 1;
+
+ if (diff < 0) {
+ list_for_each(p, &pn->server->volumes) {
+ s = list_entry(p, struct afs_server_entry, slink);
+ if (volume->vid <= s->volume->vid)
+ break;
+ }
+ list_add_tail(&pn->slink, p);
+ n++;
+ } else {
+ list_del(&po->slink);
+ o++;
+ }
+ }
+
+ up_write(&volume->cell->vs_lock);
+}
+
+/*
+ * Detach a volume from the servers it has been using.
+ */
+void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist)
+{
+ unsigned int i;
+
+ if (!slist->attached)
+ return;
+
+ down_write(&volume->cell->vs_lock);
+
+ for (i = 0; i < slist->nr_servers; i++)
+ list_del(&slist->servers[i].slink);
+
+ slist->attached = false;
+ up_write(&volume->cell->vs_lock);
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index a01a0fb2cdbb..ae2d66a52add 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -381,8 +381,7 @@ static int afs_validate_fc(struct fs_context *fc)
ctx->key = key;
if (ctx->volume) {
- afs_put_volume(ctx->net, ctx->volume,
- afs_volume_trace_put_validate_fc);
+ afs_put_volume(ctx->volume, afs_volume_trace_put_validate_fc);
ctx->volume = NULL;
}
@@ -529,7 +528,7 @@ static void afs_destroy_sbi(struct afs_super_info *as)
{
if (as) {
struct afs_net *net = afs_net(as->net_ns);
- afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi);
+ afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi);
afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
put_net(as->net_ns);
kfree(as);
@@ -615,7 +614,7 @@ static void afs_free_fc(struct fs_context *fc)
struct afs_fs_context *ctx = fc->fs_private;
afs_destroy_sbi(fc->s_fs_info);
- afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc);
+ afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc);
afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
key_put(ctx->key);
kfree(ctx);
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
new file mode 100644
index 000000000000..46b37f2cce7d
--- /dev/null
+++ b/fs/afs/validation.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* vnode and volume validity verification.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+/*
+ * Data validation is managed through a number of mechanisms from the server:
+ *
+ * (1) On first contact with a server (such as if it has just been rebooted),
+ * the server sends us a CB.InitCallBackState* request.
+ *
+ * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
+ * calls, the server maintains a time-limited per-vnode promise that it
+ * will send us a CB.CallBack request if a third party alters the vnodes
+ * accessed.
+ *
+ * Note that vnode-level callbacks may also be sent for other reasons,
+ * such as filelock release.
+ *
+ * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
+ * calls, each server maintains a time-limited per-volume promise that it
+ * will send us a CB.CallBack request if the RO volume is updated to a
+ * snapshot of the RW volume ("vos release"). This is an atomic event
+ * that cuts over all instances of the RO volume across multiple servers
+ * simultaneously.
+ *
+ * Note that volume-level callbacks may also be sent for other reasons,
+ * such as the volumeserver taking over control of the volume from the
+ * fileserver.
+ *
+ * Note also that each server maintains an independent time limit on an
+ * independent callback.
+ *
+ * (4) Certain RPC calls include a volume information record "VolSync" in
+ * their reply. This contains a creation date for the volume that should
+ * remain unchanged for a RW volume (but will be changed if the volume is
+ * restored from backup) or will be bumped to the time of snapshotting
+ * when a RO volume is released.
+ *
+ * In order to track these events, the following are provided:
+ *
+ * ->cb_v_break. A counter of events that might mean that the contents of
+ * a volume have been altered since we last checked a vnode.
+ *
+ * ->cb_v_check. A counter of the number of events that we've sent a
+ * query to the server for. Everything's up to date if this equals
+ * cb_v_break.
+ *
+ * ->cb_scrub. A counter of the number of regression events for which we
+ * have to completely wipe the cache.
+ *
+ * ->cb_ro_snapshot. A counter of the number of times that we've
+ * recognised that a RO volume has been updated.
+ *
+ * ->cb_break. A counter of events that might mean that the contents of a
+ * vnode have been altered.
+ *
+ * ->cb_expires_at. The time at which the callback promise expires or
+ * AFS_NO_CB_PROMISE if we have no promise.
+ *
+ * The way we manage things is:
+ *
+ * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
+ * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
+ * volume and volume's server record.
+ *
+ * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
+ * callback break on all the volumes that have been using that server
+ * (ie. increment ->cb_v_break and reset ->cb_expires_at).
+ *
+ * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
+ * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
+ * dispatch a work item to unmap all PTEs to the vnode's pagecache to
+ * force reentry to the filesystem for revalidation.
+ *
+ * (4) When entering the filesystem, we call afs_validate() to check the
+ * validity of a vnode. This first checks to see if ->cb_v_check and
+ * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
+ * exclusively and perform an FS.FetchStatus on the vnode.
+ *
+ * After checking the volume, we check the vnode. If there's a mismatch
+ * between the volume counters and the vnode's mirrors of those counters,
+ * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
+ *
+ * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
+ * parsed:
+ *
+ * (A) If the Creation timestamp has changed on a RW volume or regressed
+ * on a RO volume, we try to increment ->cb_scrub; if it advances on a
+ * RO volume, we assume "vos release" happened and try to increment
+ * ->cb_ro_snapshot.
+ *
+ * (B) If the Update timestamp has regressed, we try to increment
+ * ->cb_scrub.
+ *
+ * Note that in both of these cases, we only do the increment if we can
+ * cmpxchg the value of the timestamp from the value we noted before the
+ * op. This tries to prevent parallel ops from fighting one another.
+ *
+ * volume->cb_v_check is then set to ->cb_v_break.
+ *
+ * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
+ * parsed and used to set the promise in ->cb_expires_at for the vnode,
+ * the volume and the volume's server record.
+ *
+ * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
+ * the vnode.
+ */
+
+/*
+ * Check the validity of a vnode/inode and its parent volume.
+ */
+bool afs_check_validity(const struct afs_vnode *vnode)
+{
+ const struct afs_volume *volume = vnode->volume;
+ time64_t deadline = ktime_get_real_seconds() + 10;
+
+ if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+ atomic64_read(&vnode->cb_expires_at) <= deadline ||
+ volume->cb_expires_at <= deadline ||
+ vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
+ vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
+ test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+ _debug("inval");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * See if the server we've just talked to is currently excluded.
+ */
+static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
+{
+ const struct afs_server_entry *se;
+ const struct afs_server_list *slist;
+ bool is_excluded = true;
+ int i;
+
+ rcu_read_lock();
+
+ slist = rcu_dereference(volume->servers);
+ for (i = 0; i < slist->nr_servers; i++) {
+ se = &slist->servers[i];
+ if (op->server == se->server) {
+ is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return is_excluded;
+}
+
+/*
+ * Update the volume's server list when the creation time changes and see if
+ * the server we've just talked to is currently excluded.
+ */
+static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
+{
+ int ret;
+
+ if (__afs_is_server_excluded(op, volume))
+ return 1;
+
+ set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
+ ret = afs_check_volume_status(op->volume, op);
+ if (ret < 0)
+ return ret;
+
+ return __afs_is_server_excluded(op, volume);
+}
+
+/*
+ * Handle a change to the volume creation time in the VolSync record.
+ */
+static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
+{
+ unsigned int snap;
+ time64_t cur = volume->creation_time;
+ time64_t old = op->pre_volsync.creation;
+ time64_t new = op->volsync.creation;
+ int ret;
+
+ _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
+
+ if (cur == TIME64_MIN) {
+ volume->creation_time = new;
+ return 0;
+ }
+
+ if (new == cur)
+ return 0;
+
+ /* Try to advance the creation timestamp from what we had before the
+ * operation to what we got back from the server. This should
+ * hopefully ensure that in a race between multiple operations only one
+ * of them will do this.
+ */
+ if (cur != old)
+ return 0;
+
+ /* If the creation time changes in an unexpected way, we need to scrub
+ * our caches. For a RW vol, this will only change if the volume is
+ * restored from a backup; for a RO/Backup vol, this will advance when
+ * the volume is updated to a new snapshot (eg. "vos release").
+ */
+ if (volume->type == AFSVL_RWVOL)
+ goto regressed;
+ if (volume->type == AFSVL_BACKVOL) {
+ if (new < old)
+ goto regressed;
+ goto advance;
+ }
+
+ /* We have an RO volume, we need to query the VL server and look at the
+ * server flags to see if RW->RO replication is in progress.
+ */
+ ret = afs_is_server_excluded(op, volume);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ snap = atomic_read(&volume->cb_ro_snapshot);
+ trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
+ return ret;
+ }
+
+advance:
+ snap = atomic_inc_return(&volume->cb_ro_snapshot);
+ trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
+ volume->creation_time = new;
+ return 0;
+
+regressed:
+ atomic_inc(&volume->cb_scrub);
+ trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
+ volume->creation_time = new;
+ return 0;
+}
+
+/*
+ * Handle a change to the volume update time in the VolSync record.
+ */
+static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
+{
+ enum afs_cb_break_reason reason = afs_cb_break_no_break;
+ time64_t cur = volume->update_time;
+ time64_t old = op->pre_volsync.update;
+ time64_t new = op->volsync.update;
+
+ _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
+
+ if (cur == TIME64_MIN) {
+ volume->update_time = new;
+ return;
+ }
+
+ if (new == cur)
+ return;
+
+ /* If the volume update time changes in an unexpected way, we need to
+ * scrub our caches. For a RW vol, this will advance on every
+ * modification op; for a RO/Backup vol, this will advance when the
+ * volume is updated to a new snapshot (eg. "vos release").
+ */
+ if (new < old)
+ reason = afs_cb_break_for_update_regress;
+
+ /* Try to advance the update timestamp from what we had before the
+ * operation to what we got back from the server. This should
+ * hopefully ensure that in a race between multiple operations only one
+ * of them will do this.
+ */
+ if (cur == old) {
+ if (reason == afs_cb_break_for_update_regress) {
+ atomic_inc(&volume->cb_scrub);
+ trace_afs_cb_v_break(volume->vid, 0, reason);
+ }
+ volume->update_time = new;
+ }
+}
+
+static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
+{
+ int ret = 0;
+
+ if (likely(op->volsync.creation == volume->creation_time &&
+ op->volsync.update == volume->update_time))
+ return 0;
+
+ mutex_lock(&volume->volsync_lock);
+ if (op->volsync.creation != volume->creation_time) {
+ ret = afs_update_volume_creation_time(op, volume);
+ if (ret < 0)
+ goto out;
+ }
+ if (op->volsync.update != volume->update_time)
+ afs_update_volume_update_time(op, volume);
+out:
+ mutex_unlock(&volume->volsync_lock);
+ return ret;
+}
+
+/*
+ * Update the state of a volume, including recording the expiration time of the
+ * callback promise. Returns 1 to redo the operation from the start.
+ */
+int afs_update_volume_state(struct afs_operation *op)
+{
+ struct afs_server_list *slist = op->server_list;
+ struct afs_server_entry *se = &slist->servers[op->server_index];
+ struct afs_callback *cb = &op->file[0].scb.callback;
+ struct afs_volume *volume = op->volume;
+ unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
+ unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
+ int ret;
+
+ _enter("%llx", op->volume->vid);
+
+ if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
+ ret = afs_update_volume_times(op, volume);
+ if (ret != 0) {
+ _leave(" = %d", ret);
+ return ret;
+ }
+ }
+
+ if (op->cb_v_break == cb_v_break &&
+ (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
+ time64_t expires_at = cb->expires_at;
+
+ if (!op->file[0].scb.have_cb)
+ expires_at = op->file[1].scb.callback.expires_at;
+
+ se->cb_expires_at = expires_at;
+ volume->cb_expires_at = expires_at;
+ }
+ if (cb_v_check < op->cb_v_break)
+ atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
+ return 0;
+}
+
+/*
+ * mark the data attached to an inode as obsolete due to a write on the server
+ * - might also want to ditch all the outstanding writes and dirty pages
+ */
+static void afs_zap_data(struct afs_vnode *vnode)
+{
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
+
+ afs_invalidate_cache(vnode, 0);
+
+ /* nuke all the non-dirty pages that aren't locked, mapped or being
+ * written back in a regular file and completely discard the pages in a
+ * directory or symlink */
+ if (S_ISREG(vnode->netfs.inode.i_mode))
+ invalidate_remote_inode(&vnode->netfs.inode);
+ else
+ invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
+}
+
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ * symlink)
+ * - parent dir metadata changed (security changes)
+ * - dentry data changed (write, truncate)
+ * - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+ struct afs_volume *volume = vnode->volume;
+ unsigned int cb_ro_snapshot, cb_scrub;
+ time64_t deadline = ktime_get_real_seconds() + 10;
+ bool zap = false, locked_vol = false;
+ int ret;
+
+ _enter("{v={%llx:%llu} fl=%lx},%x",
+ vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+ key_serial(key));
+
+ if (afs_check_validity(vnode))
+ return 0;
+
+ ret = down_write_killable(&vnode->validate_lock);
+ if (ret < 0)
+ goto error;
+
+ /* Validate a volume after the v_break has changed or the volume
+ * callback expired. We only want to do this once per volume per
+ * v_break change. The actual work will be done when parsing the
+ * status fetch reply.
+ */
+ if (volume->cb_expires_at <= deadline ||
+ atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
+ ret = mutex_lock_interruptible(&volume->cb_check_lock);
+ if (ret < 0)
+ goto error_unlock;
+ locked_vol = true;
+ }
+
+ cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+ cb_scrub = atomic_read(&volume->cb_scrub);
+ if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+ vnode->cb_scrub != cb_scrub)
+ unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
+
+ if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+ vnode->cb_scrub != cb_scrub ||
+ volume->cb_expires_at <= deadline ||
+ atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+ atomic64_read(&vnode->cb_expires_at) <= deadline
+ ) {
+ ret = afs_fetch_status(vnode, key, false, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ ret = -ESTALE;
+ }
+ goto error_unlock;
+ }
+
+ _debug("new promise [fl=%lx]", vnode->flags);
+ }
+
+ /* We can drop the volume lock now. */
+ if (locked_vol) {
+ mutex_unlock(&volume->cb_check_lock);
+ locked_vol = false;
+ }
+
+ cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+ cb_scrub = atomic_read(&volume->cb_scrub);
+ _debug("vnode inval %x==%x %x==%x",
+ vnode->cb_ro_snapshot, cb_ro_snapshot,
+ vnode->cb_scrub, cb_scrub);
+ if (vnode->cb_scrub != cb_scrub)
+ zap = true;
+ vnode->cb_ro_snapshot = cb_ro_snapshot;
+ vnode->cb_scrub = cb_scrub;
+
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ _debug("file already deleted");
+ ret = -ESTALE;
+ goto error_unlock;
+ }
+
+ /* if the vnode's data version number changed then its contents are
+ * different */
+ zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+ if (zap)
+ afs_zap_data(vnode);
+ up_write(&vnode->validate_lock);
+ _leave(" = 0");
+ return 0;
+
+error_unlock:
+ if (locked_vol)
+ mutex_unlock(&volume->cb_check_lock);
+ up_write(&vnode->validate_lock);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index f04a80e4f5c3..9f36e14f1c2d 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -33,55 +33,6 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k
}
/*
- * Compare two addresses.
- */
-static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a,
- const struct sockaddr_rxrpc *srx_b)
-{
- short port_a, port_b;
- int addr_a, addr_b, diff;
-
- diff = (short)srx_a->transport_type - (short)srx_b->transport_type;
- if (diff)
- goto out;
-
- switch (srx_a->transport_type) {
- case AF_INET: {
- const struct sockaddr_in *a = &srx_a->transport.sin;
- const struct sockaddr_in *b = &srx_b->transport.sin;
- addr_a = ntohl(a->sin_addr.s_addr);
- addr_b = ntohl(b->sin_addr.s_addr);
- diff = addr_a - addr_b;
- if (diff == 0) {
- port_a = ntohs(a->sin_port);
- port_b = ntohs(b->sin_port);
- diff = port_a - port_b;
- }
- break;
- }
-
- case AF_INET6: {
- const struct sockaddr_in6 *a = &srx_a->transport.sin6;
- const struct sockaddr_in6 *b = &srx_b->transport.sin6;
- diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16);
- if (diff == 0) {
- port_a = ntohs(a->sin6_port);
- port_b = ntohs(b->sin6_port);
- diff = port_a - port_b;
- }
- break;
- }
-
- default:
- WARN_ON(1);
- diff = 1;
- }
-
-out:
- return diff;
-}
-
-/*
* Compare the address lists of a pair of fileservers.
*/
static int afs_compare_fs_alists(const struct afs_server *server_a,
@@ -90,13 +41,13 @@ static int afs_compare_fs_alists(const struct afs_server *server_a,
const struct afs_addr_list *la, *lb;
int a = 0, b = 0, addr_matches = 0;
- la = rcu_dereference(server_a->addresses);
- lb = rcu_dereference(server_b->addresses);
+ la = rcu_dereference(server_a->endpoint_state)->addresses;
+ lb = rcu_dereference(server_b->endpoint_state)->addresses;
while (a < la->nr_addrs && b < lb->nr_addrs) {
- const struct sockaddr_rxrpc *srx_a = &la->addrs[a];
- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b];
- int diff = afs_compare_addrs(srx_a, srx_b);
+ unsigned long pa = (unsigned long)la->addrs[a].peer;
+ unsigned long pb = (unsigned long)lb->addrs[b].peer;
+ long diff = pa - pb;
if (diff < 0) {
a++;
@@ -126,7 +77,7 @@ static int afs_compare_volume_slists(const struct afs_volume *vol_a,
lb = rcu_dereference(vol_b->servers);
for (i = 0; i < AFS_MAXTYPES; i++)
- if (la->vids[i] != lb->vids[i])
+ if (vol_a->vids[i] != vol_b->vids[i])
return 0;
while (a < la->nr_servers && b < lb->nr_servers) {
@@ -205,7 +156,7 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key,
/* And see if it's in the new cell. */
volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len);
if (IS_ERR(volume)) {
- afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias);
+ afs_put_volume(pvol, afs_volume_trace_put_query_alias);
if (PTR_ERR(volume) != -ENOMEDIUM)
return PTR_ERR(volume);
/* That volume is not in the new cell, so not an alias */
@@ -223,8 +174,8 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key,
rcu_read_unlock();
}
- afs_put_volume(cell->net, volume, afs_volume_trace_put_query_alias);
- afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias);
+ afs_put_volume(volume, afs_volume_trace_put_query_alias);
+ afs_put_volume(pvol, afs_volume_trace_put_query_alias);
return ret;
}
@@ -285,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key)
while (afs_select_vlserver(&vc)) {
if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) {
- vc.ac.error = -EOPNOTSUPP;
+ vc.call_error = -EOPNOTSUPP;
skipped = true;
continue;
}
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index acc48216136a..9b1c20daac53 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -13,6 +13,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
unsigned short port)
{
struct afs_vlserver *vlserver;
+ static atomic_t debug_ids;
vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
GFP_KERNEL);
@@ -21,8 +22,10 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
+ vlserver->debug_id = atomic_inc_return(&debug_ids);
vlserver->rtt = UINT_MAX;
vlserver->name_len = name_len;
+ vlserver->service_id = VL_SERVICE;
vlserver->port = port;
memcpy(vlserver->name, name, name_len);
}
@@ -33,7 +36,8 @@ static void afs_vlserver_rcu(struct rcu_head *rcu)
{
struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu);
- afs_put_addrlist(rcu_access_pointer(vlserver->addresses));
+ afs_put_addrlist(rcu_access_pointer(vlserver->addresses),
+ afs_alist_trace_put_vlserver);
kfree_rcu(vlserver, rcu);
}
@@ -83,14 +87,15 @@ static u16 afs_extract_le16(const u8 **_b)
/*
* Build a VL server address list from a DNS queried server list.
*/
-static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net,
+ const u8 **_b, const u8 *end,
u8 nr_addrs, u16 port)
{
struct afs_addr_list *alist;
const u8 *b = *_b;
int ret = -EINVAL;
- alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port);
+ alist = afs_alloc_addrlist(nr_addrs);
if (!alist)
return ERR_PTR(-ENOMEM);
if (nr_addrs == 0)
@@ -109,7 +114,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
goto error;
}
memcpy(x, b, 4);
- afs_merge_fs_addr4(alist, x[0], port);
+ ret = afs_merge_fs_addr4(net, alist, x[0], port);
+ if (ret < 0)
+ goto error;
b += 4;
break;
@@ -119,7 +126,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
goto error;
}
memcpy(x, b, 16);
- afs_merge_fs_addr6(alist, x, port);
+ ret = afs_merge_fs_addr6(net, alist, x, port);
+ if (ret < 0)
+ goto error;
b += 16;
break;
@@ -140,7 +149,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
error:
*_b = b;
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_parse_error);
return ERR_PTR(ret);
}
@@ -247,7 +256,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
/* Extract the addresses - note that we can't skip this as we
* have to advance the payload pointer.
*/
- addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port);
+ addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port);
if (IS_ERR(addrs)) {
ret = PTR_ERR(addrs);
goto error_2;
@@ -255,7 +264,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
if (vllist->nr_servers >= nr_servers) {
_debug("skip %u >= %u", vllist->nr_servers, nr_servers);
- afs_put_addrlist(addrs);
+ afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty);
afs_put_vlserver(cell->net, server);
continue;
}
@@ -264,7 +273,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
addrs->status = bs.status;
if (addrs->nr_addrs == 0) {
- afs_put_addrlist(addrs);
+ afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty);
if (!rcu_access_pointer(server->addresses)) {
afs_put_vlserver(cell->net, server);
continue;
@@ -276,7 +285,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
old = rcu_replace_pointer(server->addresses, old,
lockdep_is_held(&server->lock));
write_unlock(&server->lock);
- afs_put_addrlist(old);
+ afs_put_addrlist(old, afs_alist_trace_put_vlserver_old);
}
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index 58452b86e672..3d2e0c925460 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -46,11 +46,12 @@ static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up)
*/
void afs_vlserver_probe_result(struct afs_call *call)
{
- struct afs_addr_list *alist = call->alist;
+ struct afs_addr_list *alist = call->vl_probe;
struct afs_vlserver *server = call->vlserver;
+ struct afs_address *addr = &alist->addrs[call->probe_index];
unsigned int server_index = call->server_index;
unsigned int rtt_us = 0;
- unsigned int index = call->addr_ix;
+ unsigned int index = call->probe_index;
bool have_result = false;
int ret = call->error;
@@ -89,7 +90,7 @@ void afs_vlserver_probe_result(struct afs_call *call)
case -ETIME:
default:
clear_bit(index, &alist->responded);
- set_bit(index, &alist->failed);
+ set_bit(index, &alist->probe_failed);
if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) &&
(server->probe.error == 0 ||
server->probe.error == -ETIMEDOUT ||
@@ -101,21 +102,21 @@ void afs_vlserver_probe_result(struct afs_call *call)
responded:
set_bit(index, &alist->responded);
- clear_bit(index, &alist->failed);
+ clear_bit(index, &alist->probe_failed);
if (call->service_id == YFS_VL_SERVICE) {
server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
} else {
server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
}
}
- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
+ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
if (rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
@@ -130,8 +131,10 @@ responded:
out:
spin_unlock(&server->probe_lock);
- _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
- server_index, index, &alist->addrs[index].transport, rtt_us, ret);
+ trace_afs_vl_probe(server, false, alist, index, call->error, call->abort_code, rtt_us);
+ _debug("probe [%u][%u] %pISpc rtt=%d ret=%d",
+ server_index, index, rxrpc_kernel_remote_addr(addr->peer),
+ rtt_us, ret);
afs_done_one_vl_probe(server, have_result);
}
@@ -146,35 +149,52 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
unsigned int server_index,
struct afs_error *_e)
{
- struct afs_addr_cursor ac = {
- .index = 0,
- };
+ struct afs_addr_list *alist;
struct afs_call *call;
+ unsigned long unprobed;
+ unsigned int index, i;
bool in_progress = false;
+ int best_prio;
_enter("%s", server->name);
read_lock(&server->lock);
- ac.alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->lock));
+ alist = rcu_dereference_protected(server->addresses,
+ lockdep_is_held(&server->lock));
+ afs_get_addrlist(alist, afs_alist_trace_get_vlprobe);
read_unlock(&server->lock);
- atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
+ atomic_set(&server->probe_outstanding, alist->nr_addrs);
memset(&server->probe, 0, sizeof(server->probe));
server->probe.rtt = UINT_MAX;
- for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
- call = afs_vl_get_capabilities(net, &ac, key, server,
+ unprobed = (1UL << alist->nr_addrs) - 1;
+ while (unprobed) {
+ best_prio = -1;
+ index = 0;
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (test_bit(i, &unprobed) &&
+ alist->addrs[i].prio > best_prio) {
+ index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
+ __clear_bit(index, &unprobed);
+
+ trace_afs_vl_probe(server, true, alist, index, 0, 0, 0);
+ call = afs_vl_get_capabilities(net, alist, index, key, server,
server_index);
if (!IS_ERR(call)) {
+ afs_prioritise_error(_e, call->error, call->abort_code);
afs_put_call(call);
in_progress = true;
} else {
- afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+ afs_prioritise_error(_e, PTR_ERR(call), 0);
afs_done_one_vl_probe(server, false);
}
}
+ afs_put_addrlist(alist, afs_alist_trace_put_vlprobe);
return in_progress;
}
@@ -185,12 +205,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key,
struct afs_vlserver_list *vllist)
{
struct afs_vlserver *server;
- struct afs_error e;
+ struct afs_error e = {};
bool in_progress = false;
int i;
- e.error = 0;
- e.responded = false;
for (i = 0; i < vllist->nr_servers; i++) {
server = vllist->servers[i].server;
if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index eb415ce56360..d8f79f6ada3d 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -17,18 +17,21 @@
bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell,
struct key *key)
{
+ static atomic_t debug_ids;
+
memset(vc, 0, sizeof(*vc));
vc->cell = cell;
vc->key = key;
- vc->error = -EDESTADDRREQ;
- vc->ac.error = SHRT_MAX;
+ vc->cumul_error.error = -EDESTADDRREQ;
+ vc->nr_iterations = -1;
if (signal_pending(current)) {
- vc->error = -EINTR;
+ vc->cumul_error.error = -EINTR;
vc->flags |= AFS_VL_CURSOR_STOP;
return false;
}
+ vc->debug_id = atomic_inc_return(&debug_ids);
return true;
}
@@ -52,7 +55,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
&cell->dns_lookup_count,
smp_load_acquire(&cell->dns_lookup_count)
!= dns_lookup_count) < 0) {
- vc->error = -ERESTARTSYS;
+ vc->cumul_error.error = -ERESTARTSYS;
return false;
}
}
@@ -60,12 +63,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
/* Status load is ordered after lookup counter load */
if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
pr_warn("No record of cell %s\n", cell->name);
- vc->error = -ENOENT;
+ vc->cumul_error.error = -ENOENT;
return false;
}
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
- vc->error = -EDESTADDRREQ;
+ vc->cumul_error.error = -EDESTADDRREQ;
return false;
}
}
@@ -78,8 +81,8 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
if (!vc->server_list->nr_servers)
return false;
- vc->untried = (1UL << vc->server_list->nr_servers) - 1;
- vc->index = -1;
+ vc->untried_servers = (1UL << vc->server_list->nr_servers) - 1;
+ vc->server_index = -1;
return true;
}
@@ -89,54 +92,57 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
*/
bool afs_select_vlserver(struct afs_vl_cursor *vc)
{
- struct afs_addr_list *alist;
+ struct afs_addr_list *alist = vc->alist;
struct afs_vlserver *vlserver;
- struct afs_error e;
- u32 rtt;
- int error = vc->ac.error, i;
+ unsigned long set, failed;
+ unsigned int rtt;
+ s32 abort_code = vc->call_abort_code;
+ int error = vc->call_error, i;
+
+ vc->nr_iterations++;
- _enter("%lx[%d],%lx[%d],%d,%d",
- vc->untried, vc->index,
- vc->ac.tried, vc->ac.index,
- error, vc->ac.abort_code);
+ _enter("VC=%x+%x,%d{%lx},%d{%lx},%d,%d",
+ vc->debug_id, vc->nr_iterations, vc->server_index, vc->untried_servers,
+ vc->addr_index, vc->addr_tried,
+ error, abort_code);
if (vc->flags & AFS_VL_CURSOR_STOP) {
_leave(" = f [stopped]");
return false;
}
- vc->nr_iterations++;
+ if (vc->nr_iterations == 0)
+ goto start;
+
+ WRITE_ONCE(alist->addrs[vc->addr_index].last_error, error);
/* Evaluate the result of the previous operation, if there was one. */
switch (error) {
- case SHRT_MAX:
- goto start;
-
default:
case 0:
/* Success or local failure. Stop. */
- vc->error = error;
+ vc->cumul_error.error = error;
vc->flags |= AFS_VL_CURSOR_STOP;
- _leave(" = f [okay/local %d]", vc->ac.error);
+ _leave(" = f [okay/local %d]", vc->cumul_error.error);
return false;
case -ECONNABORTED:
/* The far side rejected the operation on some grounds. This
* might involve the server being busy or the volume having been moved.
*/
- switch (vc->ac.abort_code) {
+ switch (abort_code) {
case AFSVL_IO:
case AFSVL_BADVOLOPER:
case AFSVL_NOMEM:
/* The server went weird. */
- vc->error = -EREMOTEIO;
+ afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code);
//write_lock(&vc->cell->vl_servers_lock);
- //vc->server_list->weird_mask |= 1 << vc->index;
+ //vc->server_list->weird_mask |= 1 << vc->server_index;
//write_unlock(&vc->cell->vl_servers_lock);
goto next_server;
default:
- vc->error = afs_abort_to_error(vc->ac.abort_code);
+ afs_prioritise_error(&vc->cumul_error, error, abort_code);
goto failed;
}
@@ -149,12 +155,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
case -ETIMEDOUT:
case -ETIME:
_debug("no conn %d", error);
- vc->error = error;
+ afs_prioritise_error(&vc->cumul_error, error, 0);
goto iterate_address;
case -ECONNRESET:
_debug("call reset");
- vc->error = error;
+ afs_prioritise_error(&vc->cumul_error, error, 0);
vc->flags |= AFS_VL_CURSOR_RETRY;
goto next_server;
@@ -165,7 +171,13 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
restart_from_beginning:
_debug("restart");
- afs_end_cursor(&vc->ac);
+ if (vc->call_responded &&
+ vc->addr_index != vc->alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_restart);
+ alist = vc->alist = NULL;
+
afs_put_vlserverlist(vc->cell->net, vc->server_list);
vc->server_list = NULL;
if (vc->flags & AFS_VL_CURSOR_RETRIED)
@@ -173,53 +185,58 @@ restart_from_beginning:
vc->flags |= AFS_VL_CURSOR_RETRIED;
start:
_debug("start");
+ ASSERTCMP(alist, ==, NULL);
if (!afs_start_vl_iteration(vc))
goto failed;
error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_prioritise_error(&vc->cumul_error, error, 0);
+ goto failed;
+ }
pick_server:
- _debug("pick [%lx]", vc->untried);
+ _debug("pick [%lx]", vc->untried_servers);
+ ASSERTCMP(alist, ==, NULL);
- error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
- if (error < 0)
- goto failed_set_error;
+ error = afs_wait_for_vl_probes(vc->server_list, vc->untried_servers);
+ if (error < 0) {
+ afs_prioritise_error(&vc->cumul_error, error, 0);
+ goto failed;
+ }
/* Pick the untried server with the lowest RTT. */
- vc->index = vc->server_list->preferred;
- if (test_bit(vc->index, &vc->untried))
+ vc->server_index = vc->server_list->preferred;
+ if (test_bit(vc->server_index, &vc->untried_servers))
goto selected_server;
- vc->index = -1;
- rtt = U32_MAX;
+ vc->server_index = -1;
+ rtt = UINT_MAX;
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
- if (!test_bit(i, &vc->untried) ||
+ if (!test_bit(i, &vc->untried_servers) ||
!test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
continue;
- if (s->probe.rtt < rtt) {
- vc->index = i;
+ if (s->probe.rtt <= rtt) {
+ vc->server_index = i;
rtt = s->probe.rtt;
}
}
- if (vc->index == -1)
+ if (vc->server_index == -1)
goto no_more_servers;
selected_server:
- _debug("use %d", vc->index);
- __clear_bit(vc->index, &vc->untried);
+ _debug("use %d", vc->server_index);
+ __clear_bit(vc->server_index, &vc->untried_servers);
/* We're starting on a different vlserver from the list. We need to
* check it, find its address list and probe its capabilities before we
* use it.
*/
- ASSERTCMP(vc->ac.alist, ==, NULL);
- vlserver = vc->server_list->servers[vc->index].server;
+ vlserver = vc->server_list->servers[vc->server_index].server;
vc->server = vlserver;
_debug("USING VLSERVER: %s", vlserver->name);
@@ -227,34 +244,48 @@ selected_server:
read_lock(&vlserver->lock);
alist = rcu_dereference_protected(vlserver->addresses,
lockdep_is_held(&vlserver->lock));
- afs_get_addrlist(alist);
+ vc->alist = afs_get_addrlist(alist, afs_alist_trace_get_vlrotate_set);
read_unlock(&vlserver->lock);
- memset(&vc->ac, 0, sizeof(vc->ac));
-
- if (!vc->ac.alist)
- vc->ac.alist = alist;
- else
- afs_put_addrlist(alist);
-
- vc->ac.index = -1;
+ vc->addr_tried = 0;
+ vc->addr_index = -1;
iterate_address:
- ASSERT(vc->ac.alist);
/* Iterate over the current server's address list to try and find an
* address on which it will respond to us.
*/
- if (!afs_iterate_addresses(&vc->ac))
+ set = READ_ONCE(alist->responded);
+ failed = READ_ONCE(alist->probe_failed);
+ vc->addr_index = READ_ONCE(alist->preferred);
+
+ _debug("%lx-%lx-%lx,%d", set, failed, vc->addr_tried, vc->addr_index);
+
+ set &= ~(failed | vc->addr_tried);
+
+ if (!set)
goto next_server;
- _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+ if (!test_bit(vc->addr_index, &set))
+ vc->addr_index = __ffs(set);
+
+ set_bit(vc->addr_index, &vc->addr_tried);
+ vc->alist = alist;
- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
+ _debug("VL address %d/%d", vc->addr_index, alist->nr_addrs);
+
+ vc->call_responded = false;
+ _leave(" = t %pISpc", rxrpc_kernel_remote_addr(alist->addrs[vc->addr_index].peer));
return true;
next_server:
_debug("next");
- afs_end_cursor(&vc->ac);
+ ASSERT(alist);
+ if (vc->call_responded &&
+ vc->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_next);
+ alist = vc->alist = NULL;
goto pick_server;
no_more_servers:
@@ -264,25 +295,26 @@ no_more_servers:
if (vc->flags & AFS_VL_CURSOR_RETRY)
goto restart_from_beginning;
- e.error = -EDESTADDRREQ;
- e.responded = false;
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
- e.responded = true;
- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+ vc->cumul_error.responded = true;
+ afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error),
s->probe.abort_code);
}
- error = e.error;
-
-failed_set_error:
- vc->error = error;
failed:
+ if (alist) {
+ if (vc->call_responded &&
+ vc->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_fail);
+ alist = vc->alist = NULL;
+ }
vc->flags |= AFS_VL_CURSOR_STOP;
- afs_end_cursor(&vc->ac);
- _leave(" = f [failed %d]", vc->error);
+ _leave(" = f [failed %d]", vc->cumul_error.error);
return false;
}
@@ -305,7 +337,10 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
pr_notice("DNS: src=%u st=%u lc=%x\n",
cell->dns_source, cell->dns_status, cell->dns_lookup_count);
pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
- vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
+ vc->untried_servers, vc->server_index, vc->nr_iterations,
+ vc->flags, vc->cumul_error.error);
+ pr_notice("VC: call er=%d ac=%d r=%u\n",
+ vc->call_error, vc->call_abort_code, vc->call_responded);
if (vc->server_list) {
const struct afs_vlserver_list *sl = vc->server_list;
@@ -322,16 +357,14 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
a->nr_ipv4, a->nr_addrs, a->max_addrs,
a->preferred);
pr_notice("VC: - R=%lx F=%lx\n",
- a->responded, a->failed);
- if (a == vc->ac.alist)
+ a->responded, a->probe_failed);
+ if (a == vc->alist)
pr_notice("VC: - current\n");
}
}
}
- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
- vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
- vc->ac.responded, vc->ac.nr_iterations);
+ pr_notice("AC: t=%lx ax=%u\n", vc->addr_tried, vc->addr_index);
rcu_read_unlock();
}
@@ -342,17 +375,25 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
{
struct afs_net *net = vc->cell->net;
- if (vc->error == -EDESTADDRREQ ||
- vc->error == -EADDRNOTAVAIL ||
- vc->error == -ENETUNREACH ||
- vc->error == -EHOSTUNREACH)
+ _enter("VC=%x+%x", vc->debug_id, vc->nr_iterations);
+
+ switch (vc->cumul_error.error) {
+ case -EDESTADDRREQ:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
afs_vl_dump_edestaddrreq(vc);
+ break;
+ }
- afs_end_cursor(&vc->ac);
+ if (vc->alist) {
+ if (vc->call_responded &&
+ vc->addr_index != vc->alist->preferred &&
+ test_bit(vc->alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(vc->alist->preferred, vc->addr_index);
+ afs_put_addrlist(vc->alist, afs_alist_trace_put_vlrotate_end);
+ vc->alist = NULL;
+ }
afs_put_vlserverlist(net, vc->server_list);
-
- if (vc->error == -ECONNABORTED)
- vc->error = afs_abort_to_error(vc->ac.abort_code);
-
- return vc->error;
+ return vc->cumul_error.error;
}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 00fca3c66ba6..cac75f89b64a 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -18,8 +18,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
{
struct afs_uvldbentry__xdr *uvldb;
struct afs_vldb_entry *entry;
- bool new_only = false;
- u32 tmp, nr_servers, vlflags;
+ u32 nr_servers, vlflags;
int i, ret;
_enter("");
@@ -41,27 +40,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
entry->name[i] = 0;
entry->name_len = strlen(entry->name);
- /* If there is a new replication site that we can use, ignore all the
- * sites that aren't marked as new.
- */
- for (i = 0; i < nr_servers; i++) {
- tmp = ntohl(uvldb->serverFlags[i]);
- if (!(tmp & AFS_VLSF_DONTUSE) &&
- (tmp & AFS_VLSF_NEWREPSITE))
- new_only = true;
- }
-
vlflags = ntohl(uvldb->flags);
for (i = 0; i < nr_servers; i++) {
struct afs_uuid__xdr *xdr;
struct afs_uuid *uuid;
+ u32 tmp = ntohl(uvldb->serverFlags[i]);
int j;
int n = entry->nr_servers;
- tmp = ntohl(uvldb->serverFlags[i]);
- if (tmp & AFS_VLSF_DONTUSE ||
- (new_only && !(tmp & AFS_VLSF_NEWREPSITE)))
- continue;
if (tmp & AFS_VLSF_RWVOL) {
entry->fs_mask[n] |= AFS_VOL_VTM_RW;
if (vlflags & AFS_VLF_BACKEXISTS)
@@ -82,6 +68,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
for (j = 0; j < 6; j++)
uuid->node[j] = (u8)ntohl(xdr->node[j]);
+ entry->vlsf_flags[n] = tmp;
entry->addr_version[n] = ntohl(uvldb->serverUnique[i]);
entry->nr_servers++;
}
@@ -106,12 +93,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
return 0;
}
-static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call)
-{
- kfree(call->ret_vldb);
- afs_flat_call_destructor(call);
-}
-
/*
* VL.GetEntryByNameU operation type.
*/
@@ -119,7 +100,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = {
.name = "VL.GetEntryByNameU",
.op = afs_VL_GetEntryByNameU,
.deliver = afs_deliver_vl_get_entry_by_name_u,
- .destructor = afs_destroy_vl_get_entry_by_name_u,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -155,6 +136,8 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_vldb = entry;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -165,8 +148,17 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
memset((void *)bp + volnamesz, 0, padsz);
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ afs_put_call(call);
+ if (vc->call_error) {
+ kfree(entry);
+ return ERR_PTR(vc->call_error);
+ }
+ return entry;
}
/*
@@ -208,7 +200,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
count = ntohl(*bp);
nentries = min(nentries, count);
- alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT);
+ alist = afs_alloc_addrlist(nentries);
if (!alist)
return -ENOMEM;
alist->version = uniquifier;
@@ -230,9 +222,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
alist = call->ret_alist;
bp = call->buffer;
count = min(call->count, 4U);
- for (i = 0; i < count; i++)
- if (alist->nr_addrs < call->count2)
- afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
+ for (i = 0; i < count; i++) {
+ if (alist->nr_addrs < call->count2) {
+ ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT);
+ if (ret < 0)
+ return ret;
+ }
+ }
call->count -= count;
if (call->count > 0)
@@ -245,12 +241,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
return 0;
}
-static void afs_vl_get_addrs_u_destructor(struct afs_call *call)
-{
- afs_put_addrlist(call->ret_alist);
- return afs_flat_call_destructor(call);
-}
-
/*
* VL.GetAddrsU operation type.
*/
@@ -258,7 +248,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = {
.name = "VL.GetAddrsU",
.op = afs_VL_GetAddrsU,
.deliver = afs_deliver_vl_get_addrs_u,
- .destructor = afs_vl_get_addrs_u_destructor,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -269,6 +259,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
struct afs_ListAddrByAttributes__xdr *r;
+ struct afs_addr_list *alist;
const struct afs_uuid *u = (const struct afs_uuid *)uuid;
struct afs_call *call;
struct afs_net *net = vc->cell->net;
@@ -286,6 +277,8 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_alist = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -304,8 +297,18 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
r->uuid.node[i] = htonl(u->node[i]);
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ alist = call->ret_alist;
+ afs_put_call(call);
+ if (vc->call_error) {
+ afs_put_addrlist(alist, afs_alist_trace_put_getaddru);
+ return ERR_PTR(vc->call_error);
+ }
+ return alist;
}
/*
@@ -355,6 +358,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
static void afs_destroy_vl_get_capabilities(struct afs_call *call)
{
+ afs_put_addrlist(call->vl_probe, afs_alist_trace_put_vlgetcaps);
afs_put_vlserver(call->net, call->vlserver);
afs_flat_call_destructor(call);
}
@@ -378,7 +382,8 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
* other end supports.
*/
struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
- struct afs_addr_cursor *ac,
+ struct afs_addr_list *alist,
+ unsigned int addr_index,
struct key *key,
struct afs_vlserver *server,
unsigned int server_index)
@@ -395,6 +400,10 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
call->key = key;
call->vlserver = afs_get_vlserver(server);
call->server_index = server_index;
+ call->peer = rxrpc_kernel_get_peer(alist->addrs[addr_index].peer);
+ call->vl_probe = afs_get_addrlist(alist, afs_alist_trace_get_vlgetcaps);
+ call->probe_index = addr_index;
+ call->service_id = server->service_id;
call->upgrade = true;
call->async = true;
call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
@@ -405,7 +414,7 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
/* Can't take a ref on server */
trace_afs_make_vl_call(call);
- afs_make_call(ac, call, GFP_KERNEL);
+ afs_make_call(call, GFP_KERNEL);
return call;
}
@@ -450,7 +459,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (call->count > YFS_MAXENDPOINTS)
return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num);
- alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
+ alist = afs_alloc_addrlist(call->count);
if (!alist)
return -ENOMEM;
alist->version = uniquifier;
@@ -488,14 +497,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (ntohl(bp[0]) != sizeof(__be32) * 2)
return afs_protocol_error(
call, afs_eproto_yvl_fsendpt4_len);
- afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
+ ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2]));
+ if (ret < 0)
+ return ret;
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
return afs_protocol_error(
call, afs_eproto_yvl_fsendpt6_len);
- afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
+ ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5]));
+ if (ret < 0)
+ return ret;
bp += 6;
break;
default:
@@ -610,7 +623,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
.name = "YFSVL.GetEndpoints",
.op = afs_YFSVL_GetEndpoints,
.deliver = afs_deliver_yfsvl_get_endpoints,
- .destructor = afs_vl_get_addrs_u_destructor,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -620,6 +633,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
+ struct afs_addr_list *alist;
struct afs_call *call;
struct afs_net *net = vc->cell->net;
__be32 *bp;
@@ -635,6 +649,8 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_alist = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -643,8 +659,18 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ alist = call->ret_alist;
+ afs_put_call(call);
+ if (vc->call_error) {
+ afs_put_addrlist(alist, afs_alist_trace_put_getaddru);
+ return ERR_PTR(vc->call_error);
+ }
+ return alist;
}
/*
@@ -709,12 +735,6 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
return 0;
}
-static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call)
-{
- kfree(call->ret_str);
- afs_flat_call_destructor(call);
-}
-
/*
* VL.GetCapabilities operation type
*/
@@ -722,7 +742,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = {
.name = "YFSVL.GetCellName",
.op = afs_YFSVL_GetCellName,
.deliver = afs_deliver_yfsvl_get_cell_name,
- .destructor = afs_destroy_yfsvl_get_cell_name,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -737,6 +757,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
struct afs_call *call;
struct afs_net *net = vc->cell->net;
__be32 *bp;
+ char *cellname;
_enter("");
@@ -747,6 +768,8 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
call->key = vc->key;
call->ret_str = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* marshall the parameters */
bp = call->request;
@@ -754,6 +777,16 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
/* Can't take a ref on server */
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (char *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ cellname = call->ret_str;
+ afs_put_call(call);
+ if (vc->call_error) {
+ kfree(cellname);
+ return ERR_PTR(vc->call_error);
+ }
+ return cellname;
}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 115c081a8e2c..020ecd45e476 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -11,6 +11,8 @@
static unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static void afs_destroy_volume(struct work_struct *work);
+
/*
* Insert a volume into a cell. If there's an existing volume record, that is
* returned instead with a ref held.
@@ -72,11 +74,11 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
*/
static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
struct afs_vldb_entry *vldb,
- unsigned long type_mask)
+ struct afs_server_list **_slist)
{
struct afs_server_list *slist;
struct afs_volume *volume;
- int ret = -ENOMEM;
+ int ret = -ENOMEM, i;
volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
if (!volume)
@@ -88,20 +90,30 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->type = params->type;
volume->type_force = params->force;
volume->name_len = vldb->name_len;
+ volume->creation_time = TIME64_MIN;
+ volume->update_time = TIME64_MIN;
refcount_set(&volume->ref, 1);
INIT_HLIST_NODE(&volume->proc_link);
+ INIT_WORK(&volume->destructor, afs_destroy_volume);
rwlock_init(&volume->servers_lock);
+ mutex_init(&volume->volsync_lock);
+ mutex_init(&volume->cb_check_lock);
rwlock_init(&volume->cb_v_break_lock);
+ INIT_LIST_HEAD(&volume->open_mmaps);
+ init_rwsem(&volume->open_mmaps_lock);
memcpy(volume->name, vldb->name, vldb->name_len + 1);
- slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask);
+ for (i = 0; i < AFS_MAXTYPES; i++)
+ volume->vids[i] = vldb->vid[i];
+
+ slist = afs_alloc_server_list(volume, params->key, vldb);
if (IS_ERR(slist)) {
ret = PTR_ERR(slist);
goto error_1;
}
- refcount_set(&slist->usage, 1);
+ *_slist = slist;
rcu_assign_pointer(volume->servers, slist);
trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc);
return volume;
@@ -117,18 +129,20 @@ error_0:
* Look up or allocate a volume record.
*/
static struct afs_volume *afs_lookup_volume(struct afs_fs_context *params,
- struct afs_vldb_entry *vldb,
- unsigned long type_mask)
+ struct afs_vldb_entry *vldb)
{
+ struct afs_server_list *slist;
struct afs_volume *candidate, *volume;
- candidate = afs_alloc_volume(params, vldb, type_mask);
+ candidate = afs_alloc_volume(params, vldb, &slist);
if (IS_ERR(candidate))
return candidate;
volume = afs_insert_volume_into_cell(params->cell, candidate);
- if (volume != candidate)
- afs_put_volume(params->net, candidate, afs_volume_trace_put_cell_dup);
+ if (volume == candidate)
+ afs_attach_volume_to_servers(volume, slist);
+ else
+ afs_put_volume(candidate, afs_volume_trace_put_cell_dup);
return volume;
}
@@ -208,8 +222,7 @@ struct afs_volume *afs_create_volume(struct afs_fs_context *params)
goto error;
}
- type_mask = 1UL << params->type;
- volume = afs_lookup_volume(params, vldb, type_mask);
+ volume = afs_lookup_volume(params, vldb);
error:
kfree(vldb);
@@ -219,16 +232,20 @@ error:
/*
* Destroy a volume record
*/
-static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
+static void afs_destroy_volume(struct work_struct *work)
{
+ struct afs_volume *volume = container_of(work, struct afs_volume, destructor);
+ struct afs_server_list *slist = rcu_access_pointer(volume->servers);
+
_enter("%p", volume);
#ifdef CONFIG_AFS_FSCACHE
ASSERTCMP(volume->cache, ==, NULL);
#endif
+ afs_detach_volume_from_servers(volume, slist);
afs_remove_volume_from_cell(volume);
- afs_put_serverlist(net, rcu_access_pointer(volume->servers));
+ afs_put_serverlist(volume->cell->net, slist);
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
trace_afs_volume(volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
@@ -270,8 +287,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
/*
* Drop a reference on a volume record.
*/
-void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
- enum afs_volume_trace reason)
+void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason)
{
if (volume) {
afs_volid_t vid = volume->vid;
@@ -281,7 +297,7 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
zero = __refcount_dec_and_test(&volume->ref, &r);
trace_afs_volume(vid, r - 1, reason);
if (zero)
- afs_destroy_volume(net, volume);
+ schedule_work(&volume->destructor);
}
}
@@ -362,8 +378,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
}
/* See if the volume's server list got updated. */
- new = afs_alloc_server_list(volume->cell, key,
- vldb, (1 << volume->type));
+ new = afs_alloc_server_list(volume, key, vldb);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto error_vldb;
@@ -382,11 +397,17 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
discard = old;
}
- volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
+ /* Check more often if replication is ongoing. */
+ if (new->ro_replicating)
+ volume->update_at = ktime_get_real_seconds() + 10 * 60;
+ else
+ volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
write_unlock(&volume->servers_lock);
- ret = 0;
+ if (discard == old)
+ afs_reattach_volume_to_servers(volume, new, old);
afs_put_serverlist(volume->cell->net, discard);
+ ret = 0;
error_vldb:
kfree(vldb);
error:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index e87b52b1f34c..61d34ad2ca7d 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -366,7 +366,7 @@ static void afs_store_data_success(struct afs_operation *op)
op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
- if (op->error == 0) {
+ if (!afs_op_error(op)) {
if (!op->store.laundering)
afs_pages_written_back(vnode, op->store.pos, op->store.size);
afs_stat_v(vnode, n_stores);
@@ -428,7 +428,7 @@ try_next_key:
afs_wait_for_operation(op);
- switch (op->error) {
+ switch (afs_op_error(op)) {
case -EACCES:
case -EPERM:
case -ENOKEY:
@@ -447,7 +447,7 @@ try_next_key:
}
afs_put_wb_key(wbk);
- _leave(" = %d", op->error);
+ _leave(" = %d", afs_op_error(op));
return afs_put_operation(op);
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 11571cca86c1..f521e66d3bf6 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -245,12 +245,15 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp,
struct afs_volsync *volsync)
{
struct yfs_xdr_YFSVolSync *x = (void *)*_bp;
- u64 creation;
+ u64 creation, update;
if (volsync) {
creation = xdr_to_u64(x->vol_creation_date);
do_div(creation, 10 * 1000 * 1000);
volsync->creation = creation;
+ update = xdr_to_u64(x->vol_update_date);
+ do_div(update, 10 * 1000 * 1000);
+ volsync->update = update;
}
*_bp += xdr_size(x);
@@ -490,6 +493,7 @@ void yfs_fs_fetch_data(struct afs_operation *op)
bp = xdr_encode_u64(bp, req->len);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -572,6 +576,7 @@ void yfs_fs_create_file(struct afs_operation *op)
bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -620,6 +625,7 @@ void yfs_fs_make_dir(struct afs_operation *op)
bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -704,6 +710,7 @@ void yfs_fs_remove_file2(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -773,6 +780,7 @@ void yfs_fs_remove_file(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -814,6 +822,7 @@ void yfs_fs_remove_dir(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -887,6 +896,7 @@ void yfs_fs_link(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call1(call, &vp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -968,6 +978,7 @@ void yfs_fs_symlink(struct afs_operation *op)
bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1047,6 +1058,7 @@ void yfs_fs_rename(struct afs_operation *op)
bp = xdr_encode_name(bp, new_name);
yfs_check_req(call, bp);
+ call->fid = orig_dvp->fid;
trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1102,6 +1114,7 @@ void yfs_fs_store_data(struct afs_operation *op)
bp = xdr_encode_u64(bp, op->store.i_size);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1158,6 +1171,7 @@ static void yfs_fs_setattr_size(struct afs_operation *op)
bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1196,6 +1210,7 @@ void yfs_fs_setattr(struct afs_operation *op)
bp = xdr_encode_YFS_StoreStatus(bp, attr);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1366,6 +1381,7 @@ void yfs_fs_get_volume_status(struct afs_operation *op)
bp = xdr_encode_u64(bp, vp->fid.vid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1430,6 +1446,7 @@ void yfs_fs_set_lock(struct afs_operation *op)
bp = xdr_encode_u32(bp, op->lock.type);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_calli(call, &vp->fid, op->lock.type);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1460,6 +1477,7 @@ void yfs_fs_extend_lock(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1490,6 +1508,7 @@ void yfs_fs_release_lock(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1556,6 +1575,7 @@ void yfs_fs_fetch_status(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1736,6 +1756,7 @@ void yfs_fs_inline_bulk_status(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &op->more_files[i].fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1898,6 +1919,7 @@ void yfs_fs_fetch_opaque_acl(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
@@ -1948,6 +1970,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op)
bp += size / sizeof(__be32);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}