Diffstat (limited to 'net')
44 files changed, 762 insertions, 1694 deletions
diff --git a/net/9p/client.c b/net/9p/client.c index 2c9a17b9b46b..357214a51f13 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -181,6 +181,12 @@ static int parse_opts(char *opts, struct p9_client *clnt) ret = r; continue; } + if (option < 4096) { + p9_debug(P9_DEBUG_ERROR, + "msize should be at least 4k\n"); + ret = -EINVAL; + continue; + } clnt->msize = option; break; case Opt_trans: @@ -983,10 +989,18 @@ static int p9_client_version(struct p9_client *c) else if (!strncmp(version, "9P2000", 6)) c->proto_version = p9_proto_legacy; else { + p9_debug(P9_DEBUG_ERROR, + "server returned an unknown version: %s\n", version); err = -EREMOTEIO; goto error; } + if (msize < 4096) { + p9_debug(P9_DEBUG_ERROR, + "server returned a msize < 4096: %d\n", msize); + err = -EREMOTEIO; + goto error; + } if (msize < c->msize) c->msize = msize; @@ -1043,6 +1057,13 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) if (clnt->msize > clnt->trans_mod->maxsize) clnt->msize = clnt->trans_mod->maxsize; + if (clnt->msize < 4096) { + p9_debug(P9_DEBUG_ERROR, + "Please specify a msize of at least 4k\n"); + err = -EINVAL; + goto free_client; + } + err = p9_client_version(clnt); if (err) goto close_trans; diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c index b718db2085b2..3dff68f05fb9 100644 --- a/net/9p/trans_common.c +++ b/net/9p/trans_common.c @@ -14,6 +14,7 @@ #include <linux/mm.h> #include <linux/module.h> +#include "trans_common.h" /** * p9_release_pages - Release pages after the transaction. diff --git a/net/compat.c b/net/compat.c index d1f3a8a0b3ef..c3a2f868e8af 100644 --- a/net/compat.c +++ b/net/compat.c @@ -813,34 +813,23 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen); } -static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg, - unsigned int vlen, unsigned int flags, - struct old_timespec32 __user *timeout) +COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct __kernel_timespec __user *, timeout) { - int datagrams; - struct timespec64 ktspec; - - if (timeout == NULL) - return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, NULL); - - if (compat_get_timespec64(&ktspec, timeout)) - return -EFAULT; - - datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, - flags | MSG_CMSG_COMPAT, &ktspec); - if (datagrams > 0 && compat_put_timespec64(&ktspec, timeout)) - datagrams = -EFAULT; - - return datagrams; + return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, timeout, NULL); } +#ifdef CONFIG_COMPAT_32BIT_TIME COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct old_timespec32 __user *, timeout) { - return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); + return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, + flags | MSG_CMSG_COMPAT, NULL, timeout); } +#endif COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) { @@ -928,8 +917,9 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); break; case SYS_RECVMMSG: - ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3], - compat_ptr(a[4])); + ret = __sys_recvmmsg(a0, compat_ptr(a1), a[2], + a[3] | MSG_CMSG_COMPAT, NULL, + compat_ptr(a[4])); break; case SYS_ACCEPT4: ret 
= __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]); diff --git a/net/core/datagram.c b/net/core/datagram.c index 4bf62b1afa3b..b2651bb6d2a3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -408,27 +408,20 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) } EXPORT_SYMBOL(skb_kill_datagram); -/** - * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: iovec iterator to copy to - * @len: amount of data to copy from buffer to iovec - */ -int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, - struct iov_iter *to, int len) +int __skb_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, bool fault_short, + size_t (*cb)(const void *, size_t, void *, struct iov_iter *), + void *data) { int start = skb_headlen(skb); int i, copy = start - offset, start_off = offset, n; struct sk_buff *frag_iter; - trace_skb_copy_datagram_iovec(skb, len); - /* Copy header. */ if (copy > 0) { if (copy > len) copy = len; - n = copy_to_iter(skb->data + offset, copy, to); + n = cb(skb->data + offset, copy, data, to); offset += n; if (n != copy) goto short_copy; @@ -445,11 +438,14 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, end = start + skb_frag_size(frag); if ((copy = end - offset) > 0) { + struct page *page = skb_frag_page(frag); + u8 *vaddr = kmap(page); + if (copy > len) copy = len; - n = copy_page_to_iter(skb_frag_page(frag), - frag->page_offset + offset - - start, copy, to); + n = cb(vaddr + frag->page_offset + + offset - start, copy, data, to); + kunmap(page); offset += n; if (n != copy) goto short_copy; @@ -468,8 +464,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, if ((copy = end - offset) > 0) { if (copy > len) copy = len; - if (skb_copy_datagram_iter(frag_iter, offset - start, - to, copy)) + if (__skb_datagram_iter(frag_iter, offset - start, + to, copy, fault_short, cb, data)) goto fault; if ((len -= copy) == 0) return 0; @@ -490,11 +486,50 @@ fault: return -EFAULT; short_copy: - if (iov_iter_count(to)) + if (fault_short || iov_iter_count(to)) goto fault; return 0; } + +/** + * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator + * and update a hash. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @hash: hash request to update + */ +int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len, + struct ahash_request *hash) +{ + return __skb_datagram_iter(skb, offset, to, len, true, + hash_and_copy_to_iter, hash); +} +EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); + +static size_t simple_copy_to_iter(const void *addr, size_t bytes, + void *data __always_unused, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +/** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. 
+ * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + trace_skb_copy_datagram_iovec(skb, len); + return __skb_datagram_iter(skb, offset, to, len, false, + simple_copy_to_iter, NULL); +} EXPORT_SYMBOL(skb_copy_datagram_iter); /** @@ -645,87 +680,21 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) } EXPORT_SYMBOL(zerocopy_sg_from_iter); +/** + * skb_copy_and_csum_datagram_iter - Copy datagram to an iovec iterator + * and update a checksum. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + * @csump: checksum pointer + */ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, struct iov_iter *to, int len, __wsum *csump) { - int start = skb_headlen(skb); - int i, copy = start - offset, start_off = offset; - struct sk_buff *frag_iter; - int pos = 0; - int n; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to); - offset += n; - if (n != copy) - goto fault; - if ((len -= copy) == 0) - return 0; - pos = copy; - } - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - struct page *page = skb_frag_page(frag); - u8 *vaddr = kmap(page); - - if (copy > len) - copy = len; - n = csum_and_copy_to_iter(vaddr + frag->page_offset + - offset - start, copy, - &csum2, to); - kunmap(page); - offset += n; - if (n != copy) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if (!(len -= copy)) - return 0; - pos += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - __wsum csum2 = 0; - if (copy > len) - copy = len; - if (skb_copy_and_csum_datagram(frag_iter, - offset - start, - to, copy, - &csum2)) - goto fault; - *csump = csum_block_add(*csump, csum2, pos); - if ((len -= copy) == 0) - return 0; - offset += copy; - pos += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - iov_iter_revert(to, offset - start_off); - return -EFAULT; + return __skb_datagram_iter(skb, offset, to, len, true, + csum_and_copy_to_iter, csump); } /** diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2cc5fbb1b29e..0e2f71ab8367 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1131,6 +1131,7 @@ EXPORT_SYMBOL_GPL(dccp_debug); static int __init dccp_init(void) { unsigned long goal; + unsigned long nr_pages = totalram_pages(); int ehash_order, bhash_order, i; int rc; @@ -1157,10 +1158,10 @@ static int __init dccp_init(void) * * The methodology is similar to that of the buffer cache. 
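The net/core/datagram.c hunks above fold the plain copy, checksum-and-copy and hash-and-copy paths into a single walker, __skb_datagram_iter(), parameterised by a per-chunk callback and an opaque state pointer. As a hedged illustration of that pattern (not part of the patch, and assuming the code sits next to the walker in net/core/datagram.c), another copy-and-transform variant could be layered on top like this; crc32_copy_to_iter() and skb_copy_and_crc_datagram_iter() are hypothetical names:

#include <linux/crc32.h>
#include <linux/skbuff.h>
#include <linux/uio.h>

/* Callback with the shape __skb_datagram_iter() expects: copy one linear
 * chunk (header, page fragment or frag-list data) into the iterator while
 * folding it into the caller state passed via 'data'. */
static size_t crc32_copy_to_iter(const void *addr, size_t bytes,
				 void *data, struct iov_iter *i)
{
	u32 *crc = data;

	*crc = crc32_le(*crc, addr, bytes);
	return copy_to_iter(addr, bytes, i);
}

static int skb_copy_and_crc_datagram_iter(const struct sk_buff *skb,
					  int offset, struct iov_iter *to,
					  int len, u32 *crc)
{
	/* fault_short = true: report any short copy as -EFAULT, as the
	 * checksum and hash variants above do. */
	return __skb_datagram_iter(skb, offset, to, len, true,
				   crc32_copy_to_iter, crc);
}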
*/ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (21 - PAGE_SHIFT); + if (nr_pages >= (128 * 1024)) + goal = nr_pages >> (21 - PAGE_SHIFT); else - goal = totalram_pages >> (23 - PAGE_SHIFT); + goal = nr_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 1c002c0fb712..950613ee7881 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1866,7 +1866,7 @@ void __init dn_route_init(void) dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); - goal = totalram_pages >> (26 - PAGE_SHIFT); + goal = totalram_pages() >> (26 - PAGE_SHIFT); for(order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03b51cdcc731..b467a7cabf40 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1000,7 +1000,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) slots = tcpmhash_entries; if (!slots) { - if (totalram_pages >= 128 * 1024) + if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; else slots = 8 * 1024; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e87c21e47efe..741b533148ba 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2248,6 +2248,7 @@ static __always_inline unsigned int total_extension_size(void) int nf_conntrack_init_start(void) { + unsigned long nr_pages = totalram_pages(); int max_factor = 8; int ret = -ENOMEM; int i; @@ -2267,11 +2268,11 @@ int nf_conntrack_init_start(void) * >= 4GB machines have 65536 buckets. */ nf_conntrack_htable_size - = (((totalram_pages << PAGE_SHIFT) / 16384) + = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (totalram_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) nf_conntrack_htable_size = 65536; - else if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) + else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 16384; if (nf_conntrack_htable_size < 32) nf_conntrack_htable_size = 32; diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 28e27a32d9b9..8d86e39d6280 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -274,14 +274,15 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, struct xt_hashlimit_htable *hinfo; const struct seq_operations *ops; unsigned int size, i; + unsigned long nr_pages = totalram_pages(); int ret; if (cfg->size) { size = cfg->size; } else { - size = (totalram_pages << PAGE_SHIFT) / 16384 / + size = (nr_pages << PAGE_SHIFT) / 16384 / sizeof(struct hlist_head); - if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) + if (nr_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; diff --git a/net/rds/ib.c b/net/rds/ib.c index eba75c1ba359..9d7b7586f240 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -148,8 +148,8 @@ static void rds_ib_add_one(struct ib_device *device) has_fr = (device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS); - has_fmr = (device->alloc_fmr && device->dealloc_fmr && - device->map_phys_fmr && device->unmap_fmr); + has_fmr = (device->ops.alloc_fmr && device->ops.dealloc_fmr && + device->ops.map_phys_fmr && device->ops.unmap_fmr); rds_ibdev->use_fastreg = (has_fr && !has_fmr); rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; diff --git a/net/sctp/protocol.c 
b/net/sctp/protocol.c index 9b277bd36d1a..d5878ae55840 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1368,6 +1368,7 @@ static __init int sctp_init(void) int status = -EINVAL; unsigned long goal; unsigned long limit; + unsigned long nr_pages = totalram_pages(); int max_share; int order; int num_entries; @@ -1426,10 +1427,10 @@ static __init int sctp_init(void) * The methodology is similar to that of the tcp hash tables. * Though not identical. Start by getting a goal size */ - if (totalram_pages >= (128 * 1024)) - goal = totalram_pages >> (22 - PAGE_SHIFT); + if (nr_pages >= (128 * 1024)) + goal = nr_pages >> (22 - PAGE_SHIFT); else - goal = totalram_pages >> (24 - PAGE_SHIFT); + goal = nr_pages >> (24 - PAGE_SHIFT); /* Then compute the page order for said goal */ order = get_order(goal); diff --git a/net/socket.c b/net/socket.c index 334fcc617ef2..e89884e2197b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2341,8 +2341,9 @@ SYSCALL_DEFINE3(recvmsg, int, fd, struct user_msghdr __user *, msg, * Linux recvmmsg interface */ -int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, - unsigned int flags, struct timespec64 *timeout) +static int do_recvmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct timespec64 *timeout) { int fput_needed, err, datagrams; struct socket *sock; @@ -2451,25 +2452,32 @@ out_put: return datagrams; } -static int do_sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, - unsigned int vlen, unsigned int flags, - struct __kernel_timespec __user *timeout) +int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, + unsigned int vlen, unsigned int flags, + struct __kernel_timespec __user *timeout, + struct old_timespec32 __user *timeout32) { int datagrams; struct timespec64 timeout_sys; - if (flags & MSG_CMSG_COMPAT) - return -EINVAL; - - if (!timeout) - return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL); + if (timeout && get_timespec64(&timeout_sys, timeout)) + return -EFAULT; - if (get_timespec64(&timeout_sys, timeout)) + if (timeout32 && get_old_timespec32(&timeout_sys, timeout32)) return -EFAULT; - datagrams = __sys_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); + if (!timeout && !timeout32) + return do_recvmmsg(fd, mmsg, vlen, flags, NULL); + + datagrams = do_recvmmsg(fd, mmsg, vlen, flags, &timeout_sys); - if (datagrams > 0 && put_timespec64(&timeout_sys, timeout)) + if (datagrams <= 0) + return datagrams; + + if (timeout && put_timespec64(&timeout_sys, timeout)) + datagrams = -EFAULT; + + if (timeout32 && put_old_timespec32(&timeout_sys, timeout32)) datagrams = -EFAULT; return datagrams; @@ -2479,8 +2487,23 @@ SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg, unsigned int, vlen, unsigned int, flags, struct __kernel_timespec __user *, timeout) { - return do_sys_recvmmsg(fd, mmsg, vlen, flags, timeout); + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + + return __sys_recvmmsg(fd, mmsg, vlen, flags, timeout, NULL); +} + +#ifdef CONFIG_COMPAT_32BIT_TIME +SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct mmsghdr __user *, mmsg, + unsigned int, vlen, unsigned int, flags, + struct old_timespec32 __user *, timeout) +{ + if (flags & MSG_CMSG_COMPAT) + return -EINVAL; + + return __sys_recvmmsg(fd, mmsg, vlen, flags, NULL, timeout); } +#endif #ifdef __ARCH_WANT_SYS_SOCKETCALL /* Argument list sizes for sys_socketcall */ @@ -2600,8 +2623,15 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) a[2], true); break; case SYS_RECVMMSG: - err = do_sys_recvmmsg(a0, (struct 
mmsghdr __user *)a1, a[2], - a[3], (struct __kernel_timespec __user *)a[4]); + if (IS_ENABLED(CONFIG_64BIT) || !IS_ENABLED(CONFIG_64BIT_TIME)) + err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, + a[2], a[3], + (struct __kernel_timespec __user *)a[4], + NULL); + else + err = __sys_recvmmsg(a0, (struct mmsghdr __user *)a1, + a[2], a[3], NULL, + (struct old_timespec32 __user *)a[4]); break; case SYS_ACCEPT4: err = __sys_accept4(a0, (struct sockaddr __user *)a1, diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 090658c3da12..9488600451e8 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ - auth.o auth_null.o auth_unix.o auth_generic.o \ + auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ sunrpc_syms.o cache.o rpc_pipe.o \ diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index ad8ead738981..1ff9768f5456 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -39,6 +39,20 @@ static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; +static struct cred machine_cred = { + .usage = ATOMIC_INIT(1), +}; + +/* + * Return the machine_cred pointer to be used whenever + * the a generic machine credential is needed. + */ +const struct cred *rpc_machine_cred(void) +{ + return &machine_cred; +} +EXPORT_SYMBOL_GPL(rpc_machine_cred); + #define MAX_HASHTABLE_BITS (14) static int param_set_hashtbl_sz(const char *val, const struct kernel_param *kp) { @@ -346,29 +360,6 @@ out_nocache: } EXPORT_SYMBOL_GPL(rpcauth_init_credcache); -/* - * Setup a credential key lifetime timeout notification - */ -int -rpcauth_key_timeout_notify(struct rpc_auth *auth, struct rpc_cred *cred) -{ - if (!cred->cr_auth->au_ops->key_timeout) - return 0; - return cred->cr_auth->au_ops->key_timeout(auth, cred); -} -EXPORT_SYMBOL_GPL(rpcauth_key_timeout_notify); - -bool -rpcauth_cred_key_to_expire(struct rpc_auth *auth, struct rpc_cred *cred) -{ - if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) - return false; - if (!cred->cr_ops->crkey_to_expire) - return false; - return cred->cr_ops->crkey_to_expire(cred); -} -EXPORT_SYMBOL_GPL(rpcauth_cred_key_to_expire); - char * rpcauth_stringify_acceptor(struct rpc_cred *cred) { @@ -587,13 +578,6 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) { if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; - if (flags & RPCAUTH_LOOKUP_RCU) { - if (test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags) || - refcount_read(&entry->cr_count) == 0) - continue; - cred = entry; - break; - } cred = get_rpccred(entry); if (cred) break; @@ -603,9 +587,6 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (cred != NULL) goto found; - if (flags & RPCAUTH_LOOKUP_RCU) - return ERR_PTR(-ECHILD); - new = auth->au_ops->crcreate(auth, acred, flags, gfp); if (IS_ERR(new)) { cred = new; @@ -656,9 +637,7 @@ rpcauth_lookupcred(struct rpc_auth *auth, int flags) auth->au_ops->au_name); memset(&acred, 0, sizeof(acred)); - acred.uid = cred->fsuid; - acred.gid = cred->fsgid; - acred.group_info = cred->group_info; + acred.cred = cred; ret = auth->au_ops->lookup_cred(auth, &acred, flags); return ret; } @@ -672,31 +651,41 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred 
*acred, INIT_LIST_HEAD(&cred->cr_lru); refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; + cred->cr_flags = 0; cred->cr_ops = ops; cred->cr_expire = jiffies; - cred->cr_uid = acred->uid; + cred->cr_cred = get_cred(acred->cred); } EXPORT_SYMBOL_GPL(rpcauth_init_cred); -struct rpc_cred * -rpcauth_generic_bind_cred(struct rpc_task *task, struct rpc_cred *cred, int lookupflags) +static struct rpc_cred * +rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) { - dprintk("RPC: %5u holding %s cred %p\n", task->tk_pid, - cred->cr_auth->au_ops->au_name, cred); - return get_rpccred(cred); + struct rpc_auth *auth = task->tk_client->cl_auth; + struct auth_cred acred = { + .cred = get_task_cred(&init_task), + }; + struct rpc_cred *ret; + + dprintk("RPC: %5u looking up %s cred\n", + task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); + ret = auth->au_ops->lookup_cred(auth, &acred, lookupflags); + put_cred(acred.cred); + return ret; } -EXPORT_SYMBOL_GPL(rpcauth_generic_bind_cred); static struct rpc_cred * -rpcauth_bind_root_cred(struct rpc_task *task, int lookupflags) +rpcauth_bind_machine_cred(struct rpc_task *task, int lookupflags) { struct rpc_auth *auth = task->tk_client->cl_auth; struct auth_cred acred = { - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, + .principal = task->tk_client->cl_principal, + .cred = init_task.cred, }; - dprintk("RPC: %5u looking up %s cred\n", + if (!acred.principal) + return NULL; + dprintk("RPC: %5u looking up %s machine cred\n", task->tk_pid, task->tk_client->cl_auth->au_ops->au_name); return auth->au_ops->lookup_cred(auth, &acred, lookupflags); } @@ -712,18 +701,33 @@ rpcauth_bind_new_cred(struct rpc_task *task, int lookupflags) } static int -rpcauth_bindcred(struct rpc_task *task, struct rpc_cred *cred, int flags) +rpcauth_bindcred(struct rpc_task *task, const struct cred *cred, int flags) { struct rpc_rqst *req = task->tk_rqstp; - struct rpc_cred *new; + struct rpc_cred *new = NULL; int lookupflags = 0; + struct rpc_auth *auth = task->tk_client->cl_auth; + struct auth_cred acred = { + .cred = cred, + }; if (flags & RPC_TASK_ASYNC) lookupflags |= RPCAUTH_LOOKUP_NEW; - if (cred != NULL) - new = cred->cr_ops->crbind(task, cred, lookupflags); - else if (flags & RPC_TASK_ROOTCREDS) + if (task->tk_op_cred) + /* Task must use exactly this rpc_cred */ + new = get_rpccred(task->tk_op_cred); + else if (cred != NULL && cred != &machine_cred) + new = auth->au_ops->lookup_cred(auth, &acred, lookupflags); + else if (cred == &machine_cred) + new = rpcauth_bind_machine_cred(task, lookupflags); + + /* If machine cred couldn't be bound, try a root cred */ + if (new) + ; + else if (cred == &machine_cred || (flags & RPC_TASK_ROOTCREDS)) new = rpcauth_bind_root_cred(task, lookupflags); + else if (flags & RPC_TASK_NULLCREDS) + new = authnull_ops.lookup_cred(NULL, NULL, 0); else new = rpcauth_bind_new_cred(task, lookupflags); if (IS_ERR(new)) @@ -901,15 +905,10 @@ int __init rpcauth_init_module(void) err = rpc_init_authunix(); if (err < 0) goto out1; - err = rpc_init_generic_auth(); - if (err < 0) - goto out2; err = register_shrinker(&rpc_cred_shrinker); if (err < 0) - goto out3; + goto out2; return 0; -out3: - rpc_destroy_generic_auth(); out2: rpc_destroy_authunix(); out1: @@ -919,6 +918,5 @@ out1: void rpcauth_remove_module(void) { rpc_destroy_authunix(); - rpc_destroy_generic_auth(); unregister_shrinker(&rpc_cred_shrinker); } diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c deleted file mode 100644 index ab4a3be1542a..000000000000 --- 
a/net/sunrpc/auth_generic.c +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Generic RPC credential - * - * Copyright (C) 2008, Trond Myklebust <Trond.Myklebust@netapp.com> - */ - -#include <linux/err.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/module.h> -#include <linux/sched.h> -#include <linux/sunrpc/auth.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/debug.h> -#include <linux/sunrpc/sched.h> - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - -#define RPC_MACHINE_CRED_USERID GLOBAL_ROOT_UID -#define RPC_MACHINE_CRED_GROUPID GLOBAL_ROOT_GID - -struct generic_cred { - struct rpc_cred gc_base; - struct auth_cred acred; -}; - -static struct rpc_auth generic_auth; -static const struct rpc_credops generic_credops; - -/* - * Public call interface - */ -struct rpc_cred *rpc_lookup_cred(void) -{ - return rpcauth_lookupcred(&generic_auth, 0); -} -EXPORT_SYMBOL_GPL(rpc_lookup_cred); - -struct rpc_cred * -rpc_lookup_generic_cred(struct auth_cred *acred, int flags, gfp_t gfp) -{ - return rpcauth_lookup_credcache(&generic_auth, acred, flags, gfp); -} -EXPORT_SYMBOL_GPL(rpc_lookup_generic_cred); - -struct rpc_cred *rpc_lookup_cred_nonblock(void) -{ - return rpcauth_lookupcred(&generic_auth, RPCAUTH_LOOKUP_RCU); -} -EXPORT_SYMBOL_GPL(rpc_lookup_cred_nonblock); - -/* - * Public call interface for looking up machine creds. - */ -struct rpc_cred *rpc_lookup_machine_cred(const char *service_name) -{ - struct auth_cred acred = { - .uid = RPC_MACHINE_CRED_USERID, - .gid = RPC_MACHINE_CRED_GROUPID, - .principal = service_name, - .machine_cred = 1, - }; - - dprintk("RPC: looking up machine cred for service %s\n", - service_name); - return generic_auth.au_ops->lookup_cred(&generic_auth, &acred, 0); -} -EXPORT_SYMBOL_GPL(rpc_lookup_machine_cred); - -static struct rpc_cred *generic_bind_cred(struct rpc_task *task, - struct rpc_cred *cred, int lookupflags) -{ - struct rpc_auth *auth = task->tk_client->cl_auth; - struct auth_cred *acred = &container_of(cred, struct generic_cred, gc_base)->acred; - - return auth->au_ops->lookup_cred(auth, acred, lookupflags); -} - -static int -generic_hash_cred(struct auth_cred *acred, unsigned int hashbits) -{ - return hash_64(from_kgid(&init_user_ns, acred->gid) | - ((u64)from_kuid(&init_user_ns, acred->uid) << - (sizeof(gid_t) * 8)), hashbits); -} - -/* - * Lookup generic creds for current process - */ -static struct rpc_cred * -generic_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) -{ - return rpcauth_lookup_credcache(&generic_auth, acred, flags, GFP_KERNEL); -} - -static struct rpc_cred * -generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) -{ - struct generic_cred *gcred; - - gcred = kmalloc(sizeof(*gcred), gfp); - if (gcred == NULL) - return ERR_PTR(-ENOMEM); - - rpcauth_init_cred(&gcred->gc_base, acred, &generic_auth, &generic_credops); - gcred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; - - gcred->acred.uid = acred->uid; - gcred->acred.gid = acred->gid; - gcred->acred.group_info = acred->group_info; - gcred->acred.ac_flags = 0; - if (gcred->acred.group_info != NULL) - get_group_info(gcred->acred.group_info); - gcred->acred.machine_cred = acred->machine_cred; - gcred->acred.principal = acred->principal; - - dprintk("RPC: allocated %s cred %p for uid %d gid %d\n", - gcred->acred.machine_cred ? 
"machine" : "generic", - gcred, - from_kuid(&init_user_ns, acred->uid), - from_kgid(&init_user_ns, acred->gid)); - return &gcred->gc_base; -} - -static void -generic_free_cred(struct rpc_cred *cred) -{ - struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); - - dprintk("RPC: generic_free_cred %p\n", gcred); - if (gcred->acred.group_info != NULL) - put_group_info(gcred->acred.group_info); - kfree(gcred); -} - -static void -generic_free_cred_callback(struct rcu_head *head) -{ - struct rpc_cred *cred = container_of(head, struct rpc_cred, cr_rcu); - generic_free_cred(cred); -} - -static void -generic_destroy_cred(struct rpc_cred *cred) -{ - call_rcu(&cred->cr_rcu, generic_free_cred_callback); -} - -static int -machine_cred_match(struct auth_cred *acred, struct generic_cred *gcred, int flags) -{ - if (!gcred->acred.machine_cred || - gcred->acred.principal != acred->principal || - !uid_eq(gcred->acred.uid, acred->uid) || - !gid_eq(gcred->acred.gid, acred->gid)) - return 0; - return 1; -} - -/* - * Match credentials against current process creds. - */ -static int -generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) -{ - struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); - int i; - - if (acred->machine_cred) - return machine_cred_match(acred, gcred, flags); - - if (!uid_eq(gcred->acred.uid, acred->uid) || - !gid_eq(gcred->acred.gid, acred->gid) || - gcred->acred.machine_cred != 0) - goto out_nomatch; - - /* Optimisation in the case where pointers are identical... */ - if (gcred->acred.group_info == acred->group_info) - goto out_match; - - /* Slow path... */ - if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) - goto out_nomatch; - for (i = 0; i < gcred->acred.group_info->ngroups; i++) { - if (!gid_eq(gcred->acred.group_info->gid[i], - acred->group_info->gid[i])) - goto out_nomatch; - } -out_match: - return 1; -out_nomatch: - return 0; -} - -int __init rpc_init_generic_auth(void) -{ - return rpcauth_init_credcache(&generic_auth); -} - -void rpc_destroy_generic_auth(void) -{ - rpcauth_destroy_credcache(&generic_auth); -} - -/* - * Test the the current time (now) against the underlying credential key expiry - * minus a timeout and setup notification. - * - * The normal case: - * If 'now' is before the key expiry minus RPC_KEY_EXPIRE_TIMEO, set - * the RPC_CRED_NOTIFY_TIMEOUT flag to setup the underlying credential - * rpc_credops crmatch routine to notify this generic cred when it's key - * expiration is within RPC_KEY_EXPIRE_TIMEO, and return 0. - * - * The error case: - * If the underlying cred lookup fails, return -EACCES. - * - * The 'almost' error case: - * If 'now' is within key expiry minus RPC_KEY_EXPIRE_TIMEO, but not within - * key expiry minus RPC_KEY_EXPIRE_FAIL, set the RPC_CRED_EXPIRE_SOON bit - * on the acred ac_flags and return 0. 
- */ -static int -generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred) -{ - struct auth_cred *acred = &container_of(cred, struct generic_cred, - gc_base)->acred; - struct rpc_cred *tcred; - int ret = 0; - - - /* Fast track for non crkey_timeout (no key) underlying credentials */ - if (auth->au_flags & RPCAUTH_AUTH_NO_CRKEY_TIMEOUT) - return 0; - - /* Fast track for the normal case */ - if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags)) - return 0; - - /* lookup_cred either returns a valid referenced rpc_cred, or PTR_ERR */ - tcred = auth->au_ops->lookup_cred(auth, acred, 0); - if (IS_ERR(tcred)) - return -EACCES; - - /* Test for the almost error case */ - ret = tcred->cr_ops->crkey_timeout(tcred); - if (ret != 0) { - set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); - ret = 0; - } else { - /* In case underlying cred key has been reset */ - if (test_and_clear_bit(RPC_CRED_KEY_EXPIRE_SOON, - &acred->ac_flags)) - dprintk("RPC: UID %d Credential key reset\n", - from_kuid(&init_user_ns, tcred->cr_uid)); - /* set up fasttrack for the normal case */ - set_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); - } - - put_rpccred(tcred); - return ret; -} - -static const struct rpc_authops generic_auth_ops = { - .owner = THIS_MODULE, - .au_name = "Generic", - .hash_cred = generic_hash_cred, - .lookup_cred = generic_lookup_cred, - .crcreate = generic_create_cred, - .key_timeout = generic_key_timeout, -}; - -static struct rpc_auth generic_auth = { - .au_ops = &generic_auth_ops, - .au_count = REFCOUNT_INIT(1), -}; - -static bool generic_key_to_expire(struct rpc_cred *cred) -{ - struct auth_cred *acred = &container_of(cred, struct generic_cred, - gc_base)->acred; - return test_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); -} - -static const struct rpc_credops generic_credops = { - .cr_name = "Generic cred", - .crdestroy = generic_destroy_cred, - .crbind = generic_bind_cred, - .crmatch = generic_match, - .crkey_to_expire = generic_key_to_expire, -}; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index ba765473d1f0..dc86713b32b6 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -565,7 +565,7 @@ gss_setup_upcall(struct gss_auth *gss_auth, struct rpc_cred *cred) struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_upcall_msg *gss_new, *gss_msg; - kuid_t uid = cred->cr_uid; + kuid_t uid = cred->cr_cred->fsuid; gss_new = gss_alloc_msg(gss_auth, uid, gss_cred->gc_principal); if (IS_ERR(gss_new)) @@ -604,7 +604,7 @@ gss_refresh_upcall(struct rpc_task *task) int err = 0; dprintk("RPC: %5u %s for uid %u\n", - task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_uid)); + task->tk_pid, __func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid)); gss_msg = gss_setup_upcall(gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we @@ -637,7 +637,7 @@ gss_refresh_upcall(struct rpc_task *task) out: dprintk("RPC: %5u %s for uid %u result %d\n", task->tk_pid, __func__, - from_kuid(&init_user_ns, cred->cr_uid), err); + from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } @@ -653,7 +653,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) int err; dprintk("RPC: %s for uid %u\n", - __func__, from_kuid(&init_user_ns, cred->cr_uid)); + __func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid)); retry: err = 0; /* if gssd is down, just skip upcalling altogether */ @@ -701,7 +701,7 @@ out_intr: gss_release_msg(gss_msg); out: 
dprintk("RPC: %s for uid %u result %d\n", - __func__, from_kuid(&init_user_ns, cred->cr_uid), err); + __func__, from_kuid(&init_user_ns, cred->cr_cred->fsuid), err); return err; } @@ -1248,7 +1248,7 @@ gss_dup_cred(struct gss_auth *gss_auth, struct gss_cred *gss_cred) new = kzalloc(sizeof(*gss_cred), GFP_NOIO); if (new) { struct auth_cred acred = { - .uid = gss_cred->gc_base.cr_uid, + .cred = gss_cred->gc_base.cr_cred, }; struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); @@ -1343,6 +1343,7 @@ gss_destroy_nullcred(struct rpc_cred *cred) struct gss_cl_ctx *ctx = rcu_dereference_protected(gss_cred->gc_ctx, 1); RCU_INIT_POINTER(gss_cred->gc_ctx, NULL); + put_cred(cred->cr_cred); call_rcu(&cred->cr_rcu, gss_free_cred_callback); if (ctx) gss_put_ctx(ctx); @@ -1361,7 +1362,7 @@ gss_destroy_cred(struct rpc_cred *cred) static int gss_hash_cred(struct auth_cred *acred, unsigned int hashbits) { - return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits); + return hash_64(from_kuid(&init_user_ns, acred->cred->fsuid), hashbits); } /* @@ -1381,7 +1382,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t int err = -ENOMEM; dprintk("RPC: %s for uid %d, flavor %d\n", - __func__, from_kuid(&init_user_ns, acred->uid), + __func__, from_kuid(&init_user_ns, acred->cred->fsuid), auth->au_flavor); if (!(cred = kzalloc(sizeof(*cred), gfp))) @@ -1394,9 +1395,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t */ cred->gc_base.cr_flags = 1UL << RPCAUTH_CRED_NEW; cred->gc_service = gss_auth->service; - cred->gc_principal = NULL; - if (acred->machine_cred) - cred->gc_principal = acred->principal; + cred->gc_principal = acred->principal; kref_get(&gss_auth->kref); return &cred->gc_base; @@ -1518,23 +1517,10 @@ out: if (gss_cred->gc_principal == NULL) return 0; ret = strcmp(acred->principal, gss_cred->gc_principal) == 0; - goto check_expire; - } - if (gss_cred->gc_principal != NULL) - return 0; - ret = uid_eq(rc->cr_uid, acred->uid); - -check_expire: - if (ret == 0) - return ret; - - /* Notify acred users of GSS context expiration timeout */ - if (test_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags) && - (gss_key_timeout(rc) != 0)) { - /* test will now be done from generic cred */ - test_and_clear_bit(RPC_CRED_NOTIFY_TIMEOUT, &acred->ac_flags); - /* tell NFS layer that key will expire soon */ - set_bit(RPC_CRED_KEY_EXPIRE_SOON, &acred->ac_flags); + } else { + if (gss_cred->gc_principal != NULL) + return 0; + ret = uid_eq(rc->cr_cred->fsuid, acred->cred->fsuid); } return ret; } @@ -1607,9 +1593,8 @@ static int gss_renew_cred(struct rpc_task *task) gc_base); struct rpc_auth *auth = oldcred->cr_auth; struct auth_cred acred = { - .uid = oldcred->cr_uid, + .cred = oldcred->cr_cred, .principal = gss_cred->gc_principal, - .machine_cred = (gss_cred->gc_principal != NULL ? 
1 : 0), }; struct rpc_cred *new; @@ -2110,7 +2095,6 @@ static const struct rpc_credops gss_credops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_cred, .cr_init = gss_cred_init, - .crbind = rpcauth_generic_bind_cred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh, @@ -2125,7 +2109,6 @@ static const struct rpc_credops gss_credops = { static const struct rpc_credops gss_nullops = { .cr_name = "AUTH_GSS", .crdestroy = gss_destroy_nullcred, - .crbind = rpcauth_generic_bind_cred, .crmatch = gss_match, .crmarshal = gss_marshal, .crrefresh = gss_refresh_null, diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 16ac0f4cb7d8..379318dff534 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -244,7 +244,7 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) /** * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors - * @array: array to fill in + * @array_ptr: array to fill in * @size: size of "array" * * Returns the number of array items filled in, or a negative errno. diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 1ece4bc3eb8d..152790ed309c 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1142,7 +1142,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp, struct kvec *resv = &rqstp->rq_res.head[0]; struct rsi *rsip, rsikey; int ret; - struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); + struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); memset(&rsikey, 0, sizeof(rsikey)); ret = gss_read_verf(gc, argv, authp, @@ -1253,7 +1253,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, uint64_t handle; int status; int ret; - struct net *net = rqstp->rq_xprt->xpt_net; + struct net *net = SVC_NET(rqstp); struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); memset(&ud, 0, sizeof(ud)); @@ -1444,7 +1444,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) __be32 *rpcstart; __be32 *reject_stat = resv->iov_base + resv->iov_len; int ret; - struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); + struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n", argv->iov_len); @@ -1734,7 +1734,7 @@ svcauth_gss_release(struct svc_rqst *rqstp) struct rpc_gss_wire_cred *gc = &gsd->clcred; struct xdr_buf *resbuf = &rqstp->rq_res; int stat = -EINVAL; - struct sunrpc_net *sn = net_generic(rqstp->rq_xprt->xpt_net, sunrpc_net_id); + struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); if (gc->gc_proc != RPC_GSS_PROC_DATA) goto out; diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 2694a1bc026b..d0ceac57c06e 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -36,8 +36,6 @@ nul_destroy(struct rpc_auth *auth) static struct rpc_cred * nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - if (flags & RPCAUTH_LOOKUP_RCU) - return &null_cred; return get_rpccred(&null_cred); } @@ -116,7 +114,6 @@ static struct rpc_auth null_auth = { .au_cslack = NUL_CALLSLACK, .au_rslack = NUL_REPLYSLACK, - .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, .au_count = REFCOUNT_INIT(1), @@ -126,7 +123,6 @@ static const struct rpc_credops null_credops = { .cr_name = "AUTH_NULL", .crdestroy = nul_destroy_cred, - .crbind = rpcauth_generic_bind_cred, .crmatch = nul_match, 
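With auth_generic.c removed (hunks above), RPC callers no longer look up and pin a generic rpc_cred themselves; they put a plain struct cred, or the static marker returned by rpc_machine_cred(), into the rpc_message and let rpcauth_bindcred() resolve it per task. A hedged sketch of a caller after this series (example_call_as_machine() and its procedure argument are hypothetical, not taken from the patch):

#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/clnt.h>

/* Issue a synchronous call under the machine credential.  msg.rpc_cred is
 * now a 'const struct cred *'; the pointer from rpc_machine_cred() makes
 * rpcauth_bindcred() bind a principal-based machine credential and fall
 * back to a root credential if no principal is configured. */
static int example_call_as_machine(struct rpc_clnt *clnt,
				   const struct rpc_procinfo *proc,
				   void *argp, void *resp)
{
	struct rpc_message msg = {
		.rpc_proc = proc,
		.rpc_argp = argp,
		.rpc_resp = resp,
		.rpc_cred = rpc_machine_cred(),
	};

	return rpc_call_sync(clnt, &msg, 0);
}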
.crmarshal = nul_marshal, .crrefresh = nul_refresh, diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4c1c7e56288f..387f6b3ffbea 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -11,16 +11,11 @@ #include <linux/types.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/mempool.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/user_namespace.h> -struct unx_cred { - struct rpc_cred uc_base; - kgid_t uc_gid; - kgid_t uc_gids[UNX_NGROUPS]; -}; -#define uc_uid uc_base.cr_uid #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH @@ -28,6 +23,7 @@ struct unx_cred { static struct rpc_auth unix_auth; static const struct rpc_credops unix_credops; +static mempool_t *unix_pool; static struct rpc_auth * unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) @@ -42,15 +38,6 @@ static void unx_destroy(struct rpc_auth *auth) { dprintk("RPC: destroying UNIX authenticator %p\n", auth); - rpcauth_clear_credcache(auth->au_credcache); -} - -static int -unx_hash_cred(struct auth_cred *acred, unsigned int hashbits) -{ - return hash_64(from_kgid(&init_user_ns, acred->gid) | - ((u64)from_kuid(&init_user_ns, acred->uid) << - (sizeof(gid_t) * 8)), hashbits); } /* @@ -59,52 +46,24 @@ unx_hash_cred(struct auth_cred *acred, unsigned int hashbits) static struct rpc_cred * unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) { - return rpcauth_lookup_credcache(auth, acred, flags, GFP_NOFS); -} - -static struct rpc_cred * -unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t gfp) -{ - struct unx_cred *cred; - unsigned int groups = 0; - unsigned int i; + struct rpc_cred *ret = mempool_alloc(unix_pool, GFP_NOFS); dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", - from_kuid(&init_user_ns, acred->uid), - from_kgid(&init_user_ns, acred->gid)); - - if (!(cred = kmalloc(sizeof(*cred), gfp))) - return ERR_PTR(-ENOMEM); + from_kuid(&init_user_ns, acred->cred->fsuid), + from_kgid(&init_user_ns, acred->cred->fsgid)); - rpcauth_init_cred(&cred->uc_base, acred, auth, &unix_credops); - cred->uc_base.cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; - - if (acred->group_info != NULL) - groups = acred->group_info->ngroups; - if (groups > UNX_NGROUPS) - groups = UNX_NGROUPS; - - cred->uc_gid = acred->gid; - for (i = 0; i < groups; i++) - cred->uc_gids[i] = acred->group_info->gid[i]; - if (i < UNX_NGROUPS) - cred->uc_gids[i] = INVALID_GID; - - return &cred->uc_base; -} - -static void -unx_free_cred(struct unx_cred *unx_cred) -{ - dprintk("RPC: unx_free_cred %p\n", unx_cred); - kfree(unx_cred); + rpcauth_init_cred(ret, acred, auth, &unix_credops); + ret->cr_flags = 1UL << RPCAUTH_CRED_UPTODATE; + return ret; } static void unx_free_cred_callback(struct rcu_head *head) { - struct unx_cred *unx_cred = container_of(head, struct unx_cred, uc_base.cr_rcu); - unx_free_cred(unx_cred); + struct rpc_cred *rpc_cred = container_of(head, struct rpc_cred, cr_rcu); + dprintk("RPC: unx_free_cred %p\n", rpc_cred); + put_cred(rpc_cred->cr_cred); + mempool_free(rpc_cred, unix_pool); } static void @@ -114,30 +73,32 @@ unx_destroy_cred(struct rpc_cred *cred) } /* - * Match credentials against current process creds. - * The root_override argument takes care of cases where the caller may - * request root creds (e.g. for NFS swapping). + * Match credentials against current the auth_cred. 
*/ static int -unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) +unx_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) { - struct unx_cred *cred = container_of(rcred, struct unx_cred, uc_base); unsigned int groups = 0; unsigned int i; + if (cred->cr_cred == acred->cred) + return 1; - if (!uid_eq(cred->uc_uid, acred->uid) || !gid_eq(cred->uc_gid, acred->gid)) + if (!uid_eq(cred->cr_cred->fsuid, acred->cred->fsuid) || !gid_eq(cred->cr_cred->fsgid, acred->cred->fsgid)) return 0; - if (acred->group_info != NULL) - groups = acred->group_info->ngroups; + if (acred->cred && acred->cred->group_info != NULL) + groups = acred->cred->group_info->ngroups; if (groups > UNX_NGROUPS) groups = UNX_NGROUPS; + if (cred->cr_cred->group_info == NULL) + return groups == 0; + if (groups != cred->cr_cred->group_info->ngroups) + return 0; + for (i = 0; i < groups ; i++) - if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) + if (!gid_eq(cred->cr_cred->group_info->gid[i], acred->cred->group_info->gid[i])) return 0; - if (groups < UNX_NGROUPS && gid_valid(cred->uc_gids[groups])) - return 0; return 1; } @@ -149,9 +110,10 @@ static __be32 * unx_marshal(struct rpc_task *task, __be32 *p) { struct rpc_clnt *clnt = task->tk_client; - struct unx_cred *cred = container_of(task->tk_rqstp->rq_cred, struct unx_cred, uc_base); + struct rpc_cred *cred = task->tk_rqstp->rq_cred; __be32 *base, *hold; int i; + struct group_info *gi = cred->cr_cred->group_info; *p++ = htonl(RPC_AUTH_UNIX); base = p++; @@ -162,11 +124,12 @@ unx_marshal(struct rpc_task *task, __be32 *p) */ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); - *p++ = htonl((u32) from_kuid(&init_user_ns, cred->uc_uid)); - *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gid)); + *p++ = htonl((u32) from_kuid(&init_user_ns, cred->cr_cred->fsuid)); + *p++ = htonl((u32) from_kgid(&init_user_ns, cred->cr_cred->fsgid)); hold = p++; - for (i = 0; i < UNX_NGROUPS && gid_valid(cred->uc_gids[i]); i++) - *p++ = htonl((u32) from_kgid(&init_user_ns, cred->uc_gids[i])); + if (gi) + for (i = 0; i < UNX_NGROUPS && i < gi->ngroups; i++) + *p++ = htonl((u32) from_kgid(&init_user_ns, gi->gid[i])); *hold = htonl(p - hold - 1); /* gid array length */ *base = htonl((p - base - 1) << 2); /* cred length */ @@ -213,12 +176,13 @@ unx_validate(struct rpc_task *task, __be32 *p) int __init rpc_init_authunix(void) { - return rpcauth_init_credcache(&unix_auth); + unix_pool = mempool_create_kmalloc_pool(16, sizeof(struct rpc_cred)); + return unix_pool ? 
0 : -ENOMEM; } void rpc_destroy_authunix(void) { - rpcauth_destroy_credcache(&unix_auth); + mempool_destroy(unix_pool); } const struct rpc_authops authunix_ops = { @@ -227,16 +191,13 @@ const struct rpc_authops authunix_ops = { .au_name = "UNIX", .create = unx_create, .destroy = unx_destroy, - .hash_cred = unx_hash_cred, .lookup_cred = unx_lookup_cred, - .crcreate = unx_create_cred, }; static struct rpc_auth unix_auth = { .au_cslack = UNX_CALLSLACK, .au_rslack = NUL_REPLYSLACK, - .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = REFCOUNT_INIT(1), @@ -246,7 +207,6 @@ static const struct rpc_credops unix_credops = { .cr_name = "AUTH_UNIX", .crdestroy = unx_destroy_cred, - .crbind = rpcauth_generic_bind_cred, .crmatch = unx_match, .crmarshal = unx_marshal, .crrefresh = unx_refresh, diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index fa5ba6ed3197..ec451b8114b0 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -197,7 +197,7 @@ out_free: /** * xprt_destroy_backchannel - Destroys the backchannel preallocated structures. * @xprt: the transport holding the preallocated strucures - * @max_reqs the maximum number of preallocated structures to destroy + * @max_reqs: the maximum number of preallocated structures to destroy * * Since these structures may have been allocated by multiple calls * to xprt_setup_backchannel, we only destroy up to the maximum number diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index f96345b1180e..12bb23b8e0c5 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -54,6 +54,11 @@ static void cache_init(struct cache_head *h, struct cache_detail *detail) h->last_refresh = now; } +static void cache_fresh_locked(struct cache_head *head, time_t expiry, + struct cache_detail *detail); +static void cache_fresh_unlocked(struct cache_head *head, + struct cache_detail *detail); + static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, struct cache_head *key, int hash) @@ -100,6 +105,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, if (cache_is_expired(detail, tmp)) { hlist_del_init_rcu(&tmp->cache_list); detail->entries --; + cache_fresh_locked(tmp, 0, detail); freeme = tmp; break; } @@ -115,8 +121,10 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, cache_get(new); spin_unlock(&detail->hash_lock); - if (freeme) + if (freeme) { + cache_fresh_unlocked(freeme, detail); cache_put(freeme, detail); + } return new; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 24cbddc44c88..71d9599b5816 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -627,6 +627,7 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, new->cl_noretranstimeo = clnt->cl_noretranstimeo; new->cl_discrtry = clnt->cl_discrtry; new->cl_chatty = clnt->cl_chatty; + new->cl_principal = clnt->cl_principal; return new; out_err: @@ -1029,7 +1030,7 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; if (msg->rpc_cred != NULL) - task->tk_msg.rpc_cred = get_rpccred(msg->rpc_cred); + task->tk_msg.rpc_cred = get_cred(msg->rpc_cred); } } @@ -2521,9 +2522,8 @@ static int rpc_ping(struct rpc_clnt *clnt) .rpc_proc = &rpcproc_null, }; int err; - msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0); - err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN); - 
put_rpccred(msg.rpc_cred); + err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN | + RPC_TASK_NULLCREDS); return err; } @@ -2534,15 +2534,15 @@ struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt, { struct rpc_message msg = { .rpc_proc = &rpcproc_null, - .rpc_cred = cred, }; struct rpc_task_setup task_setup_data = { .rpc_client = clnt, .rpc_xprt = xprt, .rpc_message = &msg, + .rpc_op_cred = cred, .callback_ops = (ops != NULL) ? ops : &rpc_default_ops, .callback_data = data, - .flags = flags, + .flags = flags | RPC_TASK_NULLCREDS, }; return rpc_run_task(&task_setup_data); @@ -2593,7 +2593,6 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, void *dummy) { struct rpc_cb_add_xprt_calldata *data; - struct rpc_cred *cred; struct rpc_task *task; data = kmalloc(sizeof(*data), GFP_NOFS); @@ -2602,11 +2601,9 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, data->xps = xprt_switch_get(xps); data->xprt = xprt_get(xprt); - cred = authnull_ops.lookup_cred(NULL, NULL, 0); - task = rpc_call_null_helper(clnt, xprt, cred, - RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC, + task = rpc_call_null_helper(clnt, xprt, NULL, + RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC|RPC_TASK_NULLCREDS, &rpc_cb_add_xprt_call_ops, data); - put_rpccred(cred); if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); @@ -2637,7 +2634,6 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { - struct rpc_cred *cred; struct rpc_task *task; struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data; int status = -EADDRINUSE; @@ -2649,11 +2645,9 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, goto out_err; /* Test the connection */ - cred = authnull_ops.lookup_cred(NULL, NULL, 0); - task = rpc_call_null_helper(clnt, xprt, cred, - RPC_TASK_SOFT | RPC_TASK_SOFTCONN, + task = rpc_call_null_helper(clnt, xprt, NULL, + RPC_TASK_SOFT | RPC_TASK_SOFTCONN | RPC_TASK_NULLCREDS, NULL, NULL); - put_rpccred(cred); if (IS_ERR(task)) { status = PTR_ERR(task); goto out_err; @@ -2667,6 +2661,9 @@ int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt, /* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */ xtest->add_xprt_test(clnt, xprt, xtest->data); + xprt_put(xprt); + xprt_switch_put(xps); + /* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */ return 1; out_err: diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 4fda18d47e2c..69663681bf9d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1266,7 +1266,7 @@ static const struct rpc_pipe_ops gssd_dummy_pipe_ops = { * that this file will be there and have a certain format. 
*/ static int -rpc_show_dummy_info(struct seq_file *m, void *v) +rpc_dummy_info_show(struct seq_file *m, void *v) { seq_printf(m, "RPC server: %s\n", utsname()->nodename); seq_printf(m, "service: foo (1) version 0\n"); @@ -1275,25 +1275,12 @@ rpc_show_dummy_info(struct seq_file *m, void *v) seq_printf(m, "port: 0\n"); return 0; } - -static int -rpc_dummy_info_open(struct inode *inode, struct file *file) -{ - return single_open(file, rpc_show_dummy_info, NULL); -} - -static const struct file_operations rpc_dummy_info_operations = { - .owner = THIS_MODULE, - .open = rpc_dummy_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info); static const struct rpc_filelist gssd_dummy_info_file[] = { [0] = { .name = "info", - .i_fop = &rpc_dummy_info_operations, + .i_fop = &rpc_dummy_info_fops, .mode = S_IFREG | 0400, }, }; diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index c7872bc13860..41a971ac1c63 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -752,7 +752,7 @@ void rpcb_getport_async(struct rpc_task *task) goto bailout_nofree; } - map = kzalloc(sizeof(struct rpcbind_args), GFP_ATOMIC); + map = kzalloc(sizeof(struct rpcbind_args), GFP_NOFS); if (!map) { status = -ENOMEM; dprintk("RPC: %5u %s: no memory available\n", @@ -770,7 +770,13 @@ void rpcb_getport_async(struct rpc_task *task) case RPCBVERS_4: case RPCBVERS_3: map->r_netid = xprt->address_strings[RPC_DISPLAY_NETID]; - map->r_addr = rpc_sockaddr2uaddr(sap, GFP_ATOMIC); + map->r_addr = rpc_sockaddr2uaddr(sap, GFP_NOFS); + if (!map->r_addr) { + status = -ENOMEM; + dprintk("RPC: %5u %s: no memory available\n", + task->tk_pid, __func__); + goto bailout_free_args; + } map->r_owner = ""; break; case RPCBVERS_2: @@ -793,6 +799,8 @@ void rpcb_getport_async(struct rpc_task *task) rpc_put_task(child); return; +bailout_free_args: + kfree(map); bailout_release_client: rpc_release_client(rpcb_clnt); bailout_nofree: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 57ca5bead1cb..adc3c40cc733 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -997,6 +997,8 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta task->tk_xprt = xprt_get(task_setup_data->rpc_xprt); + task->tk_op_cred = get_rpccred(task_setup_data->rpc_op_cred); + if (task->tk_ops->rpc_call_prepare != NULL) task->tk_action = rpc_prepare_task; @@ -1054,6 +1056,7 @@ static void rpc_free_task(struct rpc_task *task) { unsigned short tk_flags = task->tk_flags; + put_rpccred(task->tk_op_cred); rpc_release_calldata(task->tk_ops, task->tk_calldata); if (tk_flags & RPC_TASK_DYNAMIC) { @@ -1071,7 +1074,7 @@ static void rpc_release_resources_task(struct rpc_task *task) { xprt_release(task); if (task->tk_msg.rpc_cred) { - put_rpccred(task->tk_msg.rpc_cred); + put_cred(task->tk_msg.rpc_cred); task->tk_msg.rpc_cred = NULL; } rpc_task_release_client(task); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index d13e05f1a990..e87ddb9f7feb 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1145,6 +1145,17 @@ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, .. #endif /* + * Setup response header for TCP, it has a 4B record length field. + */ +static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) +{ + struct kvec *resv = &rqstp->rq_res.head[0]; + + /* tcp needs a space for the record length... */ + svc_putnl(resv, 0); +} + +/* * Common routine for processing the RPC request. 
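For reference, the DEFINE_SHOW_ATTRIBUTE(rpc_dummy_info) invocation in the rpc_pipe.c hunk above comes from <linux/seq_file.h> and generates roughly the boilerplate being deleted, which is why the handler is renamed with a _show suffix and the file list now points at rpc_dummy_info_fops. Approximate expansion (sketch, not patch content):

static int rpc_dummy_info_open(struct inode *inode, struct file *file)
{
	return single_open(file, rpc_dummy_info_show, inode->i_private);
}

static const struct file_operations rpc_dummy_info_fops = {
	.owner		= THIS_MODULE,
	.open		= rpc_dummy_info_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};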
*/ static int @@ -1172,7 +1183,8 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) clear_bit(RQ_DROPME, &rqstp->rq_flags); /* Setup reply header */ - rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); + if (rqstp->rq_prot == IPPROTO_TCP) + svc_tcp_prep_reply_hdr(rqstp); svc_putu32(resv, rqstp->rq_xid); @@ -1244,7 +1256,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) * for lower versions. RPC_PROG_MISMATCH seems to be the closest * fit. */ - if (versp->vs_need_cong_ctrl && + if (versp->vs_need_cong_ctrl && rqstp->rq_xprt && !test_bit(XPT_CONG_CTRL, &rqstp->rq_xprt->xpt_flags)) goto err_bad_vers; @@ -1336,7 +1348,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv) return 0; close: - if (test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) + if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags)) svc_close_xprt(rqstp->rq_xprt); dprintk("svc: svc_process close\n"); return 0; @@ -1459,10 +1471,10 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, dprintk("svc: %s(%p)\n", __func__, req); /* Build the svc_rqst used by the common processing routine */ - rqstp->rq_xprt = serv->sv_bc_xprt; rqstp->rq_xid = req->rq_xid; rqstp->rq_prot = req->rq_xprt->prot; rqstp->rq_server = serv; + rqstp->rq_bc_net = req->rq_xprt->xprt_net; rqstp->rq_addrlen = sizeof(req->rq_xprt->addr); memcpy(&rqstp->rq_addr, &req->rq_xprt->addr, rqstp->rq_addrlen); @@ -1499,9 +1511,9 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); - return 0; + error = -EINVAL; + goto out; } - /* Finally, send the reply synchronously */ memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req); diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 51d36230b6e3..4eb8fbf2508d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -296,9 +296,9 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, request_module("svc%s", xprt_name); err = _svc_create_xprt(serv, xprt_name, net, family, port, flags); } - if (err) + if (err < 0) dprintk("svc: transport %s not found, err %d\n", - xprt_name, err); + xprt_name, -err); return err; } EXPORT_SYMBOL_GPL(svc_create_xprt); @@ -468,10 +468,11 @@ out: */ void svc_reserve(struct svc_rqst *rqstp, int space) { + struct svc_xprt *xprt = rqstp->rq_xprt; + space += rqstp->rq_res.head[0].iov_len; - if (space < rqstp->rq_reserved) { - struct svc_xprt *xprt = rqstp->rq_xprt; + if (xprt && space < rqstp->rq_reserved) { atomic_sub((rqstp->rq_reserved - space), &xprt->xpt_reserved); rqstp->rq_reserved = space; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index b7e67310ec37..a6a060925e5d 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -70,13 +70,6 @@ static void svc_sock_free(struct svc_xprt *); static struct svc_xprt *svc_create_socket(struct svc_serv *, int, struct net *, struct sockaddr *, int, int); -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, - struct net *, struct sockaddr *, - int, int); -static void svc_bc_sock_free(struct svc_xprt *xprt); -#endif /* CONFIG_SUNRPC_BACKCHANNEL */ - #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key svc_key[2]; static struct lock_class_key svc_slock_key[2]; @@ -617,10 +610,6 @@ svc_udp_sendto(struct svc_rqst *rqstp) return error; } -static void svc_udp_prep_reply_hdr(struct svc_rqst 
*rqstp) -{ -} - static int svc_udp_has_wspace(struct svc_xprt *xprt) { struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); @@ -664,7 +653,6 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_release_rqst = svc_release_udp_skb, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, - .xpo_prep_reply_hdr = svc_udp_prep_reply_hdr, .xpo_has_wspace = svc_udp_has_wspace, .xpo_accept = svc_udp_accept, .xpo_secure_port = svc_sock_secure_port, @@ -1170,17 +1158,6 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) return sent; } -/* - * Setup response header. TCP has a 4B record length field. - */ -static void svc_tcp_prep_reply_hdr(struct svc_rqst *rqstp) -{ - struct kvec *resv = &rqstp->rq_res.head[0]; - - /* tcp needs a space for the record length... */ - svc_putnl(resv, 0); -} - static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -1189,58 +1166,6 @@ static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, return svc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); } -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -static struct svc_xprt *svc_bc_create_socket(struct svc_serv *, int, - struct net *, struct sockaddr *, - int, int); -static void svc_bc_sock_free(struct svc_xprt *xprt); - -static struct svc_xprt *svc_bc_tcp_create(struct svc_serv *serv, - struct net *net, - struct sockaddr *sa, int salen, - int flags) -{ - return svc_bc_create_socket(serv, IPPROTO_TCP, net, sa, salen, flags); -} - -static void svc_bc_tcp_sock_detach(struct svc_xprt *xprt) -{ -} - -static const struct svc_xprt_ops svc_tcp_bc_ops = { - .xpo_create = svc_bc_tcp_create, - .xpo_detach = svc_bc_tcp_sock_detach, - .xpo_free = svc_bc_sock_free, - .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, - .xpo_secure_port = svc_sock_secure_port, -}; - -static struct svc_xprt_class svc_tcp_bc_class = { - .xcl_name = "tcp-bc", - .xcl_owner = THIS_MODULE, - .xcl_ops = &svc_tcp_bc_ops, - .xcl_max_payload = RPCSVC_MAXPAYLOAD_TCP, -}; - -static void svc_init_bc_xprt_sock(void) -{ - svc_reg_xprt_class(&svc_tcp_bc_class); -} - -static void svc_cleanup_bc_xprt_sock(void) -{ - svc_unreg_xprt_class(&svc_tcp_bc_class); -} -#else /* CONFIG_SUNRPC_BACKCHANNEL */ -static void svc_init_bc_xprt_sock(void) -{ -} - -static void svc_cleanup_bc_xprt_sock(void) -{ -} -#endif /* CONFIG_SUNRPC_BACKCHANNEL */ - static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, @@ -1248,7 +1173,6 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_release_rqst = svc_release_skb, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, - .xpo_prep_reply_hdr = svc_tcp_prep_reply_hdr, .xpo_has_wspace = svc_tcp_has_wspace, .xpo_accept = svc_tcp_accept, .xpo_secure_port = svc_sock_secure_port, @@ -1267,14 +1191,12 @@ void svc_init_xprt_sock(void) { svc_reg_xprt_class(&svc_tcp_class); svc_reg_xprt_class(&svc_udp_class); - svc_init_bc_xprt_sock(); } void svc_cleanup_xprt_sock(void) { svc_unreg_xprt_class(&svc_tcp_class); svc_unreg_xprt_class(&svc_udp_class); - svc_cleanup_bc_xprt_sock(); } static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) @@ -1595,45 +1517,3 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(svsk->sk_sock); kfree(svsk); } - -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -/* - * Create a back channel svc_xprt which shares the fore channel socket. 
- */ -static struct svc_xprt *svc_bc_create_socket(struct svc_serv *serv, - int protocol, - struct net *net, - struct sockaddr *sin, int len, - int flags) -{ - struct svc_sock *svsk; - struct svc_xprt *xprt; - - if (protocol != IPPROTO_TCP) { - printk(KERN_WARNING "svc: only TCP sockets" - " supported on shared back channel\n"); - return ERR_PTR(-EINVAL); - } - - svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); - if (!svsk) - return ERR_PTR(-ENOMEM); - - xprt = &svsk->sk_xprt; - svc_xprt_init(net, &svc_tcp_bc_class, xprt, serv); - set_bit(XPT_CONG_CTRL, &svsk->sk_xprt.xpt_flags); - - serv->sv_bc_xprt = xprt; - - return xprt; -} - -/* - * Free a back channel svc_sock. - */ -static void svc_bc_sock_free(struct svc_xprt *xprt) -{ - if (xprt) - kfree(container_of(xprt, struct svc_sock, sk_xprt)); -} -#endif /* CONFIG_SUNRPC_BACKCHANNEL */ diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index e2d64c7138c3..8394124126f8 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -383,7 +383,7 @@ void xprt_iter_init_listall(struct rpc_xprt_iter *xpi, /** * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch * @xpi: pointer to rpc_xprt_iter - * @xps: pointer to a new rpc_xprt_switch or NULL + * @newswitch: pointer to a new rpc_xprt_switch or NULL * * Swaps out the existing xpi->xpi_xpswitch with a new value. */ @@ -401,7 +401,7 @@ struct rpc_xprt_switch *xprt_iter_xchg_switch(struct rpc_xprt_iter *xpi, /** * xprt_iter_destroy - Destroys the xprt iterator - * @xpi pointer to rpc_xprt_iter + * @xpi: pointer to rpc_xprt_iter */ void xprt_iter_destroy(struct rpc_xprt_iter *xpi) { diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 8bf19e142b6b..8ed0377d7a18 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o -rpcrdma-y := transport.o rpc_rdma.o verbs.o \ - fmr_ops.o frwr_ops.o \ +rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ module.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index e5b367a3e517..0de9b3e63770 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -5,7 +5,6 @@ * Support for backward direction RPCs on RPC/RDMA. 
*/ -#include <linux/module.h> #include <linux/sunrpc/xprt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svc_xprt.h> @@ -20,29 +19,16 @@ #undef RPCRDMA_BACKCHANNEL_DEBUG -static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, - struct rpc_rqst *rqst) -{ - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - - spin_lock(&buf->rb_reqslock); - list_del(&req->rl_all); - spin_unlock(&buf->rb_reqslock); - - rpcrdma_destroy_req(req); -} - static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, unsigned int count) { struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_req *req; struct rpc_rqst *rqst; unsigned int i; for (i = 0; i < (count << 1); i++) { struct rpcrdma_regbuf *rb; - struct rpcrdma_req *req; size_t size; req = rpcrdma_create_req(r_xprt); @@ -68,7 +54,7 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, return 0; out_fail: - rpcrdma_bc_free_rqst(r_xprt, rqst); + rpcrdma_req_destroy(req); return -ENOMEM; } @@ -101,7 +87,6 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) goto out_free; r_xprt->rx_buf.rb_bc_srv_max_requests = reqs; - request_module("svcrdma"); trace_xprtrdma_cb_setup(r_xprt, reqs); return 0; @@ -114,26 +99,6 @@ out_err: } /** - * xprt_rdma_bc_up - Create transport endpoint for backchannel service - * @serv: server endpoint - * @net: network namespace - * - * The "xprt" is an implied argument: it supplies the name of the - * backchannel transport class. - * - * Returns zero on success, negative errno on failure - */ -int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) -{ - int ret; - - ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0); - if (ret < 0) - return ret; - return 0; -} - -/** * xprt_rdma_bc_maxpayload - Return maximum backchannel message size * @xprt: transport * @@ -193,21 +158,21 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) */ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) { - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt); + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_req *req = rpcr_to_rdmar(rqst); int rc; - if (!xprt_connected(rqst->rq_xprt)) - goto drop_connection; + if (!xprt_connected(xprt)) + return -ENOTCONN; - if (!xprt_request_get_cong(rqst->rq_xprt, rqst)) + if (!xprt_request_get_cong(xprt, rqst)) return -EBADSLT; rc = rpcrdma_bc_marshal_reply(rqst); if (rc < 0) goto failed_marshal; - rpcrdma_post_recvs(r_xprt, true); if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) goto drop_connection; return 0; @@ -216,7 +181,7 @@ failed_marshal: if (rc != -ENOTCONN) return rc; drop_connection: - xprt_disconnect_done(rqst->rq_xprt); + xprt_rdma_close(xprt); return -ENOTCONN; } @@ -227,7 +192,6 @@ drop_connection: */ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) { - struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpc_rqst *rqst, *tmp; spin_lock(&xprt->bc_pa_lock); @@ -235,7 +199,7 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) list_del(&rqst->rq_bc_pa_list); spin_unlock(&xprt->bc_pa_lock); - rpcrdma_bc_free_rqst(r_xprt, rqst); + rpcrdma_req_destroy(rpcr_to_rdmar(rqst)); spin_lock(&xprt->bc_pa_lock); } @@ -251,9 +215,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpc_xprt *xprt = rqst->rq_xprt; - dprintk("RPC: %s: freeing rqst %p (req %p)\n", - __func__, rqst, req); - rpcrdma_recv_buffer_put(req->rl_reply); req->rl_reply = 
NULL; @@ -339,7 +300,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, out_overflow: pr_warn("RPC/RDMA backchannel overflow\n"); - xprt_disconnect_done(xprt); + xprt_force_disconnect(xprt); /* This receive buffer gets reposted automatically * when the connection is re-established. */ diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c deleted file mode 100644 index 7f5632cd5a48..000000000000 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ /dev/null @@ -1,337 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2015, 2017 Oracle. All rights reserved. - * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. - */ - -/* Lightweight memory registration using Fast Memory Regions (FMR). - * Referred to sometimes as MTHCAFMR mode. - * - * FMR uses synchronous memory registration and deregistration. - * FMR registration is known to be fast, but FMR deregistration - * can take tens of usecs to complete. - */ - -/* Normal operation - * - * A Memory Region is prepared for RDMA READ or WRITE using the - * ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is - * finished, the Memory Region is unmapped using the ib_unmap_fmr - * verb (fmr_op_unmap). - */ - -#include <linux/sunrpc/svc_rdma.h> - -#include "xprt_rdma.h" -#include <trace/events/rpcrdma.h> - -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_TRANS -#endif - -/* Maximum scatter/gather per FMR */ -#define RPCRDMA_MAX_FMR_SGES (64) - -/* Access mode of externally registered pages */ -enum { - RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_REMOTE_READ, -}; - -bool -fmr_is_supported(struct rpcrdma_ia *ia) -{ - if (!ia->ri_device->alloc_fmr) { - pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n", - ia->ri_device->name); - return false; - } - return true; -} - -static void -__fmr_unmap(struct rpcrdma_mr *mr) -{ - LIST_HEAD(l); - int rc; - - list_add(&mr->fmr.fm_mr->list, &l); - rc = ib_unmap_fmr(&l); - list_del(&mr->fmr.fm_mr->list); - if (rc) - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - mr, rc); -} - -/* Release an MR. - */ -static void -fmr_op_release_mr(struct rpcrdma_mr *mr) -{ - int rc; - - kfree(mr->fmr.fm_physaddrs); - kfree(mr->mr_sg); - - /* In case this one was left mapped, try to unmap it - * to prevent dealloc_fmr from failing with EBUSY - */ - __fmr_unmap(mr); - - rc = ib_dealloc_fmr(mr->fmr.fm_mr); - if (rc) - pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n", - mr, rc); - - kfree(mr); -} - -/* MRs are dynamically allocated, so simply clean up and release the MR. - * A replacement MR will subsequently be allocated on demand. 
- */ -static void -fmr_mr_recycle_worker(struct work_struct *work) -{ - struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - - trace_xprtrdma_mr_recycle(mr); - - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - - spin_lock(&r_xprt->rx_buf.rb_mrlock); - list_del(&mr->mr_all); - r_xprt->rx_stats.mrs_recycled++; - spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mr); -} - -static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - static struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(u64), GFP_KERNEL); - if (!mr->fmr.fm_physaddrs) - goto out_free; - - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) - goto out_free; - - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, - &fmr_attr); - if (IS_ERR(mr->fmr.fm_mr)) - goto out_fmr_err; - - INIT_LIST_HEAD(&mr->mr_list); - INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); - return 0; - -out_fmr_err: - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mr->fmr.fm_mr)); - -out_free: - kfree(mr->mr_sg); - kfree(mr->fmr.fm_physaddrs); - return -ENOMEM; -} - -/* On success, sets: - * ep->rep_attr.cap.max_send_wr - * ep->rep_attr.cap.max_recv_wr - * cdata->max_requests - * ia->ri_max_segs - */ -static int -fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata) -{ - int max_qp_wr; - - max_qp_wr = ia->ri_device->attrs.max_qp_wr; - max_qp_wr -= RPCRDMA_BACKWARD_WRS; - max_qp_wr -= 1; - if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) - return -ENOMEM; - if (cdata->max_requests > max_qp_wr) - cdata->max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = cdata->max_requests; - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = cdata->max_requests; - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - - ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / - RPCRDMA_MAX_FMR_SGES); - ia->ri_max_segs += 2; /* segments for head and tail buffers */ - return 0; -} - -/* FMR mode conveys up to 64 pages of payload per chunk segment. - */ -static size_t -fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) -{ - return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES); -} - -/* Use the ib_map_phys_fmr() verb to register a memory region - * for remote access via RDMA READ or RDMA WRITE. 
- */ -static struct rpcrdma_mr_seg * -fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mr **out) -{ - struct rpcrdma_mr_seg *seg1 = seg; - int len, pageoff, i, rc; - struct rpcrdma_mr *mr; - u64 *dma_pages; - - mr = rpcrdma_mr_get(r_xprt); - if (!mr) - return ERR_PTR(-EAGAIN); - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (nsegs > RPCRDMA_MAX_FMR_SGES) - nsegs = RPCRDMA_MAX_FMR_SGES; - for (i = 0; i < nsegs;) { - if (seg->mr_page) - sg_set_page(&mr->mr_sg[i], - seg->mr_page, - seg->mr_len, - offset_in_page(seg->mr_offset)); - else - sg_set_buf(&mr->mr_sg[i], seg->mr_offset, - seg->mr_len); - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - mr->mr_dir = rpcrdma_data_dir(writing); - - mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, i, mr->mr_dir); - if (!mr->mr_nents) - goto out_dmamap_err; - trace_xprtrdma_mr_map(mr); - - for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) - dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); - rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents, - dma_pages[0]); - if (rc) - goto out_maperr; - - mr->mr_handle = mr->fmr.fm_mr->rkey; - mr->mr_length = len; - mr->mr_offset = dma_pages[0] + pageoff; - - *out = mr; - return seg; - -out_dmamap_err: - pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mr->mr_sg, i); - rpcrdma_mr_put(mr); - return ERR_PTR(-EIO); - -out_maperr: - pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", - len, (unsigned long long)dma_pages[0], - pageoff, mr->mr_nents, rc); - rpcrdma_mr_unmap_and_put(mr); - return ERR_PTR(-EIO); -} - -/* Post Send WR containing the RPC Call message. - */ -static int -fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) -{ - return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, NULL); -} - -/* Invalidate all memory regions that were registered for "req". - * - * Sleeps until it is safe for the host CPU to access the - * previously mapped memory regions. - * - * Caller ensures that @mrs is not empty before the call. This - * function empties the list. - */ -static void -fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) -{ - struct rpcrdma_mr *mr; - LIST_HEAD(unmap_list); - int rc; - - /* ORDER: Invalidate all of the req's MRs first - * - * ib_unmap_fmr() is slow, so use a single call instead - * of one call per mapped FMR. - */ - list_for_each_entry(mr, mrs, mr_list) { - dprintk("RPC: %s: unmapping fmr %p\n", - __func__, &mr->fmr); - trace_xprtrdma_mr_localinv(mr); - list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); - } - r_xprt->rx_stats.local_inv_needed++; - rc = ib_unmap_fmr(&unmap_list); - if (rc) - goto out_release; - - /* ORDER: Now DMA unmap all of the req's MRs, and return - * them to the free MW list. 
- */ - while (!list_empty(mrs)) { - mr = rpcrdma_mr_pop(mrs); - list_del(&mr->fmr.fm_mr->list); - rpcrdma_mr_unmap_and_put(mr); - } - - return; - -out_release: - pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); - - while (!list_empty(mrs)) { - mr = rpcrdma_mr_pop(mrs); - list_del(&mr->fmr.fm_mr->list); - rpcrdma_mr_recycle(mr); - } -} - -const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { - .ro_map = fmr_op_map, - .ro_send = fmr_op_send, - .ro_unmap_sync = fmr_op_unmap_sync, - .ro_open = fmr_op_open, - .ro_maxpages = fmr_op_maxpages, - .ro_init_mr = fmr_op_init_mr, - .ro_release_mr = fmr_op_release_mr, - .ro_displayname = "fmr", - .ro_send_w_inv_ok = 0, -}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index fc6378cc0c1c..6a561056b538 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -15,21 +15,21 @@ /* Normal operation * * A Memory Region is prepared for RDMA READ or WRITE using a FAST_REG - * Work Request (frwr_op_map). When the RDMA operation is finished, this + * Work Request (frwr_map). When the RDMA operation is finished, this * Memory Region is invalidated using a LOCAL_INV Work Request - * (frwr_op_unmap_sync). + * (frwr_unmap_sync). * * Typically these Work Requests are not signaled, and neither are RDMA * SEND Work Requests (with the exception of signaling occasionally to * prevent provider work queue overflows). This greatly reduces HCA * interrupt workload. * - * As an optimization, frwr_op_unmap marks MRs INVALID before the + * As an optimization, frwr_unmap marks MRs INVALID before the * LOCAL_INV WR is posted. If posting succeeds, the MR is placed on * rb_mrs immediately so that no work (like managing a linked list * under a spinlock) is needed in the completion upcall. * - * But this means that frwr_op_map() can occasionally encounter an MR + * But this means that frwr_map() can occasionally encounter an MR * that is INVALID but the LOCAL_INV WR has not completed. Work Queue * ordering prevents a subsequent FAST_REG WR from executing against * that MR while it is still being invalidated. @@ -57,14 +57,14 @@ * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR * state, and the pending WR was flushed. * - * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered + * When frwr_map encounters FLUSHED and VALID MRs, they are recovered * with ib_dereg_mr and then are re-initialized. Because MR recovery * allocates fresh resources, it is deferred to a workqueue, and the * recovered MRs are placed back on the rb_mrs list when recovery is - * complete. frwr_op_map allocates another MR for the current RPC while + * complete. frwr_map allocates another MR for the current RPC while * the broken MR is reset. * - * To ensure that frwr_op_map doesn't encounter an MR that is marked + * To ensure that frwr_map doesn't encounter an MR that is marked * INVALID but that is about to be flushed due to a previous transport * disconnect, the transport connect worker attempts to drain all * pending send queue WRs before the transport is reconnected. 
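The comment block above describes the FRWR invalidation pattern that frwr_unmap_sync relies on: an MR is flagged INVALID before its LOCAL_INV Work Request is posted, so the completion upcall has no list work to do, and MRs whose WRs were flushed by a QP error are recycled later from process context. The following is a minimal, out-of-tree sketch of that pattern only, not the in-tree frwr_ops code; the example_* names and the simplified state enum are hypothetical, and only ib_post_send(), IB_WR_LOCAL_INV, struct ib_mr, and struct ib_qp are assumed from the kernel verbs API.

/* Illustrative sketch: mark-INVALID-before-posting-LOCAL_INV, using
 * simplified, hypothetical types. A real WR would also set wr_cqe so the
 * flushed completion can move the MR to EX_MR_FLUSHED_LI and queue the
 * recycle work; that handling is omitted here for brevity.
 */
#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>

enum example_mr_state {
	EX_MR_INVALID,		/* safe to reuse for a new FAST_REG */
	EX_MR_VALID,		/* currently registered for an RPC */
	EX_MR_FLUSHED_LI,	/* LOCAL_INV flushed by a QP error */
};

struct example_mr {
	enum example_mr_state	state;
	struct ib_mr		*ibmr;		/* from ib_alloc_mr() */
	struct work_struct	recycle;	/* deferred re-initialization */
};

/* Post a LOCAL_INV for @mr. The MR is marked INVALID *before* the WR is
 * posted; if posting succeeds, the MR can go straight back on the free
 * list, and if the QP later enters ERROR state the recycle worker
 * re-initializes it instead.
 */
static int example_post_local_inv(struct ib_qp *qp, struct example_mr *mr)
{
	struct ib_send_wr wr = {
		.opcode			= IB_WR_LOCAL_INV,
		.ex.invalidate_rkey	= mr->ibmr->rkey,
	};

	mr->state = EX_MR_INVALID;
	return ib_post_send(qp, &wr, NULL);
}

The point of the ordering is visible in the sketch: because the state flip happens on the posting path, the interrupt-context completion handler never needs to take a spinlock to manage MR lists, which is exactly the optimization the frwr_ops header comment describes.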
@@ -80,8 +80,13 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -bool -frwr_is_supported(struct rpcrdma_ia *ia) +/** + * frwr_is_supported - Check if device supports FRWR + * @ia: interface adapter to check + * + * Returns true if device supports FRWR, otherwise false + */ +bool frwr_is_supported(struct rpcrdma_ia *ia) { struct ib_device_attr *attrs = &ia->ri_device->attrs; @@ -97,15 +102,18 @@ out_not_supported: return false; } -static void -frwr_op_release_mr(struct rpcrdma_mr *mr) +/** + * frwr_release_mr - Destroy one MR + * @mr: MR allocated by frwr_init_mr + * + */ +void frwr_release_mr(struct rpcrdma_mr *mr) { int rc; rc = ib_dereg_mr(mr->frwr.fr_mr); if (rc) - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - mr, rc); + trace_xprtrdma_frwr_dereg(mr, rc); kfree(mr->mr_sg); kfree(mr); } @@ -117,60 +125,78 @@ static void frwr_mr_recycle_worker(struct work_struct *work) { struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); - enum rpcrdma_frwr_state state = mr->frwr.fr_state; struct rpcrdma_xprt *r_xprt = mr->mr_xprt; trace_xprtrdma_mr_recycle(mr); - if (state != FRWR_FLUSHED_LI) { + if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); + mr->mr_dir = DMA_NONE; } spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_mrlock); - frwr_op_release_mr(mr); + + frwr_release_mr(mr); } -static int -frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +/** + * frwr_init_mr - Initialize one MR + * @ia: interface adapter + * @mr: generic MR to prepare for FRWR + * + * Returns zero if successful. Otherwise a negative errno + * is returned. + */ +int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { unsigned int depth = ia->ri_max_frwr_depth; - struct rpcrdma_frwr *frwr = &mr->frwr; + struct scatterlist *sg; + struct ib_mr *frmr; int rc; - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); - if (IS_ERR(frwr->fr_mr)) + frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + if (IS_ERR(frmr)) goto out_mr_err; - mr->mr_sg = kcalloc(depth, sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) + sg = kcalloc(depth, sizeof(*sg), GFP_KERNEL); + if (!sg) goto out_list_err; + mr->frwr.fr_mr = frmr; + mr->frwr.fr_state = FRWR_IS_INVALID; + mr->mr_dir = DMA_NONE; INIT_LIST_HEAD(&mr->mr_list); INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); - sg_init_table(mr->mr_sg, depth); - init_completion(&frwr->fr_linv_done); + init_completion(&mr->frwr.fr_linv_done); + + sg_init_table(sg, depth); + mr->mr_sg = sg; return 0; out_mr_err: - rc = PTR_ERR(frwr->fr_mr); - dprintk("RPC: %s: ib_alloc_mr status %i\n", - __func__, rc); + rc = PTR_ERR(frmr); + trace_xprtrdma_frwr_alloc(mr, rc); return rc; out_list_err: - rc = -ENOMEM; dprintk("RPC: %s: sg allocation failure\n", __func__); - ib_dereg_mr(frwr->fr_mr); - return rc; + ib_dereg_mr(frmr); + return -ENOMEM; } -/* On success, sets: +/** + * frwr_open - Prepare an endpoint for use with FRWR + * @ia: interface adapter this endpoint will use + * @ep: endpoint to prepare + * @cdata: transport parameters + * + * On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr * cdata->max_requests @@ -179,10 +205,11 @@ out_list_err: * And these FRWR-related fields: * ia->ri_max_frwr_depth * ia->ri_mrtype + * + * On failure, a negative errno is returned. 
*/ -static int -frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, - struct rpcrdma_create_data_internal *cdata) +int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) { struct ib_device_attr *attrs = &ia->ri_device->attrs; int max_qp_wr, depth, delta; @@ -191,10 +218,17 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; - ia->ri_max_frwr_depth = - min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - attrs->max_fast_reg_page_list_len); - dprintk("RPC: %s: device's max FR page list len = %u\n", + /* Quirk: Some devices advertise a large max_fast_reg_page_list_len + * capability, but perform optimally when the MRs are not larger + * than a page. + */ + if (attrs->max_sge_rd > 1) + ia->ri_max_frwr_depth = attrs->max_sge_rd; + else + ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len; + if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS) + ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS; + dprintk("RPC: %s: max FR page list depth = %u\n", __func__, ia->ri_max_frwr_depth); /* Add room for frwr register and invalidate WRs. @@ -242,20 +276,28 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth); - ia->ri_max_segs += 2; /* segments for head and tail buffers */ + /* Reply chunks require segments for head and tail buffers */ + ia->ri_max_segs += 2; + if (ia->ri_max_segs > RPCRDMA_MAX_HDR_SEGS) + ia->ri_max_segs = RPCRDMA_MAX_HDR_SEGS; return 0; } -/* FRWR mode conveys a list of pages per chunk segment. The +/** + * frwr_maxpages - Compute size of largest payload + * @r_xprt: transport + * + * Returns maximum size of an RPC message, in pages. + * + * FRWR mode conveys a list of pages per chunk segment. The * maximum length of that list is the FRWR page list depth. */ -static size_t -frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - RPCRDMA_MAX_HDR_SEGS * ia->ri_max_frwr_depth); + (ia->ri_max_segs - 2) * ia->ri_max_frwr_depth); } static void @@ -332,12 +374,25 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) trace_xprtrdma_wc_li_wake(wc, frwr); } -/* Post a REG_MR Work Request to register a memory region +/** + * frwr_map - Register a memory region + * @r_xprt: controlling transport + * @seg: memory region co-ordinates + * @nsegs: number of segments remaining + * @writing: true when RDMA Write will be used + * @xid: XID of RPC using the registered memory + * @out: initialized MR + * + * Prepare a REG_MR Work Request to register a memory region * for remote access via RDMA READ or RDMA WRITE. + * + * Returns the next segment or a negative errno pointer. + * On success, the prepared MR is planted in @out. 
*/ -static struct rpcrdma_mr_seg * -frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, - int nsegs, bool writing, struct rpcrdma_mr **out) +struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, u32 xid, + struct rpcrdma_mr **out) { struct rpcrdma_ia *ia = &r_xprt->rx_ia; bool holes_ok = ia->ri_mrtype == IB_MR_TYPE_SG_GAPS; @@ -384,13 +439,14 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_mr_map(mr); ibmr = frwr->fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); if (unlikely(n != mr->mr_nents)) goto out_mapmr_err; + ibmr->iova &= 0x00000000ffffffff; + ibmr->iova |= ((u64)cpu_to_be32(xid)) << 32; key = (u8)(ibmr->rkey & 0x000000FF); ib_update_fast_reg_key(ibmr, ++key); @@ -404,32 +460,35 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_handle = ibmr->rkey; mr->mr_length = ibmr->length; mr->mr_offset = ibmr->iova; + trace_xprtrdma_mr_map(mr); *out = mr; return seg; out_dmamap_err: - pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n", - mr->mr_sg, i); frwr->fr_state = FRWR_IS_INVALID; + trace_xprtrdma_frwr_sgerr(mr, i); rpcrdma_mr_put(mr); return ERR_PTR(-EIO); out_mapmr_err: - pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", - frwr->fr_mr, n, mr->mr_nents); + trace_xprtrdma_frwr_maperr(mr, n); rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } -/* Post Send WR containing the RPC Call message. +/** + * frwr_send - post Send WR containing the RPC Call message + * @ia: interface adapter + * @req: Prepared RPC Call * - * For FRMR, chain any FastReg WRs to the Send WR. Only a + * For FRWR, chain any FastReg WRs to the Send WR. Only a * single ib_post_send call is needed to register memory * and then post the Send WR. + * + * Returns the result of ib_post_send. */ -static int -frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) { struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -451,15 +510,18 @@ frwr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) } /* If ib_post_send fails, the next ->send_request for - * @req will queue these MWs for recovery. + * @req will queue these MRs for recovery. */ return ib_post_send(ia->ri_id->qp, post_wr, NULL); } -/* Handle a remotely invalidated mr on the @mrs list +/** + * frwr_reminv - handle a remotely invalidated mr on the @mrs list + * @rep: Received reply + * @mrs: list of MRs to check + * */ -static void -frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) +void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) { struct rpcrdma_mr *mr; @@ -473,7 +535,10 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) } } -/* Invalidate all memory regions that were registered for "req". +/** + * frwr_unmap_sync - invalidate memory regions that were registered for @req + * @r_xprt: controlling transport + * @mrs: list of MRs to process * * Sleeps until it is safe for the host CPU to access the * previously mapped memory regions. @@ -481,8 +546,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) * Caller ensures that @mrs is not empty before the call. This * function empties the list. 
*/ -static void -frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) +void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) { struct ib_send_wr *first, **prev, *last; const struct ib_send_wr *bad_wr; @@ -561,20 +625,7 @@ out_release: mr = container_of(frwr, struct rpcrdma_mr, frwr); bad_wr = bad_wr->next; - list_del(&mr->mr_list); - frwr_op_release_mr(mr); + list_del_init(&mr->mr_list); + rpcrdma_mr_recycle(mr); } } - -const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { - .ro_map = frwr_op_map, - .ro_send = frwr_op_send, - .ro_reminv = frwr_op_reminv, - .ro_unmap_sync = frwr_op_unmap_sync, - .ro_open = frwr_op_open, - .ro_maxpages = frwr_op_maxpages, - .ro_init_mr = frwr_op_init_mr, - .ro_release_mr = frwr_op_release_mr, - .ro_displayname = "frwr", - .ro_send_w_inv_ok = RPCRDMA_CMP_F_SND_W_INV_OK, -}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 9f53e0240035..d18614e02b4e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -218,11 +218,12 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT); page_base = offset_in_page(xdrbuf->page_base); while (len) { - if (unlikely(!*ppages)) { - /* XXX: Certain upper layer operations do - * not provide receive buffer pages. - */ - *ppages = alloc_page(GFP_ATOMIC); + /* ACL likes to be lazy in allocating pages - ACLs + * are small by default but can get huge. + */ + if (unlikely(xdrbuf->flags & XDRBUF_SPARSE_PAGES)) { + if (!*ppages) + *ppages = alloc_page(GFP_ATOMIC); if (!*ppages) return -ENOBUFS; } @@ -356,8 +357,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, return nsegs; do { - seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - false, &mr); + seg = frwr_map(r_xprt, seg, nsegs, false, rqst->rq_xid, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); @@ -365,7 +365,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_read_segment(xdr, mr, pos) < 0) return -EMSGSIZE; - trace_xprtrdma_read_chunk(rqst->rq_task, pos, mr, nsegs); + trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs); r_xprt->rx_stats.read_chunk_count++; nsegs -= mr->mr_nents; } while (nsegs); @@ -414,8 +414,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mr); + seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); @@ -423,7 +422,7 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - trace_xprtrdma_write_chunk(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.write_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; @@ -472,8 +471,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, nchunks = 0; do { - seg = r_xprt->rx_ia.ri_ops->ro_map(r_xprt, seg, nsegs, - true, &mr); + seg = frwr_map(r_xprt, seg, nsegs, true, rqst->rq_xid, &mr); if (IS_ERR(seg)) return PTR_ERR(seg); rpcrdma_mr_push(mr, &req->rl_registered); @@ -481,7 +479,7 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, if (encode_rdma_segment(xdr, mr) < 0) return -EMSGSIZE; - 
trace_xprtrdma_reply_chunk(rqst->rq_task, mr, nsegs); + trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs); r_xprt->rx_stats.reply_chunk_count++; r_xprt->rx_stats.total_rdma_request += mr->mr_length; nchunks++; @@ -667,7 +665,7 @@ out_mapping_overflow: out_mapping_err: rpcrdma_unmap_sendctx(sc); - pr_err("rpcrdma: Send mapping error\n"); + trace_xprtrdma_dma_maperr(sge[sge_no].addr); return false; } @@ -1188,17 +1186,20 @@ rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep, p = xdr_inline_decode(xdr, 2 * sizeof(*p)); if (!p) break; - dprintk("RPC: %5u: %s: server reports version error (%u-%u)\n", - rqst->rq_task->tk_pid, __func__, - be32_to_cpup(p), be32_to_cpu(*(p + 1))); + dprintk("RPC: %s: server reports " + "version error (%u-%u), xid %08x\n", __func__, + be32_to_cpup(p), be32_to_cpu(*(p + 1)), + be32_to_cpu(rep->rr_xid)); break; case err_chunk: - dprintk("RPC: %5u: %s: server reports header decoding error\n", - rqst->rq_task->tk_pid, __func__); + dprintk("RPC: %s: server reports " + "header decoding error, xid %08x\n", __func__, + be32_to_cpu(rep->rr_xid)); break; default: - dprintk("RPC: %5u: %s: server reports unrecognized error %d\n", - rqst->rq_task->tk_pid, __func__, be32_to_cpup(p)); + dprintk("RPC: %s: server reports " + "unrecognized error %d, xid %08x\n", __func__, + be32_to_cpup(p), be32_to_cpu(rep->rr_xid)); } r_xprt->rx_stats.bad_reply_count++; @@ -1248,7 +1249,6 @@ out: out_badheader: trace_xprtrdma_reply_hdr(rep); r_xprt->rx_stats.bad_reply_count++; - status = -EIO; goto out; } @@ -1262,8 +1262,7 @@ void rpcrdma_release_rqst(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) * RPC has relinquished all its Send Queue entries. */ if (!list_empty(&req->rl_registered)) - r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, - &req->rl_registered); + frwr_unmap_sync(r_xprt, &req->rl_registered); /* Ensure that any DMA mapped pages associated with * the Send of the RPC Call have been unmapped before @@ -1292,7 +1291,7 @@ void rpcrdma_deferred_completion(struct work_struct *work) trace_xprtrdma_defer_cmp(rep); if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) - r_xprt->rx_ia.ri_ops->ro_reminv(rep, &req->rl_registered); + frwr_reminv(rep, &req->rl_registered); rpcrdma_release_rqst(r_xprt, req); rpcrdma_complete_rqst(rep); } @@ -1312,11 +1311,6 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) u32 credits; __be32 *p; - --buf->rb_posted_receives; - - if (rep->rr_hdrbuf.head[0].iov_len == 0) - goto out_badstatus; - /* Fixed transport header fields */ xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf, rep->rr_hdrbuf.head[0].iov_base); @@ -1356,36 +1350,30 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) } req = rpcr_to_rdmar(rqst); + if (req->rl_reply) { + trace_xprtrdma_leaked_rep(rqst, req->rl_reply); + rpcrdma_recv_buffer_put(req->rl_reply); + } req->rl_reply = rep; rep->rr_rqst = rqst; clear_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags); trace_xprtrdma_reply(rqst->rq_task, rep, req, credits); - - rpcrdma_post_recvs(r_xprt, false); - queue_work(rpcrdma_receive_wq, &rep->rr_work); + queue_work(buf->rb_completion_wq, &rep->rr_work); return; out_badversion: trace_xprtrdma_reply_vers(rep); - goto repost; + goto out; -/* The RPC transaction has already been terminated, or the header - * is corrupt. - */ out_norqst: spin_unlock(&xprt->queue_lock); trace_xprtrdma_reply_rqst(rep); - goto repost; + goto out; out_shortreply: trace_xprtrdma_reply_short(rep); -/* If no pending RPC transaction was matched, post a replacement - * receive buffer before returning. 
- */ -repost: - rpcrdma_post_recvs(r_xprt, false); -out_badstatus: +out: rpcrdma_recv_buffer_put(rep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 134bef6a451e..abdb3004a1e3 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -235,9 +235,6 @@ void svc_rdma_cleanup(void) unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; } -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - svc_unreg_xprt_class(&svc_rdma_bc_class); -#endif svc_unreg_xprt_class(&svc_rdma_class); } @@ -259,8 +256,5 @@ int svc_rdma_init(void) /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); -#if defined(CONFIG_SUNRPC_BACKCHANNEL) - svc_reg_xprt_class(&svc_rdma_bc_class); -#endif return 0; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index f3c147d70286..b908f2ca08fd 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -200,11 +200,10 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) svc_rdma_send_ctxt_put(rdma, ctxt); goto drop_connection; } - return rc; + return 0; drop_connection: dprintk("svcrdma: failed to send bc call\n"); - xprt_disconnect_done(xprt); return -ENOTCONN; } @@ -225,8 +224,11 @@ xprt_rdma_bc_send_request(struct rpc_rqst *rqst) ret = -ENOTCONN; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); - if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) { ret = rpcrdma_bc_send_request(rdma, rqst); + if (ret == -ENOTCONN) + svc_close_xprt(sxprt); + } mutex_unlock(&sxprt->xpt_mutex); diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index b24d5b8f2fee..828b149eaaef 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -485,6 +485,68 @@ static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) return p; } +/* RPC-over-RDMA Version One private extension: Remote Invalidation. + * Responder's choice: requester signals it can handle Send With + * Invalidate, and responder chooses one R_key to invalidate. + * + * If there is exactly one distinct R_key in the received transport + * header, set rc_inv_rkey to that R_key. Otherwise, set it to zero. + * + * Perform this operation while the received transport header is + * still in the CPU cache. + */ +static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, + struct svc_rdma_recv_ctxt *ctxt) +{ + __be32 inv_rkey, *p; + u32 i, segcount; + + ctxt->rc_inv_rkey = 0; + + if (!rdma->sc_snd_w_inv) + return; + + inv_rkey = xdr_zero; + p = ctxt->rc_recv_buf; + p += rpcrdma_fixed_maxsz; + + /* Read list */ + while (*p++ != xdr_zero) { + p++; /* position */ + if (inv_rkey == xdr_zero) + inv_rkey = *p; + else if (inv_rkey != *p) + return; + p += 4; + } + + /* Write list */ + while (*p++ != xdr_zero) { + segcount = be32_to_cpup(p++); + for (i = 0; i < segcount; i++) { + if (inv_rkey == xdr_zero) + inv_rkey = *p; + else if (inv_rkey != *p) + return; + p += 4; + } + } + + /* Reply chunk */ + if (*p++ != xdr_zero) { + segcount = be32_to_cpup(p++); + for (i = 0; i < segcount; i++) { + if (inv_rkey == xdr_zero) + inv_rkey = *p; + else if (inv_rkey != *p) + return; + p += 4; + } + } + + ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); +} + /* On entry, xdr->head[0].iov_base points to first byte in the * RPC-over-RDMA header. 
* @@ -746,6 +808,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) svc_rdma_recv_ctxt_put(rdma_xprt, ctxt); return ret; } + svc_rdma_get_inv_rkey(rdma_xprt, ctxt); p += rpcrdma_fixed_maxsz; if (*p != xdr_zero) diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 8602a5f1b515..cf51b8f9b15f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -484,32 +484,6 @@ static void svc_rdma_get_write_arrays(__be32 *rdma_argp, *reply = NULL; } -/* RPC-over-RDMA Version One private extension: Remote Invalidation. - * Responder's choice: requester signals it can handle Send With - * Invalidate, and responder chooses one rkey to invalidate. - * - * Find a candidate rkey to invalidate when sending a reply. Picks the - * first R_key it finds in the chunk lists. - * - * Returns zero if RPC's chunk lists are empty. - */ -static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, - __be32 *wr_lst, __be32 *rp_ch) -{ - __be32 *p; - - p = rdma_argp + rpcrdma_fixed_maxsz; - if (*p != xdr_zero) - p += 2; - else if (wr_lst && be32_to_cpup(wr_lst + 1)) - p = wr_lst + 2; - else if (rp_ch && be32_to_cpup(rp_ch + 1)) - p = rp_ch + 2; - else - return 0; - return be32_to_cpup(p); -} - static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, struct page *page, @@ -672,7 +646,7 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * * RDMA Send is the last step of transmitting an RPC reply. Pages * involved in the earlier RDMA Writes are here transferred out - * of the rqstp and into the ctxt's page array. These pages are + * of the rqstp and into the sctxt's page array. These pages are * DMA unmapped by each Write completion, but the subsequent Send * completion finally releases these pages. * @@ -680,32 +654,31 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, * - The Reply's transport header will never be larger than a page. 
*/ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - __be32 *rdma_argp, + struct svc_rdma_send_ctxt *sctxt, + struct svc_rdma_recv_ctxt *rctxt, struct svc_rqst *rqstp, __be32 *wr_lst, __be32 *rp_ch) { int ret; if (!rp_ch) { - ret = svc_rdma_map_reply_msg(rdma, ctxt, + ret = svc_rdma_map_reply_msg(rdma, sctxt, &rqstp->rq_res, wr_lst); if (ret < 0) return ret; } - svc_rdma_save_io_pages(rqstp, ctxt); + svc_rdma_save_io_pages(rqstp, sctxt); - ctxt->sc_send_wr.opcode = IB_WR_SEND; - if (rdma->sc_snd_w_inv) { - ctxt->sc_send_wr.ex.invalidate_rkey = - svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); - if (ctxt->sc_send_wr.ex.invalidate_rkey) - ctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; + if (rctxt->rc_inv_rkey) { + sctxt->sc_send_wr.opcode = IB_WR_SEND_WITH_INV; + sctxt->sc_send_wr.ex.invalidate_rkey = rctxt->rc_inv_rkey; + } else { + sctxt->sc_send_wr.opcode = IB_WR_SEND; } dprintk("svcrdma: posting Send WR with %u sge(s)\n", - ctxt->sc_send_wr.num_sge); - return svc_rdma_send(rdma, &ctxt->sc_send_wr); + sctxt->sc_send_wr.num_sge); + return svc_rdma_send(rdma, &sctxt->sc_send_wr); } /* Given the client-provided Write and Reply chunks, the server was not @@ -741,10 +714,6 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, return 0; } -void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) -{ -} - /** * svc_rdma_sendto - Transmit an RPC reply * @rqstp: processed RPC request, reply XDR already in ::rq_res @@ -809,7 +778,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) } svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp)); - ret = svc_rdma_send_reply_msg(rdma, sctxt, rdma_argp, rqstp, + ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp, wr_lst, rp_ch); if (ret < 0) goto err1; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 2f7ec8912f49..924c17d46903 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -85,7 +85,6 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_release_rqst = svc_rdma_release_rqst, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, - .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, .xpo_has_wspace = svc_rdma_has_wspace, .xpo_accept = svc_rdma_accept, .xpo_secure_port = svc_rdma_secure_port, @@ -100,64 +99,6 @@ struct svc_xprt_class svc_rdma_class = { .xcl_ident = XPRT_TRANSPORT_RDMA, }; -#if defined(CONFIG_SUNRPC_BACKCHANNEL) -static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, - struct sockaddr *, int, int); -static void svc_rdma_bc_detach(struct svc_xprt *); -static void svc_rdma_bc_free(struct svc_xprt *); - -static const struct svc_xprt_ops svc_rdma_bc_ops = { - .xpo_create = svc_rdma_bc_create, - .xpo_detach = svc_rdma_bc_detach, - .xpo_free = svc_rdma_bc_free, - .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, - .xpo_secure_port = svc_rdma_secure_port, -}; - -struct svc_xprt_class svc_rdma_bc_class = { - .xcl_name = "rdma-bc", - .xcl_owner = THIS_MODULE, - .xcl_ops = &svc_rdma_bc_ops, - .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN) -}; - -static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv, - struct net *net, - struct sockaddr *sa, int salen, - int flags) -{ - struct svcxprt_rdma *cma_xprt; - struct svc_xprt *xprt; - - cma_xprt = svc_rdma_create_xprt(serv, net); - if (!cma_xprt) - return ERR_PTR(-ENOMEM); - xprt = &cma_xprt->sc_xprt; - - svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv); - set_bit(XPT_CONG_CTRL, &xprt->xpt_flags); - 
serv->sv_bc_xprt = xprt; - - dprintk("svcrdma: %s(%p)\n", __func__, xprt); - return xprt; -} - -static void svc_rdma_bc_detach(struct svc_xprt *xprt) -{ - dprintk("svcrdma: %s(%p)\n", __func__, xprt); -} - -static void svc_rdma_bc_free(struct svc_xprt *xprt) -{ - struct svcxprt_rdma *rdma = - container_of(xprt, struct svcxprt_rdma, sc_xprt); - - dprintk("svcrdma: %s(%p)\n", __func__, xprt); - if (xprt) - kfree(rdma); -} -#endif /* CONFIG_SUNRPC_BACKCHANNEL */ - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index ae2a83828953..fbc171ebfe91 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -268,7 +268,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - trace_xprtrdma_inject_dsc(r_xprt); + trace_xprtrdma_op_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } @@ -284,7 +284,7 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - trace_xprtrdma_destroy(r_xprt); + trace_xprtrdma_op_destroy(r_xprt); cancel_delayed_work_sync(&r_xprt->rx_connect_worker); @@ -318,17 +318,12 @@ xprt_setup_rdma(struct xprt_create *args) struct sockaddr *sap; int rc; - if (args->addrlen > sizeof(xprt->addr)) { - dprintk("RPC: %s: address too large\n", __func__); + if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); - } xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, 0); - if (xprt == NULL) { - dprintk("RPC: %s: couldn't allocate rpcrdma_xprt\n", - __func__); + if (!xprt) return ERR_PTR(-ENOMEM); - } /* 60 second timeout, no retries */ xprt->timeout = &xprt_rdma_default_timeout; @@ -399,7 +394,7 @@ xprt_setup_rdma(struct xprt_create *args) INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); - xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + xprt->max_payload = frwr_maxpages(new_xprt); if (xprt->max_payload == 0) goto out4; xprt->max_payload <<= PAGE_SHIFT; @@ -423,7 +418,7 @@ out3: out2: rpcrdma_ia_close(&new_xprt->rx_ia); out1: - trace_xprtrdma_destroy(new_xprt); + trace_xprtrdma_op_destroy(new_xprt); xprt_rdma_free_addresses(xprt); xprt_free(xprt); return ERR_PTR(rc); @@ -433,29 +428,33 @@ out1: * xprt_rdma_close - close a transport connection * @xprt: transport context * - * Called during transport shutdown, reconnect, or device removal. + * Called during autoclose or device removal. + * * Caller holds @xprt's send lock to prevent activity on this * transport while the connection is torn down. 
*/ -static void -xprt_rdma_close(struct rpc_xprt *xprt) +void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - dprintk("RPC: %s: closing xprt %p\n", __func__, xprt); + might_sleep(); + + trace_xprtrdma_op_close(r_xprt); + + /* Prevent marshaling and sending of new requests */ + xprt_clear_connected(xprt); if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { - xprt_clear_connected(xprt); rpcrdma_ia_remove(ia); - return; + goto out; } + if (ep->rep_connected == -ENODEV) return; if (ep->rep_connected > 0) xprt->reestablish_timeout = 0; - xprt_disconnect_done(xprt); rpcrdma_ep_disconnect(ep, ia); /* Prepare @xprt for the next connection by reinitializing @@ -463,6 +462,10 @@ xprt_rdma_close(struct rpc_xprt *xprt) */ r_xprt->rx_buf.rb_credits = 1; xprt->cwnd = RPC_CWNDSHIFT; + +out: + ++xprt->connect_cookie; + xprt_disconnect_done(xprt); } /** @@ -525,6 +528,7 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + trace_xprtrdma_op_connect(r_xprt); if (r_xprt->rx_ep.rep_connected != 0) { /* Reconnect */ schedule_delayed_work(&r_xprt->rx_connect_worker, @@ -659,11 +663,11 @@ xprt_rdma_allocate(struct rpc_task *task) rqst->rq_buffer = req->rl_sendbuf->rg_base; rqst->rq_rbuffer = req->rl_recvbuf->rg_base; - trace_xprtrdma_allocate(task, req); + trace_xprtrdma_op_allocate(task, req); return 0; out_fail: - trace_xprtrdma_allocate(task, NULL); + trace_xprtrdma_op_allocate(task, NULL); return -ENOMEM; } @@ -682,7 +686,7 @@ xprt_rdma_free(struct rpc_task *task) if (test_bit(RPCRDMA_REQ_F_PENDING, &req->rl_flags)) rpcrdma_release_rqst(r_xprt, req); - trace_xprtrdma_rpc_done(task, req); + trace_xprtrdma_op_free(task, req); } /** @@ -696,8 +700,10 @@ xprt_rdma_free(struct rpc_task *task) * %-ENOTCONN if the caller should reconnect and call again * %-EAGAIN if the caller should call again * %-ENOBUFS if the caller should call again after a delay - * %-EIO if a permanent error occurred and the request was not - * sent. Do not try to send this message again. + * %-EMSGSIZE if encoding ran out of buffer space. The request + * was not sent. Do not try to send this message again. + * %-EIO if an I/O error occurred. The request was not sent. + * Do not try to send this message again. 
*/ static int xprt_rdma_send_request(struct rpc_rqst *rqst) @@ -713,7 +719,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) #endif /* CONFIG_SUNRPC_BACKCHANNEL */ if (!xprt_connected(xprt)) - goto drop_connection; + return -ENOTCONN; if (!xprt_request_get_cong(xprt, rqst)) return -EBADSLT; @@ -745,8 +751,8 @@ failed_marshal: if (rc != -ENOTCONN) return rc; drop_connection: - xprt_disconnect_done(xprt); - return -ENOTCONN; /* implies disconnect */ + xprt_rdma_close(xprt); + return -ENOTCONN; } void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) @@ -827,7 +833,6 @@ static const struct rpc_xprt_ops xprt_rdma_procs = { .inject_disconnect = xprt_rdma_inject_disconnect, #if defined(CONFIG_SUNRPC_BACKCHANNEL) .bc_setup = xprt_rdma_bc_setup, - .bc_up = xprt_rdma_bc_up, .bc_maxpayload = xprt_rdma_bc_maxpayload, .bc_free_rqst = xprt_rdma_bc_free_rqst, .bc_destroy = xprt_rdma_bc_destroy, @@ -844,58 +849,31 @@ static struct xprt_class xprt_rdma = { void xprt_rdma_cleanup(void) { - int rc; - - dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n"); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); sunrpc_table_header = NULL; } #endif - rc = xprt_unregister_transport(&xprt_rdma); - if (rc) - dprintk("RPC: %s: xprt_unregister returned %i\n", - __func__, rc); - - rpcrdma_destroy_wq(); - rc = xprt_unregister_transport(&xprt_rdma_bc); - if (rc) - dprintk("RPC: %s: xprt_unregister(bc) returned %i\n", - __func__, rc); + xprt_unregister_transport(&xprt_rdma); + xprt_unregister_transport(&xprt_rdma_bc); } int xprt_rdma_init(void) { int rc; - rc = rpcrdma_alloc_wq(); - if (rc) - return rc; - rc = xprt_register_transport(&xprt_rdma); - if (rc) { - rpcrdma_destroy_wq(); + if (rc) return rc; - } rc = xprt_register_transport(&xprt_rdma_bc); if (rc) { xprt_unregister_transport(&xprt_rdma); - rpcrdma_destroy_wq(); return rc; } - dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); - - dprintk("Defaults:\n"); - dprintk("\tSlots %d\n" - "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", - xprt_rdma_slot_table_entries, - xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); - dprintk("\tPadding 0\n\tMemreg %d\n", xprt_rdma_memreg_strategy); - #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) sunrpc_table_header = register_sysctl_table(sunrpc_table); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3ddba94c939f..7749a2bf6887 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -78,53 +78,25 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf); static int rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp); static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); +static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); -struct workqueue_struct *rpcrdma_receive_wq __read_mostly; - -int -rpcrdma_alloc_wq(void) -{ - struct workqueue_struct *recv_wq; - - recv_wq = alloc_workqueue("xprtrdma_receive", - WQ_MEM_RECLAIM | WQ_HIGHPRI, - 0); - if (!recv_wq) - return -ENOMEM; - - rpcrdma_receive_wq = recv_wq; - return 0; -} - -void -rpcrdma_destroy_wq(void) -{ - struct workqueue_struct *wq; - - if (rpcrdma_receive_wq) { - wq = rpcrdma_receive_wq; - rpcrdma_receive_wq = NULL; - destroy_workqueue(wq); - } -} - -/** - * rpcrdma_disconnect_worker - Force a disconnect - * @work: endpoint to be disconnected - * - * Provider callbacks can possibly run in an IRQ context. 
This function - * is invoked in a worker thread to guarantee that disconnect wake-up - * calls are always done in process context. +/* Wait for outstanding transport work to finish. */ -static void -rpcrdma_disconnect_worker(struct work_struct *work) +static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, - rep_disconnect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; - xprt_force_disconnect(&r_xprt->rx_xprt); + /* Flush Receives, then wait for deferred Reply work + * to complete. + */ + ib_drain_qp(ia->ri_id->qp); + drain_workqueue(buf->rb_completion_wq); + + /* Deferred Reply processing might have scheduled + * local invalidations. + */ + ib_drain_sq(ia->ri_id->qp); } /** @@ -143,15 +115,6 @@ rpcrdma_qp_event_handler(struct ib_event *event, void *context) rx_ep); trace_xprtrdma_qp_event(r_xprt, event); - pr_err("rpcrdma: %s on device %s connected to %s:%s\n", - ib_event_msg(event->event), event->device->name, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); - - if (ep->rep_connected == 1) { - ep->rep_connected = -EIO; - schedule_delayed_work(&ep->rep_disconnect_worker, 0); - wake_up_all(&ep->rep_connect_wait); - } } /** @@ -189,11 +152,13 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, rr_cqe); + struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; - /* WARNING: Only wr_id and status are reliable at this point */ + /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); + --r_xprt->rx_ep.rep_receive_count; if (wc->status != IB_WC_SUCCESS) - goto out_fail; + goto out_flushed; /* status == SUCCESS means all fields in wc are trustworthy */ rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); @@ -204,17 +169,16 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rdmab_addr(rep->rr_rdmabuf), wc->byte_len, DMA_FROM_DEVICE); -out_schedule: + rpcrdma_post_recvs(r_xprt, false); rpcrdma_reply_handler(rep); return; -out_fail: +out_flushed: if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: Recv: %s (%u/0x%x)\n", ib_wc_status_msg(wc->status), wc->status, wc->vendor_err); - rpcrdma_set_xdrlen(&rep->rr_hdrbuf, 0); - goto out_schedule; + rpcrdma_recv_buffer_put(rep); } static void @@ -316,7 +280,6 @@ rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) ep->rep_connected = -EAGAIN; goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ++xprt->connect_cookie; ep->rep_connected = -ECONNABORTED; disconnected: xprt_force_disconnect(xprt); @@ -326,10 +289,9 @@ disconnected: break; } - dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__, + dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_device->name, ia->ri_ops->ro_displayname, - rdma_event_msg(event->event)); + ia->ri_device->name, rdma_event_msg(event->event)); return 0; } @@ -347,22 +309,15 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); - if (IS_ERR(id)) { - rc = PTR_ERR(id); - dprintk("RPC: %s: rdma_create_id() failed %i\n", - __func__, rc); + if (IS_ERR(id)) return id; - } ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->rx_xprt.addr, 
RDMA_RESOLVE_TIMEOUT); - if (rc) { - dprintk("RPC: %s: rdma_resolve_addr() failed %i\n", - __func__, rc); + if (rc) goto out; - } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { trace_xprtrdma_conn_tout(xprt); @@ -375,11 +330,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) ia->ri_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); - if (rc) { - dprintk("RPC: %s: rdma_resolve_route() failed %i\n", - __func__, rc); + if (rc) goto out; - } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { trace_xprtrdma_conn_tout(xprt); @@ -429,16 +381,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt) switch (xprt_rdma_memreg_strategy) { case RPCRDMA_FRWR: - if (frwr_is_supported(ia)) { - ia->ri_ops = &rpcrdma_frwr_memreg_ops; - break; - } - /*FALLTHROUGH*/ - case RPCRDMA_MTHCAFMR: - if (fmr_is_supported(ia)) { - ia->ri_ops = &rpcrdma_fmr_memreg_ops; + if (frwr_is_supported(ia)) break; - } /*FALLTHROUGH*/ default: pr_err("rpcrdma: Device %s does not support memreg mode %d\n", @@ -481,7 +425,7 @@ rpcrdma_ia_remove(struct rpcrdma_ia *ia) * connection is already gone. */ if (ia->ri_id->qp) { - ib_drain_qp(ia->ri_id->qp); + rpcrdma_xprt_drain(r_xprt); rdma_destroy_qp(ia->ri_id); ia->ri_id->qp = NULL; } @@ -552,7 +496,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, } ia->ri_max_send_sges = max_sge; - rc = ia->ri_ops->ro_open(ia, ep, cdata); + rc = frwr_open(ia, ep, cdata); if (rc) return rc; @@ -579,16 +523,13 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); - INIT_DELAYED_WORK(&ep->rep_disconnect_worker, - rpcrdma_disconnect_worker); + ep->rep_receive_count = 0; sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, 1, IB_POLL_WORKQUEUE); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); - dprintk("RPC: %s: failed to create send CQ: %i\n", - __func__, rc); goto out1; } @@ -597,8 +538,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, 0, IB_POLL_WORKQUEUE); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); - dprintk("RPC: %s: failed to create recv CQ: %i\n", - __func__, rc); goto out2; } @@ -611,7 +550,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Prepare RDMA-CM private message */ pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; - pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok; + pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize); pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize); ep->rep_remote_cma.private_data = pmsg; @@ -653,8 +592,6 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - cancel_delayed_work_sync(&ep->rep_disconnect_worker); - if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); rdma_destroy_qp(ia->ri_id); @@ -740,11 +677,8 @@ rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, } err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); - if (err) { - dprintk("RPC: %s: rdma_create_qp returned %d\n", - __func__, err); + if (err) goto out_destroy; - } /* Atomically replace the transport's ID and QP. 
*/ rc = 0; @@ -775,8 +709,6 @@ retry: dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); if (rc) { - dprintk("RPC: %s: rdma_create_qp failed %i\n", - __func__, rc); rc = -ENETUNREACH; goto out_noupdate; } @@ -798,11 +730,8 @@ retry: rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); - if (rc) { - dprintk("RPC: %s: rdma_connect() failed with %i\n", - __func__, rc); + if (rc) goto out; - } wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); if (ep->rep_connected <= 0) { @@ -822,8 +751,10 @@ out_noupdate: return rc; } -/* - * rpcrdma_ep_disconnect +/** + * rpcrdma_ep_disconnect - Disconnect underlying transport + * @ep: endpoint to disconnect + * @ia: associated interface adapter * * This is separate from destroy to facilitate the ability * to reconnect without recreating the endpoint. @@ -834,19 +765,20 @@ out_noupdate: void rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { + struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, + rx_ep); int rc; + /* returns without wait if ID is not connected */ rc = rdma_disconnect(ia->ri_id); if (!rc) - /* returns without wait if not connected */ wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 1); else ep->rep_connected = rc; - trace_xprtrdma_disconnect(container_of(ep, struct rpcrdma_xprt, - rx_ep), rc); + trace_xprtrdma_disconnect(r_xprt, rc); - ib_drain_qp(ia->ri_id->qp); + rpcrdma_xprt_drain(r_xprt); } /* Fixed-size circular FIFO queue. This implementation is wait-free and @@ -1034,7 +966,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) if (!mr) break; - rc = ia->ri_ops->ro_init_mr(ia, mr); + rc = frwr_init_mr(ia, mr); if (rc) { kfree(mr); break; @@ -1089,9 +1021,9 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) req->rl_buffer = buffer; INIT_LIST_HEAD(&req->rl_registered); - spin_lock(&buffer->rb_reqslock); + spin_lock(&buffer->rb_lock); list_add(&req->rl_all, &buffer->rb_allreqs); - spin_unlock(&buffer->rb_reqslock); + spin_unlock(&buffer->rb_lock); return req; } @@ -1134,8 +1066,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt, bool temp) out_free: kfree(rep); out: - dprintk("RPC: %s: reply buffer %d alloc failed\n", - __func__, rc); return rc; } @@ -1159,7 +1089,6 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) INIT_LIST_HEAD(&buf->rb_send_bufs); INIT_LIST_HEAD(&buf->rb_allreqs); - spin_lock_init(&buf->rb_reqslock); for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; @@ -1174,13 +1103,19 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) } buf->rb_credits = 1; - buf->rb_posted_receives = 0; INIT_LIST_HEAD(&buf->rb_recv_bufs); rc = rpcrdma_sendctxs_create(r_xprt); if (rc) goto out; + buf->rb_completion_wq = alloc_workqueue("rpcrdma-%s", + WQ_MEM_RECLAIM | WQ_HIGHPRI, + 0, + r_xprt->rx_xprt.address_strings[RPC_DISPLAY_ADDR]); + if (!buf->rb_completion_wq) + goto out; + return 0; out: rpcrdma_buffer_destroy(buf); @@ -1194,9 +1129,18 @@ rpcrdma_destroy_rep(struct rpcrdma_rep *rep) kfree(rep); } +/** + * rpcrdma_req_destroy - Destroy an rpcrdma_req object + * @req: unused object to be destroyed + * + * This function assumes that the caller prevents concurrent device + * unload and transport tear-down. 
+ */ void -rpcrdma_destroy_req(struct rpcrdma_req *req) +rpcrdma_req_destroy(struct rpcrdma_req *req) { + list_del(&req->rl_all); + rpcrdma_free_regbuf(req->rl_recvbuf); rpcrdma_free_regbuf(req->rl_sendbuf); rpcrdma_free_regbuf(req->rl_rdmabuf); @@ -1208,7 +1152,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, rx_buf); - struct rpcrdma_ia *ia = rdmab_to_ia(buf); struct rpcrdma_mr *mr; unsigned int count; @@ -1224,7 +1167,7 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) if (!list_empty(&mr->mr_list)) list_del(&mr->mr_list); - ia->ri_ops->ro_release_mr(mr); + frwr_release_mr(mr); count++; spin_lock(&buf->rb_mrlock); } @@ -1234,11 +1177,24 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) dprintk("RPC: %s: released %u MRs\n", __func__, count); } +/** + * rpcrdma_buffer_destroy - Release all hw resources + * @buf: root control block for resources + * + * ORDERING: relies on a prior ib_drain_qp : + * - No more Send or Receive completions can occur + * - All MRs, reps, and reqs are returned to their free lists + */ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { cancel_delayed_work_sync(&buf->rb_refresh_worker); + if (buf->rb_completion_wq) { + destroy_workqueue(buf->rb_completion_wq); + buf->rb_completion_wq = NULL; + } + rpcrdma_sendctxs_destroy(buf); while (!list_empty(&buf->rb_recv_bufs)) { @@ -1250,19 +1206,14 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_destroy_rep(rep); } - spin_lock(&buf->rb_reqslock); - while (!list_empty(&buf->rb_allreqs)) { + while (!list_empty(&buf->rb_send_bufs)) { struct rpcrdma_req *req; - req = list_first_entry(&buf->rb_allreqs, - struct rpcrdma_req, rl_all); - list_del(&req->rl_all); - - spin_unlock(&buf->rb_reqslock); - rpcrdma_destroy_req(req); - spin_lock(&buf->rb_reqslock); + req = list_first_entry(&buf->rb_send_bufs, + struct rpcrdma_req, rl_list); + list_del(&req->rl_list); + rpcrdma_req_destroy(req); } - spin_unlock(&buf->rb_reqslock); rpcrdma_mrs_destroy(buf); } @@ -1329,9 +1280,12 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); + if (mr->mr_dir != DMA_NONE) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + mr->mr_dir = DMA_NONE; + } __rpcrdma_mr_put(&r_xprt->rx_buf, mr); } @@ -1410,7 +1364,7 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for * receiving the payload of RDMA RECV operations. During Long Calls - * or Replies they may be registered externally via ro_map. + * or Replies they may be registered externally via frwr_map. */ struct rpcrdma_regbuf * rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, @@ -1446,8 +1400,10 @@ __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) (void *)rb->rg_base, rdmab_length(rb), rb->rg_direction); - if (ib_dma_mapping_error(device, rdmab_addr(rb))) + if (ib_dma_mapping_error(device, rdmab_addr(rb))) { + trace_xprtrdma_dma_maperr(rdmab_addr(rb)); return false; + } rb->rg_device = device; rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; @@ -1479,10 +1435,14 @@ rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb) kfree(rb); } -/* - * Prepost any receive buffer, then post send. 
+/** + * rpcrdma_ep_post - Post WRs to a transport's Send Queue + * @ia: transport's device information + * @ep: transport's RDMA endpoint information + * @req: rpcrdma_req containing the Send WR to post * - * Receive buffer is donated to hardware, reclaimed upon recv completion. + * Returns 0 if the post was successful, otherwise -ENOTCONN + * is returned. */ int rpcrdma_ep_post(struct rpcrdma_ia *ia, @@ -1501,32 +1461,27 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, --ep->rep_send_count; } - rc = ia->ri_ops->ro_send(ia, req); + rc = frwr_send(ia, req); trace_xprtrdma_post_send(req, rc); if (rc) return -ENOTCONN; return 0; } -/** - * rpcrdma_post_recvs - Maybe post some Receive buffers - * @r_xprt: controlling transport - * @temp: when true, allocate temp rpcrdma_rep objects - * - */ -void +static void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; int needed, count, rc; rc = 0; count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (buf->rb_posted_receives > needed) + if (ep->rep_receive_count > needed) goto out; - needed -= buf->rb_posted_receives; + needed -= ep->rep_receive_count; count = 0; wr = NULL; @@ -1574,7 +1529,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) --count; } } - buf->rb_posted_receives += count; + ep->rep_receive_count += count; out: trace_xprtrdma_post_recvs(r_xprt, count, rc); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index a13ccb643ce0..5a18472f2c9c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -66,7 +66,6 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { - const struct rpcrdma_memreg_ops *ri_ops; struct ib_device *ri_device; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; @@ -81,8 +80,6 @@ struct rpcrdma_ia { bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; unsigned long ri_flags; - struct ib_qp_attr ri_qp_attr; - struct ib_qp_init_attr ri_qp_init_attr; }; enum { @@ -101,7 +98,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct delayed_work rep_disconnect_worker; + int rep_receive_count; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -262,20 +259,12 @@ struct rpcrdma_frwr { }; }; -struct rpcrdma_fmr { - struct ib_fmr *fm_mr; - u64 *fm_physaddrs; -}; - struct rpcrdma_mr { struct list_head mr_list; struct scatterlist *mr_sg; int mr_nents; enum dma_data_direction mr_dir; - union { - struct rpcrdma_fmr fmr; - struct rpcrdma_frwr frwr; - }; + struct rpcrdma_frwr frwr; struct rpcrdma_xprt *mr_xprt; u32 mr_handle; u32 mr_length; @@ -401,20 +390,18 @@ struct rpcrdma_buffer { spinlock_t rb_lock; /* protect buf lists */ struct list_head rb_send_bufs; struct list_head rb_recv_bufs; + struct list_head rb_allreqs; + unsigned long rb_flags; u32 rb_max_requests; u32 rb_credits; /* most recent credit grant */ - int rb_posted_receives; u32 rb_bc_srv_max_requests; - spinlock_t rb_reqslock; /* protect rb_allreqs */ - struct list_head rb_allreqs; - u32 rb_bc_max_requests; + struct workqueue_struct *rb_completion_wq; struct delayed_work rb_refresh_worker; }; -#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) /* rb_flags */ enum { @@ -465,35 +452,6 @@ struct rpcrdma_stats { }; /* - * Per-registration mode operations - */ -struct rpcrdma_xprt; -struct 
rpcrdma_memreg_ops { - struct rpcrdma_mr_seg * - (*ro_map)(struct rpcrdma_xprt *, - struct rpcrdma_mr_seg *, int, bool, - struct rpcrdma_mr **); - int (*ro_send)(struct rpcrdma_ia *ia, - struct rpcrdma_req *req); - void (*ro_reminv)(struct rpcrdma_rep *rep, - struct list_head *mrs); - void (*ro_unmap_sync)(struct rpcrdma_xprt *, - struct list_head *); - int (*ro_open)(struct rpcrdma_ia *, - struct rpcrdma_ep *, - struct rpcrdma_create_data_internal *); - size_t (*ro_maxpages)(struct rpcrdma_xprt *); - int (*ro_init_mr)(struct rpcrdma_ia *, - struct rpcrdma_mr *); - void (*ro_release_mr)(struct rpcrdma_mr *mr); - const char *ro_displayname; - const int ro_send_w_inv_ok; -}; - -extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; -extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; - -/* * RPCRDMA transport -- encapsulates the structures above for * integration with RPC. * @@ -544,10 +502,6 @@ extern unsigned int xprt_rdma_memreg_strategy; int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); void rpcrdma_ia_remove(struct rpcrdma_ia *ia); void rpcrdma_ia_close(struct rpcrdma_ia *); -bool frwr_is_supported(struct rpcrdma_ia *); -bool fmr_is_supported(struct rpcrdma_ia *); - -extern struct workqueue_struct *rpcrdma_receive_wq; /* * Endpoint calls - xprtrdma/verbs.c @@ -560,13 +514,12 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_req *); -void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* * Buffer calls - xprtrdma/verbs.c */ struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); -void rpcrdma_destroy_req(struct rpcrdma_req *); +void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); @@ -604,9 +557,6 @@ rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) return __rpcrdma_dma_map_regbuf(ia, rb); } -int rpcrdma_alloc_wq(void); -void rpcrdma_destroy_wq(void); - /* * Wrappers for chunk registration, shared by read/write chunk code. */ @@ -617,6 +567,23 @@ rpcrdma_data_dir(bool writing) return writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; } +/* Memory registration calls xprtrdma/frwr_ops.c + */ +bool frwr_is_supported(struct rpcrdma_ia *); +int frwr_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata); +int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); +void frwr_release_mr(struct rpcrdma_mr *mr); +size_t frwr_maxpages(struct rpcrdma_xprt *r_xprt); +struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_mr_seg *seg, + int nsegs, bool writing, u32 xid, + struct rpcrdma_mr **mr); +int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); +void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); +void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, + struct list_head *mrs); + /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ @@ -653,6 +620,7 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void xprt_rdma_close(struct rpc_xprt *xprt); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); @@ -661,7 +629,6 @@ void xprt_rdma_cleanup(void); */ #if defined(CONFIG_SUNRPC_BACKCHANNEL) int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); -int xprt_rdma_bc_up(struct svc_serv *, struct net *); size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *); int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f0b3700cec95..13559e6a460b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -68,8 +68,6 @@ static unsigned int xprt_max_tcp_slot_table_entries = RPC_MAX_SLOT_TABLE; static unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; static unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - #define XS_TCP_LINGER_TO (15U * HZ) static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO; @@ -159,8 +157,6 @@ static struct ctl_table sunrpc_table[] = { { }, }; -#endif - /* * Wait duration for a reply from the RPC portmapper. */ @@ -1400,17 +1396,6 @@ static void xs_tcp_force_close(struct rpc_xprt *xprt) } #if defined(CONFIG_SUNRPC_BACKCHANNEL) -static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) -{ - int ret; - - ret = svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, - SVC_SOCK_ANONYMOUS); - if (ret < 0) - return ret; - return 0; -} - static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) { return PAGE_SIZE; @@ -1600,6 +1585,7 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t /** * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport + * @xprt: controlling transport * @task: task that timed out * * Adjust the congestion window after a retransmit timeout has occurred. @@ -2257,6 +2243,7 @@ out: /** * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint + * @work: queued work item * * Invoked by a work queue tasklet. 
*/ @@ -2665,7 +2652,6 @@ static const struct rpc_xprt_ops xs_tcp_ops = { .inject_disconnect = xs_inject_disconnect, #ifdef CONFIG_SUNRPC_BACKCHANNEL .bc_setup = xprt_setup_bc, - .bc_up = xs_tcp_bc_up, .bc_maxpayload = xs_tcp_bc_maxpayload, .bc_free_rqst = xprt_free_bc_rqst, .bc_destroy = xprt_destroy_bc, @@ -3107,10 +3093,8 @@ static struct xprt_class xs_bc_tcp_transport = { */ int init_socket_xprt(void) { -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (!sunrpc_table_header) sunrpc_table_header = register_sysctl_table(sunrpc_table); -#endif xprt_register_transport(&xs_local_transport); xprt_register_transport(&xs_udp_transport); @@ -3126,12 +3110,10 @@ int init_socket_xprt(void) */ void cleanup_socket_xprt(void) { -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) if (sunrpc_table_header) { unregister_sysctl_table(sunrpc_table_header); sunrpc_table_header = NULL; } -#endif xprt_unregister_transport(&xs_local_transport); xprt_unregister_transport(&xs_udp_transport); |
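
Aside (not part of the patch above): the rpcrdma_wc_receive() and rpcrdma_post_recvs() hunks replace the old rb_posted_receives bookkeeping with ep->rep_receive_count, so the Receive queue is replenished directly from the completion handler instead of via the removed xprtrdma_receive workqueue. The sketch below is a minimal, hedged illustration of that credit-style accounting only; the types and names (struct endpoint, recv_count, post_recvs, wc_receive, target) are simplified stand-ins, not the kernel's API.

```c
/*
 * Illustrative sketch only -- not taken from the patch.
 * Each completion consumes one posted Receive; the handler tops
 * the queue back up to a target depth before handling the reply.
 */
#include <stdio.h>

struct endpoint {
	int recv_count;		/* Receives currently posted (rep_receive_count analogue) */
};

/* Post enough Receives to reach the target depth. */
static void post_recvs(struct endpoint *ep, int target)
{
	while (ep->recv_count < target) {
		/* in the kernel this is where ib_post_recv() would run */
		ep->recv_count++;
	}
}

/* One Receive completion: consume a posted WR, then replenish. */
static void wc_receive(struct endpoint *ep, int target)
{
	ep->recv_count--;	/* this WR is no longer posted */
	post_recvs(ep, target);
	/* ...reply processing would follow here... */
}

int main(void)
{
	struct endpoint ep = { .recv_count = 0 };
	int i;

	post_recvs(&ep, 4);		/* initial provisioning */
	for (i = 0; i < 3; i++)
		wc_receive(&ep, 4);	/* queue depth stays at the target */

	printf("posted receives: %d\n", ep.recv_count);	/* prints 4 */
	return 0;
}
```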