Diffstat (limited to 'net')
59 files changed, 1632 insertions, 1672 deletions
diff --git a/net/9p/client.c b/net/9p/client.c index 1d48afc7033c..fc1f3635e5dd 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1549,82 +1549,94 @@ EXPORT_SYMBOL(p9_client_unlinkat); int p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) { - struct p9_client *clnt = fid->clnt; - struct p9_req_t *req; int total = 0; *err = 0; + while (iov_iter_count(to)) { + int count; + + count = p9_client_read_once(fid, offset, to, err); + if (!count || *err) + break; + offset += count; + total += count; + } + return total; +} +EXPORT_SYMBOL(p9_client_read); + +int +p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to, + int *err) +{ + struct p9_client *clnt = fid->clnt; + struct p9_req_t *req; + int count = iov_iter_count(to); + int rsize, non_zc = 0; + char *dataptr; + + *err = 0; p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n", fid->fid, (unsigned long long) offset, (int)iov_iter_count(to)); - while (iov_iter_count(to)) { - int count = iov_iter_count(to); - int rsize, non_zc = 0; - char *dataptr; + rsize = fid->iounit; + if (!rsize || rsize > clnt->msize - P9_IOHDRSZ) + rsize = clnt->msize - P9_IOHDRSZ; - rsize = fid->iounit; - if (!rsize || rsize > clnt->msize-P9_IOHDRSZ) - rsize = clnt->msize - P9_IOHDRSZ; + if (count < rsize) + rsize = count; - if (count < rsize) - rsize = count; + /* Don't bother zerocopy for small IO (< 1024) */ + if (clnt->trans_mod->zc_request && rsize > 1024) { + /* response header len is 11 + * PDU Header(7) + IO Size (4) + */ + req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, + 0, 11, "dqd", fid->fid, + offset, rsize); + } else { + non_zc = 1; + req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, + rsize); + } + if (IS_ERR(req)) { + *err = PTR_ERR(req); + return 0; + } - /* Don't bother zerocopy for small IO (< 1024) */ - if (clnt->trans_mod->zc_request && rsize > 1024) { - /* - * response header len is 11 - * PDU Header(7) + IO Size (4) - */ - req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize, - 0, 11, "dqd", fid->fid, - offset, rsize); - } else { - non_zc = 1; - req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset, - rsize); - } - if (IS_ERR(req)) { - *err = PTR_ERR(req); - break; - } + *err = p9pdu_readf(&req->rc, clnt->proto_version, + "D", &count, &dataptr); + if (*err) { + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); + return 0; + } + if (rsize < count) { + pr_err("bogus RREAD count (%d > %d)\n", count, rsize); + count = rsize; + } - *err = p9pdu_readf(&req->rc, clnt->proto_version, - "D", &count, &dataptr); - if (*err) { - trace_9p_protocol_dump(clnt, &req->rc); - p9_tag_remove(clnt, req); - break; - } - if (rsize < count) { - pr_err("bogus RREAD count (%d > %d)\n", count, rsize); - count = rsize; - } + p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); + if (!count) { + p9_tag_remove(clnt, req); + return 0; + } - p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); - if (!count) { - p9_tag_remove(clnt, req); - break; - } + if (non_zc) { + int n = copy_to_iter(dataptr, count, to); - if (non_zc) { - int n = copy_to_iter(dataptr, count, to); - total += n; - offset += n; - if (n != count) { - *err = -EFAULT; - p9_tag_remove(clnt, req); - break; - } - } else { - iov_iter_advance(to, count); - total += count; - offset += count; + if (n != count) { + *err = -EFAULT; + p9_tag_remove(clnt, req); + return n; } - p9_tag_remove(clnt, req); + } else { + iov_iter_advance(to, count); } - return total; + p9_tag_remove(clnt, req); + return count; } 
-EXPORT_SYMBOL(p9_client_read); +EXPORT_SYMBOL(p9_client_read_once); int p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore index e97084e3eea2..f34e85ee8204 100644 --- a/net/bpfilter/.gitignore +++ b/net/bpfilter/.gitignore @@ -1 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only bpfilter_umh diff --git a/net/core/dev.c b/net/core/dev.c index 9c9e763bfe0e..df8097b8e286 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4140,7 +4140,8 @@ EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; -unsigned int __read_mostly netdev_budget_usecs = 2000; +/* Must be at least 2 jiffes to guarantee 1 jiffy timeout */ +unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5bf8d22a47ec..39d37d0ef575 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1065,11 +1065,12 @@ static void neigh_timer_handler(struct timer_list *t) neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100); } } else { /* NUD_PROBE|NUD_INCOMPLETE */ - next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME); + next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100); } if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && @@ -1125,7 +1126,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) neigh->nud_state = NUD_INCOMPLETE; neigh->updated = now; next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), - HZ/2); + HZ/100); neigh_add_timer(neigh, next); immediate_probe = true; } else { @@ -1427,7 +1428,8 @@ void __neigh_set_probe_once(struct neighbour *neigh) neigh->nud_state = NUD_INCOMPLETE; atomic_set(&neigh->probes, neigh_max_probes(neigh)); neigh_add_timer(neigh, - jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME)); + jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), + HZ/100)); } EXPORT_SYMBOL(__neigh_set_probe_once); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index cf0215734ceb..4773ad6ec111 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -80,7 +80,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned long new; - int ret = -EINVAL; + int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; diff --git a/net/core/sock.c b/net/core/sock.c index 0510826bf860..90509c37d291 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -574,7 +574,7 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) /* Sorry... 
*/ ret = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_RAW)) + if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out; ret = -EINVAL; diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 3e1a90669006..ad53eb31d40f 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -302,7 +302,7 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m) * - the key's semaphore is read-locked */ static long dns_resolver_read(const struct key *key, - char __user *buffer, size_t buflen) + char *buffer, size_t buflen) { int err = PTR_ERR(key->payload.data[dns_key_error]); diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 5465a395da04..1decb25f6764 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -69,10 +69,16 @@ static int hsr_newlink(struct net *src_net, struct net_device *dev, else multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]); - if (!data[IFLA_HSR_VERSION]) + if (!data[IFLA_HSR_VERSION]) { hsr_version = 0; - else + } else { hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]); + if (hsr_version > 1) { + NL_SET_ERR_MSG_MOD(extack, + "Only versions 0..1 are supported"); + return -EINVAL; + } + } return hsr_dev_finalize(dev, link, multicast_spec, hsr_version, extack); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 30fa42f5997d..c0dd561aa190 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -614,12 +614,15 @@ struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix, return NULL; } -static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) +static int ip_mc_autojoin_config(struct net *net, bool join, + const struct in_ifaddr *ifa) { +#if defined(CONFIG_IP_MULTICAST) struct ip_mreqn mreq = { .imr_multiaddr.s_addr = ifa->ifa_address, .imr_ifindex = ifa->ifa_dev->dev->ifindex, }; + struct sock *sk = net->ipv4.mc_autojoin_sk; int ret; ASSERT_RTNL(); @@ -632,6 +635,9 @@ static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) release_sock(sk); return ret; +#else + return -EOPNOTSUPP; +#endif } static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -675,7 +681,7 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, continue; if (ipv4_is_multicast(ifa->ifa_address)) - ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa); + ip_mc_autojoin_config(net, false, ifa); __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid); return 0; } @@ -940,8 +946,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, */ set_ifa_lifetime(ifa, valid_lft, prefered_lft); if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) { - int ret = ip_mc_config(net->ipv4.mc_autojoin_sk, - true, ifa); + int ret = ip_mc_autojoin_config(net, true, ifa); if (ret < 0) { inet_free_ifa(ifa); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 84a28b539c43..24e319dfb510 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1357,7 +1357,7 @@ retry: regen_advance = idev->cnf.regen_max_retry * idev->cnf.dad_transmits * - NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ; + max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; /* recalculate max_desync_factor each time and update * idev->desync_factor if it's larger @@ -4121,7 +4121,8 @@ static void addrconf_dad_work(struct work_struct *w) ifp->dad_probes--; addrconf_mod_dad_work(ifp, - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME)); + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), + HZ/100)); spin_unlock(&ifp->lock); 
write_unlock_bh(&idev->lock); @@ -4527,7 +4528,7 @@ restart: !(ifp->flags&IFA_F_TENTATIVE)) { unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * ifp->idev->cnf.dad_transmits * - NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ; + max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; if (age >= ifp->prefered_lft - regen_advance) { struct inet6_ifaddr *ifpub = ifp->ifpub; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 2688f3e82165..fc5000370030 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -229,6 +229,25 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, return res; } +static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type, + struct flowi6 *fl6) +{ + struct net *net = sock_net(sk); + struct dst_entry *dst; + bool res = false; + + dst = ip6_route_output(net, sk, fl6); + if (!dst->error) { + struct rt6_info *rt = (struct rt6_info *)dst; + struct in6_addr prefsrc; + + rt6_get_prefsrc(rt, &prefsrc); + res = !ipv6_addr_any(&prefsrc); + } + dst_release(dst); + return res; +} + /* * an inline helper for the "simple" if statement below * checks if parameter problem report is caused by an @@ -527,7 +546,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, saddr = force_saddr; if (saddr) { fl6.saddr = *saddr; - } else { + } else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) { /* select a more meaningful saddr from input if */ struct net_device *in_netdev; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 6ffa153e5166..1ecd4e9b0bdf 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1359,8 +1359,8 @@ skip_defrtr: if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { rtime = (rtime*HZ)/1000; - if (rtime < HZ/10) - rtime = HZ/10; + if (rtime < HZ/100) + rtime = HZ/100; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c index dc4f20e23bf7..d38b476fc7f2 100644 --- a/net/ipv6/rpl.c +++ b/net/ipv6/rpl.c @@ -48,7 +48,7 @@ void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr, outhdr->cmpri = 0; outhdr->cmpre = 0; - for (i = 0; i <= n; i++) + for (i = 0; i < n; i++) ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr, ipv6_rpl_segdata_pos(inhdr, i), inhdr->cmpri); @@ -66,7 +66,7 @@ static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr int i; for (plen = 0; plen < sizeof(*daddr); plen++) { - for (i = 0; i <= n; i++) { + for (i = 0; i < n; i++) { if (daddr->s6_addr[plen] != inhdr->rpl_segaddr[i].s6_addr[plen]) return plen; @@ -114,7 +114,7 @@ void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr, outhdr->cmpri = cmpri; outhdr->cmpre = cmpre; - for (i = 0; i <= n; i++) + for (i = 0; i < n; i++) ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i), &inhdr->rpl_segaddr[i], cmpri); diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index f5a9bdc4980c..ebb381c3f1b9 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -920,51 +920,51 @@ static const struct genl_ops l2tp_nl_ops[] = { .cmd = L2TP_CMD_TUNNEL_CREATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_create, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_delete, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_MODIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, 
.doit = l2tp_nl_cmd_tunnel_modify, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_TUNNEL_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_tunnel_get, .dumpit = l2tp_nl_cmd_tunnel_dump, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_CREATE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_create, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_DELETE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_delete, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_MODIFY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_modify, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = L2TP_CMD_SESSION_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = l2tp_nl_cmd_session_get, .dumpit = l2tp_nl_cmd_session_dump, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, }; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index bd220ee4aac9..faf57585b892 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -4,6 +4,8 @@ * Copyright (c) 2017 - 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include <linux/kernel.h> #include <net/tcp.h> #include <net/mptcp.h> diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 064639f72487..977d9c8b1453 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -3,6 +3,8 @@ * * Copyright (c) 2019, Intel Corporation. */ +#define pr_fmt(fmt) "MPTCP: " fmt + #include <linux/kernel.h> #include <net/tcp.h> #include <net/mptcp.h> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a0ce7f324499..86d61ab34c7c 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -4,6 +4,8 @@ * Copyright (c) 2020, Red Hat, Inc. 
*/ +#define pr_fmt(fmt) "MPTCP: " fmt + #include <linux/inet.h> #include <linux/kernel.h> #include <net/tcp.h> diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 8dd17589217d..340cb955af25 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -86,7 +86,8 @@ find_set_type(const char *name, u8 family, u8 revision) { struct ip_set_type *type; - list_for_each_entry_rcu(type, &ip_set_type_list, list) + list_for_each_entry_rcu(type, &ip_set_type_list, list, + lockdep_is_held(&ip_set_type_mutex)) if (STRNCMP(type->name, name) && (type->family == family || type->family == NFPROTO_UNSPEC) && diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index d0ab5ffa1e2c..9adfbc7e8ae7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3542,6 +3542,7 @@ cont: continue; if (!strcmp(set->name, i->name)) { kfree(set->name); + set->name = NULL; return -ENFILE; } } @@ -3961,8 +3962,8 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT | NFT_SET_INTERVAL | NFT_SET_TIMEOUT | NFT_SET_MAP | NFT_SET_EVAL | - NFT_SET_OBJECT)) - return -EINVAL; + NFT_SET_OBJECT | NFT_SET_CONCAT)) + return -EOPNOTSUPP; /* Only one of these operations is supported */ if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) == (NFT_SET_MAP | NFT_SET_OBJECT)) @@ -4000,7 +4001,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE])); if (objtype == NFT_OBJECT_UNSPEC || objtype > NFT_OBJECT_MAX) - return -EINVAL; + return -EOPNOTSUPP; } else if (flags & NFT_SET_OBJECT) return -EINVAL; else diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 1e70359d633c..f1363b8aabba 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -29,7 +29,7 @@ void nft_lookup_eval(const struct nft_expr *expr, { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; - const struct nft_set_ext *ext; + const struct nft_set_ext *ext = NULL; bool found; found = set->ops->lookup(nft_net(pkt), set, ®s->data[priv->sreg], @@ -39,11 +39,13 @@ void nft_lookup_eval(const struct nft_expr *expr, return; } - if (set->flags & NFT_SET_MAP) - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), set->dlen); + if (ext) { + if (set->flags & NFT_SET_MAP) + nft_data_copy(®s->data[priv->dreg], + nft_set_ext_data(ext), set->dlen); - nft_set_elem_update_expr(ext, regs, pkt); + nft_set_elem_update_expr(ext, regs, pkt); + } } static const struct nla_policy nft_lookup_policy[NFTA_LOOKUP_MAX + 1] = { diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 32f0fc8be3a4..2a81ea421819 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -81,7 +81,6 @@ static bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, u32 idx, off; nft_bitmap_location(set, key, &idx, &off); - *ext = NULL; return nft_bitmap_active(priv->bitmap, idx, off, genmask); } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 3a5552e14f75..3ffef454d469 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -218,27 +218,26 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, /* Detect overlaps as we descend the tree. Set the flag in these cases: * - * a1. |__ _ _? >|__ _ _ (insert start after existing start) - * a2. 
_ _ __>| ?_ _ __| (insert end before existing end) - * a3. _ _ ___| ?_ _ _>| (insert end after existing end) - * a4. >|__ _ _ _ _ __| (insert start before existing end) + * a1. _ _ __>| ?_ _ __| (insert end before existing end) + * a2. _ _ ___| ?_ _ _>| (insert end after existing end) + * a3. _ _ ___? >|_ _ __| (insert start before existing end) * * and clear it later on, as we eventually reach the points indicated by * '?' above, in the cases described below. We'll always meet these * later, locally, due to tree ordering, and overlaps for the intervals * that are the closest together are always evaluated last. * - * b1. |__ _ _! >|__ _ _ (insert start after existing end) - * b2. _ _ __>| !_ _ __| (insert end before existing start) - * b3. !_____>| (insert end after existing start) + * b1. _ _ __>| !_ _ __| (insert end before existing start) + * b2. _ _ ___| !_ _ _>| (insert end after existing start) + * b3. _ _ ___! >|_ _ __| (insert start after existing end) * - * Case a4. resolves to b1.: + * Case a3. resolves to b3.: * - if the inserted start element is the leftmost, because the '0' * element in the tree serves as end element * - otherwise, if an existing end is found. Note that end elements are * always inserted after corresponding start elements. * - * For a new, rightmost pair of elements, we'll hit cases b1. and b3., + * For a new, rightmost pair of elements, we'll hit cases b3. and b2., * in that order. * * The flag is also cleared in two special cases: @@ -262,9 +261,9 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, p = &parent->rb_left; if (nft_rbtree_interval_start(new)) { - overlap = nft_rbtree_interval_start(rbe) && - nft_set_elem_active(&rbe->ext, - genmask); + if (nft_rbtree_interval_end(rbe) && + nft_set_elem_active(&rbe->ext, genmask)) + overlap = false; } else { overlap = nft_rbtree_interval_end(rbe) && nft_set_elem_active(&rbe->ext, diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index 75bd0e5dd312..7b2f359bfce4 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -346,6 +346,9 @@ static int idletimer_tg_checkentry_v1(const struct xt_tgchk_param *par) pr_debug("checkentry targinfo%s\n", info->label); + if (info->send_nl_msg) + return -EOPNOTSUPP; + ret = idletimer_tg_helper((struct idletimer_tg_info *)info); if(ret < 0) { diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index fd8a01ca7a2d..2398d7238300 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -462,12 +462,14 @@ static void flow_table_copy_flows(struct table_instance *old, struct hlist_head *head = &old->buckets[i]; if (ufid) - hlist_for_each_entry(flow, head, - ufid_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + ufid_table.node[old_ver], + lockdep_ovsl_is_held()) ufid_table_instance_insert(new, flow); else - hlist_for_each_entry(flow, head, - flow_table.node[old_ver]) + hlist_for_each_entry_rcu(flow, head, + flow_table.node[old_ver], + lockdep_ovsl_is_held()) table_instance_insert(new, flow); } diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index e22092e4a783..7ed31b5e77e4 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -906,20 +906,21 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) node = NULL; if (addr->sq_node == QRTR_NODE_BCAST) { - enqueue_fn = qrtr_bcast_enqueue; - if (addr->sq_port != QRTR_PORT_CTRL) { + if (addr->sq_port != QRTR_PORT_CTRL && + qrtr_local_nid != QRTR_NODE_BCAST) { release_sock(sk); return 
-ENOTCONN; } + enqueue_fn = qrtr_bcast_enqueue; } else if (addr->sq_node == ipc->us.sq_node) { enqueue_fn = qrtr_local_enqueue; } else { - enqueue_fn = qrtr_node_enqueue; node = qrtr_node_lookup(addr->sq_node); if (!node) { release_sock(sk); return -ECONNRESET; } + enqueue_fn = qrtr_node_enqueue; } plen = (len + 3) & ~3; diff --git a/net/rds/message.c b/net/rds/message.c index 50f13f1d4ae0..bbecb8cb873e 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2006 Oracle. All rights reserved. + * Copyright (c) 2006, 2020 Oracle and/or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -162,12 +162,12 @@ static void rds_message_purge(struct rds_message *rm) if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); if (rm->rdma.op_rdma_mr) - rds_mr_put(rm->rdma.op_rdma_mr); + kref_put(&rm->rdma.op_rdma_mr->r_kref, __rds_put_mr_final); if (rm->atomic.op_active) rds_atomic_free_op(&rm->atomic); if (rm->atomic.op_rdma_mr) - rds_mr_put(rm->atomic.op_rdma_mr); + kref_put(&rm->atomic.op_rdma_mr->r_kref, __rds_put_mr_final); } void rds_message_put(struct rds_message *rm) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 585e6b3b69ce..113e442101ce 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2020 Oracle and/or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -84,7 +84,7 @@ static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, if (insert) { rb_link_node(&insert->r_rb_node, parent, p); rb_insert_color(&insert->r_rb_node, root); - refcount_inc(&insert->r_refcount); + kref_get(&insert->r_kref); } return NULL; } @@ -99,10 +99,7 @@ static void rds_destroy_mr(struct rds_mr *mr) unsigned long flags; rdsdebug("RDS: destroy mr key is %x refcnt %u\n", - mr->r_key, refcount_read(&mr->r_refcount)); - - if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state)) - return; + mr->r_key, kref_read(&mr->r_kref)); spin_lock_irqsave(&rs->rs_rdma_lock, flags); if (!RB_EMPTY_NODE(&mr->r_rb_node)) @@ -115,8 +112,10 @@ static void rds_destroy_mr(struct rds_mr *mr) mr->r_trans->free_mr(trans_private, mr->r_invalidate); } -void __rds_put_mr_final(struct rds_mr *mr) +void __rds_put_mr_final(struct kref *kref) { + struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref); + rds_destroy_mr(mr); kfree(mr); } @@ -140,8 +139,7 @@ void rds_rdma_drop_keys(struct rds_sock *rs) rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); - rds_destroy_mr(mr); - rds_mr_put(mr); + kref_put(&mr->r_kref, __rds_put_mr_final); spin_lock_irqsave(&rs->rs_rdma_lock, flags); } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); @@ -242,7 +240,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } - refcount_set(&mr->r_refcount, 1); + kref_init(&mr->r_kref); RB_CLEAR_NODE(&mr->r_rb_node); mr->r_trans = rs->rs_transport; mr->r_sock = rs; @@ -343,7 +341,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, rdsdebug("RDS: get_mr key is %x\n", mr->r_key); if (mr_ret) { - refcount_inc(&mr->r_refcount); + kref_get(&mr->r_kref); *mr_ret = mr; } @@ -351,7 +349,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, out: 
kfree(pages); if (mr) - rds_mr_put(mr); + kref_put(&mr->r_kref, __rds_put_mr_final); return ret; } @@ -434,13 +432,7 @@ int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen) if (!mr) return -EINVAL; - /* - * call rds_destroy_mr() ourselves so that we're sure it's done by the time - * we return. If we let rds_mr_put() do it it might not happen until - * someone else drops their ref. - */ - rds_destroy_mr(mr); - rds_mr_put(mr); + kref_put(&mr->r_kref, __rds_put_mr_final); return 0; } @@ -464,6 +456,14 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) return; } + /* Get a reference so that the MR won't go away before calling + * sync_mr() below. + */ + kref_get(&mr->r_kref); + + /* If it is going to be freed, remove it from the tree now so + * that no other thread can find it and free it. + */ if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); @@ -477,12 +477,13 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + /* Release the reference held above. */ + kref_put(&mr->r_kref, __rds_put_mr_final); + /* If the MR was marked as invalidate, this will * trigger an async flush. */ - if (zot_me) { - rds_destroy_mr(mr); - rds_mr_put(mr); - } + if (zot_me) + kref_put(&mr->r_kref, __rds_put_mr_final); } void rds_rdma_free_op(struct rm_rdma_op *ro) @@ -490,7 +491,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) unsigned int i; if (ro->op_odp_mr) { - rds_mr_put(ro->op_odp_mr); + kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final); } else { for (i = 0; i < ro->op_nents; i++) { struct page *page = sg_page(&ro->op_sg[i]); @@ -730,7 +731,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out_pages; } RB_CLEAR_NODE(&local_odp_mr->r_rb_node); - refcount_set(&local_odp_mr->r_refcount, 1); + kref_init(&local_odp_mr->r_kref); local_odp_mr->r_trans = rs->rs_transport; local_odp_mr->r_sock = rs; local_odp_mr->r_trans_private = @@ -827,7 +828,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, if (!mr) err = -EINVAL; /* invalid r_key */ else - refcount_inc(&mr->r_refcount); + kref_get(&mr->r_kref); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { diff --git a/net/rds/rds.h b/net/rds/rds.h index e4a603523083..8e18cd2aec51 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -291,7 +291,7 @@ struct rds_incoming { struct rds_mr { struct rb_node r_rb_node; - refcount_t r_refcount; + struct kref r_kref; u32 r_key; /* A copy of the creation flags */ @@ -299,19 +299,11 @@ struct rds_mr { unsigned int r_invalidate:1; unsigned int r_write:1; - /* This is for RDS_MR_DEAD. - * It would be nice & consistent to make this part of the above - * bit field here, but we need to use test_and_set_bit. 
- */ - unsigned long r_state; struct rds_sock *r_sock; /* back pointer to the socket that owns us */ struct rds_transport *r_trans; void *r_trans_private; }; -/* Flags for mr->r_state */ -#define RDS_MR_DEAD 0 - static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) { return r_key | (((u64) offset) << 32); @@ -946,12 +938,7 @@ void rds_atomic_send_complete(struct rds_message *rm, int wc_status); int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); -void __rds_put_mr_final(struct rds_mr *mr); -static inline void rds_mr_put(struct rds_mr *mr) -{ - if (refcount_dec_and_test(&mr->r_refcount)) - __rds_put_mr_final(mr); -} +void __rds_put_mr_final(struct kref *kref); static inline bool rds_destroy_pending(struct rds_connection *conn) { diff --git a/net/rxrpc/key.c b/net/rxrpc/key.c index 6c3f35fac42d..0c98313dd7a8 100644 --- a/net/rxrpc/key.c +++ b/net/rxrpc/key.c @@ -31,7 +31,7 @@ static void rxrpc_free_preparse_s(struct key_preparsed_payload *); static void rxrpc_destroy(struct key *); static void rxrpc_destroy_s(struct key *); static void rxrpc_describe(const struct key *, struct seq_file *); -static long rxrpc_read(const struct key *, char __user *, size_t); +static long rxrpc_read(const struct key *, char *, size_t); /* * rxrpc defined keys take an arbitrary string as the description and an @@ -1042,12 +1042,12 @@ EXPORT_SYMBOL(rxrpc_get_null_key); * - this returns the result in XDR form */ static long rxrpc_read(const struct key *key, - char __user *buffer, size_t buflen) + char *buffer, size_t buflen) { const struct rxrpc_key_token *token; const struct krb5_principal *princ; size_t size; - __be32 __user *xdr, *oldxdr; + __be32 *xdr, *oldxdr; u32 cnlen, toksize, ntoks, tok, zero; u16 toksizes[AFSTOKEN_MAX]; int loop; @@ -1124,30 +1124,25 @@ static long rxrpc_read(const struct key *key, if (!buffer || buflen < size) return size; - xdr = (__be32 __user *) buffer; + xdr = (__be32 *)buffer; zero = 0; #define ENCODE(x) \ do { \ - __be32 y = htonl(x); \ - if (put_user(y, xdr++) < 0) \ - goto fault; \ + *xdr++ = htonl(x); \ } while(0) #define ENCODE_DATA(l, s) \ do { \ u32 _l = (l); \ ENCODE(l); \ - if (copy_to_user(xdr, (s), _l) != 0) \ - goto fault; \ - if (_l & 3 && \ - copy_to_user((u8 __user *)xdr + _l, &zero, 4 - (_l & 3)) != 0) \ - goto fault; \ + memcpy(xdr, (s), _l); \ + if (_l & 3) \ + memcpy((u8 *)xdr + _l, &zero, 4 - (_l & 3)); \ xdr += (_l + 3) >> 2; \ } while(0) #define ENCODE64(x) \ do { \ __be64 y = cpu_to_be64(x); \ - if (copy_to_user(xdr, &y, 8) != 0) \ - goto fault; \ + memcpy(xdr, &y, 8); \ xdr += 8 >> 2; \ } while(0) #define ENCODE_STR(s) \ @@ -1238,8 +1233,4 @@ static long rxrpc_read(const struct key *key, ASSERTCMP((char __user *) xdr - buffer, ==, size); _leave(" = %zu", size); return size; - -fault: - _leave(" = -EFAULT"); - return -EFAULT; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index f6a3b969ead0..55bd1429678f 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1667,6 +1667,7 @@ int tcf_classify_ingress(struct sk_buff *skb, skb_ext_del(skb, TC_SKB_EXT); tp = rcu_dereference_bh(fchain->filter_chain); + last_executed_chain = fchain->index; } ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 065345832a69..61e95029c18f 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -151,6 +151,7 @@ static int tcindex_init(struct tcf_proto *tp) p->mask = 0xffff; p->hash = DEFAULT_HASH_SIZE; p->fall_through = 1; 
+ refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */ rcu_assign_pointer(tp->root, p); return 0; diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 24ca861815b1..25fbd8d9de74 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -20,6 +20,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/auth_gss.h> +#include <linux/sunrpc/gss_krb5.h> #include <linux/sunrpc/svcauth_gss.h> #include <linux/sunrpc/gss_err.h> #include <linux/workqueue.h> @@ -1050,7 +1051,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) goto err_put_mech; auth = &gss_auth->rpc_auth; auth->au_cslack = GSS_CRED_SLACK >> 2; - auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2; auth->au_verfsize = GSS_VERF_SLACK >> 2; auth->au_ralign = GSS_VERF_SLACK >> 2; auth->au_flags = 0; @@ -1724,8 +1725,9 @@ bad_mic: goto out; } -static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, - struct rpc_task *task, struct xdr_stream *xdr) +static noinline_for_stack int +gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf; @@ -1816,8 +1818,9 @@ out: return -EAGAIN; } -static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, - struct rpc_task *task, struct xdr_stream *xdr) +static noinline_for_stack int +gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_task *task, struct xdr_stream *xdr) { struct rpc_rqst *rqstp = task->tk_rqstp; struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; @@ -1877,7 +1880,7 @@ static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, else iov = snd_buf->head; p = iov->iov_base + iov->iov_len; - pad = 3 - ((snd_buf->len - offset - 1) & 3); + pad = xdr_pad_size(snd_buf->len - offset); memset(p, 0, pad); iov->iov_len += pad; snd_buf->len += pad; @@ -1934,35 +1937,69 @@ gss_unwrap_resp_auth(struct rpc_cred *cred) return 0; } -static int +/* + * RFC 2203, Section 5.3.2.2 + * + * struct rpc_gss_integ_data { + * opaque databody_integ<>; + * opaque checksum<>; + * }; + * + * struct rpc_gss_data_t { + * unsigned int seq_num; + * proc_req_arg_t arg; + * }; + */ +static noinline_for_stack int gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) { - struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf; - u32 data_offset, mic_offset, integ_len, maj_stat; + struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf; struct rpc_auth *auth = cred->cr_auth; + u32 len, offset, seqno, maj_stat; struct xdr_netobj mic; - __be32 *p; + int ret; - p = xdr_inline_decode(xdr, 2 * sizeof(*p)); - if (unlikely(!p)) + ret = -EIO; + mic.data = NULL; + + /* opaque databody_integ<>; */ + if (xdr_stream_decode_u32(xdr, &len)) goto unwrap_failed; - integ_len = be32_to_cpup(p++); - if (integ_len & 3) + if (len & 3) goto unwrap_failed; - data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base; - mic_offset = integ_len + data_offset; - if (mic_offset > rcv_buf->len) + offset = rcv_buf->len - xdr_stream_remaining(xdr); + if (xdr_stream_decode_u32(xdr, &seqno)) goto unwrap_failed; - if (be32_to_cpup(p) != rqstp->rq_seqno) + if (seqno != rqstp->rq_seqno) goto bad_seqno; + if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len)) + goto unwrap_failed; 
- if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len)) + /* + * The xdr_stream now points to the beginning of the + * upper layer payload, to be passed below to + * rpcauth_unwrap_resp_decode(). The checksum, which + * follows the upper layer payload in @rcv_buf, is + * located and parsed without updating the xdr_stream. + */ + + /* opaque checksum<>; */ + offset += len; + if (xdr_decode_word(rcv_buf, offset, &len)) + goto unwrap_failed; + offset += sizeof(__be32); + if (offset + len > rcv_buf->len) goto unwrap_failed; - if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset)) + mic.len = len; + mic.data = kmalloc(len, GFP_NOFS); + if (!mic.data) + goto unwrap_failed; + if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len)) goto unwrap_failed; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat != GSS_S_COMPLETE) @@ -1970,19 +2007,24 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred, auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len); auth->au_ralign = auth->au_verfsize + 2; - return 0; + ret = 0; + +out: + kfree(mic.data); + return ret; + unwrap_failed: trace_rpcgss_unwrap_failed(task); - return -EIO; + goto out; bad_seqno: - trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p)); - return -EIO; + trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno); + goto out; bad_mic: trace_rpcgss_verify_mic(task, maj_stat); - return -EIO; + goto out; } -static int +static noinline_for_stack int gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred, struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp, struct xdr_stream *xdr) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 65b67b257302..54ae5be62f6a 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -55,10 +55,6 @@ #include "gss_rpc_upcall.h" -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) -# define RPCDBG_FACILITY RPCDBG_AUTH -#endif - /* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests * into replies. 
* @@ -184,6 +180,11 @@ static struct cache_head *rsi_alloc(void) return NULL; } +static int rsi_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void rsi_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -282,6 +283,7 @@ static const struct cache_detail rsi_cache_template = { .hash_size = RSI_HASHMAX, .name = "auth.rpcsec.init", .cache_put = rsi_put, + .cache_upcall = rsi_upcall, .cache_request = rsi_request, .cache_parse = rsi_parse, .match = rsi_match, @@ -428,6 +430,11 @@ rsc_alloc(void) return NULL; } +static int rsc_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return -EINVAL; +} + static int rsc_parse(struct cache_detail *cd, char *mesg, int mlen) { @@ -554,6 +561,7 @@ static const struct cache_detail rsc_cache_template = { .hash_size = RSC_HASHMAX, .name = "auth.rpcsec.context", .cache_put = rsc_put, + .cache_upcall = rsc_upcall, .cache_parse = rsc_parse, .match = rsc_match, .init = rsc_init, @@ -713,14 +721,12 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, } if (gc->gc_seq > MAXSEQ) { - dprintk("RPC: svcauth_gss: discarding request with " - "large sequence number %d\n", gc->gc_seq); + trace_rpcgss_svc_large_seqno(rqstp->rq_xid, gc->gc_seq); *authp = rpcsec_gsserr_ctxproblem; return SVC_DENIED; } if (!gss_check_seq_num(rsci, gc->gc_seq)) { - dprintk("RPC: svcauth_gss: discarding request with " - "old sequence number %d\n", gc->gc_seq); + trace_rpcgss_svc_old_seqno(rqstp->rq_xid, gc->gc_seq); return SVC_DROP; } return SVC_OK; @@ -961,7 +967,7 @@ unwrap_priv_data(struct svc_rqst *rqstp, struct xdr_buf *buf, u32 seq, struct gs /* XXX: This is very inefficient. It would be better to either do * this while we encrypt, or maybe in the receive code, if we can peak * ahead and work out the service and mechanism there. 
*/ - offset = buf->head[0].iov_len % 4; + offset = xdr_pad_size(buf->head[0].iov_len); if (offset) { buf->buflen = RPCSVC_MAXPAYLOAD; xdr_shift_buf(buf, offset); @@ -1245,7 +1251,6 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, if (!ud->found_creds) { /* userspace seem buggy, we should always get at least a * mapping to nobody */ - dprintk("RPC: No creds found!\n"); goto out; } else { struct timespec64 boot; @@ -1311,8 +1316,8 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, if (status) goto out; - trace_rpcgss_accept_upcall(rqstp->rq_xid, ud.major_status, - ud.minor_status); + trace_rpcgss_svc_accept_upcall(rqstp->rq_xid, ud.major_status, + ud.minor_status); switch (ud.major_status) { case GSS_S_CONTINUE_NEEDED: @@ -1320,31 +1325,23 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp, break; case GSS_S_COMPLETE: status = gss_proxy_save_rsc(sn->rsc_cache, &ud, &handle); - if (status) { - pr_info("%s: gss_proxy_save_rsc failed (%d)\n", - __func__, status); + if (status) goto out; - } cli_handle.data = (u8 *)&handle; cli_handle.len = sizeof(handle); break; default: - ret = SVC_CLOSE; goto out; } /* Got an answer to the upcall; use it: */ if (gss_write_init_verf(sn->rsc_cache, rqstp, - &cli_handle, &ud.major_status)) { - pr_info("%s: gss_write_init_verf failed\n", __func__); + &cli_handle, &ud.major_status)) goto out; - } if (gss_write_resv(resv, PAGE_SIZE, &cli_handle, &ud.out_token, - ud.major_status, ud.minor_status)) { - pr_info("%s: gss_write_resv failed\n", __func__); + ud.major_status, ud.minor_status)) goto out; - } ret = SVC_COMPLETE; out: @@ -1495,8 +1492,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) int ret; struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id); - dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n", - argv->iov_len); + trace_rpcgss_svc_accept(rqstp->rq_xid, argv->iov_len); *authp = rpc_autherr_badcred; if (!svcdata) @@ -1680,7 +1676,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) goto out; integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; integ_len = resbuf->len - integ_offset; - BUG_ON(integ_len % 4); + if (integ_len & 3) + goto out; *p++ = htonl(integ_len); *p++ = htonl(gc->gc_seq); if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, integ_len)) { @@ -1704,7 +1701,8 @@ svcauth_gss_wrap_resp_integ(struct svc_rqst *rqstp) resv->iov_len += XDR_QUADLEN(mic.len) << 2; /* not strictly required: */ resbuf->len += XDR_QUADLEN(mic.len) << 2; - BUG_ON(resv->iov_len > PAGE_SIZE); + if (resv->iov_len > PAGE_SIZE) + goto out_err; out: stat = 0; out_err: @@ -1740,9 +1738,11 @@ svcauth_gss_wrap_resp_priv(struct svc_rqst *rqstp) * both the head and tail. 
*/ if (resbuf->tail[0].iov_base) { - BUG_ON(resbuf->tail[0].iov_base >= resbuf->head[0].iov_base - + PAGE_SIZE); - BUG_ON(resbuf->tail[0].iov_base < resbuf->head[0].iov_base); + if (resbuf->tail[0].iov_base >= + resbuf->head[0].iov_base + PAGE_SIZE) + return -EINVAL; + if (resbuf->tail[0].iov_base < resbuf->head[0].iov_base) + return -EINVAL; if (resbuf->tail[0].iov_len + resbuf->head[0].iov_len + 2 * RPC_MAX_AUTH_SIZE > PAGE_SIZE) return -ENOMEM; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index bd843a81afa0..af0ddd28b081 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -32,13 +32,13 @@ #include <linux/sunrpc/cache.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/rpc_pipe_fs.h> +#include <trace/events/sunrpc.h> #include "netns.h" #define RPCDBG_FACILITY RPCDBG_CACHE static bool cache_defer_req(struct cache_req *req, struct cache_head *item); static void cache_revisit_request(struct cache_head *item); -static bool cache_listeners_exist(struct cache_detail *detail); static void cache_init(struct cache_head *h, struct cache_detail *detail) { @@ -65,13 +65,14 @@ static struct cache_head *sunrpc_cache_find_rcu(struct cache_detail *detail, rcu_read_lock(); hlist_for_each_entry_rcu(tmp, head, cache_list) { - if (detail->match(tmp, key)) { - if (cache_is_expired(detail, tmp)) - continue; - tmp = cache_get_rcu(tmp); - rcu_read_unlock(); - return tmp; - } + if (!detail->match(tmp, key)) + continue; + if (test_bit(CACHE_VALID, &tmp->flags) && + cache_is_expired(detail, tmp)) + continue; + tmp = cache_get_rcu(tmp); + rcu_read_unlock(); + return tmp; } rcu_read_unlock(); return NULL; @@ -113,18 +114,21 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, spin_lock(&detail->hash_lock); /* check if entry appeared while we slept */ - hlist_for_each_entry_rcu(tmp, head, cache_list) { - if (detail->match(tmp, key)) { - if (cache_is_expired(detail, tmp)) { - sunrpc_begin_cache_remove_entry(tmp, detail); - freeme = tmp; - break; - } - cache_get(tmp); - spin_unlock(&detail->hash_lock); - cache_put(new, detail); - return tmp; + hlist_for_each_entry_rcu(tmp, head, cache_list, + lockdep_is_held(&detail->hash_lock)) { + if (!detail->match(tmp, key)) + continue; + if (test_bit(CACHE_VALID, &tmp->flags) && + cache_is_expired(detail, tmp)) { + sunrpc_begin_cache_remove_entry(tmp, detail); + trace_cache_entry_expired(detail, tmp); + freeme = tmp; + break; } + cache_get(tmp); + spin_unlock(&detail->hash_lock); + cache_put(new, detail); + return tmp; } hlist_add_head_rcu(&new->cache_list, head); @@ -174,6 +178,25 @@ static void cache_fresh_unlocked(struct cache_head *head, } } +static void cache_make_negative(struct cache_detail *detail, + struct cache_head *h) +{ + set_bit(CACHE_NEGATIVE, &h->flags); + trace_cache_entry_make_negative(detail, h); +} + +static void cache_entry_update(struct cache_detail *detail, + struct cache_head *h, + struct cache_head *new) +{ + if (!test_bit(CACHE_NEGATIVE, &new->flags)) { + detail->update(h, new); + trace_cache_entry_update(detail, h); + } else { + cache_make_negative(detail, h); + } +} + struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *new, struct cache_head *old, int hash) { @@ -186,10 +209,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, if (!test_bit(CACHE_VALID, &old->flags)) { spin_lock(&detail->hash_lock); if (!test_bit(CACHE_VALID, &old->flags)) { - if (test_bit(CACHE_NEGATIVE, &new->flags)) - set_bit(CACHE_NEGATIVE, &old->flags); - else - 
detail->update(old, new); + cache_entry_update(detail, old, new); cache_fresh_locked(old, new->expiry_time, detail); spin_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); @@ -207,10 +227,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, detail->init(tmp, old); spin_lock(&detail->hash_lock); - if (test_bit(CACHE_NEGATIVE, &new->flags)) - set_bit(CACHE_NEGATIVE, &tmp->flags); - else - detail->update(tmp, new); + cache_entry_update(detail, tmp, new); hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); @@ -224,13 +241,6 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, } EXPORT_SYMBOL_GPL(sunrpc_cache_update); -static int cache_make_upcall(struct cache_detail *cd, struct cache_head *h) -{ - if (cd->cache_upcall) - return cd->cache_upcall(cd, h); - return sunrpc_cache_pipe_upcall(cd, h); -} - static inline int cache_is_valid(struct cache_head *h) { if (!test_bit(CACHE_VALID, &h->flags)) @@ -259,7 +269,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h spin_lock(&detail->hash_lock); rv = cache_is_valid(h); if (rv == -EAGAIN) { - set_bit(CACHE_NEGATIVE, &h->flags); + cache_make_negative(detail, h); cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY, detail); rv = -ENOENT; @@ -303,17 +313,14 @@ int cache_check(struct cache_detail *detail, (h->expiry_time != 0 && age > refresh_age/2)) { dprintk("RPC: Want update, refage=%lld, age=%lld\n", refresh_age, age); - if (!test_and_set_bit(CACHE_PENDING, &h->flags)) { - switch (cache_make_upcall(detail, h)) { - case -EINVAL: - rv = try_to_negate_entry(detail, h); - break; - case -EAGAIN: - cache_fresh_unlocked(h, detail); - break; - } - } else if (!cache_listeners_exist(detail)) + switch (detail->cache_upcall(detail, h)) { + case -EINVAL: rv = try_to_negate_entry(detail, h); + break; + case -EAGAIN: + cache_fresh_unlocked(h, detail); + break; + } } if (rv == -EAGAIN) { @@ -468,6 +475,7 @@ static int cache_clean(void) continue; sunrpc_begin_cache_remove_entry(ch, current_detail); + trace_cache_entry_expired(current_detail, ch); rv = 1; break; } @@ -1195,20 +1203,12 @@ static bool cache_listeners_exist(struct cache_detail *detail) * * Each request is at most one page long. 
*/ -int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) +static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) { - char *buf; struct cache_request *crq; int ret = 0; - if (!detail->cache_request) - return -EINVAL; - - if (!cache_listeners_exist(detail)) { - warn_no_listener(detail); - return -EINVAL; - } if (test_bit(CACHE_CLEANED, &h->flags)) /* Too late to make an upcall */ return -EAGAIN; @@ -1231,6 +1231,7 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); list_add_tail(&crq->q.list, &detail->queue); + trace_cache_entry_upcall(detail, h); } else /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; @@ -1242,8 +1243,27 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) } return ret; } + +int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) +{ + if (test_and_set_bit(CACHE_PENDING, &h->flags)) + return 0; + return cache_pipe_upcall(detail, h); +} EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall); +int sunrpc_cache_pipe_upcall_timeout(struct cache_detail *detail, + struct cache_head *h) +{ + if (!cache_listeners_exist(detail)) { + warn_no_listener(detail); + trace_cache_entry_no_listener(detail, h); + return -EINVAL; + } + return sunrpc_cache_pipe_upcall(detail, h); +} +EXPORT_SYMBOL_GPL(sunrpc_cache_pipe_upcall_timeout); + /* * parse a message from user-space and pass it * to an appropriate cache diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7324b21f923e..325a0858700f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1099,8 +1099,9 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg) task->tk_msg.rpc_proc = msg->rpc_proc; task->tk_msg.rpc_argp = msg->rpc_argp; task->tk_msg.rpc_resp = msg->rpc_resp; - if (msg->rpc_cred != NULL) - task->tk_msg.rpc_cred = get_cred(msg->rpc_cred); + task->tk_msg.rpc_cred = msg->rpc_cred; + if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) + get_cred(task->tk_msg.rpc_cred); } } @@ -1126,6 +1127,9 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) task = rpc_new_task(task_setup_data); + if (!RPC_IS_ASYNC(task)) + task->tk_flags |= RPC_TASK_CRED_NOREF; + rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); @@ -2509,6 +2513,7 @@ call_decode(struct rpc_task *task) goto out; req->rq_rcv_buf.len = req->rq_private_buf.len; + trace_xprt_recvfrom(&req->rq_rcv_buf); /* Check that the softirq receive buffer is valid */ WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 55e900255b0c..7eba20a88438 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -204,10 +204,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task, unsigned char queue_priority) { - WARN_ON_ONCE(RPC_IS_QUEUED(task)); - if (RPC_IS_QUEUED(task)) - return; - INIT_LIST_HEAD(&task->u.tk_wait.timer_list); if (RPC_IS_PRIORITY(queue)) __rpc_add_wait_queue_priority(queue, task, queue_priority); @@ -382,7 +378,7 @@ static void rpc_make_runnable(struct workqueue_struct *wq, * NB: An RPC task will only receive interrupt-driven events as long * as it's on a wait queue. 
*/ -static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, +static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q, struct rpc_task *task, unsigned char queue_priority) { @@ -395,12 +391,23 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, } +static void __rpc_sleep_on_priority(struct rpc_wait_queue *q, + struct rpc_task *task, + unsigned char queue_priority) +{ + if (WARN_ON_ONCE(RPC_IS_QUEUED(task))) + return; + __rpc_do_sleep_on_priority(q, task, queue_priority); +} + static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q, struct rpc_task *task, unsigned long timeout, unsigned char queue_priority) { + if (WARN_ON_ONCE(RPC_IS_QUEUED(task))) + return; if (time_is_after_jiffies(timeout)) { - __rpc_sleep_on_priority(q, task, queue_priority); + __rpc_do_sleep_on_priority(q, task, queue_priority); __rpc_add_timer(q, task, timeout); } else task->tk_status = -ETIMEDOUT; @@ -1162,7 +1169,8 @@ static void rpc_release_resources_task(struct rpc_task *task) { xprt_release(task); if (task->tk_msg.rpc_cred) { - put_cred(task->tk_msg.rpc_cred); + if (!(task->tk_flags & RPC_TASK_CRED_NOREF)) + put_cred(task->tk_msg.rpc_cred); task->tk_msg.rpc_cred = NULL; } rpc_task_release_client(task); diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 1a864f1ed119..3fc8af8bb961 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -14,9 +14,24 @@ #include <linux/types.h> #include <linux/pagemap.h> #include <linux/udp.h> +#include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/xdr.h> #include <linux/export.h> +#include "socklib.h" + +/* + * Helper structure for copying from an sk_buff. + */ +struct xdr_skb_reader { + struct sk_buff *skb; + unsigned int offset; + size_t count; + __wsum csum; +}; + +typedef size_t (*xdr_skb_read_actor)(struct xdr_skb_reader *desc, void *to, + size_t len); /** * xdr_skb_read_bits - copy some data bits from skb to internal buffer @@ -186,3 +201,129 @@ no_checksum: return 0; } EXPORT_SYMBOL_GPL(csum_partial_copy_to_xdr); + +static inline int xprt_sendmsg(struct socket *sock, struct msghdr *msg, + size_t seek) +{ + if (seek) + iov_iter_advance(&msg->msg_iter, seek); + return sock_sendmsg(sock, msg); +} + +static int xprt_send_kvec(struct socket *sock, struct msghdr *msg, + struct kvec *vec, size_t seek) +{ + iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len); + return xprt_sendmsg(sock, msg, seek); +} + +static int xprt_send_pagedata(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, size_t base) +{ + int err; + + err = xdr_alloc_bvec(xdr, GFP_KERNEL); + if (err < 0) + return err; + + iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec, xdr_buf_pagecount(xdr), + xdr->page_len + xdr->page_base); + return xprt_sendmsg(sock, msg, base + xdr->page_base); +} + +/* Common case: + * - stream transport + * - sending from byte 0 of the message + * - the message is wholly contained in @xdr's head iovec + */ +static int xprt_send_rm_and_kvec(struct socket *sock, struct msghdr *msg, + rpc_fraghdr marker, struct kvec *vec, + size_t base) +{ + struct kvec iov[2] = { + [0] = { + .iov_base = &marker, + .iov_len = sizeof(marker) + }, + [1] = *vec, + }; + size_t len = iov[0].iov_len + iov[1].iov_len; + + iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len); + return xprt_sendmsg(sock, msg, base); +} + +/** + * xprt_sock_sendmsg - write an xdr_buf directly to a socket + * @sock: open socket to send on + * @msg: socket message metadata + * @xdr: xdr_buf containing this request + * @base: starting position in the buffer + * 
@marker: stream record marker field + * @sent_p: return the total number of bytes successfully queued for sending + * + * Return values: + * On success, returns zero and fills in @sent_p. + * %-ENOTSOCK if @sock is not a struct socket. + */ +int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, unsigned int base, + rpc_fraghdr marker, unsigned int *sent_p) +{ + unsigned int rmsize = marker ? sizeof(marker) : 0; + unsigned int remainder = rmsize + xdr->len - base; + unsigned int want; + int err = 0; + + *sent_p = 0; + + if (unlikely(!sock)) + return -ENOTSOCK; + + msg->msg_flags |= MSG_MORE; + want = xdr->head[0].iov_len + rmsize; + if (base < want) { + unsigned int len = want - base; + + remainder -= len; + if (remainder == 0) + msg->msg_flags &= ~MSG_MORE; + if (rmsize) + err = xprt_send_rm_and_kvec(sock, msg, marker, + &xdr->head[0], base); + else + err = xprt_send_kvec(sock, msg, &xdr->head[0], base); + if (remainder == 0 || err != len) + goto out; + *sent_p += err; + base = 0; + } else { + base -= want; + } + + if (base < xdr->page_len) { + unsigned int len = xdr->page_len - base; + + remainder -= len; + if (remainder == 0) + msg->msg_flags &= ~MSG_MORE; + err = xprt_send_pagedata(sock, msg, xdr, base); + if (remainder == 0 || err != len) + goto out; + *sent_p += err; + base = 0; + } else { + base -= xdr->page_len; + } + + if (base >= xdr->tail[0].iov_len) + return 0; + msg->msg_flags &= ~MSG_MORE; + err = xprt_send_kvec(sock, msg, &xdr->tail[0], base); +out: + if (err > 0) { + *sent_p += err; + err = 0; + } + return err; +} diff --git a/net/sunrpc/socklib.h b/net/sunrpc/socklib.h new file mode 100644 index 000000000000..c48114ad6f00 --- /dev/null +++ b/net/sunrpc/socklib.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 1995-1997 Olaf Kirch <okir@monad.swb.de> + * Copyright (C) 2020, Oracle. 
+ */ + +#ifndef _NET_SUNRPC_SOCKLIB_H_ +#define _NET_SUNRPC_SOCKLIB_H_ + +int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); +int xprt_sock_sendmsg(struct socket *sock, struct msghdr *msg, + struct xdr_buf *xdr, unsigned int base, + rpc_fraghdr marker, unsigned int *sent_p); + +#endif /* _NET_SUNRPC_SOCKLIB_H_ */ diff --git a/net/sunrpc/sunrpc.h b/net/sunrpc/sunrpc.h index c9bacb3c930f..47a756503d11 100644 --- a/net/sunrpc/sunrpc.h +++ b/net/sunrpc/sunrpc.h @@ -50,10 +50,6 @@ static inline int sock_is_loopback(struct sock *sk) return loopback; } -int svc_send_common(struct socket *sock, struct xdr_buf *xdr, - struct page *headpage, unsigned long headoffset, - struct page *tailpage, unsigned long tailoffset); - int rpc_clients_notifier_register(void); void rpc_clients_notifier_unregister(void); #endif /* _NET_SUNRPC_SUNRPC_H */ diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 187dd4e73d64..9ed3126600ce 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1529,10 +1529,6 @@ svc_process(struct svc_rqst *rqstp) goto out_drop; } - /* Reserve space for the record marker */ - if (rqstp->rq_prot == IPPROTO_TCP) - svc_putnl(resv, 0); - /* Returns 1 for send, 0 for drop */ if (likely(svc_process_common(rqstp, argv, resv))) return svc_send(rqstp); @@ -1637,6 +1633,22 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) EXPORT_SYMBOL_GPL(svc_max_payload); /** + * svc_encode_read_payload - mark a range of bytes as a READ payload + * @rqstp: svc_rqst to operate on + * @offset: payload's byte offset in rqstp->rq_res + * @length: size of payload, in bytes + * + * Returns zero on success, or a negative errno if a permanent + * error occurred. + */ +int svc_encode_read_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) +{ + return rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, offset, length); +} +EXPORT_SYMBOL_GPL(svc_encode_read_payload); + +/** * svc_fill_write_vector - Construct data argument for VFS write call * @rqstp: svc_rqst to operate on * @pages: list of pages containing data payload diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index de3c077733a7..e27e3532ec75 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -104,8 +104,17 @@ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) } EXPORT_SYMBOL_GPL(svc_unreg_xprt_class); -/* - * Format the transport list for printing +/** + * svc_print_xprts - Format the transport list for printing + * @buf: target buffer for formatted address + * @maxlen: length of target buffer + * + * Fills in @buf with a string containing a list of transport names, each name + * terminated with '\n'. If the buffer is too small, some entries may be + * missing, but it is guaranteed that all lines in the output buffer are + * complete. + * + * Returns positive length of the filled-in string. 
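The new svc_encode_read_payload() hook gives upper layers (an NFSD READ encoder, say) a transport-neutral way to mark which bytes of rq_res are payload, so RDMA transports can convey them as a Write chunk. A hedged sketch of a caller; the wrapper name and its arguments are hypothetical:

/* Sketch: flag the READ payload after encoding it into rq_res. */
static int encode_read_done(struct svc_rqst *rqstp, unsigned int payload_off,
                            unsigned int payload_len)
{
        /* @payload_off is the payload's byte offset in rqstp->rq_res */
        return svc_encode_read_payload(rqstp, payload_off, payload_len);
}

The socket transports stub this hook out (svc_sock_read_payload() later in this patch just returns 0), since TCP and UDP always send the payload inline.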
*/ int svc_print_xprts(char *buf, int maxlen) { @@ -118,9 +127,9 @@ int svc_print_xprts(char *buf, int maxlen) list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { int slen; - sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); - slen = strlen(tmpstr); - if (len + slen > maxlen) + slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n", + xcl->xcl_name, xcl->xcl_max_payload); + if (slen >= sizeof(tmpstr) || len + slen >= maxlen) break; len += slen; strcat(buf, tmpstr); @@ -802,6 +811,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) len = svc_deferred_recv(rqstp); else len = xprt->xpt_ops->xpo_recvfrom(rqstp); + if (len > 0) + trace_svc_recvfrom(&rqstp->rq_arg); rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); @@ -905,6 +916,7 @@ int svc_send(struct svc_rqst *rqstp) xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; + trace_svc_sendto(xb); /* Grab mutex to serialize outgoing data. */ mutex_lock(&xprt->xpt_mutex); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 04aa80a2d752..6c8f802c4261 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -148,6 +148,11 @@ static struct cache_head *ip_map_alloc(void) return NULL; } +static int ip_map_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h); +} + static void ip_map_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -467,6 +472,11 @@ static struct cache_head *unix_gid_alloc(void) return NULL; } +static int unix_gid_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall_timeout(cd, h); +} + static void unix_gid_request(struct cache_detail *cd, struct cache_head *h, char **bpp, int *blen) @@ -584,6 +594,7 @@ static const struct cache_detail unix_gid_cache_template = { .hash_size = GID_HASHMAX, .name = "auth.unix.gid", .cache_put = unix_gid_put, + .cache_upcall = unix_gid_upcall, .cache_request = unix_gid_request, .cache_parse = unix_gid_parse, .cache_show = unix_gid_show, @@ -881,6 +892,7 @@ static const struct cache_detail ip_map_cache_template = { .hash_size = IP_HASHMAX, .name = "auth.unix.ip", .cache_put = ip_map_put, + .cache_upcall = ip_map_upcall, .cache_request = ip_map_request, .cache_parse = ip_map_parse, .cache_show = ip_map_show, diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 2934dd711715..519cf9c4f8fd 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -55,6 +55,7 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/xprt.h> +#include "socklib.h" #include "sunrpc.h" #define RPCDBG_FACILITY RPCDBG_SVCXPRT @@ -174,109 +175,10 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh) } } -/* - * send routine intended to be shared by the fore- and back-channel - */ -int svc_send_common(struct socket *sock, struct xdr_buf *xdr, - struct page *headpage, unsigned long headoffset, - struct page *tailpage, unsigned long tailoffset) -{ - int result; - int size; - struct page **ppage = xdr->pages; - size_t base = xdr->page_base; - unsigned int pglen = xdr->page_len; - unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST; - int slen; - int len = 0; - - slen = xdr->len; - - /* send head */ - if (slen == xdr->head[0].iov_len) - flags = 0; - len = kernel_sendpage(sock, headpage, headoffset, - xdr->head[0].iov_len, flags); - if (len != xdr->head[0].iov_len) - goto out; - slen -= xdr->head[0].iov_len; - if (slen == 
0) - goto out; - - /* send page data */ - size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; - while (pglen > 0) { - if (slen == size) - flags = 0; - result = kernel_sendpage(sock, *ppage, base, size, flags); - if (result > 0) - len += result; - if (result != size) - goto out; - slen -= size; - pglen -= size; - size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; - base = 0; - ppage++; - } - - /* send tail */ - if (xdr->tail[0].iov_len) { - result = kernel_sendpage(sock, tailpage, tailoffset, - xdr->tail[0].iov_len, 0); - if (result > 0) - len += result; - } - -out: - return len; -} - - -/* - * Generic sendto routine - */ -static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) { - struct svc_sock *svsk = - container_of(rqstp->rq_xprt, struct svc_sock, sk_xprt); - struct socket *sock = svsk->sk_sock; - union { - struct cmsghdr hdr; - long all[SVC_PKTINFO_SPACE / sizeof(long)]; - } buffer; - struct cmsghdr *cmh = &buffer.hdr; - int len = 0; - unsigned long tailoff; - unsigned long headoff; - RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); - - if (rqstp->rq_prot == IPPROTO_UDP) { - struct msghdr msg = { - .msg_name = &rqstp->rq_addr, - .msg_namelen = rqstp->rq_addrlen, - .msg_control = cmh, - .msg_controllen = sizeof(buffer), - .msg_flags = MSG_MORE, - }; - - svc_set_cmsg_data(rqstp, cmh); - - if (sock_sendmsg(sock, &msg) < 0) - goto out; - } - - tailoff = ((unsigned long)xdr->tail[0].iov_base) & (PAGE_SIZE-1); - headoff = 0; - len = svc_send_common(sock, xdr, rqstp->rq_respages[0], headoff, - rqstp->rq_respages[0], tailoff); - -out: - dprintk("svc: socket %p sendto([%p %zu... ], %d) = %d (addr %s)\n", - svsk, xdr->head[0].iov_base, xdr->head[0].iov_len, - xdr->len, len, svc_print_addr(rqstp, buf, sizeof(buf))); - - return len; + return 0; } /* @@ -600,17 +502,43 @@ out_free: return 0; } -static int -svc_udp_sendto(struct svc_rqst *rqstp) +/** + * svc_udp_sendto - Send out a reply on a UDP socket + * @rqstp: completed svc_rqst + * + * Returns the number of bytes sent, or a negative errno. + */ +static int svc_udp_sendto(struct svc_rqst *rqstp) { - int error; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct xdr_buf *xdr = &rqstp->rq_res; + union { + struct cmsghdr hdr; + long all[SVC_PKTINFO_SPACE / sizeof(long)]; + } buffer; + struct cmsghdr *cmh = &buffer.hdr; + struct msghdr msg = { + .msg_name = &rqstp->rq_addr, + .msg_namelen = rqstp->rq_addrlen, + .msg_control = cmh, + .msg_controllen = sizeof(buffer), + }; + unsigned int uninitialized_var(sent); + int err; - error = svc_sendto(rqstp, &rqstp->rq_res); - if (error == -ECONNREFUSED) - /* ICMP error on earlier request. */ - error = svc_sendto(rqstp, &rqstp->rq_res); + svc_set_cmsg_data(rqstp, cmh); - return error; + err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + xdr_free_bvec(xdr); + if (err == -ECONNREFUSED) { + /* ICMP error on earlier request. 
*/ + err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, 0, &sent); + xdr_free_bvec(xdr); + } + if (err < 0) + return err; + return sent; } static int svc_udp_has_wspace(struct svc_xprt *xprt) @@ -653,6 +581,7 @@ static const struct svc_xprt_ops svc_udp_ops = { .xpo_create = svc_udp_create, .xpo_recvfrom = svc_udp_recvfrom, .xpo_sendto = svc_udp_sendto, + .xpo_read_payload = svc_sock_read_payload, .xpo_release_rqst = svc_release_udp_skb, .xpo_detach = svc_sock_detach, .xpo_free = svc_sock_free, @@ -1128,35 +1057,39 @@ err_noclose: return 0; /* record not complete */ } -/* - * Send out data on TCP socket. +/** + * svc_tcp_sendto - Send out a reply on a TCP socket + * @rqstp: completed svc_rqst + * + * Returns the number of bytes sent, or a negative errno. */ static int svc_tcp_sendto(struct svc_rqst *rqstp) { - struct xdr_buf *xbufp = &rqstp->rq_res; - int sent; - __be32 reclen; + struct svc_xprt *xprt = rqstp->rq_xprt; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); + struct xdr_buf *xdr = &rqstp->rq_res; + rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | + (u32)xdr->len); + struct msghdr msg = { + .msg_flags = 0, + }; + unsigned int uninitialized_var(sent); + int err; - /* Set up the first element of the reply kvec. - * Any other kvecs that may be in use have been taken - * care of by the server implementation itself. - */ - reclen = htonl(0x80000000|((xbufp->len ) - 4)); - memcpy(xbufp->head[0].iov_base, &reclen, 4); - - sent = svc_sendto(rqstp, &rqstp->rq_res); - if (sent != xbufp->len) { - printk(KERN_NOTICE - "rpc-srv/tcp: %s: %s %d when sending %d bytes " - "- shutting down socket\n", - rqstp->rq_xprt->xpt_server->sv_name, - (sent<0)?"got error":"sent only", - sent, xbufp->len); - set_bit(XPT_CLOSE, &rqstp->rq_xprt->xpt_flags); - svc_xprt_enqueue(rqstp->rq_xprt); - sent = -EAGAIN; - } + err = xprt_sock_sendmsg(svsk->sk_sock, &msg, xdr, 0, marker, &sent); + xdr_free_bvec(xdr); + if (err < 0 || sent != (xdr->len + sizeof(marker))) + goto out_close; return sent; + +out_close: + pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + xprt->xpt_server->sv_name, + (err < 0) ? "got error" : "sent", + (err < 0) ? err : sent, xdr->len); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + return -EAGAIN; } static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, @@ -1171,6 +1104,7 @@ static const struct svc_xprt_ops svc_tcp_ops = { .xpo_create = svc_tcp_create, .xpo_recvfrom = svc_tcp_recvfrom, .xpo_sendto = svc_tcp_sendto, + .xpo_read_payload = svc_sock_read_payload, .xpo_release_rqst = svc_release_skb, .xpo_detach = svc_tcp_sock_detach, .xpo_free = svc_sock_free, diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index e5497dc2475b..15b58c5144f9 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1235,61 +1235,6 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj) } EXPORT_SYMBOL_GPL(xdr_encode_word); -/** - * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf - * @buf: pointer to buffer containing a mic - * @mic: on success, returns the address of the mic - * @offset: the offset in buf where mic may be found - * - * This function may modify the xdr buf if the mic is found to be straddling - * a boundary between head, pages, and tail. On success the mic can be read - * from the address returned. There is no need to free the mic. - * - * Return: Success returns 0, otherwise an integer error. 
- */ -int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset) -{ - struct xdr_buf subbuf; - unsigned int boundary; - - if (xdr_decode_word(buf, offset, &mic->len)) - return -EFAULT; - offset += 4; - - /* Is the mic partially in the head? */ - boundary = buf->head[0].iov_len; - if (offset < boundary && (offset + mic->len) > boundary) - xdr_shift_buf(buf, boundary - offset); - - /* Is the mic partially in the pages? */ - boundary += buf->page_len; - if (offset < boundary && (offset + mic->len) > boundary) - xdr_shrink_pagelen(buf, boundary - offset); - - if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len)) - return -EFAULT; - - /* Is the mic contained entirely in the head? */ - mic->data = subbuf.head[0].iov_base; - if (subbuf.head[0].iov_len == mic->len) - return 0; - /* ..or is the mic contained entirely in the tail? */ - mic->data = subbuf.tail[0].iov_base; - if (subbuf.tail[0].iov_len == mic->len) - return 0; - - /* Find a contiguous area in @buf to hold all of @mic */ - if (mic->len > buf->buflen - buf->len) - return -ENOMEM; - if (buf->tail[0].iov_len != 0) - mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len; - else - mic->data = buf->head[0].iov_base + buf->head[0].iov_len; - __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len); - return 0; -} -EXPORT_SYMBOL_GPL(xdr_buf_read_mic); - /* Returns 0 on success, or else a negative error code. */ static int xdr_xcode_array2(struct xdr_buf *buf, unsigned int base, diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1aafe8d3f3f4..493a30a296fc 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1117,8 +1117,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", - task->tk_pid, ntohl(req->rq_xid), copied); trace_xprt_complete_rqst(xprt, req->rq_xid, copied); xprt->stat.recvs++; @@ -1462,6 +1460,7 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) */ req->rq_ntrans++; + trace_xprt_sendto(&req->rq_snd_buf); connect_cookie = xprt->connect_cookie; status = xprt->ops->send_request(req); if (status != 0) { diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 1a0ae0c61353..c92c1aac270a 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -44,10 +44,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; size_t maxmsg; - maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv); + maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv); maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE); return maxmsg - RPCRDMA_HDRLEN_MIN; } @@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (rc < 0) goto failed_marshal; - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + if (rpcrdma_post_sends(r_xprt, req)) goto drop_connection; return 0; @@ -190,7 +190,7 @@ create_req: if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS) return NULL; - size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE); + size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE); req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); if (!req) return NULL; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 
125297c9aa3e..ef997880e17a 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -52,7 +52,7 @@ /** * frwr_release_mr - Destroy one MR - * @mr: MR allocated by frwr_init_mr + * @mr: MR allocated by frwr_mr_init * */ void frwr_release_mr(struct rpcrdma_mr *mr) @@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -106,21 +106,22 @@ void frwr_reset(struct rpcrdma_req *req) } /** - * frwr_init_mr - Initialize one MR - * @ia: interface adapter + * frwr_mr_init - Initialize one MR + * @r_xprt: controlling transport instance * @mr: generic MR to prepare for FRWR * * Returns zero if successful. Otherwise a negative errno * is returned. */ -int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { - unsigned int depth = ia->ri_max_frwr_depth; + struct rpcrdma_ep *ep = r_xprt->rx_ep; + unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; int rc; - frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth); + frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; @@ -128,6 +129,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) if (!sg) goto out_list_err; + mr->mr_xprt = r_xprt; mr->frwr.fr_mr = frmr; mr->mr_dir = DMA_NONE; INIT_LIST_HEAD(&mr->mr_list); @@ -149,29 +151,24 @@ out_list_err: /** * frwr_query_device - Prepare a transport for use with FRWR - * @r_xprt: controlling transport instance + * @ep: endpoint to fill in * @device: RDMA device to query * * On success, sets: - * ep->rep_attr - * ep->rep_max_requests - * ia->ri_max_rdma_segs - * - * And these FRWR-related fields: - * ia->ri_max_frwr_depth - * ia->ri_mrtype + * ep->re_attr + * ep->re_max_requests + * ep->re_max_rdma_segs + * ep->re_max_fr_depth + * ep->re_mrtype * * Return values: * On success, returns zero. * %-EINVAL - the device does not support FRWR memory registration * %-ENOMEM - the device is not sufficiently capable for NFS/RDMA */ -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device) +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device) { const struct ib_device_attr *attrs = &device->attrs; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; int max_qp_wr, depth, delta; unsigned int max_sge; @@ -188,23 +185,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge); return -ENOMEM; } - ep->rep_attr.cap.max_send_sge = max_sge; - ep->rep_attr.cap.max_recv_sge = 1; + ep->re_attr.cap.max_send_sge = max_sge; + ep->re_attr.cap.max_recv_sge = 1; - ia->ri_mrtype = IB_MR_TYPE_MEM_REG; + ep->re_mrtype = IB_MR_TYPE_MEM_REG; if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG) - ia->ri_mrtype = IB_MR_TYPE_SG_GAPS; + ep->re_mrtype = IB_MR_TYPE_SG_GAPS; /* Quirk: Some devices advertise a large max_fast_reg_page_list_len * capability, but perform optimally when the MRs are not larger * than a page. 
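Stepping back to frwr_mr_init() above: each MR is paired with a scatterlist sized to the device's fast-register depth. A condensed sketch of that allocation pattern, with the error labels folded into direct returns (the rpcrdma_mr field names follow this file; this is a simplification, not the patch's exact body):

static int mr_init_sketch(struct ib_pd *pd, enum ib_mr_type type,
                          unsigned int depth, struct rpcrdma_mr *mr)
{
        struct scatterlist *sg;
        struct ib_mr *frmr;

        frmr = ib_alloc_mr(pd, type, depth);
        if (IS_ERR(frmr))
                return PTR_ERR(frmr);

        /* GFP_NOFS: avoid recursing into filesystem reclaim */
        sg = kcalloc(depth, sizeof(*sg), GFP_NOFS);
        if (!sg) {
                ib_dereg_mr(frmr);
                return -ENOMEM;
        }
        sg_init_table(sg, depth);

        mr->frwr.fr_mr = frmr;
        mr->mr_sg = sg;
        mr->mr_dir = DMA_NONE;
        return 0;
}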
*/ if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_frwr_depth = attrs->max_sge_rd; + ep->re_max_fr_depth = attrs->max_sge_rd; else - ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len; - if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS) - ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS; + ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len; + if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS) + ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS; /* Add room for frwr register and invalidate WRs. * 1. FRWR reg WR for head @@ -220,11 +217,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, /* Calculate N if the device max FRWR depth is smaller than * RPCRDMA_MAX_DATA_SEGS. */ - if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) { - delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth; + if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth; do { depth += 2; /* FRWR reg + invalidate */ - delta -= ia->ri_max_frwr_depth; + delta -= ep->re_max_fr_depth; } while (delta > 0); } @@ -233,34 +230,34 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt, max_qp_wr -= 1; if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE) return -ENOMEM; - if (ep->rep_max_requests > max_qp_wr) - ep->rep_max_requests = max_qp_wr; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; - if (ep->rep_attr.cap.max_send_wr > max_qp_wr) { - ep->rep_max_requests = max_qp_wr / depth; - if (!ep->rep_max_requests) + if (ep->re_max_requests > max_qp_wr) + ep->re_max_requests = max_qp_wr; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; + if (ep->re_attr.cap.max_send_wr > max_qp_wr) { + ep->re_max_requests = max_qp_wr / depth; + if (!ep->re_max_requests) return -ENOMEM; - ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth; + ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth; } - ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ - ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests; - ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; - ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ - - ia->ri_max_rdma_segs = - DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth); + ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */ + ep->re_attr.cap.max_recv_wr = ep->re_max_requests; + ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; + ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */ + + ep->re_max_rdma_segs = + DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth); /* Reply chunks require segments for head and tail buffers */ - ia->ri_max_rdma_segs += 2; - if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) - ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; + ep->re_max_rdma_segs += 2; + if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS) + ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS; /* Ensure the underlying device is capable of conveying the * largest r/wsize NFS will ask for. This guarantees that * failing over from one RDMA device to another will not * break NFS I/O. 
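To make the send-queue arithmetic above concrete, here is a worked example with assumed values (a device fast-register depth of 16 against an assumed RPCRDMA_MAX_DATA_SEGS of 64; the constant's actual value is not shown in this patch):

static unsigned int example_wr_depth(void)
{
        unsigned int max_fr_depth = 16; /* assumed device limit */
        int delta = 64 - max_fr_depth;  /* segments left over: 48 */
        unsigned int depth = 7;         /* reg+inv for head, one data
                                         * chunk, and tail, plus Send */

        do {
                depth += 2;             /* one more reg + invalidate */
                delta -= max_fr_depth;
        } while (delta > 0);            /* 48 -> 32 -> 16 -> 0 */

        return depth;                   /* 13 WRs per RPC */
}

With re_max_requests = 128, max_send_wr would then be 128 * 13 = 1664, clamped against the device's max_qp_wr as shown above.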
*/ - if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS) + if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS) return -ENOMEM; return 0; @@ -286,14 +283,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_reg_wr *reg_wr; int i, n, dma_nents; struct ib_mr *ibmr; u8 key; - if (nsegs > ia->ri_max_frwr_depth) - nsegs = ia->ri_max_frwr_depth; + if (nsegs > ep->re_max_fr_depth) + nsegs = ep->re_max_fr_depth; for (i = 0; i < nsegs;) { if (seg->mr_page) sg_set_page(&mr->mr_sg[i], @@ -306,7 +303,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, ++seg; ++i; - if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS) + if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS) continue; if ((i < nsegs && offset_in_page(seg->mr_offset)) || offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) @@ -315,7 +312,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; - dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents, + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); if (!dma_nents) goto out_dmamap_err; @@ -356,8 +353,8 @@ out_mapmr_err: /** * frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed FastReg WR * */ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) @@ -369,20 +366,25 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_fastreg(wc, frwr); /* The MR will get recycled when the associated req is retransmitted */ + + rpcrdma_flush_disconnect(cq, wc); } /** - * frwr_send - post Send WR containing the RPC Call message - * @ia: interface adapter - * @req: Prepared RPC Call + * frwr_send - post Send WRs containing the RPC Call message + * @r_xprt: controlling transport instance + * @req: prepared RPC Call * * For FRWR, chain any FastReg WRs to the Send WR. Only a * single ib_post_send call is needed to register memory * and then post the Send WR. * - * Returns the result of ib_post_send. + * Returns the return code from ib_post_send. + * + * Caller must hold the transport send lock to ensure that the + * pointers to the transport's rdma_cm_id and QP are stable. 
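frwr_send() below posts registration and Send together: every FastReg WR is linked ahead of the Send WR so that a single ib_post_send() call submits the whole chain in order. A reduced sketch of that construction (helper name simplified; field spellings follow this file):

static int post_send_chain(struct ib_qp *qp, struct ib_send_wr *send_wr,
                           struct list_head *registered)
{
        struct rpcrdma_mr *mr;
        struct ib_send_wr *first = send_wr;

        list_for_each_entry(mr, registered, mr_list) {
                mr->frwr.fr_regwr.wr.next = first;      /* link in front */
                first = &mr->frwr.fr_regwr.wr;
        }
        return ib_post_send(qp, first, NULL);
}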
*/ -int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *post_wr; struct rpcrdma_mr *mr; @@ -403,7 +405,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req) post_wr = &frwr->fr_regwr.wr; } - return ib_post_send(ia->ri_id->qp, post_wr, NULL); + return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL); } /** @@ -419,7 +421,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_mr_remoteinv(mr); + trace_xprtrdma_mr_reminv(mr); rpcrdma_mr_put(mr); break; /* only one invalidated MR per RPC */ } @@ -435,8 +437,8 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr) /** * frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) @@ -449,12 +451,14 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_li(wc, frwr); __frwr_release_mr(wc, mr); + + rpcrdma_flush_disconnect(cq, wc); } /** * frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * * Awaken anyone waiting for an MR to finish being fenced. */ @@ -469,6 +473,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) trace_xprtrdma_wc_li_wake(wc, frwr); __frwr_release_mr(wc, mr); complete(&frwr->fr_linv_done); + + rpcrdma_flush_disconnect(cq, wc); } /** @@ -526,10 +532,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. */ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); /* The final LOCAL_INV WR in the chain is supposed to * do the wake. If it was never posted, the wake will @@ -556,8 +562,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /** * frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed LocalInv WR * */ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) @@ -575,6 +581,8 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc) /* Ensure @rep is generated before __frwr_release_mr */ smp_rmb(); rpcrdma_complete_rqst(rep); + + rpcrdma_flush_disconnect(cq, wc); } /** @@ -629,10 +637,10 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Transport disconnect drains the receive CQ before it * replaces the QP. The RPC reply handler won't call us - * unless ri_id->qp is a valid pointer. + * unless re_id->qp is a valid pointer. 
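In frwr_unmap_sync() above, the final LOCAL_INV WR in the chain is the signaled one, and its completion handler (frwr_wc_localinv_wake) fires a completion the caller sleeps on. The idiom, reduced to its shape (names follow this file; the explicit reinit step is an assumption of this sketch):

        reinit_completion(&frwr->fr_linv_done);
        rc = ib_post_send(qp, first, &bad_wr);
        if (!rc)
                wait_for_completion(&frwr->fr_linv_done);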
*/ bad_wr = NULL; - rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr); + rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr); if (!rc) return; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 28020ec104d4..4a81e6995d3e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -103,21 +103,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) /** * rpcrdma_set_max_header_sizes - Initialize inline payload sizes - * @r_xprt: transport instance to initialize + * @ep: endpoint to initialize * * The max_inline fields contain the maximum size of an RPC message * so the marshaling code doesn't have to repeat this calculation * for every RPC. */ -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt) +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep) { - unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + unsigned int maxsegs = ep->re_max_rdma_segs; - ep->rep_max_inline_send = - ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs); - ep->rep_max_inline_recv = - ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs); + ep->re_max_inline_send = + ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs); + ep->re_max_inline_recv = + ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs); } /* The client can send a request inline as long as the RPCRDMA header @@ -132,9 +131,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { struct xdr_buf *xdr = &rqst->rq_snd_buf; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count, remaining, offset; - if (xdr->len > r_xprt->rx_ep.rep_max_inline_send) + if (xdr->len > ep->re_max_inline_send) return false; if (xdr->page_len) { @@ -145,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, remaining -= min_t(unsigned int, PAGE_SIZE - offset, remaining); offset = 0; - if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge) + if (++count > ep->re_attr.cap.max_send_sge) return false; } } @@ -162,7 +162,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt, static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) { - return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv; + return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv; } /* The client is required to provide a Reply chunk if the maximum @@ -176,7 +176,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt, const struct xdr_buf *buf = &rqst->rq_rcv_buf; return (buf->head[0].iov_len + buf->tail[0].iov_len) < - r_xprt->rx_ep.rep_max_inline_recv; + r_xprt->rx_ep->re_max_inline_recv; } /* Split @vec on page boundaries into SGEs. FMR registers pages, not @@ -255,7 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, /* When encoding a Read chunk, the tail iovec contains an * XDR pad and may be omitted. */ - if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup) goto out; /* When encoding a Write chunk, some servers need to see an @@ -263,7 +263,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf, * layer provides space in the tail iovec that may be used * for this purpose. 
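The SGE-counting loop in rpcrdma_args_inline() earlier in this file walks the page list one page at a time. A worked example with assumed values: a page_len of 9000 starting at page offset 3000, with 4 KiB pages, spans three pages and so needs three send SGEs.

        unsigned int remaining = 9000, offset = 3000, count = 0;

        while (remaining) {
                remaining -= min_t(unsigned int, 4096 - offset, remaining);
                offset = 0;
                count++;        /* 1096 + 4096 + 3808 bytes -> count == 3 */
        }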
*/ - if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup) + if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup) goto out; if (xdrbuf->tail[0].iov_len) @@ -275,32 +275,6 @@ out: return n; } -static inline int -encode_item_present(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, sizeof(*p)); - if (unlikely(!p)) - return -EMSGSIZE; - - *p = xdr_one; - return 0; -} - -static inline int -encode_item_not_present(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, sizeof(*p)); - if (unlikely(!p)) - return -EMSGSIZE; - - *p = xdr_zero; - return 0; -} - static void xdr_encode_rdma_segment(__be32 *iptr, struct rpcrdma_mr *mr) { @@ -414,7 +388,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, } while (nsegs); done: - return encode_item_not_present(xdr); + return xdr_stream_encode_item_absent(xdr); } /* Register and XDR encode the Write list. Supports encoding a list @@ -453,7 +427,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, if (nsegs < 0) return nsegs; - if (encode_item_present(xdr) < 0) + if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; segcount = xdr_reserve_space(xdr, sizeof(*segcount)); if (unlikely(!segcount)) @@ -480,7 +454,7 @@ static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, *segcount = cpu_to_be32(nchunks); done: - return encode_item_not_present(xdr); + return xdr_stream_encode_item_absent(xdr); } /* Register and XDR encode the Reply chunk. Supports encoding an array @@ -507,14 +481,14 @@ static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt, __be32 *segcount; if (wtype != rpcrdma_replych) - return encode_item_not_present(xdr); + return xdr_stream_encode_item_absent(xdr); seg = req->rl_segments; nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg); if (nsegs < 0) return nsegs; - if (encode_item_present(xdr) < 0) + if (xdr_stream_encode_item_present(xdr) < 0) return -EMSGSIZE; segcount = xdr_reserve_space(xdr, sizeof(*segcount)); if (unlikely(!segcount)) @@ -1476,8 +1450,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) if (credits == 0) credits = 1; /* don't deadlock */ - else if (credits > r_xprt->rx_ep.rep_max_requests) - credits = r_xprt->rx_ep.rep_max_requests; + else if (credits > r_xprt->rx_ep->re_max_requests) + credits = r_xprt->rx_ep->re_max_requests; if (buf->rb_credits != credits) rpcrdma_update_cwnd(r_xprt, credits); rpcrdma_post_recvs(r_xprt, false); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 908e78bb87c6..d510a3a15d4b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -117,7 +117,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, { int ret; - ret = svc_rdma_map_reply_msg(rdma, ctxt, &rqst->rq_snd_buf, NULL); + ret = svc_rdma_map_reply_msg(rdma, ctxt, NULL, &rqst->rq_snd_buf); if (ret < 0) return -EIO; @@ -181,7 +181,9 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) if (!ctxt) goto drop_connection; - p = ctxt->sc_xprt_buf; + p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_MIN); + if (!p) + goto put_ctxt; *p++ = rqst->rq_xid; *p++ = rpcrdma_version; *p++ = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); @@ -189,7 +191,6 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; - svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_MIN); #ifdef SVCRDMA_BACKCHANNEL_DEBUG 
pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); @@ -197,12 +198,13 @@ rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) rqst->rq_xtime = ktime_get(); rc = svc_rdma_bc_sendto(rdma, rqst, ctxt); - if (rc) { - svc_rdma_send_ctxt_put(rdma, ctxt); - goto drop_connection; - } + if (rc) + goto put_ctxt; return 0; +put_ctxt: + svc_rdma_send_ctxt_put(rdma, ctxt); + drop_connection: dprintk("svcrdma: failed to send bc call\n"); return -ENOTCONN; @@ -250,6 +252,7 @@ xprt_rdma_bc_put(struct rpc_xprt *xprt) { dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt_rdma_free_addresses(xprt); xprt_free(xprt); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 96bccd398469..54469b72b25f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -193,6 +193,7 @@ svc_rdma_recv_ctxt_get(struct svcxprt_rdma *rdma) out: ctxt->rc_page_count = 0; + ctxt->rc_read_payload_length = 0; return ctxt; out_empty: @@ -357,15 +358,14 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, arg->len = ctxt->rc_byte_len; } -/* This accommodates the largest possible Write chunk, - * in one segment. +/* This accommodates the largest possible Write chunk. */ -#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) +#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) /* This accommodates the largest possible Position-Zero - * Read chunk or Reply chunk, in one segment. + * Read chunk or Reply chunk. */ -#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) +#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) /* Sanity check the Read list. * @@ -373,7 +373,7 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, * - This implementation supports only one Read chunk. * * Sanity checks: - * - Read list does not overflow buffer. + * - Read list does not overflow Receive buffer. * - Segment size limited by largest NFS data payload. * * The segment count is limited to how many segments can @@ -381,30 +381,44 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, * buffer. That's about 40 Read segments for a 1KB inline * threshold. * - * Returns pointer to the following Write list. + * Return values: + * %true: Read list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Read list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. 
*/ -static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) +static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 position; + u32 position, len; bool first; + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + len = 0; first = true; - while (*p++ != xdr_zero) { + while (*p != xdr_zero) { + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_readseg_maxsz * sizeof(*p)); + if (!p) + return false; + if (first) { - position = be32_to_cpup(p++); + position = be32_to_cpup(p); first = false; - } else if (be32_to_cpup(p++) != position) { - return NULL; + } else if (be32_to_cpup(p) != position) { + return false; } - p++; /* handle */ - if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) - return NULL; - p += 2; /* offset */ + p += 2; + len += be32_to_cpup(p); - if (p > end) - return NULL; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; } - return p; + return len <= MAX_BYTES_SPECIAL_CHUNK; } /* The segment count is limited to how many segments can @@ -412,67 +426,100 @@ static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) * buffer. That's about 60 Write segments for a 1KB inline * threshold. */ -static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, - u32 maxlen) +static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) { - u32 i, segcount; + u32 i, segcount, total; + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + segcount = be32_to_cpup(p); - segcount = be32_to_cpup(p++); + total = 0; for (i = 0; i < segcount; i++) { - p++; /* handle */ - if (be32_to_cpup(p++) > maxlen) - return NULL; - p += 2; /* offset */ + u32 handle, length; + u64 offset; - if (p > end) - return NULL; - } + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_segment_maxsz * sizeof(*p)); + if (!p) + return false; + + handle = be32_to_cpup(p++); + length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + trace_svcrdma_decode_wseg(handle, length, offset); - return p; + total += length; + } + return total <= maxlen; } /* Sanity check the Write list. * * Implementation limits: - * - This implementation supports only one Write chunk. + * - This implementation currently supports only one Write chunk. * * Sanity checks: - * - Write list does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * Returns pointer to the following Reply chunk. + * - Write list does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Write list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Write list. + * %false: Write list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. 
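Both checkers use the same xdr_stream idiom: decode one discriminator word, then pull whole fixed-size items until the xdr_zero terminator. A distilled sketch of that loop (a generic helper, not part of the patch); because xdr_inline_decode() returns NULL once the buffer is exhausted, a truncated header cannot walk the parser off the end:

static bool walk_xdr_list(struct xdr_stream *xdr, unsigned int item_words)
{
        __be32 *p;

        p = xdr_inline_decode(xdr, sizeof(*p)); /* list discriminator */
        if (!p)
                return false;
        while (*p != xdr_zero) {
                p = xdr_inline_decode(xdr, item_words * sizeof(*p));
                if (!p)
                        return false;           /* item overruns buffer */
                p = xdr_inline_decode(xdr, sizeof(*p)); /* next discrim */
                if (!p)
                        return false;
        }
        return true;
}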
*/ -static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) +static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 chcount; + u32 chcount = 0; + __be32 *p; - chcount = 0; - while (*p++ != xdr_zero) { - p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + rctxt->rc_write_list = p; + while (*p != xdr_zero) { + if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK)) + return false; + ++chcount; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) - return NULL; - if (chcount++ > 1) - return NULL; + return false; } - return p; + if (!chcount) + rctxt->rc_write_list = NULL; + return chcount < 2; } /* Sanity check the Reply chunk. * * Sanity checks: - * - Reply chunk does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * Returns pointer to the following RPC header. + * - Reply chunk does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Reply chunk is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Reply chunk. + * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) +static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) { - if (*p++ != xdr_zero) { - p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); - if (!p) - return NULL; + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + rctxt->rc_reply_chunk = p; + if (*p != xdr_zero) { + if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK)) + return false; + } else { + rctxt->rc_reply_chunk = NULL; } - return p; + return true; } /* RPC-over-RDMA Version One private extension: Remote Invalidation. @@ -537,60 +584,61 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); } -/* On entry, xdr->head[0].iov_base points to first byte in the - * RPC-over-RDMA header. +/** + * svc_rdma_xdr_decode_req - Decode the transport header + * @rq_arg: xdr_buf containing ingress RPC/RDMA message + * @rctxt: state of decoding + * + * On entry, xdr->head[0].iov_base points to first byte of the + * RPC-over-RDMA transport header. * * On successful exit, head[0] points to first byte past the * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * * The length of the RPC-over-RDMA header is returned. * * Assumptions: * - The transport header is entirely contained in the head iovec. 
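For reference, the fixed portion that the decoder below consumes first is four XDR words (rpcrdma_fixed_maxsz). Laid out as a struct for illustration only (field names per RFC 8166; the kernel decodes them positionally rather than through such a struct):

struct rdma_hdr_fixed {
        __be32 xid;     /* mirrors the RPC XID */
        __be32 vers;    /* must be rpcrdma_version (one) */
        __be32 credit;  /* credit request/grant */
        __be32 proc;    /* rdma_msg, rdma_nomsg, or rdma_error */
};

The three chunk lists (Read, Write, Reply) follow immediately after these words, which is where the xdr_check_* helpers above take over.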
*/ -static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) +static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, + struct svc_rdma_recv_ctxt *rctxt) { - __be32 *p, *end, *rdma_argp; + __be32 *p, *rdma_argp; unsigned int hdr_len; - /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) - goto out_short; - rdma_argp = rq_arg->head[0].iov_base; - if (*(rdma_argp + 1) != rpcrdma_version) - goto out_version; + xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL); - switch (*(rdma_argp + 3)) { + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + p++; + if (*p != rpcrdma_version) + goto out_version; + p += 2; + switch (*p) { case rdma_msg: break; case rdma_nomsg: break; - case rdma_done: goto out_drop; - case rdma_error: goto out_drop; - default: goto out_proc; } - end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); - p = xdr_check_read_list(rdma_argp + 4, end); - if (!p) - goto out_inval; - p = xdr_check_write_list(p, end); - if (!p) + if (!xdr_check_read_list(rctxt)) goto out_inval; - p = xdr_check_reply_chunk(p, end); - if (!p) + if (!xdr_check_write_list(rctxt)) goto out_inval; - if (p > end) + if (!xdr_check_reply_chunk(rctxt)) goto out_inval; - rq_arg->head[0].iov_base = p; - hdr_len = (unsigned long)p - (unsigned long)rdma_argp; + rq_arg->head[0].iov_base = rctxt->rc_stream.p; + hdr_len = xdr_stream_pos(&rctxt->rc_stream); rq_arg->head[0].iov_len -= hdr_len; rq_arg->len -= hdr_len; trace_svcrdma_decode_rqst(rdma_argp, hdr_len); @@ -650,7 +698,6 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, __be32 *rdma_argp, int status) { struct svc_rdma_send_ctxt *ctxt; - unsigned int length; __be32 *p; int ret; @@ -658,29 +705,46 @@ static void svc_rdma_send_error(struct svcxprt_rdma *xprt, if (!ctxt) return; - p = ctxt->sc_xprt_buf; + p = xdr_reserve_space(&ctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (!p) + goto put_ctxt; + *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = xprt->sc_fc_credits; - *p++ = rdma_error; + *p = rdma_error; + switch (status) { case -EPROTONOSUPPORT: + p = xdr_reserve_space(&ctxt->sc_stream, 3 * sizeof(*p)); + if (!p) + goto put_ctxt; + *p++ = err_vers; *p++ = rpcrdma_version; - *p++ = rpcrdma_version; + *p = rpcrdma_version; trace_svcrdma_err_vers(*rdma_argp); break; default: - *p++ = err_chunk; + p = xdr_reserve_space(&ctxt->sc_stream, sizeof(*p)); + if (!p) + goto put_ctxt; + + *p = err_chunk; trace_svcrdma_err_chunk(*rdma_argp); } - length = (unsigned long)p - (unsigned long)ctxt->sc_xprt_buf; - svc_rdma_sync_reply_hdr(xprt, ctxt, length); + ctxt->sc_send_wr.num_sge = 1; ctxt->sc_send_wr.opcode = IB_WR_SEND; + ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len; ret = svc_rdma_send(xprt, &ctxt->sc_send_wr); if (ret) - svc_rdma_send_ctxt_put(xprt, ctxt); + goto put_ctxt; + return; + +put_ctxt: + svc_rdma_send_ctxt_put(xprt, ctxt); } /* By convention, backchannel calls arrive via rdma_msg type @@ -785,7 +849,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rqstp->rq_next_page = rqstp->rq_respages; p = (__be32 *)rqstp->rq_arg.head[0].iov_base; - ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; if (ret == 0) diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index 48fe3b16b0d9..bd7c195d872e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -41,7 +41,7 @@ struct 
svc_rdma_rw_ctxt { struct rdma_rw_ctx rw_ctx; int rw_nents; struct sg_table rw_sg_table; - struct scatterlist rw_first_sgl[0]; + struct scatterlist rw_first_sgl[]; }; static inline struct svc_rdma_rw_ctxt * @@ -439,7 +439,8 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, if (ret < 0) goto out_initerr; - trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset); + trace_svcrdma_send_wseg(seg_handle, write_len, seg_offset); + list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; if (write_len == seg_length - info->wi_seg_off) { @@ -482,18 +483,19 @@ static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, vec->iov_len); } -/* Send an xdr_buf's page list by itself. A Write chunk is - * just the page list. a Reply chunk is the head, page list, - * and tail. This function is shared between the two types - * of chunk. +/* Send an xdr_buf's page list by itself. A Write chunk is just + * the page list. A Reply chunk is @xdr's head, page list, and + * tail. This function is shared between the two types of chunk. */ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, - struct xdr_buf *xdr) + struct xdr_buf *xdr, + unsigned int offset, + unsigned long length) { info->wi_xdr = xdr; - info->wi_next_off = 0; + info->wi_next_off = offset - xdr->head[0].iov_len; return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, - xdr->page_len); + length); } /** @@ -501,6 +503,8 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, * @rdma: controlling RDMA transport * @wr_ch: Write chunk provided by client * @xdr: xdr_buf containing the data payload + * @offset: payload's byte offset in @xdr + * @length: size of payload, in bytes * * Returns a non-negative number of bytes the chunk consumed, or * %-E2BIG if the payload was larger than the Write chunk, @@ -510,19 +514,20 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, * %-EIO if rdma_rw initialization failed (DMA mapping, etc). */ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, - struct xdr_buf *xdr) + struct xdr_buf *xdr, + unsigned int offset, unsigned long length) { struct svc_rdma_write_info *info; int ret; - if (!xdr->page_len) + if (!length) return 0; info = svc_rdma_write_info_alloc(rdma, wr_ch); if (!info) return -ENOMEM; - ret = svc_rdma_send_xdr_pagelist(info, xdr); + ret = svc_rdma_send_xdr_pagelist(info, xdr, offset, length); if (ret < 0) goto out_err; @@ -530,8 +535,8 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, if (ret < 0) goto out_err; - trace_svcrdma_encode_write(xdr->page_len); - return xdr->page_len; + trace_svcrdma_send_write_chunk(xdr->page_len); + return length; out_err: svc_rdma_write_info_free(info); @@ -541,8 +546,7 @@ out_err: /** * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk * @rdma: controlling RDMA transport - * @rp_ch: Reply chunk provided by client - * @writelist: true if client provided a Write list + * @rctxt: Write and Reply chunks from client * @xdr: xdr_buf containing an RPC Reply * * Returns a non-negative number of bytes the chunk consumed, or @@ -552,13 +556,14 @@ out_err: * %-ENOTCONN if posting failed (connection is lost), * %-EIO if rdma_rw initialization failed (DMA mapping, etc). 
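Note the rebase in svc_rdma_send_xdr_pagelist() above: @offset is relative to the whole xdr_buf, while wi_next_off must be relative to the page list. A worked example with assumed sizes:

        unsigned int head_len = 120;    /* head[0].iov_len */
        unsigned int payload_off = 120; /* offset within rq_res */
        unsigned int wi_next_off = payload_off - head_len;      /* 0 */

A payload recorded immediately after the head therefore begins at byte 0 of the page list.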
*/ -int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, - bool writelist, struct xdr_buf *xdr) +int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + const struct svc_rdma_recv_ctxt *rctxt, + struct xdr_buf *xdr) { struct svc_rdma_write_info *info; int consumed, ret; - info = svc_rdma_write_info_alloc(rdma, rp_ch); + info = svc_rdma_write_info_alloc(rdma, rctxt->rc_reply_chunk); if (!info) return -ENOMEM; @@ -570,8 +575,10 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, /* Send the page list in the Reply chunk only if the * client did not provide Write chunks. */ - if (!writelist && xdr->page_len) { - ret = svc_rdma_send_xdr_pagelist(info, xdr); + if (!rctxt->rc_write_list && xdr->page_len) { + ret = svc_rdma_send_xdr_pagelist(info, xdr, + xdr->head[0].iov_len, + xdr->page_len); if (ret < 0) goto out_err; consumed += xdr->page_len; @@ -588,7 +595,7 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, if (ret < 0) goto out_err; - trace_svcrdma_encode_reply(consumed); + trace_svcrdma_send_reply_chunk(consumed); return consumed; out_err: @@ -691,7 +698,7 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, if (ret < 0) break; - trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset); + trace_svcrdma_send_rseg(rs_handle, rs_length, rs_offset); info->ri_chunklen += rs_length; } @@ -722,7 +729,7 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, if (ret < 0) goto out; - trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position); + trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position); head->rc_hdr_count = 0; @@ -778,7 +785,7 @@ static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, if (ret < 0) goto out; - trace_svcrdma_encode_pzr(info->ri_chunklen); + trace_svcrdma_send_pzr(info->ri_chunklen); head->rc_arg.len += info->ri_chunklen; head->rc_arg.buflen += info->ri_chunklen; diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f3f108090aa4..90cba3058f04 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -151,6 +151,8 @@ svc_rdma_send_ctxt_alloc(struct svcxprt_rdma *rdma) ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; ctxt->sc_cqe.done = svc_rdma_wc_send; ctxt->sc_xprt_buf = buffer; + xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, + rdma->sc_max_req_size); ctxt->sc_sges[0].addr = addr; for (i = 0; i < rdma->sc_max_send_sges; i++) @@ -204,6 +206,10 @@ struct svc_rdma_send_ctxt *svc_rdma_send_ctxt_get(struct svcxprt_rdma *rdma) spin_unlock(&rdma->sc_send_lock); out: + rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); + xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, + ctxt->sc_xprt_buf, NULL); + ctxt->sc_send_wr.num_sge = 0; ctxt->sc_cur_sge_no = 0; ctxt->sc_page_count = 0; @@ -295,6 +301,12 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) might_sleep(); + /* Sync the transport header buffer */ + ib_dma_sync_single_for_device(rdma->sc_pd->device, + wr->sg_list[0].addr, + wr->sg_list[0].length, + DMA_TO_DEVICE); + /* If the SQ is full, wait until an SQ entry is available */ while (1) { if ((atomic_dec_return(&rdma->sc_sq_avail) < 0)) { @@ -322,166 +334,173 @@ int svc_rdma_send(struct svcxprt_rdma *rdma, struct ib_send_wr *wr) return ret; } -static u32 xdr_padsize(u32 len) +/** + * svc_rdma_encode_read_list - Encode RPC Reply's Read chunk list + * @sctxt: Send context for the RPC Reply + * + * Return values: + * On success, returns length in bytes of the 
Reply XDR buffer + * that was consumed by the Reply Read list + * %-EMSGSIZE on XDR buffer overflow + */ +static ssize_t svc_rdma_encode_read_list(struct svc_rdma_send_ctxt *sctxt) { - return (len & 3) ? (4 - (len & 3)) : 0; + /* RPC-over-RDMA version 1 replies never have a Read list. */ + return xdr_stream_encode_item_absent(&sctxt->sc_stream); } -/* Returns length of transport header, in bytes. +/** + * svc_rdma_encode_write_segment - Encode one Write segment + * @src: matching Write chunk in the RPC Call header + * @sctxt: Send context for the RPC Reply + * @remaining: remaining bytes of the payload left in the Write chunk + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Write segment + * %-EMSGSIZE on XDR buffer overflow */ -static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp) +static ssize_t svc_rdma_encode_write_segment(__be32 *src, + struct svc_rdma_send_ctxt *sctxt, + unsigned int *remaining) { - unsigned int nsegs; __be32 *p; - - p = rdma_resp; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p += rpcrdma_fixed_maxsz + 1; - - /* Skip Write list. */ - while (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - /* Skip Reply chunk. */ - if (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; + const size_t len = rpcrdma_segment_maxsz * sizeof(*p); + u32 handle, length; + u64 offset; + + p = xdr_reserve_space(&sctxt->sc_stream, len); + if (!p) + return -EMSGSIZE; + + handle = be32_to_cpup(src++); + length = be32_to_cpup(src++); + xdr_decode_hyper(src, &offset); + + *p++ = cpu_to_be32(handle); + if (*remaining < length) { + /* segment only partly filled */ + length = *remaining; + *remaining = 0; + } else { + /* entire segment was consumed */ + *remaining -= length; } + *p++ = cpu_to_be32(length); + xdr_encode_hyper(p, offset); - return (unsigned long)p - (unsigned long)rdma_resp; + trace_svcrdma_encode_wseg(handle, length, offset); + return len; } -/* One Write chunk is copied from Call transport header to Reply - * transport header. Each segment's length field is updated to - * reflect number of bytes consumed in the segment. - * - * Returns number of segments in this chunk. +/** + * svc_rdma_encode_write_chunk - Encode one Write chunk + * @src: matching Write chunk in the RPC Call header + * @sctxt: Send context for the RPC Reply + * @remaining: size in bytes of the payload in the Write chunk + * + * Copy a Write chunk from the Call transport header to the + * Reply transport header. Update each segment's length field + * to reflect the number of bytes written in that segment. 
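The clamping rule described here is easiest to see with numbers. Assume the Call header provides two 4096-byte Write segments and the Reply payload is 5000 bytes:

        unsigned int remaining = 5000;

        /* Segment 1: 4096 <= remaining -> entire segment consumed;
         * it is encoded with length 4096, remaining becomes 904.
         */
        remaining -= 4096;
        /* Segment 2: remaining (904) < 4096 -> segment only partly
         * filled; its encoded length is rewritten to 904, remaining
         * drops to 0, and any further segments encode length zero.
         */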
+ * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Write chunk + * %-EMSGSIZE on XDR buffer overflow */ -static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, +static ssize_t svc_rdma_encode_write_chunk(__be32 *src, + struct svc_rdma_send_ctxt *sctxt, unsigned int remaining) { unsigned int i, nsegs; - u32 seg_len; + ssize_t len, ret; - /* Write list discriminator */ - *dst++ = *src++; + len = 0; + trace_svcrdma_encode_write_chunk(remaining); - /* number of segments in this chunk */ - nsegs = be32_to_cpup(src); - *dst++ = *src++; + src++; + ret = xdr_stream_encode_item_present(&sctxt->sc_stream); + if (ret < 0) + return -EMSGSIZE; + len += ret; - for (i = nsegs; i; i--) { - /* segment's RDMA handle */ - *dst++ = *src++; - - /* bytes returned in this segment */ - seg_len = be32_to_cpu(*src); - if (remaining >= seg_len) { - /* entire segment was consumed */ - *dst = *src; - remaining -= seg_len; - } else { - /* segment only partly filled */ - *dst = cpu_to_be32(remaining); - remaining = 0; - } - dst++; src++; + nsegs = be32_to_cpup(src++); + ret = xdr_stream_encode_u32(&sctxt->sc_stream, nsegs); + if (ret < 0) + return -EMSGSIZE; + len += ret; - /* segment's RDMA offset */ - *dst++ = *src++; - *dst++ = *src++; + for (i = nsegs; i; i--) { + ret = svc_rdma_encode_write_segment(src, sctxt, &remaining); + if (ret < 0) + return -EMSGSIZE; + src += rpcrdma_segment_maxsz; + len += ret; } - return nsegs; + return len; } -/* The client provided a Write list in the Call message. Fill in - * the segments in the first Write chunk in the Reply's transport +/** + * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list + * @rctxt: Reply context with information about the RPC Call + * @sctxt: Send context for the RPC Reply + * @length: size in bytes of the payload in the first Write chunk + * + * The client provides a Write chunk list in the Call message. Fill + * in the segments in the first Write chunk in the Reply's transport * header with the number of bytes consumed in each segment. * Remaining chunks are returned unused. * * Assumptions: * - Client has provided only one Write chunk - */ -static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, - unsigned int consumed) -{ - unsigned int nsegs; - __be32 *p, *q; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p = rdma_resp + rpcrdma_fixed_maxsz + 1; - - q = wr_ch; - while (*q != xdr_zero) { - nsegs = xdr_encode_write_chunk(p, q, consumed); - q += 2 + nsegs * rpcrdma_segment_maxsz; - p += 2 + nsegs * rpcrdma_segment_maxsz; - consumed = 0; - } - - /* Terminate Write list */ - *p++ = xdr_zero; - - /* Reply chunk discriminator; may be replaced later */ - *p = xdr_zero; -} - -/* The client provided a Reply chunk in the Call message. Fill in - * the segments in the Reply chunk in the Reply message with the - * number of bytes consumed in each segment. * - * Assumptions: - * - Reply can always fit in the provided Reply chunk + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Reply's Write list + * %-EMSGSIZE on XDR buffer overflow */ -static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, - unsigned int consumed) +static ssize_t +svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + unsigned int length) { - __be32 *p; + ssize_t len, ret; - /* Find the Reply chunk in the Reply's xprt header. 
- * RPC-over-RDMA V1 replies never have a Read list. - */ - p = rdma_resp + rpcrdma_fixed_maxsz + 1; + ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length); + if (ret < 0) + return ret; + len = ret; - /* Skip past Write list */ - while (*p++ != xdr_zero) - p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + /* Terminate the Write list */ + ret = xdr_stream_encode_item_absent(&sctxt->sc_stream); + if (ret < 0) + return ret; - xdr_encode_write_chunk(p, rp_ch, consumed); + return len + ret; } -/* Parse the RPC Call's transport header. +/** + * svc_rdma_encode_reply_chunk - Encode RPC Reply's Reply chunk + * @rctxt: Reply context with information about the RPC Call + * @sctxt: Send context for the RPC Reply + * @length: size in bytes of the payload in the Reply chunk + * + * Assumptions: + * - Reply can always fit in the client-provided Reply chunk + * + * Return values: + * On success, returns length in bytes of the Reply XDR buffer + * that was consumed by the Reply's Reply chunk + * %-EMSGSIZE on XDR buffer overflow */ -static void svc_rdma_get_write_arrays(__be32 *rdma_argp, - __be32 **write, __be32 **reply) +static ssize_t +svc_rdma_encode_reply_chunk(const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rdma_send_ctxt *sctxt, + unsigned int length) { - __be32 *p; - - p = rdma_argp + rpcrdma_fixed_maxsz; - - /* Read list */ - while (*p++ != xdr_zero) - p += 5; - - /* Write list */ - if (*p != xdr_zero) { - *write = p; - while (*p++ != xdr_zero) - p += 1 + be32_to_cpu(*p) * 4; - } else { - *write = NULL; - p++; - } - - /* Reply chunk */ - if (*p != xdr_zero) - *reply = p; - else - *reply = NULL; + return svc_rdma_encode_write_chunk(rctxt->rc_reply_chunk, sctxt, + length); } static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, @@ -520,38 +539,36 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, } /** - * svc_rdma_sync_reply_hdr - DMA sync the transport header buffer + * svc_rdma_pull_up_needed - Determine whether to use pull-up * @rdma: controlling transport - * @ctxt: send_ctxt for the Send WR - * @len: length of transport header + * @sctxt: send_ctxt for the Send WR + * @rctxt: Write and Reply chunks provided by client + * @xdr: xdr_buf containing RPC message to transmit * - */ -void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - unsigned int len) -{ - ctxt->sc_sges[0].length = len; - ctxt->sc_send_wr.num_sge++; - ib_dma_sync_single_for_device(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, len, - DMA_TO_DEVICE); -} - -/* If the xdr_buf has more elements than the device can - * transmit in a single RDMA Send, then the reply will - * have to be copied into a bounce buffer. + * Returns: + * %true if pull-up must be used + * %false otherwise */ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, - struct xdr_buf *xdr, - __be32 *wr_lst) + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + struct xdr_buf *xdr) { int elements; + /* For small messages, copying bytes is cheaper than DMA mapping. + */ + if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH) + return true; + + /* Check whether the xdr_buf has more elements than can + * fit in a single RDMA Send. 
+ */ /* xdr->head */ elements = 1; /* xdr->pages */ - if (!wr_lst) { + if (!rctxt || !rctxt->rc_write_list) { unsigned int remaining; unsigned long pageoff; @@ -573,29 +590,36 @@ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, return elements >= rdma->sc_max_send_sges; } -/* The device is not capable of sending the reply directly. - * Assemble the elements of @xdr into the transport header - * buffer. +/** + * svc_rdma_pull_up_reply_msg - Copy Reply into a single buffer + * @rdma: controlling transport + * @sctxt: send_ctxt for the Send WR; xprt hdr is already prepared + * @rctxt: Write and Reply chunks provided by client + * @xdr: prepared xdr_buf containing RPC message + * + * The device is not capable of sending the reply directly. + * Assemble the elements of @xdr into the transport header buffer. + * + * Returns zero on success, or a negative errno on failure. */ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct xdr_buf *xdr, __be32 *wr_lst) + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + const struct xdr_buf *xdr) { unsigned char *dst, *tailbase; unsigned int taillen; - dst = ctxt->sc_xprt_buf; - dst += ctxt->sc_sges[0].length; - + dst = sctxt->sc_xprt_buf + sctxt->sc_hdrbuf.len; memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len); dst += xdr->head[0].iov_len; tailbase = xdr->tail[0].iov_base; taillen = xdr->tail[0].iov_len; - if (wr_lst) { + if (rctxt && rctxt->rc_write_list) { u32 xdrpad; - xdrpad = xdr_padsize(xdr->page_len); + xdrpad = xdr_pad_size(xdr->page_len); if (taillen && xdrpad) { tailbase += xdrpad; taillen -= xdrpad; @@ -621,29 +645,26 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, if (taillen) memcpy(dst, tailbase, taillen); - ctxt->sc_sges[0].length += xdr->len; - ib_dma_sync_single_for_device(rdma->sc_pd->device, - ctxt->sc_sges[0].addr, - ctxt->sc_sges[0].length, - DMA_TO_DEVICE); - + sctxt->sc_sges[0].length += xdr->len; + trace_svcrdma_send_pullup(sctxt->sc_sges[0].length); return 0; } -/* svc_rdma_map_reply_msg - Map the buffer holding RPC message +/* svc_rdma_map_reply_msg - DMA map the buffer holding RPC message * @rdma: controlling transport - * @ctxt: send_ctxt for the Send WR + * @sctxt: send_ctxt for the Send WR + * @rctxt: Write and Reply chunks provided by client * @xdr: prepared xdr_buf containing RPC message - * @wr_lst: pointer to Call header's Write list, or NULL * * Load the xdr_buf into the ctxt's sge array, and DMA map each - * element as it is added. + * element as it is added. The Send WR's num_sge field is set. * * Returns zero on success, or a negative errno on failure. */ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, - struct svc_rdma_send_ctxt *ctxt, - struct xdr_buf *xdr, __be32 *wr_lst) + struct svc_rdma_send_ctxt *sctxt, + const struct svc_rdma_recv_ctxt *rctxt, + struct xdr_buf *xdr) { unsigned int len, remaining; unsigned long page_off; @@ -652,11 +673,24 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, u32 xdr_pad; int ret; - if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst)) - return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst); + /* Set up the (persistently-mapped) transport header SGE. */ + sctxt->sc_send_wr.num_sge = 1; + sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, ctxt, + /* If there is a Reply chunk, nothing follows the transport + * header, and we're done here. 
+ */ + if (rctxt && rctxt->rc_reply_chunk) + return 0; + + /* For pull-up, svc_rdma_send() will sync the transport header. + * No additional DMA mapping is necessary. + */ + if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) + return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); + + ++sctxt->sc_cur_sge_no; + ret = svc_rdma_dma_map_buf(rdma, sctxt, xdr->head[0].iov_base, xdr->head[0].iov_len); if (ret < 0) @@ -667,10 +701,10 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, * have added XDR padding in the tail buffer, and that * should not be included inline. */ - if (wr_lst) { + if (rctxt && rctxt->rc_write_list) { base = xdr->tail[0].iov_base; len = xdr->tail[0].iov_len; - xdr_pad = xdr_padsize(xdr->page_len); + xdr_pad = xdr_pad_size(xdr->page_len); if (len && xdr_pad) { base += xdr_pad; @@ -686,8 +720,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, while (remaining) { len = min_t(u32, PAGE_SIZE - page_off, remaining); - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++, + ++sctxt->sc_cur_sge_no; + ret = svc_rdma_dma_map_page(rdma, sctxt, *ppages++, page_off, len); if (ret < 0) return ret; @@ -700,8 +734,8 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, len = xdr->tail[0].iov_len; tail: if (len) { - ++ctxt->sc_cur_sge_no; - ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len); + ++sctxt->sc_cur_sge_no; + ret = svc_rdma_dma_map_buf(rdma, sctxt, base, len); if (ret < 0) return ret; } @@ -748,18 +782,14 @@ static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, */ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *sctxt, - struct svc_rdma_recv_ctxt *rctxt, - struct svc_rqst *rqstp, - __be32 *wr_lst, __be32 *rp_ch) + const struct svc_rdma_recv_ctxt *rctxt, + struct svc_rqst *rqstp) { int ret; - if (!rp_ch) { - ret = svc_rdma_map_reply_msg(rdma, sctxt, - &rqstp->rq_res, wr_lst); - if (ret < 0) - return ret; - } + ret = svc_rdma_map_reply_msg(rdma, sctxt, rctxt, &rqstp->rq_res); + if (ret < 0) + return ret; svc_rdma_save_io_pages(rqstp, sctxt); @@ -769,8 +799,6 @@ static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, } else { sctxt->sc_send_wr.opcode = IB_WR_SEND; } - dprintk("svcrdma: posting Send WR with %u sge(s)\n", - sctxt->sc_send_wr.num_sge); return svc_rdma_send(rdma, &sctxt->sc_send_wr); } @@ -785,26 +813,31 @@ static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, struct svc_rdma_send_ctxt *ctxt, struct svc_rqst *rqstp) { + struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; + __be32 *rdma_argp = rctxt->rc_recv_buf; __be32 *p; - int ret; - p = ctxt->sc_xprt_buf; - trace_svcrdma_err_chunk(*p); - p += 3; + rpcrdma_set_xdrlen(&ctxt->sc_hdrbuf, 0); + xdr_init_encode(&ctxt->sc_stream, &ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, + NULL); + + p = xdr_reserve_space(&ctxt->sc_stream, RPCRDMA_HDRLEN_ERR); + if (!p) + return -ENOMSG; + + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = rdma->sc_fc_credits; *p++ = rdma_error; *p = err_chunk; - svc_rdma_sync_reply_hdr(rdma, ctxt, RPCRDMA_HDRLEN_ERR); + trace_svcrdma_err_chunk(*rdma_argp); svc_rdma_save_io_pages(rqstp, ctxt); + ctxt->sc_send_wr.num_sge = 1; ctxt->sc_send_wr.opcode = IB_WR_SEND; - ret = svc_rdma_send(rdma, &ctxt->sc_send_wr); - if (ret) { - svc_rdma_send_ctxt_put(rdma, ctxt); - return ret; - } - - return 0; + ctxt->sc_sges[0].length = ctxt->sc_hdrbuf.len; + return svc_rdma_send(rdma, &ctxt->sc_send_wr); } /** @@ -825,14 +858,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) struct svcxprt_rdma *rdma = 
container_of(xprt, struct svcxprt_rdma, sc_xprt); struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; - __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; + __be32 *rdma_argp = rctxt->rc_recv_buf; + __be32 *wr_lst = rctxt->rc_write_list; + __be32 *rp_ch = rctxt->rc_reply_chunk; struct xdr_buf *xdr = &rqstp->rq_res; struct svc_rdma_send_ctxt *sctxt; + __be32 *p; int ret; - rdma_argp = rctxt->rc_recv_buf; - svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); - /* Create the RDMA response header. xprt->xpt_mutex, * acquired in svc_send(), serializes RPC replies. The * code path below that inserts the credit grant value @@ -843,36 +876,52 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) sctxt = svc_rdma_send_ctxt_get(rdma); if (!sctxt) goto err0; - rdma_resp = sctxt->sc_xprt_buf; - p = rdma_resp; + p = xdr_reserve_space(&sctxt->sc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (!p) + goto err0; *p++ = *rdma_argp; *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rp_ch ? rdma_nomsg : rdma_msg; - - /* Start with empty chunks */ - *p++ = xdr_zero; - *p++ = xdr_zero; - *p = xdr_zero; + *p = rp_ch ? rdma_nomsg : rdma_msg; + if (svc_rdma_encode_read_list(sctxt) < 0) + goto err0; if (wr_lst) { /* XXX: Presume the client sent only one Write chunk */ - ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); + unsigned long offset; + unsigned int length; + + if (rctxt->rc_read_payload_length) { + offset = rctxt->rc_read_payload_offset; + length = rctxt->rc_read_payload_length; + } else { + offset = xdr->head[0].iov_len; + length = xdr->page_len; + } + ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset, + length); if (ret < 0) goto err2; - svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); + if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0) + goto err0; + } else { + if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0) + goto err0; } if (rp_ch) { - ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); + ret = svc_rdma_send_reply_chunk(rdma, rctxt, &rqstp->rq_res); if (ret < 0) goto err2; - svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); + if (svc_rdma_encode_reply_chunk(rctxt, sctxt, ret) < 0) + goto err0; + } else { + if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0) + goto err0; } - svc_rdma_sync_reply_hdr(rdma, sctxt, svc_rdma_reply_hdr_len(rdma_resp)); - ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp, - wr_lst, rp_ch); + ret = svc_rdma_send_reply_msg(rdma, sctxt, rctxt, rqstp); if (ret < 0) goto err1; ret = 0; @@ -900,3 +949,30 @@ out: ret = -ENOTCONN; goto out; } + +/** + * svc_rdma_read_payload - special processing for a READ payload + * @rqstp: svc_rqst to operate on + * @offset: payload's byte offset in @xdr + * @length: size of payload, in bytes + * + * Returns zero on success. + * + * For the moment, just record the xdr_buf location of the READ + * payload. svc_rdma_sendto will use that location later when + * we actually send the payload. + */ +int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset, + unsigned int length) +{ + struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt; + + /* XXX: Just one READ payload slot for now, since our + * transport implementation currently supports only one + * Write chunk. 
+ */ + rctxt->rc_read_payload_offset = offset; + rctxt->rc_read_payload_length = length; + + return 0; +} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 145a3615c319..8bb99980ae85 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -82,6 +82,7 @@ static const struct svc_xprt_ops svc_rdma_ops = { .xpo_create = svc_rdma_create, .xpo_recvfrom = svc_rdma_recvfrom, .xpo_sendto = svc_rdma_sendto, + .xpo_read_payload = svc_rdma_read_payload, .xpo_release_rqst = svc_rdma_release_rqst, .xpo_detach = svc_rdma_detach, .xpo_free = svc_rdma_free, @@ -240,10 +241,6 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, static int rdma_listen_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; - - trace_svcrdma_cm_event(event, sap); - switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " @@ -265,12 +262,9 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, static int rdma_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr; struct svcxprt_rdma *rdma = cma_id->context; struct svc_xprt *xprt = &rdma->sc_xprt; - trace_svcrdma_cm_event(event, sap); - switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: /* Accept complete */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 3cfeba68ee9a..659da37020a4 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -240,9 +240,10 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc; - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rc = rpcrdma_xprt_connect(r_xprt); xprt_clear_connecting(xprt); - if (r_xprt->rx_ep.rep_connected > 0) { + if (r_xprt->rx_ep && r_xprt->rx_ep->re_connect_status > 0) { + xprt->connect_cookie++; xprt->stat.connect_count++; xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; @@ -265,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_op_inject_dsc(r_xprt); - rdma_disconnect(r_xprt->rx_ia.ri_id); + rdma_disconnect(r_xprt->rx_ep->re_id); } /** @@ -284,9 +285,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - rpcrdma_ep_destroy(r_xprt); + rpcrdma_xprt_disconnect(r_xprt); rpcrdma_buffer_destroy(&r_xprt->rx_buf); - rpcrdma_ia_close(&r_xprt->rx_ia); xprt_rdma_free_addresses(xprt); xprt_free(xprt); @@ -316,10 +316,15 @@ xprt_setup_rdma(struct xprt_create *args) if (args->addrlen > sizeof(xprt->addr)) return ERR_PTR(-EBADF); + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-EIO); + xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0, xprt_rdma_slot_table_entries); - if (!xprt) + if (!xprt) { + module_put(THIS_MODULE); return ERR_PTR(-ENOMEM); + } xprt->timeout = &xprt_rdma_default_timeout; xprt->connect_timeout = xprt->timeout->to_initval; @@ -347,23 +352,17 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_format_addresses(xprt, sap); new_xprt = rpcx_to_rdmax(xprt); - rc = rpcrdma_ia_open(new_xprt); - if (rc) - goto out1; - - rc = rpcrdma_ep_create(new_xprt); - if (rc) - goto out2; - rc = rpcrdma_buffer_create(new_xprt); - if (rc) - goto out3; - - if (!try_module_get(THIS_MODULE)) - goto out4; + if 
(rc) { + xprt_rdma_free_addresses(xprt); + xprt_free(xprt); + module_put(THIS_MODULE); + return ERR_PTR(rc); + } INIT_DELAYED_WORK(&new_xprt->rx_connect_worker, xprt_rdma_connect_worker); + xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; dprintk("RPC: %s: %s:%s\n", __func__, @@ -371,19 +370,6 @@ xprt_setup_rdma(struct xprt_create *args) xprt->address_strings[RPC_DISPLAY_PORT]); trace_xprtrdma_create(new_xprt); return xprt; - -out4: - rpcrdma_buffer_destroy(&new_xprt->rx_buf); - rc = -ENODEV; -out3: - rpcrdma_ep_destroy(new_xprt); -out2: - rpcrdma_ia_close(&new_xprt->rx_ia); -out1: - trace_xprtrdma_op_destroy(new_xprt); - xprt_rdma_free_addresses(xprt); - xprt_free(xprt); - return ERR_PTR(rc); } /** @@ -398,26 +384,11 @@ out1: void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - might_sleep(); trace_xprtrdma_op_close(r_xprt); - /* Prevent marshaling and sending of new requests */ - xprt_clear_connected(xprt); - - if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { - rpcrdma_ia_remove(ia); - goto out; - } - - if (ep->rep_connected == -ENODEV) - return; - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_xprt_disconnect(r_xprt); -out: xprt->reestablish_timeout = 0; ++xprt->connect_cookie; xprt_disconnect_done(xprt); @@ -517,10 +488,11 @@ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned long delay; delay = 0; - if (r_xprt->rx_ep.rep_connected != 0) { + if (ep && ep->re_connect_status != 0) { delay = xprt_reconnect_delay(xprt); xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } @@ -694,7 +666,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst) goto drop_connection; rqst->rq_xtime = ktime_get(); - if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req)) + if (rpcrdma_post_sends(r_xprt, req)) goto drop_connection; rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 353f61ac8d51..cdd84c09df10 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -84,6 +84,7 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep); static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); +static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, gfp_t flags); @@ -96,17 +97,17 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); */ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rdma_cm_id *id = r_xprt->rx_ep->re_id; /* Flush Receives, then wait for deferred Reply work * to complete. */ - ib_drain_rq(ia->ri_id->qp); + ib_drain_rq(id->qp); /* Deferred Reply processing might have scheduled * local invalidations. */ - ib_drain_sq(ia->ri_id->qp); + ib_drain_sq(id->qp); } /** @@ -115,26 +116,43 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) * @context: ep that owns QP where event occurred * * Called from the RDMA provider (device driver) possibly in an interrupt - * context. + * context. The QP is always destroyed before the ID, so the ID will be + * reliably available when this handler is invoked. 
*/ -static void -rpcrdma_qp_event_handler(struct ib_event *event, void *context) +static void rpcrdma_qp_event_handler(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); - trace_xprtrdma_qp_event(r_xprt, event); + trace_xprtrdma_qp_event(ep, event); +} + +/** + * rpcrdma_flush_disconnect - Disconnect on flushed completion + * @cq: completion queue + * @wc: work completion entry + * + * Must be called in process context. + */ +void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc) +{ + struct rpcrdma_xprt *r_xprt = cq->cq_context; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + if (wc->status != IB_WC_SUCCESS && + r_xprt->rx_ep->re_connect_status == 1) { + r_xprt->rx_ep->re_connect_status = -ECONNABORTED; + trace_xprtrdma_flush_dct(r_xprt, wc->status); + xprt_force_disconnect(xprt); + } } /** * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC * @cq: completion queue - * @wc: completed WR + * @wc: WCE for a completed Send WR * */ -static void -rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_sendctx *sc = @@ -143,25 +161,25 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_send(sc, wc); rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc); + rpcrdma_flush_disconnect(cq, wc); } /** * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC - * @cq: completion queue (ignored) - * @wc: completed WR + * @cq: completion queue + * @wc: WCE for a completed Receive WR * */ -static void -rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) +static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) { struct ib_cqe *cqe = wc->wr_cqe; struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, rr_cqe); - struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; + struct rpcrdma_xprt *r_xprt = cq->cq_context; /* WARNING: Only wr_cqe and status are reliable at this point */ trace_xprtrdma_wc_receive(wc); - --r_xprt->rx_ep.rep_receive_count; + --r_xprt->rx_ep->re_receive_count; if (wc->status != IB_WC_SUCCESS) goto out_flushed; @@ -178,35 +196,35 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) return; out_flushed: + rpcrdma_flush_disconnect(cq, wc); rpcrdma_rep_destroy(rep); } -static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, +static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, struct rdma_conn_param *param) { const struct rpcrdma_connect_private *pmsg = param->private_data; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; unsigned int rsize, wsize; /* Default settings for RPC-over-RDMA Version One */ - r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize; + ep->re_implicit_roundup = xprt_rdma_pad_optimize; rsize = RPCRDMA_V1_DEF_INLINE_SIZE; wsize = RPCRDMA_V1_DEF_INLINE_SIZE; if (pmsg && pmsg->cp_magic == rpcrdma_cmp_magic && pmsg->cp_version == RPCRDMA_CMP_VERSION) { - r_xprt->rx_ia.ri_implicit_roundup = true; + ep->re_implicit_roundup = true; rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); } - if (rsize < ep->rep_inline_recv) - ep->rep_inline_recv = rsize; - if (wsize < ep->rep_inline_send) - ep->rep_inline_send = wsize; + if (rsize < ep->re_inline_recv) + ep->re_inline_recv = rsize; + if (wsize < ep->re_inline_send) + ep->re_inline_send 
= wsize; - rpcrdma_set_max_header_sizes(r_xprt); + rpcrdma_set_max_header_sizes(ep); } /** @@ -220,116 +238,103 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt, static int rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct rpcrdma_xprt *r_xprt = id->context; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr; + struct rpcrdma_ep *ep = id->context; + struct rpc_xprt *xprt = ep->re_xprt; might_sleep(); - trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: - ia->ri_async_rc = 0; - complete(&ia->ri_done); + ep->re_async_rc = 0; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ADDR_ERROR: - ia->ri_async_rc = -EPROTO; - complete(&ia->ri_done); + ep->re_async_rc = -EPROTO; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_ROUTE_ERROR: - ia->ri_async_rc = -ENETUNREACH; - complete(&ia->ri_done); + ep->re_async_rc = -ENETUNREACH; + complete(&ep->re_done); return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - pr_info("rpcrdma: removing device %s for %s:%s\n", - ia->ri_id->device->name, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); -#endif - init_completion(&ia->ri_remove_done); - set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); - ep->rep_connected = -ENODEV; + pr_info("rpcrdma: removing device %s for %pISpc\n", + ep->re_id->device->name, sap); + /* fall through */ + case RDMA_CM_EVENT_ADDR_CHANGE: + ep->re_connect_status = -ENODEV; xprt_force_disconnect(xprt); - wait_for_completion(&ia->ri_remove_done); - - ia->ri_id = NULL; - /* Return 1 to ensure the core destroys the id. 
*/ - return 1; + goto disconnected; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->connect_cookie; - ep->rep_connected = 1; - rpcrdma_update_cm_private(r_xprt, &event->param.conn); - trace_xprtrdma_inline_thresh(r_xprt); - wake_up_all(&ep->rep_connect_wait); + kref_get(&ep->re_kref); + ep->re_connect_status = 1; + rpcrdma_update_cm_private(ep, &event->param.conn); + trace_xprtrdma_inline_thresh(ep); + wake_up_all(&ep->re_connect_wait); break; case RDMA_CM_EVENT_CONNECT_ERROR: - ep->rep_connected = -ENOTCONN; + ep->re_connect_status = -ENOTCONN; goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: - ep->rep_connected = -ENETUNREACH; + ep->re_connect_status = -ENETUNREACH; goto disconnected; case RDMA_CM_EVENT_REJECTED: - dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - rdma_reject_msg(id, event->status)); - ep->rep_connected = -ECONNREFUSED; + dprintk("rpcrdma: connection to %pISpc rejected: %s\n", + sap, rdma_reject_msg(id, event->status)); + ep->re_connect_status = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - ep->rep_connected = -EAGAIN; + ep->re_connect_status = -EAGAIN; goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ep->rep_connected = -ECONNABORTED; + ep->re_connect_status = -ECONNABORTED; disconnected: - xprt_force_disconnect(xprt); - wake_up_all(&ep->rep_connect_wait); - break; + return rpcrdma_ep_destroy(ep); default: break; } - dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__, - rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), - ia->ri_id->device->name, rdma_event_msg(event->event)); + dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap, + ep->re_id->device->name, rdma_event_msg(event->event)); return 0; } -static struct rdma_cm_id * -rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) +static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep) { unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rdma_cm_id *id; int rc; - init_completion(&ia->ri_done); + init_completion(&ep->re_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, - xprt, RDMA_PS_TCP, IB_QPT_RC); + id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) return id; - ia->ri_async_rc = -ETIMEDOUT; - rc = rdma_resolve_addr(id, NULL, - (struct sockaddr *)&xprt->rx_xprt.addr, + ep->re_async_rc = -ETIMEDOUT; + rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; - ia->ri_async_rc = -ETIMEDOUT; + ep->re_async_rc = -ETIMEDOUT; rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); if (rc) goto out; - rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); + rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); if (rc < 0) goto out; - rc = ia->ri_async_rc; + rc = ep->re_async_rc; if (rc) goto out; @@ -340,356 +345,181 @@ out: return ERR_PTR(rc); } -/* - * Exported functions. - */ - -/** - * rpcrdma_ia_open - Open and initialize an Interface Adapter. - * @xprt: transport with IA to (re)initialize - * - * Returns 0 on success, negative errno if an appropriate - * Interface Adapter could not be found and opened. 
- */ -int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt) +static void rpcrdma_ep_put(struct kref *kref) { - struct rpcrdma_ia *ia = &xprt->rx_ia; - int rc; + struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); - ia->ri_id = rpcrdma_create_id(xprt, ia); - if (IS_ERR(ia->ri_id)) { - rc = PTR_ERR(ia->ri_id); - goto out_err; + if (ep->re_id->qp) { + rdma_destroy_qp(ep->re_id); + ep->re_id->qp = NULL; } - ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0); - if (IS_ERR(ia->ri_pd)) { - rc = PTR_ERR(ia->ri_pd); - pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); - goto out_err; - } + if (ep->re_attr.recv_cq) + ib_free_cq(ep->re_attr.recv_cq); + ep->re_attr.recv_cq = NULL; + if (ep->re_attr.send_cq) + ib_free_cq(ep->re_attr.send_cq); + ep->re_attr.send_cq = NULL; - return 0; + if (ep->re_pd) + ib_dealloc_pd(ep->re_pd); + ep->re_pd = NULL; -out_err: - rpcrdma_ia_close(ia); - return rc; + kfree(ep); + module_put(THIS_MODULE); } -/** - * rpcrdma_ia_remove - Handle device driver unload - * @ia: interface adapter being removed - * - * Divest transport H/W resources associated with this adapter, - * but allow it to be restored later. - * - * Caller must hold the transport send lock. +/* Returns: + * %0 if @ep still has a positive kref count, or + * %1 if @ep was destroyed successfully. */ -void -rpcrdma_ia_remove(struct rpcrdma_ia *ia) +static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep) { - struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, - rx_ia); - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - - /* This is similar to rpcrdma_ep_destroy, but: - * - Don't cancel the connect worker. - * - Don't call rpcrdma_ep_disconnect, which waits - * for another conn upcall, which will deadlock. - * - rdma_disconnect is unneeded, the underlying - * connection is already gone. - */ - if (ia->ri_id->qp) { - rpcrdma_xprt_drain(r_xprt); - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - ib_free_cq(ep->rep_attr.recv_cq); - ep->rep_attr.recv_cq = NULL; - ib_free_cq(ep->rep_attr.send_cq); - ep->rep_attr.send_cq = NULL; - - /* The ULP is responsible for ensuring all DMA - * mappings and MRs are gone. - */ - rpcrdma_reps_unmap(r_xprt); - rpcrdma_reqs_reset(r_xprt); - rpcrdma_mrs_destroy(r_xprt); - rpcrdma_sendctxs_destroy(r_xprt); - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; - - /* Allow waiters to continue */ - complete(&ia->ri_remove_done); - - trace_xprtrdma_remove(r_xprt); -} - -/** - * rpcrdma_ia_close - Clean up/close an IA. - * @ia: interface adapter to close - * - */ -void -rpcrdma_ia_close(struct rpcrdma_ia *ia) -{ - if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { - if (ia->ri_id->qp) - rdma_destroy_qp(ia->ri_id); - rdma_destroy_id(ia->ri_id); - } - ia->ri_id = NULL; - - /* If the pd is still busy, xprtrdma missed freeing a resource */ - if (ia->ri_pd && !IS_ERR(ia->ri_pd)) - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; + return kref_put(&ep->re_kref, rpcrdma_ep_put); } -/** - * rpcrdma_ep_create - Create unconnected endpoint - * @r_xprt: transport to instantiate - * - * Returns zero on success, or a negative errno. 
- */ -int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) +static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private; - struct ib_cq *sendcq, *recvcq; + struct rpcrdma_connect_private *pmsg; + struct ib_device *device; + struct rdma_cm_id *id; + struct rpcrdma_ep *ep; int rc; - ep->rep_max_requests = r_xprt->rx_xprt.max_reqs; - ep->rep_inline_send = xprt_rdma_max_inline_write; - ep->rep_inline_recv = xprt_rdma_max_inline_read; + ep = kzalloc(sizeof(*ep), GFP_NOFS); + if (!ep) + return -EAGAIN; + ep->re_xprt = &r_xprt->rx_xprt; + kref_init(&ep->re_kref); - rc = frwr_query_device(r_xprt, ia->ri_id->device); + id = rpcrdma_create_id(r_xprt, ep); + if (IS_ERR(id)) { + rc = PTR_ERR(id); + goto out_free; + } + __module_get(THIS_MODULE); + device = id->device; + ep->re_id = id; + + ep->re_max_requests = r_xprt->rx_xprt.max_reqs; + ep->re_inline_send = xprt_rdma_max_inline_write; + ep->re_inline_recv = xprt_rdma_max_inline_read; + rc = frwr_query_device(ep, device); if (rc) - return rc; - r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests); + goto out_destroy; + + r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests); - ep->rep_attr.event_handler = rpcrdma_qp_event_handler; - ep->rep_attr.qp_context = ep; - ep->rep_attr.srq = NULL; - ep->rep_attr.cap.max_inline_data = 0; - ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR; - ep->rep_attr.qp_type = IB_QPT_RC; - ep->rep_attr.port_num = ~0; + ep->re_attr.event_handler = rpcrdma_qp_event_handler; + ep->re_attr.qp_context = ep; + ep->re_attr.srq = NULL; + ep->re_attr.cap.max_inline_data = 0; + ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + ep->re_attr.qp_type = IB_QPT_RC; + ep->re_attr.port_num = ~0; dprintk("RPC: %s: requested max: dtos: send %d recv %d; " "iovs: send %d recv %d\n", __func__, - ep->rep_attr.cap.max_send_wr, - ep->rep_attr.cap.max_recv_wr, - ep->rep_attr.cap.max_send_sge, - ep->rep_attr.cap.max_recv_sge); - - ep->rep_send_batch = ep->rep_max_requests >> 3; - ep->rep_send_count = ep->rep_send_batch; - init_waitqueue_head(&ep->rep_connect_wait); - ep->rep_receive_count = 0; - - sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt, - ep->rep_attr.cap.max_send_wr + 1, - IB_POLL_WORKQUEUE); - if (IS_ERR(sendcq)) { - rc = PTR_ERR(sendcq); - goto out1; + ep->re_attr.cap.max_send_wr, + ep->re_attr.cap.max_recv_wr, + ep->re_attr.cap.max_send_sge, + ep->re_attr.cap.max_recv_sge); + + ep->re_send_batch = ep->re_max_requests >> 3; + ep->re_send_count = ep->re_send_batch; + init_waitqueue_head(&ep->re_connect_wait); + + ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt, + ep->re_attr.cap.max_send_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.send_cq)) { + rc = PTR_ERR(ep->re_attr.send_cq); + goto out_destroy; } - recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL, - ep->rep_attr.cap.max_recv_wr + 1, - IB_POLL_WORKQUEUE); - if (IS_ERR(recvcq)) { - rc = PTR_ERR(recvcq); - goto out2; + ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt, + ep->re_attr.cap.max_recv_wr, + IB_POLL_WORKQUEUE); + if (IS_ERR(ep->re_attr.recv_cq)) { + rc = PTR_ERR(ep->re_attr.recv_cq); + goto out_destroy; } - - ep->rep_attr.send_cq = sendcq; - ep->rep_attr.recv_cq = recvcq; + ep->re_receive_count = 0; /* Initialize cma parameters */ - memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma)); + memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma)); /* Prepare RDMA-CM private message */ + pmsg = 
&ep->re_cm_private; pmsg->cp_magic = rpcrdma_cmp_magic; pmsg->cp_version = RPCRDMA_CMP_VERSION; pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; - pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send); - pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv); - ep->rep_remote_cma.private_data = pmsg; - ep->rep_remote_cma.private_data_len = sizeof(*pmsg); + pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send); + pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv); + ep->re_remote_cma.private_data = pmsg; + ep->re_remote_cma.private_data_len = sizeof(*pmsg); /* Client offers RDMA Read but does not initiate */ - ep->rep_remote_cma.initiator_depth = 0; - ep->rep_remote_cma.responder_resources = - min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom); + ep->re_remote_cma.initiator_depth = 0; + ep->re_remote_cma.responder_resources = + min_t(int, U8_MAX, device->attrs.max_qp_rd_atom); /* Limit transport retries so client can detect server * GID changes quickly. RPC layer handles re-establishing * transport connection and retransmission. */ - ep->rep_remote_cma.retry_count = 6; + ep->re_remote_cma.retry_count = 6; /* RPC-over-RDMA handles its own flow control. In addition, * make all RNR NAKs visible so we know that RPC-over-RDMA * flow control is working correctly (no NAKs should be seen). */ - ep->rep_remote_cma.flow_control = 0; - ep->rep_remote_cma.rnr_retry_count = 0; + ep->re_remote_cma.flow_control = 0; + ep->re_remote_cma.rnr_retry_count = 0; - return 0; - -out2: - ib_free_cq(sendcq); -out1: - return rc; -} - -/** - * rpcrdma_ep_destroy - Disconnect and destroy endpoint. - * @r_xprt: transport instance to shut down - * - */ -void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - - if (ia->ri_id && ia->ri_id->qp) { - rpcrdma_ep_disconnect(ep, ia); - rdma_destroy_qp(ia->ri_id); - ia->ri_id->qp = NULL; - } - - if (ep->rep_attr.recv_cq) - ib_free_cq(ep->rep_attr.recv_cq); - if (ep->rep_attr.send_cq) - ib_free_cq(ep->rep_attr.send_cq); -} - -/* Re-establish a connection after a device removal event. - * Unlike a normal reconnection, a fresh PD and a new set - * of MRs and buffers is needed. 
- */ -static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, - struct ib_qp_init_attr *qp_init_attr) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; - int rc, err; - - trace_xprtrdma_reinsert(r_xprt); - - rc = -EHOSTUNREACH; - if (rpcrdma_ia_open(r_xprt)) - goto out1; - - rc = -ENOMEM; - err = rpcrdma_ep_create(r_xprt); - if (err) { - pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); - goto out2; - } - memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr)); - - rc = -ENETUNREACH; - err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr); - if (err) { - pr_err("rpcrdma: rdma_create_qp returned %d\n", err); - goto out3; - } - return 0; - -out3: - rpcrdma_ep_destroy(r_xprt); -out2: - rpcrdma_ia_close(ia); -out1: - return rc; -} - -static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, - struct ib_qp_init_attr *qp_init_attr) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rdma_cm_id *id, *old; - int err, rc; - - rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia); - - rc = -EHOSTUNREACH; - id = rpcrdma_create_id(r_xprt, ia); - if (IS_ERR(id)) - goto out; - - /* As long as the new ID points to the same device as the - * old ID, we can reuse the transport's existing PD and all - * previously allocated MRs. Also, the same device means - * the transport's previous DMA mappings are still valid. - * - * This is a sanity check only. There should be no way these - * point to two different devices here. - */ - old = id; - rc = -ENETUNREACH; - if (ia->ri_id->device != id->device) { - pr_err("rpcrdma: can't reconnect on different device!\n"); + ep->re_pd = ib_alloc_pd(device, 0); + if (IS_ERR(ep->re_pd)) { + rc = PTR_ERR(ep->re_pd); goto out_destroy; } - err = rdma_create_qp(id, ia->ri_pd, qp_init_attr); - if (err) + rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr); + if (rc) goto out_destroy; - /* Atomically replace the transport's ID and QP. */ - rc = 0; - old = ia->ri_id; - ia->ri_id = id; - rdma_destroy_qp(old); + r_xprt->rx_ep = ep; + return 0; out_destroy: - rdma_destroy_id(old); -out: + rpcrdma_ep_destroy(ep); + rdma_destroy_id(id); +out_free: + kfree(ep); + r_xprt->rx_ep = NULL; return rc; } -/* - * Connect unconnected endpoint. +/** + * rpcrdma_xprt_connect - Connect an unconnected transport + * @r_xprt: controlling transport instance + * + * Returns 0 on success or a negative errno. 
*/ -int -rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, - rx_ia); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - struct ib_qp_init_attr qp_init_attr; + struct rpcrdma_ep *ep; int rc; retry: - memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr)); - switch (ep->rep_connected) { - case 0: - rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr); - if (rc) { - rc = -ENETUNREACH; - goto out_noupdate; - } - break; - case -ENODEV: - rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr); - if (rc) - goto out_noupdate; - break; - default: - rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr); - if (rc) - goto out; - } + rpcrdma_xprt_disconnect(r_xprt); + rc = rpcrdma_ep_create(r_xprt); + if (rc) + return rc; + ep = r_xprt->rx_ep; - ep->rep_connected = 0; + ep->re_connect_status = 0; xprt_clear_connected(xprt); rpcrdma_reset_cwnd(r_xprt); @@ -699,64 +529,68 @@ retry: if (rc) goto out; - rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); + rc = rdma_connect(ep->re_id, &ep->re_remote_cma); if (rc) goto out; if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; - wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0); - if (ep->rep_connected <= 0) { - if (ep->rep_connected == -EAGAIN) + wait_event_interruptible(ep->re_connect_wait, + ep->re_connect_status != 0); + if (ep->re_connect_status <= 0) { + if (ep->re_connect_status == -EAGAIN) goto retry; - rc = ep->rep_connected; + rc = ep->re_connect_status; goto out; } rc = rpcrdma_reqs_setup(r_xprt); if (rc) { - rpcrdma_ep_disconnect(ep, ia); + rpcrdma_xprt_disconnect(r_xprt); goto out; } rpcrdma_mrs_create(r_xprt); out: if (rc) - ep->rep_connected = rc; - -out_noupdate: + ep->re_connect_status = rc; trace_xprtrdma_connect(r_xprt, rc); return rc; } /** - * rpcrdma_ep_disconnect - Disconnect underlying transport - * @ep: endpoint to disconnect - * @ia: associated interface adapter + * rpcrdma_xprt_disconnect - Disconnect underlying transport + * @r_xprt: controlling transport instance * * Caller serializes. Either the transport send lock is held, * or we're being called to destroy the transport. + * + * On return, @r_xprt is completely divested of all hardware + * resources and prepared for the next ->connect operation. */ -void -rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) { - struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, - rx_ep); + struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct rdma_cm_id *id; int rc; - /* returns without wait if ID is not connected */ - rc = rdma_disconnect(ia->ri_id); - if (!rc) - wait_event_interruptible(ep->rep_connect_wait, - ep->rep_connected != 1); - else - ep->rep_connected = rc; + if (!ep) + return; + + id = ep->re_id; + rc = rdma_disconnect(id); trace_xprtrdma_disconnect(r_xprt, rc); rpcrdma_xprt_drain(r_xprt); + rpcrdma_reps_unmap(r_xprt); rpcrdma_reqs_reset(r_xprt); rpcrdma_mrs_destroy(r_xprt); rpcrdma_sendctxs_destroy(r_xprt); + + if (rpcrdma_ep_destroy(ep)) + rdma_destroy_id(id); + + r_xprt->rx_ep = NULL; } /* Fixed-size circular FIFO queue. 
This implementation is wait-free and @@ -793,7 +627,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) { struct rpcrdma_sendctx *sc; - sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge), + sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), GFP_KERNEL); if (!sc) return NULL; @@ -813,14 +647,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * the ->send_request call to fail temporarily before too many * Sends are posted. */ - i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS; + i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); if (!buf->rb_sc_ctxs) return -ENOMEM; buf->rb_sc_last = i - 1; for (i = 0; i <= buf->rb_sc_last; i++) { - sc = rpcrdma_sendctx_create(&r_xprt->rx_ep); + sc = rpcrdma_sendctx_create(r_xprt->rx_ep); if (!sc) return -ENOMEM; @@ -924,10 +758,10 @@ static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = r_xprt->rx_ep; unsigned int count; - for (count = 0; count < ia->ri_max_rdma_segs; count++) { + for (count = 0; count < ep->re_max_rdma_segs; count++) { struct rpcrdma_mr *mr; int rc; @@ -935,14 +769,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) if (!mr) break; - rc = frwr_init_mr(ia, mr); + rc = frwr_mr_init(r_xprt, mr); if (rc) { kfree(mr); break; } - mr->mr_xprt = r_xprt; - spin_lock(&buf->rb_lock); rpcrdma_mr_push(mr, &buf->rb_mrs); list_add(&mr->mr_all, &buf->rb_all_mrs); @@ -973,12 +805,12 @@ rpcrdma_mr_refresh_worker(struct work_struct *work) void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; - /* If there is no underlying device, it's no use to - * wake the refresh worker. + /* If there is no underlying connection, it's no use + * to wake the refresh worker. */ - if (ep->rep_connected != -ENODEV) { + if (ep->re_connect_status == 1) { /* The work is scheduled on a WQ_MEM_RECLAIM * workqueue in order to prevent MR allocation * from recursing into NFS during direct reclaim. 
@@ -1042,7 +874,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) /* Compute maximum header buffer size in bytes */ maxhdrsize = rpcrdma_fixed_maxsz + 3 + - r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz; + r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), DMA_TO_DEVICE, GFP_KERNEL); @@ -1120,7 +952,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, if (rep == NULL) goto out; - rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv, + rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, DMA_FROM_DEVICE, GFP_KERNEL); if (!rep->rr_rdmabuf) goto out_free; @@ -1345,7 +1177,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr) if (mr->mr_dir != DMA_NONE) { trace_xprtrdma_mr_unmap(mr); - ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device, + ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_dir = DMA_NONE; } @@ -1463,7 +1295,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_regbuf *rb) { - struct ib_device *device = r_xprt->rx_ia.ri_id->device; + struct ib_device *device = r_xprt->rx_ep->re_id->device; if (rb->rg_direction == DMA_NONE) return false; @@ -1476,7 +1308,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, } rb->rg_device = device; - rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey; + rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey; return true; } @@ -1502,31 +1334,28 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) } /** - * rpcrdma_ep_post - Post WRs to a transport's Send Queue - * @ia: transport's device information - * @ep: transport's RDMA endpoint information + * rpcrdma_post_sends - Post WRs to a transport's Send Queue + * @r_xprt: controlling transport instance * @req: rpcrdma_req containing the Send WR to post * * Returns 0 if the post was successful, otherwise -ENOTCONN * is returned. 
*/ -int -rpcrdma_ep_post(struct rpcrdma_ia *ia, - struct rpcrdma_ep *ep, - struct rpcrdma_req *req) +int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) { struct ib_send_wr *send_wr = &req->rl_wr; + struct rpcrdma_ep *ep = r_xprt->rx_ep; int rc; - if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) { + if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) { send_wr->send_flags |= IB_SEND_SIGNALED; - ep->rep_send_count = ep->rep_send_batch; + ep->re_send_count = ep->re_send_batch; } else { send_wr->send_flags &= ~IB_SEND_SIGNALED; - --ep->rep_send_count; + --ep->re_send_count; } - rc = frwr_send(ia, req); + rc = frwr_send(r_xprt, req); trace_xprtrdma_post_send(req, rc); if (rc) return -ENOTCONN; @@ -1542,7 +1371,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_recv_wr *wr, *bad_wr; struct rpcrdma_rep *rep; int needed, count, rc; @@ -1551,9 +1380,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); - if (likely(ep->rep_receive_count > needed)) + if (likely(ep->re_receive_count > needed)) goto out; - needed -= ep->rep_receive_count; + needed -= ep->re_receive_count; if (!temp) needed += RPCRDMA_MAX_RECV_BATCH; @@ -1579,7 +1408,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) if (!wr) goto out; - rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, + rc = ib_post_recv(ep->re_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); out: trace_xprtrdma_post_recvs(r_xprt, count, rc); @@ -1593,6 +1422,6 @@ out: --count; } } - ep->rep_receive_count += count; + ep->re_receive_count += count; return; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 37d5080c250b..0a16fdb09b2c 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -65,43 +65,33 @@ #define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) /* - * Interface Adapter -- one per transport instance + * RDMA Endpoint -- connection endpoint details */ -struct rpcrdma_ia { - struct rdma_cm_id *ri_id; - struct ib_pd *ri_pd; - int ri_async_rc; - unsigned int ri_max_rdma_segs; - unsigned int ri_max_frwr_depth; - bool ri_implicit_roundup; - enum ib_mr_type ri_mrtype; - unsigned long ri_flags; - struct completion ri_done; - struct completion ri_remove_done; -}; - -enum { - RPCRDMA_IAF_REMOVING = 0, -}; - -/* - * RDMA Endpoint -- one per transport instance - */ - struct rpcrdma_ep { - unsigned int rep_send_count; - unsigned int rep_send_batch; - unsigned int rep_max_inline_send; - unsigned int rep_max_inline_recv; - int rep_connected; - struct ib_qp_init_attr rep_attr; - wait_queue_head_t rep_connect_wait; - struct rpcrdma_connect_private rep_cm_private; - struct rdma_conn_param rep_remote_cma; - unsigned int rep_max_requests; /* depends on device */ - unsigned int rep_inline_send; /* negotiated */ - unsigned int rep_inline_recv; /* negotiated */ - int rep_receive_count; + struct kref re_kref; + struct rdma_cm_id *re_id; + struct ib_pd *re_pd; + unsigned int re_max_rdma_segs; + unsigned int re_max_fr_depth; + bool re_implicit_roundup; + enum ib_mr_type re_mrtype; + struct completion re_done; + unsigned int re_send_count; + unsigned int re_send_batch; + unsigned int re_max_inline_send; + unsigned int re_max_inline_recv; + int re_async_rc; + int re_connect_status; + struct 
ib_qp_init_attr re_attr; + wait_queue_head_t re_connect_wait; + struct rpc_xprt *re_xprt; + struct rpcrdma_connect_private + re_cm_private; + struct rdma_conn_param re_remote_cma; + int re_receive_count; + unsigned int re_max_requests; /* depends on device */ + unsigned int re_inline_send; /* negotiated */ + unsigned int re_inline_recv; /* negotiated */ }; /* Pre-allocate extra Work Requests for handling backward receives @@ -422,8 +412,7 @@ struct rpcrdma_stats { */ struct rpcrdma_xprt { struct rpc_xprt rx_xprt; - struct rpcrdma_ia rx_ia; - struct rpcrdma_ep rx_ep; + struct rpcrdma_ep *rx_ep; struct rpcrdma_buffer rx_buf; struct delayed_work rx_connect_worker; struct rpc_timeout rx_timeout; @@ -455,22 +444,13 @@ extern int xprt_rdma_pad_optimize; extern unsigned int xprt_rdma_memreg_strategy; /* - * Interface Adapter calls - xprtrdma/verbs.c - */ -int rpcrdma_ia_open(struct rpcrdma_xprt *xprt); -void rpcrdma_ia_remove(struct rpcrdma_ia *ia); -void rpcrdma_ia_close(struct rpcrdma_ia *); - -/* * Endpoint calls - xprtrdma/verbs.c */ -int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt); -void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt); -int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); +void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc); +int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt); +void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt); -int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, - struct rpcrdma_req *); +int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp); /* @@ -536,15 +516,14 @@ rpcrdma_data_dir(bool writing) /* Memory registration calls xprtrdma/frwr_ops.c */ void frwr_reset(struct rpcrdma_req *req); -int frwr_query_device(struct rpcrdma_xprt *r_xprt, - const struct ib_device *device); -int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr); +int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device); +int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr); void frwr_release_mr(struct rpcrdma_mr *mr); struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr); -int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req); +int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs); void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); @@ -569,7 +548,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype rtype); void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc); int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst); -void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *); +void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep); void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt); void rpcrdma_complete_rqst(struct rpcrdma_rep *rep); void rpcrdma_reply_handler(struct rpcrdma_rep *rep); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d86c664ea6af..0bda8a73e8a8 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -54,6 +54,7 @@ #include <trace/events/sunrpc.h> +#include "socklib.h" #include "sunrpc.h" static void xs_close(struct rpc_xprt *xprt); @@ -749,125 +750,6 @@ 
xs_stream_start_connect(struct sock_xprt *transport) #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) -static int xs_sendmsg(struct socket *sock, struct msghdr *msg, size_t seek) -{ - if (seek) - iov_iter_advance(&msg->msg_iter, seek); - return sock_sendmsg(sock, msg); -} - -static int xs_send_kvec(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t seek) -{ - iov_iter_kvec(&msg->msg_iter, WRITE, vec, 1, vec->iov_len); - return xs_sendmsg(sock, msg, seek); -} - -static int xs_send_pagedata(struct socket *sock, struct msghdr *msg, struct xdr_buf *xdr, size_t base) -{ - int err; - - err = xdr_alloc_bvec(xdr, GFP_KERNEL); - if (err < 0) - return err; - - iov_iter_bvec(&msg->msg_iter, WRITE, xdr->bvec, - xdr_buf_pagecount(xdr), - xdr->page_len + xdr->page_base); - return xs_sendmsg(sock, msg, base + xdr->page_base); -} - -#define xs_record_marker_len() sizeof(rpc_fraghdr) - -/* Common case: - * - stream transport - * - sending from byte 0 of the message - * - the message is wholly contained in @xdr's head iovec - */ -static int xs_send_rm_and_kvec(struct socket *sock, struct msghdr *msg, - rpc_fraghdr marker, struct kvec *vec, size_t base) -{ - struct kvec iov[2] = { - [0] = { - .iov_base = &marker, - .iov_len = sizeof(marker) - }, - [1] = *vec, - }; - size_t len = iov[0].iov_len + iov[1].iov_len; - - iov_iter_kvec(&msg->msg_iter, WRITE, iov, 2, len); - return xs_sendmsg(sock, msg, base); -} - -/** - * xs_sendpages - write pages directly to a socket - * @sock: socket to send on - * @addr: UDP only -- address of destination - * @addrlen: UDP only -- length of destination address - * @xdr: buffer containing this request - * @base: starting position in the buffer - * @rm: stream record marker field - * @sent_p: return the total number of bytes successfully queued for sending - * - */ -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, rpc_fraghdr rm, int *sent_p) -{ - struct msghdr msg = { - .msg_name = addr, - .msg_namelen = addrlen, - .msg_flags = XS_SENDMSG_FLAGS | MSG_MORE, - }; - unsigned int rmsize = rm ? sizeof(rm) : 0; - unsigned int remainder = rmsize + xdr->len - base; - unsigned int want; - int err = 0; - - if (unlikely(!sock)) - return -ENOTSOCK; - - want = xdr->head[0].iov_len + rmsize; - if (base < want) { - unsigned int len = want - base; - remainder -= len; - if (remainder == 0) - msg.msg_flags &= ~MSG_MORE; - if (rmsize) - err = xs_send_rm_and_kvec(sock, &msg, rm, - &xdr->head[0], base); - else - err = xs_send_kvec(sock, &msg, &xdr->head[0], base); - if (remainder == 0 || err != len) - goto out; - *sent_p += err; - base = 0; - } else - base -= want; - - if (base < xdr->page_len) { - unsigned int len = xdr->page_len - base; - remainder -= len; - if (remainder == 0) - msg.msg_flags &= ~MSG_MORE; - err = xs_send_pagedata(sock, &msg, xdr, base); - if (remainder == 0 || err != len) - goto out; - *sent_p += err; - base = 0; - } else - base -= xdr->page_len; - - if (base >= xdr->tail[0].iov_len) - return 0; - msg.msg_flags &= ~MSG_MORE; - err = xs_send_kvec(sock, &msg, &xdr->tail[0], base); -out: - if (err > 0) { - *sent_p += err; - err = 0; - } - return err; -} - /** * xs_nospace - handle transmit was incomplete * @req: pointer to RPC request @@ -959,8 +841,11 @@ static int xs_local_send_request(struct rpc_rqst *req) struct xdr_buf *xdr = &req->rq_snd_buf; rpc_fraghdr rm = xs_stream_record_marker(xdr); unsigned int msglen = rm ? 
@@ -959,8 +841,11 @@ static int xs_local_send_request(struct rpc_rqst *req)
 	struct xdr_buf *xdr = &req->rq_snd_buf;
 	rpc_fraghdr rm = xs_stream_record_marker(xdr);
 	unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
+	struct msghdr msg = {
+		.msg_flags	= XS_SENDMSG_FLAGS,
+	};
+	unsigned int uninitialized_var(sent);
 	int status;
-	int sent = 0;
 
 	/* Close the stream if the previous transmission was incomplete */
 	if (xs_send_request_was_aborted(transport, req)) {
@@ -972,8 +857,8 @@ static int xs_local_send_request(struct rpc_rqst *req)
 			req->rq_svec->iov_base, req->rq_svec->iov_len);
 
 	req->rq_xtime = ktime_get();
-	status = xs_sendpages(transport->sock, NULL, 0, xdr,
-			      transport->xmit.offset, rm, &sent);
+	status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
+				   transport->xmit.offset, rm, &sent);
 	dprintk("RPC:       %s(%u) = %d\n", __func__,
 			xdr->len - transport->xmit.offset, status);
@@ -1025,7 +910,12 @@ static int xs_udp_send_request(struct rpc_rqst *req)
 	struct rpc_xprt *xprt = req->rq_xprt;
 	struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
 	struct xdr_buf *xdr = &req->rq_snd_buf;
-	int sent = 0;
+	struct msghdr msg = {
+		.msg_name	= xs_addr(xprt),
+		.msg_namelen	= xprt->addrlen,
+		.msg_flags	= XS_SENDMSG_FLAGS,
+	};
+	unsigned int uninitialized_var(sent);
 	int status;
 
 	xs_pktdump("packet data:",
@@ -1039,8 +929,7 @@ static int xs_udp_send_request(struct rpc_rqst *req)
 		return -EBADSLT;
 
 	req->rq_xtime = ktime_get();
-	status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen,
-			      xdr, 0, 0, &sent);
+	status = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, 0, &sent);
 
 	dprintk("RPC:       xs_udp_send_request(%u) = %d\n",
 			xdr->len, status);
@@ -1106,9 +995,12 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 	struct xdr_buf *xdr = &req->rq_snd_buf;
 	rpc_fraghdr rm = xs_stream_record_marker(xdr);
 	unsigned int msglen = rm ? req->rq_slen + sizeof(rm) : req->rq_slen;
+	struct msghdr msg = {
+		.msg_flags	= XS_SENDMSG_FLAGS,
+	};
 	bool vm_wait = false;
+	unsigned int uninitialized_var(sent);
 	int status;
-	int sent;
 
 	/* Close the stream if the previous transmission was incomplete */
 	if (xs_send_request_was_aborted(transport, req)) {
@@ -1129,9 +1021,8 @@ static int xs_tcp_send_request(struct rpc_rqst *req)
 	 * called sendmsg(). */
 	req->rq_xtime = ktime_get();
 	while (1) {
-		sent = 0;
-		status = xs_sendpages(transport->sock, NULL, 0, xdr,
-				      transport->xmit.offset, rm, &sent);
+		status = xprt_sock_sendmsg(transport->sock, &msg, xdr,
+					   transport->xmit.offset, rm, &sent);
 
 		dprintk("RPC:       xs_tcp_send_request(%u) = %d\n",
 			xdr->len - transport->xmit.offset, status);
@@ -1970,7 +1861,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
 	struct rpc_xprt *xprt = &transport->xprt;
 	struct file *filp;
 	struct socket *sock;
-	int status = -EIO;
+	int status;
 
 	status = __sock_create(xprt->xprt_net, AF_LOCAL,
 					SOCK_STREAM, 0, &sock, 1);
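The stream senders above size the message as req->rq_slen plus a 4-byte record marker whenever one is present. For reference, this is RFC 5531 record marking, where RPC_LAST_STREAM_FRAGMENT is the top bit of the marker word; the concrete length 300 below is an invented example:

/* Worked example (illustrative): the marker xs_stream_record_marker()
 * produces for a 300-byte message sent as a single, final fragment.
 */
rpc_fraghdr rm = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | 300);
/* On the wire: 0x80 0x00 0x01 0x2c, followed by the 300 message bytes,
 * so msglen = req->rq_slen + sizeof(rm) = 304 on stream sockets.
 * UDP passes rm == 0 and msglen stays req->rq_slen.
 */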
@@ -2636,46 +2527,25 @@ static void bc_free(struct rpc_task *task)
 	free_page((unsigned long)buf);
 }
 
-/*
- * Use the svc_sock to send the callback. Must be called with svsk->sk_mutex
- * held. Borrows heavily from svc_tcp_sendto and xs_tcp_send_request.
- */
 static int bc_sendto(struct rpc_rqst *req)
 {
-	int len;
-	struct xdr_buf *xbufp = &req->rq_snd_buf;
+	struct xdr_buf *xdr = &req->rq_snd_buf;
 	struct sock_xprt *transport =
 			container_of(req->rq_xprt, struct sock_xprt, xprt);
-	unsigned long headoff;
-	unsigned long tailoff;
-	struct page *tailpage;
 	struct msghdr msg = {
-		.msg_flags	= MSG_MORE
+		.msg_flags	= 0,
 	};
 	rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT |
-					 (u32)xbufp->len);
-	struct kvec iov = {
-		.iov_base	= &marker,
-		.iov_len	= sizeof(marker),
-	};
+					 (u32)xdr->len);
+	unsigned int sent = 0;
+	int err;
 
 	req->rq_xtime = ktime_get();
-
-	len = kernel_sendmsg(transport->sock, &msg, &iov, 1, iov.iov_len);
-	if (len != iov.iov_len)
+	err = xprt_sock_sendmsg(transport->sock, &msg, xdr, 0, marker, &sent);
+	xdr_free_bvec(xdr);
+	if (err < 0 || sent != (xdr->len + sizeof(marker)))
 		return -EAGAIN;
-
-	tailpage = NULL;
-	if (xbufp->tail[0].iov_len)
-		tailpage = virt_to_page(xbufp->tail[0].iov_base);
-	tailoff = (unsigned long)xbufp->tail[0].iov_base & ~PAGE_MASK;
-	headoff = (unsigned long)xbufp->head[0].iov_base & ~PAGE_MASK;
-	len = svc_send_common(transport->sock, xbufp,
-			      virt_to_page(xbufp->head[0].iov_base), headoff,
-			      tailpage, tailoff);
-	if (len != xbufp->len)
-		return -EAGAIN;
-	return len;
+	return sent;
 }
 
 /*
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 156efce50dbd..0e989005bdc2 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -56,9 +56,9 @@ enum {
 	TLS_NUM_PROTS,
 };
 
-static struct proto *saved_tcpv6_prot;
+static const struct proto *saved_tcpv6_prot;
 static DEFINE_MUTEX(tcpv6_prot_mutex);
-static struct proto *saved_tcpv4_prot;
+static const struct proto *saved_tcpv4_prot;
 static DEFINE_MUTEX(tcpv4_prot_mutex);
 static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG];
 static struct proto_ops tls_sw_proto_ops;
diff --git a/net/wireless/.gitignore b/net/wireless/.gitignore
index 61cbc304a3d3..1a29cd69d6cf 100644
--- a/net/wireless/.gitignore
+++ b/net/wireless/.gitignore
@@ -1,2 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0-only
 shipped-certs.c
 extra-certs.c
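The tls_main.c hunk is a pure const-correctness change: the saved tcpv4/tcpv6 base protocols are only ever read as templates for the patched copies kept in tls_prots, so the saved pointers can be const. A rough sketch of that copy-then-override pattern; the function name and the commented override below are assumptions for illustration, not part of this patch:

/* Illustrative sketch: the original proto is read through a const
 * pointer and never modified; the TLS callbacks are installed on the
 * copy alone.
 */
static void example_build_tls_proto(const struct proto *base,
				    struct proto *tls)
{
	*tls = *base;			/* struct copy; base stays untouched */
	/* tls->close = tls_sk_proto_close;   override on the copy only */
}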