diff options
Diffstat (limited to 'net')
-rw-r--r-- | net/8021q/vlan.c | 34 | ||||
-rw-r--r-- | net/8021q/vlan.h | 2 | ||||
-rw-r--r-- | net/8021q/vlan_dev.c | 5 | ||||
-rw-r--r-- | net/core/dev.c | 8 | ||||
-rw-r--r-- | net/core/pktgen.c | 4 | ||||
-rw-r--r-- | net/ipv4/route.c | 2 | ||||
-rw-r--r-- | net/ipv6/addrconf.c | 75 | ||||
-rw-r--r-- | net/ipv6/ndisc.c | 8 | ||||
-rw-r--r-- | net/ipv6/route.c | 12 | ||||
-rw-r--r-- | net/mac80211/mlme.c | 14 | ||||
-rw-r--r-- | net/mac80211/util.c | 37 | ||||
-rw-r--r-- | net/sched/cls_api.c | 2 | ||||
-rw-r--r-- | net/sunrpc/svc_xprt.c | 23 | ||||
-rw-r--r-- | net/sunrpc/svcauth_unix.c | 4 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 102 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_sendto.c | 11 | ||||
-rw-r--r-- | net/sunrpc/xprtrdma/svc_rdma_transport.c | 290 |
17 files changed, 371 insertions, 262 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 2a739adaa92b..51961300b586 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -382,6 +382,24 @@ static void vlan_sync_address(struct net_device *dev, memcpy(vlan->real_dev_addr, dev->dev_addr, ETH_ALEN); } +static void vlan_transfer_features(struct net_device *dev, + struct net_device *vlandev) +{ + unsigned long old_features = vlandev->features; + + if (dev->features & NETIF_F_VLAN_TSO) { + vlandev->features &= ~VLAN_TSO_FEATURES; + vlandev->features |= dev->features & VLAN_TSO_FEATURES; + } + if (dev->features & NETIF_F_VLAN_CSUM) { + vlandev->features &= ~NETIF_F_ALL_CSUM; + vlandev->features |= dev->features & NETIF_F_ALL_CSUM; + } + + if (old_features != vlandev->features) + netdev_features_change(vlandev); +} + static void __vlan_device_event(struct net_device *dev, unsigned long event) { switch (event) { @@ -410,10 +428,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, int i, flgs; struct net_device *vlandev; - if (is_vlan_dev(dev)) { + if (is_vlan_dev(dev)) __vlan_device_event(dev, event); - goto out; - } grp = __vlan_find_group(dev); if (!grp) @@ -450,6 +466,18 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, } break; + case NETDEV_FEAT_CHANGE: + /* Propagate device features to underlying device */ + for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + vlandev = vlan_group_get_device(grp, i); + if (!vlandev) + continue; + + vlan_transfer_features(dev, vlandev); + } + + break; + case NETDEV_DOWN: /* Put all VLANs for this dev in the down state too. */ for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 5229a72c7ea1..79625696e86a 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -7,6 +7,8 @@ #define VLAN_GRP_HASH_SIZE (1 << VLAN_GRP_HASH_SHIFT) #define VLAN_GRP_HASH_MASK (VLAN_GRP_HASH_SIZE - 1) +#define VLAN_TSO_FEATURES (NETIF_F_TSO | NETIF_F_TSO6 | NETIF_F_SG) + /* Find a VLAN device by the MAC address of its Ethernet device, and * it's VLAN ID. The default configuration is to have VLAN's scope * to be box-wide, so the MAC will be ignored. The mac will only be diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index c961f0826005..b1cfbaa88db2 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -663,6 +663,11 @@ static int vlan_dev_init(struct net_device *dev) (1<<__LINK_STATE_DORMANT))) | (1<<__LINK_STATE_PRESENT); + if (real_dev->features & NETIF_F_VLAN_TSO) + dev->features |= real_dev->features & VLAN_TSO_FEATURES; + if (real_dev->features & NETIF_F_VLAN_CSUM) + dev->features |= real_dev->features & NETIF_F_ALL_CSUM; + /* ipv6 shared card related stuff */ dev->dev_id = real_dev->dev_id; diff --git a/net/core/dev.c b/net/core/dev.c index a1607bc0cd4c..582963077877 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -903,7 +903,11 @@ int dev_change_name(struct net_device *dev, char *newname) strlcpy(dev->name, newname, IFNAMSIZ); rollback: - device_rename(&dev->dev, dev->name); + err = device_rename(&dev->dev, dev->name); + if (err) { + memcpy(dev->name, oldname, IFNAMSIZ); + return err; + } write_lock_bh(&dev_base_lock); hlist_del(&dev->name_hlist); @@ -3137,7 +3141,7 @@ int dev_change_flags(struct net_device *dev, unsigned flags) * Load in the correct multicast list now the flags have changed. */ - if (dev->change_rx_flags && (dev->flags ^ flags) & IFF_MULTICAST) + if (dev->change_rx_flags && (old_flags ^ flags) & IFF_MULTICAST) dev->change_rx_flags(dev, IFF_MULTICAST); dev_set_rx_mode(dev); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 8dca21110493..fdf537707e51 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -390,6 +390,7 @@ struct pktgen_thread { int cpu; wait_queue_head_t queue; + struct completion start_done; }; #define REMOVE 1 @@ -3414,6 +3415,7 @@ static int pktgen_thread_worker(void *arg) BUG_ON(smp_processor_id() != cpu); init_waitqueue_head(&t->queue); + complete(&t->start_done); pr_debug("pktgen: starting pktgen/%d: pid=%d\n", cpu, task_pid_nr(current)); @@ -3615,6 +3617,7 @@ static int __init pktgen_create_thread(int cpu) INIT_LIST_HEAD(&t->if_list); list_add_tail(&t->th_list, &pktgen_threads); + init_completion(&t->start_done); p = kthread_create(pktgen_thread_worker, t, "kpktgend_%d", cpu); if (IS_ERR(p)) { @@ -3639,6 +3642,7 @@ static int __init pktgen_create_thread(int cpu) } wake_up_process(p); + wait_for_completion(&t->start_done); return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 92f90ae46f4a..df41026b60db 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -160,7 +160,7 @@ static struct dst_ops ipv4_dst_ops = { .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, - .local_out = ip_local_out, + .local_out = __ip_local_out, .entry_size = sizeof(struct rtable), .entries = ATOMIC_INIT(0), }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e591e09e5e4e..3a835578fd1c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1764,14 +1764,16 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) * 2) Configure prefixes with the auto flag set */ - /* Avoid arithmetic overflow. Really, we could - save rt_expires in seconds, likely valid_lft, - but it would require division in fib gc, that it - not good. - */ - if (valid_lft >= 0x7FFFFFFF/HZ) + if (valid_lft == INFINITY_LIFE_TIME) + rt_expires = ~0UL; + else if (valid_lft >= 0x7FFFFFFF/HZ) { + /* Avoid arithmetic overflow. Really, we could + * save rt_expires in seconds, likely valid_lft, + * but it would require division in fib gc, that it + * not good. + */ rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ); - else + } else rt_expires = valid_lft * HZ; /* @@ -1779,7 +1781,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) * Avoid arithmetic overflow there as well. * Overflow can happen only if HZ < USER_HZ. */ - if (HZ < USER_HZ && rt_expires > 0x7FFFFFFF / USER_HZ) + if (HZ < USER_HZ && ~rt_expires && rt_expires > 0x7FFFFFFF / USER_HZ) rt_expires = 0x7FFFFFFF / USER_HZ; if (pinfo->onlink) { @@ -1788,17 +1790,28 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) dev->ifindex, 1); if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { - if (rt->rt6i_flags&RTF_EXPIRES) { - if (valid_lft == 0) { - ip6_del_rt(rt); - rt = NULL; - } else { - rt->rt6i_expires = jiffies + rt_expires; - } + /* Autoconf prefix route */ + if (valid_lft == 0) { + ip6_del_rt(rt); + rt = NULL; + } else if (~rt_expires) { + /* not infinity */ + rt->rt6i_expires = jiffies + rt_expires; + rt->rt6i_flags |= RTF_EXPIRES; + } else { + rt->rt6i_flags &= ~RTF_EXPIRES; + rt->rt6i_expires = 0; } } else if (valid_lft) { + int flags = RTF_ADDRCONF | RTF_PREFIX_RT; + clock_t expires = 0; + if (~rt_expires) { + /* not infinity */ + flags |= RTF_EXPIRES; + expires = jiffies_to_clock_t(rt_expires); + } addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, - dev, jiffies_to_clock_t(rt_expires), RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); + dev, expires, flags); } if (rt) dst_release(&rt->u.dst); @@ -2021,7 +2034,8 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, struct inet6_dev *idev; struct net_device *dev; int scope; - u32 flags = RTF_EXPIRES; + u32 flags; + clock_t expires; ASSERT_RTNL(); @@ -2041,8 +2055,13 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, if (valid_lft == INFINITY_LIFE_TIME) { ifa_flags |= IFA_F_PERMANENT; flags = 0; - } else if (valid_lft >= 0x7FFFFFFF/HZ) - valid_lft = 0x7FFFFFFF/HZ; + expires = 0; + } else { + if (valid_lft >= 0x7FFFFFFF/HZ) + valid_lft = 0x7FFFFFFF/HZ; + flags = RTF_EXPIRES; + expires = jiffies_to_clock_t(valid_lft * HZ); + } if (prefered_lft == 0) ifa_flags |= IFA_F_DEPRECATED; @@ -2060,7 +2079,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, spin_unlock_bh(&ifp->lock); addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, - jiffies_to_clock_t(valid_lft * HZ), flags); + expires, flags); /* * Note that section 3.1 of RFC 4429 indicates * that the Optimistic flag should not be set for @@ -3148,7 +3167,8 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, u32 prefered_lft, u32 valid_lft) { - u32 flags = RTF_EXPIRES; + u32 flags; + clock_t expires; if (!valid_lft || (prefered_lft > valid_lft)) return -EINVAL; @@ -3156,8 +3176,13 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, if (valid_lft == INFINITY_LIFE_TIME) { ifa_flags |= IFA_F_PERMANENT; flags = 0; - } else if (valid_lft >= 0x7FFFFFFF/HZ) - valid_lft = 0x7FFFFFFF/HZ; + expires = 0; + } else { + if (valid_lft >= 0x7FFFFFFF/HZ) + valid_lft = 0x7FFFFFFF/HZ; + flags = RTF_EXPIRES; + expires = jiffies_to_clock_t(valid_lft * HZ); + } if (prefered_lft == 0) ifa_flags |= IFA_F_DEPRECATED; @@ -3176,7 +3201,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, ipv6_ifa_notify(0, ifp); addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->idev->dev, - jiffies_to_clock_t(valid_lft * HZ), flags); + expires, flags); addrconf_verify(0); return 0; @@ -4242,7 +4267,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev) neigh_sysctl_register(idev->dev, idev->nd_parms, NET_IPV6, NET_IPV6_NEIGH, "ipv6", &ndisc_ifinfo_sysctl_change, - NULL); + ndisc_ifinfo_sysctl_strategy); __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name, idev->dev->ifindex, idev, &idev->cnf); } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a55fc05b8125..282fdb31f8ed 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1727,10 +1727,10 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f return ret; } -static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, - int nlen, void __user *oldval, - size_t __user *oldlenp, - void __user *newval, size_t newlen) +int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, + int nlen, void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 12bba0880345..48534c6c0735 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -109,7 +109,7 @@ static struct dst_ops ip6_dst_ops_template = { .negative_advice = ip6_negative_advice, .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, - .local_out = ip6_local_out, + .local_out = __ip6_local_out, .entry_size = sizeof(struct rt6_info), .entries = ATOMIC_INIT(0), }; @@ -475,7 +475,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, lifetime = ntohl(rinfo->lifetime); if (lifetime == 0xffffffff) { /* infinity */ - } else if (lifetime > 0x7fffffff/HZ) { + } else if (lifetime > 0x7fffffff/HZ - 1) { /* Avoid arithmetic overflow */ lifetime = 0x7fffffff/HZ - 1; } @@ -1106,7 +1106,9 @@ int ip6_route_add(struct fib6_config *cfg) } rt->u.dst.obsolete = -1; - rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires); + rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ? + jiffies + clock_t_to_jiffies(cfg->fc_expires) : + 0; if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -2200,7 +2202,9 @@ static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric); - expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0; + expires = (rt->rt6i_flags & RTF_EXPIRES) ? + rt->rt6i_expires - jiffies : 0; + if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0, expires, rt->u.dst.error) < 0) goto nla_put_failure; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 4adba09e80ca..e470bf12b765 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3446,21 +3446,17 @@ static int ieee80211_sta_config_auth(struct net_device *dev, struct ieee80211_sta_bss *bss, *selected = NULL; int top_rssi = 0, freq; - if (!(ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL | - IEEE80211_STA_AUTO_BSSID_SEL | IEEE80211_STA_AUTO_CHANNEL_SEL))) { - ifsta->state = IEEE80211_AUTHENTICATE; - ieee80211_sta_reset_auth(dev, ifsta); - return 0; - } - spin_lock_bh(&local->sta_bss_lock); freq = local->oper_channel->center_freq; list_for_each_entry(bss, &local->sta_bss_list, list) { if (!(bss->capability & WLAN_CAPABILITY_ESS)) continue; - if (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^ - !!sdata->default_key) + if ((ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL | + IEEE80211_STA_AUTO_BSSID_SEL | + IEEE80211_STA_AUTO_CHANNEL_SEL)) && + (!!(bss->capability & WLAN_CAPABILITY_PRIVACY) ^ + !!sdata->default_key)) continue; if (!(ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) && diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 24a465c4df09..131e9e6c8a32 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -389,6 +389,41 @@ void ieee80211_iterate_active_interfaces( struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; + rtnl_lock(); + + list_for_each_entry(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case IEEE80211_IF_TYPE_INVALID: + case IEEE80211_IF_TYPE_MNTR: + case IEEE80211_IF_TYPE_VLAN: + continue; + case IEEE80211_IF_TYPE_AP: + case IEEE80211_IF_TYPE_STA: + case IEEE80211_IF_TYPE_IBSS: + case IEEE80211_IF_TYPE_WDS: + case IEEE80211_IF_TYPE_MESH_POINT: + break; + } + if (sdata->dev == local->mdev) + continue; + if (netif_running(sdata->dev)) + iterator(data, sdata->dev->dev_addr, + &sdata->vif); + } + + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces); + +void ieee80211_iterate_active_interfaces_atomic( + struct ieee80211_hw *hw, + void (*iterator)(void *data, u8 *mac, + struct ieee80211_vif *vif), + void *data) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata; + rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { @@ -413,4 +448,4 @@ void ieee80211_iterate_active_interfaces( rcu_read_unlock(); } -EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces); +EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 1086df7478bc..9360fc81e8c7 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -220,7 +220,7 @@ replay: tp = kzalloc(sizeof(*tp), GFP_KERNEL); if (tp == NULL) goto errout; - err = -EINVAL; + err = -ENOENT; tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND]); if (tp_ops == NULL) { #ifdef CONFIG_KMOD diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d8e8d79a8451..e46c825f4954 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -6,30 +6,9 @@ #include <linux/sched.h> #include <linux/errno.h> -#include <linux/fcntl.h> -#include <linux/net.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/udp.h> -#include <linux/tcp.h> -#include <linux/unistd.h> -#include <linux/slab.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/file.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <net/sock.h> -#include <net/checksum.h> -#include <net/ip.h> -#include <net/ipv6.h> -#include <net/tcp_states.h> -#include <linux/uaccess.h> -#include <asm/ioctls.h> - -#include <linux/sunrpc/types.h> -#include <linux/sunrpc/clnt.h> -#include <linux/sunrpc/xdr.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svc_xprt.h> @@ -296,8 +275,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) if (!(xprt->xpt_flags & ((1<<XPT_CONN)|(1<<XPT_DATA)|(1<<XPT_CLOSE)|(1<<XPT_DEFERRED)))) return; - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) - return; cpu = get_cpu(); pool = svc_pool_for_cpu(xprt->xpt_server, cpu); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 3f30ee6006ae..f24800f2c098 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -278,7 +278,7 @@ static int ip_map_show(struct seq_file *m, dom = im->m_client->h.name; if (ipv6_addr_v4mapped(&addr)) { - seq_printf(m, "%s" NIPQUAD_FMT "%s\n", + seq_printf(m, "%s " NIPQUAD_FMT " %s\n", im->m_class, ntohl(addr.s6_addr32[3]) >> 24 & 0xff, ntohl(addr.s6_addr32[3]) >> 16 & 0xff, @@ -286,7 +286,7 @@ static int ip_map_show(struct seq_file *m, ntohl(addr.s6_addr32[3]) >> 0 & 0xff, dom); } else { - seq_printf(m, "%s" NIP6_FMT "%s\n", + seq_printf(m, "%s " NIP6_FMT " %s\n", im->m_class, NIP6(addr), dom); } return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c22d6b6f2db4..06ab4841537b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -260,11 +260,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) * On our side, we need to read into a pagelist. The first page immediately * follows the RPC header. * - * This function returns 1 to indicate success. The data is not yet in + * This function returns: + * 0 - No error and no read-list found. + * + * 1 - Successful read-list processing. The data is not yet in * the pagelist and therefore the RPC request must be deferred. The * I/O completion will enqueue the transport again and * svc_rdma_recvfrom will complete the request. * + * <0 - Error processing/posting read-list. + * * NOTE: The ctxt must not be touched after the last WR has been posted * because the I/O completion processing may occur on another * processor and free / modify the context. Ne touche pas! @@ -284,7 +289,6 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, u64 sgl_offset; struct rpcrdma_read_chunk *ch; struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_op_ctxt *head; struct svc_rdma_op_ctxt *tmp_sge_ctxt; struct svc_rdma_op_ctxt *tmp_ch_ctxt; struct chunk_sge *ch_sge_ary; @@ -302,25 +306,19 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); + if (ch_count > RPCSVC_MAXPAGES) + return -EINVAL; sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, sge, ch_sge_ary, ch_count, byte_count); - head = svc_rdma_get_context(xprt); sgl_offset = 0; ch_no = 0; for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; ch->rc_discrim != 0; ch++, ch_no++) { next_sge: - if (!ctxt) - ctxt = head; - else { - ctxt->next = svc_rdma_get_context(xprt); - ctxt = ctxt->next; - } - ctxt->next = NULL; + ctxt = svc_rdma_get_context(xprt); ctxt->direction = DMA_FROM_DEVICE; - clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); /* Prepare READ WR */ @@ -347,20 +345,15 @@ next_sge: * the client and the RPC needs to be enqueued. */ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - ctxt->next = hdr_ctxt; - hdr_ctxt->next = head; + ctxt->read_hdr = hdr_ctxt; } /* Post the read */ err = svc_rdma_send(xprt, &read_wr); if (err) { - printk(KERN_ERR "svcrdma: Error posting send = %d\n", + printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", err); - /* - * Break the circular list so free knows when - * to stop if the error happened to occur on - * the last read - */ - ctxt->next = NULL; + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 0); goto out; } atomic_inc(&rdma_stat_read); @@ -371,7 +364,7 @@ next_sge: goto next_sge; } sgl_offset = 0; - err = 0; + err = 1; } out: @@ -389,25 +382,12 @@ next_sge: while (rqstp->rq_resused) rqstp->rq_respages[--rqstp->rq_resused] = NULL; - if (err) { - printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - /* Free the linked list of read contexts */ - while (head != NULL) { - ctxt = head->next; - svc_rdma_put_context(head, 1); - head = ctxt; - } - return 0; - } - - return 1; + return err; } static int rdma_read_complete(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *data) + struct svc_rdma_op_ctxt *head) { - struct svc_rdma_op_ctxt *head = data->next; int page_no; int ret; @@ -433,21 +413,12 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.len = head->arg.len; rqstp->rq_arg.buflen = head->arg.buflen; + /* Free the context */ + svc_rdma_put_context(head, 0); + /* XXX: What should this be? */ rqstp->rq_prot = IPPROTO_MAX; - - /* - * Free the contexts we used to build the RDMA_READ. We have - * to be careful here because the context list uses the same - * next pointer used to chain the contexts associated with the - * RDMA_READ - */ - data->next = NULL; /* terminate circular list */ - do { - data = head->next; - svc_rdma_put_context(head, 0); - head = data; - } while (head != NULL); + svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt); ret = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len @@ -457,8 +428,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp, ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); - /* Indicate that we've consumed an RQ credit */ - rqstp->rq_xprt_ctxt = rqstp->rq_xprt; svc_xprt_received(rqstp->rq_xprt); return ret; } @@ -480,13 +449,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dprintk("svcrdma: rqstp=%p\n", rqstp); - /* - * The rq_xprt_ctxt indicates if we've consumed an RQ credit - * or not. It is used in the rdma xpo_release_rqst function to - * determine whether or not to return an RQ WQE to the RQ. - */ - rqstp->rq_xprt_ctxt = NULL; - spin_lock_bh(&rdma_xprt->sc_read_complete_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, @@ -537,21 +499,22 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) /* If the request is invalid, reply with an error */ if (len < 0) { if (len == -ENOSYS) - (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); + svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); goto close_out; } - /* Read read-list data. If we would need to wait, defer - * it. Not that in this case, we don't return the RQ credit - * until after the read completes. - */ - if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) { + /* Read read-list data. */ + ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); + if (ret > 0) { + /* read-list posted, defer until data received from client. */ svc_xprt_received(xprt); return 0; } - - /* Indicate we've consumed an RQ credit */ - rqstp->rq_xprt_ctxt = rqstp->rq_xprt; + if (ret < 0) { + /* Post of read-list failed, free context. */ + svc_rdma_put_context(ctxt, 1); + return 0; + } ret = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len @@ -569,11 +532,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return ret; close_out: - if (ctxt) { + if (ctxt) svc_rdma_put_context(ctxt, 1); - /* Indicate we've consumed an RQ credit */ - rqstp->rq_xprt_ctxt = rqstp->rq_xprt; - } dprintk("svcrdma: transport %p is closing\n", xprt); /* * Set the close bit and enqueue it. svc_recv will see the diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 981f190c1b39..fb82b1b683f8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -389,6 +389,17 @@ static int send_reply(struct svcxprt_rdma *rdma, int page_no; int ret; + /* Post a recv buffer to handle another request. */ + ret = svc_rdma_post_recv(rdma); + if (ret) { + printk(KERN_INFO + "svcrdma: could not post a receive buffer, err=%d." + "Closing transport %p.\n", ret, rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 0); + return -ENOTCONN; + } + /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index af408fc12634..e132509d1db0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -103,8 +103,8 @@ static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) spin_lock_bh(&xprt->sc_ctxt_lock); if (ctxt) { at_least_one = 1; - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + INIT_LIST_HEAD(&ctxt->free_list); + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); } else { /* kmalloc failed...give up for now */ xprt->sc_ctxt_cnt--; @@ -123,7 +123,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) while (1) { spin_lock_bh(&xprt->sc_ctxt_lock); - if (unlikely(xprt->sc_ctxt_head == NULL)) { + if (unlikely(list_empty(&xprt->sc_ctxt_free))) { /* Try to bump my cache. */ spin_unlock_bh(&xprt->sc_ctxt_lock); @@ -136,12 +136,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); continue; } - ctxt = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt->next; + ctxt = list_entry(xprt->sc_ctxt_free.next, + struct svc_rdma_op_ctxt, + free_list); + list_del_init(&ctxt->free_list); spin_unlock_bh(&xprt->sc_ctxt_lock); ctxt->xprt = xprt; INIT_LIST_HEAD(&ctxt->dto_q); ctxt->count = 0; + atomic_inc(&xprt->sc_ctxt_used); break; } return ctxt; @@ -159,14 +162,15 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) put_page(ctxt->pages[i]); for (i = 0; i < ctxt->count; i++) - dma_unmap_single(xprt->sc_cm_id->device->dma_device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); + spin_lock_bh(&xprt->sc_ctxt_lock); - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); spin_unlock_bh(&xprt->sc_ctxt_lock); + atomic_dec(&xprt->sc_ctxt_used); } /* ib_cq event handler */ @@ -228,23 +232,8 @@ static void dto_tasklet_func(unsigned long data) list_del_init(&xprt->sc_dto_q); spin_unlock_irqrestore(&dto_lock, flags); - if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) { - ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); - rq_cq_reap(xprt); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - /* - * If data arrived before established event, - * don't enqueue. This defers RPC I/O until the - * RDMA connection is complete. - */ - if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) - svc_xprt_enqueue(&xprt->sc_xprt); - } - - if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) { - ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); - sq_cq_reap(xprt); - } + rq_cq_reap(xprt); + sq_cq_reap(xprt); svc_xprt_put(&xprt->sc_xprt); spin_lock_irqsave(&dto_lock, flags); @@ -263,11 +252,15 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context) struct svcxprt_rdma *xprt = cq_context; unsigned long flags; + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + /* * Set the bit regardless of whether or not it's on the list * because it may be on the list already due to an SQ * completion. - */ + */ set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); /* @@ -290,6 +283,8 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context) * * Take all completing WC off the CQE and enqueue the associated DTO * context on the dto_q for the transport. + * + * Note that caller must hold a transport reference. */ static void rq_cq_reap(struct svcxprt_rdma *xprt) { @@ -297,29 +292,47 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) struct ib_wc wc; struct svc_rdma_op_ctxt *ctxt = NULL; + if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_rq_poll); - spin_lock_bh(&xprt->sc_rq_dto_lock); while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; ctxt->wc_status = wc.status; ctxt->byte_len = wc.byte_len; if (wc.status != IB_WC_SUCCESS) { /* Close the transport */ + dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); svc_rdma_put_context(ctxt, 1); + svc_xprt_put(&xprt->sc_xprt); continue; } + spin_lock_bh(&xprt->sc_rq_dto_lock); list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_put(&xprt->sc_xprt); } - spin_unlock_bh(&xprt->sc_rq_dto_lock); if (ctxt) atomic_inc(&rdma_stat_rq_prod); + + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + /* + * If data arrived before established event, + * don't enqueue. This defers RPC I/O until the + * RDMA connection is complete. + */ + if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + svc_xprt_enqueue(&xprt->sc_xprt); } /* * Send Queue Completion Handler - potentially called on interrupt context. + * + * Note that caller must hold a transport reference. */ static void sq_cq_reap(struct svcxprt_rdma *xprt) { @@ -328,6 +341,11 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) struct ib_cq *cq = xprt->sc_sq_cq; int ret; + + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; @@ -349,14 +367,16 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) case IB_WR_RDMA_READ: if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); spin_lock_bh(&xprt->sc_read_complete_lock); - list_add_tail(&ctxt->dto_q, + list_add_tail(&read_hdr->dto_q, &xprt->sc_read_complete_q); spin_unlock_bh(&xprt->sc_read_complete_lock); svc_xprt_enqueue(&xprt->sc_xprt); } + svc_rdma_put_context(ctxt, 0); break; default: @@ -365,6 +385,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) wc.opcode, wc.status); break; } + svc_xprt_put(&xprt->sc_xprt); } if (ctxt) @@ -376,11 +397,15 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context) struct svcxprt_rdma *xprt = cq_context; unsigned long flags; + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + /* * Set the bit regardless of whether or not it's on the list * because it may be on the list already due to an RQ * completion. - */ + */ set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); /* @@ -407,28 +432,29 @@ static void create_context_cache(struct svcxprt_rdma *xprt, xprt->sc_ctxt_max = ctxt_max; xprt->sc_ctxt_bump = ctxt_bump; xprt->sc_ctxt_cnt = 0; - xprt->sc_ctxt_head = NULL; + atomic_set(&xprt->sc_ctxt_used, 0); + + INIT_LIST_HEAD(&xprt->sc_ctxt_free); for (i = 0; i < ctxt_count; i++) { ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); if (ctxt) { - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + INIT_LIST_HEAD(&ctxt->free_list); + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); xprt->sc_ctxt_cnt++; } } } -static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt) +static void destroy_context_cache(struct svcxprt_rdma *xprt) { - struct svc_rdma_op_ctxt *next; - if (!ctxt) - return; - - do { - next = ctxt->next; + while (!list_empty(&xprt->sc_ctxt_free)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(xprt->sc_ctxt_free.next, + struct svc_rdma_op_ctxt, + free_list); + list_del_init(&ctxt->free_list); kfree(ctxt); - ctxt = next; - } while (next); + } } static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, @@ -465,7 +491,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, reqs + cma_xprt->sc_sq_depth + RPCRDMA_MAX_THREADS + 1); /* max */ - if (!cma_xprt->sc_ctxt_head) { + if (list_empty(&cma_xprt->sc_ctxt_free)) { kfree(cma_xprt); return NULL; } @@ -520,7 +546,12 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) recv_wr.num_sge = ctxt->count; recv_wr.wr_id = (u64)(unsigned long)ctxt; + svc_xprt_get(&xprt->sc_xprt); ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); + if (ret) { + svc_xprt_put(&xprt->sc_xprt); + svc_rdma_put_context(ctxt, 1); + } return ret; } @@ -539,6 +570,7 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; + struct sockaddr *sa; /* Create a new transport */ newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); @@ -551,6 +583,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + /* Set the local and remote addresses in the transport */ + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + /* * Enqueue the new transport on the accept queue of the listening * transport @@ -627,6 +665,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, if (xprt) { set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } break; case RDMA_CM_EVENT_DEVICE_REMOVAL: @@ -661,31 +700,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, cma_xprt = rdma_create_xprt(serv, 1); if (!cma_xprt) - return ERR_PTR(ENOMEM); + return ERR_PTR(-ENOMEM); xprt = &cma_xprt->sc_xprt; listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); if (IS_ERR(listen_id)) { - svc_xprt_put(&cma_xprt->sc_xprt); - dprintk("svcrdma: rdma_create_id failed = %ld\n", - PTR_ERR(listen_id)); - return (void *)listen_id; + ret = PTR_ERR(listen_id); + dprintk("svcrdma: rdma_create_id failed = %d\n", ret); + goto err0; } + ret = rdma_bind_addr(listen_id, sa); if (ret) { - rdma_destroy_id(listen_id); - svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); - return ERR_PTR(ret); + goto err1; } cma_xprt->sc_cm_id = listen_id; ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); if (ret) { - rdma_destroy_id(listen_id); - svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_listen failed = %d\n", ret); - return ERR_PTR(ret); + goto err1; } /* @@ -696,6 +731,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); return &cma_xprt->sc_xprt; + + err1: + rdma_destroy_id(listen_id); + err0: + kfree(cma_xprt); + return ERR_PTR(ret); } /* @@ -716,7 +757,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; - struct sockaddr *sa; int ret; int i; @@ -826,7 +866,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; } - svc_xprt_get(&newxprt->sc_xprt); newxprt->sc_qp = newxprt->sc_cm_id->qp; /* Register all of physical memory */ @@ -850,6 +889,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; + /* + * Arm the CQs for the SQ and RQ before accepting so we can't + * miss the first message + */ + ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); + /* Accept Connection */ set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); memset(&conn_param, 0, sizeof conn_param); @@ -886,58 +932,26 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_requests, newxprt->sc_ord); - /* Set the local and remote addresses in the transport */ - sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); - sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; - svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); - - ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); - ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); return &newxprt->sc_xprt; errout: dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); /* Take a reference in case the DTO handler runs */ svc_xprt_get(&newxprt->sc_xprt); - if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) { + if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) ib_destroy_qp(newxprt->sc_qp); - svc_xprt_put(&newxprt->sc_xprt); - } rdma_destroy_id(newxprt->sc_cm_id); /* This call to put will destroy the transport */ svc_xprt_put(&newxprt->sc_xprt); return NULL; } -/* - * Post an RQ WQE to the RQ when the rqst is being released. This - * effectively returns an RQ credit to the client. The rq_xprt_ctxt - * will be null if the request is deferred due to an RDMA_READ or the - * transport had no data ready (EAGAIN). Note that an RPC deferred in - * svc_process will still return the credit, this is because the data - * is copied and no longer consume a WQE/WC. - */ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) { - int err; - struct svcxprt_rdma *rdma = - container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - if (rqstp->rq_xprt_ctxt) { - BUG_ON(rqstp->rq_xprt_ctxt != rdma); - err = svc_rdma_post_recv(rdma); - if (err) - dprintk("svcrdma: failed to post an RQ WQE error=%d\n", - err); - } - rqstp->rq_xprt_ctxt = NULL; } /* - * When connected, an svc_xprt has at least three references: - * - * - A reference held by the QP. We still hold that here because this - * code deletes the QP and puts the reference. + * When connected, an svc_xprt has at least two references: * * - A reference held by the cm_id between the ESTABLISHED and * DISCONNECTED events. If the remote peer disconnected first, this @@ -946,7 +960,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) * - A reference held by the svc_recv code that called this function * as part of close processing. * - * At a minimum two references should still be held. + * At a minimum one references should still be held. */ static void svc_rdma_detach(struct svc_xprt *xprt) { @@ -956,23 +970,53 @@ static void svc_rdma_detach(struct svc_xprt *xprt) /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); - - /* Destroy the QP if present (not a listener) */ - if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) { - ib_destroy_qp(rdma->sc_qp); - svc_xprt_put(xprt); - } - - /* Destroy the CM ID */ - rdma_destroy_id(rdma->sc_cm_id); } -static void svc_rdma_free(struct svc_xprt *xprt) +static void __svc_rdma_free(struct work_struct *work) { - struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; + struct svcxprt_rdma *rdma = + container_of(work, struct svcxprt_rdma, sc_work); dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + /* We should only be called from kref_put */ - BUG_ON(atomic_read(&xprt->xpt_ref.refcount) != 0); + BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + + /* + * Destroy queued, but not processed read completions. Note + * that this cleanup has to be done before destroying the + * cm_id because the device ptr is needed to unmap the dma in + * svc_rdma_put_context. + */ + spin_lock_bh(&rdma->sc_read_complete_lock); + while (!list_empty(&rdma->sc_read_complete_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_read_complete_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + spin_unlock_bh(&rdma->sc_read_complete_lock); + + /* Destroy queued, but not processed recv completions */ + spin_lock_bh(&rdma->sc_rq_dto_lock); + while (!list_empty(&rdma->sc_rq_dto_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + spin_unlock_bh(&rdma->sc_rq_dto_lock); + + /* Warn if we leaked a resource or under-referenced */ + WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); + + /* Destroy the QP if present (not a listener) */ + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_destroy_qp(rdma->sc_qp); + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) ib_destroy_cq(rdma->sc_sq_cq); @@ -985,10 +1029,21 @@ static void svc_rdma_free(struct svc_xprt *xprt) if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); - destroy_context_cache(rdma->sc_ctxt_head); + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + + destroy_context_cache(rdma); kfree(rdma); } +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); + schedule_work(&rdma->sc_work); +} + static int svc_rdma_has_wspace(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = @@ -1018,7 +1073,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) int ret; if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) - return 0; + return -ENOTCONN; BUG_ON(wr->send_flags != IB_SEND_SIGNALED); BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != @@ -1029,7 +1084,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { spin_unlock_bh(&xprt->sc_lock); atomic_inc(&rdma_stat_sq_starve); - /* See if we can reap some SQ WR */ + + /* See if we can opportunistically reap SQ WR to make room */ sq_cq_reap(xprt); /* Wait until SQ WR available if SQ still full */ @@ -1041,22 +1097,25 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) continue; } /* Bumped used SQ WR count and post */ + svc_xprt_get(&xprt->sc_xprt); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); if (!ret) atomic_inc(&xprt->sc_sq_count); - else + else { + svc_xprt_put(&xprt->sc_xprt); dprintk("svcrdma: failed to post SQ WR rc=%d, " "sc_sq_count=%d, sc_sq_depth=%d\n", ret, atomic_read(&xprt->sc_sq_count), xprt->sc_sq_depth); + } spin_unlock_bh(&xprt->sc_lock); break; } return ret; } -int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err) +void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err) { struct ib_send_wr err_wr; struct ib_sge sge; @@ -1094,9 +1153,8 @@ int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, /* Post It */ ret = svc_rdma_send(xprt, &err_wr); if (ret) { - dprintk("svcrdma: Error posting send = %d\n", ret); + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); svc_rdma_put_context(ctxt, 1); } - - return ret; } |