From 3b56113016400a4e2b9870c368ebb8080cb5739b Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 25 May 2016 22:02:07 +0300 Subject: IB/ipoib: Support SendOnlyFullMember MCG for SendOnly join Check (via an SA query) if the SM supports the new option for SendOnly multicast joins. If the SM supports that option it will use the new join state to create such multicast group. If SendOnlyFullMember is supported, we wouldn't use faked FullMember state join for SendOnly MCG, use the correct state if supported. This check is performed at every invocation of mcast_restart task, to be sure that the driver stays in sync with the current state of the SM. Signed-off-by: Erez Shitrit Reviewed-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 + drivers/infiniband/ulp/ipoib/ipoib_main.c | 73 ++++++++++++++++++++++++++ drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 38 +++++++++----- 3 files changed, 100 insertions(+), 13 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index caec8e9c4666..c51f618f6120 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -392,6 +392,7 @@ struct ipoib_dev_priv { struct ipoib_ethtool_st ethtool; struct timer_list poll_timer; unsigned max_send_sge; + bool sm_fullmember_sendonly_support; }; struct ipoib_ah { @@ -476,6 +477,7 @@ void ipoib_reap_ah(struct work_struct *work); void ipoib_mark_paths_invalid(struct net_device *dev); void ipoib_flush_paths(struct net_device *dev); +int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv); struct ipoib_dev_priv *ipoib_intf_alloc(const char *format); int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 80807d6e5c4c..8bf1859ebed4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -117,6 +117,8 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); + priv->sm_fullmember_sendonly_support = false; + if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; @@ -629,6 +631,77 @@ void ipoib_mark_paths_invalid(struct net_device *dev) spin_unlock_irq(&priv->lock); } +struct classport_info_context { + struct ipoib_dev_priv *priv; + struct completion done; + struct ib_sa_query *sa_query; +}; + +static void classport_info_query_cb(int status, struct ib_class_port_info *rec, + void *context) +{ + struct classport_info_context *cb_ctx = context; + struct ipoib_dev_priv *priv; + + WARN_ON(!context); + + priv = cb_ctx->priv; + + if (status || !rec) { + pr_debug("device: %s failed query classport_info status: %d\n", + priv->dev->name, status); + /* keeps the default, will try next mcast_restart */ + priv->sm_fullmember_sendonly_support = false; + goto out; + } + + if (ib_get_cpi_capmask2(rec) & + IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) { + pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n", + priv->dev->name); + priv->sm_fullmember_sendonly_support = true; + } else { + pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n", + priv->dev->name); + priv->sm_fullmember_sendonly_support = false; + } + +out: + complete(&cb_ctx->done); +} + +int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv) +{ + struct classport_info_context *callback_context; + int ret; + + callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL); + if (!callback_context) + return -ENOMEM; + + callback_context->priv = priv; + init_completion(&callback_context->done); + + ret = ib_sa_classport_info_rec_query(&ipoib_sa_client, + priv->ca, priv->port, 3000, + GFP_KERNEL, + classport_info_query_cb, + callback_context, + &callback_context->sa_query); + if (ret < 0) { + pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n", + priv->dev->name, ret); + kfree(callback_context); + return ret; + } + + /* waiting for the callback to finish before returnning */ + wait_for_completion(&callback_context->done); + kfree(callback_context); + + return ret; +} + void ipoib_flush_paths(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 25889311b1e9..fc3e50e8e391 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -64,6 +64,9 @@ struct ipoib_mcast_iter { unsigned int send_only; }; +/* join state that allows creating mcg with sendonly member request */ +#define SENDONLY_FULLMEMBER_JOIN 8 + /* * This should be called with the priv->lock held */ @@ -326,12 +329,23 @@ void ipoib_mcast_carrier_on_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, carrier_on_task); struct ib_port_attr attr; + int ret; if (ib_query_port(priv->ca, priv->port, &attr) || attr.state != IB_PORT_ACTIVE) { ipoib_dbg(priv, "Keeping carrier off until IB port is active\n"); return; } + /* + * Check if can send sendonly MCG's with sendonly-fullmember join state. + * It done here after the successfully join to the broadcast group, + * because the broadcast group must always be joined first and is always + * re-joined if the SM changes substantially. + */ + ret = ipoib_check_sm_sendonly_fullmember_support(priv); + if (ret < 0) + pr_debug("%s failed query sm support for sendonly-fullmember (ret: %d)\n", + priv->dev->name, ret); /* * Take rtnl_lock to avoid racing with ipoib_stop() and @@ -515,22 +529,20 @@ static int ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast) rec.hop_limit = priv->broadcast->mcmember.hop_limit; /* - * Send-only IB Multicast joins do not work at the core - * IB layer yet, so we can't use them here. However, - * we are emulating an Ethernet multicast send, which - * does not require a multicast subscription and will - * still send properly. The most appropriate thing to + * Send-only IB Multicast joins work at the core IB layer but + * require specific SM support. + * We can use such joins here only if the current SM supports that feature. + * However, if not, we emulate an Ethernet multicast send, + * which does not require a multicast subscription and will + * still send properly. The most appropriate thing to * do is to create the group if it doesn't exist as that * most closely emulates the behavior, from a user space - * application perspecitive, of Ethernet multicast - * operation. For now, we do a full join, maybe later - * when the core IB layers support send only joins we - * will use them. + * application perspective, of Ethernet multicast operation. */ -#if 0 - if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) - rec.join_state = 4; -#endif + if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) && + priv->sm_fullmember_sendonly_support) + /* SM supports sendonly-fullmember, otherwise fallback to full-member */ + rec.join_state = SENDONLY_FULLMEMBER_JOIN; } spin_unlock_irq(&priv->lock); -- cgit v1.2.3 From 492a7e67ff83fc59adb768de25ccaecd33d46beb Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 18 May 2016 16:42:43 +0300 Subject: IB/IPoIB: Allow setting the device address In IB networks, and specifically in IPoIB/rdmacm traffic, the device address of an IPoIB interface is used as a means to exchange information between nodes needed for communication. Currently an IPoIB interface will always be created with a device address based on its node GUID without a way to change that. This change adds the ability to set the device address of an IPoIB interface by value. We use the set mac address ndo to do that. The flow should be broken down to two: 1) The GID value is already in the GID table, in this case the interface will be able to set carrier up. 2) The GID value is not yet in the GID table, in this case the interface won't try to join the multicast group and will wait (listen on GID_CHANGE event) until the GID is inserted. In order to track those changes, we add a new flag: * IPOIB_FLAG_DEV_ADDR_SET. When set, it means the dev_addr is a based on a value in the gid table. this bit will be cleared upon a dev_addr change triggered by the user and set after validation. Per IB spec the port GUID can't change if the module is loaded. port GUID is the basis for GID at index 0 which is the basis for the default device address of a ipoib interface. The issue is that there are devices that don't follow the spec, they change the port GUID while HCA is powered on, so in order not to break userspace applications. We need to check if the user wanted to control the device address and we assume that if he sets the device address back to be based on GID index 0, he no longer wishs to control it. In order to track this, we add an additional flag: * IPOIB_FLAG_DEV_ADDR_CTRL When setting the device address, there is no validation of the upper twelve bytes of the device address (flags, qpn, subnet prefix) as those bytes are not under the control of the user. Signed-off-by: Mark Bloch Reviewed-by: Leon Romanovsky Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 + drivers/infiniband/ulp/ipoib/ipoib_ib.c | 109 ++++++++++++++++++++++++- drivers/infiniband/ulp/ipoib/ipoib_main.c | 67 +++++++++++++++ drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 10 ++- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 3 + drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 2 + 6 files changed, 187 insertions(+), 6 deletions(-) (limited to 'drivers/infiniband/ulp/ipoib') diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index c51f618f6120..bab7db6fa9ab 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -92,6 +92,8 @@ enum { IPOIB_FLAG_UMCAST = 10, IPOIB_STOP_NEIGH_GC = 11, IPOIB_NEIGH_TBL_FLUSH = 12, + IPOIB_FLAG_DEV_ADDR_SET = 13, + IPOIB_FLAG_DEV_ADDR_CTRL = 14, IPOIB_MAX_BACKOFF_SECONDS = 16, diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index da5f28c892ca..7e9a77040a24 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -997,6 +997,106 @@ static inline int update_child_pkey(struct ipoib_dev_priv *priv) return 0; } +/* + * returns true if the device address of the ipoib interface has changed and the + * new address is a valid one (i.e in the gid table), return false otherwise. + */ +static bool ipoib_dev_addr_changed_valid(struct ipoib_dev_priv *priv) +{ + union ib_gid search_gid; + union ib_gid gid0; + union ib_gid *netdev_gid; + int err; + u16 index; + u8 port; + bool ret = false; + + netdev_gid = (union ib_gid *)(priv->dev->dev_addr + 4); + if (ib_query_gid(priv->ca, priv->port, 0, &gid0, NULL)) + return false; + + netif_addr_lock(priv->dev); + + /* The subnet prefix may have changed, update it now so we won't have + * to do it later + */ + priv->local_gid.global.subnet_prefix = gid0.global.subnet_prefix; + netdev_gid->global.subnet_prefix = gid0.global.subnet_prefix; + search_gid.global.subnet_prefix = gid0.global.subnet_prefix; + + search_gid.global.interface_id = priv->local_gid.global.interface_id; + + netif_addr_unlock(priv->dev); + + err = ib_find_gid(priv->ca, &search_gid, IB_GID_TYPE_IB, + priv->dev, &port, &index); + + netif_addr_lock(priv->dev); + + if (search_gid.global.interface_id != + priv->local_gid.global.interface_id) + /* There was a change while we were looking up the gid, bail + * here and let the next work sort this out + */ + goto out; + + /* The next section of code needs some background: + * Per IB spec the port GUID can't change if the HCA is powered on. + * port GUID is the basis for GID at index 0 which is the basis for + * the default device address of a ipoib interface. + * + * so it seems the flow should be: + * if user_changed_dev_addr && gid in gid tbl + * set bit dev_addr_set + * return true + * else + * return false + * + * The issue is that there are devices that don't follow the spec, + * they change the port GUID when the HCA is powered, so in order + * not to break userspace applications, We need to check if the + * user wanted to control the device address and we assume that + * if he sets the device address back to be based on GID index 0, + * he no longer wishs to control it. + * + * If the user doesn't control the the device address, + * IPOIB_FLAG_DEV_ADDR_SET is set and ib_find_gid failed it means + * the port GUID has changed and GID at index 0 has changed + * so we need to change priv->local_gid and priv->dev->dev_addr + * to reflect the new GID. + */ + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + if (!err && port == priv->port) { + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + if (index == 0) + clear_bit(IPOIB_FLAG_DEV_ADDR_CTRL, + &priv->flags); + else + set_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags); + ret = true; + } else { + ret = false; + } + } else { + if (!err && port == priv->port) { + ret = true; + } else { + if (!test_bit(IPOIB_FLAG_DEV_ADDR_CTRL, &priv->flags)) { + memcpy(&priv->local_gid, &gid0, + sizeof(priv->local_gid)); + memcpy(priv->dev->dev_addr + 4, &gid0, + sizeof(priv->local_gid)); + ret = true; + } + } + } + +out: + netif_addr_unlock(priv->dev); + + return ret; +} + static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level, int nesting) @@ -1018,6 +1118,9 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && level != IPOIB_FLUSH_HEAVY) { + /* Make sure the dev_addr is set even if not flushing */ + if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } @@ -1029,7 +1132,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, update_parent_pkey(priv); else update_child_pkey(priv); - } + } else if (level == IPOIB_FLUSH_LIGHT) + ipoib_dev_addr_changed_valid(priv); ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; } @@ -1081,7 +1185,8 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { if (level >= IPOIB_FLUSH_NORMAL) ipoib_ib_dev_up(dev); - ipoib_mcast_restart_task(&priv->restart_task); + if (ipoib_dev_addr_changed_valid(priv)) + ipoib_mcast_restart_task(&priv->restart_task); } } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8bf1859ebed4..2c3fb5337bc1 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -99,6 +99,7 @@ static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u8 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); +static int ipoib_set_mac(struct net_device *dev, void *addr); static struct ib_client ipoib_client = { .name = "ipoib", @@ -1722,6 +1723,7 @@ static const struct net_device_ops ipoib_netdev_ops_pf = { .ndo_get_vf_config = ipoib_get_vf_config, .ndo_get_vf_stats = ipoib_get_vf_stats, .ndo_set_vf_guid = ipoib_set_vf_guid, + .ndo_set_mac_address = ipoib_set_mac, }; static const struct net_device_ops ipoib_netdev_ops_vf = { @@ -1844,6 +1846,70 @@ int ipoib_add_umcast_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_umcast); } +static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) +{ + struct ipoib_dev_priv *child_priv; + struct net_device *netdev = priv->dev; + + netif_addr_lock(netdev); + + memcpy(&priv->local_gid.global.interface_id, + &gid->global.interface_id, + sizeof(gid->global.interface_id)); + memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid)); + clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); + + netif_addr_unlock(netdev); + + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { + down_read(&priv->vlan_rwsem); + list_for_each_entry(child_priv, &priv->child_intfs, list) + set_base_guid(child_priv, gid); + up_read(&priv->vlan_rwsem); + } +} + +static int ipoib_check_lladdr(struct net_device *dev, + struct sockaddr_storage *ss) +{ + union ib_gid *gid = (union ib_gid *)(ss->__data + 4); + int ret = 0; + + netif_addr_lock(dev); + + /* Make sure the QPN, reserved and subnet prefix match the current + * lladdr, it also makes sure the lladdr is unicast. + */ + if (memcmp(dev->dev_addr, ss->__data, + 4 + sizeof(gid->global.subnet_prefix)) || + gid->global.interface_id == 0) + ret = -EINVAL; + + netif_addr_unlock(dev); + + return ret; +} + +static int ipoib_set_mac(struct net_device *dev, void *addr) +{ + struct ipoib_dev_priv *priv = netdev_priv(dev); + struct sockaddr_storage *ss = addr; + int ret; + + if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) + return -EBUSY; + + ret = ipoib_check_lladdr(dev, ss); + if (ret) + return ret; + + set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); + + queue_work(ipoib_workqueue, &priv->flush_light); + + return 0; +} + static ssize_t create_child(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -1967,6 +2033,7 @@ static struct net_device *ipoib_add_port(const char *format, goto device_init_failed; } else memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); result = ipoib_dev_init(priv->dev, hca, port); if (result < 0) { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index fc3e50e8e391..82fbc9442608 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -582,11 +582,13 @@ void ipoib_mcast_join_task(struct work_struct *work) return; } priv->local_lid = port_attr.lid; + netif_addr_lock(dev); - if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid, NULL)) - ipoib_warn(priv, "ib_query_gid() failed\n"); - else - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + if (!test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + netif_addr_unlock(dev); + return; + } + netif_addr_unlock(dev); spin_lock_irq(&priv->lock); if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index b809c373e40e..1e7cbbaa15bd 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -307,5 +307,8 @@ void ipoib_event(struct ib_event_handler *handler, queue_work(ipoib_workqueue, &priv->flush_normal); } else if (record->event == IB_EVENT_PKEY_CHANGE) { queue_work(ipoib_workqueue, &priv->flush_heavy); + } else if (record->event == IB_EVENT_GID_CHANGE && + !test_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags)) { + queue_work(ipoib_workqueue, &priv->flush_light); } } diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index fca1a882de27..64a35595eab8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -68,6 +68,8 @@ int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv, priv->pkey = pkey; memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN); + memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); + set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); priv->dev->broadcast[8] = pkey >> 8; priv->dev->broadcast[9] = pkey & 0xff; -- cgit v1.2.3