diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-26 16:38:19 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-26 16:38:19 +0200 |
commit | da19a102ce87bf3e0a7fe277a659d1fc35330d6d (patch) | |
tree | a6c1d40ef544e812b31f4b5f497c20d449d45ec3 | |
parent | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc (diff) | |
parent | IB/mlx5: Add support for extended atomic operations (diff) | |
download | linux-da19a102ce87bf3e0a7fe277a659d1fc35330d6d.tar.xz linux-da19a102ce87bf3e0a7fe277a659d1fc35330d6d.zip |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"This has been a smaller cycle with many of the commits being smallish
code fixes and improvements across the drivers.
- Driver updates for bnxt_re, cxgb4, hfi1, hns, mlx5, nes, qedr, and
rxe
- Memory window support in hns
- mlx5 user API 'flow mutate/steering' allows accessing the full
packet mangling and matching machinery from user space
- Support inter-working with verbs API calls in the 'devx' mlx5 user
API, and provide options to use devx with less privilege
- Modernize the use of sysfs and the device interface to use attribute
groups and cdev properly for uverbs, and clean up some of the core
code's device list management
- More progress on net namespaces for RDMA devices
- Consolidate driver BAR mmapping support into core code helpers and
rework how RDMA holds pointers to mm_struct for get_user_pages
cases
- First pass to use 'dev_name' instead of ib_device->name
- Device renaming for RDMA devices"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (242 commits)
IB/mlx5: Add support for extended atomic operations
RDMA/core: Fix comment for hw stats init for port == 0
RDMA/core: Refactor ib_register_device() function
RDMA/core: Fix unwinding flow in case of error to register device
ib_srp: Remove WARN_ON in srp_terminate_io()
IB/mlx5: Allow scatter to CQE without global signaled WRs
IB/mlx5: Verify that driver supports user flags
IB/mlx5: Support scatter to CQE for DC transport type
RDMA/drivers: Use core provided API for registering device attributes
RDMA/core: Allow existing drivers to set one sysfs group per device
IB/rxe: Remove unnecessary enum values
RDMA/umad: Use kernel API to allocate umad indexes
RDMA/uverbs: Use kernel API to allocate uverbs indexes
RDMA/core: Increase total number of RDMA ports across all devices
IB/mlx4: Add port and TID to MAD debug print
IB/mlx4: Enable debug print of SMPs
RDMA/core: Rename ports_parent to ports_kobj
RDMA/core: Do not expose unsupported counters
IB/mlx4: Refer to the device kobject instead of ports_parent
RDMA/nldev: Allow IB device rename through RDMA netlink
...
204 files changed, 7618 insertions, 5190 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index e2e0fe553ad8..664a8f6a634f 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -91,6 +91,24 @@ Description: stacked (e.g: VLAN interfaces) but still have the same MAC address as their parent device. +What: /sys/class/net/<iface>/dev_port +Date: February 2014 +KernelVersion: 3.15 +Contact: netdev@vger.kernel.org +Description: + Indicates the port number of this network device, formatted + as a decimal value. Some NICs have multiple independent ports + on the same PCI bus, device and function. This attribute allows + userspace to distinguish the respective interfaces. + + Note: some device drivers started to use 'dev_id' for this + purpose since long before 3.15 and have not adopted the new + attribute ever since. To query the port number, some tools look + exclusively at 'dev_port', while others only consult 'dev_id'. + If a network device has multiple client adapter ports as + described in the previous paragraph and does not set this + attribute to its port number, it's a kernel bug. + What: /sys/class/net/<iface>/dormant Date: March 2006 KernelVersion: 2.6.17 diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index abb6660c099c..0a3ec7c726ec 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -26,6 +26,7 @@ config INFINIBAND_USER_MAD config INFINIBAND_USER_ACCESS tristate "InfiniBand userspace access (verbs and CM)" select ANON_INODES + depends on MMU ---help--- Userspace InfiniBand access support. 
This enables the kernel side of userspace verbs and the userspace diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 46b855a42884..0dce94e3c495 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -45,6 +45,7 @@ #include <net/addrconf.h> #include <net/ip6_route.h> #include <rdma/ib_addr.h> +#include <rdma/ib_sa.h> #include <rdma/ib.h> #include <rdma/rdma_netlink.h> #include <net/netlink.h> @@ -61,6 +62,7 @@ struct addr_req { struct rdma_dev_addr *addr, void *context); unsigned long timeout; struct delayed_work work; + bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */ int status; u32 seq; }; @@ -219,60 +221,75 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) } EXPORT_SYMBOL(rdma_addr_size_kss); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr) +/** + * rdma_copy_src_l2_addr - Copy netdevice source addresses + * @dev_addr: Destination address pointer where to copy the addresses + * @dev: Netdevice whose source addresses to copy + * + * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice. + * This includes unicast address, broadcast address, device type and + * interface index. 
+ */ +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev) { dev_addr->dev_type = dev->type; memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); - if (dst_dev_addr) - memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); dev_addr->bound_dev_if = dev->ifindex; } -EXPORT_SYMBOL(rdma_copy_addr); +EXPORT_SYMBOL(rdma_copy_src_l2_addr); -int rdma_translate_ip(const struct sockaddr *addr, - struct rdma_dev_addr *dev_addr) +static struct net_device * +rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) { - struct net_device *dev; + struct net_device *dev = NULL; + int ret = -EADDRNOTAVAIL; - if (dev_addr->bound_dev_if) { - dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); - if (!dev) - return -ENODEV; - rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); - return 0; - } - - switch (addr->sa_family) { + switch (src_in->sa_family) { case AF_INET: - dev = ip_dev_find(dev_addr->net, - ((const struct sockaddr_in *)addr)->sin_addr.s_addr); - - if (!dev) - return -EADDRNOTAVAIL; - - rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); + dev = __ip_dev_find(net, + ((const struct sockaddr_in *)src_in)->sin_addr.s_addr, + false); + if (dev) + ret = 0; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - rcu_read_lock(); - for_each_netdev_rcu(dev_addr->net, dev) { - if (ipv6_chk_addr(dev_addr->net, - &((const struct sockaddr_in6 *)addr)->sin6_addr, + for_each_netdev_rcu(net, dev) { + if (ipv6_chk_addr(net, + &((const struct sockaddr_in6 *)src_in)->sin6_addr, dev, 1)) { - rdma_copy_addr(dev_addr, dev, NULL); + ret = 0; break; } } - rcu_read_unlock(); break; #endif } - return 0; + return ret ? 
ERR_PTR(ret) : dev; +} + +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr) +{ + struct net_device *dev; + + if (dev_addr->bound_dev_if) { + dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); + if (!dev) + return -ENODEV; + rdma_copy_src_l2_addr(dev_addr, dev); + dev_put(dev); + return 0; + } + + rcu_read_lock(); + dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); + if (!IS_ERR(dev)) + rdma_copy_src_l2_addr(dev_addr, dev); + rcu_read_unlock(); + return PTR_ERR_OR_ZERO(dev); } EXPORT_SYMBOL(rdma_translate_ip); @@ -295,15 +312,12 @@ static void queue_req(struct addr_req *req) spin_unlock_bh(&lock); } -static int ib_nl_fetch_ha(const struct dst_entry *dst, - struct rdma_dev_addr *dev_addr, +static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; - /* We fill in what we can, the response will fill the rest */ - rdma_copy_addr(dev_addr, dst->dev, NULL); return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); } @@ -322,7 +336,7 @@ static int dst_fetch_ha(const struct dst_entry *dst, neigh_event_send(n, NULL); ret = -ENODATA; } else { - rdma_copy_addr(dev_addr, dst->dev, n->ha); + memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN); } neigh_release(n); @@ -356,18 +370,22 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, (const void *)&dst_in6->sin6_addr; sa_family_t family = dst_in->sa_family; - /* Gateway + ARPHRD_INFINIBAND -> IB router */ - if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) - return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family); + /* If we have a gateway in IB mode then it must be an IB network */ + if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) + return ib_nl_fetch_ha(dev_addr, daddr, seq, family); else return dst_fetch_ha(dst, dev_addr, daddr); } -static int 
addr4_resolve(struct sockaddr_in *src_in, - const struct sockaddr_in *dst_in, +static int addr4_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct rtable **prt) { + struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock; + const struct sockaddr_in *dst_in = + (const struct sockaddr_in *)dst_sock; + __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; struct rtable *rt; @@ -383,16 +401,8 @@ static int addr4_resolve(struct sockaddr_in *src_in, if (ret) return ret; - src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're - * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network - * type accordingly. - */ - if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND) - addr->network = RDMA_NETWORK_IPV4; - addr->hoplimit = ip4_dst_hoplimit(&rt->dst); *prt = rt; @@ -400,14 +410,16 @@ static int addr4_resolve(struct sockaddr_in *src_in, } #if IS_ENABLED(CONFIG_IPV6) -static int addr6_resolve(struct sockaddr_in6 *src_in, - const struct sockaddr_in6 *dst_in, +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { + struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock; + const struct sockaddr_in6 *dst_in = + (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; - struct rt6_info *rt; int ret; memset(&fl6, 0, sizeof fl6); @@ -419,19 +431,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, if (ret < 0) return ret; - rt = (struct rt6_info *)dst; - if (ipv6_addr_any(&src_in->sin6_addr)) { - src_in->sin6_family = AF_INET6; + if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; - } - - /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're - * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network - * type accordingly. 
- */ - if (rt->rt6i_flags & RTF_GATEWAY && - ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND) - addr->network = RDMA_NETWORK_IPV6; addr->hoplimit = ip6_dst_hoplimit(dst); @@ -439,8 +440,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, return 0; } #else -static int addr6_resolve(struct sockaddr_in6 *src_in, - const struct sockaddr_in6 *dst_in, +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { @@ -451,36 +452,110 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, static int addr_resolve_neigh(const struct dst_entry *dst, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, + unsigned int ndev_flags, u32 seq) { - if (dst->dev->flags & IFF_LOOPBACK) { - int ret; + int ret = 0; - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, - MAX_ADDR_LEN); + if (ndev_flags & IFF_LOOPBACK) { + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + } else { + if (!(ndev_flags & IFF_NOARP)) { + /* If the device doesn't do ARP internally */ + ret = fetch_ha(dst, addr, dst_in, seq); + } + } + return ret; +} - return ret; +static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct sockaddr *dst_in, + const struct dst_entry *dst, + const struct net_device *ndev) +{ + int ret = 0; + + if (dst->dev->flags & IFF_LOOPBACK) + ret = rdma_translate_ip(dst_in, dev_addr); + else + rdma_copy_src_l2_addr(dev_addr, dst->dev); + + /* + * If there's a gateway and type of device not ARPHRD_INFINIBAND, + * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the + * network type accordingly. + */ + if (has_gateway(dst, dst_in->sa_family) && + ndev->type != ARPHRD_INFINIBAND) + dev_addr->network = dst_in->sa_family == AF_INET ? 
+ RDMA_NETWORK_IPV4 : + RDMA_NETWORK_IPV6; + else + dev_addr->network = RDMA_NETWORK_IB; + + return ret; +} + +static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, + unsigned int *ndev_flags, + const struct sockaddr *dst_in, + const struct dst_entry *dst) +{ + struct net_device *ndev = READ_ONCE(dst->dev); + + *ndev_flags = ndev->flags; + /* A physical device must be the RDMA device to use */ + if (ndev->flags & IFF_LOOPBACK) { + /* + * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or + * loopback IP address. So if route is resolved to loopback + * interface, translate that to a real ndev based on non + * loopback IP address. + */ + ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); + if (IS_ERR(ndev)) + return -ENODEV; } - /* If the device doesn't do ARP internally */ - if (!(dst->dev->flags & IFF_NOARP)) - return fetch_ha(dst, addr, dst_in, seq); + return copy_src_l2_addr(dev_addr, dst_in, dst, ndev); +} + +static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) +{ + struct net_device *ndev; - rdma_copy_addr(addr, dst->dev, NULL); + ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr); + if (IS_ERR(ndev)) + return PTR_ERR(ndev); + /* + * Since we are holding the rcu, reading net and ifindex + * are safe without any additional reference; because + * change_net_namespace() in net/core/dev.c does rcu sync + * after it changes the state to IFF_DOWN and before + * updating netdev fields {net, ifindex}. 
+ */ + addr->net = dev_net(ndev); + addr->bound_dev_if = ndev->ifindex; return 0; } +static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr) +{ + addr->net = &init_net; + addr->bound_dev_if = 0; +} + static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, bool resolve_neigh, + bool resolve_by_gid_attr, u32 seq) { - struct net_device *ndev; - struct dst_entry *dst; + struct dst_entry *dst = NULL; + unsigned int ndev_flags = 0; + struct rtable *rt = NULL; int ret; if (!addr->net) { @@ -488,58 +563,55 @@ static int addr_resolve(struct sockaddr *src_in, return -EINVAL; } - if (src_in->sa_family == AF_INET) { - struct rtable *rt = NULL; - const struct sockaddr_in *dst_in4 = - (const struct sockaddr_in *)dst_in; - - ret = addr4_resolve((struct sockaddr_in *)src_in, - dst_in4, addr, &rt); - if (ret) - return ret; - - if (resolve_neigh) - ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); - - if (addr->bound_dev_if) { - ndev = dev_get_by_index(addr->net, addr->bound_dev_if); - } else { - ndev = rt->dst.dev; - dev_hold(ndev); + rcu_read_lock(); + if (resolve_by_gid_attr) { + if (!addr->sgid_attr) { + rcu_read_unlock(); + pr_warn_ratelimited("%s: missing gid_attr\n", __func__); + return -EINVAL; } - - ip_rt_put(rt); - } else { - const struct sockaddr_in6 *dst_in6 = - (const struct sockaddr_in6 *)dst_in; - - ret = addr6_resolve((struct sockaddr_in6 *)src_in, - dst_in6, addr, - &dst); - if (ret) + /* + * If the request is for a specific gid attribute of the + * rdma_dev_addr, derive net from the netdevice of the + * GID attribute. 
+ */ + ret = set_addr_netns_by_gid_rcu(addr); + if (ret) { + rcu_read_unlock(); return ret; - - if (resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, seq); - - if (addr->bound_dev_if) { - ndev = dev_get_by_index(addr->net, addr->bound_dev_if); - } else { - ndev = dst->dev; - dev_hold(ndev); } - - dst_release(dst); } - - if (ndev) { - if (ndev->flags & IFF_LOOPBACK) - ret = rdma_translate_ip(dst_in, addr); - else - addr->bound_dev_if = ndev->ifindex; - dev_put(ndev); + if (src_in->sa_family == AF_INET) { + ret = addr4_resolve(src_in, dst_in, addr, &rt); + dst = &rt->dst; + } else { + ret = addr6_resolve(src_in, dst_in, addr, &dst); } + if (ret) { + rcu_read_unlock(); + goto done; + } + ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); + rcu_read_unlock(); + + /* + * Resolve neighbor destination address if requested and + * only if src addr translation didn't fail. + */ + if (!ret && resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); + if (src_in->sa_family == AF_INET) + ip_rt_put(rt); + else + dst_release(dst); +done: + /* + * Clear the addr net to go back to its original state, only if it was + * derived from GID attribute in this context. 
+ */ + if (resolve_by_gid_attr) + rdma_addr_set_net_defaults(addr); return ret; } @@ -554,7 +626,8 @@ static void process_one_req(struct work_struct *_work) src_in = (struct sockaddr *)&req->src_addr; dst_in = (struct sockaddr *)&req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, - true, req->seq); + true, req->resolve_by_gid_attr, + req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) { req->status = -ETIMEDOUT; } else if (req->status == -ENODATA) { @@ -586,10 +659,10 @@ static void process_one_req(struct work_struct *_work) } int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - void *context) + bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; struct addr_req *req; @@ -617,10 +690,12 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, req->addr = addr; req->callback = callback; req->context = context; + req->resolve_by_gid_attr = resolve_by_gid_attr; INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); - req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); + req->status = addr_resolve(src_in, dst_in, addr, true, + req->resolve_by_gid_attr, req->seq); switch (req->status) { case 0: req->timeout = jiffies; @@ -641,25 +716,53 @@ err: } EXPORT_SYMBOL(rdma_resolve_ip); -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr) +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr) { - struct sockaddr_storage ssrc_addr = {}; - struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 
_sockaddr_in6; + } sgid, dgid; + struct rdma_dev_addr dev_addr = {}; + int ret; - if (src_addr) { - if (src_addr->sa_family != dst_addr->sa_family) - return -EINVAL; + if (rec->roce.route_resolved) + return 0; - memcpy(src_in, src_addr, rdma_addr_size(src_addr)); - } else { - src_in->sa_family = dst_addr->sa_family; - } + rdma_gid2ip(&sgid._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid._sockaddr, &rec->dgid); + + if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family) + return -EINVAL; + + if (!attr || !attr->ndev) + return -EINVAL; + + dev_addr.net = &init_net; + dev_addr.sgid_attr = attr; + + ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, + &dev_addr, false, true, 0); + if (ret) + return ret; + + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) + return -EINVAL; - return addr_resolve(src_in, dst_addr, addr, false, 0); + rec->roce.route_resolved = true; + return 0; } +/** + * rdma_addr_cancel - Cancel resolve ip request + * @addr: Pointer to address structure given previously + * during rdma_resolve_ip(). + * rdma_addr_cancel() is synchronous function which cancels any pending + * request if there is any. + */ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; @@ -687,11 +790,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) * guarentees no work is running and none will be started. 
*/ cancel_delayed_work_sync(&found->work); - - if (found->callback) - found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr, - found->addr, found->context); - kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); @@ -710,7 +808,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit) { struct rdma_dev_addr dev_addr; @@ -726,12 +824,12 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; + dev_addr.sgid_attr = sgid_attr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); + &dev_addr, 1000, resolve_cb, true, &ctx); if (ret) return ret; diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 3208ad6ad540..5b2fce4a7091 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -212,9 +212,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry) u8 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - device->name, port_num, entry->attr.index, - entry->attr.gid.raw); + dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__, + port_num, entry->attr.index, entry->attr.gid.raw); if (rdma_cap_roce_gid_table(device, port_num) && entry->state != GID_TABLE_ENTRY_INVALID) @@ -289,9 +288,9 @@ static void store_gid_entry(struct ib_gid_table *table, { entry->state = GID_TABLE_ENTRY_VALID; - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - entry->attr.device->name, entry->attr.port_num, - entry->attr.index, entry->attr.gid.raw); + 
dev_dbg(&entry->attr.device->dev, "%s port=%d index=%d gid %pI6\n", + __func__, entry->attr.port_num, entry->attr.index, + entry->attr.gid.raw); lockdep_assert_held(&table->lock); write_lock_irq(&table->rwlock); @@ -320,17 +319,16 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) int ret; if (!attr->ndev) { - pr_err("%s NULL netdev device=%s port=%d index=%d\n", - __func__, attr->device->name, attr->port_num, - attr->index); + dev_err(&attr->device->dev, "%s NULL netdev port=%d index=%d\n", + __func__, attr->port_num, attr->index); return -EINVAL; } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { ret = attr->device->add_gid(attr, &entry->context); if (ret) { - pr_err("%s GID add failed device=%s port=%d index=%d\n", - __func__, attr->device->name, attr->port_num, - attr->index); + dev_err(&attr->device->dev, + "%s GID add failed port=%d index=%d\n", + __func__, attr->port_num, attr->index); return ret; } } @@ -353,9 +351,8 @@ static void del_gid(struct ib_device *ib_dev, u8 port, lockdep_assert_held(&table->lock); - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - ib_dev->name, port, ix, - table->data_vec[ix]->attr.gid.raw); + dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port, + ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); entry = table->data_vec[ix]; @@ -782,9 +779,9 @@ static void release_gid_table(struct ib_device *device, u8 port, if (is_gid_entry_free(table->data_vec[i])) continue; if (kref_read(&table->data_vec[i]->kref) > 1) { - pr_err("GID entry ref leak for %s (index %d) ref=%d\n", - device->name, i, - kref_read(&table->data_vec[i]->kref)); + dev_err(&device->dev, + "GID entry ref leak for index %d ref=%d\n", i, + kref_read(&table->data_vec[i]->kref)); leak = true; } } @@ -1252,6 +1249,39 @@ void rdma_hold_gid_attr(const struct ib_gid_attr *attr) } EXPORT_SYMBOL(rdma_hold_gid_attr); +/** + * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice + * which must be in 
UP state. + * + * @attr:Pointer to the GID attribute + * + * Returns pointer to netdevice if the netdevice was attached to GID and + * netdevice is in UP state. Caller must hold RCU lock as this API + * reads the netdev flags which can change while netdevice migrates to + * different net namespace. Returns ERR_PTR with error code otherwise. + * + */ +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + struct ib_device *device = entry->attr.device; + struct net_device *ndev = ERR_PTR(-ENODEV); + u8 port_num = entry->attr.port_num; + struct ib_gid_table *table; + unsigned long flags; + bool valid; + + table = rdma_gid_table(device, port_num); + + read_lock_irqsave(&table->rwlock, flags); + valid = is_gid_entry_valid(table->data_vec[attr->index]); + if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP)) + ndev = attr->ndev; + read_unlock_irqrestore(&table->rwlock, flags); + return ndev; +} + static int config_non_roce_gid_cache(struct ib_device *device, u8 port, int gid_tbl_len) { @@ -1270,8 +1300,9 @@ static int config_non_roce_gid_cache(struct ib_device *device, continue; ret = device->query_gid(device, port, i, &gid_attr.gid); if (ret) { - pr_warn("query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); + dev_warn(&device->dev, + "query_gid failed (%d) for index %d\n", ret, + i); goto err; } gid_attr.index = i; @@ -1300,8 +1331,7 @@ static void ib_cache_update(struct ib_device *device, ret = ib_query_port(device, port, tprops); if (ret) { - pr_warn("ib_query_port failed (%d) for %s\n", - ret, device->name); + dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); goto err; } @@ -1323,8 +1353,9 @@ static void ib_cache_update(struct ib_device *device, for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { - pr_warn("ib_query_pkey failed (%d) for %s 
(index %d)\n", - ret, device->name, i); + dev_warn(&device->dev, + "ib_query_pkey failed (%d) for index %d\n", + ret, i); goto err; } } diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 6e39c27dca8e..edb2cb758be7 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3292,8 +3292,11 @@ static int cm_lap_handler(struct cm_work *work) if (ret) goto unlock; - cm_init_av_by_path(param->alternate_path, NULL, &cm_id_priv->alt_av, - cm_id_priv); + ret = cm_init_av_by_path(param->alternate_path, NULL, + &cm_id_priv->alt_av, cm_id_priv); + if (ret) + goto unlock; + cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; ret = atomic_inc_and_test(&cm_id_priv->work_count); @@ -4367,7 +4370,7 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, - "%s", ib_device->name); + "%s", dev_name(&ib_device->dev)); if (IS_ERR(cm_dev->device)) { kfree(cm_dev); return; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a36c94930c31..15d5bb7bf6bb 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -639,13 +639,21 @@ static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } -static int cma_acquire_dev(struct rdma_id_private *id_priv, - const struct rdma_id_private *listen_id_priv) +/** + * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute + * based on source ip address. + * @id_priv: cm_id which should be bound to cma device + * + * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute + * based on source IP address. It returns 0 on success or error code otherwise. + * It is applicable to active and passive side cm_id. 
+ */ +static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; - struct cma_device *cma_dev; union ib_gid gid, iboe_gid, *gidp; + struct cma_device *cma_dev; enum ib_gid_type gid_type; int ret = -ENODEV; u8 port; @@ -654,41 +662,125 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; - mutex_lock(&lock); rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, &iboe_gid); memcpy(&gid, dev_addr->src_dev_addr + - rdma_addr_gid_offset(dev_addr), sizeof gid); - - if (listen_id_priv) { - cma_dev = listen_id_priv->cma_dev; - port = listen_id_priv->id.port_num; - gidp = rdma_protocol_roce(cma_dev->device, port) ? - &iboe_gid : &gid; - gid_type = listen_id_priv->gid_type; - sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); - if (!IS_ERR(sgid_attr)) { - id_priv->id.port_num = port; - cma_bind_sgid_attr(id_priv, sgid_attr); - ret = 0; - goto out; + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) { + for (port = rdma_start_port(cma_dev->device); + port <= rdma_end_port(cma_dev->device); port++) { + gidp = rdma_protocol_roce(cma_dev->device, port) ? 
+ &iboe_gid : &gid; + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + cma_attach_to_dev(id_priv, cma_dev); + ret = 0; + goto out; + } } } +out: + mutex_unlock(&lock); + return ret; +} + +/** + * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute + * @id_priv: cm id to bind to cma device + * @listen_id_priv: listener cm id to match against + * @req: Pointer to req structure containaining incoming + * request information + * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when + * rdma device matches for listen_id and incoming request. It also verifies + * that a GID table entry is present for the source address. + * Returns 0 on success, or returns error code otherwise. + */ +static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv, + struct cma_req_info *req) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + enum ib_gid_type gid_type; + union ib_gid gid; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + if (rdma_protocol_roce(req->device, req->port)) + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &gid); + else + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; + sgid_attr = cma_validate_port(req->device, req->port, + gid_type, &gid, id_priv); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + id_priv->id.port_num = req->port; + cma_bind_sgid_attr(id_priv, sgid_attr); + /* Need to acquire lock to protect against reader + * of cma_dev->id_list such as cma_netdev_callback() and + * cma_process_remove(). 
+ */ + mutex_lock(&lock); + cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); + mutex_unlock(&lock); + return 0; +} + +static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + struct cma_device *cma_dev; + enum ib_gid_type gid_type; + int ret = -ENODEV; + union ib_gid gid; + u8 port; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + gid_type = listen_id_priv->gid_type; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, &gid, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; + goto out; + } list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { - if (listen_id_priv && - listen_id_priv->cma_dev == cma_dev && + if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; - gidp = rdma_protocol_roce(cma_dev->device, port) ? 
- &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); + gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); @@ -785,10 +877,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; @@ -1462,18 +1551,35 @@ static bool cma_protocol_roce(const struct rdma_cm_id *id) return rdma_protocol_roce(device, port_num); } +static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) +{ + const struct sockaddr *daddr = + (const struct sockaddr *)&req->listen_addr_storage; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + + /* Returns true if the req is for IPv6 link local */ + return (daddr->sa_family == AF_INET6 && + (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); +} + static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, - u8 port_num) + const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ - return (!id->port_num || id->port_num == port_num) && + return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); /* + * If the request is not for IPv6 link local, allow matching + * request to any netdevice of the one or multiport rdma device. + */ + if (!cma_is_req_ipv6_ll(req)) + return true; + /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well. 
*/ @@ -1500,13 +1606,14 @@ static struct rdma_id_private *cma_find_listener( hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && - cma_match_net_dev(&id_priv->id, net_dev, req->port)) + cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_list) { if (id_priv_dev->id.device == cm_id->device && - cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + cma_match_net_dev(&id_priv_dev->id, + net_dev, req)) return id_priv_dev; } } @@ -1518,18 +1625,18 @@ static struct rdma_id_private *cma_find_listener( static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, + struct cma_req_info *req, struct net_device **net_dev) { - struct cma_req_info req; struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; - err = cma_save_req_info(ib_event, &req); + err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); - *net_dev = cma_get_net_dev(ib_event, &req); + *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ @@ -1567,17 +1674,17 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } if (!validate_net_dev(*net_dev, - (struct sockaddr *)&req.listen_addr_storage, - (struct sockaddr *)&req.src_addr_storage)) { + (struct sockaddr *)&req->listen_addr_storage, + (struct sockaddr *)&req->src_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? 
dev_net(*net_dev) : &init_net, - rdma_ps_from_service_id(req.service_id), - cma_port_from_service_id(req.service_id)); - id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); + rdma_ps_from_service_id(req->service_id), + cma_port_from_service_id(req->service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); if (IS_ERR(id_priv) && *net_dev) { @@ -1710,8 +1817,8 @@ void rdma_destroy_id(struct rdma_cm_id *id) mutex_lock(&id_priv->handler_mutex); mutex_unlock(&id_priv->handler_mutex); + rdma_restrack_del(&id_priv->res); if (id_priv->cma_dev) { - rdma_restrack_del(&id_priv->res); if (rdma_cap_ib_cm(id_priv->id.device, 1)) { if (id_priv->cm_id.ib) ib_destroy_cm_id(id_priv->cm_id.ib); @@ -1902,7 +2009,7 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { - rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { @@ -1952,7 +2059,7 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, goto err; if (net_dev) { - rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), @@ -1999,11 +2106,12 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; + struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; - listen_id = cma_ib_id_from_event(cm_id, ib_event, &net_dev); + listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); @@ -2036,7 +2144,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - 
ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) goto err2; @@ -2232,7 +2340,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; } - ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id); @@ -2354,8 +2462,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, ret = rdma_listen(id, id_priv->backlog); if (ret) - pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", - ret, cma_dev->device->name); + dev_warn(&cma_dev->device->dev, + "RDMA CMA: cma_listen_on_dev, error %d\n", ret); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -2402,8 +2510,8 @@ static void cma_query_handler(int status, struct sa_path_rec *path_rec, queue_work(cma_wq, &work->work); } -static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, - struct cma_work *work) +static int cma_query_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; @@ -2521,7 +2629,8 @@ static void cma_init_resolve_addr_work(struct cma_work *work, work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; } -static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; @@ -2643,7 +2752,7 @@ err: } EXPORT_SYMBOL(rdma_set_ib_path); -static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_iw_route(struct rdma_id_private *id_priv) { struct cma_work *work; @@ -2744,7 +2853,7 @@ err1: return ret; } -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +int rdma_resolve_route(struct rdma_cm_id *id, unsigned 
long timeout_ms) { struct rdma_id_private *id_priv; int ret; @@ -2759,7 +2868,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) else if (rdma_protocol_roce(id->device, id->port_num)) ret = cma_resolve_iboe_route(id_priv); else if (rdma_protocol_iwarp(id->device, id->port_num)) - ret = cma_resolve_iw_route(id_priv, timeout_ms); + ret = cma_resolve_iw_route(id_priv); else ret = -ENOSYS; @@ -2862,7 +2971,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { - status = cma_acquire_dev(id_priv, NULL); + status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n", status); @@ -2882,13 +2991,11 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) @@ -2966,7 +3073,7 @@ static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms) + const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; @@ -2985,16 +3092,16 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return -EINVAL; memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - atomic_inc(&id_priv->refcount); if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { - ret = rdma_resolve_ip(cma_src_addr(id_priv), - dst_addr, &id->route.addr.dev_addr, - 
timeout_ms, addr_handler, id_priv); + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); } } if (ret) @@ -3003,7 +3110,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); @@ -3414,7 +3520,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; - ret = cma_acquire_dev(id_priv, NULL); + ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } @@ -3439,10 +3545,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) return 0; err2: - if (id_priv->cma_dev) { - rdma_restrack_del(&id_priv->res); + rdma_restrack_del(&id_priv->res); + if (id_priv->cma_dev) cma_release_dev(id_priv); - } err1: cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE); return ret; @@ -3839,10 +3944,7 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, id_priv = container_of(id, struct rdma_id_private, id); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; @@ -4087,9 +4189,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, (!ib_sa_sendonly_fullmem_support(&sa_client, id_priv->id.device, id_priv->id.port_num))) { - pr_warn("RDMA CM: %s port %u Unable to multicast join\n" - "RDMA CM: SM doesn't support Send Only Full Member option\n", - id_priv->id.device->name, id_priv->id.port_num); + dev_warn( + &id_priv->id.device->dev, + "RDMA CM: port %u Unable to multicast join: SM doesn't support Send Only Full Member option\n", + id_priv->id.port_num); return -EOPNOTSUPP; } diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 
eee38b40be99..8c2dfb3e294e 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -65,7 +65,7 @@ static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) static bool filter_by_name(struct ib_device *ib_dev, void *cookie) { - return !strcmp(ib_dev->name, cookie); + return !strcmp(dev_name(&ib_dev->dev), cookie); } static int cma_configfs_params_get(struct config_item *item, diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 77c7005c396c..bb9007a0cca7 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -44,7 +44,7 @@ #include "mad_priv.h" /* Total number of ports combined across all struct ib_devices's */ -#define RDMA_MAX_PORTS 1024 +#define RDMA_MAX_PORTS 8192 struct pkey_index_qp_list { struct list_head pkey_index_list; @@ -87,6 +87,7 @@ int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); +int ib_device_rename(struct ib_device *ibdev, const char *name); typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); @@ -338,7 +339,14 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit); +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev); +struct sa_path_rec; +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr); + +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index af5ad6a56ae4..b1e5365ddafa 100644 --- a/drivers/infiniband/core/cq.c +++ 
b/drivers/infiniband/core/cq.c @@ -112,12 +112,12 @@ static void ib_cq_poll_work(struct work_struct *work) IB_POLL_BATCH); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } /** @@ -161,7 +161,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, goto out_destroy_cq; cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); switch (cq->poll_ctx) { @@ -175,9 +175,12 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cq->comp_handler = ib_cq_completion_workqueue; INIT_WORK(&cq->work, ib_cq_poll_work); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? 
+ ib_comp_wq : ib_comp_unbound_wq; break; default: ret = -EINVAL; @@ -213,6 +216,7 @@ void ib_free_cq(struct ib_cq *cq) irq_poll_disable(&cq->iop); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cancel_work_sync(&cq->work); break; default: diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index db3b6271f09d..87eb4f2cdd7d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -61,6 +61,7 @@ struct ib_client_data { }; struct workqueue_struct *ib_comp_wq; +struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -122,8 +123,9 @@ static int ib_device_check_mandatory(struct ib_device *device) for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { - pr_warn("Device %s is missing mandatory function %s\n", - device->name, mandatory_table[i].name); + dev_warn(&device->dev, + "Device is missing mandatory function %s\n", + mandatory_table[i].name); return -EINVAL; } } @@ -163,16 +165,40 @@ static struct ib_device *__ib_device_get_by_name(const char *name) struct ib_device *device; list_for_each_entry(device, &device_list, core_list) - if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; } -static int alloc_name(char *name) +int ib_device_rename(struct ib_device *ibdev, const char *name) +{ + struct ib_device *device; + int ret = 0; + + if (!strcmp(name, dev_name(&ibdev->dev))) + return ret; + + mutex_lock(&device_mutex); + list_for_each_entry(device, &device_list, core_list) { + if (!strcmp(name, dev_name(&device->dev))) { + ret = -EEXIST; + goto out; + } + } + + ret = device_rename(&ibdev->dev, name); + if (ret) + goto out; + strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); +out: + mutex_unlock(&device_mutex); + return ret; +} + +static int alloc_name(struct ib_device *ibdev, const char *name) { unsigned long 
*inuse; - char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; @@ -181,24 +207,21 @@ static int alloc_name(char *name) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { - if (!sscanf(device->name, name, &i)) + char buf[IB_DEVICE_NAME_MAX]; + + if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); - if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(buf, dev_name(&device->dev))) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); - snprintf(buf, sizeof buf, name, i); - if (__ib_device_get_by_name(buf)) - return -ENFILE; - - strlcpy(name, buf, IB_DEVICE_NAME_MAX); - return 0; + return dev_set_name(&ibdev->dev, name, i); } static void ib_device_release(struct device *device) @@ -221,9 +244,7 @@ static void ib_device_release(struct device *device) static int ib_device_uevent(struct device *device, struct kobj_uevent_env *env) { - struct ib_device *dev = container_of(device, struct ib_device, dev); - - if (add_uevent_var(env, "NAME=%s", dev->name)) + if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* @@ -269,7 +290,7 @@ struct ib_device *ib_alloc_device(size_t size) INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - spin_lock_init(&device->client_data_lock); + rwlock_init(&device->client_data_lock); INIT_LIST_HEAD(&device->client_data_list); INIT_LIST_HEAD(&device->port_list); @@ -285,6 +306,7 @@ EXPORT_SYMBOL(ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { + WARN_ON(!list_empty(&device->client_data_list)); WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); rdma_restrack_clean(&device->res); @@ -295,9 +317,8 @@ EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; - 
unsigned long flags; - context = kmalloc(sizeof *context, GFP_KERNEL); + context = kmalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; @@ -306,9 +327,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context->going_down = false; down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irq(&device->client_data_lock); list_add(&context->list, &device->client_data_list); - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); return 0; @@ -444,22 +465,8 @@ static u32 __dev_new_index(void) } } -/** - * ib_register_device - Register an IB device with IB core - * @device:Device to register - * - * Low-level drivers use ib_register_device() to register their - * devices with the IB core. All registered clients will receive a - * callback for each device that is added. @device must be allocated - * with ib_alloc_device(). - */ -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +static void setup_dma_device(struct ib_device *device) { - int ret; - struct ib_client *client; - struct ib_udata uhw = {.outlen = 0, .inlen = 0}; struct device *parent = device->dev.parent; WARN_ON_ONCE(device->dma_device); @@ -491,56 +498,113 @@ int ib_register_device(struct ib_device *device, WARN_ON_ONCE(!parent); device->dma_device = parent; } +} - mutex_lock(&device_mutex); +static void cleanup_device(struct ib_device *device) +{ + ib_cache_cleanup_one(device); + ib_cache_release_one(device); + kfree(device->port_pkey_list); + kfree(device->port_immutable); +} - if (strchr(device->name, '%')) { - ret = alloc_name(device->name); - if (ret) - goto out; - } +static int setup_device(struct ib_device *device) +{ + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; + int ret; - if (ib_device_check_mandatory(device)) { - ret = -EINVAL; - goto out; - } + ret = 
ib_device_check_mandatory(device); + if (ret) + return ret; ret = read_port_immutable(device); if (ret) { - pr_warn("Couldn't create per port immutable data %s\n", - device->name); - goto out; + dev_warn(&device->dev, + "Couldn't create per port immutable data\n"); + return ret; } - ret = setup_port_pkey_list(device); + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); if (ret) { - pr_warn("Couldn't create per port_pkey_list\n"); - goto out; + dev_warn(&device->dev, + "Couldn't query the device attributes\n"); + goto port_cleanup; } - ret = ib_cache_setup_one(device); + ret = setup_port_pkey_list(device); if (ret) { - pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); + dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); goto port_cleanup; } - ret = ib_device_register_rdmacg(device); + ret = ib_cache_setup_one(device); if (ret) { - pr_warn("Couldn't register device with rdma cgroup\n"); - goto cache_cleanup; + dev_warn(&device->dev, + "Couldn't set up InfiniBand P_Key/GID cache\n"); + goto pkey_cleanup; + } + return 0; + +pkey_cleanup: + kfree(device->port_pkey_list); +port_cleanup: + kfree(device->port_immutable); + return ret; +} + +/** + * ib_register_device - Register an IB device with IB core + * @device:Device to register + * + * Low-level drivers use ib_register_device() to register their + * devices with the IB core. All registered clients will receive a + * callback for each device that is added. @device must be allocated + * with ib_alloc_device(). 
+ */ +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)) +{ + int ret; + struct ib_client *client; + + setup_dma_device(device); + + mutex_lock(&device_mutex); + + if (strchr(name, '%')) { + ret = alloc_name(device, name); + if (ret) + goto out; + } else { + ret = dev_set_name(&device->dev, name); + if (ret) + goto out; + } + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); - memset(&device->attrs, 0, sizeof(device->attrs)); - ret = device->query_device(device, &device->attrs, &uhw); + ret = setup_device(device); + if (ret) + goto out; + + device->index = __dev_new_index(); + + ret = ib_device_register_rdmacg(device); if (ret) { - pr_warn("Couldn't query the device attributes\n"); - goto cg_cleanup; + dev_warn(&device->dev, + "Couldn't register device with rdma cgroup\n"); + goto dev_cleanup; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { - pr_warn("Couldn't register device %s with driver model\n", - device->name); + dev_warn(&device->dev, + "Couldn't register device with driver model\n"); goto cg_cleanup; } @@ -550,7 +614,6 @@ int ib_register_device(struct ib_device *device, if (!add_client_context(device, client) && client->add) client->add(device); - device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); @@ -559,11 +622,8 @@ int ib_register_device(struct ib_device *device, cg_cleanup: ib_device_unregister_rdmacg(device); -cache_cleanup: - ib_cache_cleanup_one(device); - ib_cache_release_one(device); -port_cleanup: - kfree(device->port_immutable); +dev_cleanup: + cleanup_device(device); out: mutex_unlock(&device_mutex); return ret; @@ -585,21 +645,20 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); - 
spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + write_lock_irq(&device->client_data_lock); + list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irq(&device->client_data_lock); downgrade_write(&lists_rwsem); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { + list_for_each_entry(context, &device->client_data_list, list) { if (context->client->remove) context->client->remove(device, context->data); } up_read(&lists_rwsem); - ib_device_unregister_rdmacg(device); ib_device_unregister_sysfs(device); + ib_device_unregister_rdmacg(device); mutex_unlock(&device_mutex); @@ -609,10 +668,13 @@ void ib_unregister_device(struct ib_device *device) kfree(device->port_pkey_list); down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + write_lock_irqsave(&device->client_data_lock, flags); + list_for_each_entry_safe(context, tmp, &device->client_data_list, + list) { + list_del(&context->list); kfree(context); - spin_unlock_irqrestore(&device->client_data_lock, flags); + } + write_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); device->reg_state = IB_DEV_UNREGISTERED; @@ -662,9 +724,8 @@ EXPORT_SYMBOL(ib_register_client); */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context, *tmp; + struct ib_client_data *context; struct ib_device *device; - unsigned long flags; mutex_lock(&device_mutex); @@ -676,14 +737,14 @@ void ib_unregister_client(struct ib_client *client) struct ib_client_data *found_context = NULL; down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + write_lock_irq(&device->client_data_lock); 
+ list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; break; } - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); if (client->remove) @@ -691,17 +752,18 @@ void ib_unregister_client(struct ib_client *client) found_context->data : NULL); if (!found_context) { - pr_warn("No client context found for %s/%s\n", - device->name, client->name); + dev_warn(&device->dev, + "No client context found for %s\n", + client->name); continue; } down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irq(&device->client_data_lock); list_del(&found_context->list); - kfree(found_context); - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); + kfree(found_context); } mutex_unlock(&device_mutex); @@ -722,13 +784,13 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client) void *ret = NULL; unsigned long flags; - spin_lock_irqsave(&device->client_data_lock, flags); + read_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { ret = context->data; break; } - spin_unlock_irqrestore(&device->client_data_lock, flags); + read_unlock_irqrestore(&device->client_data_lock, flags); return ret; } @@ -749,18 +811,18 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, struct ib_client_data *context; unsigned long flags; - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->data = data; goto out; } - pr_warn("No client context found for %s/%s\n", - device->name, client->name); + dev_warn(&device->dev, "No client context 
found for %s\n", + client->name); out: - spin_unlock_irqrestore(&device->client_data_lock, flags); + write_unlock_irqrestore(&device->client_data_lock, flags); } EXPORT_SYMBOL(ib_set_client_data); @@ -1166,10 +1228,19 @@ static int __init ib_core_init(void) goto err; } + ib_comp_unbound_wq = + alloc_workqueue("ib-comp-unb-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | + WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_unbound_wq) { + ret = -ENOMEM; + goto err_comp; + } + ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); - goto err_comp; + goto err_comp_unbound; } ret = rdma_nl_init(); @@ -1218,6 +1289,8 @@ err_ibnl: rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); +err_comp_unbound: + destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err: @@ -1236,6 +1309,7 @@ static void __exit ib_core_cleanup(void) addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index a077500f7f32..83ba0068e8bb 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -213,7 +213,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { - pr_info(PFX "Device %s does not support FMRs\n", device->name); + dev_info(&device->dev, "Device does not support FMRs\n"); return ERR_PTR(-ENOSYS); } @@ -257,7 +257,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, atomic_set(&pool->flush_ser, 0); init_waitqueue_head(&pool->force_wait); - pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name); + pool->worker = + kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev)); if (IS_ERR(pool->worker)) { pr_warn(PFX "couldn't start cleanup kthread worker\n"); ret = PTR_ERR(pool->worker); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 5d676cff41f4..ba668d49c751 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -509,7 +509,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->m_local_addr = cm_id->local_addr; cm_id->m_remote_addr = cm_id->remote_addr; - memcpy(pm_reg_msg.dev_name, cm_id->device->name, + memcpy(pm_reg_msg.dev_name, dev_name(&cm_id->device->dev), sizeof(pm_reg_msg.dev_name)); memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname, sizeof(pm_reg_msg.if_name)); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ef459f2f2eeb..d7025cd5be28 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -220,33 +220,37 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, int ret2, qpn; u8 mgmt_class, vclass; + if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) || + (qp_type == IB_QPT_GSI && 
!rdma_cap_ib_cm(device, port_num))) + return ERR_PTR(-EPROTONOSUPPORT); + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid QP Type %d\n", - qp_type); + dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n", + __func__, qp_type); goto error1; } if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid RMPP Version %u\n", - rmpp_version); + dev_dbg_ratelimited(&device->dev, + "%s: invalid RMPP Version %u\n", + __func__, rmpp_version); goto error1; } /* Validate MAD registration request if supplied */ if (mad_reg_req) { if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid Class Version %u\n", - mad_reg_req->mgmt_class_version); + dev_dbg_ratelimited(&device->dev, + "%s: invalid Class Version %u\n", + __func__, + mad_reg_req->mgmt_class_version); goto error1; } if (!recv_handler) { - dev_notice(&device->dev, - "ib_register_mad_agent: no recv_handler\n"); + dev_dbg_ratelimited(&device->dev, + "%s: no recv_handler\n", __func__); goto error1; } if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { @@ -256,9 +260,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, */ if (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } else if (mad_reg_req->mgmt_class == 0) { @@ -266,8 +270,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, * Class 0 is reserved in IBA and is used for * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid Mgmt Class 0\n"); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0\n", + 
__func__); goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* @@ -275,18 +280,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, * ensure supplied OUI is not zero */ if (!is_vendor_oui(mad_reg_req->oui)) { - dev_notice(&device->dev, - "ib_register_mad_agent: No OUI specified for class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: No OUI specified for class 0x%x\n", + __func__, + mad_reg_req->mgmt_class); goto error1; } } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { if (rmpp_version) { - dev_notice(&device->dev, - "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: RMPP version for non-RMPP class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } @@ -297,9 +303,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid SM QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } else { @@ -307,9 +313,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid GS QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } @@ -324,18 +330,18 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { - dev_notice(&device->dev, - 
"ib_register_mad_agent: Invalid port %d\n", - port_num); + dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n", + __func__, port_num); ret = ERR_PTR(-ENODEV); goto error1; } - /* Verify the QP requested is supported. For example, Ethernet devices - * will not have QP0 */ + /* Verify the QP requested is supported. For example, Ethernet devices + * will not have QP0. + */ if (!port_priv->qp_info[qpn].qp) { - dev_notice(&device->dev, - "ib_register_mad_agent: QP %d not supported\n", qpn); + dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n", + __func__, qpn); ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } @@ -2408,7 +2414,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms) + unsigned long timeout_ms) { mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); wait_for_response(mad_send_wr); @@ -3183,7 +3189,7 @@ static int ib_mad_port_open(struct ib_device *device, cq_size *= 2; port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, - IB_POLL_WORKQUEUE); + IB_POLL_UNBOUND_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index d84ae1671898..216509036aa8 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -221,6 +221,6 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms); + unsigned long timeout_ms); #endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 3ccaae18ad75..724f5a62e82f 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -47,9 +47,9 @@ static 
struct { const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int rdma_nl_chk_listeners(unsigned int group) +bool rdma_nl_chk_listeners(unsigned int group) { - return (netlink_has_listeners(nls, group)) ? 0 : -1; + return netlink_has_listeners(nls, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 0385ab438320..573399e3ccc1 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -179,7 +179,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; - if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev))) return -EMSGSIZE; return 0; @@ -645,6 +646,36 @@ err: return err; } +static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; + struct ib_device *device; + u32 index; + int err; + + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + extack); + if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) + return -EINVAL; + + index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]); + device = ib_device_get_by_index(index); + if (!device) + return -EINVAL; + + if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { + char name[IB_DEVICE_NAME_MAX] = {}; + + nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + IB_DEVICE_NAME_MAX); + err = ib_device_rename(device, name); + } + + put_device(&device->dev); + return err; +} + static int _nldev_get_dumpit(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, @@ -1077,6 +1108,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { .doit = nldev_get_doit, .dump = nldev_get_dumpit, }, + [RDMA_NLDEV_CMD_SET] = { + .doit = nldev_set_doit, + .flags = RDMA_NL_ADMIN_PERM, + }, [RDMA_NLDEV_CMD_PORT_GET] 
= { .doit = nldev_port_get_doit, .dump = nldev_port_get_dumpit, diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index c4118bcd5103..752a55c6bdce 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -794,44 +794,6 @@ void uverbs_close_fd(struct file *f) uverbs_uobject_put(uobj); } -static void ufile_disassociate_ucontext(struct ib_ucontext *ibcontext) -{ - struct ib_device *ib_dev = ibcontext->device; - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - /* in case task was dead need to release the - * task struct. - */ - if (owning_process) - put_task_struct(owning_process); - return; - } - } - } - - down_write(&owning_mm->mmap_sem); - ib_dev->disassociate_ucontext(ibcontext); - up_write(&owning_mm->mmap_sem); - mmput(owning_mm); - put_task_struct(owning_process); -} - /* * Drop the ucontext off the ufile and completely disconnect it from the * ib_device @@ -840,20 +802,28 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, enum rdma_remove_reason reason) { struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *ib_dev = ucontext->device; int ret; - if (reason == RDMA_REMOVE_DRIVER_REMOVE) - ufile_disassociate_ucontext(ucontext); + /* + * If we are closing the FD then the user mmap VMAs must have + * already been destroyed as they hold on to the filep, otherwise + * they need to be zap'd. 
+ */ + if (reason == RDMA_REMOVE_DRIVER_REMOVE) { + uverbs_user_mmap_disassociate(ufile); + if (ib_dev->disassociate_ucontext) + ib_dev->disassociate_ucontext(ucontext); + } - put_pid(ucontext->tgid); - ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, + ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); /* * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove * the error return. */ - ret = ucontext->device->dealloc_ucontext(ucontext); + ret = ib_dev->dealloc_ucontext(ucontext); WARN_ON(ret); ufile->ucontext = NULL; diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index f962f2a593ba..4886d2bba7c7 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -160,5 +160,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi); void uverbs_destroy_api(struct uverbs_api *uapi); void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3b7fa0ccaa08..06d8657ce583 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -50,8 +50,7 @@ void rdma_restrack_clean(struct rdma_restrack_root *res) dev = container_of(res, struct ib_device, res); pr_err("restrack: %s", CUT_HERE); - pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n", - dev->name); + dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); hash_for_each(res->hash, bkt, e, node) { if (rdma_is_kernel_res(e)) { owner = e->kern_name; @@ -156,6 +155,21 @@ static bool res_is_user(struct rdma_restrack_entry *res) } } +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + const char *caller) +{ + if (caller) { + res->kern_name = caller; + return; + } + + if (res->task) + put_task_struct(res->task); + 
get_task_struct(current); + res->task = current; +} +EXPORT_SYMBOL(rdma_restrack_set_task); + void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); @@ -168,7 +182,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) if (res_is_user(res)) { if (!res->task) - rdma_restrack_set_task(res, current); + rdma_restrack_set_task(res, NULL); res->kern_name = NULL; } else { set_kern_name(res); @@ -209,7 +223,7 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) struct ib_device *dev; if (!res->valid) - return; + goto out; dev = res_to_dev(res); if (!dev) @@ -222,8 +236,12 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) down_write(&dev->res.rwsem); hash_del(&res->node); res->valid = false; - if (res->task) - put_task_struct(res->task); up_write(&dev->res.rwsem); + +out: + if (res->task) { + put_task_struct(res->task); + res->task = NULL; + } } EXPORT_SYMBOL(rdma_restrack_del); diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index b1d4bbf4ce5c..cbaaaa92fff3 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -49,16 +49,14 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, + struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); int mcast_init(void); void mcast_cleanup(void); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7b794a14d6e8..be5ba5e15496 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -761,7 +761,7 @@ static void 
ib_nl_set_path_rec_attrs(struct sk_buff *skb, /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); - memcpy(header->device_name, query->port->agent->device->name, + memcpy(header->device_name, dev_name(&query->port->agent->device->dev), LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; @@ -835,7 +835,6 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) struct sk_buff *skb = NULL; struct nlmsghdr *nlh; void *data; - int ret = 0; struct ib_sa_mad *mad; int len; @@ -862,13 +861,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask) /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); - ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); - if (!ret) - ret = len; - else - ret = 0; - - return ret; + return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask); } static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) @@ -891,14 +884,12 @@ static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) spin_unlock_irqrestore(&ib_nl_request_lock, flags); ret = ib_nl_send_msg(query, gfp_mask); - if (ret <= 0) { + if (ret) { ret = -EIO; /* Remove the request */ spin_lock_irqsave(&ib_nl_request_lock, flags); list_del(&query->list); spin_unlock_irqrestore(&ib_nl_request_lock, flags); - } else { - ret = 0; } return ret; @@ -1227,46 +1218,6 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -static int roce_resolve_route_from_path(struct sa_path_rec *rec, - const struct ib_gid_attr *attr) -{ - struct rdma_dev_addr dev_addr = {}; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } sgid_addr, dgid_addr; - int ret; - - if (rec->roce.route_resolved) - return 0; - if (!attr || !attr->ndev) - return -EINVAL; - - dev_addr.bound_dev_if = attr->ndev->ifindex; - /* TODO: Use net from the ib_gid_attr once it is added to it, - * until than, limit itself to 
init_net. - */ - dev_addr.net = &init_net; - - rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); - rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); - - /* validate the route */ - ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, - &dgid_addr._sockaddr, &dev_addr); - if (ret) - return ret; - - if ((dev_addr.network == RDMA_NETWORK_IPV4 || - dev_addr.network == RDMA_NETWORK_IPV6) && - rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) - return -EINVAL; - - rec->roce.route_resolved = true; - return 0; -} - static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, @@ -1409,7 +1360,8 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) spin_unlock_irqrestore(&tid_lock, flags); } -static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) +static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, + gfp_t gfp_mask) { bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; @@ -1433,7 +1385,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } @@ -1599,7 +1551,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, void *context), @@ -1753,7 +1705,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, 
void *context), @@ -1850,7 +1802,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), @@ -1941,7 +1893,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), @@ -2108,7 +2060,7 @@ static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) } static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, - int timeout_ms, + unsigned long timeout_ms, void (*callback)(void *context), void *context, struct ib_sa_query **sa_query) diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 9b0bea8303e0..1143c0448666 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -685,9 +685,8 @@ static int ib_mad_agent_security_change(struct notifier_block *nb, if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; - ag->smp_allowed = !security_ib_endport_manage_subnet(ag->security, - ag->device->name, - ag->port_num); + ag->smp_allowed = !security_ib_endport_manage_subnet( + ag->security, dev_name(&ag->device->dev), ag->port_num); return NOTIFY_OK; } @@ -708,7 +707,7 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, return 0; ret = security_ib_endport_manage_subnet(agent->security, - agent->device->name, + dev_name(&agent->device->dev), agent->port_num); if (ret) return ret; diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7fd14ead7b37..6fcce2c206c6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -512,7 +512,7 
@@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, 40 + offset / 8, sizeof(data)); if (ret < 0) - return sprintf(buf, "N/A (no PMA)\n"); + return ret; switch (width) { case 4: @@ -1036,7 +1036,7 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_parent, + device->ports_kobj, "%d", port_num); if (ret) { kfree(p); @@ -1057,10 +1057,12 @@ static int add_port(struct ib_device *device, int port_num, goto err_put; } - p->pma_table = get_counter_table(device, port_num); - ret = sysfs_create_group(&p->kobj, p->pma_table); - if (ret) - goto err_put_gid_attrs; + if (device->process_mad) { + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; + } p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); @@ -1118,9 +1120,9 @@ static int add_port(struct ib_device *device, int port_num, } /* - * If port == 0, it means we have only one port and the parent - * device, not this port device, should be the holder of the - * hw_counters + * If port == 0, it means hw_counters are per device and not per + * port, so holder should be device. Therefore skip per port conunter + * initialization. 
*/ if (device->alloc_hw_stats && port_num) setup_hw_stats(device, p, port_num); @@ -1173,7 +1175,8 @@ err_free_gid: p->gid_group.attrs = NULL; err_remove_pma: - sysfs_remove_group(&p->kobj, p->pma_table); + if (p->pma_table) + sysfs_remove_group(&p->kobj, p->pma_table); err_put_gid_attrs: kobject_put(&p->gid_attr_group->kobj); @@ -1183,7 +1186,7 @@ err_put: return ret; } -static ssize_t show_node_type(struct device *device, +static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1198,8 +1201,9 @@ static ssize_t show_node_type(struct device *device, default: return sprintf(buf, "%d: <unknown>\n", dev->node_type); } } +static DEVICE_ATTR_RO(node_type); -static ssize_t show_sys_image_guid(struct device *device, +static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1210,8 +1214,9 @@ static ssize_t show_sys_image_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } +static DEVICE_ATTR_RO(sys_image_guid); -static ssize_t show_node_guid(struct device *device, +static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1222,8 +1227,9 @@ static ssize_t show_node_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->node_guid)[2]), be16_to_cpu(((__be16 *) &dev->node_guid)[3])); } +static DEVICE_ATTR_RO(node_guid); -static ssize_t show_node_desc(struct device *device, +static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1231,9 +1237,9 @@ static ssize_t show_node_desc(struct device *device, return sprintf(buf, "%.64s\n", 
dev->node_desc); } -static ssize_t set_node_desc(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t node_desc_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) { struct ib_device *dev = container_of(device, struct ib_device, dev); struct ib_device_modify desc = {}; @@ -1249,8 +1255,9 @@ static ssize_t set_node_desc(struct device *device, return count; } +static DEVICE_ATTR_RW(node_desc); -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, +static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1259,19 +1266,19 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } +static DEVICE_ATTR_RO(fw_ver); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_node_type.attr, + &dev_attr_node_guid.attr, + &dev_attr_sys_image_guid.attr, + &dev_attr_fw_ver.attr, + &dev_attr_node_desc.attr, + NULL, +}; -static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); -static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); -static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); -static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); - -static struct device_attribute *ib_class_attributes[] = { - &dev_attr_node_type, - &dev_attr_sys_image_guid, - &dev_attr_node_guid, - &dev_attr_node_desc, - &dev_attr_fw_ver, +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, }; static void free_port_list_attributes(struct ib_device *device) @@ -1285,7 +1292,9 @@ static void free_port_list_attributes(struct ib_device *device) kfree(port->hw_stats); free_hsag(&port->kobj, port->hw_stats_ag); } - sysfs_remove_group(p, port->pma_table); 
+ + if (port->pma_table) + sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); sysfs_remove_group(&port->gid_attr_group->kobj, @@ -1296,7 +1305,7 @@ static void free_port_list_attributes(struct ib_device *device) kobject_put(p); } - kobject_put(device->ports_parent); + kobject_put(device->ports_kobj); } int ib_device_register_sysfs(struct ib_device *device, @@ -1307,23 +1316,15 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - ret = dev_set_name(class_dev, "%s", device->name); - if (ret) - return ret; + device->groups[0] = &dev_attr_group; + class_dev->groups = device->groups; ret = device_add(class_dev); if (ret) goto err; - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - ret = device_create_file(class_dev, ib_class_attributes[i]); - if (ret) - goto err_unregister; - } - - device->ports_parent = kobject_create_and_add("ports", - &class_dev->kobj); - if (!device->ports_parent) { + device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj); + if (!device->ports_kobj) { ret = -ENOMEM; goto err_put; } @@ -1347,20 +1348,15 @@ int ib_device_register_sysfs(struct ib_device *device, err_put: free_port_list_attributes(device); - -err_unregister: device_del(class_dev); - err: return ret; } void ib_device_unregister_sysfs(struct ib_device *device) { - int i; - - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); + /* Hold device until ib_dealloc_device() */ + get_device(&device->dev); free_port_list_attributes(device); @@ -1369,8 +1365,5 @@ void ib_device_unregister_sysfs(struct ib_device *device) free_hsag(&device->dev.kobj, device->hw_stats_ag); } - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) - device_remove_file(&device->dev, ib_class_attributes[i]); - device_unregister(&device->dev); } diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a41792dbae1f..c6144df47ea4 100644 --- 
a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -85,7 +85,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct page **page_list; struct vm_area_struct **vma_list; unsigned long lock_limit; + unsigned long new_pinned; unsigned long cur_base; + struct mm_struct *mm; unsigned long npages; int ret; int i; @@ -107,25 +109,32 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - umem = kzalloc(sizeof *umem, GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); + if (access & IB_ACCESS_ON_DEMAND) { + umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->is_odp = 1; + } else { + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + } umem->context = context; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = ib_access_writable(access); + umem->owning_mm = mm = current->mm; + mmgrab(mm); if (access & IB_ACCESS_ON_DEMAND) { - ret = ib_umem_odp_get(context, umem, access); + ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); if (ret) goto umem_kfree; return umem; } - umem->odp_data = NULL; - /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; @@ -144,25 +153,25 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->hugetlb = 0; npages = ib_umem_num_pages(umem); + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; + goto out; + } lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - down_write(&current->mm->mmap_sem); - current->mm->pinned_vm += npages; - if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { - up_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); + if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || + (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { + up_write(&mm->mmap_sem); ret = -ENOMEM; - goto vma; + goto
out; } - up_write(&current->mm->mmap_sem); + mm->pinned_vm = new_pinned; + up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; - if (npages == 0 || npages > UINT_MAX) { - ret = -EINVAL; - goto vma; - } - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); if (ret) goto vma; @@ -172,14 +181,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; - down_read(&current->mm->mmap_sem); while (npages) { + down_read(&mm->mmap_sem); ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), gup_flags, page_list, vma_list); if (ret < 0) { - up_read(&current->mm->mmap_sem); + up_read(&mm->mmap_sem); goto umem_release; } @@ -187,17 +196,20 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base += ret * PAGE_SIZE; npages -= ret; + /* Continue to hold the mmap_sem as vma_list access + * needs to be protected. + */ for_each_sg(sg_list_start, sg, ret, i) { if (vma_list && !is_vm_hugetlb_page(vma_list[i])) umem->hugetlb = 0; sg_set_page(sg, page_list[i], PAGE_SIZE, 0); } + up_read(&mm->mmap_sem); /* preparing for next loop */ sg_list_start = sg; } - up_read(&current->mm->mmap_sem); umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, @@ -216,29 +228,40 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem_release: __ib_umem_release(context->device, umem, 0); vma: - down_write(&current->mm->mmap_sem); - current->mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(&current->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&mm->mmap_sem); out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); umem_kfree: - if (ret) + if (ret) { + mmdrop(umem->owning_mm); kfree(umem); + } return ret ?
ERR_PTR(ret) : umem; } EXPORT_SYMBOL(ib_umem_get); -static void ib_umem_account(struct work_struct *work) +static void __ib_umem_release_tail(struct ib_umem *umem) +{ + mmdrop(umem->owning_mm); + if (umem->is_odp) + kfree(to_ib_umem_odp(umem)); + else + kfree(umem); +} + +static void ib_umem_release_defer(struct work_struct *work) { struct ib_umem *umem = container_of(work, struct ib_umem, work); - down_write(&umem->mm->mmap_sem); - umem->mm->pinned_vm -= umem->diff; - up_write(&umem->mm->mmap_sem); - mmput(umem->mm); - kfree(umem); + down_write(&umem->owning_mm->mmap_sem); + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); + + __ib_umem_release_tail(umem); } /** @@ -248,52 +271,36 @@ static void ib_umem_account(struct work_struct *work) void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; - struct mm_struct *mm; - struct task_struct *task; - unsigned long diff; - if (umem->odp_data) { - ib_umem_odp_release(umem); + if (umem->is_odp) { + ib_umem_odp_release(to_ib_umem_odp(umem)); + __ib_umem_release_tail(umem); return; } __ib_umem_release(umem->context->device, umem, 1); - task = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; - - diff = ib_umem_num_pages(umem); - /* * We may be called with the mm's mmap_sem already held. This * can happen when a userspace munmap() is the call that drops * the last reference to our file and calls our release * method. If there are memory regions to destroy, we'll end * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting to the system workqueue. + * we defer the vm_locked accounting a workqueue. 
*/ if (context->closing) { - if (!down_write_trylock(&mm->mmap_sem)) { - INIT_WORK(&umem->work, ib_umem_account); - umem->mm = mm; - umem->diff = diff; - + if (!down_write_trylock(&umem->owning_mm->mmap_sem)) { + INIT_WORK(&umem->work, ib_umem_release_defer); queue_work(ib_wq, &umem->work); return; } - } else - down_write(&mm->mmap_sem); + } else { + down_write(&umem->owning_mm->mmap_sem); + } + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); - mm->pinned_vm -= diff; - up_write(&mm->mmap_sem); - mmput(mm); -out: - kfree(umem); + __ib_umem_release_tail(umem); } EXPORT_SYMBOL(ib_umem_release); @@ -303,7 +310,7 @@ int ib_umem_page_count(struct ib_umem *umem) int n; struct scatterlist *sg; - if (umem->odp_data) + if (umem->is_odp) return ib_umem_num_pages(umem); n = 0; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6ec748eccff7..2b4c5e7dd5a1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -58,7 +58,7 @@ static u64 node_start(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_start(umem_odp->umem); + return ib_umem_start(&umem_odp->umem); } /* Note that the representation of the intervals in the interval tree @@ -71,140 +71,86 @@ static u64 node_last(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_end(umem_odp->umem) - 1; + return ib_umem_end(&umem_odp->umem) - 1; } INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem *item) +static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { - mutex_lock(&item->odp_data->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. 
All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { - int notifiers_count = item->odp_data->notifiers_count++; - - if (notifiers_count == 0) - /* Initialize the completion object for waiting on - * notifiers. Since notifier_count is zero, no one - * should be waiting right now. */ - reinit_completion(&item->odp_data->notifier_completion); - } - mutex_unlock(&item->odp_data->umem_mutex); -} - -static void ib_umem_notifier_end_account(struct ib_umem *item) -{ - mutex_lock(&item->odp_data->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { + mutex_lock(&umem_odp->umem_mutex); + if (umem_odp->notifiers_count++ == 0) /* - * This sequence increase will notify the QP page fault that - * the page that is going to be mapped in the spte could have - * been freed. + * Initialize the completion object for waiting on + * notifiers. Since notifier_count is zero, no one should be + * waiting right now. */ - ++item->odp_data->notifiers_seq; - if (--item->odp_data->notifiers_count == 0) - complete_all(&item->odp_data->notifier_completion); - } - mutex_unlock(&item->odp_data->umem_mutex); + reinit_completion(&umem_odp->notifier_completion); + mutex_unlock(&umem_odp->umem_mutex); } -/* Account for a new mmu notifier in an ib_ucontext. */ -static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { - atomic_inc(&context->notifier_count); + mutex_lock(&umem_odp->umem_mutex); + /* + * This sequence increase will notify the QP page fault that the page + * that is going to be mapped in the spte could have been freed. 
+ */ + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); + mutex_unlock(&umem_odp->umem_mutex); } -/* Account for a terminating mmu notifier in an ib_ucontext. - * - * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since - * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, + u64 start, u64 end, void *cookie) { - int zero_notifiers = atomic_dec_and_test(&context->notifier_count); - - if (zero_notifiers && - !list_empty(&context->no_private_counters)) { - /* No currently running mmu notifiers. Now is the chance to - * add private accounting to all previously added umems. */ - struct ib_umem_odp *odp_data, *next; - - /* Prevent concurrent mmu notifiers from working on the - * no_private_counters list. */ - down_write(&context->umem_rwsem); - - /* Read the notifier_count again, with the umem_rwsem - * semaphore taken for write. */ - if (!atomic_read(&context->notifier_count)) { - list_for_each_entry_safe(odp_data, next, - &context->no_private_counters, - no_private_counters) { - mutex_lock(&odp_data->umem_mutex); - odp_data->mn_counters_active = true; - list_del(&odp_data->no_private_counters); - complete_all(&odp_data->notifier_completion); - mutex_unlock(&odp_data->umem_mutex); - } - } - - up_write(&context->umem_rwsem); - } -} + struct ib_umem *umem = &umem_odp->umem; -static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) { /* * Increase the number of notifiers running, to * prevent any further fault handling on this MR. */ - ib_umem_notifier_start_account(item); - item->odp_data->dying = 1; + ib_umem_notifier_start_account(umem_odp); + umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. 
*/ smp_wmb(); - complete_all(&item->odp_data->notifier_completion); - item->context->invalidate_range(item, ib_umem_start(item), - ib_umem_end(item)); + complete_all(&umem_odp->notifier_completion); + umem->context->invalidate_range(umem_odp, ib_umem_start(umem), + ib_umem_end(umem)); return 0; } static void ib_umem_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); - - if (!context->invalidate_range) - return; - - ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, - ULLONG_MAX, - ib_umem_notifier_release_trampoline, - true, - NULL); - up_read(&context->umem_rwsem); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); + + down_read(&per_mm->umem_rwsem); + if (per_mm->active) + rbt_ib_umem_for_each_in_range( + &per_mm->umem_tree, 0, ULLONG_MAX, + ib_umem_notifier_release_trampoline, true, NULL); + up_read(&per_mm->umem_rwsem); } -static int invalidate_page_trampoline(struct ib_umem *item, u64 start, +static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, start + PAGE_SIZE); + item->umem.context->invalidate_range(item, start, start + PAGE_SIZE); ib_umem_notifier_end_account(item); return 0; } -static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) +static int invalidate_range_start_trampoline(struct ib_umem_odp *item, + u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, end); + item->umem.context->invalidate_range(item, start, end); return 0; } @@ -214,28 +160,30 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, unsigned long end, bool blockable) { - struct ib_ucontext *context = 
container_of(mn, struct ib_ucontext, mn); - int ret; - - if (!context->invalidate_range) - return 0; + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); if (blockable) - down_read(&context->umem_rwsem); - else if (!down_read_trylock(&context->umem_rwsem)) + down_read(&per_mm->umem_rwsem); + else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(context); - ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start, - end, - invalidate_range_start_trampoline, - blockable, NULL); - up_read(&context->umem_rwsem); + if (!per_mm->active) { + up_read(&per_mm->umem_rwsem); + /* + * At this point active is permanently set and visible to this + * CPU without a lock, that fact is relied on to skip the unlock + * in range_end. + */ + return 0; + } - return ret; + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, + invalidate_range_start_trampoline, + blockable, NULL); } -static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, +static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_end_account(item); @@ -247,22 +195,16 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - if (!context->invalidate_range) + if (unlikely(!per_mm->active)) return; - /* - * TODO: we currently bail out if there is any sleepable work to be done - * in ib_umem_notifier_invalidate_range_start so we shouldn't really block - * here. But this is ugly and fragile. 
- */ - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); - up_read(&context->umem_rwsem); - ib_ucontext_notifier_end_account(context); + up_read(&per_mm->umem_rwsem); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -271,31 +213,158 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem; + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + up_write(&per_mm->umem_rwsem); +} + +static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + complete_all(&umem_odp->notifier_completion); + + up_write(&per_mm->umem_rwsem); +} + +static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, + struct mm_struct *mm) +{ + struct ib_ucontext_per_mm *per_mm; + int ret; + + per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); + if (!per_mm) + return ERR_PTR(-ENOMEM); + + per_mm->context = ctx; + per_mm->mm = mm; + per_mm->umem_tree = RB_ROOT_CACHED; + init_rwsem(&per_mm->umem_rwsem); + per_mm->active = ctx->invalidate_range; + + rcu_read_lock(); + per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + + WARN_ON(mm != current->mm); + + per_mm->mn.ops = 
&ib_umem_notifiers; + ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); + if (ret) { + dev_err(&ctx->device->dev, + "Failed to register mmu_notifier %d\n", ret); + goto out_pid; + } + + list_add(&per_mm->ucontext_list, &ctx->per_mm_list); + return per_mm; + +out_pid: + put_pid(per_mm->tgid); + kfree(per_mm); + return ERR_PTR(ret); +} + +static int get_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext *ctx = umem_odp->umem.context; + struct ib_ucontext_per_mm *per_mm; + + /* + * Generally speaking we expect only one or two per_mm in this list, + * so no reason to optimize this search today. + */ + mutex_lock(&ctx->per_mm_list_lock); + list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { + if (per_mm->mm == umem_odp->umem.owning_mm) + goto found; + } + + per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); + if (IS_ERR(per_mm)) { + mutex_unlock(&ctx->per_mm_list_lock); + return PTR_ERR(per_mm); + } + +found: + umem_odp->per_mm = per_mm; + per_mm->odp_mrs_count++; + mutex_unlock(&ctx->per_mm_list_lock); + + return 0; +} + +static void free_per_mm(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); +} + +void put_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_ucontext *ctx = umem_odp->umem.context; + bool need_free; + + mutex_lock(&ctx->per_mm_list_lock); + umem_odp->per_mm = NULL; + per_mm->odp_mrs_count--; + need_free = per_mm->odp_mrs_count == 0; + if (need_free) + list_del(&per_mm->ucontext_list); + mutex_unlock(&ctx->per_mm_list_lock); + + if (!need_free) + return; + + /* + * NOTE! mmu_notifier_unregister() can happen between a start/end + * callback, resulting in an start/end, and thus an unbalanced + * lock. This doesn't really matter to us since we are about to kfree + * the memory that holds the lock, however LOCKDEP doesn't like this. 
+ */ + down_write(&per_mm->umem_rwsem); + per_mm->active = false; + up_write(&per_mm->umem_rwsem); + + WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); + mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); + put_pid(per_mm->tgid); + mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); +} + +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, + unsigned long addr, size_t size) +{ + struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; + struct ib_umem *umem; int pages = size >> PAGE_SHIFT; int ret; - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); + if (!odp_data) return ERR_PTR(-ENOMEM); - - umem->context = context; + umem = &odp_data->umem; + umem->context = ctx; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; - - odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); - if (!odp_data) { - ret = -ENOMEM; - goto out_umem; - } - odp_data->umem = umem; + umem->is_odp = 1; + odp_data->per_mm = per_mm; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -314,39 +383,34 @@ struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, goto out_page_list; } - down_write(&context->umem_rwsem); - context->odp_mrs_count++; - rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count))) - odp_data->mn_counters_active = true; - else - list_add(&odp_data->no_private_counters, - &context->no_private_counters); - up_write(&context->umem_rwsem); - - umem->odp_data = odp_data; + /* + * Caller must ensure that the umem_odp that the per_mm came from + * cannot be freed during the call to ib_alloc_odp_umem. 
+ */ + mutex_lock(&ctx->per_mm_list_lock); + per_mm->odp_mrs_count++; + mutex_unlock(&ctx->per_mm_list_lock); + add_umem_to_per_mm(odp_data); - return umem; + return odp_data; out_page_list: vfree(odp_data->page_list); out_odp_data: kfree(odp_data); -out_umem: - kfree(umem); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_alloc_odp_umem); -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access) +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { + struct ib_umem *umem = &umem_odp->umem; + /* + * NOTE: This must called in a process context where umem->owning_mm + * == current->mm + */ + struct mm_struct *mm = umem->owning_mm; int ret_val; - struct pid *our_pid; - struct mm_struct *mm = get_task_mm(current); - - if (!mm) - return -EINVAL; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; @@ -366,111 +430,43 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, umem->hugetlb = 0; } - /* Prevent creating ODP MRs in child processes */ - rcu_read_lock(); - our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); - put_pid(our_pid); - if (context->tgid != our_pid) { - ret_val = -EINVAL; - goto out_mm; - } - - umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); - if (!umem->odp_data) { - ret_val = -ENOMEM; - goto out_mm; - } - umem->odp_data->umem = umem; - - mutex_init(&umem->odp_data->umem_mutex); + mutex_init(&umem_odp->umem_mutex); - init_completion(&umem->odp_data->notifier_completion); + init_completion(&umem_odp->notifier_completion); if (ib_umem_num_pages(umem)) { - umem->odp_data->page_list = - vzalloc(array_size(sizeof(*umem->odp_data->page_list), + umem_odp->page_list = + vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->page_list) { - ret_val = -ENOMEM; - goto out_odp_data; - } + if (!umem_odp->page_list) + return -ENOMEM; - umem->odp_data->dma_list = - vzalloc(array_size(sizeof(*umem->odp_data->dma_list), + 
umem_odp->dma_list = + vzalloc(array_size(sizeof(*umem_odp->dma_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->dma_list) { + if (!umem_odp->dma_list) { ret_val = -ENOMEM; goto out_page_list; } } - /* - * When using MMU notifiers, we will get a - * notification before the "current" task (and MM) is - * destroyed. We use the umem_rwsem semaphore to synchronize. - */ - down_write(&context->umem_rwsem); - context->odp_mrs_count++; - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem->odp_data->interval_tree, - &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count)) || - context->odp_mrs_count == 1) - umem->odp_data->mn_counters_active = true; - else - list_add(&umem->odp_data->no_private_counters, - &context->no_private_counters); - downgrade_write(&context->umem_rwsem); - - if (context->odp_mrs_count == 1) { - /* - * Note that at this point, no MMU notifier is running - * for this context! - */ - atomic_set(&context->notifier_count, 0); - INIT_HLIST_NODE(&context->mn.hlist); - context->mn.ops = &ib_umem_notifiers; - /* - * Lock-dep detects a false positive for mmap_sem vs. - * umem_rwsem, due to not grasping downgrade_write correctly. - */ - lockdep_off(); - ret_val = mmu_notifier_register(&context->mn, mm); - lockdep_on(); - if (ret_val) { - pr_err("Failed to register mmu_notifier %d\n", ret_val); - ret_val = -EBUSY; - goto out_mutex; - } - } - - up_read(&context->umem_rwsem); + ret_val = get_per_mm(umem_odp); + if (ret_val) + goto out_dma_list; + add_umem_to_per_mm(umem_odp); - /* - * Note that doing an mmput can cause a notifier for the relevant mm. - * If the notifier is called while we hold the umem_rwsem, this will - * cause a deadlock. Therefore, we release the reference only after we - * released the semaphore. 
- */ - mmput(mm); return 0; -out_mutex: - up_read(&context->umem_rwsem); - vfree(umem->odp_data->dma_list); +out_dma_list: + vfree(umem_odp->dma_list); out_page_list: - vfree(umem->odp_data->page_list); -out_odp_data: - kfree(umem->odp_data); -out_mm: - mmput(mm); + vfree(umem_odp->page_list); return ret_val; } -void ib_umem_odp_release(struct ib_umem *umem) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_ucontext *context = umem->context; + struct ib_umem *umem = &umem_odp->umem; /* * Ensure that no more pages are mapped in the umem. @@ -478,61 +474,13 @@ void ib_umem_odp_release(struct ib_umem *umem) * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. */ - ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - down_write(&context->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem->odp_data->interval_tree, - &context->umem_tree); - context->odp_mrs_count--; - if (!umem->odp_data->mn_counters_active) { - list_del(&umem->odp_data->no_private_counters); - complete_all(&umem->odp_data->notifier_completion); - } - - /* - * Downgrade the lock to a read lock. This ensures that the notifiers - * (who lock the mutex for reading) will be able to finish, and we - * will be able to enventually obtain the mmu notifiers SRCU. Note - * that since we are doing it atomically, no other user could register - * and unregister while we do the check. - */ - downgrade_write(&context->umem_rwsem); - if (!context->odp_mrs_count) { - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(context->tgid, - PIDTYPE_PID); - if (owning_process == NULL) - /* - * The process is already dead, notifier were removed - * already. 
- */ - goto out; - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) - /* - * The process' mm is already dead, notifier were - * removed already. - */ - goto out_put_task; - mmu_notifier_unregister(&context->mn, owning_mm); - - mmput(owning_mm); - -out_put_task: - put_task_struct(owning_process); - } -out: - up_read(&context->umem_rwsem); - - vfree(umem->odp_data->dma_list); - vfree(umem->odp_data->page_list); - kfree(umem->odp_data); - kfree(umem); + remove_umem_from_per_mm(umem_odp); + put_per_mm(umem_odp); + vfree(umem_odp->dma_list); + vfree(umem_odp->page_list); } /* @@ -544,7 +492,7 @@ out: * @access_mask: access permissions needed for this page. * @current_seq: sequence number for synchronization with invalidations. * the sequence number is taken from - * umem->odp_data->notifiers_seq. + * umem_odp->notifiers_seq. * * The function returns -EFAULT if the DMA mapping operation fails. It returns * -EAGAIN if a concurrent invalidation prevents us from updating the page. @@ -554,12 +502,13 @@ out: * umem. */ static int ib_umem_odp_map_dma_single_page( - struct ib_umem *umem, + struct ib_umem_odp *umem_odp, int page_index, struct page *page, u64 access_mask, unsigned long current_seq) { + struct ib_umem *umem = &umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; int stored_page = 0; @@ -571,11 +520,11 @@ static int ib_umem_odp_map_dma_single_page( * handle case of a racing notifier. This check also allows us to bail * early if we have a notifier running in parallel with us. 
*/ - if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) { ret = -EAGAIN; goto out; } - if (!(umem->odp_data->dma_list[page_index])) { + if (!(umem_odp->dma_list[page_index])) { dma_addr = ib_dma_map_page(dev, page, 0, BIT(umem->page_shift), @@ -584,15 +533,15 @@ static int ib_umem_odp_map_dma_single_page( ret = -EFAULT; goto out; } - umem->odp_data->dma_list[page_index] = dma_addr | access_mask; - umem->odp_data->page_list[page_index] = page; + umem_odp->dma_list[page_index] = dma_addr | access_mask; + umem_odp->page_list[page_index] = page; umem->npages++; stored_page = 1; - } else if (umem->odp_data->page_list[page_index] == page) { - umem->odp_data->dma_list[page_index] |= access_mask; + } else if (umem_odp->page_list[page_index] == page) { + umem_odp->dma_list[page_index] |= access_mask; } else { pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem->odp_data->page_list[page_index], page); + umem_odp->page_list[page_index], page); /* Better remove the mapping now, to prevent any further * damage. */ remove_existing_mapping = 1; @@ -605,7 +554,7 @@ out: if (remove_existing_mapping && umem->context->invalidate_range) { invalidate_page_trampoline( - umem, + umem_odp, ib_umem_start(umem) + (page_index >> umem->page_shift), ib_umem_start(umem) + ((page_index + 1) >> umem->page_shift), @@ -621,7 +570,7 @@ out: * * Pins the range of pages passed in the argument, and maps them to * DMA addresses. The DMA addresses of the mapped pages is updated in - * umem->odp_data->dma_list. + * umem_odp->dma_list. * * Returns the number of pages mapped in success, negative error code * for failure. @@ -629,7 +578,7 @@ out: * the function from completing its task. * An -ENOENT error code indicates that userspace process is being terminated * and mm was already destroyed. 
- * @umem: the umem to map and pin + * @umem_odp: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be * bigger due to alignment, and may also be smaller in case of an error @@ -639,13 +588,15 @@ out: * range. * @current_seq: the MMU notifiers sequance value for synchronization with * invalidations. the sequance number is read from - * umem->odp_data->notifiers_seq before calling this function + * umem_odp->notifiers_seq before calling this function */ -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, - u64 access_mask, unsigned long current_seq) +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, + unsigned long current_seq) { + struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; + struct mm_struct *owning_mm = umem_odp->umem.owning_mm; struct page **local_page_list = NULL; u64 page_mask, off; int j, k, ret = 0, start_idx, npages = 0, page_shift; @@ -669,15 +620,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, user_virt = user_virt & page_mask; bcnt += off; /* Charge for the first page offset as well. */ - owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (owning_process == NULL) { + /* + * owning_process is allowed to be NULL, this means somehow the mm is + * existing beyond the lifetime of the originating process.. Presumably + * mmget_not_zero will fail in this case. 
+ */ + owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); + if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) { ret = -EINVAL; - goto out_no_task; - } - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) { - ret = -ENOENT; goto out_put_task; } @@ -709,7 +659,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, break; bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { if (user_virt & ~page_mask) { p += PAGE_SIZE; @@ -722,7 +672,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, } ret = ib_umem_odp_map_dma_single_page( - umem, k, local_page_list[j], + umem_odp, k, local_page_list[j], access_mask, current_seq); if (ret < 0) break; @@ -730,7 +680,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, p = page_to_phys(local_page_list[j]); k++; } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); if (ret < 0) { /* Release left over pages when handling errors. */ @@ -749,16 +699,17 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, mmput(owning_mm); out_put_task: - put_task_struct(owning_process); -out_no_task: + if (owning_process) + put_task_struct(owning_process); free_page((unsigned long)local_page_list); return ret; } EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { + struct ib_umem *umem = &umem_odp->umem; int idx; u64 addr; struct ib_device *dev = umem->context->device; @@ -770,12 +721,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, * faults from completion. We might be racing with other * invalidations, so we must make sure we free each page only * once. 
*/ - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { idx = (addr - ib_umem_start(umem)) >> umem->page_shift; - if (umem->odp_data->page_list[idx]) { - struct page *page = umem->odp_data->page_list[idx]; - dma_addr_t dma = umem->odp_data->dma_list[idx]; + if (umem_odp->page_list[idx]) { + struct page *page = umem_odp->page_list[idx]; + dma_addr_t dma = umem_odp->dma_list[idx]; dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; WARN_ON(!dma_addr); @@ -798,12 +749,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, /* on demand pinning support */ if (!umem->context->invalidate_range) put_page(page); - umem->odp_data->page_list[idx] = NULL; - umem->odp_data->dma_list[idx] = 0; + umem_odp->page_list[idx] = NULL; + umem_odp->dma_list[idx] = 0; umem->npages--; } } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); @@ -830,7 +781,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, return -EAGAIN; next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); - ret_val = cb(umem->umem, start, last, cookie) || ret_val; + ret_val = cb(umem, start, last, cookie) || ret_val; } return ret_val; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index c34a6852d691..f55f48f6b272 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -138,7 +138,7 @@ static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) + static dev_t dynamic_umad_dev; static dev_t dynamic_issm_dev; -static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS); +static DEFINE_IDA(umad_ida); static void ib_umad_add_one(struct ib_device *device); static void ib_umad_remove_one(struct ib_device *device, void *client_data); @@ -1132,7 +1132,7 @@ static ssize_t show_ibdev(struct device 
*dev, struct device_attribute *attr, if (!port) return -ENODEV; - return sprintf(buf, "%s\n", port->ib_dev->name); + return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev)); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); @@ -1159,11 +1159,10 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, dev_t base_umad; dev_t base_issm; - devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); - if (devnum >= IB_UMAD_MAX_PORTS) + devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL); + if (devnum < 0) return -1; port->dev_num = devnum; - set_bit(devnum, dev_map); if (devnum >= IB_UMAD_NUM_FIXED_MINOR) { base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR; @@ -1227,7 +1226,7 @@ err_dev: err_cdev: cdev_del(&port->cdev); - clear_bit(devnum, dev_map); + ida_free(&umad_ida, devnum); return -1; } @@ -1261,7 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port) } mutex_unlock(&port->file_mutex); - clear_bit(port->dev_num, dev_map); + ida_free(&umad_ida, port->dev_num); } static void ib_umad_add_one(struct ib_device *device) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 5df8e548cc14..c97935a0c7c6 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,13 +100,14 @@ struct ib_uverbs_device { atomic_t refcount; int num_comp_vectors; struct completion comp; - struct device *dev; + struct device dev; + /* First group for device attributes, NULL terminated array */ + const struct attribute_group *groups[2]; struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; - struct kobject kobj; struct srcu_struct disassociate_srcu; struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; @@ -146,7 +147,6 @@ struct ib_uverbs_file { struct ib_event_handler event_handler; struct 
ib_uverbs_async_event_file *async_file; struct list_head list; - int is_closed; /* * To access the uobjects list hw_destroy_rwsem must be held for write @@ -158,6 +158,9 @@ struct ib_uverbs_file { spinlock_t uobjects_lock; struct list_head uobjects; + struct mutex umap_lock; + struct list_head umaps; + u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; @@ -218,12 +221,6 @@ struct ib_ucq_object { u32 async_events_reported; }; -struct ib_uflow_resources; -struct ib_uflow_object { - struct ib_uobject uobject; - struct ib_uflow_resources *resources; -}; - extern const struct file_operations uverbs_event_fops; void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index e012ca80f9d1..a93853770e3c 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -117,18 +117,12 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, /* ufile is required when some objects are released */ ucontext->ufile = file; - rcu_read_lock(); - ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); - ucontext->closing = 0; + ucontext->closing = false; ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->umem_tree = RB_ROOT_CACHED; - init_rwsem(&ucontext->umem_rwsem); - ucontext->odp_mrs_count = 0; - INIT_LIST_HEAD(&ucontext->no_private_counters); - + mutex_init(&ucontext->per_mm_list_lock); + INIT_LIST_HEAD(&ucontext->per_mm_list); if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; @@ -172,7 +166,6 @@ err_fd: put_unused_fd(resp.async_fd); err_free: - put_pid(ucontext->tgid); ib_dev->dealloc_ucontext(ucontext); err_alloc: @@ -2769,16 +2762,7 @@ out_put: return ret ? 
ret : in_len; } -struct ib_uflow_resources { - size_t max; - size_t num; - size_t collection_num; - size_t counters_num; - struct ib_counters **counters; - struct ib_flow_action **collection; -}; - -static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; @@ -2808,6 +2792,7 @@ err: return NULL; } +EXPORT_SYMBOL(flow_resources_alloc); void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) { @@ -2826,10 +2811,11 @@ void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) kfree(uflow_res->counters); kfree(uflow_res); } +EXPORT_SYMBOL(ib_uverbs_flow_resources_free); -static void flow_resources_add(struct ib_uflow_resources *uflow_res, - enum ib_flow_spec_type type, - void *ibobj) +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj) { WARN_ON(uflow_res->num >= uflow_res->max); @@ -2850,6 +2836,7 @@ static void flow_resources_add(struct ib_uflow_resources *uflow_res, uflow_res->num++; } +EXPORT_SYMBOL(flow_resources_add); static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile, struct ib_uverbs_flow_spec *kern_spec, @@ -3484,7 +3471,6 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; - struct ib_uflow_object *uflow; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; @@ -3623,13 +3609,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = PTR_ERR(flow_id); goto err_free; } - atomic_inc(&qp->usecnt); - flow_id->qp = qp; - flow_id->device = qp->device; - flow_id->uobject = uobj; - uobj->object = flow_id; - uflow = container_of(uobj, typeof(*uflow), uobject); - uflow->resources = uflow_res; + + ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res); memset(&resp, 0, sizeof(resp)); 
resp.flow_handle = uobj->id; diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 1a6b229e3db3..b0e493e8d860 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -57,6 +57,7 @@ struct bundle_priv { struct ib_uverbs_attr *uattrs; DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps @@ -143,6 +144,86 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, 0, uattr->len - len); } +static int uverbs_process_idrs_array(struct bundle_priv *pbundle, + const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + struct ib_uverbs_attr *uattr, + u32 attr_bkey) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + size_t array_len; + u32 *idr_vals; + int ret = 0; + size_t i; + + if (uattr->attr_data.reserved) + return -EINVAL; + + if (uattr->len % sizeof(u32)) + return -EINVAL; + + array_len = uattr->len / sizeof(u32); + if (array_len < spec->u2.objs_arr.min_len || + array_len > spec->u2.objs_arr.max_len) + return -EINVAL; + + attr->uobjects = + uverbs_alloc(&pbundle->bundle, + array_size(array_len, sizeof(*attr->uobjects))); + if (IS_ERR(attr->uobjects)) + return PTR_ERR(attr->uobjects); + + /* + * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects + * to store idrs array and avoid additional memory allocation. The + * idrs array is offset to the end of the uobjects array so we will be + * able to read idr and replace with a pointer. 
+ */ + idr_vals = (u32 *)(attr->uobjects + array_len) - array_len; + + if (uattr->len > sizeof(uattr->data)) { + ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data), + uattr->len); + if (ret) + return -EFAULT; + } else { + memcpy(idr_vals, &uattr->data, uattr->len); + } + + for (i = 0; i != array_len; i++) { + attr->uobjects[i] = uverbs_get_uobject_from_file( + spec->u2.objs_arr.obj_type, pbundle->bundle.ufile, + spec->u2.objs_arr.access, idr_vals[i]); + if (IS_ERR(attr->uobjects[i])) { + ret = PTR_ERR(attr->uobjects[i]); + break; + } + } + + attr->len = i; + __set_bit(attr_bkey, pbundle->spec_finalize); + return ret; +} + +static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + bool commit) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + int current_ret; + int ret = 0; + size_t i; + + for (i = 0; i != attr->len; i++) { + current_ret = uverbs_finalize_object( + attr->uobjects[i], spec->u2.objs_arr.access, commit); + if (!ret) + ret = current_ret; + } + + return ret; +} + static int uverbs_process_attr(struct bundle_priv *pbundle, const struct uverbs_api_attr *attr_uapi, struct ib_uverbs_attr *uattr, u32 attr_bkey) @@ -246,6 +327,11 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, } break; + + case UVERBS_ATTR_TYPE_IDRS_ARRAY: + return uverbs_process_idrs_array(pbundle, attr_uapi, + &e->objs_arr_attr, uattr, + attr_bkey); default: return -EOPNOTSUPP; } @@ -300,8 +386,7 @@ static int uverbs_set_attr(struct bundle_priv *pbundle, return -EPROTONOSUPPORT; return 0; } - attr = srcu_dereference( - *slot, &pbundle->bundle.ufile->device->disassociate_srcu); + attr = rcu_dereference_protected(*slot, true); /* Reject duplicate attributes from user-space */ if (test_bit(attr_bkey, pbundle->bundle.attr_present)) @@ -384,6 +469,7 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) unsigned int i; int ret = 0; + /* fast path for simple uobjects */ i = -1; while ((i = 
find_next_bit(pbundle->uobj_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { @@ -397,6 +483,30 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) ret = current_ret; } + i = -1; + while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len, + i + 1)) < key_bitmap_len) { + struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + const struct uverbs_api_attr *attr_uapi; + void __rcu **slot; + int current_ret; + + slot = uapi_get_attr_for_method( + pbundle, + pbundle->method_key | uapi_bkey_to_key_attr(i)); + if (WARN_ON(!slot)) + continue; + + attr_uapi = rcu_dereference_protected(*slot, true); + + if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + current_ret = uverbs_free_idrs_array( + attr_uapi, &attr->objs_arr_attr, commit); + if (!ret) + ret = current_ret; + } + } + for (memblock = pbundle->allocated_mem; memblock;) { struct bundle_alloc_head *tmp = memblock; @@ -429,7 +539,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, uapi_key_ioctl_method(hdr->method_id)); if (unlikely(!slot)) return -EPROTONOSUPPORT; - method_elm = srcu_dereference(*slot, &ufile->device->disassociate_srcu); + method_elm = rcu_dereference_protected(*slot, true); if (!method_elm->use_stack) { pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL); @@ -461,6 +571,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, memset(pbundle->bundle.attr_present, 0, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); + memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); destroy_ret = bundle_destroy(pbundle, ret == 0); @@ -611,3 +722,26 @@ int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, return 0; } EXPORT_SYMBOL(uverbs_copy_to); + +int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + const struct 
uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) { + if ((PTR_ERR(attr) != -ENOENT) || !def_val) + return PTR_ERR(attr); + + *to = *def_val; + } else { + *to = attr->ptr_attr.data; + } + + if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(_uverbs_get_const); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 50152c1b1004..6d373f5515b7 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -45,6 +45,7 @@ #include <linux/cdev.h> #include <linux/anon_inodes.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include <linux/uaccess.h> @@ -72,7 +73,7 @@ enum { static dev_t dynamic_uverbs_dev; static struct class *uverbs_class; -static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES); +static DEFINE_IDA(uverbs_ida); static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, const char __user *buf, int in_len, @@ -169,20 +170,16 @@ int uverbs_dealloc_mw(struct ib_mw *mw) return ret; } -static void ib_uverbs_release_dev(struct kobject *kobj) +static void ib_uverbs_release_dev(struct device *device) { struct ib_uverbs_device *dev = - container_of(kobj, struct ib_uverbs_device, kobj); + container_of(device, struct ib_uverbs_device, dev); uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); kfree(dev); } -static struct kobj_type ib_uverbs_dev_ktype = { - .release = ib_uverbs_release_dev, -}; - static void ib_uverbs_release_async_event_file(struct kref *ref) { struct ib_uverbs_async_event_file *file = @@ -265,7 +262,7 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); - kobject_put(&file->device->kobj); + put_device(&file->device->dev); kfree(file); } @@ -817,6 +814,226 @@ out: } /* + * Each time we map IO memory into user space this keeps track of the mapping. 
+ * When the device is hot-unplugged we 'zap' the mmaps in user space to point + * to the zero page and allow the hot unplug to proceed. + * + * This is necessary for cases like PCI physical hot unplug as the actual BAR + * memory may vanish after this and access to it from userspace could MCE. + * + * RDMA drivers supporting disassociation must have their user space designed + * to cope in some way with their IO pages going to the zero page. + */ +struct rdma_umap_priv { + struct vm_area_struct *vma; + struct list_head list; +}; + +static const struct vm_operations_struct rdma_umap_ops; + +static void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + + priv->vma = vma; + vma->vm_private_data = priv; + vma->vm_ops = &rdma_umap_ops; + + mutex_lock(&ufile->umap_lock); + list_add(&priv->list, &ufile->umaps); + mutex_unlock(&ufile->umap_lock); +} + +/* + * The VMA has been dup'd, initialize the vm_private_data with a new tracking + * struct + */ +static void rdma_umap_open(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *opriv = vma->vm_private_data; + struct rdma_umap_priv *priv; + + if (!opriv) + return; + + /* We are racing with disassociation */ + if (!down_read_trylock(&ufile->hw_destroy_rwsem)) + goto out_zap; + /* + * Disassociation already completed, the VMA should already be zapped. + */ + if (!ufile->ucontext) + goto out_unlock; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_unlock; + rdma_umap_priv_init(priv, vma); + + up_read(&ufile->hw_destroy_rwsem); + return; + +out_unlock: + up_read(&ufile->hw_destroy_rwsem); +out_zap: + /* + * We can't allow the VMA to be created with the actual IO pages, that + * would break our API contract, and it can't be stopped at this + * point, so zap it. 
+ */ + vma->vm_private_data = NULL; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); +} + +static void rdma_umap_close(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *priv = vma->vm_private_data; + + if (!priv) + return; + + /* + * The vma holds a reference on the struct file that created it, which + * in turn means that the ib_uverbs_file is guaranteed to exist at + * this point. + */ + mutex_lock(&ufile->umap_lock); + list_del(&priv->list); + mutex_unlock(&ufile->umap_lock); + kfree(priv); +} + +static const struct vm_operations_struct rdma_umap_ops = { + .open = rdma_umap_open, + .close = rdma_umap_close, +}; + +static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long size) +{ + struct ib_uverbs_file *ufile = ucontext->ufile; + struct rdma_umap_priv *priv; + + if (vma->vm_end - vma->vm_start != size) + return ERR_PTR(-EINVAL); + + /* Driver is using this wrong, must be called by ib_uverbs_mmap */ + if (WARN_ON(!vma->vm_file || + vma->vm_file->private_data != ufile)) + return ERR_PTR(-EINVAL); + lockdep_assert_held(&ufile->device->disassociate_srcu); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return ERR_PTR(-ENOMEM); + return priv; +} + +/* + * Map IO memory into a process. This is to be called by drivers as part of + * their mmap() functions if they wish to send something like PCI-E BAR memory + * to userspace. 
+ */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + vma->vm_page_prot = prot; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_io); + +/* + * The page case is here for a slightly different reason, the driver expects + * to be able to free the page it is sharing to user space when it destroys + * its ucontext, which means we need to zap the user space references. + * + * We could handle this differently by providing an API to allocate a shared + * page and then only freeing the shared page when the last ufile is + * destroyed. + */ +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, + vma->vm_page_prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_page); + +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) +{ + struct rdma_umap_priv *priv, *next_priv; + + lockdep_assert_held(&ufile->hw_destroy_rwsem); + + while (1) { + struct mm_struct *mm = NULL; + + /* Get an arbitrary mm pointer that hasn't been cleaned yet */ + mutex_lock(&ufile->umap_lock); + if (!list_empty(&ufile->umaps)) { + mm = list_first_entry(&ufile->umaps, + struct rdma_umap_priv, list) + ->vma->vm_mm; + mmget(mm); + } + mutex_unlock(&ufile->umap_lock); + if (!mm) + return; + + /* + * The umap_lock is nested under mmap_sem since it used within + * the vma_ops callbacks, so we have to clean the list one mm + * 
at a time to get the lock ordering right. Typically there + * will only be one mm, so no big deal. + */ + down_write(&mm->mmap_sem); + mutex_lock(&ufile->umap_lock); + list_for_each_entry_safe (priv, next_priv, &ufile->umaps, + list) { + struct vm_area_struct *vma = priv->vma; + + if (vma->vm_mm != mm) + continue; + list_del_init(&priv->list); + + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + } + mutex_unlock(&ufile->umap_lock); + up_write(&mm->mmap_sem); + mmput(mm); + } +} + +/* * ib_uverbs_open() does not need the BKL: * * - the ib_uverbs_device structures are properly reference counted and @@ -839,6 +1056,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; + get_device(&dev->dev); srcu_key = srcu_read_lock(&dev->disassociate_srcu); mutex_lock(&dev->lists_mutex); ib_dev = srcu_dereference(dev->ib_dev, @@ -876,9 +1094,10 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) spin_lock_init(&file->uobjects_lock); INIT_LIST_HEAD(&file->uobjects); init_rwsem(&file->hw_destroy_rwsem); + mutex_init(&file->umap_lock); + INIT_LIST_HEAD(&file->umaps); filp->private_data = file; - kobject_get(&dev->kobj); list_add_tail(&file->list, &dev->uverbs_file_list); mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); @@ -899,6 +1118,7 @@ err: if (atomic_dec_and_test(&dev->refcount)) ib_uverbs_comp_dev(dev); + put_device(&dev->dev); return ret; } @@ -909,10 +1129,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE); mutex_lock(&file->device->lists_mutex); - if (!file->is_closed) { - list_del(&file->list); - file->is_closed = 1; - } + list_del_init(&file->list); mutex_unlock(&file->device->lists_mutex); if (file->async_file) @@ -951,37 +1168,34 @@ static struct ib_client uverbs_client = { .remove = ib_uverbs_remove_one }; 
-static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, +static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) { + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; - struct ib_uverbs_device *dev = dev_get_drvdata(device); struct ib_device *ib_dev; - if (!dev) - return -ENODEV; - srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%s\n", ib_dev->name); + ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev)); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; } -static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); +static DEVICE_ATTR_RO(ibdev); -static ssize_t show_dev_abi_version(struct device *device, - struct device_attribute *attr, char *buf) +static ssize_t abi_version_show(struct device *device, + struct device_attribute *attr, char *buf) { - struct ib_uverbs_device *dev = dev_get_drvdata(device); + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; struct ib_device *ib_dev; - if (!dev) - return -ENODEV; srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) @@ -990,7 +1204,17 @@ static ssize_t show_dev_abi_version(struct device *device, return ret; } -static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); +static DEVICE_ATTR_RO(abi_version); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_abi_version.attr, + &dev_attr_ibdev.attr, + NULL, +}; + +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, +}; static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); @@ -1028,65 +1252,56 @@ static void ib_uverbs_add_one(struct ib_device *device) return; } + device_initialize(&uverbs_dev->dev); + 
uverbs_dev->dev.class = uverbs_class; + uverbs_dev->dev.parent = device->dev.parent; + uverbs_dev->dev.release = ib_uverbs_release_dev; + uverbs_dev->groups[0] = &dev_attr_group; + uverbs_dev->dev.groups = uverbs_dev->groups; atomic_set(&uverbs_dev->refcount, 1); init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); - kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); mutex_init(&uverbs_dev->lists_mutex); INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); + rcu_assign_pointer(uverbs_dev->ib_dev, device); + uverbs_dev->num_comp_vectors = device->num_comp_vectors; - devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES); - if (devnum >= IB_UVERBS_MAX_DEVICES) + devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1, + GFP_KERNEL); + if (devnum < 0) goto err; uverbs_dev->devnum = devnum; - set_bit(devnum, dev_map); if (devnum >= IB_UVERBS_NUM_FIXED_MINOR) base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR; else base = IB_UVERBS_BASE_DEV + devnum; - rcu_assign_pointer(uverbs_dev->ib_dev, device); - uverbs_dev->num_comp_vectors = device->num_comp_vectors; - if (ib_uverbs_create_uapi(device, uverbs_dev)) goto err_uapi; - cdev_init(&uverbs_dev->cdev, NULL); + uverbs_dev->dev.devt = base; + dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); + + cdev_init(&uverbs_dev->cdev, + device->mmap ? &uverbs_mmap_fops : &uverbs_fops); uverbs_dev->cdev.owner = THIS_MODULE; - uverbs_dev->cdev.ops = device->mmap ? 
&uverbs_mmap_fops : &uverbs_fops; - cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj); - kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); - if (cdev_add(&uverbs_dev->cdev, base, 1)) - goto err_cdev; - - uverbs_dev->dev = device_create(uverbs_class, device->dev.parent, - uverbs_dev->cdev.dev, uverbs_dev, - "uverbs%d", uverbs_dev->devnum); - if (IS_ERR(uverbs_dev->dev)) - goto err_cdev; - - if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) - goto err_class; - if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) - goto err_class; - ib_set_client_data(device, &uverbs_client, uverbs_dev); + ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev); + if (ret) + goto err_uapi; + ib_set_client_data(device, &uverbs_client, uverbs_dev); return; -err_class: - device_destroy(uverbs_class, uverbs_dev->cdev.dev); -err_cdev: - cdev_del(&uverbs_dev->cdev); err_uapi: - clear_bit(devnum, dev_map); + ida_free(&uverbs_ida, devnum); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kobject_put(&uverbs_dev->kobj); + put_device(&uverbs_dev->dev); return; } @@ -1107,8 +1322,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, while (!list_empty(&uverbs_dev->uverbs_file_list)) { file = list_first_entry(&uverbs_dev->uverbs_file_list, struct ib_uverbs_file, list); - file->is_closed = 1; - list_del(&file->list); + list_del_init(&file->list); kref_get(&file->ref); /* We must release the mutex before going ahead and calling @@ -1156,10 +1370,8 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) if (!uverbs_dev) return; - dev_set_drvdata(uverbs_dev->dev, NULL); - device_destroy(uverbs_class, uverbs_dev->cdev.dev); - cdev_del(&uverbs_dev->cdev); - clear_bit(uverbs_dev->devnum, dev_map); + cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); + ida_free(&uverbs_ida, uverbs_dev->devnum); if 
(device->disassociate_ucontext) { /* We disassociate HW resources and immediately return. @@ -1182,7 +1394,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) if (wait_clients) wait_for_completion(&uverbs_dev->comp); - kobject_put(&uverbs_dev->kobj); + put_device(&uverbs_dev->dev); } static char *uverbs_devnode(struct device *dev, umode_t *mode) diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index d8cfafe23bd9..cb9486ad5c67 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -326,11 +326,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( if (IS_ERR(action)) return PTR_ERR(action); - atomic_set(&action->usecnt, 0); - action->device = ib_dev; - action->type = IB_FLOW_ACTION_ESP; - action->uobject = uobj; - uobj->object = action; + uverbs_flow_action_fill_action(action, uobj, ib_dev, + IB_FLOW_ACTION_ESP); return 0; } diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index be854628a7c6..86f3fc5e04b4 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -73,6 +73,18 @@ static int uapi_merge_method(struct uverbs_api *uapi, if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) method_elm->driver_method |= is_driver; + /* + * Like other uobject based things we only support a single + * uobject being NEW'd or DESTROY'd + */ + if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + u8 access = attr->attr.u2.objs_arr.access; + + if (WARN_ON(access == UVERBS_ACCESS_NEW || + access == UVERBS_ACCESS_DESTROY)) + return -EINVAL; + } + attr_slot = uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), sizeof(*attr_slot)); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 8ec7418e99f0..178899e3ce73 100644 --- a/drivers/infiniband/core/verbs.c +++ 
b/drivers/infiniband/core/verbs.c @@ -264,7 +264,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, } pd->res.type = RDMA_RESTRACK_PD; - pd->res.kern_name = caller; + rdma_restrack_set_task(&pd->res, caller); rdma_restrack_add(&pd->res); if (mr_access_flags) { @@ -710,7 +710,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, ah_attr->roce.dmac, - sgid_attr->ndev, &hop_limit); + sgid_attr, &hop_limit); grh->hop_limit = hop_limit; return ret; @@ -1509,8 +1509,7 @@ static const struct { }; bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll) + enum ib_qp_type type, enum ib_qp_attr_mask mask) { enum ib_qp_attr_mask req_param, opt_param; @@ -1629,14 +1628,16 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { - pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", - __func__, qp->device->name); + dev_warn(&qp->device->dev, + "%s rq_psn overflow, masking to 24 bits\n", + __func__); attr->rq_psn &= 0xffffff; } if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { - pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n", - __func__, qp->device->name); + dev_warn(&qp->device->dev, + " %s sq_psn overflow, masking to 24 bits\n", + __func__); attr->sq_psn &= 0xffffff; } } @@ -1888,7 +1889,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); } diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 96f76896488d..31baa8939a4f 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ 
b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -40,7 +40,6 @@ #ifndef __BNXT_RE_H__ #define __BNXT_RE_H__ #define ROCE_DRV_MODULE_NAME "bnxt_re" -#define ROCE_DRV_MODULE_VERSION "1.0.0" #define BNXT_RE_DESC "Broadcom NetXtreme-C/E RoCE Driver" #define BNXT_RE_PAGE_SHIFT_4K (12) @@ -120,6 +119,8 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_HAVE_L2_REF 3 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 #define BNXT_RE_FLAG_QOS_WORK_REG 5 +#define BNXT_RE_FLAG_RESOURCES_ALLOCATED 7 +#define BNXT_RE_FLAG_RESOURCES_INITIALIZED 8 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; unsigned int version, major, minor; diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index 77416bc61e6e..604b71875f5f 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -68,6 +68,8 @@ static const char * const bnxt_re_stat_name[] = { [BNXT_RE_TX_PKTS] = "tx_pkts", [BNXT_RE_TX_BYTES] = "tx_bytes", [BNXT_RE_RECOVERABLE_ERRORS] = "recoverable_errors", + [BNXT_RE_RX_DROPS] = "rx_roce_drops", + [BNXT_RE_RX_DISCARDS] = "rx_roce_discards", [BNXT_RE_TO_RETRANSMITS] = "to_retransmits", [BNXT_RE_SEQ_ERR_NAKS_RCVD] = "seq_err_naks_rcvd", [BNXT_RE_MAX_RETRY_EXCEEDED] = "max_retry_exceeded", @@ -106,7 +108,8 @@ static const char * const bnxt_re_stat_name[] = { [BNXT_RE_RES_CQ_LOAD_ERR] = "res_cq_load_err", [BNXT_RE_RES_SRQ_LOAD_ERR] = "res_srq_load_err", [BNXT_RE_RES_TX_PCI_ERR] = "res_tx_pci_err", - [BNXT_RE_RES_RX_PCI_ERR] = "res_rx_pci_err" + [BNXT_RE_RES_RX_PCI_ERR] = "res_rx_pci_err", + [BNXT_RE_OUT_OF_SEQ_ERR] = "oos_drop_count" }; int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, @@ -128,6 +131,10 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, if (bnxt_re_stats) { stats->value[BNXT_RE_RECOVERABLE_ERRORS] = le64_to_cpu(bnxt_re_stats->tx_bcast_pkts); + stats->value[BNXT_RE_RX_DROPS] = + le64_to_cpu(bnxt_re_stats->rx_drop_pkts); + stats->value[BNXT_RE_RX_DISCARDS] = + 
le64_to_cpu(bnxt_re_stats->rx_discard_pkts); stats->value[BNXT_RE_RX_PKTS] = le64_to_cpu(bnxt_re_stats->rx_ucast_pkts); stats->value[BNXT_RE_RX_BYTES] = @@ -220,6 +227,8 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, rdev->stats.res_tx_pci_err; stats->value[BNXT_RE_RES_RX_PCI_ERR] = rdev->stats.res_rx_pci_err; + stats->value[BNXT_RE_OUT_OF_SEQ_ERR] = + rdev->stats.res_oos_drop_count; } return ARRAY_SIZE(bnxt_re_stat_name); diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h index a01a922717d5..76399f477e5c 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.h +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h @@ -51,6 +51,8 @@ enum bnxt_re_hw_stats { BNXT_RE_TX_PKTS, BNXT_RE_TX_BYTES, BNXT_RE_RECOVERABLE_ERRORS, + BNXT_RE_RX_DROPS, + BNXT_RE_RX_DISCARDS, BNXT_RE_TO_RETRANSMITS, BNXT_RE_SEQ_ERR_NAKS_RCVD, BNXT_RE_MAX_RETRY_EXCEEDED, @@ -90,6 +92,7 @@ enum bnxt_re_hw_stats { BNXT_RE_RES_SRQ_LOAD_ERR, BNXT_RE_RES_TX_PCI_ERR, BNXT_RE_RES_RX_PCI_ERR, + BNXT_RE_OUT_OF_SEQ_ERR, BNXT_RE_NUM_COUNTERS }; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bc2b9e038439..54fdd4cf5288 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1598,8 +1598,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); new_qp_state = qp_attr->qp_state; if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state, - ib_qp->qp_type, qp_attr_mask, - IB_LINK_LAYER_ETHERNET)) { + ib_qp->qp_type, qp_attr_mask)) { dev_err(rdev_to_dev(rdev), "Invalid attribute mask: %#x specified ", qp_attr_mask); @@ -2664,6 +2663,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev, nq->budget++; atomic_inc(&rdev->cq_count); + spin_lock_init(&cq->cq_lock); if (context) { struct bnxt_re_cq_resp resp; diff --git a/drivers/infiniband/hw/bnxt_re/main.c 
b/drivers/infiniband/hw/bnxt_re/main.c index 85cd1a3593d6..cf2282654210 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -67,7 +67,7 @@ #include "hw_counters.h" static char version[] = - BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n"; + BNXT_RE_DESC "\n"; MODULE_AUTHOR("Eddie Wai <eddie.wai@broadcom.com>"); MODULE_DESCRIPTION(BNXT_RE_DESC " Driver"); @@ -535,6 +535,34 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) return en_dev; } +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); +} +static DEVICE_ATTR_RO(hca_type); + +static struct attribute *bnxt_re_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + NULL +}; + +static const struct attribute_group bnxt_re_dev_attr_group = { + .attrs = bnxt_re_attributes, +}; + static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) { ib_unregister_device(&rdev->ibdev); @@ -547,7 +575,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) /* ib device init */ ibdev->owner = THIS_MODULE; ibdev->node_type = RDMA_NODE_IB_CA; - strlcpy(ibdev->name, "bnxt_re%d", IB_DEVICE_NAME_MAX); strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA", strlen(BNXT_RE_DESC) + 5); ibdev->phys_port_cnt = 1; @@ -639,34 +666,11 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->get_hw_stats = bnxt_re_ib_get_hw_stats; ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats; + rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group); ibdev->driver_id = RDMA_DRIVER_BNXT_RE; - return 
ib_register_device(ibdev, NULL); -} - -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); - - return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); -} - -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); - - return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); + return ib_register_device(ibdev, "bnxt_re%d", NULL); } -static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL); -static DEVICE_ATTR(hca_type, 0444, show_hca, NULL); - -static struct device_attribute *bnxt_re_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type -}; - static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev) { dev_put(rdev->netdev); @@ -864,10 +868,8 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) { int i; - if (rdev->nq[0].hwq.max_elements) { - for (i = 1; i < rdev->num_msix; i++) - bnxt_qplib_disable_nq(&rdev->nq[i - 1]); - } + for (i = 1; i < rdev->num_msix; i++) + bnxt_qplib_disable_nq(&rdev->nq[i - 1]); if (rdev->qplib_res.rcfw) bnxt_qplib_cleanup_res(&rdev->qplib_res); @@ -876,6 +878,7 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) static int bnxt_re_init_res(struct bnxt_re_dev *rdev) { int rc = 0, i; + int num_vec_enabled = 0; bnxt_qplib_init_res(&rdev->qplib_res); @@ -891,9 +894,13 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) "Failed to enable NQ with rc = 0x%x", rc); goto fail; } + num_vec_enabled++; } return 0; fail: + for (i = num_vec_enabled; i >= 0; i--) + bnxt_qplib_disable_nq(&rdev->nq[i]); + return rc; } @@ -925,6 +932,7 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev) static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) { int rc = 0, i; + int num_vec_created = 0; /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; @@ -951,7 +959,7 @@ static int 
bnxt_re_alloc_res(struct bnxt_re_dev *rdev) if (rc) { dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x", i, rc); - goto dealloc_dpi; + goto free_nq; } rc = bnxt_re_net_ring_alloc (rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr, @@ -964,14 +972,17 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) dev_err(rdev_to_dev(rdev), "Failed to allocate NQ fw id with rc = 0x%x", rc); + bnxt_qplib_free_nq(&rdev->nq[i]); goto free_nq; } + num_vec_created++; } return 0; free_nq: - for (i = 0; i < rdev->num_msix - 1; i++) + for (i = num_vec_created; i >= 0; i--) { + bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id); bnxt_qplib_free_nq(&rdev->nq[i]); -dealloc_dpi: + } bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &rdev->qplib_res.dpi_tbl, &rdev->dpi_privileged); @@ -989,12 +1000,17 @@ static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp, struct ib_event ib_event; ib_event.device = ibdev; - if (qp) + if (qp) { ib_event.element.qp = qp; - else + ib_event.event = event; + if (qp->event_handler) + qp->event_handler(&ib_event, qp->qp_context); + + } else { ib_event.element.port_num = port_num; - ib_event.event = event; - ib_dispatch_event(&ib_event); + ib_event.event = event; + ib_dispatch_event(&ib_event); + } } #define HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN 0x02 @@ -1189,20 +1205,20 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev) { - int i, rc; + int rc; if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) { - for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) - device_remove_file(&rdev->ibdev.dev, - bnxt_re_attributes[i]); /* Cleanup ib dev */ bnxt_re_unregister_ib(rdev); } if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) - cancel_delayed_work(&rdev->worker); + cancel_delayed_work_sync(&rdev->worker); - bnxt_re_cleanup_res(rdev); - bnxt_re_free_res(rdev); + if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, + &rdev->flags)) + 
bnxt_re_cleanup_res(rdev); + if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags)) + bnxt_re_free_res(rdev); if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw); @@ -1241,7 +1257,7 @@ static void bnxt_re_worker(struct work_struct *work) static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) { - int i, j, rc; + int rc; bool locked; @@ -1331,12 +1347,15 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) pr_err("Failed to allocate resources: %#x\n", rc); goto fail; } + set_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags); rc = bnxt_re_init_res(rdev); if (rc) { pr_err("Failed to initialize resources: %#x\n", rc); goto fail; } + set_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, &rdev->flags); + if (!rdev->is_virtfn) { rc = bnxt_re_setup_qos(rdev); if (rc) @@ -1358,20 +1377,6 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) } set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); dev_info(rdev_to_dev(rdev), "Device registered successfully"); - for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) { - rc = device_create_file(&rdev->ibdev.dev, - bnxt_re_attributes[i]); - if (rc) { - dev_err(rdev_to_dev(rdev), - "Failed to create IB sysfs: %#x", rc); - /* Must clean up all created device files */ - for (j = 0; j < i; j++) - device_remove_file(&rdev->ibdev.dev, - bnxt_re_attributes[j]); - bnxt_re_unregister_ib(rdev); - goto fail; - } - } ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, &rdev->active_width); set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 6ad0d46ab879..b98b054148cd 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -36,6 +36,8 @@ * Description: Fast Path Operators */ +#define dev_fmt(fmt) "QPLIB: " fmt + #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/sched.h> @@ -71,8 +73,7 @@ static void 
__bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp) if (!qp->sq.flushed) { dev_dbg(&scq->hwq.pdev->dev, - "QPLIB: FP: Adding to SQ Flush list = %p", - qp); + "FP: Adding to SQ Flush list = %p\n", qp); bnxt_qplib_cancel_phantom_processing(qp); list_add_tail(&qp->sq_flush, &scq->sqf_head); qp->sq.flushed = true; @@ -80,8 +81,7 @@ static void __bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp) if (!qp->srq) { if (!qp->rq.flushed) { dev_dbg(&rcq->hwq.pdev->dev, - "QPLIB: FP: Adding to RQ Flush list = %p", - qp); + "FP: Adding to RQ Flush list = %p\n", qp); list_add_tail(&qp->rq_flush, &rcq->rqf_head); qp->rq.flushed = true; } @@ -207,7 +207,7 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res, if (!qp->sq_hdr_buf) { rc = -ENOMEM; dev_err(&res->pdev->dev, - "QPLIB: Failed to create sq_hdr_buf"); + "Failed to create sq_hdr_buf\n"); goto fail; } } @@ -221,7 +221,7 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res, if (!qp->rq_hdr_buf) { rc = -ENOMEM; dev_err(&res->pdev->dev, - "QPLIB: Failed to create rq_hdr_buf"); + "Failed to create rq_hdr_buf\n"); goto fail; } } @@ -277,8 +277,7 @@ static void bnxt_qplib_service_nq(unsigned long data) num_cqne_processed++; else dev_warn(&nq->pdev->dev, - "QPLIB: cqn - type 0x%x not handled", - type); + "cqn - type 0x%x not handled\n", type); spin_unlock_bh(&cq->compl_lock); break; } @@ -298,7 +297,7 @@ static void bnxt_qplib_service_nq(unsigned long data) num_srqne_processed++; else dev_warn(&nq->pdev->dev, - "QPLIB: SRQ event 0x%x not handled", + "SRQ event 0x%x not handled\n", nqsrqe->event); break; } @@ -306,8 +305,7 @@ static void bnxt_qplib_service_nq(unsigned long data) break; default: dev_warn(&nq->pdev->dev, - "QPLIB: nqe with type = 0x%x not handled", - type); + "nqe with type = 0x%x not handled\n", type); break; } raw_cons++; @@ -360,7 +358,8 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq) } /* Make sure the HW is stopped! 
*/ - bnxt_qplib_nq_stop_irq(nq, true); + if (nq->requested) + bnxt_qplib_nq_stop_irq(nq, true); if (nq->bar_reg_iomem) iounmap(nq->bar_reg_iomem); @@ -396,7 +395,7 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, rc = irq_set_affinity_hint(nq->vector, &nq->mask); if (rc) { dev_warn(&nq->pdev->dev, - "QPLIB: set affinity failed; vector: %d nq_idx: %d\n", + "set affinity failed; vector: %d nq_idx: %d\n", nq->vector, nq_indx); } nq->requested = true; @@ -443,7 +442,7 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, rc = bnxt_qplib_nq_start_irq(nq, nq_idx, msix_vector, true); if (rc) { dev_err(&nq->pdev->dev, - "QPLIB: Failed to request irq for nq-idx %d", nq_idx); + "Failed to request irq for nq-idx %d\n", nq_idx); goto fail; } @@ -662,8 +661,8 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, spin_lock(&srq_hwq->lock); if (srq->start_idx == srq->last_idx) { - dev_err(&srq_hwq->pdev->dev, "QPLIB: FP: SRQ (0x%x) is full!", - srq->id); + dev_err(&srq_hwq->pdev->dev, + "FP: SRQ (0x%x) is full!\n", srq->id); rc = -EINVAL; spin_unlock(&srq_hwq->lock); goto done; @@ -1324,7 +1323,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) } } if (i == res->sgid_tbl.max) - dev_warn(&res->pdev->dev, "QPLIB: SGID not found??"); + dev_warn(&res->pdev->dev, "SGID not found??\n"); qp->ah.hop_limit = sb->hop_limit; qp->ah.traffic_class = sb->traffic_class; @@ -1536,7 +1535,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, if (bnxt_qplib_queue_full(sq)) { dev_err(&sq->hwq.pdev->dev, - "QPLIB: prod = %#x cons = %#x qdepth = %#x delta = %#x", + "prod = %#x cons = %#x qdepth = %#x delta = %#x\n", sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements, sq->q_full_delta); rc = -ENOMEM; @@ -1561,7 +1560,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, /* Copy the inline data */ if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) { dev_warn(&sq->hwq.pdev->dev, - "QPLIB: Inline data length > 96 
detected"); + "Inline data length > 96 detected\n"); data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH; } else { data_len = wqe->inline_len; @@ -1776,7 +1775,7 @@ done: queue_work(qp->scq->nq->cqn_wq, &nq_work->work); } else { dev_err(&sq->hwq.pdev->dev, - "QPLIB: FP: Failed to allocate SQ nq_work!"); + "FP: Failed to allocate SQ nq_work!\n"); rc = -ENOMEM; } } @@ -1815,13 +1814,12 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { sch_handler = true; dev_dbg(&rq->hwq.pdev->dev, - "%s Error QP. Scheduling for poll_cq\n", - __func__); + "%s: Error QP. Scheduling for poll_cq\n", __func__); goto queue_err; } if (bnxt_qplib_queue_full(rq)) { dev_err(&rq->hwq.pdev->dev, - "QPLIB: FP: QP (0x%x) RQ is full!", qp->id); + "FP: QP (0x%x) RQ is full!\n", qp->id); rc = -EINVAL; goto done; } @@ -1870,7 +1868,7 @@ queue_err: queue_work(qp->rcq->nq->cqn_wq, &nq_work->work); } else { dev_err(&rq->hwq.pdev->dev, - "QPLIB: FP: Failed to allocate RQ nq_work!"); + "FP: Failed to allocate RQ nq_work!\n"); rc = -ENOMEM; } } @@ -1932,7 +1930,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) if (!cq->dpi) { dev_err(&rcfw->pdev->dev, - "QPLIB: FP: CREATE_CQ failed due to NULL DPI"); + "FP: CREATE_CQ failed due to NULL DPI\n"); return -EINVAL; } req.dpi = cpu_to_le32(cq->dpi->dpi); @@ -1969,6 +1967,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) INIT_LIST_HEAD(&cq->sqf_head); INIT_LIST_HEAD(&cq->rqf_head); spin_lock_init(&cq->compl_lock); + spin_lock_init(&cq->flush_lock); bnxt_qplib_arm_cq_enable(cq); return 0; @@ -2172,7 +2171,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq, * comes back */ dev_dbg(&cq->hwq.pdev->dev, - "FP:Got Phantom CQE"); + "FP: Got Phantom CQE\n"); sq->condition = false; sq->single = true; rc = 0; @@ -2189,7 +2188,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq, peek_raw_cq_cons++; } 
dev_err(&cq->hwq.pdev->dev, - "Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x", + "Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x\n", cq_cons, qp->id, sw_sq_cons, cqe_sq_cons); rc = -EINVAL; } @@ -2213,7 +2212,7 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, le64_to_cpu(hwcqe->qp_handle)); if (!qp) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: Process Req qp is NULL"); + "FP: Process Req qp is NULL\n"); return -EINVAL; } sq = &qp->sq; @@ -2221,16 +2220,14 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq); if (cqe_sq_cons > sq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process req reported "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x", + "FP: CQ Process req reported sq_cons_idx 0x%x which exceeded max 0x%x\n", cqe_sq_cons, sq->hwq.max_elements); return -EINVAL; } if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } /* Require to walk the sq's swq to fabricate CQEs for all previously @@ -2262,9 +2259,7 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, hwcqe->status != CQ_REQ_STATUS_OK) { cqe->status = hwcqe->status; dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Processed Req "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id[%d] = 0x%llx with status 0x%x", + "FP: CQ Processed Req wr_id[%d] = 0x%llx with status 0x%x\n", sw_sq_cons, cqe->wr_id, cqe->status); cqe++; (*budget)--; @@ -2330,12 +2325,12 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); if (!qp) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL"); + dev_err(&cq->hwq.pdev->dev, "process_cq RC qp is NULL\n"); return -EINVAL; } if (qp->rq.flushed) { 
dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } @@ -2356,9 +2351,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, return -EINVAL; if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process RC "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + "FP: CQ Process RC wr_id idx 0x%x exceeded SRQ max 0x%x\n", wr_id_idx, srq->hwq.max_elements); return -EINVAL; } @@ -2371,9 +2364,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, rq = &qp->rq; if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process RC "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + "FP: CQ Process RC wr_id idx 0x%x exceeded RQ max 0x%x\n", wr_id_idx, rq->hwq.max_elements); return -EINVAL; } @@ -2409,12 +2400,12 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); if (!qp) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL"); + dev_err(&cq->hwq.pdev->dev, "process_cq UD qp is NULL\n"); return -EINVAL; } if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } cqe = *pcqe; @@ -2439,9 +2430,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process UD "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + "FP: CQ Process UD wr_id idx 0x%x exceeded SRQ max 0x%x\n", wr_id_idx, srq->hwq.max_elements); return -EINVAL; } @@ -2454,9 +2443,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, rq = &qp->rq; if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: 
FP: CQ Process UD "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + "FP: CQ Process UD wr_id idx 0x%x exceeded RQ max 0x%x\n", wr_id_idx, rq->hwq.max_elements); return -EINVAL; } @@ -2508,13 +2495,12 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); if (!qp) { - dev_err(&cq->hwq.pdev->dev, - "QPLIB: process_cq Raw/QP1 qp is NULL"); + dev_err(&cq->hwq.pdev->dev, "process_cq Raw/QP1 qp is NULL\n"); return -EINVAL; } if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } cqe = *pcqe; @@ -2543,14 +2529,12 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, srq = qp->srq; if (!srq) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: SRQ used but not defined??"); + "FP: SRQ used but not defined??\n"); return -EINVAL; } if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process Raw/QP1 "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + "FP: CQ Process Raw/QP1 wr_id idx 0x%x exceeded SRQ max 0x%x\n", wr_id_idx, srq->hwq.max_elements); return -EINVAL; } @@ -2563,9 +2547,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, rq = &qp->rq; if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: ix 0x%x exceeded RQ max 0x%x", + "FP: CQ Process Raw/QP1 RQ wr_id idx 0x%x exceeded RQ max 0x%x\n", wr_id_idx, rq->hwq.max_elements); return -EINVAL; } @@ -2600,14 +2582,14 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, /* Check the Status */ if (hwcqe->status != CQ_TERMINAL_STATUS_OK) dev_warn(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process Terminal Error status = 0x%x", + "FP: CQ Process Terminal Error status = 
0x%x\n", hwcqe->status); qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); if (!qp) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process terminal qp is NULL"); + "FP: CQ Process terminal qp is NULL\n"); return -EINVAL; } @@ -2623,16 +2605,14 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, if (cqe_cons > sq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process terminal reported "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x", + "FP: CQ Process terminal reported sq_cons_idx 0x%x which exceeded max 0x%x\n", cqe_cons, sq->hwq.max_elements); goto do_rq; } if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto sq_done; } @@ -2673,16 +2653,14 @@ do_rq: goto done; } else if (cqe_cons > rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Processed terminal "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: reported rq_cons_idx 0x%x exceeds max 0x%x", + "FP: CQ Processed terminal reported rq_cons_idx 0x%x exceeds max 0x%x\n", cqe_cons, rq->hwq.max_elements); goto done; } if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); rc = 0; goto done; } @@ -2704,7 +2682,7 @@ static int bnxt_qplib_cq_process_cutoff(struct bnxt_qplib_cq *cq, /* Check the Status */ if (hwcqe->status != CQ_CUTOFF_STATUS_OK) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process Cutoff Error status = 0x%x", + "FP: CQ Process Cutoff Error status = 0x%x\n", hwcqe->status); return -EINVAL; } @@ -2724,16 +2702,12 @@ int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq, spin_lock_irqsave(&cq->flush_lock, flags); list_for_each_entry(qp, &cq->sqf_head, sq_flush) { - dev_dbg(&cq->hwq.pdev->dev, - "QPLIB: FP: Flushing SQ QP= %p", - qp); + dev_dbg(&cq->hwq.pdev->dev, "FP: Flushing SQ QP= %p\n", qp); 
__flush_sq(&qp->sq, qp, &cqe, &budget); } list_for_each_entry(qp, &cq->rqf_head, rq_flush) { - dev_dbg(&cq->hwq.pdev->dev, - "QPLIB: FP: Flushing RQ QP= %p", - qp); + dev_dbg(&cq->hwq.pdev->dev, "FP: Flushing RQ QP= %p\n", qp); __flush_rq(&qp->rq, qp, &cqe, &budget); } spin_unlock_irqrestore(&cq->flush_lock, flags); @@ -2801,7 +2775,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, goto exit; default: dev_err(&cq->hwq.pdev->dev, - "QPLIB: process_cq unknown type 0x%lx", + "process_cq unknown type 0x%lx\n", hw_cqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK); rc = -EINVAL; @@ -2814,7 +2788,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, * next one */ dev_err(&cq->hwq.pdev->dev, - "QPLIB: process_cqe error rc = 0x%x", rc); + "process_cqe error rc = 0x%x\n", rc); } raw_cons++; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 2852d350ada1..be4e33e9f962 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -35,6 +35,9 @@ * * Description: RDMA Controller HW interface */ + +#define dev_fmt(fmt) "QPLIB: " fmt + #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/pci.h> @@ -96,14 +99,13 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { dev_err(&rcfw->pdev->dev, - "QPLIB: RCFW not initialized, reject opcode 0x%x", - opcode); + "RCFW not initialized, reject opcode 0x%x\n", opcode); return -EINVAL; } if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) && opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { - dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!"); + dev_err(&rcfw->pdev->dev, "RCFW already initialized!\n"); return -EINVAL; } @@ -115,7 +117,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, */ 
spin_lock_irqsave(&cmdq->lock, flags); if (req->cmd_size >= HWQ_FREE_SLOTS(cmdq)) { - dev_err(&rcfw->pdev->dev, "QPLIB: RCFW: CMDQ is full!"); + dev_err(&rcfw->pdev->dev, "RCFW: CMDQ is full!\n"); spin_unlock_irqrestore(&cmdq->lock, flags); return -EAGAIN; } @@ -154,7 +156,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod)][get_cmdq_idx(sw_prod)]; if (!cmdqe) { dev_err(&rcfw->pdev->dev, - "QPLIB: RCFW request failed with no cmdqe!"); + "RCFW request failed with no cmdqe!\n"); goto done; } /* Copy a segment of the req cmd to the cmdq */ @@ -210,7 +212,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY)) { /* send failed */ - dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x send failed", + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x send failed\n", cookie, opcode); return rc; } @@ -224,7 +226,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __wait_for_resp(rcfw, cookie); if (rc) { /* timed out */ - dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x timedout (%d)msec", + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", cookie, opcode, RCFW_CMD_WAIT_TIME_MS); set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags); return rc; @@ -232,7 +234,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, if (evnt->status) { /* failed with status */ - dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x status %#x", + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", cookie, opcode, evnt->status); rc = -EFAULT; } @@ -298,9 +300,9 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, qp_id = le32_to_cpu(err_event->xid); qp = rcfw->qp_tbl[qp_id].qp_handle; dev_dbg(&rcfw->pdev->dev, - "QPLIB: Received QP error notification"); + "Received QP error notification\n"); dev_dbg(&rcfw->pdev->dev, - "QPLIB: qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", + "qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", qp_id, 
err_event->req_err_state_reason, err_event->res_err_state_reason); if (!qp) @@ -309,8 +311,17 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, rcfw->aeq_handler(rcfw, qp_event, qp); break; default: - /* Command Response */ - spin_lock_irqsave(&cmdq->lock, flags); + /* + * Command Response + * cmdq->lock needs to be acquired to synchronie + * the command send and completion reaping. This function + * is always called with creq->lock held. Using + * the nested variant of spin_lock. + * + */ + + spin_lock_irqsave_nested(&cmdq->lock, flags, + SINGLE_DEPTH_NESTING); cookie = le16_to_cpu(qp_event->cookie); mcookie = qp_event->cookie; blocked = cookie & RCFW_CMD_IS_BLOCKING; @@ -322,14 +333,16 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); crsqe->resp = NULL; } else { - dev_err(&rcfw->pdev->dev, - "QPLIB: CMD %s resp->cookie = %#x, evnt->cookie = %#x", - crsqe->resp ? "mismatch" : "collision", - crsqe->resp ? crsqe->resp->cookie : 0, mcookie); + if (crsqe->resp && crsqe->resp->cookie) + dev_err(&rcfw->pdev->dev, + "CMD %s cookie sent=%#x, recd=%#x\n", + crsqe->resp ? "mismatch" : "collision", + crsqe->resp ? 
crsqe->resp->cookie : 0, + mcookie); } if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap)) dev_warn(&rcfw->pdev->dev, - "QPLIB: CMD bit %d was not requested", cbit); + "CMD bit %d was not requested\n", cbit); cmdq->cons += crsqe->req_size; crsqe->req_size = 0; @@ -376,14 +389,14 @@ static void bnxt_qplib_service_creq(unsigned long data) (rcfw, (struct creq_func_event *)creqe)) rcfw->creq_func_event_processed++; else - dev_warn - (&rcfw->pdev->dev, "QPLIB:aeqe:%#x Not handled", - type); + dev_warn(&rcfw->pdev->dev, + "aeqe:%#x Not handled\n", type); break; default: - dev_warn(&rcfw->pdev->dev, "QPLIB: creqe with "); - dev_warn(&rcfw->pdev->dev, - "QPLIB: op_event = 0x%x not handled", type); + if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT) + dev_warn(&rcfw->pdev->dev, + "creqe with event 0x%x not handled\n", + type); break; } raw_cons++; @@ -551,7 +564,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE, HWQ_TYPE_L2_CMPL)) { dev_err(&rcfw->pdev->dev, - "QPLIB: HW channel CREQ allocation failed"); + "HW channel CREQ allocation failed\n"); goto fail; } rcfw->cmdq.max_elements = BNXT_QPLIB_CMDQE_MAX_CNT; @@ -560,7 +573,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, BNXT_QPLIB_CMDQE_UNITS, 0, PAGE_SIZE, HWQ_TYPE_CTX)) { dev_err(&rcfw->pdev->dev, - "QPLIB: HW channel CMDQ allocation failed"); + "HW channel CMDQ allocation failed\n"); goto fail; } @@ -605,21 +618,18 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) bnxt_qplib_rcfw_stop_irq(rcfw, true); - if (rcfw->cmdq_bar_reg_iomem) - iounmap(rcfw->cmdq_bar_reg_iomem); - rcfw->cmdq_bar_reg_iomem = NULL; - - if (rcfw->creq_bar_reg_iomem) - iounmap(rcfw->creq_bar_reg_iomem); - rcfw->creq_bar_reg_iomem = NULL; + iounmap(rcfw->cmdq_bar_reg_iomem); + iounmap(rcfw->creq_bar_reg_iomem); indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size); if (indx != rcfw->bmap_size) dev_err(&rcfw->pdev->dev, - "QPLIB: disabling RCFW with pending cmd-bit 
%lx", indx); + "disabling RCFW with pending cmd-bit %lx\n", indx); kfree(rcfw->cmdq_bitmap); rcfw->bmap_size = 0; + rcfw->cmdq_bar_reg_iomem = NULL; + rcfw->creq_bar_reg_iomem = NULL; rcfw->aeq_handler = NULL; rcfw->vector = 0; } @@ -681,8 +691,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, RCFW_COMM_BASE_OFFSET, RCFW_COMM_SIZE); if (!rcfw->cmdq_bar_reg_iomem) { - dev_err(&rcfw->pdev->dev, - "QPLIB: CMDQ BAR region %d mapping failed", + dev_err(&rcfw->pdev->dev, "CMDQ BAR region %d mapping failed\n", rcfw->cmdq_bar_reg); return -ENOMEM; } @@ -697,14 +706,15 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, res_base = pci_resource_start(pdev, rcfw->creq_bar_reg); if (!res_base) dev_err(&rcfw->pdev->dev, - "QPLIB: CREQ BAR region %d resc start is 0!", + "CREQ BAR region %d resc start is 0!\n", rcfw->creq_bar_reg); rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off, 4); if (!rcfw->creq_bar_reg_iomem) { - dev_err(&rcfw->pdev->dev, - "QPLIB: CREQ BAR region %d mapping failed", + dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n", rcfw->creq_bar_reg); + iounmap(rcfw->cmdq_bar_reg_iomem); + rcfw->cmdq_bar_reg_iomem = NULL; return -ENOMEM; } rcfw->creq_qp_event_processed = 0; @@ -717,7 +727,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_vector, true); if (rc) { dev_err(&rcfw->pdev->dev, - "QPLIB: Failed to request IRQ for CREQ rc = 0x%x", rc); + "Failed to request IRQ for CREQ rc = 0x%x\n", rc); bnxt_qplib_disable_rcfw_channel(rcfw); return rc; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 46416dfe8830..9a8687dc0a79 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -154,6 +154,8 @@ struct bnxt_qplib_qp_node { void *qp_handle; /* ptr to qplib_qp */ }; +#define BNXT_QPLIB_OOS_COUNT_MASK 0xFFFFFFFF + /* RCFW Communication Channels */ struct 
bnxt_qplib_rcfw { struct pci_dev *pdev; @@ -190,6 +192,8 @@ struct bnxt_qplib_rcfw { struct bnxt_qplib_crsq *crsqe_tbl; int qp_tbl_size; struct bnxt_qplib_qp_node *qp_tbl; + u64 oos_prev; + u32 init_oos_stats; }; void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 539a5d44e6db..59eeac55626f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -36,6 +36,8 @@ * Description: QPLib resource manager */ +#define dev_fmt(fmt) "QPLIB: " fmt + #include <linux/spinlock.h> #include <linux/pci.h> #include <linux/interrupt.h> @@ -68,8 +70,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, pbl->pg_map_arr[i]); else dev_warn(&pdev->dev, - "QPLIB: PBL free pg_arr[%d] empty?!", - i); + "PBL free pg_arr[%d] empty?!\n", i); pbl->pg_arr[i] = NULL; } } @@ -537,7 +538,7 @@ static void bnxt_qplib_free_pkey_tbl(struct bnxt_qplib_res *res, struct bnxt_qplib_pkey_tbl *pkey_tbl) { if (!pkey_tbl->tbl) - dev_dbg(&res->pdev->dev, "QPLIB: PKEY tbl not present"); + dev_dbg(&res->pdev->dev, "PKEY tbl not present\n"); else kfree(pkey_tbl->tbl); @@ -578,7 +579,7 @@ int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res, struct bnxt_qplib_pd *pd) { if (test_and_set_bit(pd->id, pdt->tbl)) { - dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d", + dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d\n", pd->id); return -EINVAL; } @@ -639,11 +640,11 @@ int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, struct bnxt_qplib_dpi *dpi) { if (dpi->dpi >= dpit->max) { - dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d", dpi->dpi); + dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d\n", dpi->dpi); return -EINVAL; } if (test_and_set_bit(dpi->dpi, dpit->tbl)) { - dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d", + dev_warn(&res->pdev->dev, "Freeing an unused DPI? 
dpi = %d\n", dpi->dpi); return -EINVAL; } @@ -673,22 +674,21 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, u32 dbr_len, bytes; if (dpit->dbr_bar_reg_iomem) { - dev_err(&res->pdev->dev, - "QPLIB: DBR BAR region %d already mapped", dbr_bar_reg); + dev_err(&res->pdev->dev, "DBR BAR region %d already mapped\n", + dbr_bar_reg); return -EALREADY; } bar_reg_base = pci_resource_start(res->pdev, dbr_bar_reg); if (!bar_reg_base) { - dev_err(&res->pdev->dev, - "QPLIB: BAR region %d resc start failed", dbr_bar_reg); + dev_err(&res->pdev->dev, "BAR region %d resc start failed\n", + dbr_bar_reg); return -ENOMEM; } dbr_len = pci_resource_len(res->pdev, dbr_bar_reg) - dbr_offset; if (!dbr_len || ((dbr_len & (PAGE_SIZE - 1)) != 0)) { - dev_err(&res->pdev->dev, "QPLIB: Invalid DBR length %d", - dbr_len); + dev_err(&res->pdev->dev, "Invalid DBR length %d\n", dbr_len); return -ENOMEM; } @@ -696,8 +696,7 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, dbr_len); if (!dpit->dbr_bar_reg_iomem) { dev_err(&res->pdev->dev, - "QPLIB: FP: DBR BAR region %d mapping failed", - dbr_bar_reg); + "FP: DBR BAR region %d mapping failed\n", dbr_bar_reg); return -ENOMEM; } @@ -767,7 +766,7 @@ static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, stats->dma = dma_alloc_coherent(&pdev->dev, stats->size, &stats->dma_map, GFP_KERNEL); if (!stats->dma) { - dev_err(&pdev->dev, "QPLIB: Stats DMA allocation failed"); + dev_err(&pdev->dev, "Stats DMA allocation failed\n"); return -ENOMEM; } return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 4097f3fa25c5..5216b5f844cc 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -36,6 +36,8 @@ * Description: Slow Path Operators */ +#define dev_fmt(fmt) "QPLIB: " fmt + #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/sched.h> @@ -89,7 +91,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw 
*rcfw, sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); if (!sbuf) { dev_err(&rcfw->pdev->dev, - "QPLIB: SP: QUERY_FUNC alloc side buffer failed"); + "SP: QUERY_FUNC alloc side buffer failed\n"); return -ENOMEM; } @@ -135,8 +137,16 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, attr->max_srq = le16_to_cpu(sb->max_srq); attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1; attr->max_srq_sges = sb->max_srq_sge; - /* Bono only reports 1 PKEY for now, but it can support > 1 */ attr->max_pkey = le32_to_cpu(sb->max_pkeys); + /* + * Some versions of FW reports more than 0xFFFF. + * Restrict it for now to 0xFFFF to avoid + * reporting trucated value + */ + if (attr->max_pkey > 0xFFFF) { + /* ib_port_attr::pkey_tbl_len is u16 */ + attr->max_pkey = 0xFFFF; + } attr->max_inline_data = le32_to_cpu(sb->max_inline_data); attr->l2_db_size = (sb->l2_db_space_size + 1) * @@ -186,8 +196,7 @@ int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, (void *)&resp, NULL, 0); if (rc) { - dev_err(&res->pdev->dev, - "QPLIB: Failed to set function resources"); + dev_err(&res->pdev->dev, "Failed to set function resources\n"); } return rc; } @@ -199,7 +208,7 @@ int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, { if (index >= sgid_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: Index %d exceeded SGID table max (%d)", + "Index %d exceeded SGID table max (%d)\n", index, sgid_tbl->max); return -EINVAL; } @@ -217,13 +226,12 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int index; if (!sgid_tbl) { - dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated"); + dev_err(&res->pdev->dev, "SGID table not allocated\n"); return -EINVAL; } /* Do we need a sgid_lock here? 
*/ if (!sgid_tbl->active) { - dev_err(&res->pdev->dev, - "QPLIB: SGID table has no active entries"); + dev_err(&res->pdev->dev, "SGID table has no active entries\n"); return -ENOMEM; } for (index = 0; index < sgid_tbl->max; index++) { @@ -231,7 +239,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, break; } if (index == sgid_tbl->max) { - dev_warn(&res->pdev->dev, "GID not found in the SGID table"); + dev_warn(&res->pdev->dev, "GID not found in the SGID table\n"); return 0; } /* Remove GID from the SGID table */ @@ -244,7 +252,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, RCFW_CMD_PREP(req, DELETE_GID, cmd_flags); if (sgid_tbl->hw_id[index] == 0xFFFF) { dev_err(&res->pdev->dev, - "QPLIB: GID entry contains an invalid HW id"); + "GID entry contains an invalid HW id\n"); return -EINVAL; } req.gid_index = cpu_to_le16(sgid_tbl->hw_id[index]); @@ -258,7 +266,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, sgid_tbl->vlan[index] = 0; sgid_tbl->active--; dev_dbg(&res->pdev->dev, - "QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x", + "SGID deleted hw_id[0x%x] = 0x%x active = 0x%x\n", index, sgid_tbl->hw_id[index], sgid_tbl->active); sgid_tbl->hw_id[index] = (u16)-1; @@ -277,20 +285,19 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int i, free_idx; if (!sgid_tbl) { - dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated"); + dev_err(&res->pdev->dev, "SGID table not allocated\n"); return -EINVAL; } /* Do we need a sgid_lock here? 
*/ if (sgid_tbl->active == sgid_tbl->max) { - dev_err(&res->pdev->dev, "QPLIB: SGID table is full"); + dev_err(&res->pdev->dev, "SGID table is full\n"); return -ENOMEM; } free_idx = sgid_tbl->max; for (i = 0; i < sgid_tbl->max; i++) { if (!memcmp(&sgid_tbl->tbl[i], gid, sizeof(*gid))) { dev_dbg(&res->pdev->dev, - "QPLIB: SGID entry already exist in entry %d!", - i); + "SGID entry already exist in entry %d!\n", i); *index = i; return -EALREADY; } else if (!memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero, @@ -301,7 +308,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, } if (free_idx == sgid_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: SGID table is FULL but count is not MAX??"); + "SGID table is FULL but count is not MAX??\n"); return -ENOMEM; } if (update) { @@ -348,7 +355,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, sgid_tbl->vlan[free_idx] = 1; dev_dbg(&res->pdev->dev, - "QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x", + "SGID added hw_id[0x%x] = 0x%x active = 0x%x\n", free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active); *index = free_idx; @@ -404,7 +411,7 @@ int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res, } if (index >= pkey_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: Index %d exceeded PKEY table max (%d)", + "Index %d exceeded PKEY table max (%d)\n", index, pkey_tbl->max); return -EINVAL; } @@ -419,14 +426,13 @@ int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res, int i, rc = 0; if (!pkey_tbl) { - dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated"); + dev_err(&res->pdev->dev, "PKEY table not allocated\n"); return -EINVAL; } /* Do we need a pkey_lock here? 
*/ if (!pkey_tbl->active) { - dev_err(&res->pdev->dev, - "QPLIB: PKEY table has no active entries"); + dev_err(&res->pdev->dev, "PKEY table has no active entries\n"); return -ENOMEM; } for (i = 0; i < pkey_tbl->max; i++) { @@ -435,8 +441,7 @@ int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res, } if (i == pkey_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: PKEY 0x%04x not found in the pkey table", - *pkey); + "PKEY 0x%04x not found in the pkey table\n", *pkey); return -ENOMEM; } memset(&pkey_tbl->tbl[i], 0, sizeof(*pkey)); @@ -453,13 +458,13 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, int i, free_idx, rc = 0; if (!pkey_tbl) { - dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated"); + dev_err(&res->pdev->dev, "PKEY table not allocated\n"); return -EINVAL; } /* Do we need a pkey_lock here? */ if (pkey_tbl->active == pkey_tbl->max) { - dev_err(&res->pdev->dev, "QPLIB: PKEY table is full"); + dev_err(&res->pdev->dev, "PKEY table is full\n"); return -ENOMEM; } free_idx = pkey_tbl->max; @@ -471,7 +476,7 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, } if (free_idx == pkey_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: PKEY table is FULL but count is not MAX??"); + "PKEY table is FULL but count is not MAX??\n"); return -ENOMEM; } /* Add PKEY to the pkey_tbl */ @@ -555,8 +560,7 @@ int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) int rc; if (mrw->lkey == 0xFFFFFFFF) { - dev_info(&res->pdev->dev, - "QPLIB: SP: Free a reserved lkey MRW"); + dev_info(&res->pdev->dev, "SP: Free a reserved lkey MRW\n"); return 0; } @@ -666,9 +670,8 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, pages++; if (pages > MAX_PBL_LVL_1_PGS) { - dev_err(&res->pdev->dev, "QPLIB: SP: Reg MR pages "); dev_err(&res->pdev->dev, - "requested (0x%x) exceeded max (0x%x)", + "SP: Reg MR pages requested (0x%x) exceeded max (0x%x)\n", pages, MAX_PBL_LVL_1_PGS); return -ENOMEM; } @@ -684,7 +687,7 @@ int bnxt_qplib_reg_mr(struct 
bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, HWQ_TYPE_CTX); if (rc) { dev_err(&res->pdev->dev, - "SP: Reg MR memory allocation failed"); + "SP: Reg MR memory allocation failed\n"); return -ENOMEM; } /* Write to the hwq */ @@ -795,7 +798,7 @@ int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); if (!sbuf) { dev_err(&rcfw->pdev->dev, - "QPLIB: SP: QUERY_ROCE_STATS alloc side buffer failed"); + "SP: QUERY_ROCE_STATS alloc side buffer failed\n"); return -ENOMEM; } @@ -845,6 +848,16 @@ int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, stats->res_srq_load_err = le64_to_cpu(sb->res_srq_load_err); stats->res_tx_pci_err = le64_to_cpu(sb->res_tx_pci_err); stats->res_rx_pci_err = le64_to_cpu(sb->res_rx_pci_err); + if (!rcfw->init_oos_stats) { + rcfw->oos_prev = le64_to_cpu(sb->res_oos_drop_count); + rcfw->init_oos_stats = 1; + } else { + stats->res_oos_drop_count += + (le64_to_cpu(sb->res_oos_drop_count) - + rcfw->oos_prev) & BNXT_QPLIB_OOS_COUNT_MASK; + rcfw->oos_prev = le64_to_cpu(sb->res_oos_drop_count); + } + bail: bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); return rc; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 9d3e8b994945..8079d7f5a008 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -205,6 +205,16 @@ struct bnxt_qplib_roce_stats { /* res_tx_pci_err is 64 b */ u64 res_rx_pci_err; /* res_rx_pci_err is 64 b */ + u64 res_oos_drop_count; + /* res_oos_drop_count */ + u64 active_qp_count_p0; + /* port 0 active qps */ + u64 active_qp_count_p1; + /* port 1 active qps */ + u64 active_qp_count_p2; + /* port 2 active qps */ + u64 active_qp_count_p3; + /* port 3 active qps */ }; int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 3e5a4f760d0e..8a9ead419ac2 100644 --- 
a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -2929,6 +2929,11 @@ struct creq_query_roce_stats_resp_sb { __le64 res_srq_load_err; __le64 res_tx_pci_err; __le64 res_rx_pci_err; + __le64 res_oos_drop_count; + __le64 active_qp_count_p0; + __le64 active_qp_count_p1; + __le64 active_qp_count_p2; + __le64 active_qp_count_p3; }; /* QP error notification event (16 bytes) */ diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 1b9ff21aa1d5..ebbec02cebe0 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1127,17 +1127,18 @@ static int iwch_query_port(struct ib_device *ibdev, return 0; } -static ssize_t show_rev(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, ibdev.dev); pr_debug("%s dev 0x%p\n", __func__, dev); return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hca(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, ibdev.dev); @@ -1148,9 +1149,10 @@ static ssize_t show_hca(struct device *dev, struct device_attribute *attr, lldev->ethtool_ops->get_drvinfo(lldev, &info); return sprintf(buf, "%s\n", info.driver); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_board(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, ibdev.dev); @@ -1158,6 +1160,7 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr, return 
sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor, iwch_dev->rdev.rnic_info.pdev->device); } +static DEVICE_ATTR_RO(board_id); enum counters { IPINRECEIVES, @@ -1274,14 +1277,15 @@ static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats, return stats->num_counters; } -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *iwch_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *iwch_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id, +static const struct attribute_group iwch_attr_group = { + .attrs = iwch_class_attributes, }; static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num, @@ -1316,10 +1320,8 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) int iwch_register_device(struct iwch_dev *dev) { int ret; - int i; pr_debug("%s iwch_dev %p\n", __func__, dev); - strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); dev->ibdev.owner = THIS_MODULE; @@ -1402,33 +1404,16 @@ int iwch_register_device(struct iwch_dev *dev) sizeof(dev->ibdev.iwcm->ifname)); dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; - ret = ib_register_device(&dev->ibdev, NULL); + rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group); + ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL); if (ret) - goto bail1; - - for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { - ret = device_create_file(&dev->ibdev.dev, - iwch_class_attributes[i]); - if (ret) { - goto bail2; - } - } - return 0; -bail2: - ib_unregister_device(&dev->ibdev); -bail1: - kfree(dev->ibdev.iwcm); + kfree(dev->ibdev.iwcm); return ret; } void 
iwch_unregister_device(struct iwch_dev *dev) { - int i; - pr_debug("%s iwch_dev %p\n", __func__, dev); - for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) - device_remove_file(&dev->ibdev.dev, - iwch_class_attributes[i]); ib_unregister_device(&dev->ibdev); kfree(dev->ibdev.iwcm); return; diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 0f83cbec33f3..615413bd3e8d 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -403,8 +403,7 @@ void _c4iw_free_ep(struct kref *kref) ep->com.local_addr.ss_family); dst_release(ep->dst); cxgb4_l2t_release(ep->l2t); - if (ep->mpa_skb) - kfree_skb(ep->mpa_skb); + kfree_skb(ep->mpa_skb); } if (!skb_queue_empty(&ep->com.ep_skb_list)) skb_queue_purge(&ep->com.ep_skb_list); diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 6d3042794094..1fd8798d91a7 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -161,7 +161,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, cq->gts = rdev->lldi.gts_reg; cq->rdev = rdev; - cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, T4_BAR2_QTYPE_INGRESS, + cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, CXGB4_BAR2_QTYPE_INGRESS, &cq->bar2_qid, user ? 
&cq->bar2_pa : NULL); if (user && !cq->bar2_pa) { diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 4eda6872e617..cbb3c0ddd990 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -373,8 +373,8 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port, return 0; } -static ssize_t show_rev(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev.dev); @@ -382,9 +382,10 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%d\n", CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hca(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev.dev); @@ -395,9 +396,10 @@ static ssize_t show_hca(struct device *dev, struct device_attribute *attr, lldev->ethtool_ops->get_drvinfo(lldev, &info); return sprintf(buf, "%s\n", info.driver); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_board(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev.dev); @@ -405,6 +407,7 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor, c4iw_dev->rdev.lldi.pdev->device); } +static DEVICE_ATTR_RO(board_id); enum counters { IP4INSEGS, @@ -461,14 +464,15 @@ static int c4iw_get_mib(struct ib_device *ibdev, return stats->num_counters; } -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); 
-static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *c4iw_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *c4iw_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id, +static const struct attribute_group c4iw_attr_group = { + .attrs = c4iw_class_attributes, }; static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, @@ -530,12 +534,10 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) void c4iw_register_device(struct work_struct *work) { int ret; - int i; struct uld_ctx *ctx = container_of(work, struct uld_ctx, reg_work); struct c4iw_dev *dev = ctx->dev; pr_debug("c4iw_dev %p\n", dev); - strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); dev->ibdev.owner = THIS_MODULE; @@ -626,20 +628,13 @@ void c4iw_register_device(struct work_struct *work) memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iwcm->ifname)); + rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL); if (ret) goto err_kfree_iwcm; - - for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) { - ret = device_create_file(&dev->ibdev.dev, - c4iw_class_attributes[i]); - if (ret) - goto err_unregister_device; - } return; -err_unregister_device: - ib_unregister_device(&dev->ibdev); + err_kfree_iwcm: kfree(dev->ibdev.iwcm); err_dealloc_ctx: @@ -651,12 +646,7 @@ err_dealloc_ctx: void c4iw_unregister_device(struct c4iw_dev *dev) { - int i; - pr_debug("c4iw_dev %p\n", dev); - for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) - 
device_remove_file(&dev->ibdev.dev, - c4iw_class_attributes[i]); ib_unregister_device(&dev->ibdev); kfree(dev->ibdev.iwcm); return; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 62d6f197ec0b..13478f3b7057 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -279,12 +279,13 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, wq->db = rdev->lldi.db_reg; - wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid, T4_BAR2_QTYPE_EGRESS, + wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid, + CXGB4_BAR2_QTYPE_EGRESS, &wq->sq.bar2_qid, user ? &wq->sq.bar2_pa : NULL); if (need_rq) wq->rq.bar2_va = c4iw_bar2_addrs(rdev, wq->rq.qid, - T4_BAR2_QTYPE_EGRESS, + CXGB4_BAR2_QTYPE_EGRESS, &wq->rq.bar2_qid, user ? &wq->rq.bar2_pa : NULL); @@ -2572,7 +2573,7 @@ static int alloc_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx, memset(wq->queue, 0, wq->memsize); dma_unmap_addr_set(wq, mapping, wq->dma_addr); - wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, T4_BAR2_QTYPE_EGRESS, + wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, CXGB4_BAR2_QTYPE_EGRESS, &wq->bar2_qid, user ? &wq->bar2_pa : NULL); @@ -2813,8 +2814,7 @@ err_free_queue: free_srq_queue(srq, ucontext ? 
&ucontext->uctx : &rhp->rdev.uctx, srq->wr_waitp); err_free_skb: - if (srq->destroy_skb) - kfree_skb(srq->destroy_skb); + kfree_skb(srq->destroy_skb); err_free_srq_idx: c4iw_free_srq_idx(&rhp->rdev, srq->idx); err_free_wr_wait: diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index f451ba912f47..ff790390c91a 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -8,12 +8,42 @@ # obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o -hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ - eprom.o exp_rcv.o file_ops.o firmware.o \ - init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ - qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ - uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ - verbs_txreq.o vnic_main.o vnic_sdma.o +hfi1-y := \ + affinity.o \ + chip.o \ + device.o \ + driver.o \ + efivar.o \ + eprom.o \ + exp_rcv.o \ + file_ops.o \ + firmware.o \ + init.o \ + intr.o \ + iowait.o \ + mad.o \ + mmu_rb.o \ + msix.o \ + pcie.o \ + pio.o \ + pio_copy.o \ + platform.o \ + qp.o \ + qsfp.o \ + rc.o \ + ruc.o \ + sdma.o \ + sysfs.o \ + trace.o \ + uc.o \ + ud.o \ + user_exp_rcv.o \ + user_pages.o \ + user_sdma.o \ + verbs.o \ + verbs_txreq.o \ + vnic_main.o \ + vnic_sdma.o ifdef CONFIG_DEBUG_FS hfi1-y += debugfs.o diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index bedd5fba33b0..2baf38cc1e23 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -817,10 +817,10 @@ static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu) set = &entry->def_intr; cpumask_set_cpu(cpu, &set->mask); cpumask_set_cpu(cpu, &set->used); - for (i = 0; i < dd->num_msix_entries; i++) { + for (i = 0; i < dd->msix_info.max_requested; i++) { struct hfi1_msix_entry *other_msix; - other_msix = &dd->msix_entries[i]; + other_msix = &dd->msix_info.msix_entries[i]; if (other_msix->type != IRQ_SDMA || other_msix 
== msix) continue; diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index e1668bcc2d13..9b20479dc710 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -67,8 +67,6 @@ #include "debugfs.h" #include "fault.h" -#define NUM_IB_PORTS 1 - uint kdeth_qp; module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO); MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix"); @@ -1100,9 +1098,9 @@ struct err_reg_info { const char *desc; }; -#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START) -#define NUM_DC_ERRS (IS_DC_END - IS_DC_START) -#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START) +#define NUM_MISC_ERRS (IS_GENERAL_ERR_END + 1 - IS_GENERAL_ERR_START) +#define NUM_DC_ERRS (IS_DC_END + 1 - IS_DC_START) +#define NUM_VARIOUS (IS_VARIOUS_END + 1 - IS_VARIOUS_START) /* * Helpers for building HFI and DC error interrupt table entries. Different @@ -8181,7 +8179,7 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) /** * is_rcv_urgent_int() - User receive context urgent IRQ handler * @dd: valid dd - * @source: logical IRQ source (ofse from IS_RCVURGENT_START) + * @source: logical IRQ source (offset from IS_RCVURGENT_START) * * RX block receive urgent interrupt. Source is < 160. 
* @@ -8231,7 +8229,7 @@ static const struct is_table is_table[] = { is_sdma_eng_err_name, is_sdma_eng_err_int }, { IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, is_sendctxt_err_name, is_sendctxt_err_int }, -{ IS_SDMA_START, IS_SDMA_END, +{ IS_SDMA_START, IS_SDMA_IDLE_END, is_sdma_eng_name, is_sdma_eng_int }, { IS_VARIOUS_START, IS_VARIOUS_END, is_various_name, is_various_int }, @@ -8257,7 +8255,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) /* avoids a double compare by walking the table in-order */ for (entry = &is_table[0]; entry->is_name; entry++) { - if (source < entry->end) { + if (source <= entry->end) { trace_hfi1_interrupt(dd, entry, source); entry->is_int(dd, source - entry->start); return; @@ -8276,7 +8274,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) * context DATA IRQs are threaded and are not supported by this handler. * */ -static irqreturn_t general_interrupt(int irq, void *data) +irqreturn_t general_interrupt(int irq, void *data) { struct hfi1_devdata *dd = data; u64 regs[CCE_NUM_INT_CSRS]; @@ -8309,7 +8307,7 @@ static irqreturn_t general_interrupt(int irq, void *data) return handled; } -static irqreturn_t sdma_interrupt(int irq, void *data) +irqreturn_t sdma_interrupt(int irq, void *data) { struct sdma_engine *sde = data; struct hfi1_devdata *dd = sde->dd; @@ -8401,7 +8399,7 @@ static inline int check_packet_present(struct hfi1_ctxtdata *rcd) * invoked) is finished. The intent is to avoid extra interrupts while we * are processing packets anyway. */ -static irqreturn_t receive_context_interrupt(int irq, void *data) +irqreturn_t receive_context_interrupt(int irq, void *data) { struct hfi1_ctxtdata *rcd = data; struct hfi1_devdata *dd = rcd->dd; @@ -8441,7 +8439,7 @@ static irqreturn_t receive_context_interrupt(int irq, void *data) * Receive packet thread handler. This expects to be invoked with the * receive interrupt still blocked. 
*/ -static irqreturn_t receive_context_thread(int irq, void *data) +irqreturn_t receive_context_thread(int irq, void *data) { struct hfi1_ctxtdata *rcd = data; int present; @@ -9651,30 +9649,10 @@ void qsfp_event(struct work_struct *work) } } -static void init_qsfp_int(struct hfi1_devdata *dd) +void init_qsfp_int(struct hfi1_devdata *dd) { struct hfi1_pportdata *ppd = dd->pport; - u64 qsfp_mask, cce_int_mask; - const int qsfp1_int_smask = QSFP1_INT % 64; - const int qsfp2_int_smask = QSFP2_INT % 64; - - /* - * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 - * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR, - * therefore just one of QSFP1_INT/QSFP2_INT can be used to find - * the index of the appropriate CSR in the CCEIntMask CSR array - */ - cce_int_mask = read_csr(dd, CCE_INT_MASK + - (8 * (QSFP1_INT / 64))); - if (dd->hfi1_id) { - cce_int_mask &= ~((u64)1 << qsfp1_int_smask); - write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)), - cce_int_mask); - } else { - cce_int_mask &= ~((u64)1 << qsfp2_int_smask); - write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)), - cce_int_mask); - } + u64 qsfp_mask; qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); /* Clear current status to avoid spurious interrupts */ @@ -9691,6 +9669,12 @@ static void init_qsfp_int(struct hfi1_devdata *dd) write_csr(dd, dd->hfi1_id ? ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT, qsfp_mask); + + /* Enable the appropriate QSFP IRQ source */ + if (!dd->hfi1_id) + set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true); + else + set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true); } /* @@ -10577,12 +10561,29 @@ void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, } } -/* - * Verify if BCT for data VLs is non-zero. +/** + * data_vls_operational() - Verify if data VL BCT credits and MTU + * are both set. + * @ppd: pointer to hfi1_pportdata structure + * + * Return: true - Ok, false -otherwise. 
*/ static inline bool data_vls_operational(struct hfi1_pportdata *ppd) { - return !!ppd->actual_vls_operational; + int i; + u64 reg; + + if (!ppd->actual_vls_operational) + return false; + + for (i = 0; i < ppd->vls_supported; i++) { + reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i)); + if ((reg && !ppd->dd->vld[i].mtu) || + (!reg && ppd->dd->vld[i].mtu)) + return false; + } + + return true; } /* @@ -10695,7 +10696,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) if (!data_vls_operational(ppd)) { dd_dev_err(dd, - "%s: data VLs not operational\n", __func__); + "%s: Invalid data VL credits or mtu\n", + __func__); ret = -EINVAL; break; } @@ -11932,10 +11934,16 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK; } - if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) + if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) { + set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt, + IS_RCVAVAIL_START + rcd->ctxt, true); rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; - if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) + } + if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) { + set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt, + IS_RCVAVAIL_START + rcd->ctxt, false); rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + } if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr) rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; if (op & HFI1_RCVCTRL_TAILUPD_DIS) { @@ -11965,6 +11973,13 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS) rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + if (op & HFI1_RCVCTRL_URGENT_ENB) + set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt, + IS_RCVURGENT_START + rcd->ctxt, true); + if (op & HFI1_RCVCTRL_URGENT_DIS) + set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt, + IS_RCVURGENT_START + rcd->ctxt, false); + hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl); write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl); @@ -12963,63 +12978,71 @@ int 
hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp) return ret; } +/* ========================================================================= */ + /** - * get_int_mask - get 64 bit int mask - * @dd - the devdata - * @i - the csr (relative to CCE_INT_MASK) + * read_mod_write() - Calculate the IRQ register index and set/clear the bits + * @dd: valid devdata + * @src: IRQ source to determine register index from + * @bits: the bits to set or clear + * @set: true == set the bits, false == clear the bits * - * Returns the mask with the urgent interrupt mask - * bit clear for kernel receive contexts. */ -static u64 get_int_mask(struct hfi1_devdata *dd, u32 i) +static void read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits, + bool set) { - u64 mask = U64_MAX; /* default to no change */ - - if (i >= (IS_RCVURGENT_START / 64) && i < (IS_RCVURGENT_END / 64)) { - int j = (i - (IS_RCVURGENT_START / 64)) * 64; - int k = !j ? IS_RCVURGENT_START % 64 : 0; + u64 reg; + u16 idx = src / BITS_PER_REGISTER; - if (j) - j -= IS_RCVURGENT_START % 64; - /* j = 0..dd->first_dyn_alloc_ctxt - 1,k = 0..63 */ - for (; j < dd->first_dyn_alloc_ctxt && k < 64; j++, k++) - /* convert to bit in mask and clear */ - mask &= ~BIT_ULL(k); - } - return mask; + spin_lock(&dd->irq_src_lock); + reg = read_csr(dd, CCE_INT_MASK + (8 * idx)); + if (set) + reg |= bits; + else + reg &= ~bits; + write_csr(dd, CCE_INT_MASK + (8 * idx), reg); + spin_unlock(&dd->irq_src_lock); } -/* ========================================================================= */ - -/* - * Enable/disable chip from delivering interrupts. +/** + * set_intr_bits() - Enable/disable a range (one or more) IRQ sources + * @dd: valid devdata + * @first: first IRQ source to set/clear + * @last: last IRQ source (inclusive) to set/clear + * @set: true == set the bits, false == clear the bits + * + * If first == last, set the exact source. 
*/ -void set_intr_state(struct hfi1_devdata *dd, u32 enable) +int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set) { - int i; + u64 bits = 0; + u64 bit; + u16 src; - /* - * In HFI, the mask needs to be 1 to allow interrupts. - */ - if (enable) { - /* enable all interrupts but urgent on kernel contexts */ - for (i = 0; i < CCE_NUM_INT_CSRS; i++) { - u64 mask = get_int_mask(dd, i); + if (first > NUM_INTERRUPT_SOURCES || last > NUM_INTERRUPT_SOURCES) + return -EINVAL; - write_csr(dd, CCE_INT_MASK + (8 * i), mask); - } + if (last < first) + return -ERANGE; - init_qsfp_int(dd); - } else { - for (i = 0; i < CCE_NUM_INT_CSRS; i++) - write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); + for (src = first; src <= last; src++) { + bit = src % BITS_PER_REGISTER; + /* wrapped to next register? */ + if (!bit && bits) { + read_mod_write(dd, src - 1, bits, set); + bits = 0; + } + bits |= BIT_ULL(bit); } + read_mod_write(dd, last, bits, set); + + return 0; } /* * Clear all interrupt sources on the chip. */ -static void clear_all_interrupts(struct hfi1_devdata *dd) +void clear_all_interrupts(struct hfi1_devdata *dd) { int i; @@ -13043,38 +13066,11 @@ static void clear_all_interrupts(struct hfi1_devdata *dd) write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0); } -/** - * hfi1_clean_up_interrupts() - Free all IRQ resources - * @dd: valid device data data structure - * - * Free the MSIx and assoicated PCI resources, if they have been allocated. 
- */ -void hfi1_clean_up_interrupts(struct hfi1_devdata *dd) -{ - int i; - struct hfi1_msix_entry *me = dd->msix_entries; - - /* remove irqs - must happen before disabling/turning off */ - for (i = 0; i < dd->num_msix_entries; i++, me++) { - if (!me->arg) /* => no irq, no affinity */ - continue; - hfi1_put_irq_affinity(dd, me); - pci_free_irq(dd->pcidev, i, me->arg); - } - - /* clean structures */ - kfree(dd->msix_entries); - dd->msix_entries = NULL; - dd->num_msix_entries = 0; - - pci_free_irq_vectors(dd->pcidev); -} - /* * Remap the interrupt source from the general handler to the given MSI-X * interrupt. */ -static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) +void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) { u64 reg; int m, n; @@ -13098,8 +13094,7 @@ static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) write_csr(dd, CCE_INT_MAP + (8 * m), reg); } -static void remap_sdma_interrupts(struct hfi1_devdata *dd, - int engine, int msix_intr) +void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr) { /* * SDMA engine interrupt sources grouped by type, rather than @@ -13108,204 +13103,16 @@ static void remap_sdma_interrupts(struct hfi1_devdata *dd, * SDMAProgress * SDMAIdle */ - remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); - remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); - remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); -} - -static int request_msix_irqs(struct hfi1_devdata *dd) -{ - int first_general, last_general; - int first_sdma, last_sdma; - int first_rx, last_rx; - int i, ret = 0; - - /* calculate the ranges we are going to use */ - first_general = 0; - last_general = first_general + 1; - first_sdma = last_general; - last_sdma = first_sdma + dd->num_sdma; - first_rx = last_sdma; - last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts; - - /* VNIC MSIx interrupts get mapped when 
VNIC contexts are created */ - dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues; - - /* - * Sanity check - the code expects all SDMA chip source - * interrupts to be in the same CSR, starting at bit 0. Verify - * that this is true by checking the bit location of the start. - */ - BUILD_BUG_ON(IS_SDMA_START % 64); - - for (i = 0; i < dd->num_msix_entries; i++) { - struct hfi1_msix_entry *me = &dd->msix_entries[i]; - const char *err_info; - irq_handler_t handler; - irq_handler_t thread = NULL; - void *arg = NULL; - int idx; - struct hfi1_ctxtdata *rcd = NULL; - struct sdma_engine *sde = NULL; - char name[MAX_NAME_SIZE]; - - /* obtain the arguments to pci_request_irq */ - if (first_general <= i && i < last_general) { - idx = i - first_general; - handler = general_interrupt; - arg = dd; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d", dd->unit); - err_info = "general"; - me->type = IRQ_GENERAL; - } else if (first_sdma <= i && i < last_sdma) { - idx = i - first_sdma; - sde = &dd->per_sdma[idx]; - handler = sdma_interrupt; - arg = sde; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d sdma%d", dd->unit, idx); - err_info = "sdma"; - remap_sdma_interrupts(dd, idx, i); - me->type = IRQ_SDMA; - } else if (first_rx <= i && i < last_rx) { - idx = i - first_rx; - rcd = hfi1_rcd_get_by_index_safe(dd, idx); - if (rcd) { - /* - * Set the interrupt register and mask for this - * context's interrupt. 
- */ - rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; - rcd->imask = ((u64)1) << - ((IS_RCVAVAIL_START + idx) % 64); - handler = receive_context_interrupt; - thread = receive_context_thread; - arg = rcd; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d kctxt%d", - dd->unit, idx); - err_info = "receive context"; - remap_intr(dd, IS_RCVAVAIL_START + idx, i); - me->type = IRQ_RCVCTXT; - rcd->msix_intr = i; - hfi1_rcd_put(rcd); - } - } else { - /* not in our expected range - complain, then - * ignore it - */ - dd_dev_err(dd, - "Unexpected extra MSI-X interrupt %d\n", i); - continue; - } - /* no argument, no interrupt */ - if (!arg) - continue; - /* make sure the name is terminated */ - name[sizeof(name) - 1] = 0; - me->irq = pci_irq_vector(dd->pcidev, i); - ret = pci_request_irq(dd->pcidev, i, handler, thread, arg, - name); - if (ret) { - dd_dev_err(dd, - "unable to allocate %s interrupt, irq %d, index %d, err %d\n", - err_info, me->irq, idx, ret); - return ret; - } - /* - * assign arg after pci_request_irq call, so it will be - * cleaned up - */ - me->arg = arg; - - ret = hfi1_get_irq_affinity(dd, me); - if (ret) - dd_dev_err(dd, "unable to pin IRQ %d\n", ret); - } - - return ret; -} - -void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd) -{ - int i; - - for (i = 0; i < dd->vnic.num_ctxt; i++) { - struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; - struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - - synchronize_irq(me->irq); - } -} - -void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd) -{ - struct hfi1_devdata *dd = rcd->dd; - struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - - if (!me->arg) /* => no irq, no affinity */ - return; - - hfi1_put_irq_affinity(dd, me); - pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); - - me->arg = NULL; -} - -void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd) -{ - struct hfi1_devdata *dd = rcd->dd; - struct hfi1_msix_entry *me; - int idx = rcd->ctxt; - void *arg = rcd; - int ret; - - 
rcd->msix_intr = dd->vnic.msix_idx++; - me = &dd->msix_entries[rcd->msix_intr]; - - /* - * Set the interrupt register and mask for this - * context's interrupt. - */ - rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; - rcd->imask = ((u64)1) << - ((IS_RCVAVAIL_START + idx) % 64); - me->type = IRQ_RCVCTXT; - me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr); - remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr); - - ret = pci_request_irq(dd->pcidev, rcd->msix_intr, - receive_context_interrupt, - receive_context_thread, arg, - DRIVER_NAME "_%d kctxt%d", dd->unit, idx); - if (ret) { - dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n", - me->irq, idx, ret); - return; - } - /* - * assign arg after pci_request_irq call, so it will be - * cleaned up - */ - me->arg = arg; - - ret = hfi1_get_irq_affinity(dd, me); - if (ret) { - dd_dev_err(dd, - "unable to pin IRQ %d\n", ret); - pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); - } + remap_intr(dd, IS_SDMA_START + engine, msix_intr); + remap_intr(dd, IS_SDMA_PROGRESS_START + engine, msix_intr); + remap_intr(dd, IS_SDMA_IDLE_START + engine, msix_intr); } /* * Set the general handler to accept all interrupts, remap all * chip interrupts back to MSI-X 0. 
*/ -static void reset_interrupts(struct hfi1_devdata *dd) +void reset_interrupts(struct hfi1_devdata *dd) { int i; @@ -13318,54 +13125,33 @@ static void reset_interrupts(struct hfi1_devdata *dd) write_csr(dd, CCE_INT_MAP + (8 * i), 0); } +/** + * set_up_interrupts() - Initialize the IRQ resources and state + * @dd: valid devdata + * + */ static int set_up_interrupts(struct hfi1_devdata *dd) { - u32 total; - int ret, request; - - /* - * Interrupt count: - * 1 general, "slow path" interrupt (includes the SDMA engines - * slow source, SDMACleanupDone) - * N interrupts - one per used SDMA engine - * M interrupt - one per kernel receive context - * V interrupt - one for each VNIC context - */ - total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts; - - /* ask for MSI-X interrupts */ - request = request_msix(dd, total); - if (request < 0) { - ret = request; - goto fail; - } else { - dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries), - GFP_KERNEL); - if (!dd->msix_entries) { - ret = -ENOMEM; - goto fail; - } - /* using MSI-X */ - dd->num_msix_entries = total; - dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); - } + int ret; /* mask all interrupts */ - set_intr_state(dd, 0); + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); + /* clear all pending interrupts */ clear_all_interrupts(dd); /* reset general handler mask, chip MSI-X mappings */ reset_interrupts(dd); - ret = request_msix_irqs(dd); + /* ask for MSI-X interrupts */ + ret = msix_initialize(dd); if (ret) - goto fail; + return ret; - return 0; + ret = msix_request_irqs(dd); + if (ret) + msix_clean_up_interrupts(dd); -fail: - hfi1_clean_up_interrupts(dd); return ret; } @@ -14918,20 +14704,16 @@ err_exit: } /** - * Allocate and initialize the device structure for the hfi. + * hfi1_init_dd() - Initialize most of the dd structure. 
* @dev: the pci_dev for hfi1_ib device * @ent: pci_device_id struct for this dev * - * Also allocates, initializes, and returns the devdata struct for this - * device instance - * * This is global, and is called directly at init to set up the * chip-specific function pointers for later use. */ -struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, - const struct pci_device_id *ent) +int hfi1_init_dd(struct hfi1_devdata *dd) { - struct hfi1_devdata *dd; + struct pci_dev *pdev = dd->pcidev; struct hfi1_pportdata *ppd; u64 reg; int i, ret; @@ -14942,13 +14724,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, "Functional simulator" }; struct pci_dev *parent = pdev->bus->self; - u32 sdma_engines; + u32 sdma_engines = chip_sdma_engines(dd); - dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * - sizeof(struct hfi1_pportdata)); - if (IS_ERR(dd)) - goto bail; - sdma_engines = chip_sdma_engines(dd); ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) { int vl; @@ -15127,6 +14904,12 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_cleanup; + /* + * This should probably occur in hfi1_pcie_init(), but historically + * occurs after the do_pcie_gen3_transition() code. 
+ */ + tune_pcie_caps(dd); + /* start setting dd values and adjusting CSRs */ init_early_variables(dd); @@ -15239,14 +15022,13 @@ bail_free_cntrs: free_cntrs(dd); bail_clear_intr: hfi1_comp_vectors_clean_up(dd); - hfi1_clean_up_interrupts(dd); + msix_clean_up_interrupts(dd); bail_cleanup: hfi1_pcie_ddcleanup(dd); bail_free: hfi1_free_devdata(dd); - dd = ERR_PTR(ret); bail: - return dd; + return ret; } static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate, diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 36b04d6300e5..6b9c8f12dff8 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -52,9 +52,7 @@ */ /* sizes */ -#define CCE_NUM_MSIX_VECTORS 256 -#define CCE_NUM_INT_CSRS 12 -#define CCE_NUM_INT_MAP_CSRS 96 +#define BITS_PER_REGISTER (BITS_PER_BYTE * sizeof(u64)) #define NUM_INTERRUPT_SOURCES 768 #define RXE_NUM_CONTEXTS 160 #define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */ @@ -161,34 +159,49 @@ (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \ CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT) -/* interrupt source numbers */ -#define IS_GENERAL_ERR_START 0 -#define IS_SDMAENG_ERR_START 16 -#define IS_SENDCTXT_ERR_START 32 -#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */ +/* Specific IRQ sources */ +#define CCE_ERR_INT 0 +#define RXE_ERR_INT 1 +#define MISC_ERR_INT 2 +#define PIO_ERR_INT 4 +#define SDMA_ERR_INT 5 +#define EGRESS_ERR_INT 6 +#define TXE_ERR_INT 7 +#define PBC_INT 240 +#define GPIO_ASSERT_INT 241 +#define QSFP1_INT 242 +#define QSFP2_INT 243 +#define TCRIT_INT 244 + +/* interrupt source ranges */ +#define IS_FIRST_SOURCE CCE_ERR_INT +#define IS_GENERAL_ERR_START 0 +#define IS_SDMAENG_ERR_START 16 +#define IS_SENDCTXT_ERR_START 32 +#define IS_SDMA_START 192 +#define IS_SDMA_PROGRESS_START 208 +#define IS_SDMA_IDLE_START 224 #define IS_VARIOUS_START 240 #define IS_DC_START 248 #define IS_RCVAVAIL_START 256 #define IS_RCVURGENT_START 416 #define IS_SENDCREDIT_START 576 
#define IS_RESERVED_START 736 -#define IS_MAX_SOURCES 768 +#define IS_LAST_SOURCE 767 /* derived interrupt source values */ -#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START -#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START -#define IS_SENDCTXT_ERR_END IS_SDMA_START -#define IS_SDMA_END IS_VARIOUS_START -#define IS_VARIOUS_END IS_DC_START -#define IS_DC_END IS_RCVAVAIL_START -#define IS_RCVAVAIL_END IS_RCVURGENT_START -#define IS_RCVURGENT_END IS_SENDCREDIT_START -#define IS_SENDCREDIT_END IS_RESERVED_START -#define IS_RESERVED_END IS_MAX_SOURCES - -/* absolute interrupt numbers for QSFP1Int and QSFP2Int */ -#define QSFP1_INT 242 -#define QSFP2_INT 243 +#define IS_GENERAL_ERR_END 7 +#define IS_SDMAENG_ERR_END 31 +#define IS_SENDCTXT_ERR_END 191 +#define IS_SDMA_END 207 +#define IS_SDMA_PROGRESS_END 223 +#define IS_SDMA_IDLE_END 239 +#define IS_VARIOUS_END 244 +#define IS_DC_END 255 +#define IS_RCVAVAIL_END 415 +#define IS_RCVURGENT_END 575 +#define IS_SENDCREDIT_END 735 +#define IS_RESERVED_END IS_LAST_SOURCE /* DCC_CFG_PORT_CONFIG logical link states */ #define LSTATE_DOWN 0x1 @@ -1416,6 +1429,18 @@ void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); void hfi1_init_vnic_rsm(struct hfi1_devdata *dd); void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd); +irqreturn_t general_interrupt(int irq, void *data); +irqreturn_t sdma_interrupt(int irq, void *data); +irqreturn_t receive_context_interrupt(int irq, void *data); +irqreturn_t receive_context_thread(int irq, void *data); + +int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set); +void init_qsfp_int(struct hfi1_devdata *dd); +void clear_all_interrupts(struct hfi1_devdata *dd); +void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); +void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); +void reset_interrupts(struct hfi1_devdata *dd); + /* * Interrupt source table. 
* diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index ee6dca5e2a2f..c6163a347e93 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -878,6 +878,10 @@ #define SEND_CTRL (TXE + 0x000000000000) #define SEND_CTRL_CM_RESET_SMASK 0x4ull #define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull +#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 +#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xFFull +#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ + << SEND_CTRL_UNSUPPORTED_VL_SHIFT) #define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull #define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080) #define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 1fc75647e47b..c22ebc774a6a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -681,7 +681,8 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_ONE_PKT_EGR_DIS | HFI1_RCVCTRL_NO_RHQ_DROP_DIS | - HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + HFI1_RCVCTRL_NO_EGR_DROP_DIS | + HFI1_RCVCTRL_URGENT_DIS, uctxt); /* Clear the context's J_KEY */ hfi1_clear_ctxt_jkey(dd, uctxt); /* @@ -1096,6 +1097,7 @@ static void user_init(struct hfi1_ctxtdata *uctxt) hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey); rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + rcvctrl_ops |= HFI1_RCVCTRL_URGENT_ENB; if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; /* diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index d9470317983f..1401b6ea4a28 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -80,6 +80,7 @@ #include "qsfp.h" #include "platform.h" #include "affinity.h" +#include "msix.h" /* bumped 1 from s/w major version of TrueScale */ #define HFI1_CHIP_VERS_MAJ 
3U @@ -620,6 +621,8 @@ struct rvt_sge_state; #define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000 #define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000 #define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000 +#define HFI1_RCVCTRL_URGENT_ENB 0x40000 +#define HFI1_RCVCTRL_URGENT_DIS 0x80000 /* partition enforcement flags */ #define HFI1_PART_ENFORCE_IN 0x1 @@ -667,6 +670,14 @@ struct hfi1_msix_entry { struct irq_affinity_notify notify; }; +struct hfi1_msix_info { + /* lock to synchronize in_use_msix access */ + spinlock_t msix_lock; + DECLARE_BITMAP(in_use_msix, CCE_NUM_MSIX_VECTORS); + struct hfi1_msix_entry *msix_entries; + u16 max_requested; +}; + /* per-SL CCA information */ struct cca_timer { struct hrtimer hrtimer; @@ -992,7 +1003,6 @@ struct hfi1_vnic_data { struct idr vesw_idr; u8 rmt_start; u8 num_ctxt; - u32 msix_idx; }; struct hfi1_vnic_vport_info; @@ -1205,11 +1215,6 @@ struct hfi1_devdata { struct diag_client *diag_client; - /* MSI-X information */ - struct hfi1_msix_entry *msix_entries; - u32 num_msix_entries; - u32 first_dyn_msix_idx; - /* general interrupt: mask of handled interrupts */ u64 gi_mask[CCE_NUM_INT_CSRS]; @@ -1223,6 +1228,9 @@ struct hfi1_devdata { */ struct timer_list synth_stats_timer; + /* MSI-X information */ + struct hfi1_msix_info msix_info; + /* * device counters */ @@ -1349,6 +1357,8 @@ struct hfi1_devdata { /* vnic data */ struct hfi1_vnic_data vnic; + /* Lock to protect IRQ SRC register access */ + spinlock_t irq_src_lock; }; static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare) @@ -1431,9 +1441,6 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread); void set_all_slowpath(struct hfi1_devdata *dd); -void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd); -void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd); -void hfi1_reset_vnic_msix_info(struct 
hfi1_ctxtdata *rcd); extern const struct pci_device_id hfi1_pci_tbl[]; void hfi1_make_ud_req_9B(struct rvt_qp *qp, @@ -1887,10 +1894,8 @@ struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) #define HFI1_CTXT_WAITING_URG 4 /* free up any allocated data at closes */ -struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, - const struct pci_device_id *ent); +int hfi1_init_dd(struct hfi1_devdata *dd); void hfi1_free_devdata(struct hfi1_devdata *dd); -struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra); /* LED beaconing functions */ void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, @@ -1963,6 +1968,7 @@ static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) */ extern const char ib_hfi1_version[]; +extern const struct attribute_group ib_hfi1_attr_group; int hfi1_device_create(struct hfi1_devdata *dd); void hfi1_device_remove(struct hfi1_devdata *dd); @@ -1974,16 +1980,15 @@ void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd); /* Hook for sysfs read of QSFP */ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); -int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent); -void hfi1_clean_up_interrupts(struct hfi1_devdata *dd); +int hfi1_pcie_init(struct hfi1_devdata *dd); void hfi1_pcie_cleanup(struct pci_dev *pdev); int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev); void hfi1_pcie_ddcleanup(struct hfi1_devdata *); int pcie_speeds(struct hfi1_devdata *dd); -int request_msix(struct hfi1_devdata *dd, u32 msireq); int restore_pci_variables(struct hfi1_devdata *dd); int save_pci_variables(struct hfi1_devdata *dd); int do_pcie_gen3_transition(struct hfi1_devdata *dd); +void tune_pcie_caps(struct hfi1_devdata *dd); int parse_platform_config(struct hfi1_devdata *dd); int get_platform_config_field(struct hfi1_devdata *dd, enum platform_config_table_type_encoding @@ -2124,19 +2129,6 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata 
*dd) return base_sdma_integrity; } -/* - * hfi1_early_err is used (only!) to print early errors before devdata is - * allocated, or when dd->pcidev may not be valid, and at the tail end of - * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is - * the same as dd_dev_err, but is used when the message really needs - * the IB port# to be definitive as to what's happening.. - */ -#define hfi1_early_err(dev, fmt, ...) \ - dev_err(dev, fmt, ##__VA_ARGS__) - -#define hfi1_early_info(dev, fmt, ...) \ - dev_info(dev, fmt, ##__VA_ARGS__) - #define dd_dev_emerg(dd, fmt, ...) \ dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 758d273c32cf..09044905284f 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -83,6 +83,8 @@ #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ +#define NUM_IB_PORTS 1 + /* * Number of user receive contexts we are configured to use (to allow for more * pio buffers per ctxt, etc.) Zero means use one user context per CPU. 
@@ -654,9 +656,8 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, ppd->part_enforce |= HFI1_PART_ENFORCE_IN; if (loopback) { - hfi1_early_err(&pdev->dev, - "Faking data partition 0x8001 in idx %u\n", - !default_pkey_idx); + dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n", + !default_pkey_idx); ppd->pkeys[!default_pkey_idx] = 0x8001; } @@ -702,9 +703,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, return; bail: - - hfi1_early_err(&pdev->dev, - "Congestion Control Agent disabled for port %d\n", port); + dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); } /* @@ -833,6 +832,23 @@ wq_error: } /** + * enable_general_intr() - Enable the IRQs that will be handled by the + * general interrupt handler. + * @dd: valid devdata + * + */ +static void enable_general_intr(struct hfi1_devdata *dd) +{ + set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true); + set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true); + set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true); + set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true); + set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true); + set_intr_bits(dd, IS_DC_START, IS_DC_END, true); + set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true); +} + +/** * hfi1_init - do the actual initialization sequence on the chip * @dd: the hfi1_ib device * @reinit: re-initializing, so don't allocate new memory @@ -916,6 +932,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); ret = lastfail; } + /* enable IRQ */ hfi1_rcd_put(rcd); } @@ -954,7 +971,8 @@ done: HFI1_STATUS_INITTED; if (!ret) { /* enable all interrupts from the chip */ - set_intr_state(dd, 1); + enable_general_intr(dd); + init_qsfp_int(dd); /* chip is OK for user apps; mark it as initialized */ for (pidx = 0; pidx < dd->num_pports; ++pidx) { @@ -1051,9 +1069,9 @@ static void shutdown_device(struct hfi1_devdata *dd) } dd->flags 
&= ~HFI1_INITTED; - /* mask and clean up interrupts, but not errors */ - set_intr_state(dd, 0); - hfi1_clean_up_interrupts(dd); + /* mask and clean up interrupts */ + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); + msix_clean_up_interrupts(dd); for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -1246,15 +1264,19 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) kobject_put(&dd->kobj); } -/* - * Allocate our primary per-unit data structure. Must be done via verbs - * allocator, because the verbs cleanup process both does cleanup and - * free of the data structure. +/** + * hfi1_alloc_devdata - Allocate our primary per-unit data structure. + * @pdev: Valid PCI device + * @extra: How many bytes to alloc past the default + * + * Must be done via verbs allocator, because the verbs cleanup process + * both does cleanup and free of the data structure. * "extra" is for chip-specific data. * * Use the idr mechanism to get a unit number for this unit. */ -struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) +static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, + size_t extra) { unsigned long flags; struct hfi1_devdata *dd; @@ -1287,8 +1309,8 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) idr_preload_end(); if (ret < 0) { - hfi1_early_err(&pdev->dev, - "Could not allocate unit ID: error %d\n", -ret); + dev_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); goto bail; } rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); @@ -1309,6 +1331,7 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) spin_lock_init(&dd->pio_map_lock); mutex_init(&dd->dc8051_lock); init_waitqueue_head(&dd->event_queue); + spin_lock_init(&dd->irq_src_lock); dd->int_counter = alloc_percpu(u64); if (!dd->int_counter) { @@ -1481,9 +1504,6 @@ static int __init hfi1_mod_init(void) idr_init(&hfi1_unit_table); hfi1_dbg_init(); - ret = 
hfi1_wss_init(); - if (ret < 0) - goto bail_wss; ret = pci_register_driver(&hfi1_pci_driver); if (ret < 0) { pr_err("Unable to register driver: error %d\n", -ret); @@ -1492,8 +1512,6 @@ static int __init hfi1_mod_init(void) goto bail; /* all OK */ bail_dev: - hfi1_wss_exit(); -bail_wss: hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); dev_cleanup(); @@ -1510,7 +1528,6 @@ static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); node_affinity_destroy_all(); - hfi1_wss_exit(); hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); @@ -1604,23 +1621,23 @@ static void postinit_cleanup(struct hfi1_devdata *dd) hfi1_free_devdata(dd); } -static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt) +static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt) { if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { - hfi1_early_err(dev, "Receive header queue count too small\n"); + dd_dev_err(dd, "Receive header queue count too small\n"); return -EINVAL; } if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { - hfi1_early_err(dev, - "Receive header queue count cannot be greater than %u\n", - HFI1_MAX_HDRQ_EGRBUF_CNT); + dd_dev_err(dd, + "Receive header queue count cannot be greater than %u\n", + HFI1_MAX_HDRQ_EGRBUF_CNT); return -EINVAL; } if (thecnt % HDRQ_INCREMENT) { - hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n", - thecnt, HDRQ_INCREMENT); + dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n", + thecnt, HDRQ_INCREMENT); return -EINVAL; } @@ -1639,22 +1656,29 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* Validate dev ids */ if (!(ent->device == PCI_DEVICE_ID_INTEL0 || ent->device == PCI_DEVICE_ID_INTEL1)) { - hfi1_early_err(&pdev->dev, - "Failing on unknown Intel deviceid 0x%x\n", - ent->device); + dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", + ent->device); ret = -ENODEV; goto bail; } + /* Allocate the dd so we can get to work */ + dd = 
hfi1_alloc_devdata(pdev, NUM_IB_PORTS * + sizeof(struct hfi1_pportdata)); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + goto bail; + } + /* Validate some global module parameters */ - ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt); + ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt); if (ret) goto bail; /* use the encoding function as a sanitization check */ if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { - hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", - hfi1_hdrq_entsize); + dd_dev_err(dd, "Invalid HdrQ Entry size %u\n", + hfi1_hdrq_entsize); ret = -EINVAL; goto bail; } @@ -1676,10 +1700,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) clamp_val(eager_buffer_size, MIN_EAGER_BUFFER * 8, MAX_EAGER_BUFFER_TOTAL); - hfi1_early_info(&pdev->dev, "Eager buffer size %u\n", - eager_buffer_size); + dd_dev_info(dd, "Eager buffer size %u\n", + eager_buffer_size); } else { - hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n"); + dd_dev_err(dd, "Invalid Eager buffer size of 0\n"); ret = -EINVAL; goto bail; } @@ -1687,7 +1711,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* restrict value of hfi1_rcvarr_split */ hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); - ret = hfi1_pcie_init(pdev, ent); + ret = hfi1_pcie_init(dd); if (ret) goto bail; @@ -1695,12 +1719,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) * Do device-specific initialization, function table setup, dd * allocation, etc. 
*/ - dd = hfi1_init_dd(pdev, ent); - - if (IS_ERR(dd)) { - ret = PTR_ERR(dd); + ret = hfi1_init_dd(dd); + if (ret) goto clean_bail; /* error already printed */ - } ret = create_workqueues(dd); if (ret) @@ -1731,7 +1752,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); if (initfail || ret) { - hfi1_clean_up_interrupts(dd); + msix_clean_up_interrupts(dd); stop_timers(dd); flush_workqueue(ib_wq); for (pidx = 0; pidx < dd->num_pports; ++pidx) { diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c new file mode 100644 index 000000000000..582f1ba136ff --- /dev/null +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#include "iowait.h" +#include "trace_iowait.h" + +void iowait_set_flag(struct iowait *wait, u32 flag) +{ + trace_hfi1_iowait_set(wait, flag); + set_bit(flag, &wait->flags); +} + +bool iowait_flag_set(struct iowait *wait, u32 flag) +{ + return test_bit(flag, &wait->flags); +} + +inline void iowait_clear_flag(struct iowait *wait, u32 flag) +{ + trace_hfi1_iowait_clear(wait, flag); + clear_bit(flag, &wait->flags); +} + +/** + * iowait_init() - initialize wait structure + * @wait: wait struct to initialize + * @tx_limit: limit for overflow queuing + * @func: restart function for workqueue + * @sleep: sleep function for no space + * @resume: wakeup function for no space + * + * This function initializes the iowait + * structure embedded in the QP or PQ. 
+ * + */ +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)) +{ + int i; + + wait->count = 0; + INIT_LIST_HEAD(&wait->list); + init_waitqueue_head(&wait->wait_dma); + init_waitqueue_head(&wait->wait_pio); + atomic_set(&wait->sdma_busy, 0); + atomic_set(&wait->pio_busy, 0); + wait->tx_limit = tx_limit; + wait->sleep = sleep; + wait->wakeup = wakeup; + wait->sdma_drained = sdma_drained; + wait->flags = 0; + for (i = 0; i < IOWAIT_SES; i++) { + wait->wait[i].iow = wait; + INIT_LIST_HEAD(&wait->wait[i].tx_head); + if (i == IOWAIT_IB_SE) + INIT_WORK(&wait->wait[i].iowork, func); + else + INIT_WORK(&wait->wait[i].iowork, tidfunc); + } +} + +/** + * iowait_cancel_work - cancel all work in iowait + * @w: the iowait struct + */ +void iowait_cancel_work(struct iowait *w) +{ + cancel_work_sync(&iowait_get_ib_work(w)->iowork); + cancel_work_sync(&iowait_get_tid_work(w)->iowork); +} + +/** + * iowait_set_work_flag - set work flag based on leg + * @w - the iowait work struct + */ +int iowait_set_work_flag(struct iowait_work *w) +{ + if (w == &w->iow->wait[IOWAIT_IB_SE]) { + iowait_set_flag(w->iow, IOWAIT_PENDING_IB); + return IOWAIT_IB_SE; + } + iowait_set_flag(w->iow, IOWAIT_PENDING_TID); + return IOWAIT_TID_SE; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 3d9c32c7c340..23a58ac0d47c 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -1,7 +1,7 @@ #ifndef _HFI1_IOWAIT_H #define _HFI1_IOWAIT_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. 
When using or * redistributing this file, you may do so under either license. @@ -49,6 +49,7 @@ #include <linux/list.h> #include <linux/workqueue.h> +#include <linux/wait.h> #include <linux/sched.h> #include "sdma_txreq.h" @@ -59,16 +60,47 @@ */ typedef void (*restart_t)(struct work_struct *work); +#define IOWAIT_PENDING_IB 0x0 +#define IOWAIT_PENDING_TID 0x1 + +/* + * A QP can have multiple Send Engines (SEs). + * + * The current use case is for supporting a TID RDMA + * packet build/xmit mechanism independent from verbs. + */ +#define IOWAIT_SES 2 +#define IOWAIT_IB_SE 0 +#define IOWAIT_TID_SE 1 + struct sdma_txreq; struct sdma_engine; /** - * struct iowait - linkage for delayed progress/waiting + * @iowork: the work struct + * @tx_head: list of prebuilt packets + * @iow: the parent iowait structure + * + * This structure is the work item (process) specific + * details associated with the each of the two SEs of the + * QP. + * + * The workstruct and the queued TXs are unique to each + * SE. + */ +struct iowait; +struct iowait_work { + struct work_struct iowork; + struct list_head tx_head; + struct iowait *iow; +}; + +/** * @list: used to add/insert into QP/PQ wait lists - * @lock: uses to record the list head lock * @tx_head: overflow list of sdma_txreq's * @sleep: no space callback * @wakeup: space callback wakeup * @sdma_drained: sdma count drained + * @lock: lock protected head of wait queue * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 * @wait_pio: wait for pio_busy == 0 @@ -76,6 +108,8 @@ struct sdma_engine; * @count: total number of descriptors in tx_head'ed list * @tx_limit: limit for overflow queuing * @tx_count: number of tx entry's in tx_head'ed list + * @flags: wait flags (one per QP) + * @wait: SE array * * This is to be embedded in user's state structure * (QP or PQ). @@ -98,13 +132,11 @@ struct sdma_engine; * Waiters explicity know that, but the destroy * code that unwaits QPs does not. 
*/ - struct iowait { struct list_head list; - struct list_head tx_head; int (*sleep)( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, uint seq, bool pkts_sent @@ -112,7 +144,6 @@ struct iowait { void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); seqlock_t *lock; - struct work_struct iowork; wait_queue_head_t wait_dma; wait_queue_head_t wait_pio; atomic_t sdma_busy; @@ -121,63 +152,37 @@ struct iowait { u32 tx_limit; u32 tx_count; u8 starved_cnt; + unsigned long flags; + struct iowait_work wait[IOWAIT_SES]; }; #define SDMA_AVAIL_REASON 0 -/** - * iowait_init() - initialize wait structure - * @wait: wait struct to initialize - * @tx_limit: limit for overflow queuing - * @func: restart function for workqueue - * @sleep: sleep function for no space - * @resume: wakeup function for no space - * - * This function initializes the iowait - * structure embedded in the QP or PQ. - * - */ +void iowait_set_flag(struct iowait *wait, u32 flag); +bool iowait_flag_set(struct iowait *wait, u32 flag); +void iowait_clear_flag(struct iowait *wait, u32 flag); -static inline void iowait_init( - struct iowait *wait, - u32 tx_limit, - void (*func)(struct work_struct *work), - int (*sleep)( - struct sdma_engine *sde, - struct iowait *wait, - struct sdma_txreq *tx, - uint seq, - bool pkts_sent), - void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)) -{ - wait->count = 0; - wait->lock = NULL; - INIT_LIST_HEAD(&wait->list); - INIT_LIST_HEAD(&wait->tx_head); - INIT_WORK(&wait->iowork, func); - init_waitqueue_head(&wait->wait_dma); - init_waitqueue_head(&wait->wait_pio); - atomic_set(&wait->sdma_busy, 0); - atomic_set(&wait->pio_busy, 0); - wait->tx_limit = tx_limit; - wait->sleep = sleep; - wait->wakeup = wakeup; - wait->sdma_drained = sdma_drained; -} +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void 
(*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)); /** - * iowait_schedule() - initialize wait structure + * iowait_schedule() - schedule the default send engine work * @wait: wait struct to schedule * @wq: workqueue for schedule * @cpu: cpu */ -static inline void iowait_schedule( - struct iowait *wait, - struct workqueue_struct *wq, - int cpu) +static inline bool iowait_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) { - queue_work_on(cpu, wq, &wait->iowork); + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork); } /** @@ -228,6 +233,8 @@ static inline void iowait_sdma_add(struct iowait *wait, int count) */ static inline int iowait_sdma_dec(struct iowait *wait) { + if (!wait) + return 0; return atomic_dec_and_test(&wait->sdma_busy); } @@ -267,11 +274,13 @@ static inline void iowait_pio_inc(struct iowait *wait) } /** - * iowait_sdma_dec - note pio complete + * iowait_pio_dec - note pio complete * @wait: iowait structure */ static inline int iowait_pio_dec(struct iowait *wait) { + if (!wait) + return 0; return atomic_dec_and_test(&wait->pio_busy); } @@ -293,9 +302,9 @@ static inline void iowait_drain_wakeup(struct iowait *wait) /** * iowait_get_txhead() - get packet off of iowait list * - * @wait wait struture + * @wait iowait_work struture */ -static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) +static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait) { struct sdma_txreq *tx = NULL; @@ -309,6 +318,28 @@ static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) return tx; } +static inline u16 iowait_get_desc(struct iowait_work *w) +{ + u16 num_desc = 0; + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct 
sdma_txreq, + list); + num_desc = tx->num_desc; + } + return num_desc; +} + +static inline u32 iowait_get_all_desc(struct iowait *w) +{ + u32 num_desc = 0; + + num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]); + num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]); + return num_desc; +} + /** * iowait_queue - Put the iowait on a wait queue * @pkts_sent: have some packets been sent before queuing? @@ -372,12 +403,57 @@ static inline void iowait_starve_find_max(struct iowait *w, u8 *max, } /** - * iowait_packet_queued() - determine if a packet is already built - * @wait: the wait structure + * iowait_packet_queued() - determine if a packet is queued + * @wait: the iowait_work structure */ -static inline bool iowait_packet_queued(struct iowait *wait) +static inline bool iowait_packet_queued(struct iowait_work *wait) { return !list_empty(&wait->tx_head); } +/** + * inc_wait_count - increment wait counts + * @w: the log work struct + * @n: the count + */ +static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n) +{ + if (!w) + return; + w->iow->tx_count++; + w->iow->count += n; +} + +/** + * iowait_get_tid_work - return iowait_work for tid SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_tid_work(struct iowait *w) +{ + return &w->wait[IOWAIT_TID_SE]; +} + +/** + * iowait_get_ib_work - return iowait_work for ib SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_ib_work(struct iowait *w) +{ + return &w->wait[IOWAIT_IB_SE]; +} + +/** + * iowait_ioww_to_iow - return iowait given iowait_work + * @w: the iowait_work struct + */ +static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w) +{ + if (likely(w)) + return w->iow; + return NULL; +} + +void iowait_cancel_work(struct iowait *w); +int iowait_set_work_flag(struct iowait_work *w); + #endif diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 0307405491e0..88a0cf930136 100644 --- 
a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -4836,7 +4836,7 @@ static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, int ret; int pkey_idx; int local_mad = 0; - u32 resp_len = 0; + u32 resp_len = in_wc->byte_len - sizeof(*in_grh); struct hfi1_ibport *ibp = to_iport(ibdev, port); pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c new file mode 100644 index 000000000000..d920b165d696 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "affinity.h" +#include "sdma.h" + +/** + * msix_initialize() - Calculate, request and configure MSIx IRQs + * @dd: valid hfi1 devdata + * + */ +int msix_initialize(struct hfi1_devdata *dd) +{ + u32 total; + int ret; + struct hfi1_msix_entry *entries; + + /* + * MSIx interrupt count: + * one for the general, "slow path" interrupt + * one per used SDMA engine + * one per kernel receive context + * one for each VNIC context + * ...any new IRQs should be added here. 
+ */ + total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts; + + if (total >= CCE_NUM_MSIX_VECTORS) + return -EINVAL; + + ret = pci_alloc_irq_vectors(dd->pcidev, total, total, PCI_IRQ_MSIX); + if (ret < 0) { + dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", ret); + return ret; + } + + entries = kcalloc(total, sizeof(*dd->msix_info.msix_entries), + GFP_KERNEL); + if (!entries) { + pci_free_irq_vectors(dd->pcidev); + return -ENOMEM; + } + + dd->msix_info.msix_entries = entries; + spin_lock_init(&dd->msix_info.msix_lock); + bitmap_zero(dd->msix_info.in_use_msix, total); + dd->msix_info.max_requested = total; + dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); + + return 0; +} + +/** + * msix_request_irq() - Allocate a free MSIx IRQ + * @dd: valid devdata + * @arg: context information for the IRQ + * @handler: IRQ handler + * @thread: IRQ thread handler (could be NULL) + * @idx: zero base idx if multiple devices are needed + * @type: affinity IRQ type + * + * Allocate an MSIx vector if available, and then create the appropriate + * meta data needed to keep track of the pci IRQ request.
+ * + * Return: + * < 0 Error + * >= 0 MSIx vector + * + */ +static int msix_request_irq(struct hfi1_devdata *dd, void *arg, + irq_handler_t handler, irq_handler_t thread, + u32 idx, enum irq_type type) +{ + unsigned long nr; + int irq; + int ret; + const char *err_info; + char name[MAX_NAME_SIZE]; + struct hfi1_msix_entry *me; + + /* Allocate an MSIx vector */ + spin_lock(&dd->msix_info.msix_lock); + nr = find_first_zero_bit(dd->msix_info.in_use_msix, + dd->msix_info.max_requested); + if (nr < dd->msix_info.max_requested) + __set_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + + if (nr == dd->msix_info.max_requested) + return -ENOSPC; + + /* Specific verification and determine the name */ + switch (type) { + case IRQ_GENERAL: + /* general interrupt must be MSIx vector 0 */ + if (nr) { + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + dd_dev_err(dd, "Invalid index %lu for GENERAL IRQ\n", + nr); + return -EINVAL; + } + snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit); + err_info = "general"; + break; + case IRQ_SDMA: + snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d", + dd->unit, idx); + err_info = "sdma"; + break; + case IRQ_RCVCTXT: + snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d", + dd->unit, idx); + err_info = "receive context"; + break; + case IRQ_OTHER: + default: + return -EINVAL; + } + name[sizeof(name) - 1] = 0; + + irq = pci_irq_vector(dd->pcidev, nr); + ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name); + if (ret) { + dd_dev_err(dd, + "%s: request for IRQ %d failed, MSIx %d, err %d\n", + err_info, irq, idx, ret); + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + return ret; + } + + /* + * assign arg after pci_request_irq call, so it will be + * cleaned up + */ + me = &dd->msix_info.msix_entries[nr]; + me->irq = irq; + me->arg = arg; + 
me->type = type; + + /* This is a request, so a failure is not fatal */ + ret = hfi1_get_irq_affinity(dd, me); + if (ret) + dd_dev_err(dd, "unable to pin IRQ %d\n", ret); + + return nr; +} + +/** + * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs + * @rcd: valid rcd context + * + */ +int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd) +{ + int nr; + + nr = msix_request_irq(rcd->dd, rcd, receive_context_interrupt, + receive_context_thread, rcd->ctxt, IRQ_RCVCTXT); + if (nr < 0) + return nr; + + /* + * Set the interrupt register and mask for this + * context's interrupt. + */ + rcd->ireg = (IS_RCVAVAIL_START + rcd->ctxt) / 64; + rcd->imask = ((u64)1) << ((IS_RCVAVAIL_START + rcd->ctxt) % 64); + rcd->msix_intr = nr; + remap_intr(rcd->dd, IS_RCVAVAIL_START + rcd->ctxt, nr); + + return 0; +} + +/** + * msix_request_sdma_irq() - Helper for getting SDMA IRQ resources + * @sde: valid sdma engine + * + */ +int msix_request_sdma_irq(struct sdma_engine *sde) +{ + int nr; + + nr = msix_request_irq(sde->dd, sde, sdma_interrupt, NULL, + sde->this_idx, IRQ_SDMA); + if (nr < 0) + return nr; + sde->msix_intr = nr; + remap_sdma_interrupts(sde->dd, sde->this_idx, nr); + + return 0; +} + +/** + * enable_sdma_srcs() - Helper to enable SDMA IRQ srcs + * @dd: valid devdata structure + * @i: index of SDMA engine + */ +static void enable_sdma_srcs(struct hfi1_devdata *dd, int i) +{ + set_intr_bits(dd, IS_SDMA_START + i, IS_SDMA_START + i, true); + set_intr_bits(dd, IS_SDMA_PROGRESS_START + i, + IS_SDMA_PROGRESS_START + i, true); + set_intr_bits(dd, IS_SDMA_IDLE_START + i, IS_SDMA_IDLE_START + i, true); + set_intr_bits(dd, IS_SDMAENG_ERR_START + i, IS_SDMAENG_ERR_START + i, + true); +} + +/** + * msix_request_irqs() - Allocate all MSIx IRQs + * @dd: valid devdata structure + * + * Helper function to request the used MSIx IRQs.
+ * + */ +int msix_request_irqs(struct hfi1_devdata *dd) +{ + int i; + int ret; + + ret = msix_request_irq(dd, dd, general_interrupt, NULL, 0, IRQ_GENERAL); + if (ret < 0) + return ret; + + for (i = 0; i < dd->num_sdma; i++) { + struct sdma_engine *sde = &dd->per_sdma[i]; + + ret = msix_request_sdma_irq(sde); + if (ret) + return ret; + enable_sdma_srcs(sde->dd, i); + } + + for (i = 0; i < dd->n_krcv_queues; i++) { + struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index_safe(dd, i); + + if (rcd) + ret = msix_request_rcd_irq(rcd); + hfi1_rcd_put(rcd); + if (ret) + return ret; + } + + return 0; +} + +/** + * msix_free_irq() - Free the specified MSIx resources and IRQ + * @dd: valid devdata + * @msix_intr: MSIx vector to free. + * + */ +void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr) +{ + struct hfi1_msix_entry *me; + + if (msix_intr >= dd->msix_info.max_requested) + return; + + me = &dd->msix_info.msix_entries[msix_intr]; + + if (!me->arg) /* => no irq, no affinity */ + return; + + hfi1_put_irq_affinity(dd, me); + pci_free_irq(dd->pcidev, msix_intr, me->arg); + + me->arg = NULL; + + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(msix_intr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); +} + +/** + * msix_clean_up_interrupts() - Free all MSIx IRQ resources + * @dd: valid device data structure + * + * Free the MSIx and associated PCI resources, if they have been allocated.
+ */ +void msix_clean_up_interrupts(struct hfi1_devdata *dd) +{ + int i; + struct hfi1_msix_entry *me = dd->msix_info.msix_entries; + + /* remove irqs - must happen before disabling/turning off */ + for (i = 0; i < dd->msix_info.max_requested; i++, me++) + msix_free_irq(dd, i); + + /* clean structures */ + kfree(dd->msix_info.msix_entries); + dd->msix_info.msix_entries = NULL; + dd->msix_info.max_requested = 0; + + pci_free_irq_vectors(dd->pcidev); +} + +/** + * msix_vnic_synchronize_irq() - Vnic IRQ synchronize + * @dd: valid devdata + */ +void msix_vnic_synchronize_irq(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->vnic.num_ctxt; i++) { + struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; + struct hfi1_msix_entry *me; + + me = &dd->msix_info.msix_entries[rcd->msix_intr]; + + synchronize_irq(me->irq); + } +} diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h new file mode 100644 index 000000000000..a514881632a4 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/msix.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details.
+ * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ +#ifndef _HFI1_MSIX_H +#define _HFI1_MSIX_H + +#include "hfi.h" + +/* MSIx interface */ +int msix_initialize(struct hfi1_devdata *dd); +int msix_request_irqs(struct hfi1_devdata *dd); +void msix_clean_up_interrupts(struct hfi1_devdata *dd); +int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd); +int msix_request_sdma_irq(struct sdma_engine *sde); +void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr); + +/* VNIC interface */ +void msix_vnic_synchronize_irq(struct hfi1_devdata *dd); + +#endif diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index cca413eaa74e..c96d193bb236 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -61,19 +61,12 @@ */ /* - * Code to adjust PCIe capabilities. - */ -static void tune_pcie_caps(struct hfi1_devdata *); - -/* * Do all the common PCIe setup and initialization. - * devdata is not yet allocated, and is not allocated until after this - * routine returns success. Therefore dd_dev_err() can't be used for error - * printing. */ -int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +int hfi1_pcie_init(struct hfi1_devdata *dd) { int ret; + struct pci_dev *pdev = dd->pcidev; ret = pci_enable_device(pdev); if (ret) { @@ -89,15 +82,13 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) * about that, it appears. If the original BAR was retained * in the kernel data structures, this may be OK. 
*/ - hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n", - -ret); - goto done; + dd_dev_err(dd, "pci enable failed: error %d\n", -ret); + return ret; } ret = pci_request_regions(pdev, DRIVER_NAME); if (ret) { - hfi1_early_err(&pdev->dev, - "pci_request_regions fails: err %d\n", -ret); + dd_dev_err(dd, "pci_request_regions fails: err %d\n", -ret); goto bail; } @@ -110,8 +101,7 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) */ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) { - hfi1_early_err(&pdev->dev, - "Unable to set DMA mask: %d\n", ret); + dd_dev_err(dd, "Unable to set DMA mask: %d\n", ret); goto bail; } ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); @@ -119,18 +109,16 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); } if (ret) { - hfi1_early_err(&pdev->dev, - "Unable to set DMA consistent mask: %d\n", ret); + dd_dev_err(dd, "Unable to set DMA consistent mask: %d\n", ret); goto bail; } pci_set_master(pdev); (void)pci_enable_pcie_error_reporting(pdev); - goto done; + return 0; bail: hfi1_pcie_cleanup(pdev); -done: return ret; } @@ -206,7 +194,7 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) dd_dev_err(dd, "WC mapping of send buffers failed\n"); goto nomem; } - dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE); + dd_dev_info(dd, "WC piobase: %p for %x\n", dd->piobase, TXE_PIO_SIZE); dd->physaddr = addr; /* used for io_remap, etc. 
*/ @@ -344,26 +332,6 @@ int pcie_speeds(struct hfi1_devdata *dd) return 0; } -/* - * Returns: - * - actual number of interrupts allocated or - * - error - */ -int request_msix(struct hfi1_devdata *dd, u32 msireq) -{ - int nvec; - - nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX); - if (nvec < 0) { - dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec); - return nvec; - } - - tune_pcie_caps(dd); - - return nvec; -} - /* restore command and BARs after a reset has wiped them out */ int restore_pci_variables(struct hfi1_devdata *dd) { @@ -479,14 +447,19 @@ error: * Check and optionally adjust them to maximize our throughput. */ static int hfi1_pcie_caps; -module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO); +module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444); MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); uint aspm_mode = ASPM_MODE_DISABLED; -module_param_named(aspm, aspm_mode, uint, S_IRUGO); +module_param_named(aspm, aspm_mode, uint, 0444); MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); -static void tune_pcie_caps(struct hfi1_devdata *dd) +/** + * tune_pcie_caps() - Code to adjust PCIe capabilities. + * @dd: Valid device data structure + * + */ +void tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; u16 rc_mpss, rc_mps, ep_mpss, ep_mps; @@ -1028,6 +1001,7 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) const u8 (*ctle_tunings)[4]; uint static_ctle_mode; int return_error = 0; + u32 target_width; /* PCIe Gen3 is for the ASIC only */ if (dd->icode != ICODE_RTL_SILICON) @@ -1067,6 +1041,9 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) return 0; } + /* Previous Gen1/Gen2 bus width */ + target_width = dd->lbus_width; + /* * Do the Gen3 transition. Steps are those of the PCIe Gen3 * recipe. 
@@ -1435,11 +1412,12 @@ retry: dd_dev_info(dd, "%s: new speed and width: %s\n", __func__, dd->lbus_info); - if (dd->lbus_speed != target_speed) { /* not target */ + if (dd->lbus_speed != target_speed || + dd->lbus_width < target_width) { /* not target */ /* maybe retry */ do_retry = retry_count < pcie_retry; - dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n", - pcie_target, do_retry ? ", retrying" : ""); + dd_dev_err(dd, "PCIe link speed or width did not match target%s\n", + do_retry ? ", retrying" : ""); retry_count++; if (do_retry) { msleep(100); /* allow time to settle */ diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index 752057647f09..9ab50d2308dc 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -71,14 +71,6 @@ void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl) } } -/* defined in header release 48 and higher */ -#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT -#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 -#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull -#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ - << SEND_CTRL_UNSUPPORTED_VL_SHIFT) -#endif - /* global control of PIO send */ void pio_send_control(struct hfi1_devdata *dd, int op) { diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 9b1e84a6b1cc..6f3bc4dab858 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(qp_table_size, "QP table size"); static void flush_tx_list(struct rvt_qp *qp); static int iowait_sleep( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *stx, unsigned int seq, bool pkts_sent); @@ -134,15 +134,13 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { }; -static void flush_tx_list(struct rvt_qp *qp) +static void flush_list_head(struct list_head *l) { - struct hfi1_qp_priv *priv = qp->priv; - - while 
(!list_empty(&priv->s_iowait.tx_head)) { + while (!list_empty(l)) { struct sdma_txreq *tx; tx = list_first_entry( - &priv->s_iowait.tx_head, + l, struct sdma_txreq, list); list_del_init(&tx->list); @@ -151,6 +149,14 @@ static void flush_tx_list(struct rvt_qp *qp) } } +static void flush_tx_list(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + flush_list_head(&iowait_get_ib_work(&priv->s_iowait)->tx_head); + flush_list_head(&iowait_get_tid_work(&priv->s_iowait)->tx_head); +} + static void flush_iowait(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; @@ -282,33 +288,46 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, } /** - * hfi1_check_send_wqe - validate wqe + * hfi1_setup_wqe - set up the wqe * @qp - The qp * @wqe - The built wqe + * @call_send - Determine if the send should be posted or scheduled. * - * validate wqe. This is called - * prior to inserting the wqe into - * the ring but after the wqe has been - * setup. + * Perform setup of the wqe. This is called + * prior to inserting the wqe into the ring but after + * the wqe has been setup by RDMAVT. This function + * allows the driver the opportunity to perform + * validation and additional setup of the wqe. * * Returns 0 on success, -EINVAL on failure * */ -int hfi1_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe) +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct rvt_ah *ah; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; switch (qp->ibqp.qp_type) { case IB_QPT_RC: case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; + if (wqe->length > qp->pmtu) + *call_send = false; break; case IB_QPT_SMI: - ah = ibah_to_rvtah(wqe->ud_wr.ah); - if (wqe->length > (1 << ah->log_pmtu)) + /* + * SM packets should exclusively use VL15 and their SL is + * ignored (IBTA v1.3, Section 3.5.8.2). 
Therefore, when ah + * is created, SL is 0 in most cases and as a result some + * fields (vl and pmtu) in ah may not be set correctly, + * depending on the SL2SC and SC2VL tables at the time. + */ + ppd = ppd_from_ibp(ibp); + dd = dd_from_ppd(ppd); + if (wqe->length > dd->vld[15].mtu) return -EINVAL; break; case IB_QPT_GSI: @@ -321,7 +340,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, default: break; } - return wqe->length <= piothreshold; + return 0; } /** @@ -333,7 +352,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, * It is only used in the post send, which doesn't hold * the s_lock. */ -void _hfi1_schedule_send(struct rvt_qp *qp) +bool _hfi1_schedule_send(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = @@ -341,10 +360,10 @@ void _hfi1_schedule_send(struct rvt_qp *qp) struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); - iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, - priv->s_sde ? - priv->s_sde->cpu : - cpumask_first(cpumask_of_node(dd->node))); + return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? + priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); } static void qp_pio_drain(struct rvt_qp *qp) @@ -372,12 +391,32 @@ static void qp_pio_drain(struct rvt_qp *qp) * * This schedules qp progress and caller should hold * the s_lock. + * @return true if the first leg is scheduled; + * false if the first leg is not scheduled. 
*/ -void hfi1_schedule_send(struct rvt_qp *qp) +bool hfi1_schedule_send(struct rvt_qp *qp) { lockdep_assert_held(&qp->s_lock); - if (hfi1_send_ok(qp)) + if (hfi1_send_ok(qp)) { _hfi1_schedule_send(qp); + return true; + } + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, + IOWAIT_PENDING_IB); + return false; +} + +static void hfi1_qp_schedule(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + bool ret; + + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB)) { + ret = hfi1_schedule_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + } } void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) @@ -388,16 +427,22 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) if (qp->s_flags & flag) { qp->s_flags &= ~flag; trace_hfi1_qpwakeup(qp, flag); - hfi1_schedule_send(qp); + hfi1_qp_schedule(qp); } spin_unlock_irqrestore(&qp->s_lock, flags); /* Notify hfi1_destroy_qp() if it is waiting. */ rvt_put_qp(qp); } +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait) +{ + if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) + qp->s_flags &= ~RVT_S_BUSY; +} + static int iowait_sleep( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *stx, uint seq, bool pkts_sent) @@ -438,7 +483,7 @@ static int iowait_sleep( rvt_get_qp(qp); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, wait); spin_unlock_irqrestore(&qp->s_lock, flags); ret = -EBUSY; } else { @@ -637,6 +682,7 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) &priv->s_iowait, 1, _hfi1_do_send, + NULL, iowait_sleep, iowait_wakeup, iowait_sdma_drained); @@ -686,7 +732,7 @@ void stop_send_queue(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - cancel_work_sync(&priv->s_iowait.iowork); + iowait_cancel_work(&priv->s_iowait); } void quiesce_qp(struct rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/qp.h 
b/drivers/infiniband/hw/hfi1/qp.h index 078cff7560b6..7adb6dff6813 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -58,18 +58,6 @@ extern unsigned int hfi1_qp_table_size; extern const struct rvt_operation_params hfi1_post_parms[]; /* - * Send if not busy or waiting for I/O and either - * a RC response is pending or we can process send work requests. - */ -static inline int hfi1_send_ok(struct rvt_qp *qp) -{ - return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && - (verbs_txreq_queued(qp) || - (qp->s_flags & RVT_S_RESP_PENDING) || - !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); -} - -/* * Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK * * HFI1_S_AHG_VALID - ahg header valid on chip @@ -90,6 +78,20 @@ static inline int hfi1_send_ok(struct rvt_qp *qp) #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) /* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. 
+ */ +static inline int hfi1_send_ok(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)) && + (verbs_txreq_queued(iowait_get_ib_work(&priv->s_iowait)) || + (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); +} + +/* * free_ahg - clear ahg from QP */ static inline void clear_ahg(struct rvt_qp *qp) @@ -129,8 +131,8 @@ struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); -void _hfi1_schedule_send(struct rvt_qp *qp); -void hfi1_schedule_send(struct rvt_qp *qp); +bool _hfi1_schedule_send(struct rvt_qp *qp); +bool hfi1_schedule_send(struct rvt_qp *qp); void hfi1_migrate_qp(struct rvt_qp *qp); @@ -150,4 +152,5 @@ void quiesce_qp(struct rvt_qp *qp); u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); int mtu_to_path_mtu(u32 mtu); void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl); +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait); #endif /* _QP_H */ diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 9bd63abb2dfe..188aa4f686a0 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -309,7 +309,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done_free_tx; @@ -378,9 +378,9 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) wqe->wr.ex.invalidate_rkey); local_ops = 1; } - hfi1_send_complete(qp, wqe, - err ? IB_WC_LOC_PROT_ERR - : IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, + err ? 
IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); if (local_ops) atomic_dec(&qp->local_ops_pending); goto done_free_tx; @@ -1043,7 +1043,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; } else { /* need to handle delayed completion */ @@ -1468,7 +1468,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1644,7 +1644,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1684,7 +1685,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void)do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -1704,7 +1706,7 @@ ack_len_err: status = IB_WC_LOC_LEN_ERR; ack_err: if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: @@ -2144,7 +2146,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -2200,7 +2202,7 @@ send_last: wc.byte_len = 
tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); rvt_put_ss(&qp->r_sge); qp->r_msn++; if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 5f56f3c1b4c4..7fb317c711df 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -156,333 +156,6 @@ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet) } /** - * ruc_loopback - handle UC and RC loopback requests - * @sqp: the sending QP - * - * This is called from hfi1_do_send() to - * forward a WQE addressed to the same HFI. - * Note that although we are single threaded due to the send engine, we still - * have to protect against post_send(). We don't have to worry about - * receive interrupts since this is a connected protocol and all packets - * will pass through here. - */ -static void ruc_loopback(struct rvt_qp *sqp) -{ - struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct rvt_qp *qp; - struct rvt_swqe *wqe; - struct rvt_sge *sge; - unsigned long flags; - struct ib_wc wc; - u64 sdata; - atomic64_t *maddr; - enum ib_wc_status send_status; - bool release; - int ret; - bool copy_last = false; - int local_ops = 0; - - rcu_read_lock(); - - /* - * Note that we check the responder QP state after - * checking the requester's state. - */ - qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, - sqp->remote_qpn); - - spin_lock_irqsave(&sqp->s_lock, flags); - - /* Return if we are already busy processing a work request. 
*/ - if ((sqp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT)) || - !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - goto unlock; - - sqp->s_flags |= RVT_S_BUSY; - -again: - if (sqp->s_last == READ_ONCE(sqp->s_head)) - goto clr_busy; - wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); - - /* Return if it is not OK to start a new work request. */ - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { - if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) - goto clr_busy; - /* We are in the error state, flush the work request. */ - send_status = IB_WC_WR_FLUSH_ERR; - goto flush_send; - } - - /* - * We can rely on the entry not changing without the s_lock - * being held until we update s_last. - * We increment s_cur to indicate s_last is in progress. - */ - if (sqp->s_last == sqp->s_cur) { - if (++sqp->s_cur >= sqp->s_size) - sqp->s_cur = 0; - } - spin_unlock_irqrestore(&sqp->s_lock, flags); - - if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || - qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->rvp.n_pkt_drops++; - /* - * For RC, the requester would timeout and retry so - * shortcut the timeouts and just signal too many retries. 
- */ - if (sqp->ibqp.qp_type == IB_QPT_RC) - send_status = IB_WC_RETRY_EXC_ERR; - else - send_status = IB_WC_SUCCESS; - goto serr; - } - - memset(&wc, 0, sizeof(wc)); - send_status = IB_WC_SUCCESS; - - release = true; - sqp->s_sge.sge = wqe->sg_list[0]; - sqp->s_sge.sg_list = wqe->sg_list + 1; - sqp->s_sge.num_sge = wqe->wr.num_sge; - sqp->s_len = wqe->length; - switch (wqe->wr.opcode) { - case IB_WR_REG_MR: - goto send_comp; - - case IB_WR_LOCAL_INV: - if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { - if (rvt_invalidate_rkey(sqp, - wqe->wr.ex.invalidate_rkey)) - send_status = IB_WC_LOC_PROT_ERR; - local_ops = 1; - } - goto send_comp; - - case IB_WR_SEND_WITH_INV: - if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { - wc.wc_flags = IB_WC_WITH_INVALIDATE; - wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; - } - goto send; - - case IB_WR_SEND_WITH_IMM: - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - /* FALLTHROUGH */ - case IB_WR_SEND: -send: - ret = rvt_get_rwqe(qp, false); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - break; - - case IB_WR_RDMA_WRITE_WITH_IMM: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = rvt_get_rwqe(qp, true); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - /* skip copy_last set and qp_access_flags recheck */ - goto do_write; - case IB_WR_RDMA_WRITE: - copy_last = rvt_is_user_qp(qp); - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; -do_write: - if (wqe->length == 0) - break; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_WRITE))) - goto acc_err; - qp->r_sge.sg_list = NULL; - qp->r_sge.num_sge = 1; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_RDMA_READ: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) - goto inv_err; - if 
(unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_READ))) - goto acc_err; - release = false; - sqp->s_sge.sg_list = NULL; - sqp->s_sge.num_sge = 1; - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->wr.num_sge; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - wqe->atomic_wr.remote_addr, - wqe->atomic_wr.rkey, - IB_ACCESS_REMOTE_ATOMIC))) - goto acc_err; - /* Perform atomic OP and save result. */ - maddr = (atomic64_t *)qp->r_sge.sge.vaddr; - sdata = wqe->atomic_wr.compare_add; - *(u64 *)sqp->s_sge.sge.vaddr = - (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? - (u64)atomic64_add_return(sdata, maddr) - sdata : - (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, - sdata, wqe->atomic_wr.swap); - rvt_put_mr(qp->r_sge.sge.mr); - qp->r_sge.num_sge = 0; - goto send_comp; - - default: - send_status = IB_WC_LOC_QP_OP_ERR; - goto serr; - } - - sge = &sqp->s_sge.sge; - while (sqp->s_len) { - u32 len = sqp->s_len; - - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; - WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (!release) - rvt_put_mr(sge->mr); - if (--sqp->s_sge.num_sge) - *sge = *sqp->s_sge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } - sqp->s_len -= len; - } - if (release) - rvt_put_ss(&qp->r_sge); - - if 
(!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) - goto send_comp; - - if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) - wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - else - wc.opcode = IB_WC_RECV; - wc.wr_id = qp->r_wr_id; - wc.status = IB_WC_SUCCESS; - wc.byte_len = wqe->length; - wc.qp = &qp->ibqp; - wc.src_qp = qp->remote_qpn; - wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; - wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); - wc.port_num = 1; - /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); - -send_comp: - spin_lock_irqsave(&sqp->s_lock, flags); - ibp->rvp.n_loop_pkts++; -flush_send: - sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - hfi1_send_complete(sqp, wqe, send_status); - if (local_ops) { - atomic_dec(&sqp->local_ops_pending); - local_ops = 0; - } - goto again; - -rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - ibp->rvp.n_rnr_naks++; - /* - * Note: we don't need the s_lock held since the BUSY flag - * makes this single threaded. 
- */ - if (sqp->s_rnr_retry == 0) { - send_status = IB_WC_RNR_RETRY_EXC_ERR; - goto serr; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) - goto clr_busy; - rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << - IB_AETH_CREDIT_SHIFT); - goto clr_busy; - -op_err: - send_status = IB_WC_REM_OP_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -inv_err: - send_status = IB_WC_REM_INV_REQ_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -acc_err: - send_status = IB_WC_REM_ACCESS_ERR; - wc.status = IB_WC_LOC_PROT_ERR; -err: - /* responder goes to error state */ - rvt_rc_error(qp, wc.status); - -serr: - spin_lock_irqsave(&sqp->s_lock, flags); - hfi1_send_complete(sqp, wqe, send_status); - if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - - sqp->s_flags &= ~RVT_S_BUSY; - spin_unlock_irqrestore(&sqp->s_lock, flags); - if (lastwqe) { - struct ib_event ev; - - ev.device = sqp->ibqp.device; - ev.element.qp = &sqp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); - } - goto done; - } -clr_busy: - sqp->s_flags &= ~RVT_S_BUSY; -unlock: - spin_unlock_irqrestore(&sqp->s_lock, flags); -done: - rcu_read_unlock(); -} - -/** * hfi1_make_grh - construct a GRH header * @ibp: a pointer to the IB port * @hdr: a pointer to the GRH header being constructed @@ -825,8 +498,8 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp) void _hfi1_do_send(struct work_struct *work) { - struct iowait *wait = container_of(work, struct iowait, iowork); - struct rvt_qp *qp = iowait_to_qp(wait); + struct iowait_work *w = container_of(work, struct iowait_work, iowork); + struct rvt_qp *qp = iowait_to_qp(w->iow); hfi1_do_send(qp, true); } @@ -850,6 +523,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.ibp = to_iport(qp->ibqp.device, qp->port_num); ps.ppd = ppd_from_ibp(ps.ibp); ps.in_thread = 
in_thread; + ps.wait = iowait_get_ib_work(&priv->s_iowait); trace_hfi1_rc_do_send(qp, in_thread); @@ -858,7 +532,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ps.ppd->lmc) - 1)) == ps.ppd->lid)) { - ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } make_req = hfi1_make_rc_req; @@ -868,7 +542,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ps.ppd->lmc) - 1)) == ps.ppd->lid)) { - ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } make_req = hfi1_make_uc_req; @@ -883,6 +557,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) /* Return if we are already busy processing a work request. */ if (!hfi1_send_ok(qp)) { + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); spin_unlock_irqrestore(&qp->s_lock, ps.flags); return; } @@ -896,7 +572,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.pkts_sent = false; /* insure a pre-built packet is handled */ - ps.s_txreq = get_waiting_verbs_txreq(qp); + ps.s_txreq = get_waiting_verbs_txreq(ps.wait); do { /* Check for a constructed packet to be sent. */ if (ps.s_txreq) { @@ -907,6 +583,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) */ if (hfi1_verbs_send(qp, &ps)) return; + /* allow other tasks to run */ if (schedule_send_yield(qp, &ps)) return; @@ -917,44 +594,3 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); spin_unlock_irqrestore(&qp->s_lock, ps.flags); } - -/* - * This should be called with s_lock held. 
- */ -void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status) -{ - u32 old_last, last; - - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - return; - - last = qp->s_last; - old_last = last; - trace_hfi1_qp_send_completion(qp, wqe, last); - if (++last >= qp->s_size) - last = 0; - trace_hfi1_qp_send_completion(qp, wqe, last); - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_swqe(wqe); - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); - - rvt_qp_swqe_complete(qp, - wqe, - ib_hfi1_wc_opcode[wqe->wr.opcode], - status); - - if (qp->s_acked == old_last) - qp->s_acked = last; - if (qp->s_cur == old_last) - qp->s_cur = last; - if (qp->s_tail == old_last) - qp->s_tail = last; - if (qp->state == IB_QPS_SQD && last == qp->s_cur) - qp->s_draining = 0; -} diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 88e326d6cc49..891d2386d1ca 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -378,7 +378,7 @@ static inline void complete_tx(struct sdma_engine *sde, __sdma_txclean(sde->dd, tx); if (complete) (*complete)(tx, res); - if (wait && iowait_sdma_dec(wait)) + if (iowait_sdma_dec(wait)) iowait_drain_wakeup(wait); } @@ -1758,7 +1758,6 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) struct iowait *wait, *nw; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; uint i, n = 0, seq, max_idx = 0; - struct sdma_txreq *stx; struct hfi1_ibdev *dev = &sde->dd->verbs_dev; u8 max_starved_cnt = 0; @@ -1779,19 +1778,13 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) nw, &sde->dmawait, list) { - u16 num_desc = 0; + u32 num_desc; if (!wait->wakeup) continue; if (n == ARRAY_SIZE(waits)) break; - if (!list_empty(&wait->tx_head)) { - stx = list_first_entry( - &wait->tx_head, - struct sdma_txreq, - list); - 
num_desc = stx->num_desc; - } + num_desc = iowait_get_all_desc(wait); if (num_desc > avail) break; avail -= num_desc; @@ -2346,7 +2339,7 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) */ static int sdma_check_progress( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent) { @@ -2356,12 +2349,12 @@ static int sdma_check_progress( if (tx->num_desc <= sde->desc_avail) return -EAGAIN; /* pulse the head_lock */ - if (wait && wait->sleep) { + if (wait && iowait_ioww_to_iow(wait)->sleep) { unsigned seq; seq = raw_seqcount_begin( (const seqcount_t *)&sde->head_lock.seqcount); - ret = wait->sleep(sde, wait, tx, seq, pkts_sent); + ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent); if (ret == -EAGAIN) sde->desc_avail = sdma_descq_freecnt(sde); } else { @@ -2373,7 +2366,7 @@ static int sdma_check_progress( /** * sdma_send_txreq() - submit a tx req to ring * @sde: sdma engine to use - * @wait: wait structure to use when full (may be NULL) + * @wait: SE wait structure to use when full (may be NULL) * @tx: sdma_txreq to submit * @pkts_sent: has any packet been sent yet? 
* @@ -2386,7 +2379,7 @@ static int sdma_check_progress( * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ int sdma_send_txreq(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent) { @@ -2397,7 +2390,7 @@ int sdma_send_txreq(struct sdma_engine *sde, /* user should have supplied entire packet */ if (unlikely(tx->tlen)) return -EINVAL; - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); spin_lock_irqsave(&sde->tail_lock, flags); retry: if (unlikely(!__sdma_running(sde))) @@ -2406,14 +2399,14 @@ retry: goto nodesc; tail = submit_tx(sde, tx); if (wait) - iowait_sdma_inc(wait); + iowait_sdma_inc(iowait_ioww_to_iow(wait)); sdma_update_tail(sde, tail); unlock: spin_unlock_irqrestore(&sde->tail_lock, flags); return ret; unlock_noconn: if (wait) - iowait_sdma_inc(wait); + iowait_sdma_inc(iowait_ioww_to_iow(wait)); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER tx->sn = sde->tail_sn++; @@ -2422,10 +2415,7 @@ unlock_noconn: spin_lock(&sde->flushlist_lock); list_add_tail(&tx->list, &sde->flushlist); spin_unlock(&sde->flushlist_lock); - if (wait) { - wait->tx_count++; - wait->count += tx->num_desc; - } + iowait_inc_wait_count(wait, tx->num_desc); schedule_work(&sde->flush_worker); ret = -ECOMM; goto unlock; @@ -2442,9 +2432,9 @@ nodesc: /** * sdma_send_txlist() - submit a list of tx req to ring * @sde: sdma engine to use - * @wait: wait structure to use when full (may be NULL) + * @wait: SE wait structure to use when full (may be NULL) * @tx_list: list of sdma_txreqs to submit - * @count: pointer to a u32 which, after return will contain the total number of + * @count: pointer to a u16 which, after return will contain the total number of * sdma_txreqs removed from the tx_list. 
This will include sdma_txreqs * whose SDMA descriptors are submitted to the ring and the sdma_txreqs * which are added to SDMA engine flush list if the SDMA engine state is @@ -2467,8 +2457,8 @@ nodesc: * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL) * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ -int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, - struct list_head *tx_list, u32 *count_out) +int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait, + struct list_head *tx_list, u16 *count_out) { struct sdma_txreq *tx, *tx_next; int ret = 0; @@ -2479,7 +2469,7 @@ int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, spin_lock_irqsave(&sde->tail_lock, flags); retry: list_for_each_entry_safe(tx, tx_next, tx_list, list) { - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); if (unlikely(!__sdma_running(sde))) goto unlock_noconn; if (unlikely(tx->num_desc > sde->desc_avail)) @@ -2500,8 +2490,9 @@ retry: update_tail: total_count = submit_count + flush_count; if (wait) { - iowait_sdma_add(wait, total_count); - iowait_starve_clear(submit_count > 0, wait); + iowait_sdma_add(iowait_ioww_to_iow(wait), total_count); + iowait_starve_clear(submit_count > 0, + iowait_ioww_to_iow(wait)); } if (tail != INVALID_TAIL) sdma_update_tail(sde, tail); @@ -2511,7 +2502,7 @@ update_tail: unlock_noconn: spin_lock(&sde->flushlist_lock); list_for_each_entry_safe(tx, tx_next, tx_list, list) { - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); list_del_init(&tx->list); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER @@ -2520,10 +2511,7 @@ unlock_noconn: #endif list_add_tail(&tx->list, &sde->flushlist); flush_count++; - if (wait) { - wait->tx_count++; - wait->count += tx->num_desc; - } + iowait_inc_wait_count(wait, tx->num_desc); } spin_unlock(&sde->flushlist_lock); schedule_work(&sde->flush_worker); diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 
46c775f255d1..6dc63d7c5685 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -1,7 +1,7 @@ #ifndef _HFI1_SDMA_H #define _HFI1_SDMA_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -62,16 +62,6 @@ /* Hardware limit for SDMA packet size */ #define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1) -#define SDMA_TXREQ_S_OK 0 -#define SDMA_TXREQ_S_SENDERROR 1 -#define SDMA_TXREQ_S_ABORTED 2 -#define SDMA_TXREQ_S_SHUTDOWN 3 - -/* flags bits */ -#define SDMA_TXREQ_F_URGENT 0x0001 -#define SDMA_TXREQ_F_AHG_COPY 0x0002 -#define SDMA_TXREQ_F_USE_AHG 0x0004 - #define SDMA_MAP_NONE 0 #define SDMA_MAP_SINGLE 1 #define SDMA_MAP_PAGE 2 @@ -415,6 +405,7 @@ struct sdma_engine { struct list_head flushlist; struct cpumask cpu_mask; struct kobject kobj; + u32 msix_intr; }; int sdma_init(struct hfi1_devdata *dd, u8 port); @@ -849,16 +840,16 @@ static inline int sdma_txadd_kvaddr( dd, SDMA_MAP_SINGLE, tx, addr, len); } -struct iowait; +struct iowait_work; int sdma_send_txreq(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent); int sdma_send_txlist(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct list_head *tx_list, - u32 *count); + u16 *count_out); int sdma_ahg_alloc(struct sdma_engine *sde); void sdma_ahg_free(struct sdma_engine *sde, int ahg_index); diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index 25e867393463..2be513d4c9da 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -494,17 +494,18 @@ static struct kobj_type hfi1_vl2mtu_ktype = { * Start of per-unit (or driver, in some cases, but replicated * per unit) functions (these get a device *) */ -static ssize_t show_rev(struct device *device, 
struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) { struct hfi1_ibdev *dev = container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hfi(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); @@ -517,8 +518,9 @@ static ssize_t show_hfi(struct device *device, struct device_attribute *attr, ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); return ret; } +static DEVICE_ATTR_RO(board_id); -static ssize_t show_boardversion(struct device *device, +static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -528,8 +530,9 @@ static ssize_t show_boardversion(struct device *device, /* The string printed here is already newline-terminated. */ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); } +static DEVICE_ATTR_RO(boardversion); -static ssize_t show_nctxts(struct device *device, +static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -546,8 +549,9 @@ static ssize_t show_nctxts(struct device *device, min(dd->num_user_contexts, (u32)dd->sc_sizes[SC_USER].count)); } +static DEVICE_ATTR_RO(nctxts); -static ssize_t show_nfreectxts(struct device *device, +static ssize_t nfreectxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -557,8 +561,9 @@ static ssize_t show_nfreectxts(struct device *device, /* Return the number of free user ports (contexts) available. 
*/ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); } +static DEVICE_ATTR_RO(nfreectxts); -static ssize_t show_serial(struct device *device, +static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -567,8 +572,9 @@ static ssize_t show_serial(struct device *device, return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); } +static DEVICE_ATTR_RO(serial); -static ssize_t store_chip_reset(struct device *device, +static ssize_t chip_reset_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { @@ -586,6 +592,7 @@ static ssize_t store_chip_reset(struct device *device, bail: return ret < 0 ? ret : count; } +static DEVICE_ATTR_WO(chip_reset); /* * Convert the reported temperature from an integer (reported in @@ -598,7 +605,7 @@ bail: /* * Dump tempsense values, in decimal, to ease shell-scripts. */ -static ssize_t show_tempsense(struct device *device, +static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -622,6 +629,7 @@ static ssize_t show_tempsense(struct device *device, } return ret; } +static DEVICE_ATTR_RO(tempsense); /* * end of per-unit (or driver, in some cases, but replicated @@ -629,24 +637,20 @@ static ssize_t show_tempsense(struct device *device, */ /* start of per-unit file structures and support code */ -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL); -static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); -static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); -static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); -static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); -static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); -static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); - -static struct device_attribute *hfi1_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_board_id, - 
&dev_attr_nctxts, - &dev_attr_nfreectxts, - &dev_attr_serial, - &dev_attr_boardversion, - &dev_attr_tempsense, - &dev_attr_chip_reset, +static struct attribute *hfi1_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_board_id.attr, + &dev_attr_nctxts.attr, + &dev_attr_nfreectxts.attr, + &dev_attr_serial.attr, + &dev_attr_boardversion.attr, + &dev_attr_tempsense.attr, + &dev_attr_chip_reset.attr, + NULL, +}; + +const struct attribute_group ib_hfi1_attr_group = { + .attrs = hfi1_attributes, }; int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, @@ -832,12 +836,6 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) struct device *class_dev = &dev->dev; int i, j, ret; - for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) { - ret = device_create_file(&dev->dev, hfi1_attributes[i]); - if (ret) - goto bail; - } - for (i = 0; i < dd->num_sdma; i++) { ret = kobject_init_and_add(&dd->per_sdma[i].kobj, &sde_ktype, &class_dev->kobj, @@ -855,9 +853,6 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) return 0; bail: - for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) - device_remove_file(&dev->dev, hfi1_attributes[i]); - for (i = 0; i < dd->num_sdma; i++) kobject_del(&dd->per_sdma[i].kobj); diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h index 8540463ef3f7..84458f1325e1 100644 --- a/drivers/infiniband/hw/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -62,3 +62,4 @@ __print_symbolic(etype, \ #include "trace_rx.h" #include "trace_tx.h" #include "trace_mmu.h" +#include "trace_iowait.h" diff --git a/drivers/infiniband/hw/hfi1/trace_iowait.h b/drivers/infiniband/hw/hfi1/trace_iowait.h new file mode 100644 index 000000000000..27f4334ece2b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_iowait.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#if !defined(__HFI1_TRACE_IOWAIT_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_IOWAIT_H + +#include <linux/tracepoint.h> +#include "iowait.h" +#include "verbs.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_iowait + +DECLARE_EVENT_CLASS(hfi1_iowait_template, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag), + TP_STRUCT__entry(/* entry */ + __field(unsigned long, addr) + __field(unsigned long, flags) + __field(u32, flag) + __field(u32, qpn) + ), + TP_fast_assign(/* assign */ + __entry->addr = (unsigned long)wait; + __entry->flags = wait->flags; + __entry->flag = (1 << flag); + __entry->qpn = iowait_to_qp(wait)->ibqp.qp_num; + ), + TP_printk(/* print */ + "iowait 0x%lx qp %u flags 0x%lx flag 0x%x", + __entry->addr, + __entry->qpn, + __entry->flags, + __entry->flag + ) + ); + +DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_set, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag)); + +DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_clear, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag)); + +#endif /* __HFI1_TRACE_IOWAIT_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_iowait +#include <trace/define_trace.h> diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index e254dcec6f64..6aca0c5a7f97 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -88,7 +88,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done_free_tx; } @@ -140,7 +140,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp, wqe->wr.ex.invalidate_rkey); local_ops = 1; } - hfi1_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR + rvt_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR : IB_WC_SUCCESS); if (local_ops) atomic_dec(&qp->local_ops_pending); @@ -426,7 +426,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -449,7 +449,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - hfi1_copy_sge(&qp->r_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -523,7 +523,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -550,7 +550,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -564,7 +564,7 @@ rdma_last: tlen -= (hdrsize + extra_bytes); if (unlikely(tlen + 
qp->r_rcv_len != qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 70d39fc450a1..4baa8f4d49de 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -210,8 +210,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } hfi1_make_grh(ibp, &grh, &grd, 0, 0); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -228,7 +228,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -518,7 +518,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done_free_tx; } @@ -560,7 +560,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, tflags); ps->flags = tflags; - hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, IB_WC_SUCCESS); goto done_free_tx; } } @@ -1019,8 +1019,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (packet->grh) { - hfi1_copy_sge(&qp->r_sge, packet->grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { struct ib_grh grh; @@ -1030,14 +1030,14 @@ void 
hfi1_ud_rcv(struct hfi1_packet *packet) * out when creating 16B, add back the GRH here. */ hfi1_make_ext_grh(packet, &grh, slid, dlid); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); } - hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), - true, false); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 5c88706121c1..3f0aadccd9f6 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -76,8 +76,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. 
Default: 12 static unsigned initial_pkt_count = 8; -static int user_sdma_send_pkts(struct user_sdma_request *req, - unsigned maxpkts); +static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); @@ -101,7 +100,7 @@ static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); static int defer_packet_queue( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent); @@ -124,13 +123,13 @@ static struct mmu_rb_ops sdma_rb_ops = { static int defer_packet_queue( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent) { struct hfi1_user_sdma_pkt_q *pq = - container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy); struct hfi1_ibdev *dev = &pq->dd->verbs_dev; struct user_sdma_txreq *tx = container_of(txreq, struct user_sdma_txreq, txreq); @@ -187,13 +186,12 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, pq->ctxt = uctxt->ctxt; pq->subctxt = fd->subctxt; pq->n_max_reqs = hfi1_sdma_comp_ring_size; - pq->state = SDMA_PKT_Q_INACTIVE; atomic_set(&pq->n_reqs, 0); init_waitqueue_head(&pq->wait); atomic_set(&pq->n_locked, 0); pq->mm = fd->mm; - iowait_init(&pq->busy, 0, NULL, defer_packet_queue, + iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, activate_packet_queue, NULL); pq->reqidx = 0; @@ -276,7 +274,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, /* Wait until all requests have been freed. 
*/ wait_event_interruptible( pq->wait, - (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); + !atomic_read(&pq->n_reqs)); kfree(pq->reqs); kfree(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); @@ -312,6 +310,13 @@ static u8 dlid_to_selector(u16 dlid) return mapping[hash]; } +/** + * hfi1_user_sdma_process_request() - Process and start a user sdma request + * @fd: valid file descriptor + * @iovec: array of io vectors to process + * @dim: overall iovec array size + * @count: number of io vector array entries processed + */ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, struct iovec *iovec, unsigned long dim, unsigned long *count) @@ -328,7 +333,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, u8 opcode, sc, vl; u16 pkey; u32 slid; - int req_queued = 0; u16 dlid; u32 selector; @@ -392,7 +396,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->data_len = 0; req->pq = pq; req->cq = cq; - req->status = -1; req->ahg_idx = -1; req->iov_idx = 0; req->sent = 0; @@ -400,12 +403,14 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->seqcomp = 0; req->seqsubmitted = 0; req->tids = NULL; - req->done = 0; req->has_error = 0; INIT_LIST_HEAD(&req->txps); memcpy(&req->info, &info, sizeof(info)); + /* The request is initialized, count it */ + atomic_inc(&pq->n_reqs); + if (req_opcode(info.ctrl) == EXPECTED) { /* expected must have a TID info and at least one data vector */ if (req->data_iovs < 2) { @@ -500,7 +505,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, ret = pin_vector_pages(req, &req->iovs[i]); if (ret) { req->data_iovs = i; - req->status = ret; goto free_req; } req->data_len += req->iovs[i].iov.iov_len; @@ -561,23 +565,11 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->ahg_idx = sdma_ahg_alloc(req->sde); set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); - atomic_inc(&pq->n_reqs); - req_queued = 1; + pq->state = SDMA_PKT_Q_ACTIVE; /* Send the first N packets in the 
request to buy us some time */ ret = user_sdma_send_pkts(req, pcount); - if (unlikely(ret < 0 && ret != -EBUSY)) { - req->status = ret; + if (unlikely(ret < 0 && ret != -EBUSY)) goto free_req; - } - - /* - * It is possible that the SDMA engine would have processed all the - * submitted packets by the time we get here. Therefore, only set - * packet queue state to ACTIVE if there are still uncompleted - * requests. - */ - if (atomic_read(&pq->n_reqs)) - xchg(&pq->state, SDMA_PKT_Q_ACTIVE); /* * This is a somewhat blocking send implementation. @@ -588,14 +580,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, while (req->seqsubmitted != req->info.npkts) { ret = user_sdma_send_pkts(req, pcount); if (ret < 0) { - if (ret != -EBUSY) { - req->status = ret; - WRITE_ONCE(req->has_error, 1); - if (READ_ONCE(req->seqcomp) == - req->seqsubmitted - 1) - goto free_req; - return ret; - } + if (ret != -EBUSY) + goto free_req; wait_event_interruptible_timeout( pq->busy.wait_dma, (pq->state == SDMA_PKT_Q_ACTIVE), @@ -606,10 +592,19 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, *count += idx; return 0; free_req: - user_sdma_free_request(req, true); - if (req_queued) + /* + * If the submitted seqsubmitted == npkts, the completion routine + * controls the final state. If sequbmitted < npkts, wait for any + * outstanding packets to finish before cleaning up. 
+ */ + if (req->seqsubmitted < req->info.npkts) { + if (req->seqsubmitted) + wait_event(pq->busy.wait_dma, + (req->seqcomp == req->seqsubmitted - 1)); + user_sdma_free_request(req, true); pq_update(pq); - set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); + set_comp_state(pq, cq, info.comp_idx, ERROR, ret); + } return ret; } @@ -760,9 +755,10 @@ static int user_sdma_txadd(struct user_sdma_request *req, return ret; } -static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) +static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) { - int ret = 0, count; + int ret = 0; + u16 count; unsigned npkts = 0; struct user_sdma_txreq *tx = NULL; struct hfi1_user_sdma_pkt_q *pq = NULL; @@ -864,8 +860,10 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) changes = set_txreq_header_ahg(req, tx, datalen); - if (changes < 0) + if (changes < 0) { + ret = changes; goto free_tx; + } } } else { ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + @@ -914,10 +912,11 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) npkts++; } dosend: - ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); + ret = sdma_send_txlist(req->sde, + iowait_get_ib_work(&pq->busy), + &req->txps, &count); req->seqsubmitted += count; if (req->seqsubmitted == req->info.npkts) { - WRITE_ONCE(req->done, 1); /* * The txreq has already been submitted to the HW queue * so we can free the AHG entry now. Corruption will not @@ -1365,11 +1364,15 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, return idx; } -/* - * SDMA tx request completion callback. Called when the SDMA progress - * state machine gets notification that the SDMA descriptors for this - * tx request have been processed by the DMA engine. Called in - * interrupt context. +/** + * user_sdma_txreq_cb() - SDMA tx request completion callback. 
+ * @txreq: valid sdma tx request + * @status: success/failure of request + * + * Called when the SDMA progress state machine gets notification that + * the SDMA descriptors for this tx request have been processed by the + * DMA engine. Called in interrupt context. + * Only do work on completed sequences. */ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) { @@ -1378,7 +1381,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) struct user_sdma_request *req; struct hfi1_user_sdma_pkt_q *pq; struct hfi1_user_sdma_comp_q *cq; - u16 idx; + enum hfi1_sdma_comp_state state = COMPLETE; if (!tx->req) return; @@ -1391,39 +1394,25 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) SDMA_DBG(req, "SDMA completion with error %d", status); WRITE_ONCE(req->has_error, 1); + state = ERROR; } req->seqcomp = tx->seqnum; kmem_cache_free(pq->txreq_cache, tx); - tx = NULL; - - idx = req->info.comp_idx; - if (req->status == -1 && status == SDMA_TXREQ_S_OK) { - if (req->seqcomp == req->info.npkts - 1) { - req->status = 0; - user_sdma_free_request(req, false); - pq_update(pq); - set_comp_state(pq, cq, idx, COMPLETE, 0); - } - } else { - if (status != SDMA_TXREQ_S_OK) - req->status = status; - if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) && - (READ_ONCE(req->done) || - READ_ONCE(req->has_error))) { - user_sdma_free_request(req, false); - pq_update(pq); - set_comp_state(pq, cq, idx, ERROR, req->status); - } - } + + /* sequence isn't complete? 
We are done */ + if (req->seqcomp != req->info.npkts - 1) + return; + + user_sdma_free_request(req, false); + set_comp_state(pq, cq, req->info.comp_idx, state, status); + pq_update(pq); } static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) { - if (atomic_dec_and_test(&pq->n_reqs)) { - xchg(&pq->state, SDMA_PKT_Q_INACTIVE); + if (atomic_dec_and_test(&pq->n_reqs)) wake_up(&pq->wait); - } } static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) @@ -1448,6 +1437,8 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) if (!node) continue; + req->iovs[i].node = NULL; + if (unpin) hfi1_mmu_rb_remove(req->pq->handler, &node->rb); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index d2bc77f75253..14dfd757dafd 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -105,9 +105,10 @@ static inline int ahg_header_set(u32 *arr, int idx, size_t array_size, #define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */ #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */ -#define SDMA_PKT_Q_INACTIVE BIT(0) -#define SDMA_PKT_Q_ACTIVE BIT(1) -#define SDMA_PKT_Q_DEFERRED BIT(2) +enum pkt_q_sdma_state { + SDMA_PKT_Q_ACTIVE, + SDMA_PKT_Q_DEFERRED, +}; /* * Maximum retry attempts to submit a TX request @@ -133,7 +134,7 @@ struct hfi1_user_sdma_pkt_q { struct user_sdma_request *reqs; unsigned long *req_in_use; struct iowait busy; - unsigned state; + enum pkt_q_sdma_state state; wait_queue_head_t wait; unsigned long unpinned; struct mmu_rb_handler *handler; @@ -203,14 +204,12 @@ struct user_sdma_request { s8 ahg_idx; /* Writeable fields shared with interrupt */ - u64 seqcomp ____cacheline_aligned_in_smp; - u64 seqsubmitted; - /* status of the last txreq completed */ - int status; + u16 seqcomp ____cacheline_aligned_in_smp; + u16 seqsubmitted; /* Send side fields */ struct list_head txps 
____cacheline_aligned_in_smp; - u64 seqnum; + u16 seqnum; /* * KDETH.OFFSET (TID) field * The offset can cover multiple packets, depending on the @@ -228,7 +227,6 @@ struct user_sdma_request { u16 tididx; /* progress index moving along the iovs array */ u8 iov_idx; - u8 done; u8 has_error; struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; @@ -248,7 +246,7 @@ struct user_sdma_txreq { struct user_sdma_request *req; u16 flags; unsigned int busycount; - u64 seqnum; + u16 seqnum; }; int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index a7c586a5589d..48e11e510358 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -129,8 +129,6 @@ unsigned short piothreshold = 256; module_param(piothreshold, ushort, S_IRUGO); MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); -#define COPY_CACHELESS 1 -#define COPY_ADAPTIVE 2 static unsigned int sge_copy_mode; module_param(sge_copy_mode, uint, S_IRUGO); MODULE_PARM_DESC(sge_copy_mode, @@ -151,159 +149,13 @@ static int pio_wait(struct rvt_qp *qp, /* 16B trailing buffer */ static const u8 trail_buf[MAX_16B_PADDING]; -static uint wss_threshold; +static uint wss_threshold = 80; module_param(wss_threshold, uint, S_IRUGO); MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); static uint wss_clean_period = 256; module_param(wss_clean_period, uint, S_IRUGO); MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); -/* memory working set size */ -struct hfi1_wss { - unsigned long *entries; - atomic_t total_count; - atomic_t clean_counter; - atomic_t clean_entry; - - int threshold; - int num_entries; - long pages_mask; -}; - -static struct hfi1_wss wss; - -int hfi1_wss_init(void) -{ - long llc_size; - long llc_bits; - long table_size; - long table_bits; - - /* check for a valid percent 
range - default to 80 if none or invalid */ - if (wss_threshold < 1 || wss_threshold > 100) - wss_threshold = 80; - /* reject a wildly large period */ - if (wss_clean_period > 1000000) - wss_clean_period = 256; - /* reject a zero period */ - if (wss_clean_period == 0) - wss_clean_period = 1; - - /* - * Calculate the table size - the next power of 2 larger than the - * LLC size. LLC size is in KiB. - */ - llc_size = wss_llc_size() * 1024; - table_size = roundup_pow_of_two(llc_size); - - /* one bit per page in rounded up table */ - llc_bits = llc_size / PAGE_SIZE; - table_bits = table_size / PAGE_SIZE; - wss.pages_mask = table_bits - 1; - wss.num_entries = table_bits / BITS_PER_LONG; - - wss.threshold = (llc_bits * wss_threshold) / 100; - if (wss.threshold == 0) - wss.threshold = 1; - - atomic_set(&wss.clean_counter, wss_clean_period); - - wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), - GFP_KERNEL); - if (!wss.entries) { - hfi1_wss_exit(); - return -ENOMEM; - } - - return 0; -} - -void hfi1_wss_exit(void) -{ - /* coded to handle partially initialized and repeat callers */ - kfree(wss.entries); - wss.entries = NULL; -} - -/* - * Advance the clean counter. When the clean period has expired, - * clean an entry. - * - * This is implemented in atomics to avoid locking. Because multiple - * variables are involved, it can be racy which can lead to slightly - * inaccurate information. Since this is only a heuristic, this is - * OK. Any innaccuracies will clean themselves out as the counter - * advances. That said, it is unlikely the entry clean operation will - * race - the next possible racer will not start until the next clean - * period. - * - * The clean counter is implemented as a decrement to zero. When zero - * is reached an entry is cleaned. 
- */ -static void wss_advance_clean_counter(void) -{ - int entry; - int weight; - unsigned long bits; - - /* become the cleaner if we decrement the counter to zero */ - if (atomic_dec_and_test(&wss.clean_counter)) { - /* - * Set, not add, the clean period. This avoids an issue - * where the counter could decrement below the clean period. - * Doing a set can result in lost decrements, slowing the - * clean advance. Since this a heuristic, this possible - * slowdown is OK. - * - * An alternative is to loop, advancing the counter by a - * clean period until the result is > 0. However, this could - * lead to several threads keeping another in the clean loop. - * This could be mitigated by limiting the number of times - * we stay in the loop. - */ - atomic_set(&wss.clean_counter, wss_clean_period); - - /* - * Uniquely grab the entry to clean and move to next. - * The current entry is always the lower bits of - * wss.clean_entry. The table size, wss.num_entries, - * is always a power-of-2. - */ - entry = (atomic_inc_return(&wss.clean_entry) - 1) - & (wss.num_entries - 1); - - /* clear the entry and count the bits */ - bits = xchg(&wss.entries[entry], 0); - weight = hweight64((u64)bits); - /* only adjust the contended total count if needed */ - if (weight) - atomic_sub(weight, &wss.total_count); - } -} - -/* - * Insert the given address into the working set array. - */ -static void wss_insert(void *address) -{ - u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; - u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ - u32 nr = page & (BITS_PER_LONG - 1); - - if (!test_and_set_bit(nr, &wss.entries[entry])) - atomic_inc(&wss.total_count); - - wss_advance_clean_counter(); -} - -/* - * Is the working set larger than the threshold? - */ -static inline bool wss_exceeds_threshold(void) -{ - return atomic_read(&wss.total_count) >= wss.threshold; -} - /* * Translate ib_wr_opcode into ib_wc_opcode. 
*/ @@ -438,79 +290,6 @@ static const u32 pio_opmask[BIT(3)] = { */ __be64 ib_hfi1_sys_image_guid; -/** - * hfi1_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - * @release: boolean to release MR - * @copy_last: do a separate copy of the last 8 bytes - */ -void hfi1_copy_sge( - struct rvt_sge_state *ss, - void *data, u32 length, - bool release, - bool copy_last) -{ - struct rvt_sge *sge = &ss->sge; - int i; - bool in_last = false; - bool cacheless_copy = false; - - if (sge_copy_mode == COPY_CACHELESS) { - cacheless_copy = length >= PAGE_SIZE; - } else if (sge_copy_mode == COPY_ADAPTIVE) { - if (length >= PAGE_SIZE) { - /* - * NOTE: this *assumes*: - * o The first vaddr is the dest. - * o If multiple pages, then vaddr is sequential. - */ - wss_insert(sge->vaddr); - if (length >= (2 * PAGE_SIZE)) - wss_insert(sge->vaddr + PAGE_SIZE); - - cacheless_copy = wss_exceeds_threshold(); - } else { - wss_advance_clean_counter(); - } - } - if (copy_last) { - if (length > 8) { - length -= 8; - } else { - copy_last = false; - in_last = true; - } - } - -again: - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - if (unlikely(in_last)) { - /* enforce byte transfer ordering */ - for (i = 0; i < len; i++) - ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; - } else if (cacheless_copy) { - cacheless_memcpy(sge->vaddr, data, len); - } else { - memcpy(sge->vaddr, data, len); - } - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } - - if (copy_last) { - copy_last = false; - in_last = true; - length = 8; - goto again; - } -} - /* * Make sure the QP is ready and able to accept the given opcode. 
*/ @@ -713,7 +492,7 @@ static void verbs_sdma_complete( spin_lock(&qp->s_lock); if (tx->wqe) { - hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); } else if (qp->ibqp.qp_type == IB_QPT_RC) { struct hfi1_opa_header *hdr; @@ -737,7 +516,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { if (list_empty(&dev->memwait)) mod_timer(&dev->mem_timer, jiffies + 1); @@ -748,7 +527,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, rvt_get_qp(qp); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -950,8 +729,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (unlikely(ret)) goto bail_build; } - ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq, - ps->pkts_sent); + ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent); if (unlikely(ret < 0)) { if (ret == -ECOMM) goto bail_ecomm; @@ -1001,7 +779,7 @@ static int pio_wait(struct rvt_qp *qp, if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { struct hfi1_ibdev *dev = &dd->verbs_dev; int was_empty; @@ -1020,7 +798,7 @@ static int pio_wait(struct rvt_qp *qp, hfi1_sc_wantpiobuf_intr(sc, 1); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -1160,7 +938,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, pio_bail: if (qp->s_wqe) { spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, 
wc_status); + rvt_send_complete(qp, qp->s_wqe, wc_status); spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); @@ -1367,7 +1145,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) hfi1_cdbg(PIO, "%s() Failed. Completing with err", __func__); spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock_irqrestore(&qp->s_lock, flags); } return -EINVAL; @@ -1943,7 +1721,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; - dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe; dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = hfi1_comp_vect_mappings_lookup; @@ -1956,10 +1734,16 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; dd->verbs_dev.rdi.dparms.nports = dd->num_pports; dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; + dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; + dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode; + ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) rvt_init_port(&dd->verbs_dev.rdi, @@ -1967,6 +1751,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) i, ppd->pkeys); + rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, + &ib_hfi1_attr_group); + ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1); if (ret) goto err_verbs_txreq; diff --git 
a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index a4d06502f06d..64c9054db5f3 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -166,11 +166,13 @@ struct hfi1_qp_priv { * This structure is used to hold commonly lookedup and computed values during * the send engine progress. */ +struct iowait_work; struct hfi1_pkt_state { struct hfi1_ibdev *dev; struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; struct verbs_txreq *s_txreq; + struct iowait_work *wait; unsigned long flags; unsigned long timeout; unsigned long timeout_int; @@ -247,7 +249,7 @@ static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) return container_of(rdi, struct hfi1_ibdev, rdi); } -static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) +static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) { struct hfi1_qp_priv *priv; @@ -313,9 +315,6 @@ void hfi1_put_txreq(struct verbs_txreq *tx); int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); -void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, - bool release, bool copy_last); - void hfi1_cnp_rcv(struct hfi1_packet *packet); void hfi1_uc_rcv(struct hfi1_packet *packet); @@ -343,7 +342,8 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); -int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); extern const u32 rc_only_opcode; extern const u32 uc_only_opcode; @@ -363,9 +363,6 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp); void hfi1_do_send(struct rvt_qp *qp, bool in_thread); -void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status); - void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn); int 
hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); @@ -390,28 +387,6 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); -int hfi1_wss_init(void); -void hfi1_wss_exit(void); - -/* platform specific: return the lowest level cache (llc) size, in KiB */ -static inline int wss_llc_size(void) -{ - /* assume that the boot CPU value is universal for all CPUs */ - return boot_cpu_data.x86_cache_size; -} - -/* platform specific: cacheless copy */ -static inline void cacheless_memcpy(void *dst, void *src, size_t n) -{ - /* - * Use the only available X64 cacheless copy. Add a __user cast - * to quiet sparse. The src agument is already in the kernel so - * there are no security issues. The extra fault recovery machinery - * is not invoked. - */ - __copy_user_nocache(dst, (void __user *)src, n, 0); -} - static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) { return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index 1c19bbc764b2..2a77af26a231 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -102,22 +102,19 @@ static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx) return &tx->txreq; } -static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp) +static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w) { struct sdma_txreq *stx; - struct hfi1_qp_priv *priv = qp->priv; - stx = iowait_get_txhead(&priv->s_iowait); + stx = iowait_get_txhead(w); if (stx) return container_of(stx, struct verbs_txreq, txreq); return NULL; } -static inline bool verbs_txreq_queued(struct rvt_qp *qp) +static inline bool verbs_txreq_queued(struct iowait_work *w) { - struct hfi1_qp_priv *priv = qp->priv; - - return iowait_packet_queued(&priv->s_iowait); + return 
iowait_packet_queued(w); } void hfi1_put_txreq(struct verbs_txreq *tx); diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c index c643d80c5a53..c9876d9e3cb9 100644 --- a/drivers/infiniband/hw/hfi1/vnic_main.c +++ b/drivers/infiniband/hw/hfi1/vnic_main.c @@ -120,7 +120,7 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd, uctxt->seq_cnt = 1; uctxt->is_vnic = true; - hfi1_set_vnic_msix_info(uctxt); + msix_request_rcd_irq(uctxt); hfi1_stats.sps_ctxts++; dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt); @@ -135,8 +135,6 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt); flush_wc(); - hfi1_reset_vnic_msix_info(uctxt); - /* * Disable receive context and interrupt available, reset all * RcvCtxtCtrl bits to default values. @@ -148,6 +146,10 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, HFI1_RCVCTRL_NO_RHQ_DROP_DIS | HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + /* msix_intr will always be > 0, only clean up if this is true */ + if (uctxt->msix_intr) + msix_free_irq(dd, uctxt->msix_intr); + uctxt->event_flags = 0; hfi1_clear_tids(uctxt); @@ -626,7 +628,7 @@ static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo) idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id); /* ensure irqs see the change */ - hfi1_vnic_synchronize_irq(dd); + msix_vnic_synchronize_irq(dd); /* remove unread skbs */ for (i = 0; i < vinfo->num_rx_q; i++) { @@ -690,8 +692,6 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) rc = hfi1_vnic_txreq_init(dd); if (rc) goto txreq_fail; - - dd->vnic.msix_idx = dd->first_dyn_msix_idx; } for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) { diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index c3c96c5869ed..97bd940a056a 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2017 Intel Corporation. 
+ * Copyright(c) 2017 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -198,8 +198,8 @@ int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, goto free_desc; tx->retry_count = 0; - ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq, - vnic_sdma->pkts_sent); + ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait), + &tx->txreq, vnic_sdma->pkts_sent); /* When -ECOMM, sdma callback will be called with ABORT status */ if (unlikely(ret && unlikely(ret != -ECOMM))) goto free_desc; @@ -230,13 +230,13 @@ tx_err: * become available. */ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent) { struct hfi1_vnic_sdma *vnic_sdma = - container_of(wait, struct hfi1_vnic_sdma, wait); + container_of(wait->iow, struct hfi1_vnic_sdma, wait); struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev; struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq); @@ -247,7 +247,7 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; write_seqlock(&dev->iowait_lock); if (list_empty(&vnic_sdma->wait.list)) - iowait_queue(pkts_sent, wait, &sde->dmawait); + iowait_queue(pkts_sent, wait->iow, &sde->dmawait); write_sequnlock(&dev->iowait_lock); return -EBUSY; } @@ -285,7 +285,8 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) for (i = 0; i < vinfo->num_tx_q; i++) { struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i]; - iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep, + iowait_init(&vnic_sdma->wait, 0, NULL, NULL, + hfi1_vnic_sdma_sleep, hfi1_vnic_sdma_wakeup, NULL); vnic_sdma->sde = &vinfo->dd->per_sdma[i]; vnic_sdma->dd = vinfo->dd; @@ -295,10 +296,12 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) /* Add a free descriptor watermark for wakeups */ if 
(vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) { + struct iowait_work *work; + INIT_LIST_HEAD(&vnic_sdma->stx.list); vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK; - list_add_tail(&vnic_sdma->stx.list, - &vnic_sdma->wait.tx_head); + work = iowait_get_ib_work(&vnic_sdma->wait); + list_add_tail(&vnic_sdma->stx.list, &work->tx_head); } } } diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig index fddb5fdf92de..21c2100b2ea9 100644 --- a/drivers/infiniband/hw/hns/Kconfig +++ b/drivers/infiniband/hw/hns/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_HNS tristate "HNS RoCE Driver" depends on NET_VENDOR_HISILICON + depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS depends on ARM64 || (COMPILE_TEST && 64BIT) ---help--- This is a RoCE/RDMA driver for the Hisilicon RoCE engine. The engine diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 0d96c5bb38cd..9990dc9eb96a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -49,6 +49,7 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, struct hns_roce_ah *ah; u16 vlan_tag = 0xffff; const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + bool vlan_en = false; ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) @@ -58,8 +59,10 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN); gid_attr = ah_attr->grh.sgid_attr; - if (is_vlan_dev(gid_attr->ndev)) + if (is_vlan_dev(gid_attr->ndev)) { vlan_tag = vlan_dev_vlan_id(gid_attr->ndev); + vlan_en = true; + } if (vlan_tag < 0x1000) vlan_tag |= (rdma_ah_get_sl(ah_attr) & @@ -71,6 +74,7 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, HNS_ROCE_PORT_NUM_SHIFT)); ah->av.gid_index = grh->sgid_index; ah->av.vlan = cpu_to_le16(vlan_tag); + ah->av.vlan_en = vlan_en; dev_dbg(dev, "gid_index = 0x%x,vlan = 0x%x\n", ah->av.gid_index, ah->av.vlan); diff --git 
a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 9a24fd0ee3e7..d39bdfdb5de9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -88,8 +88,11 @@ #define BITMAP_RR 1 #define MR_TYPE_MR 0x00 +#define MR_TYPE_FRMR 0x01 #define MR_TYPE_DMA 0x03 +#define HNS_ROCE_FRMR_MAX_PA 512 + #define PKEY_ID 0xffff #define GUID_LEN 8 #define NODE_DESC_SIZE 64 @@ -193,6 +196,9 @@ enum { HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2), HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3), HNS_ROCE_CAP_FLAG_SQ_RECORD_DB = BIT(4), + HNS_ROCE_CAP_FLAG_MW = BIT(7), + HNS_ROCE_CAP_FLAG_FRMR = BIT(8), + HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10), }; enum hns_roce_mtt_type { @@ -219,19 +225,11 @@ struct hns_roce_uar { unsigned long logic_idx; }; -struct hns_roce_vma_data { - struct list_head list; - struct vm_area_struct *vma; - struct mutex *vma_list_mutex; -}; - struct hns_roce_ucontext { struct ib_ucontext ibucontext; struct hns_roce_uar uar; struct list_head page_list; struct mutex page_mutex; - struct list_head vma_list; - struct mutex vma_list_mutex; }; struct hns_roce_pd { @@ -293,6 +291,16 @@ struct hns_roce_mtt { enum hns_roce_mtt_type mtt_type; }; +struct hns_roce_mw { + struct ib_mw ibmw; + u32 pdn; + u32 rkey; + int enabled; /* MW's active status */ + u32 pbl_hop_num; + u32 pbl_ba_pg_sz; + u32 pbl_buf_pg_sz; +}; + /* Only support 4K page size for mr register */ #define MR_SIZE_4K 0 @@ -304,6 +312,7 @@ struct hns_roce_mr { u32 key; /* Key of MR */ u32 pd; /* PD num of MR */ u32 access;/* Access permission of MR */ + u32 npages; int enabled; /* MR's active status */ int type; /* MR's register type */ u64 *pbl_buf;/* MR's PBL space */ @@ -457,6 +466,7 @@ struct hns_roce_av { u8 dgid[HNS_ROCE_GID_SIZE]; u8 mac[6]; __le16 vlan; + bool vlan_en; }; struct hns_roce_ah { @@ -656,6 +666,7 @@ struct hns_roce_eq_table { }; struct hns_roce_caps { + u64 fw_ver; u8 num_ports; int gid_table_len[HNS_ROCE_MAX_PORTS]; int 
pkey_table_len[HNS_ROCE_MAX_PORTS]; @@ -665,7 +676,9 @@ struct hns_roce_caps { u32 max_sq_sg; /* 2 */ u32 max_sq_inline; /* 32 */ u32 max_rq_sg; /* 2 */ + u32 max_extend_sg; int num_qps; /* 256k */ + int reserved_qps; u32 max_wqes; /* 16k */ u32 max_sq_desc_sz; /* 64 */ u32 max_rq_desc_sz; /* 64 */ @@ -738,6 +751,7 @@ struct hns_roce_work { struct hns_roce_dev *hr_dev; struct work_struct work; u32 qpn; + u32 cqn; int event_type; int sub_type; }; @@ -764,6 +778,8 @@ struct hns_roce_hw { struct hns_roce_mr *mr, int flags, u32 pdn, int mr_access_flags, u64 iova, u64 size, void *mb_buf); + int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr); + int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw); void (*write_cqc)(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, dma_addr_t dma_handle, int nent, u32 vector); @@ -863,6 +879,11 @@ static inline struct hns_roce_mr *to_hr_mr(struct ib_mr *ibmr) return container_of(ibmr, struct hns_roce_mr, ibmr); } +static inline struct hns_roce_mw *to_hr_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct hns_roce_mw, ibmw); +} + static inline struct hns_roce_qp *to_hr_qp(struct ib_qp *ibqp) { return container_of(ibqp, struct hns_roce_qp, ibqp); @@ -968,12 +989,20 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); +struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg); +int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int hns_roce_dereg_mr(struct ib_mr *ibmr); int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev, struct hns_roce_cmd_mailbox *mailbox, unsigned long mpt_index); unsigned long key_to_hw_index(u32 key); +struct ib_mw *hns_roce_alloc_mw(struct ib_pd *pd, enum ib_mw_type, + struct ib_udata 
*udata); +int hns_roce_dealloc_mw(struct ib_mw *ibmw); + void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size, struct hns_roce_buf *buf); int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 081aa91fc162..ca05810c92dc 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -731,7 +731,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) cq_init_attr.comp_vector = 0; cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL, NULL); if (IS_ERR(cq)) { - dev_err(dev, "Create cq for reseved loop qp failed!"); + dev_err(dev, "Create cq for reserved loop qp failed!"); return -ENOMEM; } free_mr->mr_free_cq = to_hr_cq(cq); @@ -744,7 +744,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL); if (IS_ERR(pd)) { - dev_err(dev, "Create pd for reseved loop qp failed!"); + dev_err(dev, "Create pd for reserved loop qp failed!"); ret = -ENOMEM; goto alloc_pd_failed; } diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 0218c0f8c2a7..a4c62ae23a9a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -54,6 +54,59 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, dseg->len = cpu_to_le32(sg->length); } +static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, + struct hns_roce_wqe_frmr_seg *fseg, + const struct ib_reg_wr *wr) +{ + struct hns_roce_mr *mr = to_hr_mr(wr->mr); + + /* use ib_access_flags */ + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S, + wr->access & IB_ACCESS_MW_BIND ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S, + wr->access & IB_ACCESS_REMOTE_ATOMIC ? 
1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_RR_S, + wr->access & IB_ACCESS_REMOTE_READ ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_RW_S, + wr->access & IB_ACCESS_REMOTE_WRITE ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_LW_S, + wr->access & IB_ACCESS_LOCAL_WRITE ? 1 : 0); + + /* Data structure reuse may lead to confusion */ + rc_sq_wqe->msg_len = cpu_to_le32(mr->pbl_ba & 0xffffffff); + rc_sq_wqe->inv_key = cpu_to_le32(mr->pbl_ba >> 32); + + rc_sq_wqe->byte_16 = cpu_to_le32(wr->mr->length & 0xffffffff); + rc_sq_wqe->byte_20 = cpu_to_le32(wr->mr->length >> 32); + rc_sq_wqe->rkey = cpu_to_le32(wr->key); + rc_sq_wqe->va = cpu_to_le64(wr->mr->iova); + + fseg->pbl_size = cpu_to_le32(mr->pbl_size); + roce_set_field(fseg->mode_buf_pg_sz, + V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M, + V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S, + mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + roce_set_bit(fseg->mode_buf_pg_sz, + V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S, 0); +} + +static void set_atomic_seg(struct hns_roce_wqe_atomic_seg *aseg, + const struct ib_atomic_wr *wr) +{ + if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->fetchadd_swap_data = cpu_to_le64(wr->swap); + aseg->cmp_data = cpu_to_le64(wr->compare_add); + } else { + aseg->fetchadd_swap_data = cpu_to_le64(wr->compare_add); + aseg->cmp_data = 0; + } +} + static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_ind) { @@ -121,6 +174,7 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, } if (wr->opcode == IB_WR_RDMA_READ) { + *bad_wr = wr; dev_err(hr_dev->dev, "Not support inline data!\n"); return -EINVAL; } @@ -179,6 +233,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct hns_roce_v2_ud_send_wqe *ud_sq_wqe; struct hns_roce_v2_rc_send_wqe *rc_sq_wqe; struct hns_roce_qp *qp = to_hr_qp(ibqp); + struct hns_roce_wqe_frmr_seg *fseg; struct device *dev = hr_dev->dev; struct hns_roce_v2_db sq_db; 
struct ib_qp_attr attr; @@ -191,6 +246,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, int attr_mask; u32 tmp_len; int ret = 0; + u32 hr_op; u8 *smac; int nreq; int i; @@ -356,6 +412,9 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, V2_UD_SEND_WQE_BYTE_40_PORTN_S, qp->port); + roce_set_bit(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S, + ah->av.vlan_en ? 1 : 0); roce_set_field(ud_sq_wqe->byte_48, V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M, V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S, @@ -406,99 +465,100 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit); + wqe += sizeof(struct hns_roce_v2_rc_send_wqe); switch (wr->opcode) { case IB_WR_RDMA_READ: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_READ); + hr_op = HNS_ROCE_V2_WQE_OP_RDMA_READ; rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); break; case IB_WR_RDMA_WRITE: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_WRITE); + hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE; rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); break; case IB_WR_RDMA_WRITE_WITH_IMM: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM); + hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM; rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); break; case IB_WR_SEND: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND); + hr_op = HNS_ROCE_V2_WQE_OP_SEND; break; case IB_WR_SEND_WITH_INV: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - 
V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND_WITH_INV); + hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_INV; break; case IB_WR_SEND_WITH_IMM: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM); + hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM; break; case IB_WR_LOCAL_INV: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_LOCAL_INV); + hr_op = HNS_ROCE_V2_WQE_OP_LOCAL_INV; + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_SO_S, 1); + rc_sq_wqe->inv_key = + cpu_to_le32(wr->ex.invalidate_rkey); + break; + case IB_WR_REG_MR: + hr_op = HNS_ROCE_V2_WQE_OP_FAST_REG_PMR; + fseg = wqe; + set_frmr_seg(rc_sq_wqe, fseg, reg_wr(wr)); break; case IB_WR_ATOMIC_CMP_AND_SWP: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP); + hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP; + rc_sq_wqe->rkey = + cpu_to_le32(atomic_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(atomic_wr(wr)->remote_addr); break; case IB_WR_ATOMIC_FETCH_AND_ADD: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD); + hr_op = HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD; + rc_sq_wqe->rkey = + cpu_to_le32(atomic_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(atomic_wr(wr)->remote_addr); break; case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP); + hr_op = + HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP; break; case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD); + hr_op = + HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD; 
break; default: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_MASK); + hr_op = HNS_ROCE_V2_WQE_OP_MASK; break; } - wqe += sizeof(struct hns_roce_v2_rc_send_wqe); + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op); + + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + struct hns_roce_v2_wqe_data_seg *dseg; + + dseg = wqe; + set_data_seg_v2(dseg, wr->sg_list); + wqe += sizeof(struct hns_roce_v2_wqe_data_seg); + set_atomic_seg(wqe, atomic_wr(wr)); + roce_set_field(rc_sq_wqe->byte_16, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, + wr->num_sge); + } else if (wr->opcode != IB_WR_REG_MR) { + ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, + wqe, &sge_ind, bad_wr); + if (ret) + goto out; + } - ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, - &sge_ind, bad_wr); - if (ret) - goto out; ind++; } else { dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type); @@ -935,7 +995,24 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev) resp = (struct hns_roce_query_version *)desc.data; hr_dev->hw_rev = le32_to_cpu(resp->rocee_hw_version); - hr_dev->vendor_id = le32_to_cpu(resp->rocee_vendor_id); + hr_dev->vendor_id = hr_dev->pci_dev->vendor; + + return 0; +} + +static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_query_fw_info *resp; + struct hns_roce_cmq_desc desc; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_QUERY_FW_VER, true); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) + return ret; + + resp = (struct hns_roce_query_fw_info *)desc.data; + hr_dev->caps.fw_ver = (u64)(le32_to_cpu(resp->fw_ver)); return 0; } @@ -1158,6 +1235,13 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) ret = hns_roce_cmq_query_hw_info(hr_dev); if (ret) { + dev_err(hr_dev->dev, "Query hardware version fail, ret = %d.\n", + 
ret); + return ret; + } + + ret = hns_roce_query_fw_ver(hr_dev); + if (ret) { dev_err(hr_dev->dev, "Query firmware version fail, ret = %d.\n", ret); return ret; @@ -1185,14 +1269,16 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) return ret; } - hr_dev->vendor_part_id = 0; - hr_dev->sys_image_guid = 0; + + hr_dev->vendor_part_id = hr_dev->pci_dev->device; + hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid); caps->num_qps = HNS_ROCE_V2_MAX_QP_NUM; caps->max_wqes = HNS_ROCE_V2_MAX_WQE_NUM; caps->num_cqs = HNS_ROCE_V2_MAX_CQ_NUM; caps->max_cqes = HNS_ROCE_V2_MAX_CQE_NUM; caps->max_sq_sg = HNS_ROCE_V2_MAX_SQ_SGE_NUM; + caps->max_extend_sg = HNS_ROCE_V2_MAX_EXTEND_SGE_NUM; caps->max_rq_sg = HNS_ROCE_V2_MAX_RQ_SGE_NUM; caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE; caps->num_uars = HNS_ROCE_V2_UAR_NUM; @@ -1222,6 +1308,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->reserved_mrws = 1; caps->reserved_uars = 0; caps->reserved_cqs = 0; + caps->reserved_qps = HNS_ROCE_V2_RSV_QPS; caps->qpc_ba_pg_sz = 0; caps->qpc_buf_pg_sz = 0; @@ -1255,6 +1342,11 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) HNS_ROCE_CAP_FLAG_RQ_INLINE | HNS_ROCE_CAP_FLAG_RECORD_DB | HNS_ROCE_CAP_FLAG_SQ_RECORD_DB; + + if (hr_dev->pci_dev->revision == 0x21) + caps->flags |= HNS_ROCE_CAP_FLAG_MW | + HNS_ROCE_CAP_FLAG_FRMR; + caps->pkey_table_len[0] = 1; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM; @@ -1262,6 +1354,9 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->local_ca_ack_delay = 0; caps->max_mtu = IB_MTU_4096; + if (hr_dev->pci_dev->revision == 0x21) + caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC; + ret = hns_roce_v2_set_bt(hr_dev); if (ret) dev_err(hr_dev->dev, "Configure bt attribute fail, ret = %d.\n", @@ -1690,10 +1785,11 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 0); 
roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 0); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_BIND_EN_S, (mr->access & IB_ACCESS_MW_BIND ? 1 : 0)); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S, 0); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S, + mr->access & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RR_EN_S, (mr->access & IB_ACCESS_REMOTE_READ ? 1 : 0)); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RW_EN_S, @@ -1817,6 +1913,88 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, return 0; } +static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) +{ + struct hns_roce_v2_mpt_entry *mpt_entry; + + mpt_entry = mb_buf; + memset(mpt_entry, 0, sizeof(*mpt_entry)); + + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, + V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE); + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M, + V2_MPT_BYTE_4_PBL_HOP_NUM_S, 1); + roce_set_field(mpt_entry->byte_4_pd_hop_st, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, + mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET); + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, + V2_MPT_BYTE_4_PD_S, mr->pd); + + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 1); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1); + + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_FRE_S, 1); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 0); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1); + + mpt_entry->pbl_size = 
cpu_to_le32(mr->pbl_size); + + mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3)); + roce_set_field(mpt_entry->byte_48_mode_ba, V2_MPT_BYTE_48_PBL_BA_H_M, + V2_MPT_BYTE_48_PBL_BA_H_S, + upper_32_bits(mr->pbl_ba >> 3)); + + roce_set_field(mpt_entry->byte_64_buf_pa1, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, + mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + + return 0; +} + +static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw) +{ + struct hns_roce_v2_mpt_entry *mpt_entry; + + mpt_entry = mb_buf; + memset(mpt_entry, 0, sizeof(*mpt_entry)); + + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, + V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE); + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, + V2_MPT_BYTE_4_PD_S, mw->pdn); + roce_set_field(mpt_entry->byte_4_pd_hop_st, + V2_MPT_BYTE_4_PBL_HOP_NUM_M, + V2_MPT_BYTE_4_PBL_HOP_NUM_S, + mw->pbl_hop_num == HNS_ROCE_HOP_NUM_0 ? + 0 : mw->pbl_hop_num); + roce_set_field(mpt_entry->byte_4_pd_hop_st, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, + mw->pbl_ba_pg_sz + PG_SHIFT_OFFSET); + + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1); + + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 1); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BQP_S, + mw->ibmw.type == IB_MW_TYPE_1 ? 
0 : 1); + + roce_set_field(mpt_entry->byte_64_buf_pa1, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, + mw->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + + mpt_entry->lkey = cpu_to_le32(mw->rkey); + + return 0; +} + static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n) { return hns_roce_buf_offset(&hr_cq->hr_buf.hr_buf, @@ -2274,6 +2452,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, wc->src_qp = (u8)roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_RMT_QPN_M, V2_CQE_BYTE_32_RMT_QPN_S); + wc->slid = 0; wc->wc_flags |= (roce_get_bit(cqe->byte_32, V2_CQE_BYTE_32_GRH_S) ? IB_WC_GRH : 0); @@ -2287,7 +2466,14 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, wc->smac[5] = roce_get_field(cqe->byte_28, V2_CQE_BYTE_28_SMAC_5_M, V2_CQE_BYTE_28_SMAC_5_S); - wc->vlan_id = 0xffff; + if (roce_get_bit(cqe->byte_28, V2_CQE_BYTE_28_VID_VLD_S)) { + wc->vlan_id = (u16)roce_get_field(cqe->byte_28, + V2_CQE_BYTE_28_VID_M, + V2_CQE_BYTE_28_VID_S); + } else { + wc->vlan_id = 0xffff; + } + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); wc->network_hdr_type = roce_get_field(cqe->byte_28, V2_CQE_BYTE_28_PORT_TYPE_M, @@ -2589,21 +2775,16 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_TX_ERR_S, 0); roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_RX_ERR_S, 0); - roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_MAPID_M, - V2_QPC_BYTE_60_MAPID_S, 0); + roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_TEMPID_M, + V2_QPC_BYTE_60_TEMPID_S, 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, - V2_QPC_BYTE_60_INNER_MAP_IND_S, 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_MAP_IND_S, - 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_RQ_MAP_IND_S, - 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_EXT_MAP_IND_S, - 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_RLS_IND_S, - 0); - 
roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_EXT_IND_S, - 0); + roce_set_field(qpc_mask->byte_60_qpst_tempid, + V2_QPC_BYTE_60_SCC_TOKEN_M, V2_QPC_BYTE_60_SCC_TOKEN_S, + 0); + roce_set_bit(qpc_mask->byte_60_qpst_tempid, + V2_QPC_BYTE_60_SQ_DB_DOING_S, 0); + roce_set_bit(qpc_mask->byte_60_qpst_tempid, + V2_QPC_BYTE_60_RQ_DB_DOING_S, 0); roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0); roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0); @@ -2685,7 +2866,8 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M, V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0); - roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RSVD_RAQ_MAP_S, 0); + roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S, + 0); roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S, 0); roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_TAIL_M, @@ -2694,8 +2876,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M, V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S, 0); - roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S, - 0); roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_CREDIT_M, V2_QPC_BYTE_144_RAQ_CREDIT_S, 0); roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RESP_RTY_FLG_S, 0); @@ -2721,14 +2901,12 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_160_SQ_CONSUMER_IDX_M, V2_QPC_BYTE_160_SQ_CONSUMER_IDX_S, 0); - roce_set_field(context->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, - ilog2((unsigned int)hr_qp->sq.wqe_cnt)); - roce_set_field(qpc_mask->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0); - + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S, 0); + 
roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S, 0); + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S, 0); roce_set_bit(qpc_mask->byte_168_irrl_idx, V2_QPC_BYTE_168_MSG_RTY_LP_FLG_S, 0); roce_set_bit(qpc_mask->byte_168_irrl_idx, @@ -2746,6 +2924,9 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_MSG_RNR_FLG_S, 0); + roce_set_bit(context->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 1); + roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 0); + roce_set_field(qpc_mask->byte_176_msg_pktn, V2_QPC_BYTE_176_MSG_USE_PKTN_M, V2_QPC_BYTE_176_MSG_USE_PKTN_S, 0); @@ -2790,6 +2971,13 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_232_IRRL_SGE_IDX_M, V2_QPC_BYTE_232_IRRL_SGE_IDX_S, 0); + roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_SO_LP_VLD_S, + 0); + roce_set_bit(qpc_mask->byte_232_irrl_sge, + V2_QPC_BYTE_232_FENCE_LP_VLD_S, 0); + roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_IRRL_LP_VLD_S, + 0); + qpc_mask->irrl_cur_sge_offset = 0; roce_set_field(qpc_mask->byte_240_irrl_tail, @@ -2955,13 +3143,6 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S, 0); } - roce_set_field(context->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, - ilog2((unsigned int)hr_qp->sq.wqe_cnt)); - roce_set_field(qpc_mask->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0); } static int modify_qp_init_to_rtr(struct ib_qp *ibqp, @@ -3271,13 +3452,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, * we should set all bits of the relevant fields in context mask to * 0 at the same time, else set them to 0x1. 
*/ - roce_set_field(context->byte_60_qpst_mapid, - V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M, - V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, attr->retry_cnt); - roce_set_field(qpc_mask->byte_60_qpst_mapid, - V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M, - V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, 0); - context->sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT); roce_set_field(context->byte_168_irrl_idx, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, @@ -3538,6 +3712,17 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, memcpy(src_mac, gid_attr->ndev->dev_addr, ETH_ALEN); } + if (is_vlan_dev(gid_attr->ndev)) { + roce_set_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_RQ_VLAN_EN_S, 1); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, + V2_QPC_BYTE_76_RQ_VLAN_EN_S, 0); + roce_set_bit(context->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_VLAN_EN_S, 1); + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_VLAN_EN_S, 0); + } + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M, V2_QPC_BYTE_24_VLAN_ID_S, vlan); @@ -3584,8 +3769,15 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, V2_QPC_BYTE_24_HOP_LIMIT_M, V2_QPC_BYTE_24_HOP_LIMIT_S, 0); - roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, - V2_QPC_BYTE_24_TC_S, grh->traffic_class); + if (hr_dev->pci_dev->revision == 0x21 && + gid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + roce_set_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S, + grh->traffic_class >> 2); + else + roce_set_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S, + grh->traffic_class); roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S, 0); roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M, @@ -3606,9 +3798,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask); /* Every status migrate must change state */ - roce_set_field(context->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M, + 
roce_set_field(context->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S, new_state); - roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M, + roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S, 0); /* SW pass context to HW */ @@ -3728,7 +3920,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, goto out; } - state = roce_get_field(context->byte_60_qpst_mapid, + state = roce_get_field(context->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S); tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state); if (tmp_qp_state == -1) { @@ -3995,13 +4187,103 @@ static void hns_roce_irq_work_handle(struct work_struct *work) { struct hns_roce_work *irq_work = container_of(work, struct hns_roce_work, work); + struct device *dev = irq_work->hr_dev->dev; u32 qpn = irq_work->qpn; + u32 cqn = irq_work->cqn; switch (irq_work->event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + dev_info(dev, "Path migrated succeeded.\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + dev_warn(dev, "Path migration failed.\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + dev_info(dev, "Communication established.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + dev_warn(dev, "Send queue drained.\n"); + break; case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: + dev_err(dev, "Local work queue catastrophic error.\n"); + hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); + switch (irq_work->sub_type) { + case HNS_ROCE_LWQCE_QPC_ERROR: + dev_err(dev, "QP %d, QPC error.\n", qpn); + break; + case HNS_ROCE_LWQCE_MTU_ERROR: + dev_err(dev, "QP %d, MTU error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: + dev_err(dev, "QP %d, WQE BA addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: + dev_err(dev, "QP %d, WQE addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: + dev_err(dev, "QP %d, WQE shift error.\n", qpn); + break; + 
default: + dev_err(dev, "Unhandled sub_event type %d.\n", + irq_work->sub_type); + break; + } + break; case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + dev_err(dev, "Invalid request local work queue error.\n"); + hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); + break; case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + dev_err(dev, "Local access violation work queue error.\n"); hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); + switch (irq_work->sub_type) { + case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: + dev_err(dev, "QP %d, R_key violation.\n", qpn); + break; + case HNS_ROCE_LAVWQE_LENGTH_ERROR: + dev_err(dev, "QP %d, length error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_VA_ERROR: + dev_err(dev, "QP %d, VA error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_PD_ERROR: + dev_err(dev, "QP %d, PD error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_RW_ACC_ERROR: + dev_err(dev, "QP %d, rw acc error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: + dev_err(dev, "QP %d, key state error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: + dev_err(dev, "QP %d, MR operation error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", + irq_work->sub_type); + break; + } + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + dev_warn(dev, "SRQ limit reach.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + dev_warn(dev, "SRQ last wqe reach.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + dev_err(dev, "SRQ catas error.\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + dev_err(dev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + dev_warn(dev, "CQ 0x%x overflow\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + dev_warn(dev, "DB overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_FLR: + dev_warn(dev, "Function level reset.\n"); break; default: break; @@ -4011,7 +4293,8 @@ static void hns_roce_irq_work_handle(struct work_struct *work) } static void 
hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq, u32 qpn) + struct hns_roce_eq *eq, + u32 qpn, u32 cqn) { struct hns_roce_work *irq_work; @@ -4022,6 +4305,7 @@ static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev, INIT_WORK(&(irq_work->work), hns_roce_irq_work_handle); irq_work->hr_dev = hr_dev; irq_work->qpn = qpn; + irq_work->cqn = cqn; irq_work->event_type = eq->event_type; irq_work->sub_type = eq->sub_type; queue_work(hr_dev->irq_workq, &(irq_work->work)); @@ -4058,124 +4342,6 @@ static void set_eq_cons_index_v2(struct hns_roce_eq *eq) hns_roce_write64_k(doorbell, eq->doorbell); } -static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - u32 qpn) -{ - struct device *dev = hr_dev->dev; - int sub_type; - - dev_warn(dev, "Local work queue catastrophic error.\n"); - sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, - HNS_ROCE_V2_AEQE_SUB_TYPE_S); - switch (sub_type) { - case HNS_ROCE_LWQCE_QPC_ERROR: - dev_warn(dev, "QP %d, QPC error.\n", qpn); - break; - case HNS_ROCE_LWQCE_MTU_ERROR: - dev_warn(dev, "QP %d, MTU error.\n", qpn); - break; - case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: - dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); - break; - case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: - dev_warn(dev, "QP %d, WQE addr error.\n", qpn); - break; - case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: - dev_warn(dev, "QP %d, WQE shift error.\n", qpn); - break; - default: - dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); - break; - } -} - -static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, u32 qpn) -{ - struct device *dev = hr_dev->dev; - int sub_type; - - dev_warn(dev, "Local access violation work queue error.\n"); - sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, - HNS_ROCE_V2_AEQE_SUB_TYPE_S); - switch (sub_type) { - case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: - dev_warn(dev, "QP %d, R_key 
violation.\n", qpn); - break; - case HNS_ROCE_LAVWQE_LENGTH_ERROR: - dev_warn(dev, "QP %d, length error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_VA_ERROR: - dev_warn(dev, "QP %d, VA error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_PD_ERROR: - dev_err(dev, "QP %d, PD error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_RW_ACC_ERROR: - dev_warn(dev, "QP %d, rw acc error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: - dev_warn(dev, "QP %d, key state error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: - dev_warn(dev, "QP %d, MR operation error.\n", qpn); - break; - default: - dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); - break; - } -} - -static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int event_type, u32 qpn) -{ - struct device *dev = hr_dev->dev; - - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_COMM_EST: - dev_warn(dev, "Communication established.\n"); - break; - case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: - dev_warn(dev, "Send queue drained.\n"); - break; - case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - hns_roce_v2_wq_catas_err_handle(hr_dev, aeqe, qpn); - break; - case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: - dev_warn(dev, "Invalid request local work queue error.\n"); - break; - case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_v2_local_wq_access_err_handle(hr_dev, aeqe, qpn); - break; - default: - break; - } - - hns_roce_qp_event(hr_dev, qpn, event_type); -} - -static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int event_type, u32 cqn) -{ - struct device *dev = hr_dev->dev; - - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - dev_warn(dev, "CQ 0x%x access err.\n", cqn); - break; - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - dev_warn(dev, "CQ 0x%x overflow\n", cqn); - break; - default: - break; - } - - hns_roce_cq_event(hr_dev, cqn, event_type); -} - static struct hns_roce_aeqe *get_aeqe_v2(struct 
hns_roce_eq *eq, u32 entry) { u32 buf_chk_sz; @@ -4251,31 +4417,23 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, switch (event_type) { case HNS_ROCE_EVENT_TYPE_PATH_MIG: - dev_warn(dev, "Path migrated succeeded.\n"); - break; case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: - dev_warn(dev, "Path migration failed.\n"); - break; case HNS_ROCE_EVENT_TYPE_COMM_EST: case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type, - qpn); + hns_roce_qp_event(hr_dev, qpn, event_type); break; case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: - dev_warn(dev, "SRQ not support.\n"); break; case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type, - cqn); + hns_roce_cq_event(hr_dev, cqn, event_type); break; case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: - dev_warn(dev, "DB overflow.\n"); break; case HNS_ROCE_EVENT_TYPE_MB: hns_roce_cmd_event(hr_dev, @@ -4284,10 +4442,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, le64_to_cpu(aeqe->event.cmd.out_param)); break; case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: - dev_warn(dev, "CEQ overflow.\n"); break; case HNS_ROCE_EVENT_TYPE_FLR: - dev_warn(dev, "Function level reset.\n"); break; default: dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n", @@ -4304,7 +4460,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, dev_warn(dev, "cons_index overflow, set back to 0.\n"); eq->cons_index = 0; } - hns_roce_v2_init_irq_work(hr_dev, eq, qpn); + hns_roce_v2_init_irq_work(hr_dev, eq, qpn, cqn); } set_eq_cons_index_v2(eq); @@ -5125,6 +5281,7 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) create_singlethread_workqueue("hns_roce_irq_workqueue"); if (!hr_dev->irq_workq) { dev_err(dev, 
"Create irq workqueue failed!\n"); + ret = -ENOMEM; goto err_request_irq_fail; } @@ -5195,6 +5352,8 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .set_mac = hns_roce_v2_set_mac, .write_mtpt = hns_roce_v2_write_mtpt, .rereg_write_mtpt = hns_roce_v2_rereg_write_mtpt, + .frmr_write_mtpt = hns_roce_v2_frmr_write_mtpt, + .mw_write_mtpt = hns_roce_v2_mw_write_mtpt, .write_cqc = hns_roce_v2_write_cqc, .set_hem = hns_roce_v2_set_hem, .clear_hem = hns_roce_v2_clear_hem, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 14aa308befef..8bc820635bbd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -50,6 +50,7 @@ #define HNS_ROCE_V2_MAX_CQE_NUM 0x10000 #define HNS_ROCE_V2_MAX_RQ_SGE_NUM 0x100 #define HNS_ROCE_V2_MAX_SQ_SGE_NUM 0xff +#define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM 0x200000 #define HNS_ROCE_V2_MAX_SQ_INLINE 0x20 #define HNS_ROCE_V2_UAR_NUM 256 #define HNS_ROCE_V2_PHY_UAR_NUM 1 @@ -78,6 +79,7 @@ #define HNS_ROCE_INVALID_LKEY 0x100 #define HNS_ROCE_CMQ_TX_TIMEOUT 30000 #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2 +#define HNS_ROCE_V2_RSV_QPS 8 #define HNS_ROCE_CONTEXT_HOP_NUM 1 #define HNS_ROCE_MTT_HOP_NUM 1 @@ -201,6 +203,7 @@ enum { /* CMQ command */ enum hns_roce_opcode_type { + HNS_QUERY_FW_VER = 0x0001, HNS_ROCE_OPC_QUERY_HW_VER = 0x8000, HNS_ROCE_OPC_CFG_GLOBAL_PARAM = 0x8001, HNS_ROCE_OPC_ALLOC_PF_RES = 0x8004, @@ -324,6 +327,7 @@ struct hns_roce_v2_cq_context { enum{ V2_MPT_ST_VALID = 0x1, + V2_MPT_ST_FREE = 0x2, }; enum hns_roce_v2_qp_state { @@ -350,7 +354,7 @@ struct hns_roce_v2_qp_context { __le32 dmac; __le32 byte_52_udpspn_dmac; __le32 byte_56_dqpn_err; - __le32 byte_60_qpst_mapid; + __le32 byte_60_qpst_tempid; __le32 qkey_xrcd; __le32 byte_68_rq_db; __le32 rq_db_record_addr; @@ -492,26 +496,15 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_56_LP_PKTN_INI_S 28 #define V2_QPC_BYTE_56_LP_PKTN_INI_M GENMASK(31, 28) -#define 
V2_QPC_BYTE_60_MAPID_S 0 -#define V2_QPC_BYTE_60_MAPID_M GENMASK(12, 0) +#define V2_QPC_BYTE_60_TEMPID_S 0 +#define V2_QPC_BYTE_60_TEMPID_M GENMASK(7, 0) -#define V2_QPC_BYTE_60_INNER_MAP_IND_S 13 +#define V2_QPC_BYTE_60_SCC_TOKEN_S 8 +#define V2_QPC_BYTE_60_SCC_TOKEN_M GENMASK(26, 8) -#define V2_QPC_BYTE_60_SQ_MAP_IND_S 14 +#define V2_QPC_BYTE_60_SQ_DB_DOING_S 27 -#define V2_QPC_BYTE_60_RQ_MAP_IND_S 15 - -#define V2_QPC_BYTE_60_TEMPID_S 16 -#define V2_QPC_BYTE_60_TEMPID_M GENMASK(22, 16) - -#define V2_QPC_BYTE_60_EXT_MAP_IND_S 23 - -#define V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S 24 -#define V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M GENMASK(26, 24) - -#define V2_QPC_BYTE_60_SQ_RLS_IND_S 27 - -#define V2_QPC_BYTE_60_SQ_EXT_IND_S 28 +#define V2_QPC_BYTE_60_RQ_DB_DOING_S 28 #define V2_QPC_BYTE_60_QP_ST_S 29 #define V2_QPC_BYTE_60_QP_ST_M GENMASK(31, 29) @@ -534,6 +527,7 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_76_RQIE_S 28 +#define V2_QPC_BYTE_76_RQ_VLAN_EN_S 30 #define V2_QPC_BYTE_80_RX_CQN_S 0 #define V2_QPC_BYTE_80_RX_CQN_M GENMASK(23, 0) @@ -588,7 +582,7 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_140_RR_MAX_S 12 #define V2_QPC_BYTE_140_RR_MAX_M GENMASK(14, 12) -#define V2_QPC_BYTE_140_RSVD_RAQ_MAP_S 15 +#define V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S 15 #define V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S 16 #define V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M GENMASK(23, 16) @@ -599,8 +593,6 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S 0 #define V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M GENMASK(23, 0) -#define V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S 24 - #define V2_QPC_BYTE_144_RAQ_CREDIT_S 25 #define V2_QPC_BYTE_144_RAQ_CREDIT_M GENMASK(29, 25) @@ -637,9 +629,10 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_168_LP_SGEN_INI_S 22 #define V2_QPC_BYTE_168_LP_SGEN_INI_M GENMASK(23, 22) -#define V2_QPC_BYTE_168_SQ_SHIFT_BAK_S 24 -#define V2_QPC_BYTE_168_SQ_SHIFT_BAK_M GENMASK(27, 24) - +#define V2_QPC_BYTE_168_SQ_VLAN_EN_S 24 +#define 
V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S 25 +#define V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S 26 +#define V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S 27 #define V2_QPC_BYTE_168_IRRL_IDX_LSB_S 28 #define V2_QPC_BYTE_168_IRRL_IDX_LSB_M GENMASK(31, 28) @@ -725,6 +718,10 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_232_IRRL_SGE_IDX_S 20 #define V2_QPC_BYTE_232_IRRL_SGE_IDX_M GENMASK(28, 20) +#define V2_QPC_BYTE_232_SO_LP_VLD_S 29 +#define V2_QPC_BYTE_232_FENCE_LP_VLD_S 30 +#define V2_QPC_BYTE_232_IRRL_LP_VLD_S 31 + #define V2_QPC_BYTE_240_IRRL_TAIL_REAL_S 0 #define V2_QPC_BYTE_240_IRRL_TAIL_REAL_M GENMASK(7, 0) @@ -743,6 +740,9 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_244_RNR_CNT_S 27 #define V2_QPC_BYTE_244_RNR_CNT_M GENMASK(29, 27) +#define V2_QPC_BYTE_244_LCL_OP_FLG_S 30 +#define V2_QPC_BYTE_244_IRRL_RD_FLG_S 31 + #define V2_QPC_BYTE_248_IRRL_PSN_S 0 #define V2_QPC_BYTE_248_IRRL_PSN_M GENMASK(23, 0) @@ -818,6 +818,11 @@ struct hns_roce_v2_cqe { #define V2_CQE_BYTE_28_PORT_TYPE_S 16 #define V2_CQE_BYTE_28_PORT_TYPE_M GENMASK(17, 16) +#define V2_CQE_BYTE_28_VID_S 18 +#define V2_CQE_BYTE_28_VID_M GENMASK(29, 18) + +#define V2_CQE_BYTE_28_VID_VLD_S 30 + #define V2_CQE_BYTE_32_RMT_QPN_S 0 #define V2_CQE_BYTE_32_RMT_QPN_M GENMASK(23, 0) @@ -878,8 +883,19 @@ struct hns_roce_v2_mpt_entry { #define V2_MPT_BYTE_8_LW_EN_S 7 +#define V2_MPT_BYTE_8_MW_CNT_S 8 +#define V2_MPT_BYTE_8_MW_CNT_M GENMASK(31, 8) + +#define V2_MPT_BYTE_12_FRE_S 0 + #define V2_MPT_BYTE_12_PA_S 1 +#define V2_MPT_BYTE_12_MR_MW_S 4 + +#define V2_MPT_BYTE_12_BPD_S 5 + +#define V2_MPT_BYTE_12_BQP_S 6 + #define V2_MPT_BYTE_12_INNER_PA_VLD_S 7 #define V2_MPT_BYTE_12_MW_BIND_QPN_S 8 @@ -988,6 +1004,8 @@ struct hns_roce_v2_ud_send_wqe { #define V2_UD_SEND_WQE_BYTE_40_PORTN_S 24 #define V2_UD_SEND_WQE_BYTE_40_PORTN_M GENMASK(26, 24) +#define V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S 30 + #define V2_UD_SEND_WQE_BYTE_40_LBI_S 31 #define V2_UD_SEND_WQE_DMAC_0_S 0 @@ -1042,6 +1060,16 @@ struct 
hns_roce_v2_rc_send_wqe { #define V2_RC_SEND_WQE_BYTE_4_INLINE_S 12 +#define V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S 19 + +#define V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S 20 + +#define V2_RC_FRMR_WQE_BYTE_4_RR_S 21 + +#define V2_RC_FRMR_WQE_BYTE_4_RW_S 22 + +#define V2_RC_FRMR_WQE_BYTE_4_LW_S 23 + #define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_S 0 #define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_M GENMASK(23, 0) @@ -1051,6 +1079,16 @@ struct hns_roce_v2_rc_send_wqe { #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0) +struct hns_roce_wqe_frmr_seg { + __le32 pbl_size; + __le32 mode_buf_pg_sz; +}; + +#define V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S 4 +#define V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M GENMASK(7, 4) + +#define V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S 8 + struct hns_roce_v2_wqe_data_seg { __le32 len; __le32 lkey; @@ -1068,6 +1106,11 @@ struct hns_roce_query_version { __le32 rsv[5]; }; +struct hns_roce_query_fw_info { + __le32 fw_ver; + __le32 rsv[5]; +}; + struct hns_roce_cfg_llm_a { __le32 base_addr_l; __le32 base_addr_h; @@ -1564,4 +1607,9 @@ struct hns_roce_eq_context { #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0) +struct hns_roce_wqe_atomic_seg { + __le64 fetchadd_swap_data; + __le64 cmp_data; +}; + #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index c5cae9a38c04..1b3ee514f2ef 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -196,6 +196,7 @@ static int hns_roce_query_device(struct ib_device *ib_dev, memset(props, 0, sizeof(*props)); + props->fw_ver = hr_dev->caps.fw_ver; props->sys_image_guid = cpu_to_be64(hr_dev->sys_image_guid); props->max_mr_size = (u64)(~(0ULL)); props->page_size_cap = hr_dev->caps.page_size_cap; @@ -215,7 +216,8 @@ static int hns_roce_query_device(struct ib_device *ib_dev, props->max_pd = hr_dev->caps.num_pds; props->max_qp_rd_atom = 
hr_dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = hr_dev->caps.max_qp_init_rdma; - props->atomic_cap = IB_ATOMIC_NONE; + props->atomic_cap = hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->max_pkeys = 1; props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay; @@ -344,8 +346,6 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, if (ret) goto error_fail_uar_alloc; - INIT_LIST_HEAD(&context->vma_list); - mutex_init(&context->vma_list_mutex); if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { INIT_LIST_HEAD(&context->page_list); mutex_init(&context->page_mutex); @@ -376,76 +376,34 @@ static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) return 0; } -static void hns_roce_vma_open(struct vm_area_struct *vma) -{ - vma->vm_ops = NULL; -} - -static void hns_roce_vma_close(struct vm_area_struct *vma) -{ - struct hns_roce_vma_data *vma_data; - - vma_data = (struct hns_roce_vma_data *)vma->vm_private_data; - vma_data->vma = NULL; - mutex_lock(vma_data->vma_list_mutex); - list_del(&vma_data->list); - mutex_unlock(vma_data->vma_list_mutex); - kfree(vma_data); -} - -static const struct vm_operations_struct hns_roce_vm_ops = { - .open = hns_roce_vma_open, - .close = hns_roce_vma_close, -}; - -static int hns_roce_set_vma_data(struct vm_area_struct *vma, - struct hns_roce_ucontext *context) -{ - struct list_head *vma_head = &context->vma_list; - struct hns_roce_vma_data *vma_data; - - vma_data = kzalloc(sizeof(*vma_data), GFP_KERNEL); - if (!vma_data) - return -ENOMEM; - - vma_data->vma = vma; - vma_data->vma_list_mutex = &context->vma_list_mutex; - vma->vm_private_data = vma_data; - vma->vm_ops = &hns_roce_vm_ops; - - mutex_lock(&context->vma_list_mutex); - list_add(&vma_data->list, vma_head); - mutex_unlock(&context->vma_list_mutex); - - return 0; -} - static int hns_roce_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct hns_roce_dev *hr_dev = 
to_hr_dev(context->device); - if (((vma->vm_end - vma->vm_start) % PAGE_SIZE) != 0) - return -EINVAL; + switch (vma->vm_pgoff) { + case 0: + return rdma_user_mmap_io(context, vma, + to_hr_ucontext(context)->uar.pfn, + PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); + + /* vm_pgoff: 1 -- TPTR */ + case 1: + if (!hr_dev->tptr_dma_addr || !hr_dev->tptr_size) + return -EINVAL; + /* + * FIXME: using io_remap_pfn_range on the dma address returned + * by dma_alloc_coherent is totally wrong. + */ + return rdma_user_mmap_io(context, vma, + hr_dev->tptr_dma_addr >> PAGE_SHIFT, + hr_dev->tptr_size, + vma->vm_page_prot); - if (vma->vm_pgoff == 0) { - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, - to_hr_ucontext(context)->uar.pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - } else if (vma->vm_pgoff == 1 && hr_dev->tptr_dma_addr && - hr_dev->tptr_size) { - /* vm_pgoff: 1 -- TPTR */ - if (io_remap_pfn_range(vma, vma->vm_start, - hr_dev->tptr_dma_addr >> PAGE_SHIFT, - hr_dev->tptr_size, - vma->vm_page_prot)) - return -EAGAIN; - } else + default: return -EINVAL; - - return hns_roce_set_vma_data(vma, to_hr_ucontext(context)); + } } static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, @@ -471,21 +429,6 @@ static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext) { - struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); - struct hns_roce_vma_data *vma_data, *n; - struct vm_area_struct *vma; - - mutex_lock(&context->vma_list_mutex); - list_for_each_entry_safe(vma_data, n, &context->vma_list, list) { - vma = vma_data->vma; - zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); - - vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); - vma->vm_ops = NULL; - list_del(&vma_data->list); - kfree(vma_data); - } - mutex_unlock(&context->vma_list_mutex); } static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) 
@@ -508,7 +451,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) spin_lock_init(&iboe->lock); ib_dev = &hr_dev->ib_dev; - strlcpy(ib_dev->name, "hns_%d", IB_DEVICE_NAME_MAX); ib_dev->owner = THIS_MODULE; ib_dev->node_type = RDMA_NODE_IB_CA; @@ -584,12 +526,27 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->uverbs_cmd_mask |= (1ULL << IB_USER_VERBS_CMD_REREG_MR); } + /* MW */ + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_MW) { + ib_dev->alloc_mw = hns_roce_alloc_mw; + ib_dev->dealloc_mw = hns_roce_dealloc_mw; + ib_dev->uverbs_cmd_mask |= + (1ULL << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ULL << IB_USER_VERBS_CMD_DEALLOC_MW); + } + + /* FRMR */ + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR) { + ib_dev->alloc_mr = hns_roce_alloc_mr; + ib_dev->map_mr_sg = hns_roce_map_mr_sg; + } + /* OTHERS */ ib_dev->get_port_immutable = hns_roce_port_immutable; ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext; ib_dev->driver_id = RDMA_DRIVER_HNS; - ret = ib_register_device(ib_dev, NULL); + ret = ib_register_device(ib_dev, "hns_%d", NULL); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index eb26a5f6fc58..521ad2aa3a4e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -329,7 +329,7 @@ static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages, u64 bt_idx; u64 size; - mhop_num = hr_dev->caps.pbl_hop_num; + mhop_num = (mr->type == MR_TYPE_FRMR ? 
1 : hr_dev->caps.pbl_hop_num); pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8); @@ -351,7 +351,7 @@ static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages, mr->pbl_size = npages; mr->pbl_ba = mr->pbl_dma_addr; - mr->pbl_hop_num = hr_dev->caps.pbl_hop_num; + mr->pbl_hop_num = mhop_num; mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz; mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz; return 0; @@ -511,7 +511,6 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova, mr->key = hw_index_to_key(index); /* MR key */ if (size == ~0ull) { - mr->type = MR_TYPE_DMA; mr->pbl_buf = NULL; mr->pbl_dma_addr = 0; /* PBL multi-hop addressing parameters */ @@ -522,7 +521,6 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova, mr->pbl_l1_dma_addr = NULL; mr->pbl_l0_dma_addr = 0; } else { - mr->type = MR_TYPE_MR; if (!hr_dev->caps.pbl_hop_num) { mr->pbl_buf = dma_alloc_coherent(dev, npages * 8, &(mr->pbl_dma_addr), @@ -548,9 +546,9 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev, u32 mhop_num; u64 bt_idx; - npages = ib_umem_page_count(mr->umem); + npages = mr->pbl_size; pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); - mhop_num = hr_dev->caps.pbl_hop_num; + mhop_num = (mr->type == MR_TYPE_FRMR) ? 
1 : hr_dev->caps.pbl_hop_num; if (mhop_num == HNS_ROCE_HOP_NUM_0) return; @@ -636,7 +634,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, } if (mr->size != ~0ULL) { - npages = ib_umem_page_count(mr->umem); + if (mr->type == MR_TYPE_MR) + npages = ib_umem_page_count(mr->umem); if (!hr_dev->caps.pbl_hop_num) dma_free_coherent(dev, (unsigned int)(npages * 8), @@ -674,7 +673,10 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, goto err_table; } - ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx); + if (mr->type != MR_TYPE_FRMR) + ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx); + else + ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr); if (ret) { dev_err(dev, "Write mtpt fail!\n"); goto err_page; @@ -855,6 +857,8 @@ struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc) if (mr == NULL) return ERR_PTR(-ENOMEM); + mr->type = MR_TYPE_DMA; + /* Allocate memory region key */ ret = hns_roce_mr_alloc(to_hr_dev(pd->device), to_hr_pd(pd)->pdn, 0, ~0ULL, acc, 0, mr); @@ -1031,6 +1035,8 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } } + mr->type = MR_TYPE_MR; + ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, virt_addr, length, access_flags, n, mr); if (ret) @@ -1201,3 +1207,193 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr) return ret; } + +struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); + struct device *dev = hr_dev->dev; + struct hns_roce_mr *mr; + u64 length; + u32 page_size; + int ret; + + page_size = 1 << (hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT); + length = max_num_sg * page_size; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + if (max_num_sg > HNS_ROCE_FRMR_MAX_PA) { + dev_err(dev, "max_num_sg larger than %d\n", + HNS_ROCE_FRMR_MAX_PA); + return ERR_PTR(-EINVAL); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->type = 
MR_TYPE_FRMR; + + /* Allocate memory region key */ + ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, 0, length, + 0, max_num_sg, mr); + if (ret) + goto err_free; + + ret = hns_roce_mr_enable(hr_dev, mr); + if (ret) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + hns_roce_mr_free(to_hr_dev(pd->device), mr); + +err_free: + kfree(mr); + return ERR_PTR(ret); +} + +static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct hns_roce_mr *mr = to_hr_mr(ibmr); + + mr->pbl_buf[mr->npages++] = cpu_to_le64(addr); + + return 0; +} + +int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct hns_roce_mr *mr = to_hr_mr(ibmr); + + mr->npages = 0; + + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); +} + +static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, + struct hns_roce_mw *mw) +{ + struct device *dev = hr_dev->dev; + int ret; + + if (mw->enabled) { + ret = hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mw->rkey) + & (hr_dev->caps.num_mtpts - 1)); + if (ret) + dev_warn(dev, "MW HW2SW_MPT failed (%d)\n", ret); + + hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table, + key_to_hw_index(mw->rkey)); + } + + hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap, + key_to_hw_index(mw->rkey), BITMAP_NO_RR); +} + +static int hns_roce_mw_enable(struct hns_roce_dev *hr_dev, + struct hns_roce_mw *mw) +{ + struct hns_roce_mr_table *mr_table = &hr_dev->mr_table; + struct hns_roce_cmd_mailbox *mailbox; + struct device *dev = hr_dev->dev; + unsigned long mtpt_idx = key_to_hw_index(mw->rkey); + int ret; + + /* prepare HEM entry memory */ + ret = hns_roce_table_get(hr_dev, &mr_table->mtpt_table, mtpt_idx); + if (ret) + return ret; + + mailbox = hns_roce_alloc_cmd_mailbox(hr_dev); + if (IS_ERR(mailbox)) { + ret = PTR_ERR(mailbox); + goto err_table; + } + + ret = hr_dev->hw->mw_write_mtpt(mailbox->buf, mw); + if (ret) { + 
dev_err(dev, "MW write mtpt fail!\n"); + goto err_page; + } + + ret = hns_roce_sw2hw_mpt(hr_dev, mailbox, + mtpt_idx & (hr_dev->caps.num_mtpts - 1)); + if (ret) { + dev_err(dev, "MW sw2hw_mpt failed (%d)\n", ret); + goto err_page; + } + + mw->enabled = 1; + + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + + return 0; + +err_page: + hns_roce_free_cmd_mailbox(hr_dev, mailbox); + +err_table: + hns_roce_table_put(hr_dev, &mr_table->mtpt_table, mtpt_idx); + + return ret; +} + +struct ib_mw *hns_roce_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type, + struct ib_udata *udata) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ib_pd->device); + struct hns_roce_mw *mw; + unsigned long index = 0; + int ret; + + mw = kmalloc(sizeof(*mw), GFP_KERNEL); + if (!mw) + return ERR_PTR(-ENOMEM); + + /* Allocate a key for mw from bitmap */ + ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index); + if (ret) + goto err_bitmap; + + mw->rkey = hw_index_to_key(index); + + mw->ibmw.rkey = mw->rkey; + mw->ibmw.type = type; + mw->pdn = to_hr_pd(ib_pd)->pdn; + mw->pbl_hop_num = hr_dev->caps.pbl_hop_num; + mw->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz; + mw->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz; + + ret = hns_roce_mw_enable(hr_dev, mw); + if (ret) + goto err_mw; + + return &mw->ibmw; + +err_mw: + hns_roce_mw_free(hr_dev, mw); + +err_bitmap: + kfree(mw); + + return ERR_PTR(ret); +} + +int hns_roce_dealloc_mw(struct ib_mw *ibmw) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(ibmw->device); + struct hns_roce_mw *mw = to_hr_mw(ibmw); + + hns_roce_mw_free(hr_dev, mw); + kfree(mw); + + return 0; +} diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index efb7e961ca65..5ebf481a39d9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -31,6 +31,7 @@ * SOFTWARE. 
*/ +#include <linux/pci.h> #include <linux/platform_device.h> #include <rdma/ib_addr.h> #include <rdma/ib_umem.h> @@ -343,6 +344,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, { u32 roundup_sq_stride = roundup_pow_of_two(hr_dev->caps.max_sq_desc_sz); u8 max_sq_stride = ilog2(roundup_sq_stride); + u32 ex_sge_num; u32 page_size; u32 max_cnt; @@ -372,7 +374,18 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, if (hr_qp->sq.max_gs > 2) hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * (hr_qp->sq.max_gs - 2)); + + if ((hr_qp->sq.max_gs > 2) && (hr_dev->pci_dev->revision == 0x20)) { + if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) { + dev_err(hr_dev->dev, + "The extended sge cnt error! sge_cnt=%d\n", + hr_qp->sge.sge_cnt); + return -EINVAL; + } + } + hr_qp->sge.sge_shift = 4; + ex_sge_num = hr_qp->sge.sge_cnt; /* Get buf size, SQ and RQ are aligned to page_szie */ if (hr_dev->caps.max_sq_sg <= 2) { @@ -386,6 +399,8 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift), PAGE_SIZE); } else { page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + hr_qp->sge.sge_cnt = + max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num); hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift), page_size) + HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt << @@ -394,7 +409,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift), page_size); hr_qp->sq.offset = 0; - if (hr_qp->sge.sge_cnt) { + if (ex_sge_num) { hr_qp->sge.offset = HNS_ROCE_ALOGN_UP( (hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift), @@ -465,6 +480,14 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sge.sge_shift = 4; } + if ((hr_qp->sq.max_gs > 2) && hr_dev->pci_dev->revision == 0x20) { + if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) { + dev_err(dev, "The extended sge cnt error! 
sge_cnt=%d\n", + hr_qp->sge.sge_cnt); + return -EINVAL; + } + } + /* Get buf size, SQ and RQ are aligned to PAGE_SIZE */ page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); hr_qp->sq.offset = 0; @@ -472,6 +495,8 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, page_size); if (hr_dev->caps.max_sq_sg > 2 && hr_qp->sge.sge_cnt) { + hr_qp->sge.sge_cnt = max(page_size/(1 << hr_qp->sge.sge_shift), + (u32)hr_qp->sge.sge_cnt); hr_qp->sge.offset = size; size += HNS_ROCE_ALOGN_UP(hr_qp->sge.sge_cnt << hr_qp->sge.sge_shift, page_size); @@ -952,8 +977,8 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) { dev_err(dev, "ib_modify_qp_is_ok failed\n"); goto out; } @@ -1106,14 +1131,20 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev) { struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; int reserved_from_top = 0; + int reserved_from_bot; int ret; spin_lock_init(&qp_table->lock); INIT_RADIX_TREE(&hr_dev->qp_table_tree, GFP_ATOMIC); - /* A port include two SQP, six port total 12 */ + /* In hw v1, a port include two SQP, six ports total 12 */ + if (hr_dev->caps.max_sq_sg <= 2) + reserved_from_bot = SQP_NUM; + else + reserved_from_bot = hr_dev->caps.reserved_qps; + ret = hns_roce_bitmap_init(&qp_table->bitmap, hr_dev->caps.num_qps, - hr_dev->caps.num_qps - 1, SQP_NUM, + hr_dev->caps.num_qps - 1, reserved_from_bot, reserved_from_top); if (ret) { dev_err(hr_dev->dev, "qp bitmap init failed!error=%d\n", diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 423818a7d333..771eb6bd0785 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1689,7 +1689,7 @@ static enum i40iw_status_code i40iw_add_mqh_6(struct i40iw_device *iwdev, unsigned long flags; 
rtnl_lock(); - for_each_netdev_rcu(&init_net, ip_dev) { + for_each_netdev(&init_net, ip_dev) { if ((((rdma_vlan_dev_vlan_id(ip_dev) < I40IW_NO_VLAN) && (rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev)) || (ip_dev == iwdev->netdev)) && (ip_dev->flags & IFF_UP)) { diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index e2e6c74a7452..102875872bea 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2135,10 +2135,10 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr) } /** - * i40iw_show_rev + * hw_rev_show */ -static ssize_t i40iw_show_rev(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t hw_rev_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct i40iw_ib_device *iwibdev = container_of(dev, struct i40iw_ib_device, @@ -2147,34 +2147,37 @@ static ssize_t i40iw_show_rev(struct device *dev, return sprintf(buf, "%x\n", hw_rev); } +static DEVICE_ATTR_RO(hw_rev); /** - * i40iw_show_hca + * hca_type_show */ -static ssize_t i40iw_show_hca(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t hca_type_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "I40IW\n"); } +static DEVICE_ATTR_RO(hca_type); /** - * i40iw_show_board + * board_id_show */ -static ssize_t i40iw_show_board(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%.*s\n", 32, "I40IW Board ID"); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, i40iw_show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL); +static struct attribute *i40iw_dev_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static 
struct device_attribute *i40iw_dev_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static const struct attribute_group i40iw_attr_group = { + .attrs = i40iw_dev_attributes, }; /** @@ -2752,7 +2755,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev i40iw_pr_err("iwdev == NULL\n"); return NULL; } - strlcpy(iwibdev->ibdev.name, "i40iw%d", IB_DEVICE_NAME_MAX); iwibdev->ibdev.owner = THIS_MODULE; iwdev->iwibdev = iwibdev; iwibdev->iwdev = iwdev; @@ -2851,20 +2853,6 @@ void i40iw_port_ibevent(struct i40iw_device *iwdev) } /** - * i40iw_unregister_rdma_device - unregister of iwarp from IB - * @iwibdev: rdma device ptr - */ -static void i40iw_unregister_rdma_device(struct i40iw_ib_device *iwibdev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i) - device_remove_file(&iwibdev->ibdev.dev, - i40iw_dev_attributes[i]); - ib_unregister_device(&iwibdev->ibdev); -} - -/** * i40iw_destroy_rdma_device - destroy rdma device and free resources * @iwibdev: IB device ptr */ @@ -2873,7 +2861,7 @@ void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev) if (!iwibdev) return; - i40iw_unregister_rdma_device(iwibdev); + ib_unregister_device(&iwibdev->ibdev); kfree(iwibdev->ibdev.iwcm); iwibdev->ibdev.iwcm = NULL; wait_event_timeout(iwibdev->iwdev->close_wq, @@ -2888,32 +2876,19 @@ void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev) */ int i40iw_register_rdma_device(struct i40iw_device *iwdev) { - int i, ret; + int ret; struct i40iw_ib_device *iwibdev; iwdev->iwibdev = i40iw_init_rdma_device(iwdev); if (!iwdev->iwibdev) return -ENOMEM; iwibdev = iwdev->iwibdev; - + rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group); iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; - ret = ib_register_device(&iwibdev->ibdev, NULL); + ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL); if (ret) goto error; - for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i) { - ret = - 
device_create_file(&iwibdev->ibdev.dev, - i40iw_dev_attributes[i]); - if (ret) { - while (i > 0) { - i--; - device_remove_file(&iwibdev->ibdev.dev, i40iw_dev_attributes[i]); - } - ib_unregister_device(&iwibdev->ibdev); - goto error; - } - } return 0; error: kfree(iwdev->iwibdev->ibdev.iwcm); diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig index db4aa13ebae0..d1de3285fd88 100644 --- a/drivers/infiniband/hw/mlx4/Kconfig +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -1,6 +1,7 @@ config MLX4_INFINIBAND tristate "Mellanox ConnectX HCA support" depends on NETDEVICES && ETHERNET && PCI && INET + depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS depends on MAY_USE_DEVLINK select NET_VENDOR_MELLANOX select MLX4_CORE diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index e5466d786bb1..8942f5f7f04d 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -807,15 +807,17 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, int err; struct ib_port_attr pattr; - if (in_wc && in_wc->qp->qp_num) { - pr_debug("received MAD: slid:%d sqpn:%d " - "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", - in_wc->slid, in_wc->src_qp, - in_wc->dlid_path_bits, - in_wc->qp->qp_num, - in_wc->wc_flags, - in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, - be16_to_cpu(in_mad->mad_hdr.attr_id)); + if (in_wc && in_wc->qp) { + pr_debug("received MAD: port:%d slid:%d sqpn:%d " + "dlid_bits:%d dqpn:%d wc_flags:0x%x tid:%016llx cls:%x mtd:%x atr:%x\n", + port_num, + in_wc->slid, in_wc->src_qp, + in_wc->dlid_path_bits, + in_wc->qp->qp_num, + in_wc->wc_flags, + be64_to_cpu(in_mad->mad_hdr.tid), + in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method, + be16_to_cpu(in_mad->mad_hdr.attr_id)); if (in_wc->wc_flags & IB_WC_GRH) { pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n", be64_to_cpu(in_grh->sgid.global.subnet_prefix), diff --git 
a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 0bbeaaae47e0..0def2323459c 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1140,144 +1140,50 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) return 0; } -static void mlx4_ib_vma_open(struct vm_area_struct *area) -{ - /* vma_open is called when a new VMA is created on top of our VMA. - * This is done through either mremap flow or split_vma (usually due - * to mlock, madvise, munmap, etc.). We do not support a clone of the - * vma, as this VMA is strongly hardware related. Therefore we set the - * vm_ops of the newly created/cloned VMA to NULL, to prevent it from - * calling us again and trying to do incorrect actions. We assume that - * the original vma size is exactly a single page that there will be no - * "splitting" operations on. - */ - area->vm_ops = NULL; -} - -static void mlx4_ib_vma_close(struct vm_area_struct *area) -{ - struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data; - - /* It's guaranteed that all VMAs opened on a FD are closed before the - * file itself is closed, therefore no sync is needed with the regular - * closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync - * with accessing the vma as part of mlx4_ib_disassociate_ucontext. - * The close operation is usually called under mm->mmap_sem except when - * process is exiting. The exiting case is handled explicitly as part - * of mlx4_ib_disassociate_ucontext. - */ - mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *) - area->vm_private_data; - - /* set the vma context pointer to null in the mlx4_ib driver's private - * data to protect against a race condition in mlx4_ib_dissassociate_ucontext(). 
- */ - mlx4_ib_vma_priv_data->vma = NULL; -} - -static const struct vm_operations_struct mlx4_ib_vm_ops = { - .open = mlx4_ib_vma_open, - .close = mlx4_ib_vma_close -}; - static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { - int i; - struct vm_area_struct *vma; - struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); - - /* need to protect from a race on closing the vma as part of - * mlx4_ib_vma_close(). - */ - for (i = 0; i < HW_BAR_COUNT; i++) { - vma = context->hw_bar_info[i].vma; - if (!vma) - continue; - - zap_vma_ptes(context->hw_bar_info[i].vma, - context->hw_bar_info[i].vma->vm_start, PAGE_SIZE); - - context->hw_bar_info[i].vma->vm_flags &= - ~(VM_SHARED | VM_MAYSHARE); - /* context going to be destroyed, should not access ops any more */ - context->hw_bar_info[i].vma->vm_ops = NULL; - } -} - -static void mlx4_ib_set_vma_data(struct vm_area_struct *vma, - struct mlx4_ib_vma_private_data *vma_private_data) -{ - vma_private_data->vma = vma; - vma->vm_private_data = vma_private_data; - vma->vm_ops = &mlx4_ib_vm_ops; } static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); - struct mlx4_ib_ucontext *mucontext = to_mucontext(context); - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - if (vma->vm_pgoff == 0) { - /* We prevent double mmaping on same context */ - if (mucontext->hw_bar_info[HW_BAR_DB].vma) - return -EINVAL; - - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - if (io_remap_pfn_range(vma, vma->vm_start, - to_mucontext(context)->uar.pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]); + switch (vma->vm_pgoff) { + case 0: + return rdma_user_mmap_io(context, vma, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); - } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { - /* We prevent double mmaping 
on same context */ - if (mucontext->hw_bar_info[HW_BAR_BF].vma) + case 1: + if (dev->dev->caps.bf_reg_size == 0) return -EINVAL; + return rdma_user_mmap_io( + context, vma, + to_mucontext(context)->uar.pfn + + dev->dev->caps.num_uars, + PAGE_SIZE, pgprot_writecombine(vma->vm_page_prot)); - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - - if (io_remap_pfn_range(vma, vma->vm_start, - to_mucontext(context)->uar.pfn + - dev->dev->caps.num_uars, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]); - - } else if (vma->vm_pgoff == 3) { + case 3: { struct mlx4_clock_params params; int ret; - /* We prevent double mmaping on same context */ - if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma) - return -EINVAL; - ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); - if (ret) return ret; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, - (pci_resource_start(dev->dev->persist->pdev, - params.bar) + - params.offset) - >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx4_ib_set_vma_data(vma, - &mucontext->hw_bar_info[HW_BAR_CLOCK]); - } else { - return -EINVAL; + return rdma_user_mmap_io( + context, vma, + (pci_resource_start(dev->dev->persist->pdev, + params.bar) + + params.offset) >> + PAGE_SHIFT, + PAGE_SIZE, pgprot_noncached(vma->vm_page_prot)); } - return 0; + default: + return -EINVAL; + } } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, @@ -2133,39 +2039,43 @@ out: return err; } -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_rev(struct device *device, struct 
device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_board(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *mlx4_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *mlx4_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static const struct attribute_group mlx4_attr_group = { + .attrs = mlx4_class_attributes, }; struct diag_counter { @@ -2636,7 +2546,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->dev = dev; ibdev->bond_next_port = 0; - strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; @@ -2898,8 +2807,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_alloc_diag_counters(ibdev)) goto err_steer_free_bitmap; + rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group); ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; - if (ib_register_device(&ibdev->ib_dev, NULL)) + if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL)) goto err_diag_counters; if (mlx4_ib_mad_init(ibdev)) @@ -2922,12 +2832,6 @@ static void *mlx4_ib_add(struct 
mlx4_dev *dev) goto err_notif; } - for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { - if (device_create_file(&ibdev->ib_dev.dev, - mlx4_class_attributes[j])) - goto err_notif; - } - ibdev->ib_active = true; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) devlink_port_type_ib_set(mlx4_get_devlink_port(dev, i), diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c index 81ffc007e0a1..d844831179cf 100644 --- a/drivers/infiniband/hw/mlx4/mcg.c +++ b/drivers/infiniband/hw/mlx4/mcg.c @@ -673,7 +673,7 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work) if (!list_empty(&group->pending_list)) req = list_first_entry(&group->pending_list, struct mcast_req, group_list); - if ((method == IB_MGMT_METHOD_GET_RESP)) { + if (method == IB_MGMT_METHOD_GET_RESP) { if (req) { send_reply_to_slave(req->func, group, &req->sa_mad, status); --group->func[req->func].num_pend_reqs; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e10dccc7958f..8850dfc3826d 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -80,16 +80,11 @@ enum hw_bar_type { HW_BAR_COUNT }; -struct mlx4_ib_vma_private_data { - struct vm_area_struct *vma; -}; - struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; struct list_head db_page_list; struct mutex db_page_mutex; - struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; struct list_head wqn_ranges_list; struct mutex wqn_ranges_mutex; /* protect wqn_ranges_list */ }; diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 6dd3cd2c2f80..0711ca1dfb8f 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2629,7 +2629,6 @@ enum { static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; struct mlx4_ib_dev *dev = to_mdev(ibqp->device); 
struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; @@ -2639,13 +2638,8 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (cur_state != new_state || cur_state != IB_QPS_RESET) { - int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - ll = rdma_port_get_link_layer(&dev->ib_dev, port); - } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, ll)) { + attr_mask)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. qp_type %d," " attr_mask 0x%x\n", diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index e219093d2764..752bdd536130 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -818,9 +818,7 @@ int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) if (!mlx4_is_master(dev->dev)) return 0; - dev->iov_parent = - kobject_create_and_add("iov", - kobject_get(dev->ib_dev.ports_parent->parent)); + dev->iov_parent = kobject_create_and_add("iov", &dev->ib_dev.dev.kobj); if (!dev->iov_parent) { ret = -ENOMEM; goto err; @@ -850,7 +848,6 @@ err_add_entries: err_ports: kobject_put(dev->iov_parent); err: - kobject_put(dev->ib_dev.ports_parent->parent); pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret); return ret; } @@ -886,5 +883,4 @@ void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) kobject_put(device->ports_parent); kobject_put(device->iov_parent); kobject_put(device->iov_parent); - kobject_put(device->ib_dev.ports_parent->parent); } diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index c84fef9a8a08..ca060a2e2b36 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -197,3 +197,132 @@ int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void 
*out) return mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); } + +void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {}; + + MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); + MLX5_SET(destroy_tir_in, in, tirn, tirn); + MLX5_SET(destroy_tir_in, in, uid, uid); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0}; + + MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); + MLX5_SET(destroy_tis_in, in, tisn, tisn); + MLX5_SET(destroy_tis_in, in, uid, uid); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {}; + + MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); + MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); + MLX5_SET(destroy_rqt_in, in, uid, uid); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn, + u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0}; + int err; + + MLX5_SET(alloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *tdn = MLX5_GET(alloc_transport_domain_out, out, + transport_domain); + + return err; +} + +void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn, + u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0}; + + MLX5_SET(dealloc_transport_domain_in, in, opcode, + 
MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {}; + + MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); + MLX5_SET(dealloc_pd_in, in, pd, pdn); + MLX5_SET(dealloc_pd_in, in, uid, uid); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {}; + u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {}; + void *gid; + + MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); + MLX5_SET(attach_to_mcg_in, in, qpn, qpn); + MLX5_SET(attach_to_mcg_in, in, uid, uid); + gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {}; + u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {}; + void *gid; + + MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); + MLX5_SET(detach_from_mcg_in, in, qpn, qpn); + MLX5_SET(detach_from_mcg_in, in, uid, uid); + gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {}; + int err; + + MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); + MLX5_SET(alloc_xrcd_in, in, uid, uid); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *xrcdn = 
MLX5_GET(alloc_xrcd_out, out, xrcd); + return err; +} + +int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {}; + + MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); + MLX5_SET(dealloc_xrcd_in, in, uid, uid); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 88cbb1c41703..c03c56455534 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -47,4 +47,18 @@ int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, u64 length, u32 alignment); int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length); +void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); +void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid); +void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid); +void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid); +int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn, + u16 uid); +void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn, + u16 uid); +int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid); +int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid); +int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid); +int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index cca1820802b8..7d769b5538b4 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -874,6 +874,7 @@ static int create_cq_user(struct mlx5_ib_dev 
*dev, struct ib_udata *udata, cq->private_flags |= MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD; } + MLX5_SET(create_cq_in, *cqb, uid, to_mucontext(context)->devx_uid); return 0; err_cqb: @@ -1454,7 +1455,7 @@ ex: return err; } -int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +int mlx5_ib_get_cqe_size(struct ib_cq *ibcq) { struct mlx5_ib_cq *cq; diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 66dc337e49a7..61aab7c0c513 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -19,7 +19,7 @@ #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; - u32 obj_id; + u64 obj_id; u32 dinlen; /* destroy inbox length */ u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; }; @@ -45,13 +45,14 @@ static struct mlx5_ib_ucontext *devx_ufile2uctx(struct ib_uverbs_file *file) return to_mucontext(ib_uverbs_get_ucontext(file)); } -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev) { u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {0}; u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; u64 general_obj_types; void *hdr; int err; + u16 uid; hdr = MLX5_ADDR_OF(create_uctx_in, in, hdr); @@ -60,9 +61,6 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *contex !(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UMEM)) return -EINVAL; - if (!capable(CAP_NET_RAW)) - return -EPERM; - MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_UCTX); @@ -70,19 +68,18 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *contex if (err) return err; - context->devx_uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); - return 0; + uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + return uid; } -void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, - struct 
mlx5_ib_ucontext *context) +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) { u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {0}; u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_UCTX); - MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, context->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, uid); mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); } @@ -109,150 +106,218 @@ bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type) } } +/* + * As the obj_id in the firmware is not globally unique the object type + * must be considered upon checking for a valid object id. + * For that the opcode of the creator command is encoded as part of the obj_id. + */ +static u64 get_enc_obj_id(u16 opcode, u32 obj_id) +{ + return ((u64)opcode << 32) | obj_id; +} + static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in) { u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); - u32 obj_id; + u64 obj_id; switch (opcode) { case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: - obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_GENERAL_OBJECT, + MLX5_GET(general_obj_in_cmd_hdr, in, + obj_id)); break; case MLX5_CMD_OP_QUERY_MKEY: - obj_id = MLX5_GET(query_mkey_in, in, mkey_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_MKEY, + MLX5_GET(query_mkey_in, in, + mkey_index)); break; case MLX5_CMD_OP_QUERY_CQ: - obj_id = MLX5_GET(query_cq_in, in, cqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ, + MLX5_GET(query_cq_in, in, cqn)); break; case MLX5_CMD_OP_MODIFY_CQ: - obj_id = MLX5_GET(modify_cq_in, in, cqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ, + MLX5_GET(modify_cq_in, in, cqn)); break; case MLX5_CMD_OP_QUERY_SQ: - obj_id = MLX5_GET(query_sq_in, in, sqn); + obj_id = 
get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ, + MLX5_GET(query_sq_in, in, sqn)); break; case MLX5_CMD_OP_MODIFY_SQ: - obj_id = MLX5_GET(modify_sq_in, in, sqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ, + MLX5_GET(modify_sq_in, in, sqn)); break; case MLX5_CMD_OP_QUERY_RQ: - obj_id = MLX5_GET(query_rq_in, in, rqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(query_rq_in, in, rqn)); break; case MLX5_CMD_OP_MODIFY_RQ: - obj_id = MLX5_GET(modify_rq_in, in, rqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(modify_rq_in, in, rqn)); break; case MLX5_CMD_OP_QUERY_RMP: - obj_id = MLX5_GET(query_rmp_in, in, rmpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RMP, + MLX5_GET(query_rmp_in, in, rmpn)); break; case MLX5_CMD_OP_MODIFY_RMP: - obj_id = MLX5_GET(modify_rmp_in, in, rmpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RMP, + MLX5_GET(modify_rmp_in, in, rmpn)); break; case MLX5_CMD_OP_QUERY_RQT: - obj_id = MLX5_GET(query_rqt_in, in, rqtn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT, + MLX5_GET(query_rqt_in, in, rqtn)); break; case MLX5_CMD_OP_MODIFY_RQT: - obj_id = MLX5_GET(modify_rqt_in, in, rqtn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT, + MLX5_GET(modify_rqt_in, in, rqtn)); break; case MLX5_CMD_OP_QUERY_TIR: - obj_id = MLX5_GET(query_tir_in, in, tirn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR, + MLX5_GET(query_tir_in, in, tirn)); break; case MLX5_CMD_OP_MODIFY_TIR: - obj_id = MLX5_GET(modify_tir_in, in, tirn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR, + MLX5_GET(modify_tir_in, in, tirn)); break; case MLX5_CMD_OP_QUERY_TIS: - obj_id = MLX5_GET(query_tis_in, in, tisn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS, + MLX5_GET(query_tis_in, in, tisn)); break; case MLX5_CMD_OP_MODIFY_TIS: - obj_id = MLX5_GET(modify_tis_in, in, tisn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS, + MLX5_GET(modify_tis_in, in, tisn)); break; case MLX5_CMD_OP_QUERY_FLOW_TABLE: - obj_id = MLX5_GET(query_flow_table_in, in, 
table_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_TABLE, + MLX5_GET(query_flow_table_in, in, + table_id)); break; case MLX5_CMD_OP_MODIFY_FLOW_TABLE: - obj_id = MLX5_GET(modify_flow_table_in, in, table_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_TABLE, + MLX5_GET(modify_flow_table_in, in, + table_id)); break; case MLX5_CMD_OP_QUERY_FLOW_GROUP: - obj_id = MLX5_GET(query_flow_group_in, in, group_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_GROUP, + MLX5_GET(query_flow_group_in, in, + group_id)); break; case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: - obj_id = MLX5_GET(query_fte_in, in, flow_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY, + MLX5_GET(query_fte_in, in, + flow_index)); break; case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: - obj_id = MLX5_GET(set_fte_in, in, flow_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY, + MLX5_GET(set_fte_in, in, flow_index)); break; case MLX5_CMD_OP_QUERY_Q_COUNTER: - obj_id = MLX5_GET(query_q_counter_in, in, counter_set_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_Q_COUNTER, + MLX5_GET(query_q_counter_in, in, + counter_set_id)); break; case MLX5_CMD_OP_QUERY_FLOW_COUNTER: - obj_id = MLX5_GET(query_flow_counter_in, in, flow_counter_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_FLOW_COUNTER, + MLX5_GET(query_flow_counter_in, in, + flow_counter_id)); break; case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT: - obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT, + MLX5_GET(general_obj_in_cmd_hdr, in, + obj_id)); break; case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: - obj_id = MLX5_GET(query_scheduling_element_in, in, - scheduling_element_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT, + MLX5_GET(query_scheduling_element_in, + in, scheduling_element_id)); break; case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: - obj_id = MLX5_GET(modify_scheduling_element_in, in, - scheduling_element_id); + 
obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT, + MLX5_GET(modify_scheduling_element_in, + in, scheduling_element_id)); break; case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: - obj_id = MLX5_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port); + obj_id = get_enc_obj_id(MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT, + MLX5_GET(add_vxlan_udp_dport_in, in, + vxlan_udp_port)); break; case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: - obj_id = MLX5_GET(query_l2_table_entry_in, in, table_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_L2_TABLE_ENTRY, + MLX5_GET(query_l2_table_entry_in, in, + table_index)); break; case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: - obj_id = MLX5_GET(set_l2_table_entry_in, in, table_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_L2_TABLE_ENTRY, + MLX5_GET(set_l2_table_entry_in, in, + table_index)); break; case MLX5_CMD_OP_QUERY_QP: - obj_id = MLX5_GET(query_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(query_qp_in, in, qpn)); break; case MLX5_CMD_OP_RST2INIT_QP: - obj_id = MLX5_GET(rst2init_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rst2init_qp_in, in, qpn)); break; case MLX5_CMD_OP_INIT2RTR_QP: - obj_id = MLX5_GET(init2rtr_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(init2rtr_qp_in, in, qpn)); break; case MLX5_CMD_OP_RTR2RTS_QP: - obj_id = MLX5_GET(rtr2rts_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rtr2rts_qp_in, in, qpn)); break; case MLX5_CMD_OP_RTS2RTS_QP: - obj_id = MLX5_GET(rts2rts_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rts2rts_qp_in, in, qpn)); break; case MLX5_CMD_OP_SQERR2RTS_QP: - obj_id = MLX5_GET(sqerr2rts_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(sqerr2rts_qp_in, in, qpn)); break; case MLX5_CMD_OP_2ERR_QP: - obj_id = MLX5_GET(qp_2err_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(qp_2err_in, in, qpn)); break; case 
MLX5_CMD_OP_2RST_QP: - obj_id = MLX5_GET(qp_2rst_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(qp_2rst_in, in, qpn)); break; case MLX5_CMD_OP_QUERY_DCT: - obj_id = MLX5_GET(query_dct_in, in, dctn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, + MLX5_GET(query_dct_in, in, dctn)); break; case MLX5_CMD_OP_QUERY_XRQ: - obj_id = MLX5_GET(query_xrq_in, in, xrqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ, + MLX5_GET(query_xrq_in, in, xrqn)); break; case MLX5_CMD_OP_QUERY_XRC_SRQ: - obj_id = MLX5_GET(query_xrc_srq_in, in, xrc_srqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRC_SRQ, + MLX5_GET(query_xrc_srq_in, in, + xrc_srqn)); break; case MLX5_CMD_OP_ARM_XRC_SRQ: - obj_id = MLX5_GET(arm_xrc_srq_in, in, xrc_srqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRC_SRQ, + MLX5_GET(arm_xrc_srq_in, in, xrc_srqn)); break; case MLX5_CMD_OP_QUERY_SRQ: - obj_id = MLX5_GET(query_srq_in, in, srqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SRQ, + MLX5_GET(query_srq_in, in, srqn)); break; case MLX5_CMD_OP_ARM_RQ: - obj_id = MLX5_GET(arm_rq_in, in, srq_number); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(arm_rq_in, in, srq_number)); break; case MLX5_CMD_OP_DRAIN_DCT: case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: - obj_id = MLX5_GET(drain_dct_in, in, dctn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, + MLX5_GET(drain_dct_in, in, dctn)); break; case MLX5_CMD_OP_ARM_XRQ: - obj_id = MLX5_GET(arm_xrq_in, in, xrqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ, + MLX5_GET(arm_xrq_in, in, xrqn)); break; default: return false; @@ -264,11 +329,102 @@ static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in) return false; } -static bool devx_is_obj_create_cmd(const void *in) +static void devx_set_umem_valid(const void *in) { u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); switch (opcode) { + case MLX5_CMD_OP_CREATE_MKEY: + MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1); + break; + case 
MLX5_CMD_OP_CREATE_CQ: + { + void *cqc; + + MLX5_SET(create_cq_in, in, cq_umem_valid, 1); + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, dbr_umem_valid, 1); + break; + } + case MLX5_CMD_OP_CREATE_QP: + { + void *qpc; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, dbr_umem_valid, 1); + MLX5_SET(create_qp_in, in, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_RQ: + { + void *rqc, *wq; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_SQ: + { + void *sqc, *wq; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_MODIFY_CQ: + MLX5_SET(modify_cq_in, in, cq_umem_valid, 1); + break; + + case MLX5_CMD_OP_CREATE_RMP: + { + void *rmpc, *wq; + + rmpc = MLX5_ADDR_OF(create_rmp_in, in, ctx); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_XRQ: + { + void *xrqc, *wq; + + xrqc = MLX5_ADDR_OF(create_xrq_in, in, xrq_context); + wq = MLX5_ADDR_OF(xrqc, xrqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_XRC_SRQ: + { + void *xrc_srqc; + + MLX5_SET(create_xrc_srq_in, in, xrc_srq_umem_valid, 1); + xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, in, + xrc_srq_context_entry); + MLX5_SET(xrc_srqc, xrc_srqc, dbr_umem_valid, 1); + break; + } + + default: + return; + } +} + +static bool devx_is_obj_create_cmd(const void *in, u16 *opcode) +{ + *opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (*opcode) { case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: case MLX5_CMD_OP_CREATE_MKEY: case MLX5_CMD_OP_CREATE_CQ: @@ -385,12 +541,49 @@ static bool 
devx_is_obj_query_cmd(const void *in) } } +static bool devx_is_whitelist_cmd(void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: + return true; + default: + return false; + } +} + +static int devx_get_uid(struct mlx5_ib_ucontext *c, void *cmd_in) +{ + if (devx_is_whitelist_cmd(cmd_in)) { + struct mlx5_ib_dev *dev; + + if (c->devx_uid) + return c->devx_uid; + + dev = to_mdev(c->ibucontext.device); + if (dev->devx_whitelist_uid) + return dev->devx_whitelist_uid; + + return -EOPNOTSUPP; + } + + if (!c->devx_uid) + return -EINVAL; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + return c->devx_uid; +} static bool devx_is_general_cmd(void *in) { u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); switch (opcode) { case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: case MLX5_CMD_OP_QUERY_VPORT_STATE: case MLX5_CMD_OP_QUERY_ADAPTER: case MLX5_CMD_OP_QUERY_ISSI: @@ -498,14 +691,16 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT); void *cmd_out; int err; + int uid; c = devx_ufile2uctx(file); if (IS_ERR(c)) return PTR_ERR(c); dev = to_mdev(c->ibucontext.device); - if (!c->devx_uid) - return -EPERM; + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; /* Only white list of some general HCA commands are allowed for this method. 
*/ if (!devx_is_general_cmd(cmd_in)) @@ -515,7 +710,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( if (IS_ERR(cmd_out)) return PTR_ERR(cmd_out); - MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); err = mlx5_cmd_exec(dev->mdev, cmd_in, uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN), cmd_out, cmd_out_len); @@ -726,11 +921,15 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; struct devx_obj *obj; int err; + int uid; + u32 obj_id; + u16 opcode; - if (!c->devx_uid) - return -EPERM; + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; - if (!devx_is_obj_create_cmd(cmd_in)) + if (!devx_is_obj_create_cmd(cmd_in, &opcode)) return -EINVAL; cmd_out = uverbs_zalloc(attrs, cmd_out_len); @@ -741,7 +940,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( if (!obj) return -ENOMEM; - MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + devx_set_umem_valid(cmd_in); + err = mlx5_cmd_exec(dev->mdev, cmd_in, uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN), cmd_out, cmd_out_len); @@ -750,13 +951,15 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( uobj->object = obj; obj->mdev = dev->mdev; - devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj->obj_id); + devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, + &obj_id); WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32)); err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len); if (err) goto obj_destroy; + obj->obj_id = get_enc_obj_id(opcode, obj_id); return 0; obj_destroy: @@ -778,9 +981,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( struct devx_obj *obj = uobj->object; void *cmd_out; int err; + int uid; - if (!c->devx_uid) - return -EPERM; + uid = devx_get_uid(c, cmd_in); + 
if (uid < 0) + return uid; if (!devx_is_obj_modify_cmd(cmd_in)) return -EINVAL; @@ -792,7 +997,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( if (IS_ERR(cmd_out)) return PTR_ERR(cmd_out); - MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); + devx_set_umem_valid(cmd_in); + err = mlx5_cmd_exec(obj->mdev, cmd_in, uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN), cmd_out, cmd_out_len); @@ -815,9 +1022,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( struct devx_obj *obj = uobj->object; void *cmd_out; int err; + int uid; - if (!c->devx_uid) - return -EPERM; + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; if (!devx_is_obj_query_cmd(cmd_in)) return -EINVAL; @@ -829,7 +1038,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( if (IS_ERR(cmd_out)) return PTR_ERR(cmd_out); - MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); err = mlx5_cmd_exec(obj->mdev, cmd_in, uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN), cmd_out, cmd_out_len); @@ -928,6 +1137,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)( int err; if (!c->devx_uid) + return -EINVAL; + + if (!capable(CAP_NET_RAW)) return -EPERM; obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 1a29f47f836e..f86cdcafdafc 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -7,7 +7,9 @@ #include <rdma/ib_verbs.h> #include <rdma/uverbs_types.h> #include <rdma/uverbs_ioctl.h> +#include <rdma/uverbs_std_types.h> #include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/mlx5_user_ioctl_verbs.h> #include <rdma/ib_umem.h> #include <linux/mlx5/driver.h> #include <linux/mlx5/fs.h> @@ -16,6 +18,24 @@ #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> +static int 
+mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type, + enum mlx5_flow_namespace_type *namespace) +{ + switch (table_type) { + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX: + *namespace = MLX5_FLOW_NAMESPACE_BYPASS; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX: + *namespace = MLX5_FLOW_NAMESPACE_EGRESS; + break; + default: + return -EINVAL; + } + + return 0; +} + static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { [MLX5_IB_FLOW_TYPE_NORMAL] = { .type = UVERBS_ATTR_TYPE_PTR_IN, @@ -38,11 +58,15 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { }, }; +#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2 static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { + struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; + struct ib_uobject **arr_flow_actions; + struct ib_uflow_resources *uflow_res; void *devx_obj; int dest_id, dest_type; void *cmd_in; @@ -52,6 +76,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + int len, ret, i; if (!capable(CAP_NET_RAW)) return -EPERM; @@ -61,7 +86,14 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( dest_qp = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - if ((dest_devx && dest_qp) || (!dest_devx && !dest_qp)) + fs_matcher = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS && + ((dest_devx && dest_qp) || (!dest_devx && !dest_qp))) + return -EINVAL; + + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS && + (dest_devx || dest_qp)) return -EINVAL; if (dest_devx) { @@ -75,7 +107,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( */ if 
(!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type)) return -EINVAL; - } else { + } else if (dest_qp) { struct mlx5_ib_qp *mqp; qp = uverbs_attr_get_obj(attrs, @@ -92,6 +124,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( else dest_id = mqp->raw_packet_qp.rq.tirn; dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + } else { + dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; } if (dev->rep) @@ -101,16 +135,48 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); inlen = uverbs_attr_get_len(attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); - fs_matcher = uverbs_attr_get_obj(attrs, - MLX5_IB_ATTR_CREATE_FLOW_MATCHER); - flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, cmd_in, inlen, + + uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); + if (!uflow_res) + return -ENOMEM; + + len = uverbs_attr_get_uobjs_arr(attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); + for (i = 0; i < len; i++) { + struct mlx5_ib_flow_action *maction = + to_mflow_act(arr_flow_actions[i]->object); + + ret = parse_flow_flow_action(maction, false, &flow_act); + if (ret) + goto err_out; + flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE, + arr_flow_actions[i]->object); + } + + ret = uverbs_copy_from(&flow_act.flow_tag, attrs, + MLX5_IB_ATTR_CREATE_FLOW_TAG); + if (!ret) { + if (flow_act.flow_tag >= BIT(24)) { + ret = -EINVAL; + goto err_out; + } + flow_act.flags |= FLOW_ACT_HAS_TAG; + } + + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act, + cmd_in, inlen, dest_id, dest_type); - if (IS_ERR(flow_handler)) - return PTR_ERR(flow_handler); + if (IS_ERR(flow_handler)) { + ret = PTR_ERR(flow_handler); + goto err_out; + } - ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev); + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res); return 0; +err_out: + ib_uverbs_flow_resources_free(uflow_res); + return ret; } static int flow_matcher_cleanup(struct 
ib_uobject *uobject, @@ -134,12 +200,14 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE); struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); struct mlx5_ib_flow_matcher *obj; + u32 flags; int err; obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL); if (!obj) return -ENOMEM; + obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS; obj->mask_len = uverbs_attr_get_len( attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK); err = uverbs_copy_from(&obj->matcher_mask, @@ -165,6 +233,19 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( if (err) goto end; + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + IB_FLOW_ATTR_FLAGS_EGRESS); + if (err) + goto end; + + if (flags) { + err = mlx5_ib_ft_type_to_namespace( + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX, &obj->ns_type); + if (err) + goto end; + } + uobj->object = obj; obj->mdev = dev->mdev; atomic_set(&obj->usecnt, 0); @@ -175,6 +256,248 @@ end: return err; } +void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) +{ + switch (maction->flow_action_raw.sub_type) { + case MLX5_IB_FLOW_ACTION_MODIFY_HEADER: + mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev, + maction->flow_action_raw.action_id); + break; + case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT: + mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev, + maction->flow_action_raw.action_id); + break; + case MLX5_IB_FLOW_ACTION_DECAP: + break; + default: + break; + } +} + +static struct ib_flow_action * +mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev, + enum mlx5_ib_uapi_flow_table_type ft_type, + u8 num_actions, void *in) +{ + enum mlx5_flow_namespace_type namespace; + struct mlx5_ib_flow_action *maction; + int ret; + + ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace); + if (ret) + return ERR_PTR(-EINVAL); + + maction = kzalloc(sizeof(*maction), GFP_KERNEL); + if (!maction) + return ERR_PTR(-ENOMEM); + + ret = 
mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in, + &maction->flow_action_raw.action_id); + + if (ret) { + kfree(maction); + return ERR_PTR(ret); + } + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_MODIFY_HEADER; + maction->flow_action_raw.dev = dev; + + return &maction->ib_action; +} + +static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + max_modify_header_actions) || + MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, max_modify_header_actions); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE); + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + enum mlx5_ib_uapi_flow_table_type ft_type; + struct ib_flow_action *action; + size_t num_actions; + void *in; + int len; + int ret; + + if (!mlx5_ib_modify_header_supported(mdev)) + return -EOPNOTSUPP; + + in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM); + len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM); + + if (len % MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)) + return -EINVAL; + + ret = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE); + if (ret) + return ret; + + num_actions = len / MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto), + action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in); + if (IS_ERR(action)) + return PTR_ERR(action); + + uverbs_flow_action_fill_action(action, uobj, uobj->context->device, + IB_FLOW_ACTION_UNSPECIFIED); + + return 0; +} + +static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev, + u8 packet_reformat_type, + u8 ft_type) +{ + switch (packet_reformat_type) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + if (ft_type == 
MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX) + return MLX5_CAP_FLOWTABLE(ibdev->mdev, + encap_general_header); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX) + return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev, + reformat_l2_to_l3_tunnel); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) + return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, + reformat_l3_tunnel_to_l2); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) + return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap); + break; + default: + break; + } + + return false; +} + +static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt) +{ + switch (dv_prt) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + *prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL; + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + *prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + *prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_ib_flow_action_create_packet_reformat_ctx( + struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_action *maction, + u8 ft_type, u8 dv_prt, + void *in, size_t len) +{ + enum mlx5_flow_namespace_type namespace; + u8 prm_prt; + int ret; + + ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace); + if (ret) + return ret; + + ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt); + if (ret) + return ret; + + ret = mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len, + in, namespace, + &maction->flow_action_raw.action_id); + if (ret) + return ret; + + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT; + maction->flow_action_raw.dev = dev; + + return 
0; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)( + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE); + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt; + enum mlx5_ib_uapi_flow_table_type ft_type; + struct mlx5_ib_flow_action *maction; + int ret; + + ret = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE); + if (ret) + return ret; + + ret = uverbs_get_const(&dv_prt, attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE); + if (ret) + return ret; + + if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type)) + return -EOPNOTSUPP; + + maction = kzalloc(sizeof(*maction), GFP_KERNEL); + if (!maction) + return -ENOMEM; + + if (dv_prt == + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) { + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_DECAP; + maction->flow_action_raw.dev = mdev; + } else { + void *in; + int len; + + in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF); + if (IS_ERR(in)) { + ret = PTR_ERR(in); + goto free_maction; + } + + len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF); + + ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev, + maction, ft_type, dv_prt, in, len); + if (ret) + goto free_maction; + } + + uverbs_flow_action_fill_action(&maction->ib_action, uobj, + uobj->context->device, + IB_FLOW_ACTION_UNSPECIFIED); + return 0; + +free_maction: + kfree(maction); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_CREATE_FLOW, UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, @@ -195,7 +518,15 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ACCESS_READ), UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_OBJECT_DEVX_OBJ, - UVERBS_ACCESS_READ)); + UVERBS_ACCESS_READ), + 
UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_READ, 1, + MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_DESTROY_FLOW, @@ -210,6 +541,44 @@ ADD_UVERBS_METHODS(mlx5_ib_fs, &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW)); DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES( + set_action_in_add_action_in_auto)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, + UVERBS_ATTR_MIN_SIZE(1), + UA_ALLOC_AND_COPY, + UA_OPTIONAL), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + enum mlx5_ib_uapi_flow_action_packet_reformat_type, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY)); + +ADD_UVERBS_METHODS( + mlx5_ib_flow_actions, + UVERBS_OBJECT_FLOW_ACTION, + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)); + +DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_FLOW_MATCHER_CREATE, UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE, MLX5_IB_OBJECT_FLOW_MATCHER, @@ -224,7 +593,10 @@ DECLARE_UVERBS_NAMED_METHOD( UA_MANDATORY), 
UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, UVERBS_ATTR_TYPE(u8), - UA_MANDATORY)); + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + enum ib_flow_flags, + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, @@ -247,6 +619,7 @@ int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) root[i++] = &flow_objects; root[i++] = &mlx5_ib_fs; + root[i++] = &mlx5_ib_flow_actions; return i; } diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 35a0e04c38f2..584ff2ea7810 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -39,9 +39,6 @@ static const struct mlx5_ib_profile rep_profile = { STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), - STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, - mlx5_ib_stage_class_attr_init, - NULL), }; static int diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index af32899bb72a..e9c428071df3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1571,14 +1571,57 @@ static void deallocate_uars(struct mlx5_ib_dev *dev, mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); } -static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) +int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) +{ + int err = 0; + + mutex_lock(&dev->lb.mutex); + if (td) + dev->lb.user_td++; + if (qp) + dev->lb.qps++; + + if (dev->lb.user_td == 2 || + dev->lb.qps == 1) { + if (!dev->lb.enabled) { + err = mlx5_nic_vport_update_local_lb(dev->mdev, true); + dev->lb.enabled = true; + } + } + + mutex_unlock(&dev->lb.mutex); + + return err; +} + +void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) +{ + mutex_lock(&dev->lb.mutex); + if (td) + dev->lb.user_td--; + if (qp) + dev->lb.qps--; + + if (dev->lb.user_td == 1 && + dev->lb.qps == 0) { + if (dev->lb.enabled) { 
+ mlx5_nic_vport_update_local_lb(dev->mdev, false); + dev->lb.enabled = false; + } + } + + mutex_unlock(&dev->lb.mutex); +} + +static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn, + u16 uid) { int err; if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) return 0; - err = mlx5_core_alloc_transport_domain(dev->mdev, tdn); + err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid); if (err) return err; @@ -1587,35 +1630,23 @@ static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return err; - mutex_lock(&dev->lb_mutex); - dev->user_td++; - - if (dev->user_td == 2) - err = mlx5_nic_vport_update_local_lb(dev->mdev, true); - - mutex_unlock(&dev->lb_mutex); - return err; + return mlx5_ib_enable_lb(dev, true, false); } -static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) +static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn, + u16 uid) { if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) return; - mlx5_core_dealloc_transport_domain(dev->mdev, tdn); + mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid); if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return; - mutex_lock(&dev->lb_mutex); - dev->user_td--; - - if (dev->user_td < 2) - mlx5_nic_vport_update_local_lb(dev->mdev, false); - - mutex_unlock(&dev->lb_mutex); + mlx5_ib_disable_lb(dev, true, false); } static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, @@ -1727,30 +1758,24 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif - err = mlx5_ib_alloc_transport_domain(dev, &context->tdn); - if (err) - goto out_uars; - if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { - /* Block DEVX on Infiniband as of SELinux */ - if 
(mlx5_ib_port_link_layer(ibdev, 1) != IB_LINK_LAYER_ETHERNET) { - err = -EPERM; - goto out_td; - } - - err = mlx5_ib_devx_create(dev, context); - if (err) - goto out_td; + err = mlx5_ib_devx_create(dev); + if (err < 0) + goto out_uars; + context->devx_uid = err; } + err = mlx5_ib_alloc_transport_domain(dev, &context->tdn, + context->devx_uid); + if (err) + goto out_devx; + if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey); if (err) goto out_mdev; } - INIT_LIST_HEAD(&context->vma_private_list); - mutex_init(&context->vma_private_list_mutex); INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); @@ -1826,13 +1851,21 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->lib_caps = req.lib_caps; print_lib_caps(dev, context->lib_caps); + if (mlx5_lag_is_active(dev->mdev)) { + u8 port = mlx5_core_native_port_num(dev->mdev); + + atomic_set(&context->tx_port_affinity, + atomic_add_return( + 1, &dev->roce[port].tx_port_affinity)); + } + return &context->ibucontext; out_mdev: + mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); +out_devx: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) - mlx5_ib_devx_destroy(dev, context); -out_td: - mlx5_ib_dealloc_transport_domain(dev, context->tdn); + mlx5_ib_devx_destroy(dev, context->devx_uid); out_uars: deallocate_uars(dev, context); @@ -1855,11 +1888,18 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; - if (context->devx_uid) - mlx5_ib_devx_destroy(dev, context); +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* All umem's must be destroyed before destroying the ucontext. 
*/ + mutex_lock(&ibcontext->per_mm_list_lock); + WARN_ON(!list_empty(&ibcontext->per_mm_list)); + mutex_unlock(&ibcontext->per_mm_list_lock); +#endif bfregi = &context->bfregi; - mlx5_ib_dealloc_transport_domain(dev, context->tdn); + mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); + + if (context->devx_uid) + mlx5_ib_devx_destroy(dev, context->devx_uid); deallocate_uars(dev, context); kfree(bfregi->sys_pages); @@ -1900,94 +1940,9 @@ static int get_extended_index(unsigned long offset) return get_arg(offset) | ((offset >> 16) & 0xff) << 8; } -static void mlx5_ib_vma_open(struct vm_area_struct *area) -{ - /* vma_open is called when a new VMA is created on top of our VMA. This - * is done through either mremap flow or split_vma (usually due to - * mlock, madvise, munmap, etc.) We do not support a clone of the VMA, - * as this VMA is strongly hardware related. Therefore we set the - * vm_ops of the newly created/cloned VMA to NULL, to prevent it from - * calling us again and trying to do incorrect actions. We assume that - * the original VMA size is exactly a single page, and therefore all - * "splitting" operation will not happen to it. - */ - area->vm_ops = NULL; -} - -static void mlx5_ib_vma_close(struct vm_area_struct *area) -{ - struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data; - - /* It's guaranteed that all VMAs opened on a FD are closed before the - * file itself is closed, therefore no sync is needed with the regular - * closing flow. (e.g. mlx5 ib_dealloc_ucontext) - * However need a sync with accessing the vma as part of - * mlx5_ib_disassociate_ucontext. - * The close operation is usually called under mm->mmap_sem except when - * process is exiting. - * The exiting case is handled explicitly as part of - * mlx5_ib_disassociate_ucontext. 
- */ - mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data; - - /* setting the vma context pointer to null in the mlx5_ib driver's - * private data, to protect a race condition in - * mlx5_ib_disassociate_ucontext(). - */ - mlx5_ib_vma_priv_data->vma = NULL; - mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex); - list_del(&mlx5_ib_vma_priv_data->list); - mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex); - kfree(mlx5_ib_vma_priv_data); -} - -static const struct vm_operations_struct mlx5_ib_vm_ops = { - .open = mlx5_ib_vma_open, - .close = mlx5_ib_vma_close -}; - -static int mlx5_ib_set_vma_data(struct vm_area_struct *vma, - struct mlx5_ib_ucontext *ctx) -{ - struct mlx5_ib_vma_private_data *vma_prv; - struct list_head *vma_head = &ctx->vma_private_list; - - vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL); - if (!vma_prv) - return -ENOMEM; - - vma_prv->vma = vma; - vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex; - vma->vm_private_data = vma_prv; - vma->vm_ops = &mlx5_ib_vm_ops; - - mutex_lock(&ctx->vma_private_list_mutex); - list_add(&vma_prv->list, vma_head); - mutex_unlock(&ctx->vma_private_list_mutex); - - return 0; -} static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { - struct vm_area_struct *vma; - struct mlx5_ib_vma_private_data *vma_private, *n; - struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); - - mutex_lock(&context->vma_private_list_mutex); - list_for_each_entry_safe(vma_private, n, &context->vma_private_list, - list) { - vma = vma_private->vma; - zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); - /* context going to be destroyed, should - * not access ops any more. 
- */ - vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); - vma->vm_ops = NULL; - list_del(&vma_private->list); - kfree(vma_private); - } - mutex_unlock(&context->vma_private_list_mutex); } static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) @@ -2010,9 +1965,6 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { - phys_addr_t pfn; - int err; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; @@ -2025,13 +1977,8 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (!dev->mdev->clock_info_page) return -EOPNOTSUPP; - pfn = page_to_pfn(dev->mdev->clock_info_page); - err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, - vma->vm_page_prot); - if (err) - return err; - - return mlx5_ib_set_vma_data(vma, context); + return rdma_user_mmap_page(&context->ibucontext, vma, + dev->mdev->clock_info_page, PAGE_SIZE); } static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, @@ -2121,21 +2068,15 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, pfn = uar_index2pfn(dev, uar_index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); - vma->vm_page_prot = prot; - err = io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot); + err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE, + prot); if (err) { mlx5_ib_err(dev, - "io_remap_pfn_range failed with error=%d, mmap_cmd=%s\n", + "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n", err, mmap_cmd2str(cmd)); - err = -EAGAIN; goto err; } - err = mlx5_ib_set_vma_data(vma, context); - if (err) - goto err; - if (dyn_uar) bfregi->sys_pages[idx] = uar_index; return 0; @@ -2160,7 +2101,6 @@ static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) size_t map_size = vma->vm_end - vma->vm_start; u32 npages = map_size >> PAGE_SHIFT; phys_addr_t pfn; - pgprot_t prot; if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) 
!= page_idx + npages) @@ -2170,14 +2110,8 @@ static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >> PAGE_SHIFT) + page_idx; - prot = pgprot_writecombine(vma->vm_page_prot); - vma->vm_page_prot = prot; - - if (io_remap_pfn_range(vma, vma->vm_start, pfn, map_size, - vma->vm_page_prot)) - return -EAGAIN; - - return mlx5_ib_set_vma_data(vma, mctx); + return rdma_user_mmap_io(context, vma, pfn, map_size, + pgprot_writecombine(vma->vm_page_prot)); } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) @@ -2318,21 +2252,30 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, struct mlx5_ib_alloc_pd_resp resp; struct mlx5_ib_pd *pd; int err; + u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; + u16 uid = 0; pd = kmalloc(sizeof(*pd), GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); - err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + uid = context ? 
to_mucontext(context)->devx_uid : 0; + MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); + MLX5_SET(alloc_pd_in, in, uid, uid); + err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in), + out, sizeof(out)); if (err) { kfree(pd); return ERR_PTR(err); } + pd->pdn = MLX5_GET(alloc_pd_out, out, pd); + pd->uid = uid; if (context) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { - mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid); kfree(pd); return ERR_PTR(-EFAULT); } @@ -2346,7 +2289,7 @@ static int mlx5_ib_dealloc_pd(struct ib_pd *pd) struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); - mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid); kfree(mpd); return 0; @@ -2452,20 +2395,50 @@ static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) offsetof(typeof(filter), field) -\ sizeof(filter.field)) -static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, - const struct ib_flow_attr *flow_attr, - struct mlx5_flow_act *action) +int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, + bool is_egress, + struct mlx5_flow_act *action) { - struct mlx5_ib_flow_action *maction = to_mflow_act(ib_spec->action.act); switch (maction->ib_action.type) { case IB_FLOW_ACTION_ESP: + if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT)) + return -EINVAL; /* Currently only AES_GCM keymat is supported by the driver */ action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx; - action->action |= flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS ? + action->action |= is_egress ? 
MLX5_FLOW_CONTEXT_ACTION_ENCRYPT : MLX5_FLOW_CONTEXT_ACTION_DECRYPT; return 0; + case IB_FLOW_ACTION_UNSPECIFIED: + if (maction->flow_action_raw.sub_type == + MLX5_IB_FLOW_ACTION_MODIFY_HEADER) { + if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + return -EINVAL; + action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + action->modify_id = maction->flow_action_raw.action_id; + return 0; + } + if (maction->flow_action_raw.sub_type == + MLX5_IB_FLOW_ACTION_DECAP) { + if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) + return -EINVAL; + action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; + return 0; + } + if (maction->flow_action_raw.sub_type == + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) { + if (action->action & + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) + return -EINVAL; + action->action |= + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + action->reformat_id = + maction->flow_action_raw.action_id; + return 0; + } + /* fall through */ default: return -EOPNOTSUPP; } @@ -2802,7 +2775,8 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; break; case IB_FLOW_SPEC_ACTION_HANDLE: - ret = parse_flow_flow_action(ib_spec, flow_attr, action); + ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act), + flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action); if (ret) return ret; break; @@ -2883,7 +2857,7 @@ is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev, * rules would be supported, always return VALID_SPEC_NA. */ if (!is_crypto) - return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA; + return VALID_SPEC_NA; return is_crypto && is_ipsec && (!egress || (!is_drop && !(flow_act->flags & FLOW_ACT_HAS_TAG))) ? 
@@ -3026,14 +3000,15 @@ enum flow_table_type { static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, struct mlx5_ib_flow_prio *prio, int priority, - int num_entries, int num_groups) + int num_entries, int num_groups, + u32 flags) { struct mlx5_flow_table *ft; ft = mlx5_create_auto_grouped_flow_table(ns, priority, num_entries, num_groups, - 0, 0); + 0, flags); if (IS_ERR(ft)) return ERR_CAST(ft); @@ -3053,26 +3028,43 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, int max_table_size; int num_entries; int num_groups; + u32 flags = 0; int priority; max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - if (ft_type == MLX5_IB_FT_TX) - priority = 0; - else if (flow_is_multicast_only(flow_attr) && - !dont_trap) + enum mlx5_flow_namespace_type fn_type; + + if (flow_is_multicast_only(flow_attr) && + !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else priority = ib_prio_to_core_prio(flow_attr->priority, dont_trap); - ns = mlx5_get_flow_namespace(dev->mdev, - ft_type == MLX5_IB_FT_TX ? 
- MLX5_FLOW_NAMESPACE_EGRESS : - MLX5_FLOW_NAMESPACE_BYPASS); + if (ft_type == MLX5_IB_FT_RX) { + fn_type = MLX5_FLOW_NAMESPACE_BYPASS; + prio = &dev->flow_db->prios[priority]; + if (!dev->rep && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; + if (!dev->rep && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + reformat_l3_tunnel_to_l2)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + } else { + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, + log_max_ft_size)); + fn_type = MLX5_FLOW_NAMESPACE_EGRESS; + prio = &dev->flow_db->egress_prios[priority]; + if (!dev->rep && + MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + } + ns = mlx5_get_flow_namespace(dev->mdev, fn_type); num_entries = MLX5_FS_MAX_ENTRIES; num_groups = MLX5_FS_MAX_TYPES; - prio = &dev->flow_db->prios[priority]; } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { ns = mlx5_get_flow_namespace(dev->mdev, @@ -3104,7 +3096,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, ft = prio->flow_table; if (!ft) - return _get_prio(ns, prio, priority, num_entries, num_groups); + return _get_prio(ns, prio, priority, num_entries, num_groups, + flags); return prio; } @@ -3271,6 +3264,9 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, if (!is_valid_attr(dev->mdev, flow_attr)) return ERR_PTR(-EINVAL); + if (dev->rep && is_egress) + return ERR_PTR(-EINVAL); + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); handler = kzalloc(sizeof(*handler), GFP_KERNEL); if (!handler || !spec) { @@ -3661,34 +3657,54 @@ free_ucmd: return ERR_PTR(err); } -static struct mlx5_ib_flow_prio *_get_flow_table(struct mlx5_ib_dev *dev, - int priority, bool mcast) +static struct mlx5_ib_flow_prio * +_get_flow_table(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_matcher *fs_matcher, + bool mcast) { - int max_table_size; struct mlx5_flow_namespace 
*ns = NULL; struct mlx5_ib_flow_prio *prio; + int max_table_size; + u32 flags = 0; + int priority; + + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + log_max_ft_size)); + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + reformat_l3_tunnel_to_l2)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + } else { /* Can only be MLX5_FLOW_NAMESPACE_EGRESS */ + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, + log_max_ft_size)); + if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + } - max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - log_max_ft_size)); if (max_table_size < MLX5_FS_MAX_ENTRIES) return ERR_PTR(-ENOMEM); if (mcast) priority = MLX5_IB_FLOW_MCAST_PRIO; else - priority = ib_prio_to_core_prio(priority, false); + priority = ib_prio_to_core_prio(fs_matcher->priority, false); - ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); + ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type); if (!ns) return ERR_PTR(-ENOTSUPP); - prio = &dev->flow_db->prios[priority]; + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) + prio = &dev->flow_db->prios[priority]; + else + prio = &dev->flow_db->egress_prios[priority]; if (prio->flow_table) return prio; return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES, - MLX5_FS_MAX_TYPES); + MLX5_FS_MAX_TYPES, flags); } static struct mlx5_ib_flow_handler * @@ -3696,10 +3712,10 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct mlx5_flow_destination *dst, struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_act *flow_act, void *cmd_in, int inlen) { struct mlx5_ib_flow_handler *handler; - struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_flow_spec *spec; struct mlx5_flow_table *ft = ft_prio->flow_table; int 
err = 0; @@ -3718,9 +3734,8 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev, fs_matcher->mask_len); spec->match_criteria_enable = fs_matcher->match_criteria_enable; - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; handler->rule = mlx5_add_flow_rules(ft, spec, - &flow_act, dst, 1); + flow_act, dst, 1); if (IS_ERR(handler->rule)) { err = PTR_ERR(handler->rule); @@ -3782,12 +3797,12 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, struct mlx5_ib_flow_handler * mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_act *flow_act, void *cmd_in, int inlen, int dest_id, int dest_type) { struct mlx5_flow_destination *dst; struct mlx5_ib_flow_prio *ft_prio; - int priority = fs_matcher->priority; struct mlx5_ib_flow_handler *handler; bool mcast; int err; @@ -3805,7 +3820,7 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, mcast = raw_fs_is_multicast(fs_matcher, cmd_in); mutex_lock(&dev->flow_db->lock); - ft_prio = _get_flow_table(dev, priority, mcast); + ft_prio = _get_flow_table(dev, fs_matcher, mcast); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; @@ -3814,13 +3829,18 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) { dst->type = dest_type; dst->tir_num = dest_id; - } else { + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) { dst->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; dst->ft_num = dest_id; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; } - handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, cmd_in, - inlen); + handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act, + cmd_in, inlen); if (IS_ERR(handler)) { err = PTR_ERR(handler); @@ -3998,6 +4018,9 @@ static int 
mlx5_ib_destroy_flow_action(struct ib_flow_action *action) */ mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx); break; + case IB_FLOW_ACTION_UNSPECIFIED: + mlx5_ib_destroy_flow_action_raw(maction); + break; default: WARN_ON(true); break; @@ -4012,13 +4035,17 @@ static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *mqp = to_mqp(ibqp); int err; + u16 uid; + + uid = ibqp->pd ? + to_mpd(ibqp->pd)->uid : 0; if (mqp->flags & MLX5_IB_QP_UNDERLAY) { mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n"); return -EOPNOTSUPP; } - err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); @@ -4030,8 +4057,11 @@ static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; + u16 uid; - err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + uid = ibqp->pd ? 
+ to_mpd(ibqp->pd)->uid : 0; + err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid); if (err) mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); @@ -4052,16 +4082,17 @@ static int init_node_data(struct mlx5_ib_dev *dev) return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); } -static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t fw_pages_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); } +static DEVICE_ATTR_RO(fw_pages); -static ssize_t show_reg_pages(struct device *device, +static ssize_t reg_pages_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = @@ -4069,44 +4100,47 @@ static ssize_t show_reg_pages(struct device *device, return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } +static DEVICE_ATTR_RO(reg_pages); -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->mdev->rev_id); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_board(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = 
container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); -static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); -static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); +static struct attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + &dev_attr_fw_pages.attr, + &dev_attr_reg_pages.attr, + NULL, +}; -static struct device_attribute *mlx5_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id, - &dev_attr_fw_pages, - &dev_attr_reg_pages, +static const struct attribute_group mlx5_attr_group = { + .attrs = mlx5_class_attributes, }; static void pkey_change_handler(struct work_struct *work) @@ -5631,7 +5665,6 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; - const char *name; int err; int i; @@ -5664,12 +5697,6 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - if (!mlx5_lag_is_active(mdev)) - name = "mlx5_%d"; - else - name = "mlx5_bond_%d"; - - strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; @@ -5876,7 +5903,7 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) || MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) - mutex_init(&dev->lb_mutex); + mutex_init(&dev->lb.mutex); return 0; } @@ -6083,7 +6110,14 @@ static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev) int 
mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { - return ib_register_device(&dev->ib_dev, NULL); + const char *name; + + rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group); + if (!mlx5_lag_is_active(dev->mdev)) + name = "mlx5_%d"; + else + name = "mlx5_bond_%d"; + return ib_register_device(&dev->ib_dev, name, NULL); } void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) @@ -6113,21 +6147,6 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) cancel_delay_drop(dev); } -int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev) -{ - int err; - int i; - - for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { - err = device_create_file(&dev->ib_dev.dev, - mlx5_class_attributes[i]); - if (err) - return err; - } - - return 0; -} - static int mlx5_ib_stage_rep_reg_init(struct mlx5_ib_dev *dev) { mlx5_ib_register_vport_reps(dev); @@ -6151,6 +6170,8 @@ void __mlx5_ib_remove(struct mlx5_ib_dev *dev, profile->stage[stage].cleanup(dev); } + if (dev->devx_whitelist_uid) + mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid); ib_dealloc_device((struct ib_device *)dev); } @@ -6159,8 +6180,7 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev, { int err; int i; - - printk_once(KERN_INFO "%s", mlx5_version); + int uid; for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { if (profile->stage[i].init) { @@ -6170,6 +6190,10 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev, } } + uid = mlx5_ib_devx_create(dev); + if (uid > 0) + dev->devx_whitelist_uid = uid; + dev->profile = profile; dev->ib_active = true; @@ -6230,9 +6254,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, mlx5_ib_stage_delay_drop_init, mlx5_ib_stage_delay_drop_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, - mlx5_ib_stage_class_attr_init, - NULL), }; static const struct mlx5_ib_profile nic_rep_profile = { @@ -6275,9 +6296,6 @@ static const struct mlx5_ib_profile nic_rep_profile = { STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, 
mlx5_ib_stage_post_ib_reg_umr_init, NULL), - STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, - mlx5_ib_stage_class_attr_init, - NULL), STAGE_CREATE(MLX5_IB_STAGE_REP_REG, mlx5_ib_stage_rep_reg_init, mlx5_ib_stage_rep_reg_cleanup), diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index f3dbd75a0a96..549234988bb4 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -57,7 +57,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int entry; unsigned long page_shift = umem->page_shift; - if (umem->odp_data) { + if (umem->is_odp) { *ncont = ib_umem_page_count(umem); *count = *ncont << (page_shift - PAGE_SHIFT); *shift = page_shift; @@ -152,14 +152,13 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, struct scatterlist *sg; int entry; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - const bool odp = umem->odp_data != NULL; - - if (odp) { + if (umem->is_odp) { WARN_ON(shift != 0); WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); for (i = 0; i < num_pages; ++i) { - dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + dma_addr_t pa = + to_ib_umem_odp(umem)->dma_list[offset + i]; pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 289c18db2611..b651a7a6fde9 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -39,8 +39,10 @@ #include <rdma/ib_smi.h> #include <linux/mlx5/driver.h> #include <linux/mlx5/cq.h> +#include <linux/mlx5/fs.h> #include <linux/mlx5/qp.h> #include <linux/mlx5/srq.h> +#include <linux/mlx5/fs.h> #include <linux/types.h> #include <linux/mlx5/transobj.h> #include <rdma/ib_user_verbs.h> @@ -48,17 +50,17 @@ #include <rdma/uverbs_ioctl.h> #include <rdma/mlx5_user_ioctl_cmds.h> -#define mlx5_ib_dbg(dev, format, arg...) 
\ -pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, current->pid, ##arg) +#define mlx5_ib_dbg(_dev, format, arg...) \ + dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) -#define mlx5_ib_err(dev, format, arg...) \ -pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, current->pid, ##arg) +#define mlx5_ib_err(_dev, format, arg...) \ + dev_err(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) -#define mlx5_ib_warn(dev, format, arg...) \ -pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, current->pid, ##arg) +#define mlx5_ib_warn(_dev, format, arg...) \ + dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) #define field_avail(type, fld, sz) (offsetof(type, fld) + \ sizeof(((type *)0)->fld) <= (sz)) @@ -114,13 +116,6 @@ enum { MLX5_MEMIC_BASE_SIZE = 1 << MLX5_MEMIC_BASE_ALIGN, }; -struct mlx5_ib_vma_private_data { - struct list_head list; - struct vm_area_struct *vma; - /* protect vma_private_list add/del */ - struct mutex *vma_private_list_mutex; -}; - struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; struct list_head db_page_list; @@ -132,13 +127,12 @@ struct mlx5_ib_ucontext { u8 cqe_version; /* Transport Domain number */ u32 tdn; - struct list_head vma_private_list; - /* protect vma_private_list add/del */ - struct mutex vma_private_list_mutex; u64 lib_caps; DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES); u16 devx_uid; + /* For RoCE LAG TX affinity */ + atomic_t tx_port_affinity; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -149,6 +143,13 @@ static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibuconte struct mlx5_ib_pd { struct ib_pd ibpd; u32 pdn; + u16 uid; +}; + +enum { + MLX5_IB_FLOW_ACTION_MODIFY_HEADER, + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT, + 
MLX5_IB_FLOW_ACTION_DECAP, }; #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) @@ -180,6 +181,7 @@ struct mlx5_ib_flow_matcher { struct mlx5_ib_match_params matcher_mask; int mask_len; enum mlx5_ib_flow_type flow_type; + enum mlx5_flow_namespace_type ns_type; u16 priority; struct mlx5_core_dev *mdev; atomic_t usecnt; @@ -188,6 +190,7 @@ struct mlx5_ib_flow_matcher { struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio egress_prios[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; struct mlx5_ib_flow_prio egress[MLX5_IB_NUM_EGRESS_FTS]; struct mlx5_flow_table *lag_demux_ft; @@ -322,6 +325,7 @@ enum { struct mlx5_ib_rwq_ind_table { struct ib_rwq_ind_table ib_rwq_ind_tbl; u32 rqtn; + u16 uid; }; struct mlx5_ib_ubuffer { @@ -428,7 +432,7 @@ struct mlx5_ib_qp { struct list_head cq_send_list; struct mlx5_rate_limit rl; u32 underlay_qpn; - bool tunnel_offload_en; + u32 flags_en; /* storage for qp sub type when core qp type is IB_QPT_DRIVER */ enum ib_qp_type qp_sub_type; }; @@ -536,6 +540,7 @@ struct mlx5_ib_srq { struct mlx5_ib_xrcd { struct ib_xrcd ibxrcd; u32 xrcdn; + u16 uid; }; enum mlx5_ib_mtt_access_flags { @@ -700,7 +705,7 @@ struct mlx5_roce { rwlock_t netdev_lock; struct net_device *netdev; struct notifier_block nb; - atomic_t next_port; + atomic_t tx_port_affinity; enum ib_port_state last_port_state; struct mlx5_ib_dev *dev; u8 native_port_num; @@ -815,6 +820,11 @@ struct mlx5_ib_flow_action { u64 ib_flags; struct mlx5_accel_esp_xfrm *ctx; } esp_aes_gcm; + struct { + struct mlx5_ib_dev *dev; + u32 sub_type; + u32 action_id; + } flow_action_raw; }; }; @@ -859,9 +869,20 @@ to_mcounters(struct ib_counters *ibcntrs) return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs); } +int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, + bool is_egress, + struct mlx5_flow_act *action); +struct mlx5_ib_lb_state { + /* protect the user_td */ + struct mutex mutex; 
+ u32 user_td; + int qps; + bool enabled; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; - const struct uverbs_object_tree_def *driver_trees[6]; + const struct uverbs_object_tree_def *driver_trees[7]; struct mlx5_core_dev *mdev; struct mlx5_roce roce[MLX5_MAX_PORTS]; int num_ports; @@ -900,13 +921,12 @@ struct mlx5_ib_dev { const struct mlx5_ib_profile *profile; struct mlx5_eswitch_rep *rep; - /* protect the user_td */ - struct mutex lb_mutex; - u32 user_td; + struct mlx5_ib_lb_state lb; u8 umr_fence; struct list_head ib_dev_list; u64 sys_image_guid; struct mlx5_memic memic; + u16 devx_whitelist_uid; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1017,6 +1037,8 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); int mlx5_ib_destroy_srq(struct ib_srq *srq); int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); +int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp); +void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp); struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); @@ -1106,7 +1128,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, __be64 *pas, int access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); -int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); @@ -1141,7 +1163,7 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, +void 
mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, @@ -1180,7 +1202,6 @@ void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev); int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev); void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev); int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev); -int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev); void __mlx5_ib_remove(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile, int stage); @@ -1229,22 +1250,20 @@ void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, u8 port_num); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context); -void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context); +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev); +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void); struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add( struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, - void *cmd_in, int inlen, int dest_id, int dest_type); + struct mlx5_flow_act *flow_act, void *cmd_in, int inlen, + int dest_id, int dest_type); bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type); int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root); +void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction); #else static inline int -mlx5_ib_devx_create(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context) { return -EOPNOTSUPP; }; -static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context) {} +mlx5_ib_devx_create(struct mlx5_ib_dev *dev) { return -EOPNOTSUPP; }; +static inline void mlx5_ib_devx_destroy(struct 
mlx5_ib_dev *dev, u16 uid) {} static inline const struct uverbs_object_tree_def * mlx5_ib_get_devx_tree(void) { return NULL; } static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, @@ -1257,6 +1276,11 @@ mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) { return 0; } +static inline void +mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) +{ + return; +}; #endif static inline void init_query_mad(struct ib_smp *mad) { diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index e22314837645..9b195d65a13e 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -98,7 +98,7 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING static void update_odp_mr(struct mlx5_ib_mr *mr) { - if (mr->umem->odp_data) { + if (mr->umem->is_odp) { /* * This barrier prevents the compiler from moving the * setting of umem->odp_data->private to point to our @@ -107,7 +107,7 @@ static void update_odp_mr(struct mlx5_ib_mr *mr) * handle invalidations. 
*/ smp_wmb(); - mr->umem->odp_data->private = mr; + to_ib_umem_odp(mr->umem)->private = mr; /* * Make sure we will see the new * umem->odp_data->private value in the invalidation @@ -691,7 +691,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) init_completion(&ent->compl); INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); - queue_work(cache->wq, &ent->work); if (i > MR_CACHE_LAST_STD_ENTRY) { mlx5_odp_init_mr_cache_entry(ent); @@ -711,6 +710,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->limit = dev->mdev->profile->mr_cache[i].limit; else ent->limit = 0; + queue_work(cache->wq, &ent->work); } err = mlx5_mr_cache_debugfs_init(dev); @@ -1627,14 +1627,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) struct ib_umem *umem = mr->umem; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (umem && umem->odp_data) { + if (umem && umem->is_odp) { + struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem); + /* Prevent new page faults from succeeding */ mr->live = 0; /* Wait for all running page-fault handlers to finish. 
*/ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ - if (umem->odp_data->page_list) - mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + if (umem_odp->page_list) + mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); else mlx5_ib_free_implicit_mr(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index d216e0d2921d..b04eb6775326 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -61,13 +61,21 @@ static int check_parent(struct ib_umem_odp *odp, return mr && mr->parent == parent && !odp->dying; } +struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) +{ + if (WARN_ON(!mr || !mr->umem || !mr->umem->is_odp)) + return NULL; + + return to_ib_umem_odp(mr->umem)->per_mm; +} + static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) { struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext *ctx = odp->umem->context; + struct ib_ucontext_per_mm *per_mm = odp->per_mm; struct rb_node *rb; - down_read(&ctx->umem_rwsem); + down_read(&per_mm->umem_rwsem); while (1) { rb = rb_next(&odp->interval_tree.rb); if (!rb) @@ -79,19 +87,19 @@ static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) not_found: odp = NULL; end: - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); return odp; } -static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, - u64 start, u64 length, +static struct ib_umem_odp *odp_lookup(u64 start, u64 length, struct mlx5_ib_mr *parent) { + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); struct ib_umem_odp *odp; struct rb_node *rb; - down_read(&ctx->umem_rwsem); - odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); + down_read(&per_mm->umem_rwsem); + odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); if (!odp) goto end; @@ -102,13 +110,13 @@ static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, if (!rb) goto not_found; odp = rb_entry(rb, struct ib_umem_odp, 
interval_tree.rb); - if (ib_umem_start(odp->umem) > start + length) + if (ib_umem_start(&odp->umem) > start + length) goto not_found; } not_found: odp = NULL; end: - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); return odp; } @@ -116,7 +124,6 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, size_t nentries, struct mlx5_ib_mr *mr, int flags) { struct ib_pd *pd = mr->ibmr.pd; - struct ib_ucontext *ctx = pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem_odp *odp; unsigned long va; @@ -131,13 +138,13 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, return; } - odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, - nentries * MLX5_IMR_MTT_SIZE, mr); + odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, + nentries * MLX5_IMR_MTT_SIZE, mr); for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && odp->umem->address == va) { + if (odp && odp->umem.address == va) { struct mlx5_ib_mr *mtt = odp->private; pklm->key = cpu_to_be32(mtt->ibmr.lkey); @@ -153,13 +160,13 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, static void mr_leaf_free_action(struct work_struct *work) { struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; + int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; mr->parent = NULL; synchronize_srcu(&mr->dev->mr_srcu); - ib_umem_release(odp->umem); + ib_umem_release(&odp->umem); if (imr->live) mlx5_ib_update_xlt(imr, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | @@ -170,22 +177,24 @@ static void mr_leaf_free_action(struct work_struct *work) wake_up(&imr->q_leaf_free); } -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, +void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end) { struct 
mlx5_ib_mr *mr; const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; + struct ib_umem *umem; int in_block = 0; u64 addr; - if (!umem || !umem->odp_data) { + if (!umem_odp) { pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } + umem = &umem_odp->umem; - mr = umem->odp_data->private; + mr = umem_odp->private; if (!mr || !mr->ibmr.pd) return; @@ -208,7 +217,7 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, * estimate the cost of another UMR vs. the cost of bigger * UMR. */ - if (umem->odp_data->dma_list[idx] & + if (umem_odp->dma_list[idx] & (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { if (!in_block) { blk_start_idx = idx; @@ -237,13 +246,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, * needed. */ - ib_umem_odp_unmap_dma_pages(umem, start, end); + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); if (unlikely(!umem->npages && mr->parent && - !umem->odp_data->dying)) { - WRITE_ONCE(umem->odp_data->dying, 1); + !umem_odp->dying)) { + WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&mr->parent->num_leaf_free); - schedule_work(&umem->odp_data->work); + schedule_work(&umem_odp->work); } } @@ -366,16 +375,15 @@ fail: static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt) { - struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); struct ib_umem_odp *odp, *result = NULL; + struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 addr = io_virt & MLX5_IMR_MTT_MASK; int nentries = 0, start_idx = 0, ret; struct mlx5_ib_mr *mtt; - struct ib_umem *umem; - mutex_lock(&mr->umem->odp_data->umem_mutex); - odp = odp_lookup(ctx, addr, 1, mr); + mutex_lock(&odp_mr->umem_mutex); + odp = odp_lookup(addr, 1, mr); mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", io_virt, bcnt, addr, odp); @@ -385,22 +393,23 @@ next_mr: if (nentries) 
nentries++; } else { - umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); - if (IS_ERR(umem)) { - mutex_unlock(&mr->umem->odp_data->umem_mutex); - return ERR_CAST(umem); + odp = ib_alloc_odp_umem(odp_mr->per_mm, addr, + MLX5_IMR_MTT_SIZE); + if (IS_ERR(odp)) { + mutex_unlock(&odp_mr->umem_mutex); + return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); + mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, + mr->access_flags); if (IS_ERR(mtt)) { - mutex_unlock(&mr->umem->odp_data->umem_mutex); - ib_umem_release(umem); + mutex_unlock(&odp_mr->umem_mutex); + ib_umem_release(&odp->umem); return ERR_CAST(mtt); } - odp = umem->odp_data; odp->private = mtt; - mtt->umem = umem; + mtt->umem = &odp->umem; mtt->mmkey.iova = addr; mtt->parent = mr; INIT_WORK(&odp->work, mr_leaf_free_action); @@ -417,7 +426,7 @@ next_mr: addr += MLX5_IMR_MTT_SIZE; if (unlikely(addr < io_virt + bcnt)) { odp = odp_next(odp); - if (odp && odp->umem->address != addr) + if (odp && odp->umem.address != addr) odp = NULL; goto next_mr; } @@ -432,7 +441,7 @@ next_mr: } } - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); return result; } @@ -460,36 +469,36 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, return imr; } -static int mr_leaf_free(struct ib_umem *umem, u64 start, - u64 end, void *cookie) +static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, + void *cookie) { - struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; + struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; + struct ib_umem *umem = &umem_odp->umem; if (mr->parent != imr) return 0; - ib_umem_odp_unmap_dma_pages(umem, - ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - if (umem->odp_data->dying) + if (umem_odp->dying) return 0; - WRITE_ONCE(umem->odp_data->dying, 1); + WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&imr->num_leaf_free); - 
schedule_work(&umem->odp_data->work); + schedule_work(&umem_odp->work); return 0; } void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); - down_read(&ctx->umem_rwsem); - rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, mr_leaf_free, true, imr); - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); } @@ -497,6 +506,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped) { + struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 access_mask = ODP_READ_ALLOWED_BIT; int npages = 0, page_shift, np; u64 start_idx, page_mask; @@ -505,7 +515,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, size_t size; int ret; - if (!mr->umem->odp_data->page_list) { + if (!odp_mr->page_list) { odp = implicit_mr_get_data(mr, io_virt, bcnt); if (IS_ERR(odp)) @@ -513,11 +523,11 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, mr = odp->private; } else { - odp = mr->umem->odp_data; + odp = odp_mr; } next_mr: - size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); + size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt); page_shift = mr->umem->page_shift; page_mask = ~(BIT(page_shift) - 1); @@ -533,7 +543,7 @@ next_mr: */ smp_rmb(); - ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, + ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, access_mask, current_seq); if (ret < 0) @@ -542,7 +552,8 @@ next_mr: np = ret; mutex_lock(&odp->umem_mutex); - if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), + current_seq)) 
{ /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already @@ -575,7 +586,7 @@ next_mr: io_virt += size; next = odp_next(odp); - if (unlikely(!next || next->umem->address != io_virt)) { + if (unlikely(!next || next->umem.address != io_virt)) { mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", io_virt, next); return -EAGAIN; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index daf1eb84cd31..6841c0f9237f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -37,6 +37,7 @@ #include <linux/mlx5/fs.h> #include "mlx5_ib.h" #include "ib_rep.h" +#include "cmd.h" /* not supported currently */ static int wq_signature; @@ -850,6 +851,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, goto err_umem; } + MLX5_SET(create_qp_in, *in, uid, to_mpd(pd)->uid); pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); if (ubuffer->umem) mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0); @@ -1051,7 +1053,8 @@ static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) static int is_connected(enum ib_qp_type qp_type) { - if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC || + qp_type == MLX5_IB_QPT_DCI) return 1; return 0; @@ -1059,11 +1062,13 @@ static int is_connected(enum ib_qp_type qp_type) static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct mlx5_ib_sq *sq, u32 tdn) + struct mlx5_ib_sq *sq, u32 tdn, + struct ib_pd *pd) { u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); MLX5_SET(tisc, tisc, transport_domain, tdn); if (qp->flags & MLX5_IB_QP_UNDERLAY) MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); @@ -1072,9 +1077,9 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, } static void 
destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, - struct mlx5_ib_sq *sq) + struct mlx5_ib_sq *sq, struct ib_pd *pd) { - mlx5_core_destroy_tis(dev->mdev, sq->tisn); + mlx5_cmd_destroy_tis(dev->mdev, sq->tisn, to_mpd(pd)->uid); } static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev, @@ -1114,6 +1119,7 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, goto err_umem; } + MLX5_SET(create_sq_in, in, uid, to_mpd(pd)->uid); sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); MLX5_SET(sqc, sqc, flush_in_error_en, 1); if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe)) @@ -1188,7 +1194,7 @@ static size_t get_rq_pas_size(void *qpc) static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, void *qpin, - size_t qpinlen) + size_t qpinlen, struct ib_pd *pd) { struct mlx5_ib_qp *mqp = rq->base.container_mibqp; __be64 *pas; @@ -1209,6 +1215,7 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, if (!in) return -ENOMEM; + MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid); rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); if (!(rq->flags & MLX5_IB_RQ_CVLAN_STRIPPING)) MLX5_SET(rqc, rqc, vsd, 1); @@ -1256,10 +1263,23 @@ static bool tunnel_offload_supported(struct mlx5_core_dev *dev) MLX5_CAP_ETH(dev, tunnel_stateless_geneve_rx)); } +static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u32 qp_flags_en, + struct ib_pd *pd) +{ + if (qp_flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) + mlx5_ib_disable_lb(dev, false, true); + mlx5_cmd_destroy_tir(dev->mdev, rq->tirn, to_mpd(pd)->uid); +} + static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, u32 tdn, - bool tunnel_offload_en) + u32 *qp_flags_en, + struct ib_pd *pd) { + u8 lb_flag = 0; u32 *in; void *tirc; int inlen; @@ -1270,33 +1290,45 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, if (!in) return -ENOMEM; + MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid); tirc = 
MLX5_ADDR_OF(create_tir_in, in, ctx); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); MLX5_SET(tirc, tirc, transport_domain, tdn); - if (tunnel_offload_en) + if (*qp_flags_en & MLX5_QP_FLAG_TUNNEL_OFFLOADS) MLX5_SET(tirc, tirc, tunneled_offload_en, 1); - if (dev->rep) - MLX5_SET(tirc, tirc, self_lb_block, - MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST); + if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + + if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; + + if (dev->rep) { + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + *qp_flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + } + + MLX5_SET(tirc, tirc, self_lb_block, lb_flag); err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn); + if (!err && MLX5_GET(tirc, tirc, self_lb_block)) { + err = mlx5_ib_enable_lb(dev, false, true); + + if (err) + destroy_raw_packet_qp_tir(dev, rq, 0, pd); + } kvfree(in); return err; } -static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, - struct mlx5_ib_rq *rq) -{ - mlx5_core_destroy_tir(dev->mdev, rq->tirn); -} - static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u32 *in, size_t inlen, - struct ib_pd *pd) + struct ib_pd *pd, + struct ib_udata *udata, + struct mlx5_ib_create_qp_resp *resp) { struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; struct mlx5_ib_sq *sq = &raw_packet_qp->sq; @@ -1306,9 +1338,10 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); int err; u32 tdn = mucontext->tdn; + u16 uid = to_mpd(pd)->uid; if (qp->sq.wqe_cnt) { - err = create_raw_packet_qp_tis(dev, qp, sq, tdn); + err = create_raw_packet_qp_tis(dev, qp, sq, tdn, pd); if (err) return err; @@ -1316,6 +1349,13 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp 
*qp, if (err) goto err_destroy_tis; + if (uid) { + resp->tisn = sq->tisn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TISN; + resp->sqn = sq->base.mqp.qpn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_SQN; + } + sq->base.container_mibqp = qp; sq->base.mqp.event = mlx5_ib_qp_event; } @@ -1327,22 +1367,32 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING; if (qp->flags & MLX5_IB_QP_PCI_WRITE_END_PADDING) rq->flags |= MLX5_IB_RQ_PCI_WRITE_END_PADDING; - err = create_raw_packet_qp_rq(dev, rq, in, inlen); + err = create_raw_packet_qp_rq(dev, rq, in, inlen, pd); if (err) goto err_destroy_sq; - - err = create_raw_packet_qp_tir(dev, rq, tdn, - qp->tunnel_offload_en); + err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd); if (err) goto err_destroy_rq; + + if (uid) { + resp->rqn = rq->base.mqp.qpn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_RQN; + resp->tirn = rq->tirn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; + } } qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? 
sq->base.mqp.qpn : rq->base.mqp.qpn; + err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp))); + if (err) + goto err_destroy_tir; return 0; +err_destroy_tir: + destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, pd); err_destroy_rq: destroy_raw_packet_qp_rq(dev, rq); err_destroy_sq: @@ -1350,7 +1400,7 @@ err_destroy_sq: return err; destroy_raw_packet_qp_sq(dev, sq); err_destroy_tis: - destroy_raw_packet_qp_tis(dev, sq); + destroy_raw_packet_qp_tis(dev, sq, pd); return err; } @@ -1363,13 +1413,13 @@ static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq = &raw_packet_qp->rq; if (qp->rq.wqe_cnt) { - destroy_raw_packet_qp_tir(dev, rq); + destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, qp->ibqp.pd); destroy_raw_packet_qp_rq(dev, rq); } if (qp->sq.wqe_cnt) { destroy_raw_packet_qp_sq(dev, sq); - destroy_raw_packet_qp_tis(dev, sq); + destroy_raw_packet_qp_tis(dev, sq, qp->ibqp.pd); } } @@ -1387,7 +1437,11 @@ static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { - mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn); + if (qp->flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) + mlx5_ib_disable_lb(dev, false, true); + mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, + to_mpd(qp->ibqp.pd)->uid); } static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, @@ -1410,6 +1464,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u32 tdn = mucontext->tdn; struct mlx5_ib_create_qp_rss ucmd = {}; size_t required_cmd_sz; + u8 lb_flag = 0; if (init_attr->qp_type != IB_QPT_RAW_PACKET) return -EOPNOTSUPP; @@ -1444,7 +1499,9 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return -EOPNOTSUPP; } - if (ucmd.flags & ~MLX5_QP_FLAG_TUNNEL_OFFLOADS) { + if (ucmd.flags & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS | + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | + 
MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) { mlx5_ib_dbg(dev, "invalid flags\n"); return -EOPNOTSUPP; } @@ -1461,6 +1518,16 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return -EOPNOTSUPP; } + if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC || dev->rep) { + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST; + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + } + + if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) { + lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST; + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC; + } + err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); if (err) { mlx5_ib_dbg(dev, "copy failed\n"); @@ -1472,6 +1539,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!in) return -ENOMEM; + MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid); tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); @@ -1484,6 +1552,8 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) MLX5_SET(tirc, tirc, tunneled_offload_en, 1); + MLX5_SET(tirc, tirc, self_lb_block, lb_flag); + if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER) hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner); else @@ -1580,26 +1650,141 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields); create_tir: - if (dev->rep) - MLX5_SET(tirc, tirc, self_lb_block, - MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST); - err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn); + if (!err && MLX5_GET(tirc, tirc, self_lb_block)) { + err = mlx5_ib_enable_lb(dev, false, true); + + if (err) + mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, + to_mpd(pd)->uid); + } + if (err) goto err; + if (mucontext->devx_uid) { + resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; + 
resp.tirn = qp->rss_qp.tirn; + } + + err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp))); + if (err) + goto err_copy; + kvfree(in); /* qpn is reserved for that QP */ qp->trans_qp.base.mqp.qpn = 0; qp->flags |= MLX5_IB_QP_RSS; return 0; +err_copy: + mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, mucontext->devx_uid); err: kvfree(in); return err; } +static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr, + void *qpc) +{ + int rcqe_sz; + + if (init_attr->qp_type == MLX5_IB_QPT_DCI) + return; + + rcqe_sz = mlx5_ib_get_cqe_size(init_attr->recv_cq); + + if (rcqe_sz == 128) { + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); + return; + } + + if (init_attr->qp_type != MLX5_IB_QPT_DCT) + MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); +} + +static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev, + struct ib_qp_init_attr *init_attr, + struct mlx5_ib_create_qp *ucmd, + void *qpc) +{ + enum ib_qp_type qpt = init_attr->qp_type; + int scqe_sz; + bool allow_scat_cqe = 0; + + if (qpt == IB_QPT_UC || qpt == IB_QPT_UD) + return; + + if (ucmd) + allow_scat_cqe = ucmd->flags & MLX5_QP_FLAG_ALLOW_SCATTER_CQE; + + if (!allow_scat_cqe && init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) + return; + + scqe_sz = mlx5_ib_get_cqe_size(init_attr->send_cq); + if (scqe_sz == 128) { + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE); + return; + } + + if (init_attr->qp_type != MLX5_IB_QPT_DCI || + MLX5_CAP_GEN(dev->mdev, dc_req_scat_data_cqe)) + MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); +} + +static int atomic_size_to_mode(int size_mask) +{ + /* driver does not support atomic_size > 256B + * and does not know how to translate bigger sizes + */ + int supported_size_mask = size_mask & 0x1ff; + int log_max_size; + + if (!supported_size_mask) + return -EOPNOTSUPP; + + log_max_size = __fls(supported_size_mask); + + if (log_max_size > 3) + return log_max_size; + + return MLX5_ATOMIC_MODE_8B; +} + +static int 
get_atomic_mode(struct mlx5_ib_dev *dev, + enum ib_qp_type qp_type) +{ + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); + int atomic_mode = -EOPNOTSUPP; + int atomic_size_mask; + + if (!atomic) + return -EOPNOTSUPP; + + if (qp_type == MLX5_IB_QPT_DCT) + atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + else + atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + if ((atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP) || + (atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD)) + atomic_mode = atomic_size_to_mode(atomic_size_mask); + + if (atomic_mode <= 0 && + (atomic_operations & MLX5_ATOMIC_OPS_CMP_SWAP && + atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) + atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; + + return atomic_mode; +} + +static inline bool check_flags_mask(uint64_t input, uint64_t supported) +{ + return (input & ~supported) == 0; +} + static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct mlx5_ib_qp *qp) @@ -1697,20 +1882,47 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EFAULT; } + if (!check_flags_mask(ucmd.flags, + MLX5_QP_FLAG_SIGNATURE | + MLX5_QP_FLAG_SCATTER_CQE | + MLX5_QP_FLAG_TUNNEL_OFFLOADS | + MLX5_QP_FLAG_BFREG_INDEX | + MLX5_QP_FLAG_TYPE_DCT | + MLX5_QP_FLAG_TYPE_DCI | + MLX5_QP_FLAG_ALLOW_SCATTER_CQE)) + return -EINVAL; + err = get_qp_user_index(to_mucontext(pd->uobject->context), &ucmd, udata->inlen, &uidx); if (err) return err; qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); - qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); + if (MLX5_CAP_GEN(dev->mdev, sctr_data_cqe)) + qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) { if (init_attr->qp_type != IB_QPT_RAW_PACKET || !tunnel_offload_supported(mdev)) { mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n"); 
return -EOPNOTSUPP; } - qp->tunnel_offload_en = true; + qp->flags_en |= MLX5_QP_FLAG_TUNNEL_OFFLOADS; + } + + if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Self-LB UC isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC; + } + + if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) { + if (init_attr->qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_dbg(dev, "Self-LB UM isn't supported\n"); + return -EOPNOTSUPP; + } + qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC; } if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) { @@ -1811,23 +2023,10 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, cd_slave_receive, 1); if (qp->scat_cqe && is_connected(init_attr->qp_type)) { - int rcqe_sz; - int scqe_sz; - - rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq); - scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq); - - if (rcqe_sz == 128) - MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE); - else - MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE); - - if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) { - if (scqe_sz == 128) - MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE); - else - MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); - } + configure_responder_scat_cqe(init_attr, qpc); + configure_requester_scat_cqe(dev, init_attr, + (pd && pd->uobject) ? 
&ucmd : NULL, + qpc); } if (qp->rq.wqe_cnt) { @@ -1911,7 +2110,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->flags & MLX5_IB_QP_UNDERLAY) { qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); - err = create_raw_packet_qp(dev, qp, in, inlen, pd); + err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata, + &resp); } else { err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); } @@ -2192,6 +2392,7 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, goto err_free; } + MLX5_SET(create_dct_in, qp->dct.in, uid, to_mpd(pd)->uid); dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); qp->qp_sub_type = MLX5_IB_QPT_DCT; MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); @@ -2200,6 +2401,9 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key); MLX5_SET(dctc, dctc, user_index, uidx); + if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE) + configure_responder_scat_cqe(attr, dctc); + qp->state = IB_QPS_RESET; return &qp->ibqp; @@ -2405,13 +2609,15 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) return 0; } -static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, - int attr_mask) +static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, + const struct ib_qp_attr *attr, + int attr_mask, __be32 *hw_access_flags) { - u32 hw_access_flags = 0; u8 dest_rd_atomic; u32 access_flags; + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else @@ -2426,13 +2632,25 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att access_flags &= IB_ACCESS_REMOTE_WRITE; if (access_flags & IB_ACCESS_REMOTE_READ) - hw_access_flags |= MLX5_QP_BIT_RRE; - if (access_flags & IB_ACCESS_REMOTE_ATOMIC) - hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); + *hw_access_flags |= MLX5_QP_BIT_RRE; + if 
((access_flags & IB_ACCESS_REMOTE_ATOMIC) && + qp->ibqp.qp_type == IB_QPT_RC) { + int atomic_mode; + + atomic_mode = get_atomic_mode(dev, qp->ibqp.qp_type); + if (atomic_mode < 0) + return -EOPNOTSUPP; + + *hw_access_flags |= MLX5_QP_BIT_RAE; + *hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET; + } + if (access_flags & IB_ACCESS_REMOTE_WRITE) - hw_access_flags |= MLX5_QP_BIT_RWE; + *hw_access_flags |= MLX5_QP_BIT_RWE; + + *hw_access_flags = cpu_to_be32(*hw_access_flags); - return cpu_to_be32(hw_access_flags); + return 0; } enum { @@ -2458,7 +2676,8 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) } static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, - struct mlx5_ib_sq *sq, u8 sl) + struct mlx5_ib_sq *sq, u8 sl, + struct ib_pd *pd) { void *in; void *tisc; @@ -2471,6 +2690,7 @@ static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, return -ENOMEM; MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid); tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); @@ -2483,7 +2703,8 @@ static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, } static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, - struct mlx5_ib_sq *sq, u8 tx_affinity) + struct mlx5_ib_sq *sq, u8 tx_affinity, + struct ib_pd *pd) { void *in; void *tisc; @@ -2496,6 +2717,7 @@ static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, return -ENOMEM; MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1); + MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid); tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity); @@ -2580,7 +2802,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) return modify_raw_packet_eth_prio(dev->mdev, &qp->raw_packet_qp.sq, - sl & 0xf); + sl & 0xf, qp->ibqp.pd); return 0; } @@ -2728,9 +2950,9 
@@ static int ib_mask_to_mlx5_opt(int ib_mask) return result; } -static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, - struct mlx5_ib_rq *rq, int new_state, - const struct mlx5_modify_raw_qp_param *raw_qp_param) +static int modify_raw_packet_qp_rq( + struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd) { void *in; void *rqc; @@ -2743,6 +2965,7 @@ static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, return -ENOMEM; MLX5_SET(modify_rq_in, in, rq_state, rq->state); + MLX5_SET(modify_rq_in, in, uid, to_mpd(pd)->uid); rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(rqc, rqc, state, new_state); @@ -2753,8 +2976,9 @@ static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); } else - pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n", - dev->ib_dev.name); + dev_info_once( + &dev->ib_dev.dev, + "RAW PACKET QP counters are not supported on current FW\n"); } err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in, inlen); @@ -2768,10 +2992,9 @@ out: return err; } -static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, - struct mlx5_ib_sq *sq, - int new_state, - const struct mlx5_modify_raw_qp_param *raw_qp_param) +static int modify_raw_packet_qp_sq( + struct mlx5_core_dev *dev, struct mlx5_ib_sq *sq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd) { struct mlx5_ib_qp *ibqp = sq->base.container_mibqp; struct mlx5_rate_limit old_rl = ibqp->rl; @@ -2788,6 +3011,7 @@ static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, if (!in) return -ENOMEM; + MLX5_SET(modify_sq_in, in, uid, to_mpd(pd)->uid); MLX5_SET(modify_sq_in, in, sq_state, sq->state); sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); @@ -2890,7 +3114,8 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp 
*qp, } if (modify_rq) { - err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param); + err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param, + qp->ibqp.pd); if (err) return err; } @@ -2898,17 +3123,50 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (modify_sq) { if (tx_affinity) { err = modify_raw_packet_tx_affinity(dev->mdev, sq, - tx_affinity); + tx_affinity, + qp->ibqp.pd); if (err) return err; } - return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, raw_qp_param); + return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, + raw_qp_param, qp->ibqp.pd); } return 0; } +static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev, + struct mlx5_ib_pd *pd, + struct mlx5_ib_qp_base *qp_base, + u8 port_num) +{ + struct mlx5_ib_ucontext *ucontext = NULL; + unsigned int tx_port_affinity; + + if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context) + ucontext = to_mucontext(pd->ibpd.uobject->context); + + if (ucontext) { + tx_port_affinity = (unsigned int)atomic_add_return( + 1, &ucontext->tx_port_affinity) % + MLX5_MAX_PORTS + + 1; + mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n", + tx_port_affinity, qp_base->mqp.qpn, ucontext); + } else { + tx_port_affinity = + (unsigned int)atomic_add_return( + 1, &dev->roce[port_num].tx_port_affinity) % + MLX5_MAX_PORTS + + 1; + mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n", + tx_port_affinity, qp_base->mqp.qpn); + } + + return tx_port_affinity; +} + static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state, @@ -2974,6 +3232,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!context) return -ENOMEM; + pd = get_pd(qp); context->flags = cpu_to_be32(mlx5_st << 16); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { @@ -3002,9 +3261,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, (ibqp->qp_type == IB_QPT_XRC_TGT)) { if (mlx5_lag_is_active(dev->mdev)) { u8 p 
= mlx5_core_native_port_num(dev->mdev); - tx_affinity = (unsigned int)atomic_add_return(1, - &dev->roce[p].next_port) % - MLX5_MAX_PORTS + 1; + tx_affinity = get_tx_affinity(dev, pd, base, p); context->flags |= cpu_to_be32(tx_affinity << 24); } } @@ -3062,7 +3319,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, goto out; } - pd = get_pd(qp); get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, &recv_cq); @@ -3092,8 +3348,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); } - if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) - context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + __be32 access_flags = 0; + + err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags); + if (err) + goto out; + + context->params2 |= access_flags; + } if (attr_mask & IB_QP_MIN_RNR_TIMER) context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); @@ -3243,7 +3506,9 @@ static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new int req = IB_QP_STATE; int opt = 0; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + if (new_state == IB_QPS_RESET) { + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { req |= IB_QP_PKEY_INDEX | IB_QP_PORT; return is_valid_mask(attr_mask, req, opt); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { @@ -3307,10 +3572,14 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) MLX5_SET(dctc, dctc, rwe, 1); if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { - if (!mlx5_ib_dc_atomic_is_supported(dev)) + int atomic_mode; + + atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT); + if (atomic_mode < 0) return -EOPNOTSUPP; + + MLX5_SET(dctc, dctc, atomic_mode, atomic_mode); MLX5_SET(dctc, 
dctc, rae, 1); - MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX); } MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index); MLX5_SET(dctc, dctc, port, attr->port_num); @@ -3367,7 +3636,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, size_t required_cmd_sz; int err = -EINVAL; int port; - enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; if (ibqp->rwq_ind_tbl) return -ENOSYS; @@ -3413,7 +3681,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } if (qp->flags & MLX5_IB_QP_UNDERLAY) { @@ -3424,7 +3691,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } else if (qp_type != MLX5_IB_QPT_REG_UMR && qp_type != MLX5_IB_QPT_DCI && - !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, + attr_mask)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); goto out; @@ -4371,6 +4639,12 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, u8 next_fence = 0; u8 fence; + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); @@ -4380,13 +4654,6 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, spin_lock_irqsave(&qp->sq.lock, flags); - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && !drain) { - err = -EIO; - *bad_wr = wr; - nreq = 0; - goto out; - } - for (nreq = 0; wr; nreq++, wr = wr->next) { if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { mlx5_ib_warn(dev, "\n"); @@ -4700,18 +4967,17 @@ static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct 
ib_recv_wr *wr, int ind; int i; + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); spin_lock_irqsave(&qp->rq.lock, flags); - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && !drain) { - err = -EIO; - *bad_wr = wr; - nreq = 0; - goto out; - } - ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; nreq++, wr = wr->next) { @@ -5175,6 +5441,7 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_xrcd *xrcd; int err; + u16 uid; if (!MLX5_CAP_GEN(dev->mdev, xrc)) return ERR_PTR(-ENOSYS); @@ -5183,12 +5450,14 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, if (!xrcd) return ERR_PTR(-ENOMEM); - err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + uid = context ? to_mucontext(context)->devx_uid : 0; + err = mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, uid); if (err) { kfree(xrcd); return ERR_PTR(-ENOMEM); } + xrcd->uid = uid; return &xrcd->ibxrcd; } @@ -5196,9 +5465,10 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { struct mlx5_ib_dev *dev = to_mdev(xrcd->device); u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + u16 uid = to_mxrcd(xrcd)->uid; int err; - err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + err = mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, uid); if (err) mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); @@ -5268,6 +5538,7 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, if (!in) return -ENOMEM; + MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid); rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); @@ -5443,8 +5714,7 @@ static int prepare_user_rq(struct ib_pd *pd, err = create_user_rq(dev, pd, rwq, &ucmd); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); - if (err) - return err; + return err; } rwq->user_index = ucmd.user_index; @@ -5573,6 
+5843,9 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, for (i = 0; i < sz; i++) MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num); + rwq_ind_tbl->uid = to_mpd(init_attr->ind_tbl[0]->pd)->uid; + MLX5_SET(create_rqt_in, in, uid, rwq_ind_tbl->uid); + err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); kvfree(in); @@ -5591,7 +5864,7 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, return &rwq_ind_tbl->ib_rwq_ind_tbl; err_copy: - mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid); err: kfree(rwq_ind_tbl); return ERR_PTR(err); @@ -5602,7 +5875,7 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl); struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device); - mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid); kfree(rwq_ind_tbl); return 0; @@ -5653,6 +5926,7 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, if (wq_state == IB_WQS_ERR) wq_state = MLX5_RQC_STATE_ERR; MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); + MLX5_SET(modify_rq_in, in, uid, to_mpd(wq->pd)->uid); MLX5_SET(rqc, rqc, state, wq_state); if (wq_attr_mask & IB_WQ_FLAGS) { @@ -5684,8 +5958,9 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, MLX5_SET(rqc, rqc, counter_set_id, dev->port->cnts.set_id); } else - pr_info_once("%s: Receive WQ counters are not supported on current FW\n", - dev->ib_dev.name); + dev_info_once( + &dev->ib_dev.dev, + "Receive WQ counters are not supported on current FW\n"); } err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index d359fecf7a5b..d012e7dbcc38 100644 --- 
a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -144,6 +144,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; in->page_offset = offset; + in->uid = to_mpd(pd)->uid; if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && in->type != IB_SRQT_BASIC) in->user_index = uidx; diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 093f7755c843..2e5dc0a67cfc 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -58,8 +58,9 @@ static int mthca_update_rate(struct mthca_dev *dev, u8 port_num) ret = ib_query_port(&dev->ib_dev, port_num, tprops); if (ret) { - printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n", - ret, dev->ib_dev.name, port_num); + dev_warn(&dev->ib_dev.dev, + "ib_query_port failed (%d) forport %d\n", ret, + port_num); goto out; } diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index f3e80dec1334..92c49bff22bc 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -986,7 +986,8 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) goto err_free_dev; } - if (mthca_cmd_init(mdev)) { + err = mthca_cmd_init(mdev); + if (err) { mthca_err(mdev, "Failed to init command interface, aborting.\n"); goto err_free_dev; } @@ -1014,8 +1015,7 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) err = mthca_setup_hca(mdev); if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) { - if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) - pci_free_irq_vectors(pdev); + pci_free_irq_vectors(pdev); mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X; err = mthca_setup_hca(mdev); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 0d3473b4596e..691c6f048938 100644 --- 
a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1076,16 +1076,17 @@ static int mthca_unmap_fmr(struct list_head *fmr_list) return err; } -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->rev_id); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); @@ -1103,23 +1104,26 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, return sprintf(buf, "unknown\n"); } } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_board(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *mthca_dev_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *mthca_dev_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static const struct attribute_group mthca_attr_group = { + .attrs = mthca_dev_attributes, }; static int mthca_init_node_data(struct mthca_dev *dev) @@ -1192,13 +1196,11 @@ static void get_dev_fw_str(struct ib_device 
*device, char *str) int mthca_register_device(struct mthca_dev *dev) { int ret; - int i; ret = mthca_init_node_data(dev); if (ret) return ret; - strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; @@ -1296,20 +1298,12 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); + rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group); dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev, "mthca%d", NULL); if (ret) return ret; - for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) { - ret = device_create_file(&dev->ib_dev.dev, - mthca_dev_attributes[i]); - if (ret) { - ib_unregister_device(&dev->ib_dev); - return ret; - } - } - mthca_start_catas_poll(dev); return 0; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 3d37f2373d63..9d178ee3c96a 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -872,8 +872,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_UNSPECIFIED)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) { mthca_dbg(dev, "Bad QP transition (transport %d) " "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 42b68aa999fc..e00add6d78ec 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -456,9 +456,6 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) void __iomem *mmio_regs = NULL; u8 hw_rev; - assert(pcidev != NULL); - assert(ent != NULL); - printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n", DRV_VERSION, pci_name(pcidev)); diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index bedaa02749fb..a895fe980d10 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -149,18 +149,9 @@ do { \ printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \ } while (0) -#define assert(expr) \ -do { \ - if (!(expr)) { \ - printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \ - #expr, __FILE__, __func__, __LINE__); \ - } \ -} while (0) - #define NES_EVENT_TIMEOUT 1200000 #else #define nes_debug(level, fmt, args...) 
no_printk(fmt, ##args) -#define assert(expr) do {} while (0) #define NES_EVENT_TIMEOUT 100000 #endif diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index bd0675d8f298..5517e392bc01 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -1443,7 +1443,7 @@ static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_inde mdelay(1); nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - } while ((temp_phy_data2 == temp_phy_data)); + } while (temp_phy_data2 == temp_phy_data); /* wait for tracking */ counter = 0; diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 61014e251555..16f33454c198 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -146,8 +146,6 @@ static int nes_netdev_open(struct net_device *netdev) struct list_head *list_pos, *list_temp; unsigned long flags; - assert(nesdev != NULL); - if (nesvnic->netdev_open == 1) return 0; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 6940c7215961..92d1cadd4cfd 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -687,7 +687,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, } nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", - nespd, nesvnic->nesibdev->ibdev.name); + nespd, dev_name(&nesvnic->nesibdev->ibdev.dev)); nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd; @@ -2556,8 +2556,8 @@ static int nes_dereg_mr(struct ib_mr *ib_mr) /** * show_rev */ -static ssize_t show_rev(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct nes_ib_device *nesibdev = container_of(dev, struct nes_ib_device, ibdev.dev); @@ -2566,40 +2566,40 @@ static 
ssize_t show_rev(struct device *dev, struct device_attribute *attr, nes_debug(NES_DBG_INIT, "\n"); return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev); } - +static DEVICE_ATTR_RO(hw_rev); /** * show_hca */ -static ssize_t show_hca(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *dev, + struct device_attribute *attr, char *buf) { nes_debug(NES_DBG_INIT, "\n"); return sprintf(buf, "NES020\n"); } - +static DEVICE_ATTR_RO(hca_type); /** * show_board */ -static ssize_t show_board(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *dev, + struct device_attribute *attr, char *buf) { nes_debug(NES_DBG_INIT, "\n"); return sprintf(buf, "%.*s\n", 32, "NES020 Board ID"); } +static DEVICE_ATTR_RO(board_id); - -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); - -static struct device_attribute *nes_dev_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static struct attribute *nes_dev_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL }; +static const struct attribute_group nes_attr_group = { + .attrs = nes_dev_attributes, +}; /** * nes_query_qp @@ -3640,7 +3640,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) if (nesibdev == NULL) { return NULL; } - strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX); nesibdev->ibdev.owner = THIS_MODULE; nesibdev->ibdev.node_type = RDMA_NODE_RNIC; @@ -3795,10 +3794,11 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) struct nes_vnic *nesvnic = nesibdev->nesvnic; struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; - int i, ret; + int ret; + rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group); 
nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; - ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); + ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d", NULL); if (ret) { return ret; } @@ -3809,19 +3809,6 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count; nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count; - for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { - ret = device_create_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); - if (ret) { - while (i > 0) { - i--; - device_remove_file(&nesibdev->ibdev.dev, - nes_dev_attributes[i]); - } - ib_unregister_device(&nesibdev->ibdev); - return ret; - } - } - nesvnic->of_device_registered = 1; return 0; @@ -3834,15 +3821,9 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev) { struct nes_vnic *nesvnic = nesibdev->nesvnic; - int i; - for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { - device_remove_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); - } - - if (nesvnic->of_device_registered) { + if (nesvnic->of_device_registered) ib_unregister_device(&nesibdev->ibdev); - } nesvnic->of_device_registered = 0; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index e578281471af..241a57a07485 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -792,7 +792,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, qp->srq->ibsrq. 
srq_context); } else if (dev_event) { - pr_err("%s: Fatal event received\n", dev->ibdev.name); + dev_err(&dev->ibdev.dev, "Fatal event received\n"); ib_dispatch_event(&ib_evt); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 7832ee3e0c84..873cc7f6fe61 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -114,9 +114,37 @@ static void get_dev_fw_str(struct ib_device *device, char *str) snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", &dev->attr.fw_ver[0]); } +/* OCRDMA sysfs interface */ +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); +} +static DEVICE_ATTR_RO(hca_type); + +static struct attribute *ocrdma_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + NULL +}; + +static const struct attribute_group ocrdma_attr_group = { + .attrs = ocrdma_attributes, +}; + static int ocrdma_register_device(struct ocrdma_dev *dev) { - strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX); ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid); BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, @@ -213,8 +241,9 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.destroy_srq = ocrdma_destroy_srq; dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; } + rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; - return ib_register_device(&dev->ibdev, NULL); + return ib_register_device(&dev->ibdev, 
"ocrdma%d", NULL); } static int ocrdma_alloc_resources(struct ocrdma_dev *dev) @@ -260,42 +289,9 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev) kfree(dev->cq_tbl); } -/* OCRDMA sysfs interface */ -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct ocrdma_dev *dev = dev_get_drvdata(device); - - return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); -} - -static ssize_t show_hca_type(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ocrdma_dev *dev = dev_get_drvdata(device); - - return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); -} - -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); - -static struct device_attribute *ocrdma_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type -}; - -static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) - device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); -} - static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) { - int status = 0, i; + int status = 0; u8 lstate = 0; struct ocrdma_dev *dev; @@ -331,9 +327,6 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) if (!status) ocrdma_update_link_state(dev, lstate); - for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) - if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) - goto sysfs_err; /* Init stats */ ocrdma_add_port_stats(dev); /* Interrupt Moderation */ @@ -348,8 +341,6 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) dev_name(&dev->nic_info.pdev->dev), dev->id); return dev; -sysfs_err: - ocrdma_remove_sysfiles(dev); alloc_err: ocrdma_free_resources(dev); ocrdma_cleanup_hw(dev); @@ -376,7 +367,6 @@ static void ocrdma_remove(struct ocrdma_dev *dev) * of the registered clients. 
*/ cancel_delayed_work_sync(&dev->eqd_work); - ocrdma_remove_sysfiles(dev); ib_unregister_device(&dev->ibdev); ocrdma_rem_port_stats(dev); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 24d20a4aa262..290d776edf48 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -764,7 +764,8 @@ void ocrdma_add_port_stats(struct ocrdma_dev *dev) return; /* Create post stats base dir */ - dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir); + dev->dir = + debugfs_create_dir(dev_name(&dev->ibdev.dev), ocrdma_dbgfs_dir); if (!dev->dir) goto err; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c158ca9fde6d..06d2a7f3304c 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1480,8 +1480,7 @@ int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_qps = old_qps; spin_unlock_irqrestore(&qp->q_lock, flags); - if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask)) { pr_err("%s(%d) invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index a0af6d424aed..8d6ff9df49fe 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -133,6 +133,33 @@ static int qedr_iw_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +/* QEDR sysfs interface */ +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qedr_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); +} +static DEVICE_ATTR_RO(hw_rev); + +static 
ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", "HCA_TYPE_TO_SET"); +} +static DEVICE_ATTR_RO(hca_type); + +static struct attribute *qedr_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + NULL +}; + +static const struct attribute_group qedr_attr_group = { + .attrs = qedr_attributes, +}; + static int qedr_iw_register_device(struct qedr_dev *dev) { dev->ibdev.node_type = RDMA_NODE_RNIC; @@ -170,8 +197,6 @@ static int qedr_register_device(struct qedr_dev *dev) { int rc; - strlcpy(dev->ibdev.name, "qedr%d", IB_DEVICE_NAME_MAX); - dev->ibdev.node_guid = dev->attr.node_guid; memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; @@ -262,9 +287,9 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.get_link_layer = qedr_link_layer; dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str; - + rdma_set_device_sysfs_group(&dev->ibdev, &qedr_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_QEDR; - return ib_register_device(&dev->ibdev, NULL); + return ib_register_device(&dev->ibdev, "qedr%d", NULL); } /* This function allocates fast-path status block memory */ @@ -404,37 +429,6 @@ err1: return rc; } -/* QEDR sysfs interface */ -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct qedr_dev *dev = dev_get_drvdata(device); - - return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); -} - -static ssize_t show_hca_type(struct device *device, - struct device_attribute *attr, char *buf) -{ - return scnprintf(buf, PAGE_SIZE, "%s\n", "HCA_TYPE_TO_SET"); -} - -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); - -static struct device_attribute *qedr_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type -}; - -static void qedr_remove_sysfiles(struct qedr_dev *dev) -{ - int i; - - for (i = 0; i < 
ARRAY_SIZE(qedr_attributes); i++) - device_remove_file(&dev->ibdev.dev, qedr_attributes[i]); -} - static void qedr_pci_set_atomic(struct qedr_dev *dev, struct pci_dev *pdev) { int rc = pci_enable_atomic_ops_to_root(pdev, @@ -855,7 +849,7 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, { struct qed_dev_rdma_info dev_info; struct qedr_dev *dev; - int rc = 0, i; + int rc = 0; dev = (struct qedr_dev *)ib_alloc_device(sizeof(*dev)); if (!dev) { @@ -914,18 +908,12 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, goto reg_err; } - for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++) - if (device_create_file(&dev->ibdev.dev, qedr_attributes[i])) - goto sysfs_err; - if (!test_and_set_bit(QEDR_ENET_STATE_BIT, &dev->enet_state)) qedr_ib_dispatch_event(dev, QEDR_PORT, IB_EVENT_PORT_ACTIVE); DP_DEBUG(dev, QEDR_MSG_INIT, "qedr driver loaded successfully\n"); return dev; -sysfs_err: - ib_unregister_device(&dev->ibdev); reg_err: qedr_sync_free_irqs(dev); irq_err: @@ -944,7 +932,6 @@ static void qedr_remove(struct qedr_dev *dev) /* First unregister with stack to stop all the active traffic * of the registered clients. 
*/ - qedr_remove_sysfiles(dev); ib_unregister_device(&dev->ibdev); qedr_stop_hw(dev); diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index a2d708dceb8d..53bbe6b4e6e6 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -43,7 +43,7 @@ #include "qedr_hsi_rdma.h" #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA" -#define DP_NAME(dev) ((dev)->ibdev.name) +#define DP_NAME(_dev) dev_name(&(_dev)->ibdev.dev) #define IS_IWARP(_dev) ((_dev)->rdma_type == QED_RDMA_TYPE_IWARP) #define IS_ROCE(_dev) ((_dev)->rdma_type == QED_RDMA_TYPE_ROCE) diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.c b/drivers/infiniband/hw/qedr/qedr_roce_cm.c index 85578887421b..e1ac2fd60bb1 100644 --- a/drivers/infiniband/hw/qedr/qedr_roce_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.c @@ -519,9 +519,9 @@ static inline int qedr_gsi_build_packet(struct qedr_dev *dev, } if (ether_addr_equal(udh.eth.smac_h, udh.eth.dmac_h)) - packet->tx_dest = QED_ROCE_LL2_TX_DEST_LB; + packet->tx_dest = QED_LL2_TX_DEST_LB; else - packet->tx_dest = QED_ROCE_LL2_TX_DEST_NW; + packet->tx_dest = QED_LL2_TX_DEST_NW; packet->roce_mode = roce_mode; memcpy(packet->header.vaddr, ud_header_buffer, header_size); diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 8cc3df24e04e..82ee4b4a7084 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1447,7 +1447,6 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, u64 pbl_base_addr, phy_prod_pair_addr; struct ib_ucontext *ib_ctx = NULL; struct qedr_srq_hwq_info *hw_srq; - struct qedr_ucontext *ctx = NULL; u32 page_cnt, page_size; struct qedr_srq *srq; int rc = 0; @@ -1473,7 +1472,6 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, if (udata && ibpd->uobject && ibpd->uobject->context) { ib_ctx = ibpd->uobject->context; - ctx = get_qedr_ucontext(ib_ctx); if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { DP_ERR(dev, @@ 
-2240,8 +2238,7 @@ int qedr_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (rdma_protocol_roce(&dev->ibdev, 1)) { if (!ib_modify_qp_is_ok(old_qp_state, new_qp_state, - ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + ibqp->qp_type, attr_mask)) { DP_ERR(dev, "modify qp: invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qp_state=0x%x, new_qp_state=0x%x\n", diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 3461df002f81..83d2349188db 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1390,13 +1390,13 @@ static inline u32 qib_get_hdrqtail(const struct qib_ctxtdata *rcd) */ extern const char ib_qib_version[]; +extern const struct attribute_group qib_attr_group; int qib_device_create(struct qib_devdata *); void qib_device_remove(struct qib_devdata *); int qib_create_port_files(struct ib_device *ibdev, u8 port_num, struct kobject *kobj); -int qib_verbs_register_sysfs(struct qib_devdata *); void qib_verbs_unregister_sysfs(struct qib_devdata *); /* Hook for sysfs read of QSFP */ extern int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len); diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 344e401915f7..a81905df2d0f 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -378,25 +378,22 @@ void qib_flush_qp_waiters(struct rvt_qp *qp) * qib_check_send_wqe - validate wr/wqe * @qp - The qp * @wqe - The built wqe + * @call_send - Determine if the send should be posted or scheduled * - * validate wr/wqe. This is called - * prior to inserting the wqe into - * the ring but after the wqe has been - * setup. 
- * - * Returns 1 to force direct progress, 0 otherwise, -EINVAL on failure + * Returns 0 on success, -EINVAL on failure */ int qib_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe) + struct rvt_swqe *wqe, bool *call_send) { struct rvt_ah *ah; - int ret = 0; switch (qp->ibqp.qp_type) { case IB_QPT_RC: case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; + if (wqe->length > qp->pmtu) + *call_send = false; break; case IB_QPT_SMI: case IB_QPT_GSI: @@ -405,12 +402,12 @@ int qib_check_send_wqe(struct rvt_qp *qp, if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; /* progress hint */ - ret = 1; + *call_send = true; break; default: break; } - return ret; + return 0; } #ifdef CONFIG_DEBUG_FS diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index f35fdeb14347..6fa002940451 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -254,7 +254,7 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? 
IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done; @@ -838,7 +838,7 @@ void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait) qib_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; } else /* XXX need to handle delayed completion */ @@ -1221,7 +1221,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1425,7 +1425,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1471,7 +1472,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void) do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -1490,7 +1492,7 @@ ack_len_err: status = IB_WC_LOC_LEN_ERR; ack_err: if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: @@ -1844,7 +1846,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - qib_copy_sge(&qp->r_sge, data, pmtu, 1); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -1890,7 +1892,7 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto 
nack_inv; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); qp->r_msn++; if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index f8a7de795beb..1fa21938f310 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -171,307 +171,6 @@ err: } /** - * qib_ruc_loopback - handle UC and RC lookback requests - * @sqp: the sending QP - * - * This is called from qib_do_send() to - * forward a WQE addressed to the same HCA. - * Note that although we are single threaded due to the tasklet, we still - * have to protect against post_send(). We don't have to worry about - * receive interrupts since this is a connected protocol and all packets - * will pass through here. - */ -static void qib_ruc_loopback(struct rvt_qp *sqp) -{ - struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct qib_pportdata *ppd = ppd_from_ibp(ibp); - struct qib_devdata *dd = ppd->dd; - struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; - struct rvt_qp *qp; - struct rvt_swqe *wqe; - struct rvt_sge *sge; - unsigned long flags; - struct ib_wc wc; - u64 sdata; - atomic64_t *maddr; - enum ib_wc_status send_status; - int release; - int ret; - - rcu_read_lock(); - /* - * Note that we check the responder QP state after - * checking the requester's state. - */ - qp = rvt_lookup_qpn(rdi, &ibp->rvp, sqp->remote_qpn); - if (!qp) - goto done; - - spin_lock_irqsave(&sqp->s_lock, flags); - - /* Return if we are already busy processing a work request. */ - if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || - !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - goto unlock; - - sqp->s_flags |= RVT_S_BUSY; - -again: - if (sqp->s_last == READ_ONCE(sqp->s_head)) - goto clr_busy; - wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); - - /* Return if it is not OK to start a new work reqeust. 
*/ - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { - if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) - goto clr_busy; - /* We are in the error state, flush the work request. */ - send_status = IB_WC_WR_FLUSH_ERR; - goto flush_send; - } - - /* - * We can rely on the entry not changing without the s_lock - * being held until we update s_last. - * We increment s_cur to indicate s_last is in progress. - */ - if (sqp->s_last == sqp->s_cur) { - if (++sqp->s_cur >= sqp->s_size) - sqp->s_cur = 0; - } - spin_unlock_irqrestore(&sqp->s_lock, flags); - - if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || - qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->rvp.n_pkt_drops++; - /* - * For RC, the requester would timeout and retry so - * shortcut the timeouts and just signal too many retries. - */ - if (sqp->ibqp.qp_type == IB_QPT_RC) - send_status = IB_WC_RETRY_EXC_ERR; - else - send_status = IB_WC_SUCCESS; - goto serr; - } - - memset(&wc, 0, sizeof(wc)); - send_status = IB_WC_SUCCESS; - - release = 1; - sqp->s_sge.sge = wqe->sg_list[0]; - sqp->s_sge.sg_list = wqe->sg_list + 1; - sqp->s_sge.num_sge = wqe->wr.num_sge; - sqp->s_len = wqe->length; - switch (wqe->wr.opcode) { - case IB_WR_SEND_WITH_IMM: - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - /* FALLTHROUGH */ - case IB_WR_SEND: - ret = rvt_get_rwqe(qp, false); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - break; - - case IB_WR_RDMA_WRITE_WITH_IMM: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = rvt_get_rwqe(qp, true); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - /* FALLTHROUGH */ - case IB_WR_RDMA_WRITE: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - if (wqe->length == 0) - break; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - 
wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_WRITE))) - goto acc_err; - qp->r_sge.sg_list = NULL; - qp->r_sge.num_sge = 1; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_RDMA_READ: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_READ))) - goto acc_err; - release = 0; - sqp->s_sge.sg_list = NULL; - sqp->s_sge.num_sge = 1; - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->wr.num_sge; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - wqe->atomic_wr.remote_addr, - wqe->atomic_wr.rkey, - IB_ACCESS_REMOTE_ATOMIC))) - goto acc_err; - /* Perform atomic OP and save result. */ - maddr = (atomic64_t *) qp->r_sge.sge.vaddr; - sdata = wqe->atomic_wr.compare_add; - *(u64 *) sqp->s_sge.sge.vaddr = - (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? 
- (u64) atomic64_add_return(sdata, maddr) - sdata : - (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, - sdata, wqe->atomic_wr.swap); - rvt_put_mr(qp->r_sge.sge.mr); - qp->r_sge.num_sge = 0; - goto send_comp; - - default: - send_status = IB_WC_LOC_QP_OP_ERR; - goto serr; - } - - sge = &sqp->s_sge.sge; - while (sqp->s_len) { - u32 len = sqp->s_len; - - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; - BUG_ON(len == 0); - qib_copy_sge(&qp->r_sge, sge->vaddr, len, release); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (!release) - rvt_put_mr(sge->mr); - if (--sqp->s_sge.num_sge) - *sge = *sqp->s_sge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } - sqp->s_len -= len; - } - if (release) - rvt_put_ss(&qp->r_sge); - - if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) - goto send_comp; - - if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) - wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - else - wc.opcode = IB_WC_RECV; - wc.wr_id = qp->r_wr_id; - wc.status = IB_WC_SUCCESS; - wc.byte_len = wqe->length; - wc.qp = &qp->ibqp; - wc.src_qp = qp->remote_qpn; - wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr); - wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); - wc.port_num = 1; - /* Signal completion event if the solicited bit is set. 
*/ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); - -send_comp: - spin_lock_irqsave(&sqp->s_lock, flags); - ibp->rvp.n_loop_pkts++; -flush_send: - sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - qib_send_complete(sqp, wqe, send_status); - goto again; - -rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - ibp->rvp.n_rnr_naks++; - /* - * Note: we don't need the s_lock held since the BUSY flag - * makes this single threaded. - */ - if (sqp->s_rnr_retry == 0) { - send_status = IB_WC_RNR_RETRY_EXC_ERR; - goto serr; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) - goto clr_busy; - rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << - IB_AETH_CREDIT_SHIFT); - goto clr_busy; - -op_err: - send_status = IB_WC_REM_OP_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -inv_err: - send_status = IB_WC_REM_INV_REQ_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -acc_err: - send_status = IB_WC_REM_ACCESS_ERR; - wc.status = IB_WC_LOC_PROT_ERR; -err: - /* responder goes to error state */ - rvt_rc_error(qp, wc.status); - -serr: - spin_lock_irqsave(&sqp->s_lock, flags); - qib_send_complete(sqp, wqe, send_status); - if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - - sqp->s_flags &= ~RVT_S_BUSY; - spin_unlock_irqrestore(&sqp->s_lock, flags); - if (lastwqe) { - struct ib_event ev; - - ev.device = sqp->ibqp.device; - ev.element.qp = &sqp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); - } - goto done; - } -clr_busy: - sqp->s_flags &= ~RVT_S_BUSY; -unlock: - spin_unlock_irqrestore(&sqp->s_lock, flags); -done: - rcu_read_unlock(); -} - -/** * qib_make_grh - construct a GRH header * @ibp: a pointer to the IB port * @hdr: a pointer to the GRH header being constructed @@ -573,7 +272,7 @@ void 
qib_do_send(struct rvt_qp *qp) qp->ibqp.qp_type == IB_QPT_UC) && (rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == ppd->lid) { - qib_ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } @@ -613,42 +312,3 @@ void qib_do_send(struct rvt_qp *qp) spin_unlock_irqrestore(&qp->s_lock, flags); } - -/* - * This should be called with s_lock held. - */ -void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status) -{ - u32 old_last, last; - - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - return; - - last = qp->s_last; - old_last = last; - if (++last >= qp->s_size) - last = 0; - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_swqe(wqe); - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); - - rvt_qp_swqe_complete(qp, - wqe, - ib_qib_wc_opcode[wqe->wr.opcode], - status); - - if (qp->s_acked == old_last) - qp->s_acked = last; - if (qp->s_cur == old_last) - qp->s_cur = last; - if (qp->s_tail == old_last) - qp->s_tail = last; - if (qp->state == IB_QPS_SQD && last == qp->s_cur) - qp->s_draining = 0; -} diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c index d0723d4aef5c..757d4c9d713d 100644 --- a/drivers/infiniband/hw/qib/qib_sdma.c +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -651,7 +651,7 @@ unmap: if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) rvt_error_qp(qp, IB_WC_GENERAL_ERR); } else if (qp->s_wqe) - qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock(&qp->s_lock); spin_unlock(&qp->r_lock); /* return zero to process the next send work request */ diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index ca2638d8f35e..1cf4ca3f23e3 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ 
-551,17 +551,18 @@ static struct kobj_type qib_diagc_ktype = { * Start of per-unit (or driver, in some cases, but replicated * per unit) functions (these get a device *) */ -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) { struct qib_ibdev *dev = container_of(device, struct qib_ibdev, rdi.ibdev.dev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = container_of(device, struct qib_ibdev, rdi.ibdev.dev); @@ -574,15 +575,18 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); return ret; } +static DEVICE_ATTR_RO(hca_type); +static DEVICE_ATTR(board_id, 0444, hca_type_show, NULL); -static ssize_t show_version(struct device *device, +static ssize_t version_show(struct device *device, struct device_attribute *attr, char *buf) { /* The string printed here is already newline-terminated. */ return scnprintf(buf, PAGE_SIZE, "%s", (char *)ib_qib_version); } +static DEVICE_ATTR_RO(version); -static ssize_t show_boardversion(struct device *device, +static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -592,9 +596,9 @@ static ssize_t show_boardversion(struct device *device, /* The string printed here is already newline-terminated. 
*/ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); } +static DEVICE_ATTR_RO(boardversion); - -static ssize_t show_localbus_info(struct device *device, +static ssize_t localbus_info_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -604,9 +608,9 @@ static ssize_t show_localbus_info(struct device *device, /* The string printed here is already newline-terminated. */ return scnprintf(buf, PAGE_SIZE, "%s", dd->lbus_info); } +static DEVICE_ATTR_RO(localbus_info); - -static ssize_t show_nctxts(struct device *device, +static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -620,9 +624,10 @@ static ssize_t show_nctxts(struct device *device, (dd->first_user_ctxt > dd->cfgctxts) ? 0 : (dd->cfgctxts - dd->first_user_ctxt)); } +static DEVICE_ATTR_RO(nctxts); -static ssize_t show_nfreectxts(struct device *device, - struct device_attribute *attr, char *buf) +static ssize_t nfreectxts_show(struct device *device, + struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = container_of(device, struct qib_ibdev, rdi.ibdev.dev); @@ -631,8 +636,9 @@ static ssize_t show_nfreectxts(struct device *device, /* Return the number of free user ports (contexts) available. */ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); } +static DEVICE_ATTR_RO(nfreectxts); -static ssize_t show_serial(struct device *device, +static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -644,8 +650,9 @@ static ssize_t show_serial(struct device *device, strcat(buf, "\n"); return strlen(buf); } +static DEVICE_ATTR_RO(serial); -static ssize_t store_chip_reset(struct device *device, +static ssize_t chip_reset_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { @@ -663,11 +670,12 @@ static ssize_t store_chip_reset(struct device *device, bail: return ret < 0 ? 
ret : count; } +static DEVICE_ATTR_WO(chip_reset); /* * Dump tempsense regs. in decimal, to ease shell-scripts. */ -static ssize_t show_tempsense(struct device *device, +static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -695,6 +703,7 @@ static ssize_t show_tempsense(struct device *device, *(signed char *)(regvals + 7)); return ret; } +static DEVICE_ATTR_RO(tempsense); /* * end of per-unit (or driver, in some cases, but replicated @@ -702,30 +711,23 @@ static ssize_t show_tempsense(struct device *device, */ /* start of per-unit file structures and support code */ -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(version, S_IRUGO, show_version, NULL); -static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); -static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); -static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); -static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); -static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); -static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); -static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); - -static struct device_attribute *qib_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id, - &dev_attr_version, - &dev_attr_nctxts, - &dev_attr_nfreectxts, - &dev_attr_serial, - &dev_attr_boardversion, - &dev_attr_tempsense, - &dev_attr_localbus_info, - &dev_attr_chip_reset, +static struct attribute *qib_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + &dev_attr_version.attr, + &dev_attr_nctxts.attr, + &dev_attr_nfreectxts.attr, + &dev_attr_serial.attr, + &dev_attr_boardversion.attr, + &dev_attr_tempsense.attr, + &dev_attr_localbus_info.attr, + &dev_attr_chip_reset.attr, + NULL, +}; + +const struct 
attribute_group qib_attr_group = { + .attrs = qib_attributes, }; int qib_create_port_files(struct ib_device *ibdev, u8 port_num, @@ -827,27 +829,6 @@ bail: } /* - * Register and create our files in /sys/class/infiniband. - */ -int qib_verbs_register_sysfs(struct qib_devdata *dd) -{ - struct ib_device *dev = &dd->verbs_dev.rdi.ibdev; - int i, ret; - - for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) { - ret = device_create_file(&dev->dev, qib_attributes[i]); - if (ret) - goto bail; - } - - return 0; -bail: - for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) - device_remove_file(&dev->dev, qib_attributes[i]); - return ret; -} - -/* * Unregister and remove our files in /sys/class/infiniband. */ void qib_verbs_unregister_sysfs(struct qib_devdata *dd) diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 3e54bc11e0ae..30c70ad0f4bf 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -68,7 +68,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; } @@ -359,7 +359,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - qib_copy_sge(&qp->r_sge, data, pmtu, 0); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -385,7 +385,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - qib_copy_sge(&qp->r_sge, data, tlen, 0); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -449,7 +449,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - qib_copy_sge(&qp->r_sge, data, pmtu, 1); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ 
-479,7 +479,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -495,7 +495,7 @@ rdma_last: tlen -= (hdrsize + pad + 4); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index f8d029a2390f..4d4c31ea4e2d 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -162,8 +162,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); qib_make_grh(ibp, &grh, grd, 0, 0); - qib_copy_sge(&qp->r_sge, &grh, - sizeof(grh), 1); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -179,7 +179,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; BUG_ON(len == 0); - qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -260,7 +260,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; } @@ -304,7 +304,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) qib_ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, tflags); *flags = tflags; - qib_send_complete(qp, wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, IB_WC_SUCCESS); goto done; } } @@ -551,12 +551,13 @@ void qib_ud_rcv(struct qib_ibport *ibp, 
struct ib_header *hdr, goto drop; } if (has_grh) { - qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, - sizeof(struct ib_grh), 1); + rvt_copy_sge(qp, &qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); - qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 41babbc0db58..4b0f5761a646 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -131,27 +131,6 @@ const enum ib_wc_opcode ib_qib_wc_opcode[] = { */ __be64 ib_qib_sys_image_guid; -/** - * qib_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - */ -void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release) -{ - struct rvt_sge *sge = &ss->sge; - - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - memcpy(sge->vaddr, data, len); - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } -} - /* * Count the number of DMA descriptors needed to send length bytes of data. * Don't modify the qib_sge_state to get the count. 
@@ -752,7 +731,7 @@ static void sdma_complete(struct qib_sdma_txreq *cookie, int status) spin_lock(&qp->s_lock); if (tx->wqe) - qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); else if (qp->ibqp.qp_type == IB_QPT_RC) { struct ib_header *hdr; @@ -1025,7 +1004,7 @@ done: } if (qp->s_wqe) { spin_lock_irqsave(&qp->s_lock, flags); - qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); @@ -1512,6 +1491,9 @@ static void qib_fill_device_attr(struct qib_devdata *dd) rdi->dparms.props.max_mcast_grp; /* post send table */ dd->verbs_dev.rdi.post_parms = qib_post_parms; + + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_qib_wc_opcode; } /** @@ -1588,7 +1570,7 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; - dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe; dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah; dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn; dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc; @@ -1631,6 +1613,7 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id; dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB; dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE; + dd->verbs_dev.rdi.dparms.sge_copy_mode = RVT_SGE_COPY_MEMCPY; qib_fill_device_attr(dd); @@ -1642,19 +1625,14 @@ int qib_register_ib_device(struct qib_devdata *dd) i, dd->rcd[ctxt]->pkeys); } + rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &qib_attr_group); ret = rvt_register_device(&dd->verbs_dev.rdi, 
RDMA_DRIVER_QIB); if (ret) goto err_tx; - ret = qib_verbs_register_sysfs(dd); - if (ret) - goto err_class; - return ret; -err_class: - rvt_unregister_device(&dd->verbs_dev.rdi); err_tx: while (!list_empty(&dev->txreq_free)) { struct list_head *l = dev->txreq_free.next; @@ -1716,14 +1694,14 @@ void qib_unregister_ib_device(struct qib_devdata *dd) * It is only used in post send, which doesn't hold * the s_lock. */ -void _qib_schedule_send(struct rvt_qp *qp) +bool _qib_schedule_send(struct rvt_qp *qp) { struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct qib_pportdata *ppd = ppd_from_ibp(ibp); struct qib_qp_priv *priv = qp->priv; - queue_work(ppd->qib_wq, &priv->s_work); + return queue_work(ppd->qib_wq, &priv->s_work); } /** @@ -1733,8 +1711,9 @@ void _qib_schedule_send(struct rvt_qp *qp) * This schedules qp progress. The s_lock * should be held. */ -void qib_schedule_send(struct rvt_qp *qp) +bool qib_schedule_send(struct rvt_qp *qp) { if (qib_send_ok(qp)) - _qib_schedule_send(qp); + return _qib_schedule_send(qp); + return false; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 666613eef88f..a4426c24b0d1 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2018 Intel Corporation. All rights reserved. * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. 
* @@ -223,8 +223,8 @@ static inline int qib_send_ok(struct rvt_qp *qp) !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); } -void _qib_schedule_send(struct rvt_qp *qp); -void qib_schedule_send(struct rvt_qp *qp); +bool _qib_schedule_send(struct rvt_qp *qp); +bool qib_schedule_send(struct rvt_qp *qp); static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) { @@ -292,9 +292,6 @@ void qib_put_txreq(struct qib_verbs_txreq *tx); int qib_verbs_send(struct rvt_qp *qp, struct ib_header *hdr, u32 hdrwords, struct rvt_sge_state *ss, u32 len); -void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, - int release); - void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int has_grh, void *data, u32 tlen, struct rvt_qp *qp); @@ -303,7 +300,8 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, int qib_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); -int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); @@ -333,9 +331,6 @@ void _qib_do_send(struct work_struct *work); void qib_do_send(struct rvt_qp *qp); -void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status); - void qib_send_rc_ack(struct rvt_qp *qp); int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags); diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c index 92dc66cc2d50..a3115709fb03 100644 --- a/drivers/infiniband/hw/usnic/usnic_debugfs.c +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -165,6 +165,5 @@ void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow) { - if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) - debugfs_remove(qp_flow->dbgfs_dentry); + debugfs_remove(qp_flow->dbgfs_dentry); } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c 
b/drivers/infiniband/hw/usnic/usnic_ib_main.c index f0538a460328..73bd00f8d2c8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -76,7 +76,7 @@ static LIST_HEAD(usnic_ib_ibdev_list); static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz) { struct usnic_ib_vf *vf = obj; - return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name); + return scnprintf(buf, buf_sz, "PF: %s ", dev_name(&vf->pf->ib_dev.dev)); } /* End callback dump funcs */ @@ -138,7 +138,7 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, netdev = us_ibdev->netdev; switch (event) { case NETDEV_REBOOT: - usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name); + usnic_info("PF Reset on %s\n", dev_name(&us_ibdev->ib_dev.dev)); usnic_ib_qp_grp_modify_active_to_err(us_ibdev); ib_event.event = IB_EVENT_PORT_ERR; ib_event.device = &us_ibdev->ib_dev; @@ -151,7 +151,8 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, if (!us_ibdev->ufdev->link_up && netif_carrier_ok(netdev)) { usnic_fwd_carrier_up(us_ibdev->ufdev); - usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name); + usnic_info("Link UP on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); ib_event.event = IB_EVENT_PORT_ACTIVE; ib_event.device = &us_ibdev->ib_dev; ib_event.element.port_num = 1; @@ -159,7 +160,8 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, } else if (us_ibdev->ufdev->link_up && !netif_carrier_ok(netdev)) { usnic_fwd_carrier_down(us_ibdev->ufdev); - usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name); + usnic_info("Link DOWN on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); usnic_ib_qp_grp_modify_active_to_err(us_ibdev); ib_event.event = IB_EVENT_PORT_ERR; ib_event.device = &us_ibdev->ib_dev; @@ -168,17 +170,17 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, } else { usnic_dbg("Ignoring %s on %s\n", netdev_cmd_to_name(event), - us_ibdev->ib_dev.name); + 
dev_name(&us_ibdev->ib_dev.dev)); } break; case NETDEV_CHANGEADDR: if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, sizeof(us_ibdev->ufdev->mac))) { usnic_dbg("Ignoring addr change on %s\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } else { usnic_info(" %s old mac: %pM new mac: %pM\n", - us_ibdev->ib_dev.name, + dev_name(&us_ibdev->ib_dev.dev), us_ibdev->ufdev->mac, netdev->dev_addr); usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr); @@ -193,19 +195,19 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, case NETDEV_CHANGEMTU: if (us_ibdev->ufdev->mtu != netdev->mtu) { usnic_info("MTU Change on %s old: %u new: %u\n", - us_ibdev->ib_dev.name, + dev_name(&us_ibdev->ib_dev.dev), us_ibdev->ufdev->mtu, netdev->mtu); usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu); usnic_ib_qp_grp_modify_active_to_err(us_ibdev); } else { usnic_dbg("Ignoring MTU change on %s\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } break; default: usnic_dbg("Ignoring event %s on %s", netdev_cmd_to_name(event), - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } mutex_unlock(&us_ibdev->usdev_lock); } @@ -267,7 +269,7 @@ static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev, default: usnic_info("Ignoring event %s on %s", netdev_cmd_to_name(event), - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } mutex_unlock(&us_ibdev->usdev_lock); @@ -364,7 +366,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; us_ibdev->ib_dev.dev.parent = &dev->dev; us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; - strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); us_ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | @@ -416,7 +417,9 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; - if (ib_register_device(&us_ibdev->ib_dev, NULL)) + 
rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group); + + if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", NULL)) goto err_fwd_dealloc; usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); @@ -437,9 +440,9 @@ static void *usnic_ib_device_add(struct pci_dev *dev) kref_init(&us_ibdev->vf_cnt); usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n", - us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev), - us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up, - us_ibdev->ufdev->mtu); + dev_name(&us_ibdev->ib_dev.dev), + netdev_name(us_ibdev->netdev), us_ibdev->ufdev->mac, + us_ibdev->ufdev->link_up, us_ibdev->ufdev->mtu); return us_ibdev; err_fwd_dealloc: @@ -452,7 +455,7 @@ err_dealloc: static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev) { - usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name); + usnic_info("Unregistering %s\n", dev_name(&us_ibdev->ib_dev.dev)); usnic_ib_sysfs_unregister_usdev(us_ibdev); usnic_fwd_dev_free(us_ibdev->ufdev); ib_unregister_device(&us_ibdev->ib_dev); @@ -591,7 +594,7 @@ static int usnic_ib_pci_probe(struct pci_dev *pdev, mutex_unlock(&pf->usdev_lock); usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev), - pf->ib_dev.name); + dev_name(&pf->ib_dev.dev)); usnic_ib_log_vf(vf); return 0; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index 4210ca14014d..a7e4b2ccfaf8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -46,9 +46,8 @@ #include "usnic_ib_sysfs.h" #include "usnic_log.h" -static ssize_t usnic_ib_show_board(struct device *device, - struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); @@ -60,13 +59,13 @@ static ssize_t usnic_ib_show_board(struct device *device, return 
scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id); } +static DEVICE_ATTR_RO(board_id); /* * Report the configuration for this PF */ static ssize_t -usnic_ib_show_config(struct device *device, struct device_attribute *attr, - char *buf) +config_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; char *ptr; @@ -94,7 +93,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr, n = scnprintf(ptr, left, "%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:", - us_ibdev->ib_dev.name, + dev_name(&us_ibdev->ib_dev.dev), busname, PCI_SLOT(us_ibdev->pdev->devfn), PCI_FUNC(us_ibdev->pdev->devfn), @@ -119,17 +118,17 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr, UPDATE_PTR_LEFT(n, ptr, left); } else { n = scnprintf(ptr, left, "%s: no VFs\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); UPDATE_PTR_LEFT(n, ptr, left); } mutex_unlock(&us_ibdev->usdev_lock); return ptr - buf; } +static DEVICE_ATTR_RO(config); static ssize_t -usnic_ib_show_iface(struct device *device, struct device_attribute *attr, - char *buf) +iface_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; @@ -138,10 +137,10 @@ usnic_ib_show_iface(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "%s\n", netdev_name(us_ibdev->netdev)); } +static DEVICE_ATTR_RO(iface); static ssize_t -usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, - char *buf) +max_vf_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; @@ -150,10 +149,10 @@ usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "%u\n", kref_read(&us_ibdev->vf_cnt)); } +static DEVICE_ATTR_RO(max_vf); static ssize_t -usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, - char *buf) +qp_per_vf_show(struct device 
*device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; int qp_per_vf; @@ -165,10 +164,10 @@ usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "%d\n", qp_per_vf); } +static DEVICE_ATTR_RO(qp_per_vf); static ssize_t -usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, - char *buf) +cq_per_vf_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; @@ -177,21 +176,20 @@ usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "%d\n", us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); } +static DEVICE_ATTR_RO(cq_per_vf); + +static struct attribute *usnic_class_attributes[] = { + &dev_attr_board_id.attr, + &dev_attr_config.attr, + &dev_attr_iface.attr, + &dev_attr_max_vf.attr, + &dev_attr_qp_per_vf.attr, + &dev_attr_cq_per_vf.attr, + NULL +}; -static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); -static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); -static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); -static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL); -static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); -static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); - -static struct device_attribute *usnic_class_attributes[] = { - &dev_attr_board_id, - &dev_attr_config, - &dev_attr_iface, - &dev_attr_max_vf, - &dev_attr_qp_per_vf, - &dev_attr_cq_per_vf, +const struct attribute_group usnic_attr_group = { + .attrs = usnic_class_attributes, }; struct qpn_attribute { @@ -278,18 +276,6 @@ static struct kobj_type usnic_ib_qpn_type = { int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) { - int i; - int err; - for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { - err = device_create_file(&us_ibdev->ib_dev.dev, - usnic_class_attributes[i]); - if (err) { - usnic_err("Failed 
to create device file %d for %s eith err %d", - i, us_ibdev->ib_dev.name, err); - return -EINVAL; - } - } - /* create kernel object for looking at individual QPs */ kobject_get(&us_ibdev->ib_dev.dev.kobj); us_ibdev->qpn_kobj = kobject_create_and_add("qpn", @@ -304,12 +290,6 @@ int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev) { - int i; - for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { - device_remove_file(&us_ibdev->ib_dev.dev, - usnic_class_attributes[i]); - } - kobject_put(us_ibdev->qpn_kobj); } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h index 3d98e16cfeaf..b1f064cec850 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h @@ -41,4 +41,6 @@ void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev); void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp); void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp); +extern const struct attribute_group usnic_attr_group; + #endif /* !USNIC_IB_SYSFS_H_ */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 9973ac893635..0b91ff36768a 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -159,7 +159,8 @@ static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { - usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name); + usnic_err("Failed to copy udata for %s", + dev_name(&us_ibdev->ib_dev.dev)); return err; } @@ -197,7 +198,7 @@ find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, vnic = vf->vnic; if (!usnic_vnic_check_room(vnic, res_spec)) { usnic_dbg("Found used vnic %s from %s\n", - us_ibdev->ib_dev.name, + dev_name(&us_ibdev->ib_dev.dev), pci_name(usnic_vnic_get_pdev( vnic))); 
qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, @@ -230,7 +231,8 @@ find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, spin_unlock(&vf->lock); } - usnic_info("No free qp grp found on %s\n", us_ibdev->ib_dev.name); + usnic_info("No free qp grp found on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); return ERR_PTR(-ENOMEM); qp_grp_check: @@ -471,7 +473,7 @@ struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, } usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", - pd, context, ibdev->name); + pd, context, dev_name(&ibdev->dev)); return &pd->ibpd; } @@ -508,20 +510,20 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); if (err) { usnic_err("%s: cannot copy udata for create_qp\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); return ERR_PTR(-EINVAL); } err = create_qp_validate_user_data(cmd); if (err) { usnic_err("%s: Failed to validate user data\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); return ERR_PTR(-EINVAL); } if (init_attr->qp_type != IB_QPT_UD) { usnic_err("%s asked to make a non-UD QP: %d\n", - us_ibdev->ib_dev.name, init_attr->qp_type); + dev_name(&us_ibdev->ib_dev.dev), init_attr->qp_type); return ERR_PTR(-EINVAL); } diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c index e0a95538c364..82dd810bc000 100644 --- a/drivers/infiniband/hw/usnic/usnic_transport.c +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -121,7 +121,7 @@ void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num) if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { spin_lock(&roce_bitmap_lock); if (!port_num) { - usnic_err("Unreserved unvalid port num 0 for %s\n", + usnic_err("Unreserved invalid port num 0 for %s\n", usnic_transport_to_str(type)); goto out_roce_custom; } diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 9dd39daa602b..49275a548751 
100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -54,18 +54,6 @@ static struct workqueue_struct *usnic_uiom_wq; ((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \ (void *) &((struct usnic_uiom_chunk *) 0)->page_list[0])) -static void usnic_uiom_reg_account(struct work_struct *work) -{ - struct usnic_uiom_reg *umem = container_of(work, - struct usnic_uiom_reg, work); - - down_write(&umem->mm->mmap_sem); - umem->mm->locked_vm -= umem->diff; - up_write(&umem->mm->mmap_sem); - mmput(umem->mm); - kfree(umem); -} - static int usnic_uiom_dma_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags, @@ -99,8 +87,9 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) } static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, - int dmasync, struct list_head *chunk_list) + int dmasync, struct usnic_uiom_reg *uiomr) { + struct list_head *chunk_list = &uiomr->chunk_list; struct page **page_list; struct scatterlist *sg; struct usnic_uiom_chunk *chunk; @@ -114,6 +103,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int flags; dma_addr_t pa; unsigned int gup_flags; + struct mm_struct *mm; /* * If the combination of the addr and size requested for this memory @@ -136,7 +126,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT; - down_write(&current->mm->mmap_sem); + uiomr->owning_mm = mm = current->mm; + down_write(&mm->mmap_sem); locked = npages + current->mm->pinned_vm; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -196,10 +187,12 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, out: if (ret < 0) usnic_uiom_put_pages(chunk_list, 0); - else - current->mm->pinned_vm = locked; + else { + mm->pinned_vm = locked; + mmgrab(uiomr->owning_mm); + } - up_write(&current->mm->mmap_sem); + 
up_write(&mm->mmap_sem); free_page((unsigned long) page_list); return ret; } @@ -379,7 +372,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, uiomr->pd = pd; err = usnic_uiom_get_pages(addr, size, writable, dmasync, - &uiomr->chunk_list); + uiomr); if (err) { usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n", vpn_start, vpn_last, err); @@ -426,29 +419,39 @@ out_put_intervals: out_put_pages: usnic_uiom_put_pages(&uiomr->chunk_list, 0); spin_unlock(&pd->lock); + mmdrop(uiomr->owning_mm); out_free_uiomr: kfree(uiomr); return ERR_PTR(err); } -void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, - struct ib_ucontext *ucontext) +static void __usnic_uiom_release_tail(struct usnic_uiom_reg *uiomr) { - struct task_struct *task; - struct mm_struct *mm; - unsigned long diff; + mmdrop(uiomr->owning_mm); + kfree(uiomr); +} - __usnic_uiom_reg_release(uiomr->pd, uiomr, 1); +static inline size_t usnic_uiom_num_pages(struct usnic_uiom_reg *uiomr) +{ + return PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; +} - task = get_pid_task(ucontext->tgid, PIDTYPE_PID); - if (!task) - goto out; - mm = get_task_mm(task); - put_task_struct(task); - if (!mm) - goto out; +static void usnic_uiom_release_defer(struct work_struct *work) +{ + struct usnic_uiom_reg *uiomr = + container_of(work, struct usnic_uiom_reg, work); - diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; + down_write(&uiomr->owning_mm->mmap_sem); + uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr); + up_write(&uiomr->owning_mm->mmap_sem); + + __usnic_uiom_release_tail(uiomr); +} + +void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, + struct ib_ucontext *context) +{ + __usnic_uiom_reg_release(uiomr->pd, uiomr, 1); /* * We may be called with the mm's mmap_sem already held. This @@ -456,25 +459,21 @@ void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, * the last reference to our file and calls our release * method. 
If there are memory regions to destroy, we'll end * up here and not be able to take the mmap_sem. In that case - * we defer the vm_locked accounting to the system workqueue. + * we defer the vm_locked accounting to a workqueue. */ - if (ucontext->closing) { - if (!down_write_trylock(&mm->mmap_sem)) { - INIT_WORK(&uiomr->work, usnic_uiom_reg_account); - uiomr->mm = mm; - uiomr->diff = diff; - + if (context->closing) { + if (!down_write_trylock(&uiomr->owning_mm->mmap_sem)) { + INIT_WORK(&uiomr->work, usnic_uiom_release_defer); queue_work(usnic_uiom_wq, &uiomr->work); return; } - } else - down_write(&mm->mmap_sem); + } else { + down_write(&uiomr->owning_mm->mmap_sem); + } + uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr); + up_write(&uiomr->owning_mm->mmap_sem); - mm->pinned_vm -= diff; - up_write(&mm->mmap_sem); - mmput(mm); -out: - kfree(uiomr); + __usnic_uiom_release_tail(uiomr); } struct usnic_uiom_pd *usnic_uiom_alloc_pd(void) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h index 8c096acff123..b86a9731071b 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h @@ -71,8 +71,7 @@ struct usnic_uiom_reg { int writable; struct list_head chunk_list; struct work_struct work; - struct mm_struct *mm; - unsigned long diff; + struct mm_struct *owning_mm; }; struct usnic_uiom_chunk { diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index a5719899f49a..398443f43dc3 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -65,32 +65,36 @@ static struct workqueue_struct *event_wq; static int pvrdma_add_gid(const struct ib_gid_attr *attr, void **context); static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context); -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct 
device *device, + struct device_attribute *attr, char *buf) { return sprintf(buf, "VMW_PVRDMA-%s\n", DRV_VERSION); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", PVRDMA_REV_ID); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_board(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", PVRDMA_BOARD_ID); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *pvrdma_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL, +}; -static struct device_attribute *pvrdma_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static const struct attribute_group pvrdma_attr_group = { + .attrs = pvrdma_class_attributes, }; static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str) @@ -160,9 +164,7 @@ static struct net_device *pvrdma_get_netdev(struct ib_device *ibdev, static int pvrdma_register_device(struct pvrdma_dev *dev) { int ret = -1; - int i = 0; - strlcpy(dev->ib_dev.name, "vmw_pvrdma%d", IB_DEVICE_NAME_MAX); dev->ib_dev.node_guid = dev->dsr->caps.node_guid; dev->sys_image_guid = dev->dsr->caps.sys_image_guid; dev->flags = 0; @@ -266,24 +268,16 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) } dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; spin_lock_init(&dev->srq_tbl_lock); + rdma_set_device_sysfs_group(&dev->ib_dev, &pvrdma_attr_group); - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", 
NULL); if (ret) goto err_srq_free; - for (i = 0; i < ARRAY_SIZE(pvrdma_class_attributes); ++i) { - ret = device_create_file(&dev->ib_dev.dev, - pvrdma_class_attributes[i]); - if (ret) - goto err_class; - } - dev->ib_active = true; return 0; -err_class: - ib_unregister_device(&dev->ib_dev); err_srq_free: kfree(dev->srq_tbl); err_qp_free: @@ -735,7 +729,7 @@ static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, default: dev_dbg(&dev->pdev->dev, "ignore netdevice event %ld on %s\n", - event, dev->ib_dev.name); + event, dev_name(&dev->ib_dev.dev)); break; } } diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 60083c0363a5..cf22f57a9f0d 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -499,7 +499,7 @@ int pvrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, next_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, next_state, ibqp->qp_type, - attr_mask, IB_LINK_LAYER_ETHERNET)) { + attr_mask)) { ret = -EINVAL; goto out; } diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig index 98e798007f75..7df896a18d38 100644 --- a/drivers/infiniband/sw/rdmavt/Kconfig +++ b/drivers/infiniband/sw/rdmavt/Kconfig @@ -1,6 +1,6 @@ config INFINIBAND_RDMAVT tristate "RDMA verbs transport library" - depends on 64BIT && ARCH_DMA_ADDR_T_64BIT + depends on X86_64 && ARCH_DMA_ADDR_T_64BIT depends on PCI select DMA_VIRT_OPS ---help--- diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 5ce403c6cddb..1735deb1a9d4 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -118,6 +118,187 @@ const int ib_rvt_state_ops[IB_QPS_ERR + 1] = { }; EXPORT_SYMBOL(ib_rvt_state_ops); +/* platform specific: return the last level cache (llc) size, in KiB */ +static int rvt_wss_llc_size(void) +{ + /* assume that the boot 
CPU value is universal for all CPUs */ + return boot_cpu_data.x86_cache_size; +} + +/* platform specific: cacheless copy */ +static void cacheless_memcpy(void *dst, void *src, size_t n) +{ + /* + * Use the only available X64 cacheless copy. Add a __user cast + * to quiet sparse. The src agument is already in the kernel so + * there are no security issues. The extra fault recovery machinery + * is not invoked. + */ + __copy_user_nocache(dst, (void __user *)src, n, 0); +} + +void rvt_wss_exit(struct rvt_dev_info *rdi) +{ + struct rvt_wss *wss = rdi->wss; + + if (!wss) + return; + + /* coded to handle partially initialized and repeat callers */ + kfree(wss->entries); + wss->entries = NULL; + kfree(rdi->wss); + rdi->wss = NULL; +} + +/** + * rvt_wss_init - Init wss data structures + * + * Return: 0 on success + */ +int rvt_wss_init(struct rvt_dev_info *rdi) +{ + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + unsigned int wss_threshold = rdi->dparms.wss_threshold; + unsigned int wss_clean_period = rdi->dparms.wss_clean_period; + long llc_size; + long llc_bits; + long table_size; + long table_bits; + struct rvt_wss *wss; + int node = rdi->dparms.node; + + if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) { + rdi->wss = NULL; + return 0; + } + + rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node); + if (!rdi->wss) + return -ENOMEM; + wss = rdi->wss; + + /* check for a valid percent range - default to 80 if none or invalid */ + if (wss_threshold < 1 || wss_threshold > 100) + wss_threshold = 80; + + /* reject a wildly large period */ + if (wss_clean_period > 1000000) + wss_clean_period = 256; + + /* reject a zero period */ + if (wss_clean_period == 0) + wss_clean_period = 1; + + /* + * Calculate the table size - the next power of 2 larger than the + * LLC size. LLC size is in KiB. 
+ */ + llc_size = rvt_wss_llc_size() * 1024; + table_size = roundup_pow_of_two(llc_size); + + /* one bit per page in rounded up table */ + llc_bits = llc_size / PAGE_SIZE; + table_bits = table_size / PAGE_SIZE; + wss->pages_mask = table_bits - 1; + wss->num_entries = table_bits / BITS_PER_LONG; + + wss->threshold = (llc_bits * wss_threshold) / 100; + if (wss->threshold == 0) + wss->threshold = 1; + + wss->clean_period = wss_clean_period; + atomic_set(&wss->clean_counter, wss_clean_period); + + wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries), + GFP_KERNEL, node); + if (!wss->entries) { + rvt_wss_exit(rdi); + return -ENOMEM; + } + + return 0; +} + +/* + * Advance the clean counter. When the clean period has expired, + * clean an entry. + * + * This is implemented in atomics to avoid locking. Because multiple + * variables are involved, it can be racy which can lead to slightly + * inaccurate information. Since this is only a heuristic, this is + * OK. Any innaccuracies will clean themselves out as the counter + * advances. That said, it is unlikely the entry clean operation will + * race - the next possible racer will not start until the next clean + * period. + * + * The clean counter is implemented as a decrement to zero. When zero + * is reached an entry is cleaned. + */ +static void wss_advance_clean_counter(struct rvt_wss *wss) +{ + int entry; + int weight; + unsigned long bits; + + /* become the cleaner if we decrement the counter to zero */ + if (atomic_dec_and_test(&wss->clean_counter)) { + /* + * Set, not add, the clean period. This avoids an issue + * where the counter could decrement below the clean period. + * Doing a set can result in lost decrements, slowing the + * clean advance. Since this a heuristic, this possible + * slowdown is OK. + * + * An alternative is to loop, advancing the counter by a + * clean period until the result is > 0. However, this could + * lead to several threads keeping another in the clean loop. 
+ * This could be mitigated by limiting the number of times + * we stay in the loop. + */ + atomic_set(&wss->clean_counter, wss->clean_period); + + /* + * Uniquely grab the entry to clean and move to next. + * The current entry is always the lower bits of + * wss.clean_entry. The table size, wss.num_entries, + * is always a power-of-2. + */ + entry = (atomic_inc_return(&wss->clean_entry) - 1) + & (wss->num_entries - 1); + + /* clear the entry and count the bits */ + bits = xchg(&wss->entries[entry], 0); + weight = hweight64((u64)bits); + /* only adjust the contended total count if needed */ + if (weight) + atomic_sub(weight, &wss->total_count); + } +} + +/* + * Insert the given address into the working set array. + */ +static void wss_insert(struct rvt_wss *wss, void *address) +{ + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask; + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ + u32 nr = page & (BITS_PER_LONG - 1); + + if (!test_and_set_bit(nr, &wss->entries[entry])) + atomic_inc(&wss->total_count); + + wss_advance_clean_counter(wss); +} + +/* + * Is the working set larger than the threshold? 
+ */ +static inline bool wss_exceeds_threshold(struct rvt_wss *wss) +{ + return atomic_read(&wss->total_count) >= wss->threshold; +} + static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map) { @@ -1164,11 +1345,8 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int lastwqe = 0; int mig = 0; int pmtu = 0; /* for gcc warning only */ - enum rdma_link_layer link; int opa_ah; - link = rdma_port_get_link_layer(ibqp->device, qp->port_num); - spin_lock_irq(&qp->r_lock); spin_lock(&qp->s_hlock); spin_lock(&qp->s_lock); @@ -1179,7 +1357,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, link)) + attr_mask)) goto inval; if (rdi->driver_f.check_modify_qp && @@ -1718,7 +1896,7 @@ static inline int rvt_qp_is_avail( */ static int rvt_post_one_wr(struct rvt_qp *qp, const struct ib_send_wr *wr, - int *call_send) + bool *call_send) { struct rvt_swqe *wqe; u32 next; @@ -1823,15 +2001,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->wr.num_sge = j; } - /* general part of wqe valid - allow for driver checks */ - if (rdi->driver_f.check_send_wqe) { - ret = rdi->driver_f.check_send_wqe(qp, wqe); - if (ret < 0) - goto bail_inval_free; - if (ret) - *call_send = ret; - } - + /* + * Calculate and set SWQE PSN values prior to handing it off + * to the driver's check routine. This give the driver the + * opportunity to adjust PSN values based on internal checks. + */ log_pmtu = qp->log_pmtu; if (qp->ibqp.qp_type != IB_QPT_UC && qp->ibqp.qp_type != IB_QPT_RC) { @@ -1856,8 +2030,18 @@ static int rvt_post_one_wr(struct rvt_qp *qp, (wqe->length ? 
((wqe->length - 1) >> log_pmtu) : 0); - qp->s_next_psn = wqe->lpsn + 1; } + + /* general part of wqe valid - allow for driver checks */ + if (rdi->driver_f.setup_wqe) { + ret = rdi->driver_f.setup_wqe(qp, wqe, call_send); + if (ret < 0) + goto bail_inval_free_ref; + } + + if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) + qp->s_next_psn = wqe->lpsn + 1; + if (unlikely(reserved_op)) { wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; rvt_qp_wqe_reserve(qp, wqe); @@ -1871,6 +2055,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, return 0; +bail_inval_free_ref: + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) + atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); bail_inval_free: /* release mr holds */ while (j) { @@ -1897,7 +2085,7 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); unsigned long flags = 0; - int call_send; + bool call_send; unsigned nreq = 0; int err = 0; @@ -1930,7 +2118,11 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, bail: spin_unlock_irqrestore(&qp->s_hlock, flags); if (nreq) { - if (call_send) + /* + * Only call do_send if there is exactly one packet, and the + * driver said it was ok. + */ + if (nreq == 1 && call_send) rdi->driver_f.do_send(qp); else rdi->driver_f.schedule_send_no_lock(qp); @@ -2465,3 +2657,454 @@ void rvt_qp_iter(struct rvt_dev_info *rdi, rcu_read_unlock(); } EXPORT_SYMBOL(rvt_qp_iter); + +/* + * This should be called with s_lock held. 
+ */ +void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + return; + + last = qp->s_last; + old_last = last; + trace_rvt_qp_send_completion(qp, wqe, last); + if (++last >= qp->s_size) + last = 0; + trace_rvt_qp_send_completion(qp, wqe, last); + qp->s_last = last; + /* See post_send() */ + barrier(); + rvt_put_swqe(wqe); + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + + rvt_qp_swqe_complete(qp, + wqe, + rdi->wc_opcode[wqe->wr.opcode], + status); + + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} +EXPORT_SYMBOL(rvt_send_complete); + +/** + * rvt_copy_sge - copy data to SGE memory + * @qp: associated QP + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + * @release: boolean to release MR + * @copy_last: do a separate copy of the last 8 bytes + */ +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last) +{ + struct rvt_sge *sge = &ss->sge; + int i; + bool in_last = false; + bool cacheless_copy = false; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + struct rvt_wss *wss = rdi->wss; + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + + if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) { + cacheless_copy = length >= PAGE_SIZE; + } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) { + if (length >= PAGE_SIZE) { + /* + * NOTE: this *assumes*: + * o The first vaddr is the dest. + * o If multiple pages, then vaddr is sequential. 
+ */ + wss_insert(wss, sge->vaddr); + if (length >= (2 * PAGE_SIZE)) + wss_insert(wss, (sge->vaddr + PAGE_SIZE)); + + cacheless_copy = wss_exceeds_threshold(wss); + } else { + wss_advance_clean_counter(wss); + } + } + + if (copy_last) { + if (length > 8) { + length -= 8; + } else { + copy_last = false; + in_last = true; + } + } + +again: + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + if (unlikely(in_last)) { + /* enforce byte transfer ordering */ + for (i = 0; i < len; i++) + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; + } else if (cacheless_copy) { + cacheless_memcpy(sge->vaddr, data, len); + } else { + memcpy(sge->vaddr, data, len); + } + rvt_update_sge(ss, len, release); + data += len; + length -= len; + } + + if (copy_last) { + copy_last = false; + in_last = true; + length = 8; + goto again; + } +} +EXPORT_SYMBOL(rvt_copy_sge); + +/** + * ruc_loopback - handle UC and RC loopback requests + * @sqp: the sending QP + * + * This is called from rvt_do_send() to forward a WQE addressed to the same HFI + * Note that although we are single threaded due to the send engine, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +void rvt_ruc_loopback(struct rvt_qp *sqp) +{ + struct rvt_ibport *rvp = NULL; + struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device); + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + bool release; + int ret; + bool copy_last = false; + int local_ops = 0; + + rcu_read_lock(); + rvp = rdi->ports[sqp->port_num - 1]; + + /* + * Note that we check the responder QP state after + * checking the requester's state. 
+ */ + + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp, + sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= RVT_S_BUSY; + +again: + if (sqp->s_last == READ_ONCE(sqp->s_head)) + goto clr_busy; + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work request. */ + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + rvp->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. 
+ */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = true; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_REG_MR: + goto send_comp; + + case IB_WR_LOCAL_INV: + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + if (rvt_invalidate_rkey(sqp, + wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + } + goto send_comp; + + case IB_WR_SEND_WITH_INV: + if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { + wc.wc_flags = IB_WC_WITH_INVALIDATE; + wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; + } + goto send; + + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: +send: + ret = rvt_get_rwqe(qp, false); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = rvt_get_rwqe(qp, true); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* skip copy_last set and qp_access_flags recheck */ + goto do_write; + case IB_WR_RDMA_WRITE: + copy_last = rvt_is_user_qp(qp); + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; +do_write: + if (wqe->length == 0) + break; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if 
(unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = false; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->atomic_wr.remote_addr, + wqe->atomic_wr.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = wqe->atomic_wr.compare_add; + *(u64 *)sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + sdata, wqe->atomic_wr.swap); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, + len, release, copy_last); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + rvt_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + rvt_put_ss(&qp->r_sge); + + if 
(!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + rvp->n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + rvt_send_complete(sqp, wqe, send_status); + if (local_ops) { + atomic_dec(&sqp->local_ops_pending); + local_ops = 0; + } + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + rvp->n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. 
+ */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) + goto clr_busy; + rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << + IB_AETH_CREDIT_SHIFT); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + rvt_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + rvt_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~RVT_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + rcu_read_unlock(); +} +EXPORT_SYMBOL(rvt_ruc_loopback); diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 264811fdc530..6d883972e0b8 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -66,4 +66,6 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); +int rvt_wss_init(struct rvt_dev_info *rdi); +void rvt_wss_exit(struct rvt_dev_info *rdi); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h 
b/drivers/infiniband/sw/rdmavt/trace_tx.h index 0ef25fc49f25..d5df352eadb1 100644 --- a/drivers/infiniband/sw/rdmavt/trace_tx.h +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -153,6 +153,48 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + rvt_qp_send_completion, + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 idx), + TP_ARGS(qp, wqe, idx), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(struct rvt_swqe *, wqe) + __field(u64, wr_id) + __field(u32, qpn) + __field(u32, qpt) + __field(u32, length) + __field(u32, idx) + __field(u32, ssn) + __field(enum ib_wr_opcode, opcode) + __field(int, send_flags) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->wqe = wqe; + __entry->wr_id = wqe->wr.wr_id; + __entry->qpn = qp->ibqp.qp_num; + __entry->qpt = qp->ibqp.qp_type; + __entry->length = wqe->length; + __entry->idx = idx; + __entry->ssn = wqe->ssn; + __entry->opcode = wqe->wr.opcode; + __entry->send_flags = wqe->wr.send_flags; + ), + TP_printk( + "[%s] qpn 0x%x qpt %u wqe %p idx %u wr_id %llx length %u ssn %u opcode %x send_flags %x", + __get_str(dev), + __entry->qpn, + __entry->qpt, + __entry->wqe, + __entry->idx, + __entry->wr_id, + __entry->length, + __entry->ssn, + __entry->opcode, + __entry->send_flags + ) +); #endif /* __RVT_TRACE_TX_H */ #undef TRACE_INCLUDE_PATH diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 17e4abc067af..723d3daf2eba 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -774,6 +774,13 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) goto bail_no_mr; } + /* Memory Working Set Size */ + ret = rvt_wss_init(rdi); + if (ret) { + rvt_pr_err(rdi, "Error in WSS init.\n"); + goto bail_mr; + } + /* Completion queues */ spin_lock_init(&rdi->n_cqs_lock); @@ -828,10 +835,11 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ 
- ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); + ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev), + rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); - goto bail_mr; + goto bail_wss; } rvt_create_mad_agents(rdi); @@ -839,6 +847,8 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rvt_pr_info(rdi, "Registration with rdmavt done.\n"); return ret; +bail_wss: + rvt_wss_exit(rdi); bail_mr: rvt_mr_exit(rdi); @@ -862,6 +872,7 @@ void rvt_unregister_device(struct rvt_dev_info *rdi) rvt_free_mad_agents(rdi); ib_unregister_device(&rdi->ibdev); + rvt_wss_exit(rdi); rvt_mr_exit(rdi); rvt_qp_exit(rdi); } diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 10999fa69281..383e65c7bbc0 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -103,7 +103,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; rxe->attr.max_ee_init_rd_atom = RXE_MAX_EE_INIT_RD_ATOM; - rxe->attr.atomic_cap = RXE_ATOMIC_CAP; + rxe->attr.atomic_cap = IB_ATOMIC_HCA; rxe->attr.max_ee = RXE_MAX_EE; rxe->attr.max_rdd = RXE_MAX_RDD; rxe->attr.max_mw = RXE_MAX_MW; @@ -128,9 +128,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe) /* initialize port attributes */ static int rxe_init_port_param(struct rxe_port *port) { - port->attr.state = RXE_PORT_STATE; - port->attr.max_mtu = RXE_PORT_MAX_MTU; - port->attr.active_mtu = RXE_PORT_ACTIVE_MTU; + port->attr.state = IB_PORT_DOWN; + port->attr.max_mtu = IB_MTU_4096; + port->attr.active_mtu = IB_MTU_256; port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; @@ -147,8 +147,7 @@ static int rxe_init_port_param(struct rxe_port *port) port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; 
port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; port->attr.phys_state = RXE_PORT_PHYS_STATE; - port->mtu_cap = - ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU); + port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256); port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); return 0; @@ -300,7 +299,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) mtu = eth_mtu_int_to_enum(ndev_mtu); /* Make sure that new MTU in range */ - mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256; + mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256; port->attr.active_mtu = mtu; port->mtu_cap = ib_mtu_enum_to_int(mtu); diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 83311dd07019..ea089cb091ad 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -191,6 +191,7 @@ static inline void reset_retry_counters(struct rxe_qp *qp) { qp->comp.retry_cnt = qp->attr.retry_cnt; qp->comp.rnr_retry = qp->attr.rnr_retry; + qp->comp.started_retry = 0; } static inline enum comp_state check_psn(struct rxe_qp *qp, @@ -253,6 +254,17 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + /* read retries of partial data may restart from + * read response first or response only. 
+ */ + if ((pkt->psn == wqe->first_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) || + (wqe->first_psn == wqe->last_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY)) + break; + return COMPST_ERROR; } break; @@ -499,11 +511,11 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct rxe_send_wqe *wqe) { - qp->comp.opcode = -1; - - if (pkt) { - if (psn_compare(pkt->psn, qp->comp.psn) >= 0) - qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + if (pkt && wqe->state == wqe_state_pending) { + if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) { + qp->comp.psn = (wqe->last_psn + 1) & BTH_PSN_MASK; + qp->comp.opcode = -1; + } if (qp->req.wait_psn) { qp->req.wait_psn = 0; @@ -676,6 +688,20 @@ int rxe_completer(void *arg) goto exit; } + /* if we've started a retry, don't start another + * retry sequence, unless this is a timeout. + */ + if (qp->comp.started_retry && + !qp->comp.timeout_retry) { + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + + goto done; + } + if (qp->comp.retry_cnt > 0) { if (qp->comp.retry_cnt != 7) qp->comp.retry_cnt--; @@ -692,6 +718,7 @@ int rxe_completer(void *arg) rxe_counter_inc(rxe, RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; + qp->comp.started_retry = 1; rxe_run_task(&qp->req.task, 1); } @@ -701,7 +728,7 @@ int rxe_completer(void *arg) skb = NULL; } - goto exit; + goto done; } else { rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED); diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 2ee4b08b00ea..a57276f2cb84 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -30,7 +30,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - +#include <linux/vmalloc.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" @@ -97,7 +97,7 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, err = do_mmap_info(rxe, uresp ? 
&uresp->mi : NULL, context, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); if (err) { - kvfree(cq->queue->buf); + vfree(cq->queue->buf); kfree(cq->queue); return err; } diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 87d14f7ef21b..afd53f57a62b 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -144,8 +144,7 @@ void rxe_loopback(struct sk_buff *skb); int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb); struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt); -int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, u32 *crc); +int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc); enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num); const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num); struct device *rxe_dma_device(struct rxe_dev *rxe); @@ -196,7 +195,7 @@ static inline int qp_mtu(struct rxe_qp *qp) if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) return qp->attr.path_mtu; else - return RXE_PORT_MAX_MTU; + return IB_MTU_4096; } static inline int rcv_wqe_size(int max_sge) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index dff605fdf60f..9d3916b93f23 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -573,33 +573,20 @@ struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, struct rxe_dev *rxe = to_rdev(pd->ibpd.device); int index = key >> 8; - if (index >= RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) { - mem = rxe_pool_get_index(&rxe->mr_pool, index); - if (!mem) - goto err1; - } else { - goto err1; + mem = rxe_pool_get_index(&rxe->mr_pool, index); + if (!mem) + return NULL; + + if (unlikely((type == lookup_local && mem->lkey != key) || + (type == lookup_remote && mem->rkey != key) || + mem->pd != pd || + 
(access && !(access & mem->access)) || + mem->state != RXE_MEM_STATE_VALID)) { + rxe_drop_ref(mem); + mem = NULL; } - if ((type == lookup_local && mem->lkey != key) || - (type == lookup_remote && mem->rkey != key)) - goto err2; - - if (mem->pd != pd) - goto err2; - - if (access && !(access & mem->access)) - goto err2; - - if (mem->state != RXE_MEM_STATE_VALID) - goto err2; - return mem; - -err2: - rxe_drop_ref(mem); -err1: - return NULL; } int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem, diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 8094cbaa54a9..40e82e0f6c2d 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -72,7 +72,7 @@ struct rxe_dev *get_rxe_by_name(const char *name) spin_lock_bh(&dev_list_lock); list_for_each_entry(rxe, &rxe_dev_list, list) { - if (!strcmp(name, rxe->ib_dev.name)) { + if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) { found = rxe; break; } @@ -182,19 +182,11 @@ static struct dst_entry *rxe_find_route6(struct net_device *ndev, #endif -static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, +static struct dst_entry *rxe_find_route(struct net_device *ndev, struct rxe_qp *qp, struct rxe_av *av) { - const struct ib_gid_attr *attr; struct dst_entry *dst = NULL; - struct net_device *ndev; - - attr = rdma_get_gid_attr(&rxe->ib_dev, qp->attr.port_num, - av->grh.sgid_index); - if (IS_ERR(attr)) - return NULL; - ndev = attr->ndev; if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); @@ -229,7 +221,6 @@ static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, sk_dst_set(qp->sk->sk, dst); } } - rdma_put_gid_attr(attr); return dst; } @@ -377,8 +368,8 @@ static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); } -static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, struct rxe_av *av) +static int prepare4(struct rxe_pkt_info *pkt, 
struct sk_buff *skb, + struct rxe_av *av) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; @@ -387,7 +378,7 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; - dst = rxe_find_route(rxe, qp, av); + dst = rxe_find_route(skb->dev, qp, av); if (!dst) { pr_err("Host not reachable\n"); return -EHOSTUNREACH; @@ -396,8 +387,8 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, if (!memcmp(saddr, daddr, sizeof(*daddr))) pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), - htons(ROCE_V2_UDP_DPORT)); + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit, df, xnet); @@ -406,15 +397,15 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, return 0; } -static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, struct rxe_av *av) +static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb, + struct rxe_av *av) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; - dst = rxe_find_route(rxe, qp, av); + dst = rxe_find_route(skb->dev, qp, av); if (!dst) { pr_err("Host not reachable\n"); return -EHOSTUNREACH; @@ -423,8 +414,8 @@ static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, if (!memcmp(saddr, daddr, sizeof(*daddr))) pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), - htons(ROCE_V2_UDP_DPORT)); + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, av->grh.traffic_class, @@ -434,16 +425,15 @@ static int prepare6(struct rxe_dev *rxe, 
struct rxe_pkt_info *pkt, return 0; } -int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, u32 *crc) +int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc) { int err = 0; struct rxe_av *av = rxe_get_av(pkt); if (av->network_type == RDMA_NETWORK_IPV4) - err = prepare4(rxe, pkt, skb, av); + err = prepare4(pkt, skb, av); else if (av->network_type == RDMA_NETWORK_IPV6) - err = prepare6(rxe, pkt, skb, av); + err = prepare6(pkt, skb, av); *crc = rxe_icrc_hdr(pkt, skb); @@ -501,11 +491,6 @@ void rxe_loopback(struct sk_buff *skb) rxe_rcv(skb); } -static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av) -{ - return rxe->port.port_guid == av->grh.dgid.global.interface_id; -} - struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt) { @@ -625,7 +610,7 @@ void rxe_port_up(struct rxe_dev *rxe) port->attr.phys_state = IB_PHYS_STATE_LINK_UP; rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); - pr_info("set %s active\n", rxe->ib_dev.name); + dev_info(&rxe->ib_dev.dev, "set active\n"); } /* Caller must hold net_info_lock */ @@ -638,7 +623,7 @@ void rxe_port_down(struct rxe_dev *rxe) port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN; rxe_port_event(rxe, IB_EVENT_PORT_ERR); - pr_info("set %s down\n", rxe->ib_dev.name); + dev_info(&rxe->ib_dev.dev, "set down\n"); } static int rxe_notify(struct notifier_block *not_blk, diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h index 4555510d86c4..bdea899a58ac 100644 --- a/drivers/infiniband/sw/rxe/rxe_param.h +++ b/drivers/infiniband/sw/rxe/rxe_param.h @@ -90,7 +90,6 @@ enum rxe_device_param { RXE_MAX_RES_RD_ATOM = 0x3f000, RXE_MAX_QP_INIT_RD_ATOM = 128, RXE_MAX_EE_INIT_RD_ATOM = 0, - RXE_ATOMIC_CAP = 1, RXE_MAX_EE = 0, RXE_MAX_RDD = 0, RXE_MAX_MW = 0, @@ -139,9 +138,6 @@ enum rxe_device_param { /* default/initial rxe port parameters */ enum rxe_port_param { - RXE_PORT_STATE = IB_PORT_DOWN, - 
RXE_PORT_MAX_MTU = IB_MTU_4096, - RXE_PORT_ACTIVE_MTU = IB_MTU_256, RXE_PORT_GID_TBL_LEN = 1024, RXE_PORT_PORT_CAP_FLAGS = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP, RXE_PORT_MAX_MSG_SZ = 0x800000, diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index b4a8acc7bb7d..36b53fb94a49 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -207,7 +207,7 @@ int rxe_pool_init( kref_init(&pool->ref_cnt); - spin_lock_init(&pool->pool_lock); + rwlock_init(&pool->pool_lock); if (rxe_type_info[type].flags & RXE_POOL_INDEX) { err = rxe_pool_init_index(pool, @@ -222,7 +222,7 @@ int rxe_pool_init( pool->key_size = rxe_type_info[type].key_size; } - pool->state = rxe_pool_valid; + pool->state = RXE_POOL_STATE_VALID; out: return err; @@ -232,7 +232,7 @@ static void rxe_pool_release(struct kref *kref) { struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt); - pool->state = rxe_pool_invalid; + pool->state = RXE_POOL_STATE_INVALID; kfree(pool->table); } @@ -245,12 +245,12 @@ int rxe_pool_cleanup(struct rxe_pool *pool) { unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); - pool->state = rxe_pool_invalid; + write_lock_irqsave(&pool->pool_lock, flags); + pool->state = RXE_POOL_STATE_INVALID; if (atomic_read(&pool->num_elem) > 0) pr_warn("%s pool destroyed with unfree'd elem\n", pool_name(pool)); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); rxe_pool_put(pool); @@ -336,10 +336,10 @@ void rxe_add_key(void *arg, void *key) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); memcpy((u8 *)elem + pool->key_offset, key, pool->key_size); insert_key(pool, elem); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_drop_key(void *arg) @@ -348,9 +348,9 @@ void rxe_drop_key(void *arg) 
struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); rb_erase(&elem->node, &pool->tree); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_add_index(void *arg) @@ -359,10 +359,10 @@ void rxe_add_index(void *arg) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); elem->index = alloc_index(pool); insert_index(pool, elem); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_drop_index(void *arg) @@ -371,10 +371,10 @@ void rxe_drop_index(void *arg) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); clear_bit(elem->index - pool->min_index, pool->table); rb_erase(&elem->node, &pool->tree); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void *rxe_alloc(struct rxe_pool *pool) @@ -384,13 +384,13 @@ void *rxe_alloc(struct rxe_pool *pool) might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); - spin_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) { - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, flags); + if (pool->state != RXE_POOL_STATE_VALID) { + read_unlock_irqrestore(&pool->pool_lock, flags); return NULL; } kref_get(&pool->ref_cnt); - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); kref_get(&pool->rxe->ref_cnt); @@ -436,9 +436,9 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) struct rxe_pool_entry *elem = NULL; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) + if 
(pool->state != RXE_POOL_STATE_VALID) goto out; node = pool->tree.rb_node; @@ -450,15 +450,14 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) node = node->rb_left; else if (elem->index < index) node = node->rb_right; - else + else { + kref_get(&elem->ref_cnt); break; + } } - if (node) - kref_get(&elem->ref_cnt); - out: - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); return node ? elem : NULL; } @@ -469,9 +468,9 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key) int cmp; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) + if (pool->state != RXE_POOL_STATE_VALID) goto out; node = pool->tree.rb_node; @@ -494,6 +493,6 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key) kref_get(&elem->ref_cnt); out: - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); return node ? 
elem : NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 47df28e43acf..aa4ba307097b 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -74,8 +74,8 @@ struct rxe_type_info { extern struct rxe_type_info rxe_type_info[]; enum rxe_pool_state { - rxe_pool_invalid, - rxe_pool_valid, + RXE_POOL_STATE_INVALID, + RXE_POOL_STATE_VALID, }; struct rxe_pool_entry { @@ -90,7 +90,7 @@ struct rxe_pool_entry { struct rxe_pool { struct rxe_dev *rxe; - spinlock_t pool_lock; /* pool spinlock */ + rwlock_t pool_lock; /* protects pool add/del/search */ size_t elem_size; struct kref ref_cnt; void (*cleanup)(struct rxe_pool_entry *obj); diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c58452daffc7..b9710907dac2 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -34,6 +34,7 @@ #include <linux/skbuff.h> #include <linux/delay.h> #include <linux/sched.h> +#include <linux/vmalloc.h> #include "rxe.h" #include "rxe_loc.h" @@ -227,6 +228,16 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, return err; qp->sk->sk->sk_user_data = qp; + /* pick a source UDP port number for this QP based on + * the source QPN. this spreads traffic for different QPs + * across different NIC RX queues (while using a single + * flow for a given QP to maintain packet order). + * the port number must be in the Dynamic Ports range + * (0xc000 - 0xffff). 
+ */ + qp->src_port = RXE_ROCE_V2_SPORT + + (hash_32_generic(qp_num(qp), 14) & 0x3fff); + qp->sq.max_wr = init->cap.max_send_wr; qp->sq.max_sge = init->cap.max_send_sge; qp->sq.max_inline = init->cap.max_inline_data; @@ -247,7 +258,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, &qp->sq.queue->ip); if (err) { - kvfree(qp->sq.queue->buf); + vfree(qp->sq.queue->buf); kfree(qp->sq.queue); return err; } @@ -300,7 +311,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, qp->rq.queue->buf, qp->rq.queue->buf_size, &qp->rq.queue->ip); if (err) { - kvfree(qp->rq.queue->buf); + vfree(qp->rq.queue->buf); kfree(qp->rq.queue); return err; } @@ -408,8 +419,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, enum ib_qp_state new_state = (mask & IB_QP_STATE) ? attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { pr_warn("invalid mask or state for qp\n"); goto err1; } diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index d30dbac24583..5c29a1bb575a 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -122,7 +122,7 @@ static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, set_bad_pkey_cntr(port); goto err1; } - } else if (qpn != 0) { + } else { if (unlikely(!pkey_match(pkey, port->pkey_tbl[qp->attr.pkey_index] ))) { @@ -134,7 +134,7 @@ static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, } if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) && - qpn != 0 && pkt->mask) { + pkt->mask) { u32 qkey = (qpn == 1) ? 
GSI_QKEY : qp->attr.qkey; if (unlikely(deth_qkey(pkt) != qkey)) { diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 8be27238a86e..6c361d70d7cd 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -73,9 +73,6 @@ static void req_retry(struct rxe_qp *qp) int npsn; int first = 1; - wqe = queue_head(qp->sq.queue); - npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK; - qp->req.wqe_index = consumer_index(qp->sq.queue); qp->req.psn = qp->comp.psn; qp->req.opcode = -1; @@ -107,11 +104,17 @@ static void req_retry(struct rxe_qp *qp) if (first) { first = 0; - if (mask & WR_WRITE_OR_SEND_MASK) + if (mask & WR_WRITE_OR_SEND_MASK) { + npsn = (qp->comp.psn - wqe->first_psn) & + BTH_PSN_MASK; retry_first_write_send(qp, wqe, mask, npsn); + } - if (mask & WR_READ_MASK) + if (mask & WR_READ_MASK) { + npsn = (wqe->dma.length - wqe->dma.resid) / + qp->mtu; wqe->iova += npsn * qp->mtu; + } } wqe->state = wqe_state_posted; @@ -435,7 +438,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, if (pkt->mask & RXE_RETH_MASK) { reth_set_rkey(pkt, ibwr->wr.rdma.rkey); reth_set_va(pkt, wqe->iova); - reth_set_len(pkt, wqe->dma.length); + reth_set_len(pkt, wqe->dma.resid); } if (pkt->mask & RXE_IMMDT_MASK) @@ -476,7 +479,7 @@ static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, u32 *p; int err; - err = rxe_prepare(rxe, pkt, skb, &crc); + err = rxe_prepare(pkt, skb, &crc); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index aa5833318372..c962160292f4 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -637,7 +637,7 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, if (ack->mask & RXE_ATMACK_MASK) atmack_set_orig(ack, qp->resp.atomic_orig); - err = rxe_prepare(rxe, ack, skb, &crc); + err = rxe_prepare(ack, skb, &crc); if (err) { kfree_skb(skb); return NULL; @@ -682,6 
+682,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, rxe_advance_resp_resource(qp); res->type = RXE_READ_MASK; + res->replay = 0; res->read.va = qp->resp.va; res->read.va_org = qp->resp.va; @@ -752,7 +753,8 @@ static enum resp_states read_reply(struct rxe_qp *qp, state = RESPST_DONE; } else { qp->resp.res = NULL; - qp->resp.opcode = -1; + if (!res->replay) + qp->resp.opcode = -1; if (psn_compare(res->cur_psn, qp->resp.psn) >= 0) qp->resp.psn = res->cur_psn; state = RESPST_CLEANUP; @@ -814,6 +816,7 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) /* next expected psn, read handles this separately */ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; @@ -1065,7 +1068,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { enum resp_states rc; - u32 prev_psn = (qp->resp.psn - 1) & BTH_PSN_MASK; + u32 prev_psn = (qp->resp.ack_psn - 1) & BTH_PSN_MASK; if (pkt->mask & RXE_SEND_MASK || pkt->mask & RXE_WRITE_MASK) { @@ -1108,6 +1111,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, res->state = (pkt->psn == res->first_psn) ? rdatm_res_state_new : rdatm_res_state_replay; + res->replay = 1; /* Reset the resource, except length. */ res->read.va_org = iova; diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 0d6c04ba7fc3..c41a5fee81f7 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include <linux/vmalloc.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" @@ -129,13 +130,18 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, err = do_mmap_info(rxe, uresp ? 
&uresp->mi : NULL, context, q->buf, q->buf_size, &q->ip); - if (err) + if (err) { + vfree(q->buf); + kfree(q); return err; + } if (uresp) { if (copy_to_user(&uresp->srq_num, &srq->srq_num, - sizeof(uresp->srq_num))) + sizeof(uresp->srq_num))) { + rxe_queue_cleanup(q); return -EFAULT; + } } return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c index d5ed7571128f..73a19f808e1b 100644 --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c @@ -105,7 +105,7 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp) } rxe_set_port_state(ndev); - pr_info("added %s to %s\n", rxe->ib_dev.name, intf); + dev_info(&rxe->ib_dev.dev, "added %s\n", intf); err: if (ndev) dev_put(ndev); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f5b1e0ad6142..9c19f2027511 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1148,18 +1148,21 @@ static ssize_t parent_show(struct device *device, static DEVICE_ATTR_RO(parent); -static struct device_attribute *rxe_dev_attributes[] = { - &dev_attr_parent, +static struct attribute *rxe_dev_attributes[] = { + &dev_attr_parent.attr, + NULL +}; + +static const struct attribute_group rxe_attr_group = { + .attrs = rxe_dev_attributes, }; int rxe_register_device(struct rxe_dev *rxe) { int err; - int i; struct ib_device *dev = &rxe->ib_dev; struct crypto_shash *tfm; - strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX); strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->owner = THIS_MODULE; @@ -1260,26 +1263,16 @@ int rxe_register_device(struct rxe_dev *rxe) } rxe->tfm = tfm; + rdma_set_device_sysfs_group(dev, &rxe_attr_group); dev->driver_id = RDMA_DRIVER_RXE; - err = ib_register_device(dev, NULL); + err = ib_register_device(dev, "rxe%d", NULL); if (err) { pr_warn("%s failed with error %d\n", __func__, err); goto err1; } - for (i = 0; i < 
ARRAY_SIZE(rxe_dev_attributes); ++i) { - err = device_create_file(&dev->dev, rxe_dev_attributes[i]); - if (err) { - pr_warn("%s failed with error %d for attr number %d\n", - __func__, err, i); - goto err2; - } - } - return 0; -err2: - ib_unregister_device(dev); err1: crypto_free_shash(rxe->tfm); @@ -1288,12 +1281,8 @@ err1: int rxe_unregister_device(struct rxe_dev *rxe) { - int i; struct ib_device *dev = &rxe->ib_dev; - for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) - device_remove_file(&dev->dev, rxe_dev_attributes[i]); - ib_unregister_device(dev); return 0; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index af1470d29391..82e670d6eeea 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -158,6 +158,7 @@ struct rxe_comp_info { int opcode; int timeout; int timeout_retry; + int started_retry; u32 retry_cnt; u32 rnr_retry; struct rxe_task task; @@ -171,6 +172,7 @@ enum rdatm_res_state { struct resp_res { int type; + int replay; u32 first_psn; u32 last_psn; u32 cur_psn; @@ -195,6 +197,7 @@ struct rxe_resp_info { enum rxe_qp_state state; u32 msn; u32 psn; + u32 ack_psn; int opcode; int drop_msg; int goto_error; @@ -248,6 +251,7 @@ struct rxe_qp { struct socket *sk; u32 dst_cookie; + u16 src_port; struct rxe_av pri_av; struct rxe_av alt_av; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 3d5424f335cb..0428e01e8f69 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1438,11 +1438,15 @@ static void ipoib_cm_skb_reap(struct work_struct *work) spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP)) { + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + } #if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) + 
else if (skb->protocol == htons(ETH_P_IPV6)) { + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } #endif dev_kfree_skb_any(skb); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8baa75a705c5..8710214594d8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -243,7 +243,8 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) return 0; } - if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) + if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) || + new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; @@ -1880,6 +1881,8 @@ static int ipoib_parent_init(struct net_device *ndev) sizeof(union ib_gid)); SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); + priv->dev->dev_port = priv->port - 1; + /* Let's set this one too for backwards compatibility. */ priv->dev->dev_id = priv->port - 1; return 0; @@ -2385,6 +2388,35 @@ int ipoib_add_pkey_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_pkey); } +/* + * We erroneously exposed the iface's port number in the dev_id + * sysfs field long after dev_port was introduced for that purpose[1], + * and we need to stop everyone from relying on that. + * Let's overload the shower routine for the dev_id file here + * to gently bring the issue up. + * + * [1] https://www.spinics.net/lists/netdev/msg272123.html + */ +static ssize_t dev_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + + if (ndev->dev_id == ndev->dev_port) + netdev_info_once(ndev, + "\"%s\" wants to know my dev_id. Should it look at dev_port instead? 
See Documentation/ABI/testing/sysfs-class-net for more info.\n", + current->comm); + + return sprintf(buf, "%#x\n", ndev->dev_id); +} +static DEVICE_ATTR_RO(dev_id); + +int ipoib_intercept_dev_id_attr(struct net_device *dev) +{ + device_remove_file(&dev->dev, &dev_attr_dev_id); + return device_create_file(&dev->dev, &dev_attr_dev_id); +} + static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { @@ -2437,6 +2469,8 @@ static struct net_device *ipoib_add_port(const char *format, */ ndev->priv_destructor = ipoib_intf_free; + if (ipoib_intercept_dev_id_attr(ndev)) + goto sysfs_failed; if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; if (ipoib_add_pkey_attr(ndev)) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 9f36ca786df8..1e88213459f2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -277,7 +277,7 @@ void ipoib_event(struct ib_event_handler *handler, return; ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, - record->device->name, record->element.port_num); + dev_name(&record->device->dev), record->element.port_num); if (record->event == IB_EVENT_SM_CHANGE || record->event == IB_EVENT_CLIENT_REREGISTER) { diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index 2f6388596f88..96af06cfe0af 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -589,13 +589,19 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) ib_conn->post_recv_buf_count--; } -static inline void +static inline int iser_inv_desc(struct iser_fr_desc *desc, u32 rkey) { - if (likely(rkey == desc->rsc.mr->rkey)) + if (likely(rkey == desc->rsc.mr->rkey)) { desc->rsc.mr_valid = 0; - else if (likely(rkey == desc->pi_ctx->sig_mr->rkey)) + } else if (likely(desc->pi_ctx && rkey == desc->pi_ctx->sig_mr->rkey)) { 
desc->pi_ctx->sig_mr_valid = 0; + } else { + iser_err("Bogus remote invalidation for rkey %#x\n", rkey); + return -EINVAL; + } + + return 0; } static int @@ -623,12 +629,14 @@ iser_check_remote_inv(struct iser_conn *iser_conn, if (iser_task->dir[ISER_DIR_IN]) { desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h; - iser_inv_desc(desc, rkey); + if (unlikely(iser_inv_desc(desc, rkey))) + return -EINVAL; } if (iser_task->dir[ISER_DIR_OUT]) { desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h; - iser_inv_desc(desc, rkey); + if (unlikely(iser_inv_desc(desc, rkey))) + return -EINVAL; } } else { iser_err("failed to get task for itt=%d\n", hdr->itt); diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index b686a4aaffe8..946b623ba5eb 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -55,7 +55,7 @@ static void iser_event_handler(struct ib_event_handler *handler, { iser_err("async event %s (%d) on device %s port %d\n", ib_event_msg(event->event), event->event, - event->device->name, event->element.port_num); + dev_name(&event->device->dev), event->element.port_num); } /** @@ -85,7 +85,7 @@ static int iser_create_device_ib_res(struct iser_device *device) max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe); iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n", - device->comps_used, ib_dev->name, + device->comps_used, dev_name(&ib_dev->dev), ib_dev->num_comp_vectors, max_cqe); device->pd = ib_alloc_pd(ib_dev, @@ -468,7 +468,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr); iser_dbg("device %s supports max_send_wr %d\n", - device->ib_device->name, ib_dev->attrs.max_qp_wr); + dev_name(&device->ib_device->dev), + ib_dev->attrs.max_qp_wr); } } @@ -764,7 +765,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) IB_DEVICE_SIGNATURE_HANDOVER)) { iser_warn("T10-PI requested but not 
supported on %s, " "continue without T10-PI\n", - ib_conn->device->ib_device->name); + dev_name(&ib_conn->device->ib_device->dev)); ib_conn->pi_support = false; } else { ib_conn->pi_support = true; diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index f39670c5c25c..e3dd13798d79 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -262,7 +262,7 @@ isert_alloc_comps(struct isert_device *device) isert_info("Using %d CQs, %s supports %d vectors support " "pi_capable %d\n", - device->comps_used, device->ib_device->name, + device->comps_used, dev_name(&device->ib_device->dev), device->ib_device->num_comp_vectors, device->pi_capable); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c index 267da8215e08..31cd361416ac 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c @@ -351,7 +351,8 @@ static uint32_t opa_vnic_get_dlid(struct opa_vnic_adapter *adapter, if (unlikely(!dlid)) v_warn("Null dlid in MAC address\n"); } else if (def_port != OPA_VNIC_INVALID_PORT) { - dlid = info->vesw.u_ucast_dlid[def_port]; + if (def_port < OPA_VESW_MAX_NUM_DEF_PORT) + dlid = info->vesw.u_ucast_dlid[def_port]; } } diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 15711dcc6f58..d119d9afa845 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -888,7 +888,8 @@ static void opa_vnic_event(struct ib_event_handler *handler, return; c_dbg("OPA_VNIC received event %d on device %s port %d\n", - record->event, record->device->name, record->element.port_num); + record->event, dev_name(&record->device->dev), + record->element.port_num); if (record->event == IB_EVENT_PORT_ERR) idr_for_each(&port->vport_idr, vema_disable_vport, NULL); diff --git 
a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 0b34e909505f..eed0eb3bb04c 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1330,17 +1330,8 @@ static void srp_terminate_io(struct srp_rport *rport) { struct srp_target_port *target = rport->lld_data; struct srp_rdma_ch *ch; - struct Scsi_Host *shost = target->scsi_host; - struct scsi_device *sdev; int i, j; - /* - * Invoking srp_terminate_io() while srp_queuecommand() is running - * is not safe. Hence the warning statement below. - */ - shost_for_each_device(sdev, shost) - WARN_ON_ONCE(sdev->request_queue->request_fn_active); - for (i = 0; i < target->ch_count; i++) { ch = &target->ch[i]; @@ -3124,7 +3115,8 @@ static ssize_t show_local_ib_device(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); + return sprintf(buf, "%s\n", + dev_name(&target->srp_host->srp_dev->dev->dev)); } static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr, @@ -3987,7 +3979,7 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, { struct srp_host *host = container_of(dev, struct srp_host, dev); - return sprintf(buf, "%s\n", host->srp_dev->dev->name); + return sprintf(buf, "%s\n", dev_name(&host->srp_dev->dev->dev)); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); @@ -4019,7 +4011,8 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port) host->dev.class = &srp_class; host->dev.parent = device->dev->dev.parent; - dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port); + dev_set_name(&host->dev, "srp-%s-%d", dev_name(&device->dev->dev), + port); if (device_register(&host->dev)) goto free_host; @@ -4095,7 +4088,7 @@ static void srp_add_one(struct ib_device *device) srp_dev->mr_max_size = srp_dev->mr_page_size * srp_dev->max_pages_per_mr; pr_debug("%s: mr_page_shift = %d, 
device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", - device->name, mr_page_shift, attr->max_mr_size, + dev_name(&device->dev), mr_page_shift, attr->max_mr_size, attr->max_fast_reg_page_list_len, srp_dev->max_pages_per_mr, srp_dev->mr_max_size); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index f37cbad022a2..2357aa727dcf 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -148,7 +148,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, return; pr_debug("ASYNC event= %d on device= %s\n", event->event, - sdev->device->name); + dev_name(&sdev->device->dev)); switch (event->event) { case IB_EVENT_PORT_ERR: @@ -1941,7 +1941,8 @@ static void __srpt_close_all_ch(struct srpt_port *sport) if (srpt_disconnect_ch(ch) >= 0) pr_info("Closing channel %s because target %s_%d has been disabled\n", ch->sess_name, - sport->sdev->device->name, sport->port); + dev_name(&sport->sdev->device->dev), + sport->port); srpt_close_ch(ch); } } @@ -2127,7 +2128,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, if (!sport->enabled) { rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_info("rejected SRP_LOGIN_REQ because target port %s_%d has not yet been enabled\n", - sport->sdev->device->name, port_num); + dev_name(&sport->sdev->device->dev), port_num); goto reject; } @@ -2267,7 +2268,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, rej->reason = cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n", - sdev->device->name, port_num); + dev_name(&sdev->device->dev), port_num); mutex_unlock(&sport->mutex); goto reject; } @@ -2708,7 +2709,7 @@ static void srpt_queue_response(struct se_cmd *cmd) break; } - if (unlikely(WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) + if (WARN_ON_ONCE(state == 
SRPT_STATE_CMD_RSP_SENT)) return; /* For read commands, transfer the data to the initiator. */ @@ -2842,7 +2843,7 @@ static int srpt_release_sport(struct srpt_port *sport) while (wait_event_timeout(sport->ch_releaseQ, srpt_ch_list_empty(sport), 5 * HZ) <= 0) { pr_info("%s_%d: waiting for session unregistration ...\n", - sport->sdev->device->name, sport->port); + dev_name(&sport->sdev->device->dev), sport->port); rcu_read_lock(); list_for_each_entry(nexus, &sport->nexus_list, entry) { list_for_each_entry(ch, &nexus->ch_list, list) { @@ -2932,7 +2933,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) } pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size, - sdev->device->attrs.max_srq_wr, device->name); + sdev->device->attrs.max_srq_wr, dev_name(&device->dev)); sdev->ioctx_ring = (struct srpt_recv_ioctx **) srpt_alloc_ioctx_ring(sdev, sdev->srq_size, @@ -2965,8 +2966,8 @@ static int srpt_use_srq(struct srpt_device *sdev, bool use_srq) } else if (use_srq && !sdev->srq) { ret = srpt_alloc_srq(sdev); } - pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__, device->name, - sdev->use_srq, ret); + pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__, + dev_name(&device->dev), sdev->use_srq, ret); return ret; } @@ -3052,7 +3053,7 @@ static void srpt_add_one(struct ib_device *device) if (srpt_refresh_port(sport)) { pr_err("MAD registration failed for %s-%d.\n", - sdev->device->name, i); + dev_name(&sdev->device->dev), i); goto err_event; } } @@ -3063,7 +3064,7 @@ static void srpt_add_one(struct ib_device *device) out: ib_set_client_data(device, &srpt_client, sdev); - pr_debug("added %s.\n", device->name); + pr_debug("added %s.\n", dev_name(&device->dev)); return; err_event: @@ -3078,7 +3079,7 @@ free_dev: kfree(sdev); err: sdev = NULL; - pr_info("%s(%s) failed.\n", __func__, device->name); + pr_info("%s(%s) failed.\n", __func__, dev_name(&device->dev)); goto out; } @@ -3093,7 +3094,8 @@ static void srpt_remove_one(struct ib_device *device, void 
*client_data) int i; if (!sdev) { - pr_info("%s(%s): nothing to do.\n", __func__, device->name); + pr_info("%s(%s): nothing to do.\n", __func__, + dev_name(&device->dev)); return; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 31460eeb6fe0..aa5963b5d38e 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -97,14 +97,15 @@ enum { }; enum { - MLX5_ATOMIC_MODE_IB_COMP = 1 << 16, - MLX5_ATOMIC_MODE_CX = 2 << 16, - MLX5_ATOMIC_MODE_8B = 3 << 16, - MLX5_ATOMIC_MODE_16B = 4 << 16, - MLX5_ATOMIC_MODE_32B = 5 << 16, - MLX5_ATOMIC_MODE_64B = 6 << 16, - MLX5_ATOMIC_MODE_128B = 7 << 16, - MLX5_ATOMIC_MODE_256B = 8 << 16, + MLX5_ATOMIC_MODE_OFFSET = 16, + MLX5_ATOMIC_MODE_IB_COMP = 1, + MLX5_ATOMIC_MODE_CX = 2, + MLX5_ATOMIC_MODE_8B = 3, + MLX5_ATOMIC_MODE_16B = 4, + MLX5_ATOMIC_MODE_32B = 5, + MLX5_ATOMIC_MODE_64B = 6, + MLX5_ATOMIC_MODE_128B = 7, + MLX5_ATOMIC_MODE_256B = 8, }; enum { @@ -163,13 +164,11 @@ enum mlx5_dcbx_oper_mode { MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3, }; -enum mlx5_dct_atomic_mode { - MLX5_ATOMIC_MODE_DCT_CX = 2, -}; - enum { MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0, MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1, + MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP = 1 << 2, + MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD = 1 << 3, }; enum mlx5_page_fault_resume_flags { diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index df4d13f7e191..d15f8e4815e3 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -39,15 +39,6 @@ #include <linux/qed/qed_ll2_if.h> #include <linux/qed/rdma_common.h> -enum qed_roce_ll2_tx_dest { - /* Light L2 TX Destination to the Network */ - QED_ROCE_LL2_TX_DEST_NW, - - /* Light L2 TX Destination to the Loopback */ - QED_ROCE_LL2_TX_DEST_LB, - QED_ROCE_LL2_TX_DEST_MAX -}; - #define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) /* rdma interface */ @@ -581,7 +572,7 @@ struct qed_roce_ll2_packet { int n_seg; struct qed_roce_ll2_buffer payload[RDMA_MAX_SGE_PER_SQ_WQE]; int 
roce_mode; - enum qed_roce_ll2_tx_dest tx_dest; + enum qed_ll2_tx_dest tx_dest; }; enum qed_rdma_type { diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 77c7908b7d73..2734c895c1bf 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -46,7 +46,6 @@ #include <net/ip.h> #include <rdma/ib_verbs.h> #include <rdma/ib_pack.h> -#include <net/ipv6.h> #include <net/net_namespace.h> /** @@ -95,20 +94,18 @@ int rdma_translate_ip(const struct sockaddr *addr, * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. + * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from + * rdma_dev_addr. * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - void *context); + bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr); - int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index c10f4b5ea8ab..49f4f75499b3 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -583,7 +583,7 @@ struct ib_cm_sidr_req_param { struct sa_path_rec *path; const struct ib_gid_attr *sgid_attr; __be64 service_id; - int timeout_ms; + unsigned long timeout_ms; const void *private_data; u8 private_data_len; u8 max_cm_retries; diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 
b6ddf2a1b9d8..19520979b84c 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -449,28 +449,23 @@ struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); -int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct sa_path_rec *resp, +int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, + u8 port_num, struct sa_path_rec *rec, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, struct sa_path_rec *resp, void *context), - void *context, - struct ib_sa_query **query); + void *context, struct ib_sa_query **query); int ib_sa_service_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, - struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_service_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); + struct ib_device *device, u8 port_num, u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, struct ib_sa_query **sa_query); struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; @@ -573,12 +568,11 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, struct 
ib_device *device, diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a1fd63871d17..5d3755ec5afa 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -42,15 +42,14 @@ struct ib_umem_odp; struct ib_umem { struct ib_ucontext *context; + struct mm_struct *owning_mm; size_t length; unsigned long address; int page_shift; - int writable; - int hugetlb; + u32 writable : 1; + u32 hugetlb : 1; + u32 is_odp : 1; struct work_struct work; - struct mm_struct *mm; - unsigned long diff; - struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; int npages; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 381cdf5a9bd1..0b1446fe2fab 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -43,6 +43,9 @@ struct umem_odp_node { }; struct ib_umem_odp { + struct ib_umem umem; + struct ib_ucontext_per_mm *per_mm; + /* * An array of the pages included in the on-demand paging umem. * Indices of pages that are currently not mapped into the device will @@ -64,16 +67,9 @@ struct ib_umem_odp { struct mutex umem_mutex; void *private; /* for the HW driver to use. */ - /* When false, use the notifier counter in the ucontext struct. */ - bool mn_counters_active; int notifiers_seq; int notifiers_count; - /* A linked list of umems that don't have private mmu notifier - * counters yet. 
*/ - struct list_head no_private_counters; - struct ib_umem *umem; - /* Tree tracking */ struct umem_odp_node interval_tree; @@ -82,15 +78,34 @@ struct ib_umem_odp { struct work_struct work; }; +static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) +{ + return container_of(umem, struct ib_umem_odp, umem); +} + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access); -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size); +struct ib_ucontext_per_mm { + struct ib_ucontext *context; + struct mm_struct *mm; + struct pid *tgid; + bool active; + + struct rb_root_cached umem_tree; + /* Protects umem_tree */ + struct rw_semaphore umem_rwsem; -void ib_umem_odp_release(struct ib_umem *umem); + struct mmu_notifier mn; + unsigned int odp_mrs_count; + + struct list_head ucontext_list; + struct rcu_head rcu; +}; + +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, + unsigned long addr, size_t size); +void ib_umem_odp_release(struct ib_umem_odp *umem_odp); /* * The lower 2 bits of the DMA address signal the R/W permissions for @@ -105,13 +120,14 @@ void ib_umem_odp_release(struct ib_umem *umem); #define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, - u64 access_mask, unsigned long current_seq); +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, + u64 bcnt, u64 access_mask, + unsigned long current_seq); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, u64 bound); -typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, +typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end, void *cookie); /* * 
Call the callback on each ib_umem in the range. Returns the logical or of @@ -129,46 +145,37 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length); -static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, +static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) { /* * This code is strongly based on the KVM code from * mmu_notifier_retry. Should be called with - * the relevant locks taken (item->odp_data->umem_mutex + * the relevant locks taken (umem_odp->umem_mutex * and the ucontext umem_mutex semaphore locked for read). */ - /* Do not allow page faults while the new ib_umem hasn't seen a state - * with zero notifiers yet, and doesn't have its own valid set of - * private counters. */ - if (!item->odp_data->mn_counters_active) - return 1; - - if (unlikely(item->odp_data->notifiers_count)) + if (unlikely(umem_odp->notifiers_count)) return 1; - if (item->odp_data->notifiers_seq != mmu_seq) + if (umem_odp->notifiers_seq != mmu_seq) return 1; return 0; } #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -static inline int ib_umem_odp_get(struct ib_ucontext *context, - struct ib_umem *umem, - int access) +static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { return -EINVAL; } -static inline struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +static inline struct ib_umem_odp * +ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { return ERR_PTR(-EINVAL); } -static inline void ib_umem_odp_release(struct ib_umem *umem) {} +static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0ed5d913a492..9c0c2132a2d6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -69,8 +69,11 @@ 
#define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN +struct ib_umem_odp; + extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; +extern struct workqueue_struct *ib_comp_unbound_wq; union ib_gid { u8 raw[16]; @@ -1137,7 +1140,9 @@ enum ib_qp_create_flags { */ struct ib_qp_init_attr { + /* Consumer's event_handler callback must not block */ void (*event_handler)(struct ib_event *, void *); + void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; @@ -1146,7 +1151,7 @@ struct ib_qp_init_attr { struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; - enum ib_qp_create_flags create_flags; + u32 create_flags; /* * Only needed for special QP types, or when using the RW API. @@ -1278,21 +1283,27 @@ struct ib_qp_attr { }; enum ib_wr_opcode { - IB_WR_RDMA_WRITE, - IB_WR_RDMA_WRITE_WITH_IMM, - IB_WR_SEND, - IB_WR_SEND_WITH_IMM, - IB_WR_RDMA_READ, - IB_WR_ATOMIC_CMP_AND_SWP, - IB_WR_ATOMIC_FETCH_AND_ADD, - IB_WR_LSO, - IB_WR_SEND_WITH_INV, - IB_WR_RDMA_READ_WITH_INV, - IB_WR_LOCAL_INV, - IB_WR_REG_MR, - IB_WR_MASKED_ATOMIC_CMP_AND_SWP, - IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, + /* These are shared with userspace */ + IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE, + IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM, + IB_WR_SEND = IB_UVERBS_WR_SEND, + IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM, + IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ, + IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP, + IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD, + IB_WR_LSO = IB_UVERBS_WR_TSO, + IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV, + IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV, + IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV, + IB_WR_MASKED_ATOMIC_CMP_AND_SWP = + IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, + IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = + IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + + /* These are kernel only and can not be issued by userspace */ + IB_WR_REG_MR = 0x20, 
IB_WR_REG_SIG_MR, + /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ @@ -1485,26 +1496,15 @@ struct ib_ucontext { * it is set when we are closing the file descriptor and indicates * that mm_sem may be locked. */ - int closing; + bool closing; bool cleanup_retryable; - struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct rb_root_cached umem_tree; - /* - * Protects .umem_rbroot and tree, as well as odp_mrs_count and - * mmu notifiers registration. - */ - struct rw_semaphore umem_rwsem; - void (*invalidate_range)(struct ib_umem *umem, + void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); - - struct mmu_notifier mn; - atomic_t notifier_count; - /* A list of umems that don't have private mmu notifier counters yet. */ - struct list_head no_private_counters; - int odp_mrs_count; + struct mutex per_mm_list_lock; + struct list_head per_mm_list; #endif struct ib_rdmacg_object cg_obj; @@ -1570,9 +1570,10 @@ struct ib_ah { typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { - IB_POLL_DIRECT, /* caller context, no hw completions */ - IB_POLL_SOFTIRQ, /* poll from softirq context */ - IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ }; struct ib_cq { @@ -1589,6 +1590,7 @@ struct ib_cq { struct irq_poll iop; struct work_struct work; }; + struct workqueue_struct *comp_wq; /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -2263,10 +2265,11 @@ struct ib_device { struct list_head event_handler_list; spinlock_t event_handler_lock; - spinlock_t client_data_lock; + rwlock_t client_data_lock; struct list_head core_list; /* Access to the client_data_list is protected by the 
client_data_lock - * spinlock and the lists_rwsem read-write semaphore */ + * rwlock and the lists_rwsem read-write semaphore + */ struct list_head client_data_list; struct ib_cache cache; @@ -2550,7 +2553,13 @@ struct ib_device { struct module *owner; struct device dev; - struct kobject *ports_parent; + /* First group for device attributes, + * Second group for driver provided attributes (optional). + * It is NULL terminated array. + */ + const struct attribute_group *groups[3]; + + struct kobject *ports_kobj; struct list_head port_list; enum { @@ -2633,9 +2642,9 @@ void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); @@ -2645,6 +2654,28 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot); +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size); +#else +static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, + pgprot_t prot) +{ + return -EINVAL; +} +static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + return -EINVAL; +} +#endif + static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { 
return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; @@ -2728,7 +2759,6 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes - * @ll : link layer of port * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It @@ -2737,8 +2767,7 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll); + enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); @@ -4167,20 +4196,6 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) } -static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, - struct ib_qp *qp, struct ib_device *device) -{ - uobj->object = ibflow; - ibflow->uobject = uobj; - - if (qp) { - atomic_inc(&qp->usecnt); - ibflow->qp = qp; - } - - ibflow->device = device; -} - /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. @@ -4205,4 +4220,26 @@ int rdma_init_netdev(struct ib_device *device, u8 port_num, void (*setup)(struct net_device *), struct net_device *netdev); +/** + * rdma_set_device_sysfs_group - Set device attributes group to have + * driver specific sysfs entries at + * for infiniband class. + * + * @device: device pointer for which attributes to be created + * @group: Pointer to group which should be added when device + * is registered with sysfs. + * rdma_set_device_sysfs_group() allows existing drivers to expose one + * group per device to have sysfs attributes. 
+ * + * NOTE: New drivers should not make use of this API; instead new device + * parameter should be exposed via netlink command. This API and mechanism + * exist only for existing drivers. + */ +static inline void +rdma_set_device_sysfs_group(struct ib_device *dev, + const struct attribute_group *group) +{ + dev->groups[1] = group; +} + #endif /* IB_VERBS_H */ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 5d71a7f51a9f..60987a5903b7 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -152,7 +152,11 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, * @ps: RDMA port space. * @qp_type: type of queue pair associated with the id. * - * The id holds a reference on the network namespace until it is destroyed. + * Returns a new rdma_cm_id. The id holds a reference on the network + * namespace until it is destroyed. + * + * The event handler callback serializes on the id's mutex and is + * allowed to sleep. */ #define rdma_create_id(net, event_handler, context, ps, qp_type) \ __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ @@ -192,7 +196,8 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); * @timeout_ms: Time to wait for resolution to complete. */ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms); + const struct sockaddr *dst_addr, + unsigned long timeout_ms); /** * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier @@ -202,7 +207,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, * Users must have first called rdma_resolve_addr to resolve a dst_addr * into an RDMA address before calling this routine. 
*/ -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms); /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c369703fcd69..70218e6b5187 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -96,7 +96,7 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); /** * Check if there are any listeners to the netlink group * @group: the netlink group ID - * Returns 0 on success or a negative for no listeners. + * Returns true on success or false if no listeners. */ -int rdma_nl_chk_listeners(unsigned int group); +bool rdma_nl_chk_listeners(unsigned int group); #endif /* _RDMA_NETLINK_H */ diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index e79229a0cf01..3584d0816fcd 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -149,6 +149,10 @@ struct rvt_ibport { #define RVT_CQN_MAX 16 /* maximum length of cq name */ +#define RVT_SGE_COPY_MEMCPY 0 +#define RVT_SGE_COPY_CACHELESS 1 +#define RVT_SGE_COPY_ADAPTIVE 2 + /* * Things that are driver specific, module parameters in hfi1 and qib */ @@ -161,6 +165,9 @@ struct rvt_driver_params { */ unsigned int lkey_table_size; unsigned int qp_table_size; + unsigned int sge_copy_mode; + unsigned int wss_threshold; + unsigned int wss_clean_period; int qpn_start; int qpn_inc; int qpn_res_start; @@ -193,6 +200,19 @@ struct rvt_ah { u8 log_pmtu; }; +/* memory working set size */ +struct rvt_wss { + unsigned long *entries; + atomic_t total_count; + atomic_t clean_counter; + atomic_t clean_entry; + + int threshold; + int num_entries; + long pages_mask; + unsigned int clean_period; +}; + struct rvt_dev_info; struct rvt_swqe; struct rvt_driver_provided { @@ -211,11 +231,18 @@ struct rvt_driver_provided { * version requires the s_lock not to be held. The other assumes the * s_lock is held. 
*/ - void (*schedule_send)(struct rvt_qp *qp); - void (*schedule_send_no_lock)(struct rvt_qp *qp); + bool (*schedule_send)(struct rvt_qp *qp); + bool (*schedule_send_no_lock)(struct rvt_qp *qp); - /* Driver specific work request checking */ - int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + /* + * Driver specific work request setup and checking. + * This function is allowed to perform any setup, checks, or + * adjustments required to the SWQE in order to be usable by + * underlying protocols. This includes private data structure + * allocations. + */ + int (*setup_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); /* * Sometimes rdmavt needs to kick the driver's send progress. That is @@ -371,6 +398,9 @@ struct rvt_dev_info { /* post send table */ const struct rvt_operation_params *post_parms; + /* opcode translation table */ + const enum ib_wc_opcode *wc_opcode; + /* Driver specific helper functions */ struct rvt_driver_provided driver_f; @@ -411,6 +441,8 @@ struct rvt_dev_info { u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ spinlock_t n_mcast_grps_lock; + /* Memory Working Set Size */ + struct rvt_wss *wss; }; /** @@ -423,7 +455,14 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, const char *fmt, const char *name, const int unit) { - snprintf(rdi->ibdev.name, sizeof(rdi->ibdev.name), fmt, name, unit); + /* + * FIXME: rvt and its users want to touch the ibdev before + * registration and have things like the name work. We don't have the + * infrastructure in the core to support this directly today, hack it + * to work by setting the name manually here. 
+ */ + dev_set_name(&rdi->ibdev.dev, fmt, name, unit); + strlcpy(rdi->ibdev.name, dev_name(&rdi->ibdev.dev), IB_DEVICE_NAME_MAX); } /** @@ -434,7 +473,7 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, */ static inline const char *rvt_get_ibdev_name(const struct rvt_dev_info *rdi) { - return rdi->ibdev.name; + return dev_name(&rdi->ibdev.dev); } static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 927f6d5b6d0f..cbafb1878669 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -678,6 +678,13 @@ void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); void rvt_add_retry_timer(struct rvt_qp *qp); +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last); +void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status); +void rvt_ruc_loopback(struct rvt_qp *qp); + /** * struct rvt_qp_iter - the iterator for QPs * @qp - the current QP diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 9654d33edd98..2638fa7cd702 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -173,16 +173,10 @@ int rdma_restrack_put(struct rdma_restrack_entry *res); /** * rdma_restrack_set_task() - set the task for this resource * @res: resource entry - * @task: task struct + * @caller: kernel name, the current task will be used if the caller is NULL. 
*/ -static inline void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task) -{ - if (res->task) - put_task_struct(res->task); - get_task_struct(task); - res->task = task; -} +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + const char *caller); /* * Helper functions for rdma drivers when filling out diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 9e997c3c2f04..84d3d15f1f38 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -52,6 +52,7 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, UVERBS_ATTR_TYPE_ENUM_IN, + UVERBS_ATTR_TYPE_IDRS_ARRAY, }; enum uverbs_obj_access { @@ -101,7 +102,7 @@ struct uverbs_attr_spec { } enum_def; } u; - /* This weird split of the enum lets us remove some padding */ + /* This weird split lets us remove some padding */ union { struct { /* @@ -111,6 +112,17 @@ struct uverbs_attr_spec { */ const struct uverbs_attr_spec *ids; } enum_def; + + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u16 min_len; + u16 max_len; + u8 access; + } objs_arr; } u2; }; @@ -251,6 +263,11 @@ static inline __attribute_const__ u32 uapi_bkey_attr(u32 attr_key) return attr_key - 1; } +static inline __attribute_const__ u32 uapi_bkey_to_key_attr(u32 attr_bkey) +{ + return attr_bkey + 1; +} + /* * ======================================= * Verbs definitions @@ -323,6 +340,27 @@ struct uverbs_object_tree_def { #define UA_MANDATORY .mandatory = 1 #define UA_OPTIONAL .mandatory = 0 +/* + * min_len must be bigger than 0 and _max_len must be smaller than 4095. Only + * READ\WRITE accesses are supported. + */ +#define UVERBS_ATTR_IDRS_ARR(_attr_id, _idr_type, _access, _min_len, _max_len, \ + ...) 
\ + (&(const struct uverbs_attr_def){ \ + .id = (_attr_id) + \ + BUILD_BUG_ON_ZERO((_min_len) == 0 || \ + (_max_len) > \ + PAGE_SIZE / sizeof(void *) || \ + (_min_len) > (_max_len) || \ + (_access) == UVERBS_ACCESS_NEW || \ + (_access) == UVERBS_ACCESS_DESTROY), \ + .attr = { .type = UVERBS_ATTR_TYPE_IDRS_ARRAY, \ + .u2.objs_arr.obj_type = _idr_type, \ + .u2.objs_arr.access = _access, \ + .u2.objs_arr.min_len = _min_len, \ + .u2.objs_arr.max_len = _max_len, \ + __VA_ARGS__ } }) + #define UVERBS_ATTR_IDR(_attr_id, _idr_type, _access, ...) \ (&(const struct uverbs_attr_def){ \ .id = _attr_id, \ @@ -365,6 +403,15 @@ struct uverbs_object_tree_def { __VA_ARGS__ }, \ }) +/* An input value that is a member in the enum _enum_type. */ +#define UVERBS_ATTR_CONST_IN(_attr_id, _enum_type, ...) \ + UVERBS_ATTR_PTR_IN( \ + _attr_id, \ + UVERBS_ATTR_SIZE( \ + sizeof(u64) + BUILD_BUG_ON_ZERO(!sizeof(_enum_type)), \ + sizeof(u64)), \ + __VA_ARGS__) + /* * An input value that is a bitwise combination of values of _enum_type. * This permits the flag value to be passed as either a u32 or u64, it must @@ -431,10 +478,16 @@ struct uverbs_obj_attr { const struct uverbs_api_attr *attr_elm; }; +struct uverbs_objs_arr_attr { + struct ib_uobject **uobjects; + u16 len; +}; + struct uverbs_attr { union { struct uverbs_ptr_attr ptr_attr; struct uverbs_obj_attr obj_attr; + struct uverbs_objs_arr_attr objs_arr_attr; }; }; @@ -507,6 +560,31 @@ uverbs_attr_get_len(const struct uverbs_attr_bundle *attrs_bundle, u16 idx) return attr->ptr_attr.len; } +/** + * uverbs_attr_get_uobjs_arr() - Provides array's properties for attribute for + * UVERBS_ATTR_TYPE_IDRS_ARRAY. + * @arr: Returned pointer to array of pointers for uobjects or NULL if + * the attribute isn't provided. + * + * Return: The array length or 0 if no attribute was provided. 
+ */ +static inline int uverbs_attr_get_uobjs_arr( + const struct uverbs_attr_bundle *attrs_bundle, u16 attr_idx, + struct ib_uobject ***arr) +{ + const struct uverbs_attr *attr = + uverbs_attr_get(attrs_bundle, attr_idx); + + if (IS_ERR(attr)) { + *arr = NULL; + return 0; + } + + *arr = attr->objs_arr_attr.uobjects; + + return attr->objs_arr_attr.len; +} + static inline bool uverbs_attr_ptr_is_inline(const struct uverbs_attr *attr) { return attr->ptr_attr.len <= sizeof(attr->ptr_attr.data); @@ -603,6 +681,9 @@ static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle, { return _uverbs_alloc(bundle, size, GFP_KERNEL | __GFP_ZERO); } +int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val); #else static inline int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, @@ -631,6 +712,34 @@ static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle, { return ERR_PTR(-EINVAL); } +static inline int +_uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + return -EINVAL; +} #endif +#define uverbs_get_const(_to, _attrs_bundle, _idx) \ + ({ \ + s64 _val; \ + int _ret = _uverbs_get_const(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*_to)), \ + type_max(typeof(*_to)), NULL); \ + (*_to) = _val; \ + _ret; \ + }) + +#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default) \ + ({ \ + s64 _val; \ + s64 _def_val = _default; \ + int _ret = \ + _uverbs_get_const(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*_to)), \ + type_max(typeof(*_to)), &_def_val); \ + (*_to) = _val; \ + _ret; \ + }) #endif diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 3b00231cc084..3db2802fbc68 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -140,5 +140,56 @@ __uobj_alloc(const struct 
uverbs_api_object *obj, struct ib_uverbs_file *ufile, #define uobj_alloc(_type, _ufile, _ib_dev) \ __uobj_alloc(uobj_get_type(_ufile, _type), _ufile, _ib_dev) +static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action, + struct ib_uobject *uobj, + struct ib_device *ib_dev, + enum ib_flow_action_type type) +{ + atomic_set(&action->usecnt, 0); + action->device = ib_dev; + action->type = type; + action->uobject = uobj; + uobj->object = action; +} + +struct ib_uflow_resources { + size_t max; + size_t num; + size_t collection_num; + size_t counters_num; + struct ib_counters **counters; + struct ib_flow_action **collection; +}; + +struct ib_uflow_object { + struct ib_uobject uobject; + struct ib_uflow_resources *resources; +}; + +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs); +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj); +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); + +static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, + struct ib_qp *qp, struct ib_device *device, + struct ib_uflow_resources *uflow_res) +{ + struct ib_uflow_object *uflow; + + uobj->object = ibflow; + ibflow->uobject = uobj; + + if (qp) { + atomic_inc(&qp->usecnt); + ibflow->qp = qp; + } + + ibflow->device = device; + uflow = container_of(uobj, typeof(*uflow), uobject); + uflow->resources = uflow_res; +} + #endif diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 25a16760de2a..1254b51a551a 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -763,10 +763,28 @@ struct ib_uverbs_sge { __u32 lkey; }; +enum ib_uverbs_wr_opcode { + IB_UVERBS_WR_RDMA_WRITE = 0, + IB_UVERBS_WR_RDMA_WRITE_WITH_IMM = 1, + IB_UVERBS_WR_SEND = 2, + IB_UVERBS_WR_SEND_WITH_IMM = 3, + IB_UVERBS_WR_RDMA_READ = 4, + IB_UVERBS_WR_ATOMIC_CMP_AND_SWP = 5, + IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD = 6, + 
IB_UVERBS_WR_LOCAL_INV = 7, + IB_UVERBS_WR_BIND_MW = 8, + IB_UVERBS_WR_SEND_WITH_INV = 9, + IB_UVERBS_WR_TSO = 10, + IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, + IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, + IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + /* Review enum ib_wr_opcode before modifying this */ +}; + struct ib_uverbs_send_wr { __aligned_u64 wr_id; __u32 num_sge; - __u32 opcode; + __u32 opcode; /* see enum ib_uverbs_wr_opcode */ __u32 send_flags; union { __be32 imm_data; diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h index addbb9c4529e..8fa9f90e2bb1 100644 --- a/include/uapi/rdma/mlx5-abi.h +++ b/include/uapi/rdma/mlx5-abi.h @@ -45,6 +45,9 @@ enum { MLX5_QP_FLAG_BFREG_INDEX = 1 << 3, MLX5_QP_FLAG_TYPE_DCT = 1 << 4, MLX5_QP_FLAG_TYPE_DCI = 1 << 5, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC = 1 << 6, + MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC = 1 << 7, + MLX5_QP_FLAG_ALLOW_SCATTER_CQE = 1 << 8, }; enum { @@ -349,9 +352,22 @@ struct mlx5_ib_create_qp_rss { __u32 flags; }; +enum mlx5_ib_create_qp_resp_mask { + MLX5_IB_CREATE_QP_RESP_MASK_TIRN = 1UL << 0, + MLX5_IB_CREATE_QP_RESP_MASK_TISN = 1UL << 1, + MLX5_IB_CREATE_QP_RESP_MASK_RQN = 1UL << 2, + MLX5_IB_CREATE_QP_RESP_MASK_SQN = 1UL << 3, +}; + struct mlx5_ib_create_qp_resp { __u32 bfreg_index; __u32 reserved; + __u32 comp_mask; + __u32 tirn; + __u32 tisn; + __u32 rqn; + __u32 sqn; + __u32 reserved1; }; struct mlx5_ib_alloc_mw { diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 9c51801b9e64..408e220034de 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -125,6 +125,7 @@ enum mlx5_ib_flow_matcher_create_attrs { MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, }; enum mlx5_ib_flow_matcher_destroy_attrs { @@ -155,6 +156,8 @@ enum mlx5_ib_create_flow_attrs { 
MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_ATTR_CREATE_FLOW_MATCHER, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + MLX5_IB_ATTR_CREATE_FLOW_TAG, }; enum mlx5_ib_destoy_flow_attrs { @@ -166,4 +169,22 @@ enum mlx5_ib_flow_methods { MLX5_IB_METHOD_DESTROY_FLOW, }; +enum mlx5_ib_flow_action_methods { + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, +}; + +enum mlx5_ib_create_flow_action_create_modify_header_attrs { + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, +}; + +enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, +}; + #endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 8a2fb33f3ed4..4ef62c0e8452 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -39,5 +39,17 @@ enum mlx5_ib_uapi_flow_action_flags { MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA = 1 << 0, }; +enum mlx5_ib_uapi_flow_table_type { + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX = 0x0, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX = 0x1, +}; + +enum mlx5_ib_uapi_flow_action_packet_reformat_type { + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 = 0x0, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x1, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x2, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, +}; + #endif diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h index edba6351ac13..f9c41bf59efc 100644 --- a/include/uapi/rdma/rdma_netlink.h +++ 
b/include/uapi/rdma/rdma_netlink.h @@ -227,8 +227,9 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_UNSPEC, RDMA_NLDEV_CMD_GET, /* can dump */ + RDMA_NLDEV_CMD_SET, - /* 2 - 4 are free to use */ + /* 3 - 4 are free to use */ RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */ diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 24800c6c1f32..06c34d99be85 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -53,7 +53,7 @@ enum { struct ib_uverbs_attr { __u16 attr_id; /* command specific type attribute */ - __u16 len; /* only for pointers */ + __u16 len; /* only for pointers and IDRs array */ __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ union { struct { @@ -63,7 +63,10 @@ struct ib_uverbs_attr { __u16 reserved; } attr_data; union { - /* Used by PTR_IN/OUT, ENUM_IN and IDR */ + /* + * ptr to command, inline data, idr/fd or + * ptr to __u32 array of IDRs + */ __aligned_u64 data; /* Used by FD_IN and FD_OUT */ __s64 data_s64; |