diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-24 20:48:00 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-09-24 20:48:00 +0200 |
commit | 54d7e8190ecfe72ff0dab96545e782f7298cb69a (patch) | |
tree | 1ffd6c6083413871ebc06f30e649e1670726dbe8 /drivers | |
parent | Merge tag 'sched_ext-for-6.12-rc1-fixes' of git://git.kernel.org/pub/scm/linu... (diff) | |
parent | RDMA/bnxt_re: Remove the unused variable en_dev (diff) | |
download | linux-54d7e8190ecfe72ff0dab96545e782f7298cb69a.tar.xz linux-54d7e8190ecfe72ff0dab96545e782f7298cb69a.zip |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"Usual collection of small improvements and fixes, nothing especially
stands out to me here.
The new multipath PCI feature is a sign of things to come, I think we
will see more of this in the next 10 years. Broadcom and HNS continue
to update their drivers for their new HW generations.
Summary:
- Bug fixes and minor improvments in cxgb4, siw, mlx5, rxe, efa, rts,
hfi, erdma, hns, irdma
- Code cleanups/typos/etc. Tidy alloc_ordered_workqueue() calls
- Multipath PCI for mlx5
- Variable size work queue, SRQ changes, and relaxed ordering for new
bnxt HW
- New ODP fault resolution FW protocol in mlx5
- New 'rdma monitor' netlink mechanism"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (99 commits)
RDMA/bnxt_re: Remove the unused variable en_dev
RDMA/nldev: Add missing break in rdma_nl_notify_err_msg()
RDMA/irdma: fix error message in irdma_modify_qp_roce()
RDMA/cxgb4: Added NULL check for lookup_atid
RDMA/hns: Fix ah error counter in sw stat not increasing
RDMA/bnxt_re: Recover the device when FW error is detected
RDMA/bnxt_re: Group all operations under add_device and remove_device
RDMA/bnxt_re: Use the aux device for L2 ULP callbacks
RDMA/bnxt_re: Change aux driver data to en_info to hold more information
RDMA/nldev: Expose whether RDMA monitoring is supported
RDMA/nldev: Add support for RDMA monitoring
RDMA/mlx5: Use IB set_netdev and get_netdev functions
RDMA/device: Remove optimization in ib_device_get_netdev()
RDMA/mlx5: Initialize phys_port_cnt earlier in RDMA device creation
RDMA/mlx5: Obtain upper net device only when needed
RDMA/mlx5: Check RoCE LAG status before getting netdev
RDMA/mlx5: Consider the query_vuid cap for data_direct
net/mlx5: Handle memory scheme ODP capabilities
RDMA/mlx5: Add implicit MR handling to ODP memory scheme
RDMA/mlx5: Add handling for memory scheme page fault events
...
Diffstat (limited to 'drivers')
73 files changed, 2595 insertions, 875 deletions
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 6791df64a5fe..b7c078b7f7cf 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1640,8 +1640,10 @@ int ib_cache_setup_one(struct ib_device *device) rdma_for_each_port (device, p) { err = ib_cache_update(device, p, true, true, true); - if (err) + if (err) { + gid_table_cleanup_one(device); return err; + } } return 0; diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index dd7715ba9fd1..05102769a918 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -325,9 +325,6 @@ void ib_qp_usecnt_inc(struct ib_qp *qp); void ib_qp_usecnt_dec(struct ib_qp *qp); struct rdma_dev_addr; -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr); int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 0290aca18d26..e029401b5680 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -1351,6 +1351,29 @@ static void prevent_dealloc_device(struct ib_device *ib_dev) { } +static void ib_device_notify_register(struct ib_device *device) +{ + struct net_device *netdev; + u32 port; + int ret; + + ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT); + if (ret) + return; + + rdma_for_each_port(device, port) { + netdev = ib_device_get_netdev(device, port); + if (!netdev) + continue; + + ret = rdma_nl_notify_event(device, port, + RDMA_NETDEV_ATTACH_EVENT); + dev_put(netdev); + if (ret) + return; + } +} + /** * ib_register_device - Register an IB device with IB core * @device: Device to register @@ -1449,6 +1472,8 @@ int ib_register_device(struct ib_device *device, const char *name, dev_set_uevent_suppress(&device->dev, false); /* Mark for userspace that device is ready */ kobject_uevent(&device->dev.kobj, KOBJ_ADD); + + ib_device_notify_register(device); ib_device_put(device); return 0; @@ -1491,6 +1516,7 @@ static void __ib_unregister_device(struct ib_device *ib_dev) goto out; disable_device(ib_dev); + rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT); /* Expedite removing unregistered pointers from the hash table */ free_netdevs(ib_dev); @@ -2159,6 +2185,7 @@ static void add_ndev_hash(struct ib_port_data *pdata) int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, u32 port) { + enum rdma_nl_notify_event_type etype; struct net_device *old_ndev; struct ib_port_data *pdata; unsigned long flags; @@ -2190,6 +2217,14 @@ int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, spin_unlock_irqrestore(&pdata->netdev_lock, flags); add_ndev_hash(pdata); + + /* Make sure that the device is registered before we send events */ + if (xa_load(&devices, ib_dev->index) != ib_dev) + return 0; + + etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT; + rdma_nl_notify_event(ib_dev, port, etype); + return 0; } EXPORT_SYMBOL(ib_device_set_netdev); @@ -2236,6 +2271,9 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, if (!rdma_is_port_valid(ib_dev, port)) return NULL; + if (!ib_dev->port_data) + return NULL; + pdata = &ib_dev->port_data[port]; /* @@ -2252,17 +2290,9 @@ struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, spin_unlock(&pdata->netdev_lock); } - /* - * If we are starting to unregister expedite things by preventing - * propagation of an unregistering netdev. - */ - if (res && res->reg_state != NETREG_REGISTERED) { - dev_put(res); - return NULL; - } - return res; } +EXPORT_SYMBOL(ib_device_get_netdev); /** * ib_device_get_by_netdev - Find an IB device associated with a netdev diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 1a6339f3a63f..7e3a55349e10 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -1182,7 +1182,7 @@ static int __init iw_cm_init(void) if (ret) return ret; - iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", 0); + iwcm_wq = alloc_ordered_workqueue("iw_cm_wq", WQ_MEM_RECLAIM); if (!iwcm_wq) goto err_alloc; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 7439e47ff951..1fd54d5c4dd8 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2616,14 +2616,16 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) static void timeout_sends(struct work_struct *work) { + struct ib_mad_send_wr_private *mad_send_wr, *n; struct ib_mad_agent_private *mad_agent_priv; - struct ib_mad_send_wr_private *mad_send_wr; struct ib_mad_send_wc mad_send_wc; + struct list_head local_list; unsigned long flags, delay; mad_agent_priv = container_of(work, struct ib_mad_agent_private, timed_work.work); mad_send_wc.vendor_err = 0; + INIT_LIST_HEAD(&local_list); spin_lock_irqsave(&mad_agent_priv->lock, flags); while (!list_empty(&mad_agent_priv->wait_list)) { @@ -2641,13 +2643,16 @@ static void timeout_sends(struct work_struct *work) break; } - list_del(&mad_send_wr->agent_list); + list_del_init(&mad_send_wr->agent_list); if (mad_send_wr->status == IB_WC_SUCCESS && !retry_send(mad_send_wr)) continue; - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + list_add_tail(&mad_send_wr->agent_list, &local_list); + } + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + list_for_each_entry_safe(mad_send_wr, n, &local_list, agent_list) { if (mad_send_wr->status == IB_WC_SUCCESS) mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR; else @@ -2655,11 +2660,8 @@ static void timeout_sends(struct work_struct *work) mad_send_wc.send_buf = &mad_send_wr->send_buf; mad_agent_priv->agent.send_handler(&mad_agent_priv->agent, &mad_send_wc); - deref_mad_agent(mad_agent_priv); - spin_lock_irqsave(&mad_agent_priv->lock, flags); } - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } /* @@ -2937,7 +2939,6 @@ static int ib_mad_port_open(struct ib_device *device, int ret, cq_size; struct ib_mad_port_private *port_priv; unsigned long flags; - char name[sizeof "ib_mad123"]; int has_smi; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) @@ -2990,8 +2991,8 @@ static int ib_mad_port_open(struct ib_device *device, goto error7; } - snprintf(name, sizeof(name), "ib_mad%u", port_num); - port_priv->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + port_priv->wq = alloc_ordered_workqueue("ib_mad%u", WQ_MEM_RECLAIM, + port_num); if (!port_priv->wq) { ret = -ENOMEM; goto error8; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index ae2db0c70788..def14c54b648 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -311,6 +311,7 @@ int rdma_nl_net_init(struct rdma_dev_net *rnet) struct net *net = read_pnet(&rnet->net); struct netlink_kernel_cfg cfg = { .input = rdma_nl_rcv, + .flags = NL_CFG_F_NONROOT_RECV, }; struct sock *nls; diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index a6b80cdc96f7..39f89a4b8649 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -170,6 +170,7 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = { [RDMA_NLDEV_ATTR_DEV_TYPE] = { .type = NLA_U8 }, [RDMA_NLDEV_ATTR_PARENT_NAME] = { .type = NLA_NUL_STRING }, [RDMA_NLDEV_ATTR_NAME_ASSIGN_TYPE] = { .type = NLA_U8 }, + [RDMA_NLDEV_ATTR_EVENT_TYPE] = { .type = NLA_U8 }, }; static int put_driver_name_print_type(struct sk_buff *msg, const char *name, @@ -1074,8 +1075,8 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1123,8 +1124,8 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1215,8 +1216,8 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 port; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) @@ -1275,8 +1276,8 @@ static int nldev_port_get_dumpit(struct sk_buff *skb, int err; unsigned int p; - err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1331,8 +1332,8 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int ret; - ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1481,8 +1482,8 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *msg; int ret; - ret = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !fe->id || !tb[fe->id]) return -EINVAL; @@ -1569,8 +1570,8 @@ static int res_get_common_dumpit(struct sk_buff *skb, u32 index, port = 0; bool filled = false; - err = nlmsg_parse_deprecated(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + err = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); /* * Right now, we are expecting the device index to get res information, * but it is possible to extend this code to return all devices in @@ -1762,8 +1763,8 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, char type[IFNAMSIZ]; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_NAME] || !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; @@ -1806,8 +1807,8 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse_deprecated(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, extack); if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX]) return -EINVAL; @@ -1836,8 +1837,8 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, u32 index; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, - extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy, + NL_VALIDATE_LIBERAL, extack); if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; @@ -1920,8 +1921,8 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct sk_buff *msg; int err; - err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + err = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (err) return err; @@ -1951,6 +1952,12 @@ static int nldev_sys_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, nlmsg_free(msg); return err; } + + err = nla_put_u8(msg, RDMA_NLDEV_SYS_ATTR_MONITOR_MODE, 1); + if (err) { + nlmsg_free(msg); + return err; + } /* * Copy-on-fork is supported. * See commits: @@ -2420,8 +2427,8 @@ static int nldev_stat_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; - ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret) return -EINVAL; @@ -2450,8 +2457,8 @@ static int nldev_stat_get_dumpit(struct sk_buff *skb, struct nlattr *tb[RDMA_NLDEV_ATTR_MAX]; int ret; - ret = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, NULL); + ret = __nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, NULL); if (ret || !tb[RDMA_NLDEV_ATTR_STAT_RES]) return -EINVAL; @@ -2482,8 +2489,8 @@ static int nldev_stat_get_counter_status_doit(struct sk_buff *skb, u32 devid, port; int ret, i; - ret = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, - nldev_policy, extack); + ret = __nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, + nldev_policy, NL_VALIDATE_LIBERAL, extack); if (ret || !tb[RDMA_NLDEV_ATTR_DEV_INDEX] || !tb[RDMA_NLDEV_ATTR_PORT_INDEX]) return -EINVAL; @@ -2722,6 +2729,130 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = { }, }; +static int fill_mon_netdev_association(struct sk_buff *msg, + struct ib_device *device, u32 port, + const struct net *net) +{ + struct net_device *netdev = ib_device_get_netdev(device, port); + int ret = 0; + + if (netdev && !net_eq(dev_net(netdev), net)) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index); + if (ret) + goto out; + + ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev)); + if (ret) + goto out; + + ret = nla_put_u32(msg, RDMA_NLDEV_ATTR_PORT_INDEX, port); + if (ret) + goto out; + + if (netdev) { + ret = nla_put_u32(msg, + RDMA_NLDEV_ATTR_NDEV_INDEX, netdev->ifindex); + if (ret) + goto out; + + ret = nla_put_string(msg, + RDMA_NLDEV_ATTR_NDEV_NAME, netdev->name); + } + +out: + dev_put(netdev); + return ret; +} + +static void rdma_nl_notify_err_msg(struct ib_device *device, u32 port_num, + enum rdma_nl_notify_event_type type) +{ + struct net_device *netdev; + + switch (type) { + case RDMA_REGISTER_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor register device event\n"); + break; + case RDMA_UNREGISTER_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor unregister device event\n"); + break; + case RDMA_NETDEV_ATTACH_EVENT: + netdev = ib_device_get_netdev(device, port_num); + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev attach event: port %d netdev %d\n", + port_num, netdev->ifindex); + dev_put(netdev); + break; + case RDMA_NETDEV_DETACH_EVENT: + dev_warn_ratelimited(&device->dev, + "Failed to send RDMA monitor netdev detach event: port %d\n", + port_num); + break; + default: + break; + } +} + +int rdma_nl_notify_event(struct ib_device *device, u32 port_num, + enum rdma_nl_notify_event_type type) +{ + struct sk_buff *skb; + struct net *net; + int ret = 0; + void *nlh; + + net = read_pnet(&device->coredev.rdma_net); + if (!net) + return -EINVAL; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + nlh = nlmsg_put(skb, 0, 0, + RDMA_NL_GET_TYPE(RDMA_NL_NLDEV, RDMA_NLDEV_CMD_MONITOR), + 0, 0); + + switch (type) { + case RDMA_REGISTER_EVENT: + case RDMA_UNREGISTER_EVENT: + ret = fill_nldev_handle(skb, device); + if (ret) + goto err_free; + break; + case RDMA_NETDEV_ATTACH_EVENT: + case RDMA_NETDEV_DETACH_EVENT: + ret = fill_mon_netdev_association(skb, device, + port_num, net); + if (ret) + goto err_free; + break; + default: + break; + } + + ret = nla_put_u8(skb, RDMA_NLDEV_ATTR_EVENT_TYPE, type); + if (ret) + goto err_free; + + nlmsg_end(skb, nlh); + ret = rdma_nl_multicast(net, skb, RDMA_NL_GROUP_NOTIFY, GFP_KERNEL); + if (ret && ret != -ESRCH) { + skb = NULL; /* skb is freed in the netlink send-op handling */ + goto err_free; + } + return 0; + +err_free: + rdma_nl_notify_err_msg(device, port_num, type); + nlmsg_free(skb); + return ret; +} + void __init nldev_init(void) { rdma_nl_register(RDMA_NL_NLDEV, nldev_cb_table); diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 39357dc2d229..9fcd37761264 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -23,6 +23,9 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + if (umem_dmabuf->revoked) + return -EINVAL; + if (umem_dmabuf->sgt) goto wait_fence; @@ -110,10 +113,12 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) } EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages); -struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, - unsigned long offset, size_t size, - int fd, int access, - const struct dma_buf_attach_ops *ops) +static struct ib_umem_dmabuf * +ib_umem_dmabuf_get_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) { struct dma_buf *dmabuf; struct ib_umem_dmabuf *umem_dmabuf; @@ -152,7 +157,7 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, umem_dmabuf->attach = dma_buf_dynamic_attach( dmabuf, - device->dma_device, + dma_device, ops, umem_dmabuf); if (IS_ERR(umem_dmabuf->attach)) { @@ -168,6 +173,15 @@ out_release_dmabuf: dma_buf_put(dmabuf); return ret; } + +struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) +{ + return ib_umem_dmabuf_get_with_dma_device(device, device->dma_device, + offset, size, fd, access, ops); +} EXPORT_SYMBOL(ib_umem_dmabuf_get); static void @@ -184,16 +198,18 @@ static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { .move_notify = ib_umem_dmabuf_unsupported_move_notify, }; -struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, - unsigned long offset, - size_t size, int fd, - int access) +struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access) { struct ib_umem_dmabuf *umem_dmabuf; int err; - umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, - &ib_umem_dmabuf_attach_pinned_ops); + umem_dmabuf = ib_umem_dmabuf_get_with_dma_device(device, dma_device, offset, + size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); if (IS_ERR(umem_dmabuf)) return umem_dmabuf; @@ -217,17 +233,41 @@ err_release: ib_umem_release(&umem_dmabuf->umem); return ERR_PTR(err); } +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_with_dma_device); + +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + return ib_umem_dmabuf_get_pinned_with_dma_device(device, device->dma_device, + offset, size, fd, access); +} EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); -void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) { struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; dma_resv_lock(dmabuf->resv, NULL); + if (umem_dmabuf->revoked) + goto end; ib_umem_dmabuf_unmap_pages(umem_dmabuf); - if (umem_dmabuf->pinned) + if (umem_dmabuf->pinned) { dma_buf_unpin(umem_dmabuf->attach); + umem_dmabuf->pinned = 0; + } + umem_dmabuf->revoked = 1; +end: dma_resv_unlock(dmabuf->resv); +} +EXPORT_SYMBOL(ib_umem_dmabuf_revoke); + +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + ib_umem_dmabuf_revoke(umem_dmabuf); dma_buf_detach(dmabuf, umem_dmabuf->attach); dma_buf_put(dmabuf); diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 03e1db5d1e8c..7ebc7bd3caae 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -239,7 +239,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, access_flags, - &attrs->driver_udata); + attrs); if (IS_ERR(mr)) return PTR_ERR(mr); diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index 0912d2fa9634..e94518b12f86 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -91,6 +91,15 @@ struct bnxt_re_ring_attr { u8 mode; }; +/* + * Data structure and defines to handle + * recovery + */ +#define BNXT_RE_PRE_RECOVERY_REMOVE 0x1 +#define BNXT_RE_COMPLETE_REMOVE 0x2 +#define BNXT_RE_POST_RECOVERY_INIT 0x4 +#define BNXT_RE_COMPLETE_INIT 0x8 + struct bnxt_re_sqp_entries { struct bnxt_qplib_sge sge; u64 wrid; @@ -107,6 +116,11 @@ struct bnxt_re_gsi_context { struct bnxt_re_sqp_entries *sqp_tbl; }; +struct bnxt_re_en_dev_info { + struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; +}; + #define BNXT_RE_AEQ_IDX 0 #define BNXT_RE_NQ_IDX 1 #define BNXT_RE_GEN_P5_MAX_VF 64 @@ -141,6 +155,7 @@ struct bnxt_re_pacing { #define BNXT_RE_GRC_FIFO_REG_BASE 0x2000 #define MAX_CQ_HASH_BITS (16) +#define MAX_SRQ_HASH_BITS (16) struct bnxt_re_dev { struct ib_device ibdev; struct list_head list; @@ -154,6 +169,7 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_ERR_DEVICE_DETACHED 17 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; + struct auxiliary_device *adev; struct notifier_block nb; unsigned int version, major, minor; struct bnxt_qplib_chip_ctx *chip_ctx; @@ -196,6 +212,7 @@ struct bnxt_re_dev { struct work_struct dbq_fifo_check_work; struct delayed_work dbq_pacing_work; DECLARE_HASHTABLE(cq_hash, MAX_CQ_HASH_BITS); + DECLARE_HASHTABLE(srq_hash, MAX_SRQ_HASH_BITS); }; #define to_bnxt_re_dev(ptr, member) \ @@ -216,4 +233,10 @@ static inline struct device *rdev_to_dev(struct bnxt_re_dev *rdev) } extern const struct uapi_definition bnxt_re_uapi_defs[]; + +static inline void bnxt_re_set_pacing_dev_state(struct bnxt_re_dev *rdev) +{ + rdev->qplib_res.pacing_data->dev_err_state = + test_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); +} #endif diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 7c757351a016..460f33914825 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -115,6 +115,14 @@ static enum ib_access_flags __to_ib_access_flags(int qflags) return iflags; }; +static void bnxt_re_check_and_set_relaxed_ordering(struct bnxt_re_dev *rdev, + struct bnxt_qplib_mrw *qplib_mr) +{ + if (_is_relaxed_ordering_supported(rdev->dev_attr.dev_cap_flags2) && + pcie_relaxed_ordering_enabled(rdev->en_dev->pdev)) + qplib_mr->flags |= CMDQ_REGISTER_MR_FLAGS_ENABLE_RO; +} + static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list, struct bnxt_qplib_sge *sg_list, int num) { @@ -517,15 +525,19 @@ static int bnxt_re_create_fence_mr(struct bnxt_re_pd *pd) mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); - goto fail; - } + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); + if (!_is_alloc_mr_unified(rdev->dev_attr.dev_cap_flags)) { + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to alloc fence-HW-MR\n"); + goto fail; + } - /* Register MR */ - mr->ib_mr.lkey = mr->qplib_mr.lkey; + /* Register MR */ + mr->ib_mr.lkey = mr->qplib_mr.lkey; + } else { + mr->qplib_mr.flags = CMDQ_REGISTER_MR_FLAGS_ALLOC_MR; + } mr->qplib_mr.va = (u64)(unsigned long)fence->va; mr->qplib_mr.total_size = BNXT_RE_FENCE_BYTES; rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, NULL, @@ -994,43 +1006,37 @@ static int bnxt_re_setup_swqe_size(struct bnxt_re_qp *qp, align = sizeof(struct sq_send_hdr); ilsize = ALIGN(init_attr->cap.max_inline_data, align); - sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge); - if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges)) - return -EINVAL; - /* For gen p4 and gen p5 backward compatibility mode - * wqe size is fixed to 128 bytes + /* For gen p4 and gen p5 fixed wqe compatibility mode + * wqe size is fixed to 128 bytes - ie 6 SGEs */ - if (sq->wqe_size < bnxt_re_get_swqe_size(dev_attr->max_qp_sges) && - qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) - sq->wqe_size = bnxt_re_get_swqe_size(dev_attr->max_qp_sges); + if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) { + sq->wqe_size = bnxt_re_get_swqe_size(BNXT_STATIC_MAX_SGE); + sq->max_sge = BNXT_STATIC_MAX_SGE; + } else { + sq->wqe_size = bnxt_re_get_wqe_size(ilsize, sq->max_sge); + if (sq->wqe_size > bnxt_re_get_swqe_size(dev_attr->max_qp_sges)) + return -EINVAL; + } if (init_attr->cap.max_inline_data) { qplqp->max_inline_data = sq->wqe_size - sizeof(struct sq_send_hdr); init_attr->cap.max_inline_data = qplqp->max_inline_data; - if (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) - sq->max_sge = qplqp->max_inline_data / - sizeof(struct sq_sge); } return 0; } static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, - struct bnxt_re_qp *qp, struct ib_udata *udata) + struct bnxt_re_qp *qp, struct bnxt_re_ucontext *cntx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_qp *qplib_qp; - struct bnxt_re_ucontext *cntx; - struct bnxt_re_qp_req ureq; int bytes = 0, psn_sz; struct ib_umem *umem; int psn_nume; qplib_qp = &qp->qplib_qp; - cntx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, - ib_uctx); - if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) - return -EFAULT; bytes = (qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size); /* Consider mapping PSN search memory only for RC QPs. */ @@ -1038,15 +1044,20 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, psn_sz = bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) ? sizeof(struct sq_psn_search_ext) : sizeof(struct sq_psn_search); - psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? - qplib_qp->sq.max_wqe : - ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / - sizeof(struct bnxt_qplib_sge)); + if (cntx && bnxt_re_is_var_size_supported(rdev, cntx)) { + psn_nume = ureq->sq_slots; + } else { + psn_nume = (qplib_qp->wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? + qplib_qp->sq.max_wqe : ((qplib_qp->sq.max_wqe * qplib_qp->sq.wqe_size) / + sizeof(struct bnxt_qplib_sge)); + } + if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) + psn_nume = roundup_pow_of_two(psn_nume); bytes += (psn_nume * psn_sz); } bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq->qpsva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) return PTR_ERR(umem); @@ -1055,12 +1066,12 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd, qplib_qp->sq.sg_info.umem = umem; qplib_qp->sq.sg_info.pgsize = PAGE_SIZE; qplib_qp->sq.sg_info.pgshft = PAGE_SHIFT; - qplib_qp->qp_handle = ureq.qp_handle; + qplib_qp->qp_handle = ureq->qp_handle; if (!qp->qplib_qp.srq) { bytes = (qplib_qp->rq.max_wqe * qplib_qp->rq.wqe_size); bytes = PAGE_ALIGN(bytes); - umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes, + umem = ib_umem_get(&rdev->ibdev, ureq->qprva, bytes, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(umem)) goto rqfail; @@ -1156,6 +1167,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp /* Shadow QP SQ depth should be same as QP1 RQ depth */ qp->qplib_qp.sq.wqe_size = bnxt_re_get_wqe_size(0, 6); qp->qplib_qp.sq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.sq.max_sw_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.sq.max_sge = 2; /* Q full delta can be 1 since it is internal QP */ qp->qplib_qp.sq.q_full_delta = 1; @@ -1167,6 +1179,7 @@ static struct bnxt_re_qp *bnxt_re_create_shadow_qp qp->qplib_qp.rq.wqe_size = bnxt_re_get_rwqe_size(6); qp->qplib_qp.rq.max_wqe = qp1_qp->rq.max_wqe; + qp->qplib_qp.rq.max_sw_wqe = qp1_qp->rq.max_wqe; qp->qplib_qp.rq.max_sge = qp1_qp->rq.max_sge; /* Q full delta can be 1 since it is internal QP */ qp->qplib_qp.rq.q_full_delta = 1; @@ -1228,6 +1241,7 @@ static int bnxt_re_init_rq_attr(struct bnxt_re_qp *qp, */ entries = bnxt_re_init_depth(init_attr->cap.max_recv_wr + 1, uctx); rq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + rq->max_sw_wqe = rq->max_wqe; rq->q_full_delta = 0; rq->sg_info.pgsize = PAGE_SIZE; rq->sg_info.pgshft = PAGE_SHIFT; @@ -1256,14 +1270,15 @@ static void bnxt_re_adjust_gsi_rq_attr(struct bnxt_re_qp *qp) static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, struct ib_qp_init_attr *init_attr, - struct bnxt_re_ucontext *uctx) + struct bnxt_re_ucontext *uctx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_dev_attr *dev_attr; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; struct bnxt_qplib_q *sq; + int diff = 0; int entries; - int diff; int rc; rdev = qp->rdev; @@ -1272,21 +1287,28 @@ static int bnxt_re_init_sq_attr(struct bnxt_re_qp *qp, dev_attr = &rdev->dev_attr; sq->max_sge = init_attr->cap.max_send_sge; - if (sq->max_sge > dev_attr->max_qp_sges) { - sq->max_sge = dev_attr->max_qp_sges; - init_attr->cap.max_send_sge = sq->max_sge; - } + entries = init_attr->cap.max_send_wr; + if (uctx && qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) { + sq->max_wqe = ureq->sq_slots; + sq->max_sw_wqe = ureq->sq_slots; + sq->wqe_size = sizeof(struct sq_sge); + } else { + if (sq->max_sge > dev_attr->max_qp_sges) { + sq->max_sge = dev_attr->max_qp_sges; + init_attr->cap.max_send_sge = sq->max_sge; + } - rc = bnxt_re_setup_swqe_size(qp, init_attr); - if (rc) - return rc; + rc = bnxt_re_setup_swqe_size(qp, init_attr); + if (rc) + return rc; - entries = init_attr->cap.max_send_wr; - /* Allocate 128 + 1 more than what's provided */ - diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ? - 0 : BNXT_QPLIB_RESERVED_QP_WRS; - entries = bnxt_re_init_depth(entries + diff + 1, uctx); - sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); + /* Allocate 128 + 1 more than what's provided */ + diff = (qplqp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE) ? + 0 : BNXT_QPLIB_RESERVED_QP_WRS; + entries = bnxt_re_init_depth(entries + diff + 1, uctx); + sq->max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + diff + 1); + sq->max_sw_wqe = bnxt_qplib_get_depth(sq, qplqp->wqe_mode, true); + } sq->q_full_delta = diff + 1; /* * Reserving one slot for Phantom WQE. Application can @@ -1349,10 +1371,10 @@ out: static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) + struct bnxt_re_ucontext *uctx, + struct bnxt_re_qp_req *ureq) { struct bnxt_qplib_dev_attr *dev_attr; - struct bnxt_re_ucontext *uctx; struct bnxt_qplib_qp *qplqp; struct bnxt_re_dev *rdev; struct bnxt_re_cq *cq; @@ -1362,7 +1384,6 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, qplqp = &qp->qplib_qp; dev_attr = &rdev->dev_attr; - uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); /* Setup misc params */ ether_addr_copy(qplqp->smac, rdev->netdev->dev_addr); qplqp->pd = &pd->qplib_pd; @@ -1375,8 +1396,7 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, goto out; } qplqp->type = (u8)qptype; - qplqp->wqe_mode = rdev->chip_ctx->modes.wqe_mode; - + qplqp->wqe_mode = bnxt_re_is_var_size_supported(rdev, uctx); if (init_attr->qp_type == IB_QPT_RC) { qplqp->max_rd_atomic = dev_attr->max_qp_rd_atom; qplqp->max_dest_rd_atomic = dev_attr->max_qp_init_rd_atom; @@ -1411,14 +1431,14 @@ static int bnxt_re_init_qp_attr(struct bnxt_re_qp *qp, struct bnxt_re_pd *pd, bnxt_re_adjust_gsi_rq_attr(qp); /* Setup SQ */ - rc = bnxt_re_init_sq_attr(qp, init_attr, uctx); + rc = bnxt_re_init_sq_attr(qp, init_attr, uctx, ureq); if (rc) goto out; if (init_attr->qp_type == IB_QPT_GSI) bnxt_re_adjust_gsi_sq_attr(qp, init_attr, uctx); - if (udata) /* This will update DPI and qp_handle */ - rc = bnxt_re_init_user_qp(rdev, pd, qp, udata); + if (uctx) /* This will update DPI and qp_handle */ + rc = bnxt_re_init_user_qp(rdev, pd, qp, uctx, ureq); out: return rc; } @@ -1519,14 +1539,27 @@ static bool bnxt_re_test_qp_limits(struct bnxt_re_dev *rdev, int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, struct ib_udata *udata) { - struct ib_pd *ib_pd = ib_qp->pd; - struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); - struct bnxt_re_dev *rdev = pd->rdev; - struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr; - struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + struct bnxt_qplib_dev_attr *dev_attr; + struct bnxt_re_ucontext *uctx; + struct bnxt_re_qp_req ureq; + struct bnxt_re_dev *rdev; + struct bnxt_re_pd *pd; + struct bnxt_re_qp *qp; + struct ib_pd *ib_pd; u32 active_qps; int rc; + ib_pd = ib_qp->pd; + pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); + rdev = pd->rdev; + dev_attr = &rdev->dev_attr; + qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp); + + uctx = rdma_udata_to_drv_context(udata, struct bnxt_re_ucontext, ib_uctx); + if (udata) + if (ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq)))) + return -EFAULT; + rc = bnxt_re_test_qp_limits(rdev, qp_init_attr, dev_attr); if (!rc) { rc = -EINVAL; @@ -1534,7 +1567,7 @@ int bnxt_re_create_qp(struct ib_qp *ib_qp, struct ib_qp_init_attr *qp_init_attr, } qp->rdev = rdev; - rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, udata); + rc = bnxt_re_init_qp_attr(qp, pd, qp_init_attr, uctx, &ureq); if (rc) goto fail; @@ -1685,6 +1718,10 @@ int bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata) if (qplib_srq->cq) nq = qplib_srq->cq->nq; + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) { + free_page((unsigned long)srq->uctx_srq_page); + hash_del(&srq->hash_entry); + } bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq); ib_umem_release(srq->umem); atomic_dec(&rdev->stats.res.srq_count); @@ -1789,9 +1826,18 @@ int bnxt_re_create_srq(struct ib_srq *ib_srq, } if (udata) { - struct bnxt_re_srq_resp resp; + struct bnxt_re_srq_resp resp = {}; resp.srqid = srq->qplib_srq.id; + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) { + hash_add(rdev->srq_hash, &srq->hash_entry, srq->qplib_srq.id); + srq->uctx_srq_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!srq->uctx_srq_page) { + rc = -ENOMEM; + goto fail; + } + resp.comp_mask |= BNXT_RE_SRQ_TOGGLE_PAGE_SUPPORT; + } rc = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (rc) { ibdev_err(&rdev->ibdev, "SRQ copy to udata failed!"); @@ -2155,6 +2201,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, entries = bnxt_re_init_depth(qp_attr->cap.max_recv_wr, uctx); qp->qplib_qp.rq.max_wqe = min_t(u32, entries, dev_attr->max_qp_wqes + 1); + qp->qplib_qp.rq.max_sw_wqe = qp->qplib_qp.rq.max_wqe; qp->qplib_qp.rq.q_full_delta = qp->qplib_qp.rq.max_wqe - qp_attr->cap.max_recv_wr; qp->qplib_qp.rq.max_sge = qp_attr->cap.max_recv_sge; @@ -3845,9 +3892,12 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *ib_pd, int mr_access_flags) mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; + if (mr_access_flags & IB_ACCESS_RELAXED_ORDERING) + bnxt_re_check_and_set_relaxed_ordering(rdev, &mr->qplib_mr); + /* Allocate and register 0 as the address */ rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); if (rc) @@ -3945,7 +3995,7 @@ struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type, mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = BNXT_QPLIB_FR_PMR; + mr->qplib_mr.access_flags = BNXT_QPLIB_FR_PMR; mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR; rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); @@ -4062,21 +4112,28 @@ static struct ib_mr *__bnxt_re_user_reg_mr(struct ib_pd *ib_pd, u64 length, u64 mr->rdev = rdev; mr->qplib_mr.pd = &pd->qplib_pd; - mr->qplib_mr.flags = __from_ib_access_flags(mr_access_flags); + mr->qplib_mr.access_flags = __from_ib_access_flags(mr_access_flags); mr->qplib_mr.type = CMDQ_ALLOCATE_MRW_MRW_FLAGS_MR; - rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); - if (rc) { - ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); - rc = -EIO; - goto free_mr; + if (!_is_alloc_mr_unified(rdev->dev_attr.dev_cap_flags)) { + rc = bnxt_qplib_alloc_mrw(&rdev->qplib_res, &mr->qplib_mr); + if (rc) { + ibdev_err(&rdev->ibdev, "Failed to allocate MR rc = %d", rc); + rc = -EIO; + goto free_mr; + } + /* The fixed portion of the rkey is the same as the lkey */ + mr->ib_mr.rkey = mr->qplib_mr.rkey; + } else { + mr->qplib_mr.flags = CMDQ_REGISTER_MR_FLAGS_ALLOC_MR; } - /* The fixed portion of the rkey is the same as the lkey */ - mr->ib_mr.rkey = mr->qplib_mr.rkey; mr->ib_umem = umem; mr->qplib_mr.va = virt_addr; mr->qplib_mr.total_size = length; + if (mr_access_flags & IB_ACCESS_RELAXED_ORDERING) + bnxt_re_check_and_set_relaxed_ordering(rdev, &mr->qplib_mr); + umem_pgs = ib_umem_num_dma_blocks(umem, page_size); rc = bnxt_qplib_reg_mr(&rdev->qplib_res, &mr->qplib_mr, umem, umem_pgs, page_size); @@ -4122,7 +4179,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, - int mr_access_flags, struct ib_udata *udata) + int mr_access_flags, + struct uverbs_attr_bundle *attrs) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; @@ -4187,9 +4245,6 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) resp.cqe_sz = sizeof(struct cq_base); resp.max_cqd = dev_attr->max_cq_wqes; - resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; - resp.mode = rdev->chip_ctx->modes.wqe_mode; - if (rdev->chip_ctx->modes.db_push) resp.comp_mask |= BNXT_RE_UCNTX_CMASK_WC_DPI_ENABLED; @@ -4211,7 +4266,13 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata) goto cfail; if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_POW2_SUPPORT) { resp.comp_mask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; - uctx->cmask |= BNXT_RE_UCNTX_CMASK_POW2_DISABLED; + uctx->cmask |= BNXT_RE_UCNTX_CAP_POW2_DISABLED; + } + if (ureq.comp_mask & BNXT_RE_COMP_MASK_REQ_UCNTX_VAR_WQE_SUPPORT) { + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_HAVE_MODE; + resp.mode = rdev->chip_ctx->modes.wqe_mode; + if (resp.mode == BNXT_QPLIB_WQE_MODE_VARIABLE) + uctx->cmask |= BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; } } @@ -4265,6 +4326,19 @@ static struct bnxt_re_cq *bnxt_re_search_for_cq(struct bnxt_re_dev *rdev, u32 cq return cq; } +static struct bnxt_re_srq *bnxt_re_search_for_srq(struct bnxt_re_dev *rdev, u32 srq_id) +{ + struct bnxt_re_srq *srq = NULL, *tmp_srq; + + hash_for_each_possible(rdev->srq_hash, tmp_srq, hash_entry, srq_id) { + if (tmp_srq->qplib_srq.id == srq_id) { + srq = tmp_srq; + break; + } + } + return srq; +} + /* Helper function to mmap the virtual memory from user app */ int bnxt_re_mmap(struct ib_ucontext *ib_uctx, struct vm_area_struct *vma) { @@ -4493,12 +4567,13 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund struct bnxt_re_ucontext *uctx; struct ib_ucontext *ib_uctx; struct bnxt_re_dev *rdev; + struct bnxt_re_srq *srq; + u32 length = PAGE_SIZE; struct bnxt_re_cq *cq; u64 mem_offset; + u32 offset = 0; u64 addr = 0; - u32 length; - u32 offset; - u32 cq_id; + u32 res_id; int err; ib_uctx = ib_uverbs_get_ucontext(attrs); @@ -4511,23 +4586,24 @@ static int UVERBS_HANDLER(BNXT_RE_METHOD_GET_TOGGLE_MEM)(struct uverbs_attr_bund uctx = container_of(ib_uctx, struct bnxt_re_ucontext, ib_uctx); rdev = uctx->rdev; + err = uverbs_copy_from(&res_id, attrs, BNXT_RE_TOGGLE_MEM_RES_ID); + if (err) + return err; switch (res_type) { case BNXT_RE_CQ_TOGGLE_MEM: - err = uverbs_copy_from(&cq_id, attrs, BNXT_RE_TOGGLE_MEM_RES_ID); - if (err) - return err; - - cq = bnxt_re_search_for_cq(rdev, cq_id); + cq = bnxt_re_search_for_cq(rdev, res_id); if (!cq) return -EINVAL; - length = PAGE_SIZE; addr = (u64)cq->uctx_cq_page; - mmap_flag = BNXT_RE_MMAP_TOGGLE_PAGE; - offset = 0; break; case BNXT_RE_SRQ_TOGGLE_MEM: + srq = bnxt_re_search_for_srq(rdev, res_id); + if (!srq) + return -EINVAL; + + addr = (u64)srq->uctx_srq_page; break; default: diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index e98cb1717338..b789e47ec97a 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -77,6 +77,8 @@ struct bnxt_re_srq { struct bnxt_qplib_srq qplib_srq; struct ib_umem *umem; spinlock_t lock; /* protect srq */ + void *uctx_srq_page; + struct hlist_node hash_entry; }; struct bnxt_re_qp { @@ -171,12 +173,26 @@ static inline u16 bnxt_re_get_rwqe_size(int nsge) return sizeof(struct rq_wqe_hdr) + (nsge * sizeof(struct sq_sge)); } +enum { + BNXT_RE_UCNTX_CAP_POW2_DISABLED = 0x1ULL, + BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED = 0x2ULL, +}; + static inline u32 bnxt_re_init_depth(u32 ent, struct bnxt_re_ucontext *uctx) { - return uctx ? (uctx->cmask & BNXT_RE_UCNTX_CMASK_POW2_DISABLED) ? + return uctx ? (uctx->cmask & BNXT_RE_UCNTX_CAP_POW2_DISABLED) ? ent : roundup_pow_of_two(ent) : ent; } +static inline bool bnxt_re_is_var_size_supported(struct bnxt_re_dev *rdev, + struct bnxt_re_ucontext *uctx) +{ + if (uctx) + return uctx->cmask & BNXT_RE_UCNTX_CAP_VAR_WQE_ENABLED; + else + return rdev->chip_ctx->modes.wqe_mode; +} + int bnxt_re_query_device(struct ib_device *ibdev, struct ib_device_attr *ib_attr, struct ib_udata *udata); @@ -242,7 +258,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, int mr_access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 9714b9ab7524..777068de4bbc 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -83,11 +83,12 @@ static void bnxt_re_dev_stop(struct bnxt_re_dev *rdev); static int bnxt_re_netdev_event(struct notifier_block *notifier, unsigned long event, void *ptr); static struct bnxt_re_dev *bnxt_re_from_netdev(struct net_device *netdev); -static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev); +static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type); static int bnxt_re_hwrm_qcaps(struct bnxt_re_dev *rdev); static int bnxt_re_hwrm_qcfg(struct bnxt_re_dev *rdev, u32 *db_len, u32 *offset); +static void bnxt_re_setup_cc(struct bnxt_re_dev *rdev, bool enable); static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; @@ -129,18 +130,20 @@ static void bnxt_re_set_db_offset(struct bnxt_re_dev *rdev) } } -static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev, u8 mode) +static void bnxt_re_set_drv_mode(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *cctx; cctx = rdev->chip_ctx; - cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p5_p7(rdev->chip_ctx) ? - mode : BNXT_QPLIB_WQE_MODE_STATIC; + cctx->modes.wqe_mode = bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx) ? + BNXT_QPLIB_WQE_MODE_VARIABLE : BNXT_QPLIB_WQE_MODE_STATIC; if (bnxt_re_hwrm_qcaps(rdev)) dev_err(rdev_to_dev(rdev), "Failed to query hwrm qcaps\n"); - if (bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx)) + if (bnxt_qplib_is_chip_gen_p7(rdev->chip_ctx)) { cctx->modes.toggle_bits |= BNXT_QPLIB_CQ_TOGGLE_BIT; + cctx->modes.toggle_bits |= BNXT_QPLIB_SRQ_TOGGLE_BIT; + } } static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) @@ -158,7 +161,7 @@ static void bnxt_re_destroy_chip_ctx(struct bnxt_re_dev *rdev) kfree(chip_ctx); } -static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) +static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev) { struct bnxt_qplib_chip_ctx *chip_ctx; struct bnxt_en_dev *en_dev; @@ -166,6 +169,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) en_dev = rdev->en_dev; + rdev->qplib_res.pdev = en_dev->pdev; chip_ctx = kzalloc(sizeof(*chip_ctx), GFP_KERNEL); if (!chip_ctx) return -ENOMEM; @@ -180,7 +184,7 @@ static int bnxt_re_setup_chip_ctx(struct bnxt_re_dev *rdev, u8 wqe_mode) rdev->qplib_res.dattr = &rdev->dev_attr; rdev->qplib_res.is_vf = BNXT_EN_VF(en_dev); - bnxt_re_set_drv_mode(rdev, wqe_mode); + bnxt_re_set_drv_mode(rdev); bnxt_re_set_db_offset(rdev); rc = bnxt_qplib_map_db_bar(&rdev->qplib_res); @@ -290,21 +294,31 @@ static void bnxt_re_vf_res_config(struct bnxt_re_dev *rdev) static void bnxt_re_shutdown(struct auxiliary_device *adev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; - if (!rdev) + if (!en_info) return; + + rdev = en_info->rdev; ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); } static void bnxt_re_stop_irq(void *handle) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_qplib_rcfw *rcfw; + struct bnxt_re_dev *rdev; struct bnxt_qplib_nq *nq; int indx; + if (!en_info) + return; + + rdev = en_info->rdev; + rcfw = &rdev->rcfw; + for (indx = BNXT_RE_NQ_IDX; indx < rdev->num_msix; indx++) { nq = &rdev->nq[indx - 1]; bnxt_qplib_nq_stop_irq(nq, false); @@ -315,12 +329,19 @@ static void bnxt_re_stop_irq(void *handle) static void bnxt_re_start_irq(void *handle, struct bnxt_msix_entry *ent) { - struct bnxt_re_dev *rdev = (struct bnxt_re_dev *)handle; - struct bnxt_msix_entry *msix_ent = rdev->en_dev->msix_entries; - struct bnxt_qplib_rcfw *rcfw = &rdev->rcfw; + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(handle); + struct bnxt_msix_entry *msix_ent; + struct bnxt_qplib_rcfw *rcfw; + struct bnxt_re_dev *rdev; struct bnxt_qplib_nq *nq; int indx, rc; + if (!en_info) + return; + + rdev = en_info->rdev; + msix_ent = rdev->en_dev->msix_entries; + rcfw = &rdev->rcfw; if (!ent) { /* Not setting the f/w timeout bit in rcfw. * During the driver unload the first command @@ -365,14 +386,9 @@ static struct bnxt_ulp_ops bnxt_re_ulp_ops = { static int bnxt_re_register_netdev(struct bnxt_re_dev *rdev) { struct bnxt_en_dev *en_dev; - int rc; en_dev = rdev->en_dev; - - rc = bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev); - if (!rc) - rdev->qplib_res.pdev = rdev->en_dev->pdev; - return rc; + return bnxt_register_dev(en_dev, &bnxt_re_ulp_ops, rdev->adev); } static void bnxt_re_init_hwrm_hdr(struct input *hdr, u16 opcd) @@ -1573,7 +1589,7 @@ static int bnxt_re_ib_init(struct bnxt_re_dev *rdev) return rc; } -static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev) +static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev, u8 op_type) { u8 type; int rc; @@ -1606,8 +1622,10 @@ static void bnxt_re_dev_uninit(struct bnxt_re_dev *rdev) bnxt_re_deinitialize_dbr_pacing(rdev); bnxt_re_destroy_chip_ctx(rdev); - if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) - bnxt_unregister_dev(rdev->en_dev); + if (op_type == BNXT_RE_COMPLETE_REMOVE) { + if (test_and_clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags)) + bnxt_unregister_dev(rdev->en_dev); + } } /* worker thread for polling periodic events. Now used for QoS programming*/ @@ -1620,7 +1638,7 @@ static void bnxt_re_worker(struct work_struct *work) schedule_delayed_work(&rdev->worker, msecs_to_jiffies(30000)); } -static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) +static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 op_type) { struct bnxt_re_ring_attr rattr = {}; struct bnxt_qplib_creq_ctx *creq; @@ -1629,16 +1647,18 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) u8 type; int rc; - /* Registered a new RoCE device instance to netdev */ - rc = bnxt_re_register_netdev(rdev); - if (rc) { - ibdev_err(&rdev->ibdev, - "Failed to register with netedev: %#x\n", rc); - return -EINVAL; + if (op_type == BNXT_RE_COMPLETE_INIT) { + /* Registered a new RoCE device instance to netdev */ + rc = bnxt_re_register_netdev(rdev); + if (rc) { + ibdev_err(&rdev->ibdev, + "Failed to register with netedev: %#x\n", rc); + return -EINVAL; + } } set_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); - rc = bnxt_re_setup_chip_ctx(rdev, wqe_mode); + rc = bnxt_re_setup_chip_ctx(rdev); if (rc) { bnxt_unregister_dev(rdev->en_dev); clear_bit(BNXT_RE_FLAG_NETDEV_REGISTERED, &rdev->flags); @@ -1771,6 +1791,8 @@ static int bnxt_re_dev_init(struct bnxt_re_dev *rdev, u8 wqe_mode) bnxt_re_vf_res_config(rdev); } hash_init(rdev->cq_hash); + if (rdev->chip_ctx->modes.toggle_bits & BNXT_QPLIB_SRQ_TOGGLE_BIT) + hash_init(rdev->srq_hash); return 0; free_sctx: @@ -1785,21 +1807,38 @@ free_ring: free_rcfw: bnxt_qplib_free_rcfw_channel(&rdev->rcfw); fail: - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); return rc; } -static int bnxt_re_add_device(struct auxiliary_device *adev, u8 wqe_mode) +static void bnxt_re_update_en_info_rdev(struct bnxt_re_dev *rdev, + struct bnxt_re_en_dev_info *en_info, + struct auxiliary_device *adev) +{ + /* Before updating the rdev pointer in bnxt_re_en_dev_info structure, + * take the rtnl lock to avoid accessing invalid rdev pointer from + * L2 ULP callbacks. This is applicable in all the places where rdev + * pointer is updated in bnxt_re_en_dev_info. + */ + rtnl_lock(); + en_info->rdev = rdev; + rdev->adev = adev; + rtnl_unlock(); +} + +static int bnxt_re_add_device(struct auxiliary_device *adev, u8 op_type) { struct bnxt_aux_priv *aux_priv = container_of(adev, struct bnxt_aux_priv, aux_dev); + struct bnxt_re_en_dev_info *en_info; struct bnxt_en_dev *en_dev; struct bnxt_re_dev *rdev; int rc; - /* en_dev should never be NULL as long as adev and aux_dev are valid. */ - en_dev = aux_priv->edev; + en_info = auxiliary_get_drvdata(adev); + en_dev = en_info->en_dev; + rdev = bnxt_re_dev_add(aux_priv, en_dev); if (!rdev || !rdev_to_dev(rdev)) { @@ -1807,7 +1846,9 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 wqe_mode) goto exit; } - rc = bnxt_re_dev_init(rdev, wqe_mode); + bnxt_re_update_en_info_rdev(rdev, en_info, adev); + + rc = bnxt_re_dev_init(rdev, op_type); if (rc) goto re_dev_dealloc; @@ -1817,12 +1858,22 @@ static int bnxt_re_add_device(struct auxiliary_device *adev, u8 wqe_mode) aux_priv->aux_dev.name); goto re_dev_uninit; } - auxiliary_set_drvdata(adev, rdev); + + rdev->nb.notifier_call = bnxt_re_netdev_event; + rc = register_netdevice_notifier(&rdev->nb); + if (rc) { + rdev->nb.notifier_call = NULL; + pr_err("%s: Cannot register to netdevice_notifier", + ROCE_DRV_MODULE_NAME); + return rc; + } + bnxt_re_setup_cc(rdev, true); return 0; re_dev_uninit: - bnxt_re_dev_uninit(rdev); + bnxt_re_update_en_info_rdev(NULL, en_info, adev); + bnxt_re_dev_uninit(rdev, BNXT_RE_COMPLETE_REMOVE); re_dev_dealloc: ib_dealloc_device(&rdev->ibdev); exit: @@ -1905,14 +1956,9 @@ exit: #define BNXT_ADEV_NAME "bnxt_en" -static void bnxt_re_remove(struct auxiliary_device *adev) +static void bnxt_re_remove_device(struct bnxt_re_dev *rdev, u8 op_type, + struct auxiliary_device *aux_dev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); - - if (!rdev) - return; - - mutex_lock(&bnxt_re_mutex); if (rdev->nb.notifier_call) { unregister_netdevice_notifier(&rdev->nb); rdev->nb.notifier_call = NULL; @@ -1920,41 +1966,56 @@ static void bnxt_re_remove(struct auxiliary_device *adev) /* If notifier is null, we should have already done a * clean up before coming here. */ - goto skip_remove; + return; } bnxt_re_setup_cc(rdev, false); ib_unregister_device(&rdev->ibdev); - bnxt_re_dev_uninit(rdev); + bnxt_re_dev_uninit(rdev, op_type); ib_dealloc_device(&rdev->ibdev); -skip_remove: +} + +static void bnxt_re_remove(struct auxiliary_device *adev) +{ + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; + + mutex_lock(&bnxt_re_mutex); + if (!en_info) { + mutex_unlock(&bnxt_re_mutex); + return; + } + rdev = en_info->rdev; + + if (rdev) + bnxt_re_remove_device(rdev, BNXT_RE_COMPLETE_REMOVE, adev); + kfree(en_info); mutex_unlock(&bnxt_re_mutex); } static int bnxt_re_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) { - struct bnxt_re_dev *rdev; + struct bnxt_aux_priv *aux_priv = + container_of(adev, struct bnxt_aux_priv, aux_dev); + struct bnxt_re_en_dev_info *en_info; + struct bnxt_en_dev *en_dev; int rc; + en_dev = aux_priv->edev; + mutex_lock(&bnxt_re_mutex); - rc = bnxt_re_add_device(adev, BNXT_QPLIB_WQE_MODE_STATIC); - if (rc) { + en_info = kzalloc(sizeof(*en_info), GFP_KERNEL); + if (!en_info) { mutex_unlock(&bnxt_re_mutex); - return rc; + return -ENOMEM; } + en_info->en_dev = en_dev; - rdev = auxiliary_get_drvdata(adev); + auxiliary_set_drvdata(adev, en_info); - rdev->nb.notifier_call = bnxt_re_netdev_event; - rc = register_netdevice_notifier(&rdev->nb); - if (rc) { - rdev->nb.notifier_call = NULL; - pr_err("%s: Cannot register to netdevice_notifier", - ROCE_DRV_MODULE_NAME); + rc = bnxt_re_add_device(adev, BNXT_RE_COMPLETE_INIT); + if (rc) goto err; - } - - bnxt_re_setup_cc(rdev, true); mutex_unlock(&bnxt_re_mutex); return 0; @@ -1967,11 +2028,15 @@ err: static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_en_dev *en_dev; + struct bnxt_re_dev *rdev; - if (!rdev) + if (!en_info) return 0; + rdev = en_info->rdev; + en_dev = en_info->en_dev; mutex_lock(&bnxt_re_mutex); /* L2 driver may invoke this callback during device error/crash or device * reset. Current RoCE driver doesn't recover the device in case of @@ -1990,13 +2055,20 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); bnxt_re_dev_stop(rdev); - bnxt_re_stop_irq(rdev); + bnxt_re_stop_irq(adev); /* Move the device states to detached and avoid sending any more * commands to HW */ set_bit(BNXT_RE_FLAG_ERR_DEVICE_DETACHED, &rdev->flags); set_bit(ERR_DEVICE_DETACHED, &rdev->rcfw.cmdq.flags); wake_up_all(&rdev->rcfw.cmdq.waitq); + + if (rdev->pacing.dbr_pacing) + bnxt_re_set_pacing_dev_state(rdev); + + ibdev_info(&rdev->ibdev, "%s: L2 driver notified to stop en_state 0x%lx", + __func__, en_dev->en_state); + bnxt_re_remove_device(rdev, BNXT_RE_PRE_RECOVERY_REMOVE, adev); mutex_unlock(&bnxt_re_mutex); return 0; @@ -2004,9 +2076,10 @@ static int bnxt_re_suspend(struct auxiliary_device *adev, pm_message_t state) static int bnxt_re_resume(struct auxiliary_device *adev) { - struct bnxt_re_dev *rdev = auxiliary_get_drvdata(adev); + struct bnxt_re_en_dev_info *en_info = auxiliary_get_drvdata(adev); + struct bnxt_re_dev *rdev; - if (!rdev) + if (!en_info) return 0; mutex_lock(&bnxt_re_mutex); @@ -2017,7 +2090,9 @@ static int bnxt_re_resume(struct auxiliary_device *adev) * L2 driver want to modify the MSIx table. */ - ibdev_info(&rdev->ibdev, "Handle device resume call"); + bnxt_re_add_device(adev, BNXT_RE_POST_RECOVERY_INIT); + rdev = en_info->rdev; + ibdev_info(&rdev->ibdev, "Device resume completed"); mutex_unlock(&bnxt_re_mutex); return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index 49e4a4a50bfa..42e98e5f94cb 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -54,6 +54,10 @@ #include "qplib_rcfw.h" #include "qplib_sp.h" #include "qplib_fp.h" +#include <rdma/ib_addr.h> +#include "bnxt_ulp.h" +#include "bnxt_re.h" +#include "ib_verbs.h" static void __clean_cq(struct bnxt_qplib_cq *cq, u64 qp); @@ -347,6 +351,7 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) case NQ_BASE_TYPE_SRQ_EVENT: { struct bnxt_qplib_srq *srq; + struct bnxt_re_srq *srq_p; struct nq_srq_event *nqsrqe = (struct nq_srq_event *)nqe; @@ -354,6 +359,12 @@ static void bnxt_qplib_service_nq(struct tasklet_struct *t) q_handle |= (u64)le32_to_cpu(nqsrqe->srq_handle_high) << 32; srq = (struct bnxt_qplib_srq *)q_handle; + srq->toggle = (le16_to_cpu(nqe->info10_type) & NQ_CN_TOGGLE_MASK) + >> NQ_CN_TOGGLE_SFT; + srq->dbinfo.toggle = srq->toggle; + srq_p = container_of(srq, struct bnxt_re_srq, qplib_srq); + if (srq_p->uctx_srq_page) + *((u32 *)srq_p->uctx_srq_page) = srq->toggle; bnxt_qplib_armen_db(&srq->dbinfo, DBC_DBC_TYPE_SRQ_ARMENA); if (nq->srqn_handler(nq, @@ -809,13 +820,13 @@ static int bnxt_qplib_alloc_init_swq(struct bnxt_qplib_q *que) { int indx; - que->swq = kcalloc(que->max_wqe, sizeof(*que->swq), GFP_KERNEL); + que->swq = kcalloc(que->max_sw_wqe, sizeof(*que->swq), GFP_KERNEL); if (!que->swq) return -ENOMEM; que->swq_start = 0; - que->swq_last = que->max_wqe - 1; - for (indx = 0; indx < que->max_wqe; indx++) + que->swq_last = que->max_sw_wqe - 1; + for (indx = 0; indx < que->max_sw_wqe; indx++) que->swq[indx].next_idx = indx + 1; que->swq[que->swq_last].next_idx = 0; /* Make it circular */ que->swq_last = 0; @@ -851,7 +862,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(sq); + hwq_attr.depth = bnxt_qplib_get_depth(sq, qp->wqe_mode, false); hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&sq->hwq, &hwq_attr); if (rc) @@ -879,7 +890,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(rq); + hwq_attr.depth = bnxt_qplib_get_depth(rq, qp->wqe_mode, false); hwq_attr.type = HWQ_TYPE_QUEUE; rc = bnxt_qplib_alloc_init_hwq(&rq->hwq, &hwq_attr); if (rc) @@ -1011,7 +1022,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &sq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(sq); + hwq_attr.depth = bnxt_qplib_get_depth(sq, qp->wqe_mode, true); hwq_attr.aux_stride = psn_sz; hwq_attr.aux_depth = psn_sz ? bnxt_qplib_set_sq_size(sq, qp->wqe_mode) : 0; @@ -1052,7 +1063,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) hwq_attr.res = res; hwq_attr.sginfo = &rq->sg_info; hwq_attr.stride = sizeof(struct sq_sge); - hwq_attr.depth = bnxt_qplib_get_depth(rq); + hwq_attr.depth = bnxt_qplib_get_depth(rq, qp->wqe_mode, false); hwq_attr.aux_stride = 0; hwq_attr.aux_depth = 0; hwq_attr.type = HWQ_TYPE_QUEUE; @@ -2471,6 +2482,32 @@ out: return rc; } +static int bnxt_qplib_get_cqe_sq_cons(struct bnxt_qplib_q *sq, u32 cqe_slot) +{ + struct bnxt_qplib_hwq *sq_hwq; + struct bnxt_qplib_swq *swq; + int cqe_sq_cons = -1; + u32 start, last; + + sq_hwq = &sq->hwq; + + start = sq->swq_start; + last = sq->swq_last; + + while (last != start) { + swq = &sq->swq[last]; + if (swq->slot_idx == cqe_slot) { + cqe_sq_cons = swq->next_idx; + dev_err(&sq_hwq->pdev->dev, "%s: Found cons wqe = %d slot = %d\n", + __func__, cqe_sq_cons, cqe_slot); + break; + } + + last = swq->next_idx; + } + return cqe_sq_cons; +} + static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, struct cq_req *hwcqe, struct bnxt_qplib_cqe **pcqe, int *budget, @@ -2478,9 +2515,10 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, { struct bnxt_qplib_swq *swq; struct bnxt_qplib_cqe *cqe; + u32 cqe_sq_cons, slot_num; struct bnxt_qplib_qp *qp; struct bnxt_qplib_q *sq; - u32 cqe_sq_cons; + int cqe_cons; int rc = 0; qp = (struct bnxt_qplib_qp *)((unsigned long) @@ -2492,12 +2530,26 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, } sq = &qp->sq; - cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_wqe; + cqe_sq_cons = le16_to_cpu(hwcqe->sq_cons_idx) % sq->max_sw_wqe; if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } + + if (__is_err_cqe_for_var_wqe(qp, hwcqe->status)) { + slot_num = le16_to_cpu(hwcqe->sq_cons_idx); + cqe_cons = bnxt_qplib_get_cqe_sq_cons(sq, slot_num); + if (cqe_cons < 0) { + dev_err(&cq->hwq.pdev->dev, "%s: Wrong SQ cons cqe_slot_indx = %d\n", + __func__, slot_num); + goto done; + } + cqe_sq_cons = cqe_cons; + dev_err(&cq->hwq.pdev->dev, "%s: cqe_sq_cons = %d swq_last = %d swq_start = %d\n", + __func__, cqe_sq_cons, sq->swq_last, sq->swq_start); + } + /* Require to walk the sq's swq to fabricate CQEs for all previously * signaled SWQEs due to CQE aggregation from the current sq cons * to the cqe_sq_cons @@ -2882,7 +2934,7 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, cqe_cons = le16_to_cpu(hwcqe->sq_cons_idx); if (cqe_cons == 0xFFFF) goto do_rq; - cqe_cons %= sq->max_wqe; + cqe_cons %= sq->max_sw_wqe; if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.h b/drivers/infiniband/hw/bnxt_re/qplib_fp.h index 56538b90d6c5..b62df8701950 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.h @@ -105,6 +105,7 @@ struct bnxt_qplib_srq { struct bnxt_qplib_sg_info sg_info; u16 eventq_hw_ring_id; spinlock_t lock; /* protect SRQE link list */ + u8 toggle; }; struct bnxt_qplib_sge { @@ -251,6 +252,7 @@ struct bnxt_qplib_q { struct bnxt_qplib_db_info dbinfo; struct bnxt_qplib_sg_info sg_info; u32 max_wqe; + u32 max_sw_wqe; u16 wqe_size; u16 q_full_delta; u16 max_sge; @@ -586,15 +588,22 @@ static inline void bnxt_qplib_swq_mod_start(struct bnxt_qplib_q *que, u32 idx) que->swq_start = que->swq[idx].next_idx; } -static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que) +static inline u32 bnxt_qplib_get_depth(struct bnxt_qplib_q *que, u8 wqe_mode, bool is_sq) { - return (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge); + u32 slots; + + /* Queue depth is the number of slots. */ + slots = (que->wqe_size * que->max_wqe) / sizeof(struct sq_sge); + /* For variable WQE mode, need to align the slots to 256 */ + if (wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE && is_sq) + slots = ALIGN(slots, BNXT_VAR_MAX_SLOT_ALIGN); + return slots; } static inline u32 bnxt_qplib_set_sq_size(struct bnxt_qplib_q *que, u8 wqe_mode) { return (wqe_mode == BNXT_QPLIB_WQE_MODE_STATIC) ? - que->max_wqe : bnxt_qplib_get_depth(que); + que->max_wqe : bnxt_qplib_get_depth(que, wqe_mode, true); } static inline u32 bnxt_qplib_set_sq_max_slot(u8 wqe_mode) @@ -641,4 +650,14 @@ static inline __le64 bnxt_re_update_msn_tbl(u32 st_idx, u32 npsn, u32 start_psn) (((start_psn) << SQ_MSN_SEARCH_START_PSN_SFT) & SQ_MSN_SEARCH_START_PSN_MASK)); } + +static inline bool __is_var_wqe(struct bnxt_qplib_qp *qp) +{ + return (qp->wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE); +} + +static inline bool __is_err_cqe_for_var_wqe(struct bnxt_qplib_qp *qp, u8 status) +{ + return (status != CQ_REQ_STATUS_OK) && __is_var_wqe(qp); +} #endif /* __BNXT_QPLIB_FP_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.h b/drivers/infiniband/hw/bnxt_re/qplib_res.h index a0f78cde314f..c2f710364e0f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.h @@ -82,6 +82,7 @@ struct bnxt_qplib_db_pacing_data { u32 fifo_room_mask; u32 fifo_room_shift; u32 grc_reg_offset; + u32 dev_err_state; }; #define BNXT_QPLIB_DBR_PF_DB_OFFSET 0x10000 @@ -565,4 +566,14 @@ static inline u8 bnxt_qplib_dbr_pacing_en(struct bnxt_qplib_chip_ctx *cctx) return cctx->modes.dbr_pacing; } +static inline bool _is_alloc_mr_unified(u16 dev_cap_flags) +{ + return dev_cap_flags & CREQ_QUERY_FUNC_RESP_SB_MR_REGISTER_ALLOC; +} + +static inline bool _is_relaxed_ordering_supported(u16 dev_cap_ext_flags2) +{ + return dev_cap_ext_flags2 & CREQ_QUERY_FUNC_RESP_SB_MEMORY_REGION_RO_SUPPORTED; +} + #endif /* __BNXT_QPLIB_RES_H__ */ diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 9328db92fa6d..4f75e7e5bcf7 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -95,11 +95,13 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, struct bnxt_qplib_cmdqmsg msg = {}; struct creq_query_func_resp_sb *sb; struct bnxt_qplib_rcfw_sbuf sbuf; + struct bnxt_qplib_chip_ctx *cctx; struct cmdq_query_func req = {}; u8 *tqm_alloc; int i, rc; u32 temp; + cctx = rcfw->res->cctx; bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, CMDQ_BASE_OPCODE_QUERY_FUNC, sizeof(req)); @@ -133,8 +135,9 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, * reporting the max number */ attr->max_qp_wqes -= BNXT_QPLIB_RESERVED_QP_WRS + 1; - attr->max_qp_sges = bnxt_qplib_is_chip_gen_p5_p7(rcfw->res->cctx) ? - 6 : sb->max_sge; + + attr->max_qp_sges = cctx->modes.wqe_mode == BNXT_QPLIB_WQE_MODE_VARIABLE ? + min_t(u32, sb->max_sge_var_wqe, BNXT_VAR_MAX_SGE) : 6; attr->max_cq = le32_to_cpu(sb->max_cq); attr->max_cq_wqes = le32_to_cpu(sb->max_cqe); attr->max_cq_sges = attr->max_qp_sges; @@ -541,7 +544,7 @@ int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) req.pd_id = cpu_to_le32(mrw->pd->id); req.mrw_flags = mrw->type; if ((mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_PMR && - mrw->flags & BNXT_QPLIB_FR_PMR) || + mrw->access_flags & BNXT_QPLIB_FR_PMR) || mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A || mrw->type == CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B) req.access = CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY; @@ -653,9 +656,12 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, req.log2_pbl_pg_size = cpu_to_le16(((ilog2(PAGE_SIZE) << CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_SFT) & CMDQ_REGISTER_MR_LOG2_PBL_PG_SIZE_MASK)); - req.access = (mr->flags & 0xFFFF); + req.access = (mr->access_flags & 0xFFFF); req.va = cpu_to_le64(mr->va); req.key = cpu_to_le32(mr->lkey); + if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) + req.key = cpu_to_le32(mr->pd->id); + req.flags = cpu_to_le16(mr->flags); req.mr_size = cpu_to_le64(mr->total_size); bnxt_qplib_fill_cmdqmsg(&msg, &req, &resp, NULL, sizeof(req), @@ -664,6 +670,11 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, if (rc) goto fail; + if (_is_alloc_mr_unified(res->dattr->dev_cap_flags)) { + mr->lkey = le32_to_cpu(resp.xid); + mr->rkey = mr->lkey; + } + return 0; fail: diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 16a67d70a6fc..acd9c14a31c4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ -40,6 +40,7 @@ #ifndef __BNXT_QPLIB_SP_H__ #define __BNXT_QPLIB_SP_H__ +#include <rdma/bnxt_re-abi.h> #define BNXT_QPLIB_RESERVED_QP_WRS 128 struct bnxt_qplib_dev_attr { @@ -108,7 +109,7 @@ struct bnxt_qplib_ah { struct bnxt_qplib_mrw { struct bnxt_qplib_pd *pd; int type; - u32 flags; + u32 access_flags; #define BNXT_QPLIB_FR_PMR 0x80000000 u32 lkey; u32 rkey; @@ -116,6 +117,7 @@ struct bnxt_qplib_mrw { u64 va; u64 total_size; u32 npages; + u16 flags; u64 mr_handle; struct bnxt_qplib_hwq hwq; }; @@ -351,4 +353,11 @@ int bnxt_qplib_qext_stat(struct bnxt_qplib_rcfw *rcfw, u32 fid, int bnxt_qplib_modify_cc(struct bnxt_qplib_res *res, struct bnxt_qplib_cc_param *cc_param); +#define BNXT_VAR_MAX_WQE 4352 +#define BNXT_VAR_MAX_SLOT_ALIGN 256 +#define BNXT_VAR_MAX_SGE 13 +#define BNXT_RE_MAX_RQ_WQES 65536 + +#define BNXT_STATIC_MAX_SGE 6 + #endif /* __BNXT_QPLIB_SP_H__*/ diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h index 042530969505..3ec895284e49 100644 --- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h +++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h @@ -409,7 +409,7 @@ struct creq_deinitialize_fw_resp { u8 reserved48[6]; }; -/* cmdq_create_qp (size:768b/96B) */ +/* cmdq_create_qp (size:832b/104B) */ struct cmdq_create_qp { u8 opcode; #define CMDQ_CREATE_QP_OPCODE_CREATE_QP 0x1UL @@ -430,8 +430,11 @@ struct cmdq_create_qp { #define CMDQ_CREATE_QP_QP_FLAGS_OPTIMIZED_TRANSMIT_ENABLED 0x20UL #define CMDQ_CREATE_QP_QP_FLAGS_RESPONDER_UD_CQE_WITH_CFA 0x40UL #define CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED 0x80UL + #define CMDQ_CREATE_QP_QP_FLAGS_EXPRESS_MODE_ENABLED 0x100UL + #define CMDQ_CREATE_QP_QP_FLAGS_STEERING_TAG_VALID 0x200UL + #define CMDQ_CREATE_QP_QP_FLAGS_RDMA_READ_OR_ATOMICS_USED 0x400UL #define CMDQ_CREATE_QP_QP_FLAGS_LAST \ - CMDQ_CREATE_QP_QP_FLAGS_EXT_STATS_ENABLED + CMDQ_CREATE_QP_QP_FLAGS_RDMA_READ_OR_ATOMICS_USED u8 type; #define CMDQ_CREATE_QP_TYPE_RC 0x2UL #define CMDQ_CREATE_QP_TYPE_UD 0x4UL @@ -492,6 +495,9 @@ struct cmdq_create_qp { __le64 rq_pbl; __le64 irrq_addr; __le64 orrq_addr; + __le32 request_xid; + __le16 steering_tag; + __le16 reserved16; }; /* creq_create_qp_resp (size:128b/16B) */ @@ -972,13 +978,14 @@ struct creq_query_qp_extend_resp_sb_tlv { __le16 reserved_16; }; -/* cmdq_create_srq (size:384b/48B) */ +/* cmdq_create_srq (size:448b/56B) */ struct cmdq_create_srq { u8 opcode; #define CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ 0x5UL #define CMDQ_CREATE_SRQ_OPCODE_LAST CMDQ_CREATE_SRQ_OPCODE_CREATE_SRQ u8 cmd_size; __le16 flags; + #define CMDQ_CREATE_SRQ_FLAGS_STEERING_TAG_VALID 0x1UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1012,6 +1019,8 @@ struct cmdq_create_srq { __le32 dpi; __le32 pd_id; __le64 pbl; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_create_srq_resp (size:128b/16B) */ @@ -1118,7 +1127,7 @@ struct creq_query_srq_resp_sb { __le32 data[4]; }; -/* cmdq_create_cq (size:384b/48B) */ +/* cmdq_create_cq (size:448b/56B) */ struct cmdq_create_cq { u8 opcode; #define CMDQ_CREATE_CQ_OPCODE_CREATE_CQ 0x9UL @@ -1126,6 +1135,8 @@ struct cmdq_create_cq { u8 cmd_size; __le16 flags; #define CMDQ_CREATE_CQ_FLAGS_DISABLE_CQ_OVERFLOW_DETECTION 0x1UL + #define CMDQ_CREATE_CQ_FLAGS_STEERING_TAG_VALID 0x2UL + #define CMDQ_CREATE_CQ_FLAGS_INFINITE_CQ_MODE 0x4UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1157,6 +1168,8 @@ struct cmdq_create_cq { __le32 dpi; __le32 cq_size; __le64 pbl; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_create_cq_resp (size:128b/16B) */ @@ -1288,11 +1301,12 @@ struct cmdq_allocate_mrw { #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2A 0x3UL #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B 0x4UL #define CMDQ_ALLOCATE_MRW_MRW_FLAGS_LAST CMDQ_ALLOCATE_MRW_MRW_FLAGS_MW_TYPE2B - #define CMDQ_ALLOCATE_MRW_UNUSED4_MASK 0xf0UL - #define CMDQ_ALLOCATE_MRW_UNUSED4_SFT 4 + #define CMDQ_ALLOCATE_MRW_STEERING_TAG_VALID 0x10UL + #define CMDQ_ALLOCATE_MRW_UNUSED4_MASK 0xe0UL + #define CMDQ_ALLOCATE_MRW_UNUSED4_SFT 5 u8 access; #define CMDQ_ALLOCATE_MRW_ACCESS_CONSUMER_OWNED_KEY 0x20UL - __le16 unused16; + __le16 steering_tag; __le32 pd_id; }; @@ -1359,14 +1373,16 @@ struct creq_deallocate_key_resp { __le32 bound_window_info; }; -/* cmdq_register_mr (size:384b/48B) */ +/* cmdq_register_mr (size:448b/56B) */ struct cmdq_register_mr { u8 opcode; #define CMDQ_REGISTER_MR_OPCODE_REGISTER_MR 0xfUL #define CMDQ_REGISTER_MR_OPCODE_LAST CMDQ_REGISTER_MR_OPCODE_REGISTER_MR u8 cmd_size; __le16 flags; - #define CMDQ_REGISTER_MR_FLAGS_ALLOC_MR 0x1UL + #define CMDQ_REGISTER_MR_FLAGS_ALLOC_MR 0x1UL + #define CMDQ_REGISTER_MR_FLAGS_STEERING_TAG_VALID 0x2UL + #define CMDQ_REGISTER_MR_FLAGS_ENABLE_RO 0x4UL __le16 cookie; u8 resp_size; u8 reserved8; @@ -1415,6 +1431,8 @@ struct cmdq_register_mr { __le64 pbl; __le64 va; __le64 mr_size; + __le16 steering_tag; + u8 reserved48[6]; }; /* creq_register_mr_resp (size:128b/16B) */ diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 040ba2224f9f..b3757c6a0457 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -1222,6 +1222,8 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb) int ret; ep = lookup_atid(t, atid); + if (!ep) + return -EINVAL; pr_debug("ep %p tid %u snd_isn %u rcv_isn %u\n", ep, tid, be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn)); @@ -2279,6 +2281,9 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb) int ret = 0; ep = lookup_atid(t, atid); + if (!ep) + return -EINVAL; + la = (struct sockaddr_in *)&ep->com.local_addr; ra = (struct sockaddr_in *)&ep->com.remote_addr; la6 = (struct sockaddr_in6 *)&ep->com.local_addr; diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 5111421f9473..14ced7b667fa 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -1126,13 +1126,19 @@ int c4iw_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_free_mm2; mm->key = uresp.key; - mm->addr = virt_to_phys(chp->cq.queue); + mm->addr = 0; + mm->vaddr = chp->cq.queue; + mm->dma_addr = chp->cq.dma_addr; mm->len = chp->cq.memsize; + insert_flag_to_mmap(&rhp->rdev, mm, mm->addr); insert_mmap(ucontext, mm); mm2->key = uresp.gts_key; mm2->addr = chp->cq.bar2_pa; mm2->len = PAGE_SIZE; + mm2->vaddr = NULL; + mm2->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, mm2, mm2->addr); insert_mmap(ucontext, mm2); } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index f838bb6718af..5b3007acaa1f 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -532,11 +532,21 @@ static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c) return container_of(c, struct c4iw_ucontext, ibucontext); } +enum { + CXGB4_MMAP_BAR, + CXGB4_MMAP_BAR_WC, + CXGB4_MMAP_CONTIG, + CXGB4_MMAP_NON_CONTIG, +}; + struct c4iw_mm_entry { struct list_head entry; u64 addr; u32 key; + void *vaddr; + dma_addr_t dma_addr; unsigned len; + u8 mmap_flag; }; static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, @@ -561,6 +571,32 @@ static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext, return NULL; } +static inline void insert_flag_to_mmap(struct c4iw_rdev *rdev, + struct c4iw_mm_entry *mm, u64 addr) +{ + if (addr >= pci_resource_start(rdev->lldi.pdev, 0) && + (addr < (pci_resource_start(rdev->lldi.pdev, 0) + + pci_resource_len(rdev->lldi.pdev, 0)))) + mm->mmap_flag = CXGB4_MMAP_BAR; + else if (addr >= pci_resource_start(rdev->lldi.pdev, 2) && + (addr < (pci_resource_start(rdev->lldi.pdev, 2) + + pci_resource_len(rdev->lldi.pdev, 2)))) { + if (addr >= rdev->oc_mw_pa) { + mm->mmap_flag = CXGB4_MMAP_BAR_WC; + } else { + if (is_t4(rdev->lldi.adapter_type)) + mm->mmap_flag = CXGB4_MMAP_BAR; + else + mm->mmap_flag = CXGB4_MMAP_BAR_WC; + } + } else { + if (addr) + mm->mmap_flag = CXGB4_MMAP_CONTIG; + else + mm->mmap_flag = CXGB4_MMAP_NON_CONTIG; + } +} + static inline void insert_mmap(struct c4iw_ucontext *ucontext, struct c4iw_mm_entry *mm) { @@ -936,7 +972,6 @@ u32 c4iw_get_resource(struct c4iw_id_table *id_table); void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry); int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid, u32 nr_srqt); -int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev); int c4iw_pblpool_create(struct c4iw_rdev *rdev); int c4iw_rqtpool_create(struct c4iw_rdev *rdev); int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev); @@ -944,7 +979,6 @@ void c4iw_pblpool_destroy(struct c4iw_rdev *rdev); void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev); void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev); void c4iw_destroy_resource(struct c4iw_resource *rscp); -int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev); void c4iw_register_device(struct work_struct *work); void c4iw_unregister_device(struct c4iw_dev *dev); int __init c4iw_cm_init(void); @@ -1006,8 +1040,6 @@ int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp); int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count); int c4iw_flush_sq(struct c4iw_qp *qhp); int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid); -u16 c4iw_rqes_posted(struct c4iw_qp *qhp); -int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe); u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx); void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid, struct c4iw_dev_ucontext *uctx); diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 246b739ddb2b..10a4c738b59f 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -113,6 +113,9 @@ static int c4iw_alloc_ucontext(struct ib_ucontext *ucontext, mm->key = uresp.status_page_key; mm->addr = virt_to_phys(rhp->rdev.status_page); mm->len = PAGE_SIZE; + mm->vaddr = NULL; + mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, mm, mm->addr); insert_mmap(context, mm); } return 0; @@ -131,6 +134,11 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) struct c4iw_mm_entry *mm; struct c4iw_ucontext *ucontext; u64 addr; + u8 mmap_flag; + size_t size; + void *vaddr; + unsigned long vm_pgoff; + dma_addr_t dma_addr; pr_debug("pgoff 0x%lx key 0x%x len %d\n", vma->vm_pgoff, key, len); @@ -145,47 +153,38 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) if (!mm) return -EINVAL; addr = mm->addr; + vaddr = mm->vaddr; + dma_addr = mm->dma_addr; + size = mm->len; + mmap_flag = mm->mmap_flag; kfree(mm); - if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 0) + - pci_resource_len(rdev->lldi.pdev, 0)))) { - - /* - * MA_SYNC register... - */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + switch (mmap_flag) { + case CXGB4_MMAP_BAR: + ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, + len, + pgprot_noncached(vma->vm_page_prot)); + break; + case CXGB4_MMAP_BAR_WC: ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, - len, vma->vm_page_prot); - } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) && - (addr < (pci_resource_start(rdev->lldi.pdev, 2) + - pci_resource_len(rdev->lldi.pdev, 2)))) { - - /* - * Map user DB or OCQP memory... - */ - if (addr >= rdev->oc_mw_pa) - vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot); - else { - if (!is_t4(rdev->lldi.adapter_type)) - vma->vm_page_prot = - t4_pgprot_wc(vma->vm_page_prot); - else - vma->vm_page_prot = - pgprot_noncached(vma->vm_page_prot); - } + len, t4_pgprot_wc(vma->vm_page_prot)); + break; + case CXGB4_MMAP_CONTIG: ret = io_remap_pfn_range(vma, vma->vm_start, addr >> PAGE_SHIFT, len, vma->vm_page_prot); - } else { - - /* - * Map WQ or CQ contig dma memory... - */ - ret = remap_pfn_range(vma, vma->vm_start, - addr >> PAGE_SHIFT, - len, vma->vm_page_prot); + break; + case CXGB4_MMAP_NON_CONTIG: + vm_pgoff = vma->vm_pgoff; + vma->vm_pgoff = 0; + ret = dma_mmap_coherent(&rdev->lldi.pdev->dev, vma, + vaddr, dma_addr, size); + vma->vm_pgoff = vm_pgoff; + break; + default: + ret = -EINVAL; + break; } return ret; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index d16d8eaa1415..7b5c4522b426 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -2281,24 +2281,39 @@ int c4iw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, if (ret) goto err_free_ma_sync_key; sq_key_mm->key = uresp.sq_key; - sq_key_mm->addr = qhp->wq.sq.phys_addr; + sq_key_mm->addr = 0; + sq_key_mm->vaddr = qhp->wq.sq.queue; + sq_key_mm->dma_addr = qhp->wq.sq.dma_addr; sq_key_mm->len = PAGE_ALIGN(qhp->wq.sq.memsize); + insert_flag_to_mmap(&rhp->rdev, sq_key_mm, sq_key_mm->addr); insert_mmap(ucontext, sq_key_mm); if (!attrs->srq) { rq_key_mm->key = uresp.rq_key; - rq_key_mm->addr = virt_to_phys(qhp->wq.rq.queue); + rq_key_mm->addr = 0; + rq_key_mm->vaddr = qhp->wq.rq.queue; + rq_key_mm->dma_addr = qhp->wq.rq.dma_addr; rq_key_mm->len = PAGE_ALIGN(qhp->wq.rq.memsize); + insert_flag_to_mmap(&rhp->rdev, rq_key_mm, + rq_key_mm->addr); insert_mmap(ucontext, rq_key_mm); } sq_db_key_mm->key = uresp.sq_db_gts_key; sq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.sq.bar2_pa; + sq_db_key_mm->vaddr = NULL; + sq_db_key_mm->dma_addr = 0; sq_db_key_mm->len = PAGE_SIZE; + insert_flag_to_mmap(&rhp->rdev, sq_db_key_mm, + sq_db_key_mm->addr); insert_mmap(ucontext, sq_db_key_mm); if (!attrs->srq) { rq_db_key_mm->key = uresp.rq_db_gts_key; rq_db_key_mm->addr = (u64)(unsigned long)qhp->wq.rq.bar2_pa; rq_db_key_mm->len = PAGE_SIZE; + rq_db_key_mm->vaddr = NULL; + rq_db_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, rq_db_key_mm, + rq_db_key_mm->addr); insert_mmap(ucontext, rq_db_key_mm); } if (ma_sync_key_mm) { @@ -2307,6 +2322,10 @@ int c4iw_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *attrs, (pci_resource_start(rhp->rdev.lldi.pdev, 0) + PCIE_MA_SYNC_A) & PAGE_MASK; ma_sync_key_mm->len = PAGE_SIZE; + ma_sync_key_mm->vaddr = NULL; + ma_sync_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, ma_sync_key_mm, + ma_sync_key_mm->addr); insert_mmap(ucontext, ma_sync_key_mm); } @@ -2761,12 +2780,19 @@ int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs, if (ret) goto err_free_srq_db_key_mm; srq_key_mm->key = uresp.srq_key; - srq_key_mm->addr = virt_to_phys(srq->wq.queue); + srq_key_mm->addr = 0; srq_key_mm->len = PAGE_ALIGN(srq->wq.memsize); + srq_key_mm->vaddr = srq->wq.queue; + srq_key_mm->dma_addr = srq->wq.dma_addr; + insert_flag_to_mmap(&rhp->rdev, srq_key_mm, srq_key_mm->addr); insert_mmap(ucontext, srq_key_mm); srq_db_key_mm->key = uresp.srq_db_gts_key; srq_db_key_mm->addr = (u64)(unsigned long)srq->wq.bar2_pa; srq_db_key_mm->len = PAGE_SIZE; + srq_db_key_mm->vaddr = NULL; + srq_db_key_mm->dma_addr = 0; + insert_flag_to_mmap(&rhp->rdev, srq_db_key_mm, + srq_db_key_mm->addr); insert_mmap(ucontext, srq_db_key_mm); } diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index e580e087e9da..d7fc9d5eeefd 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -168,7 +168,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable); diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h index 4296662e59c3..cd03a5429beb 100644 --- a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h +++ b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h @@ -674,6 +674,9 @@ struct efa_admin_feature_device_attr_desc { /* Max RDMA transfer size in bytes */ u32 max_rdma_size; + + /* Unique global ID for an EFA device */ + u64 guid; }; struct efa_admin_feature_queue_attr_desc { diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c index 5b9c2b16df0e..5a774925cdea 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.c +++ b/drivers/infiniband/hw/efa/efa_com_cmd.c @@ -465,6 +465,7 @@ int efa_com_get_device_attr(struct efa_com_dev *edev, result->db_bar = resp.u.device_attr.db_bar; result->max_rdma_size = resp.u.device_attr.max_rdma_size; result->device_caps = resp.u.device_attr.device_caps; + result->guid = resp.u.device_attr.guid; if (result->admin_api_version < 1) { ibdev_err_ratelimited( diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h index 9714105fcf7e..668d033f7477 100644 --- a/drivers/infiniband/hw/efa/efa_com_cmd.h +++ b/drivers/infiniband/hw/efa/efa_com_cmd.h @@ -112,6 +112,7 @@ struct efa_com_get_device_attr_result { u8 addr[EFA_GID_SIZE]; u64 page_size_cap; u64 max_mr_pages; + u64 guid; u32 mtu; u32 fw_version; u32 admin_api_version; diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 1a777791bea3..ad225823e6f2 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -441,6 +441,7 @@ static int efa_ib_device_add(struct efa_dev *dev) efa_set_host_info(dev); dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED; + dev->ibdev.node_guid = dev->dev_attr.guid; dev->ibdev.phys_port_cnt = 1; dev->ibdev.num_comp_vectors = dev->neqs ?: 1; dev->ibdev.dev.parent = &pdev->dev; diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index b1e0a1b7c59d..cc13415ff7e7 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1684,14 +1684,14 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct efa_dev *dev = to_edev(ibpd->device); struct ib_umem_dmabuf *umem_dmabuf; struct efa_mr *mr; int err; - mr = efa_alloc_mr(ibpd, access_flags, udata); + mr = efa_alloc_mr(ibpd, access_flags, &attrs->driver_udata); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto err_out; diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h index c8bd698e21b0..3c166359448d 100644 --- a/drivers/infiniband/hw/erdma/erdma.h +++ b/drivers/infiniband/hw/erdma/erdma.h @@ -274,7 +274,8 @@ void notify_eq(struct erdma_eq *eq); void *get_next_valid_eqe(struct erdma_eq *eq); int erdma_aeq_init(struct erdma_dev *dev); -void erdma_aeq_destroy(struct erdma_dev *dev); +int erdma_eq_common_init(struct erdma_dev *dev, struct erdma_eq *eq, u32 depth); +void erdma_eq_destroy(struct erdma_dev *dev, struct erdma_eq *eq); void erdma_aeq_event_handler(struct erdma_dev *dev); void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb); diff --git a/drivers/infiniband/hw/erdma/erdma_cmdq.c b/drivers/infiniband/hw/erdma/erdma_cmdq.c index 43ff40b5a09d..a3d8922d1ad1 100644 --- a/drivers/infiniband/hw/erdma/erdma_cmdq.c +++ b/drivers/infiniband/hw/erdma/erdma_cmdq.c @@ -158,20 +158,13 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) { struct erdma_cmdq *cmdq = &dev->cmdq; struct erdma_eq *eq = &cmdq->eq; + int ret; - eq->depth = cmdq->max_outstandings; - eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - &eq->qbuf_dma_addr, GFP_KERNEL); - if (!eq->qbuf) - return -ENOMEM; - - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); + ret = erdma_eq_common_init(dev, eq, cmdq->max_outstandings); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG; - eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); - if (!eq->dbrec) - goto err_out; erdma_reg_write32(dev, ERDMA_REGS_CMDQ_EQ_ADDR_H_REG, upper_32_bits(eq->qbuf_dma_addr)); @@ -181,12 +174,6 @@ static int erdma_cmdq_eq_init(struct erdma_dev *dev) erdma_reg_write64(dev, ERDMA_CMDQ_EQ_DB_HOST_ADDR_REG, eq->dbrec_dma); return 0; - -err_out: - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, - eq->qbuf_dma_addr); - - return -ENOMEM; } int erdma_cmdq_init(struct erdma_dev *dev) @@ -247,10 +234,7 @@ void erdma_cmdq_destroy(struct erdma_dev *dev) clear_bit(ERDMA_CMDQ_STATE_OK_BIT, &cmdq->state); - dma_free_coherent(&dev->pdev->dev, cmdq->eq.depth << EQE_SHIFT, - cmdq->eq.qbuf, cmdq->eq.qbuf_dma_addr); - - dma_pool_free(dev->db_pool, cmdq->eq.dbrec, cmdq->eq.dbrec_dma); + erdma_eq_destroy(dev, &cmdq->eq); dma_free_coherent(&dev->pdev->dev, cmdq->sq.depth << SQEBB_SHIFT, cmdq->sq.qbuf, cmdq->sq.qbuf_dma_addr); diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c index 84ccdd8144c9..9a72fec6d5cc 100644 --- a/drivers/infiniband/hw/erdma/erdma_eq.c +++ b/drivers/infiniband/hw/erdma/erdma_eq.c @@ -80,50 +80,60 @@ void erdma_aeq_event_handler(struct erdma_dev *dev) notify_eq(&dev->aeq); } -int erdma_aeq_init(struct erdma_dev *dev) +int erdma_eq_common_init(struct erdma_dev *dev, struct erdma_eq *eq, u32 depth) { - struct erdma_eq *eq = &dev->aeq; + u32 buf_size = depth << EQE_SHIFT; - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; - - eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, + eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, buf_size, &eq->qbuf_dma_addr, GFP_KERNEL); if (!eq->qbuf) return -ENOMEM; - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); - atomic64_set(&eq->notify_num, 0); - - eq->db = dev->func_bar + ERDMA_REGS_AEQ_DB_REG; eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); if (!eq->dbrec) - goto err_out; + goto err_free_qbuf; - erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, - upper_32_bits(eq->qbuf_dma_addr)); - erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, - lower_32_bits(eq->qbuf_dma_addr)); - erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); - erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, eq->dbrec_dma); + spin_lock_init(&eq->lock); + atomic64_set(&eq->event_num, 0); + atomic64_set(&eq->notify_num, 0); + eq->ci = 0; + eq->depth = depth; return 0; -err_out: - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, +err_free_qbuf: + dma_free_coherent(&dev->pdev->dev, buf_size, eq->qbuf, eq->qbuf_dma_addr); return -ENOMEM; } -void erdma_aeq_destroy(struct erdma_dev *dev) +void erdma_eq_destroy(struct erdma_dev *dev, struct erdma_eq *eq) { - struct erdma_eq *eq = &dev->aeq; - + dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, eq->qbuf_dma_addr); +} - dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); +int erdma_aeq_init(struct erdma_dev *dev) +{ + struct erdma_eq *eq = &dev->aeq; + int ret; + + ret = erdma_eq_common_init(dev, &dev->aeq, ERDMA_DEFAULT_EQ_DEPTH); + if (ret) + return ret; + + eq->db = dev->func_bar + ERDMA_REGS_AEQ_DB_REG; + + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG, + upper_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG, + lower_32_bits(eq->qbuf_dma_addr)); + erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth); + erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG, eq->dbrec_dma); + + return 0; } void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb) @@ -234,32 +244,21 @@ static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn) struct erdma_eq *eq = &dev->ceqs[ceqn].eq; int ret; - eq->depth = ERDMA_DEFAULT_EQ_DEPTH; - eq->qbuf = dma_alloc_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - &eq->qbuf_dma_addr, GFP_KERNEL); - if (!eq->qbuf) - return -ENOMEM; - - spin_lock_init(&eq->lock); - atomic64_set(&eq->event_num, 0); - atomic64_set(&eq->notify_num, 0); + ret = erdma_eq_common_init(dev, eq, ERDMA_DEFAULT_EQ_DEPTH); + if (ret) + return ret; eq->db = dev->func_bar + ERDMA_REGS_CEQ_DB_BASE_REG + (ceqn + 1) * ERDMA_DB_SIZE; - - eq->dbrec = dma_pool_zalloc(dev->db_pool, GFP_KERNEL, &eq->dbrec_dma); - if (!eq->dbrec) { - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, - eq->qbuf, eq->qbuf_dma_addr); - return -ENOMEM; - } - - eq->ci = 0; dev->ceqs[ceqn].dev = dev; + dev->ceqs[ceqn].ready = true; /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */ ret = create_eq_cmd(dev, ceqn + 1, eq); - dev->ceqs[ceqn].ready = ret ? false : true; + if (ret) { + erdma_eq_destroy(dev, eq); + dev->ceqs[ceqn].ready = false; + } return ret; } @@ -283,9 +282,7 @@ static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn) if (err) return; - dma_free_coherent(&dev->pdev->dev, eq->depth << EQE_SHIFT, eq->qbuf, - eq->qbuf_dma_addr); - dma_pool_free(dev->db_pool, eq->dbrec, eq->dbrec_dma); + erdma_eq_destroy(dev, eq); } int erdma_ceqs_init(struct erdma_dev *dev) diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c index 7080f8a71ec4..62f497a71004 100644 --- a/drivers/infiniband/hw/erdma/erdma_main.c +++ b/drivers/infiniband/hw/erdma/erdma_main.c @@ -333,7 +333,7 @@ err_uninit_cmdq: erdma_cmdq_destroy(dev); err_uninit_aeq: - erdma_aeq_destroy(dev); + erdma_eq_destroy(dev, &dev->aeq); err_uninit_comm_irq: erdma_comm_irq_uninit(dev); @@ -366,7 +366,7 @@ static void erdma_remove_dev(struct pci_dev *pdev) erdma_ceqs_uninit(dev); erdma_hw_reset(dev); erdma_cmdq_destroy(dev); - erdma_aeq_destroy(dev); + erdma_eq_destroy(dev, &dev->aeq); erdma_comm_irq_uninit(dev); pci_free_irq_vectors(dev->pdev); erdma_device_uninit(dev); @@ -490,6 +490,7 @@ static const struct ib_device_ops erdma_device_ops = { .dereg_mr = erdma_dereg_mr, .destroy_cq = erdma_destroy_cq, .destroy_qp = erdma_destroy_qp, + .disassociate_ucontext = erdma_disassociate_ucontext, .get_dma_mr = erdma_get_dma_mr, .get_hw_stats = erdma_get_hw_stats, .get_port_immutable = erdma_get_port_immutable, diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index d7e1cbf9f5c2..51d619edb6c5 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1544,11 +1544,31 @@ int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, return ret; } +static enum ib_qp_state query_qp_state(struct erdma_qp *qp) +{ + switch (qp->attrs.state) { + case ERDMA_QP_STATE_IDLE: + return IB_QPS_INIT; + case ERDMA_QP_STATE_RTR: + return IB_QPS_RTR; + case ERDMA_QP_STATE_RTS: + return IB_QPS_RTS; + case ERDMA_QP_STATE_CLOSING: + return IB_QPS_ERR; + case ERDMA_QP_STATE_TERMINATE: + return IB_QPS_ERR; + case ERDMA_QP_STATE_ERROR: + return IB_QPS_ERR; + default: + return IB_QPS_ERR; + } +} + int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { - struct erdma_qp *qp; struct erdma_dev *dev; + struct erdma_qp *qp; if (ibqp && qp_attr && qp_init_attr) { qp = to_eqp(ibqp); @@ -1575,6 +1595,9 @@ int erdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, qp_init_attr->cap = qp_attr->cap; + qp_attr->qp_state = query_qp_state(qp); + qp_attr->cur_qp_state = query_qp_state(qp); + return 0; } @@ -1701,6 +1724,10 @@ err_out_xa: return ret; } +void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} + void erdma_set_mtu(struct erdma_dev *dev, u32 mtu) { struct erdma_cmdq_config_mtu_req req; diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index 6afdc02f5869..c998acd39a78 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -344,6 +344,7 @@ int erdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *data); int erdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); +void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext); int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, u64 virt, int access, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 3e02c474f59f..4fc5b9d5fea8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -64,8 +64,10 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, u8 tc_mode = 0; int ret; - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) - return -EOPNOTSUPP; + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) { + ret = -EOPNOTSUPP; + goto err_out; + } ah->av.port = rdma_ah_get_port_num(ah_attr); ah->av.gid_index = grh->sgid_index; @@ -83,7 +85,7 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, ret = 0; if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) - return ret; + goto err_out; if (tc_mode == HNAE3_TC_MAP_MODE_DSCP && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) @@ -91,8 +93,10 @@ int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, else ah->av.sl = rdma_ah_get_sl(ah_attr); - if (!check_sl_valid(hr_dev, ah->av.sl)) - return -EINVAL; + if (!check_sl_valid(hr_dev, ah->av.sl)) { + ret = -EINVAL; + goto err_out; + } memcpy(ah->av.dgid, grh->dgid.raw, HNS_ROCE_GID_SIZE); memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN); diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c b/drivers/infiniband/hw/hns/hns_roce_hem.c index 02baa853a76c..c7c167e2a045 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hem.c +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c @@ -1041,9 +1041,9 @@ static bool hem_list_is_bottom_bt(int hopnum, int bt_level) * @bt_level: base address table level * @unit: ba entries per bt page */ -static u32 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) +static u64 hem_list_calc_ba_range(int hopnum, int bt_level, int unit) { - u32 step; + u64 step; int max; int i; @@ -1079,7 +1079,7 @@ int hns_roce_hem_list_calc_root_ba(const struct hns_roce_buf_region *regions, { struct hns_roce_buf_region *r; int total = 0; - int step; + u64 step; int i; for (i = 0; i < region_cnt; i++) { @@ -1110,7 +1110,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, int ret = 0; int max_ofs; int level; - u32 step; + u64 step; int end; if (hopnum <= 1) @@ -1134,10 +1134,12 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, /* config L1 bt to last bt and link them to corresponding parent */ for (level = 1; level < hopnum; level++) { - cur = hem_list_search_item(&mid_bt[level], offset); - if (cur) { - hem_ptrs[level] = cur; - continue; + if (!hem_list_is_bottom_bt(hopnum, level)) { + cur = hem_list_search_item(&mid_bt[level], offset); + if (cur) { + hem_ptrs[level] = cur; + continue; + } } step = hem_list_calc_ba_range(hopnum, level, unit); @@ -1147,7 +1149,7 @@ static int hem_list_alloc_mid_bt(struct hns_roce_dev *hr_dev, } start_aligned = (distance / step) * step + r->offset; - end = min_t(int, start_aligned + step - 1, max_ofs); + end = min_t(u64, start_aligned + step - 1, max_ofs); cur = hem_list_alloc_item(hr_dev, start_aligned, end, unit, true); if (!cur) { @@ -1235,7 +1237,7 @@ static int setup_middle_bt(struct hns_roce_dev *hr_dev, void *cpu_base, struct hns_roce_hem_item *hem, *temp_hem; int total = 0; int offset; - int step; + u64 step; step = hem_list_calc_ba_range(r->hopnum, 1, unit); if (step < 1) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 621b057fb9da..24e906b9d3ae 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1681,8 +1681,8 @@ static int hns_roce_hw_v2_query_counter(struct hns_roce_dev *hr_dev, for (i = 0; i < HNS_ROCE_HW_CNT_TOTAL && i < *num_counters; i++) { bd_idx = i / CNT_PER_DESC; - if (!(desc[bd_idx].flag & HNS_ROCE_CMD_FLAG_NEXT) && - bd_idx != HNS_ROCE_HW_CNT_TOTAL / CNT_PER_DESC) + if (bd_idx != HNS_ROCE_HW_CNT_TOTAL / CNT_PER_DESC && + !(desc[bd_idx].flag & cpu_to_le16(HNS_ROCE_CMD_FLAG_NEXT))) break; cnt_data = (__le64 *)&desc[bd_idx].data[0]; @@ -2972,6 +2972,9 @@ err_llm_init_failed: static void hns_roce_v2_exit(struct hns_roce_dev *hr_dev) { + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) + free_mr_exit(hr_dev); + hns_roce_function_clear(hr_dev); if (!hr_dev->is_vf) @@ -4423,12 +4426,14 @@ static int config_qp_rq_buf(struct hns_roce_dev *hr_dev, upper_32_bits(to_hr_hw_page_addr(mtts[0]))); hr_reg_clear(qpc_mask, QPC_RQ_CUR_BLK_ADDR_H); - context->rq_nxt_blk_addr = cpu_to_le32(to_hr_hw_page_addr(mtts[1])); - qpc_mask->rq_nxt_blk_addr = 0; - - hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, - upper_32_bits(to_hr_hw_page_addr(mtts[1]))); - hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) { + context->rq_nxt_blk_addr = + cpu_to_le32(to_hr_hw_page_addr(mtts[1])); + qpc_mask->rq_nxt_blk_addr = 0; + hr_reg_write(context, QPC_RQ_NXT_BLK_ADDR_H, + upper_32_bits(to_hr_hw_page_addr(mtts[1]))); + hr_reg_clear(qpc_mask, QPC_RQ_NXT_BLK_ADDR_H); + } return 0; } @@ -6193,6 +6198,7 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, struct pci_dev *pdev = hr_dev->pci_dev; struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); const struct hnae3_ae_ops *ops = ae_dev->ops; + enum hnae3_reset_type reset_type; irqreturn_t int_work = IRQ_NONE; u32 int_en; @@ -6204,10 +6210,12 @@ static irqreturn_t abnormal_interrupt_basic(struct hns_roce_dev *hr_dev, roce_write(hr_dev, ROCEE_VF_ABN_INT_ST_REG, 1 << HNS_ROCE_V2_VF_INT_ST_AEQ_OVERFLOW_S); + reset_type = hr_dev->is_vf ? + HNAE3_VF_FUNC_RESET : HNAE3_FUNC_RESET; + /* Set reset level for reset_event() */ if (ops->set_default_reset_request) - ops->set_default_reset_request(ae_dev, - HNAE3_FUNC_RESET); + ops->set_default_reset_request(ae_dev, reset_type); if (ops->reset_event) ops->reset_event(pdev, NULL); @@ -6277,7 +6285,7 @@ static u64 fmea_get_ram_res_addr(u32 res_type, __le64 *data) res_type == ECC_RESOURCE_SCCC) return le64_to_cpu(*data); - return le64_to_cpu(*data) << PAGE_SHIFT; + return le64_to_cpu(*data) << HNS_HW_PAGE_SHIFT; } static int fmea_recover_others(struct hns_roce_dev *hr_dev, u32 res_type, @@ -6949,9 +6957,6 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle, hr_dev->state = HNS_ROCE_DEVICE_STATE_UNINIT; hns_roce_handle_device_err(hr_dev); - if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08) - free_mr_exit(hr_dev); - hns_roce_exit(hr_dev); kfree(hr_dev->priv); ib_dealloc_device(&hr_dev->ib_dev); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 1de384ce4d0e..6b03ba671ff8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1460,19 +1460,19 @@ void hns_roce_lock_cqs(struct hns_roce_cq *send_cq, struct hns_roce_cq *recv_cq) __acquire(&send_cq->lock); __acquire(&recv_cq->lock); } else if (unlikely(send_cq != NULL && recv_cq == NULL)) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (unlikely(send_cq == NULL && recv_cq != NULL)) { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); __acquire(&send_cq->lock); } else if (send_cq == recv_cq) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); __acquire(&recv_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { - spin_lock_irq(&send_cq->lock); + spin_lock(&send_cq->lock); spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING); } else { - spin_lock_irq(&recv_cq->lock); + spin_lock(&recv_cq->lock); spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING); } } @@ -1492,13 +1492,13 @@ void hns_roce_unlock_cqs(struct hns_roce_cq *send_cq, spin_unlock(&recv_cq->lock); } else if (send_cq == recv_cq) { __release(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else if (send_cq->cqn < recv_cq->cqn) { spin_unlock(&recv_cq->lock); - spin_unlock_irq(&send_cq->lock); + spin_unlock(&send_cq->lock); } else { spin_unlock(&send_cq->lock); - spin_unlock_irq(&recv_cq->lock); + spin_unlock(&recv_cq->lock); } } diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index fc0ce35da14e..eeb932e58730 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -1347,7 +1347,7 @@ int irdma_modify_qp_roce(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->max_dest_rd_atomic > dev->hw_attrs.max_hw_ird) { ibdev_err(&iwdev->ibdev, "rd_atomic = %d, above max_hw_ird=%d\n", - attr->max_rd_atomic, + attr->max_dest_rd_atomic, dev->hw_attrs.max_hw_ird); return -EINVAL; } @@ -3085,7 +3085,7 @@ error: static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 len, u64 virt, int fd, int access, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct irdma_device *iwdev = to_iwdev(pd->device); struct ib_umem_dmabuf *umem_dmabuf; diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c index d13abc954d2a..67c2d43135a8 100644 --- a/drivers/infiniband/hw/mana/main.c +++ b/drivers/infiniband/hw/mana/main.c @@ -383,7 +383,7 @@ static int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem create_req->length = umem->length; create_req->offset_in_page = ib_umem_dma_offset(umem, page_sz); - create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->gdma_page_type = order_base_2(page_sz) - MANA_PAGE_SHIFT; create_req->page_count = num_pages_total; ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", @@ -511,13 +511,13 @@ int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) PAGE_SHIFT; prot = pgprot_writecombine(vma->vm_page_prot); - ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + ret = rdma_user_mmap_io(ibcontext, vma, pfn, PAGE_SIZE, prot, NULL); if (ret) ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); else - ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", - pfn, gc->db_page_size, ret); + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %lu, ret %d\n", + pfn, PAGE_SIZE, ret); return ret; } diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c index 9a439569ffcf..d7327735b8d0 100644 --- a/drivers/infiniband/hw/mlx4/alias_GUID.c +++ b/drivers/infiniband/hw/mlx4/alias_GUID.c @@ -829,7 +829,6 @@ void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev) int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) { - char alias_wq_name[22]; int ret = 0; int i, j; union ib_gid gid; @@ -875,9 +874,8 @@ int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev) dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid; dev->sriov.alias_guid.ports_guid[i].port = i; - snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i); dev->sriov.alias_guid.ports_guid[i].wq = - alloc_ordered_workqueue(alias_wq_name, WQ_MEM_RECLAIM); + alloc_ordered_workqueue("alias_guid%d", WQ_MEM_RECLAIM, i); if (!dev->sriov.alias_guid.ports_guid[i].wq) { ret = -ENOMEM; goto err_thread; diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index dc9cf45d2d32..e6e132f10625 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -2158,7 +2158,6 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, struct mlx4_ib_demux_ctx *ctx, int port) { - char name[21]; int ret = 0; int i; @@ -2194,24 +2193,21 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev, goto err_mcg; } - snprintf(name, sizeof(name), "mlx4_ibt%d", port); - ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->wq = alloc_ordered_workqueue("mlx4_ibt%d", WQ_MEM_RECLAIM, port); if (!ctx->wq) { pr_err("Failed to create tunnelling WQ for port %d\n", port); ret = -ENOMEM; goto err_wq; } - snprintf(name, sizeof(name), "mlx4_ibwi%d", port); - ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->wi_wq = alloc_ordered_workqueue("mlx4_ibwi%d", WQ_MEM_RECLAIM, port); if (!ctx->wi_wq) { pr_err("Failed to create wire WQ for port %d\n", port); ret = -ENOMEM; goto err_wiwq; } - snprintf(name, sizeof(name), "mlx4_ibud%d", port); - ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM); + ctx->ud_wq = alloc_ordered_workqueue("mlx4_ibud%d", WQ_MEM_RECLAIM, port); if (!ctx->ud_wq) { pr_err("Failed to create up/down WQ for port %d\n", port); ret = -ENOMEM; diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 72a526236c2e..b38961f5058e 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -6,6 +6,7 @@ mlx5_ib-y := ah.o \ cong.o \ counters.o \ cq.o \ + data_direct.o \ dm.o \ doorbell.o \ gsi.o \ diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 895b62cc528d..7c08e3008927 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -245,3 +245,24 @@ int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid) MLX5_SET(dealloc_uar_in, in, uid, uid); return mlx5_cmd_exec_in(dev, dealloc_uar, in); } + +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid) +{ + u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) + + MLX5_ST_SZ_BYTES(array1024_auto)] = {}; + u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {}; + char *vuid; + int err; + + MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID); + MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(dev, vhca_id)); + MLX5_SET(query_vuid_in, in, data_direct, data_direct); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid); + memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index e5cd31270443..e6c88b6ebd0d 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -58,4 +58,6 @@ int mlx5_cmd_mad_ifc(struct mlx5_ib_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid); int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid); +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c new file mode 100644 index 000000000000..b9ba84afaae2 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include "mlx5_ib.h" +#include "data_direct.h" + +static LIST_HEAD(mlx5_data_direct_dev_list); +static LIST_HEAD(mlx5_data_direct_reg_list); + +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_data_direct_mutex); + +struct mlx5_data_direct_registration { + struct mlx5_ib_dev *ibdev; + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1]; + struct list_head list; +}; + +static const struct pci_device_id mlx5_data_direct_pci_table[] = { + { PCI_VDEVICE(MELLANOX, 0x2100) }, /* ConnectX-8 Data Direct */ + { 0, } +}; + +static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + u8 *vpd_data; + int start; + int ret; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + return PTR_ERR(vpd_data); + } + + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len); + if (start < 0) { + ret = start; + pci_err(pdev, "VU keyword not found, err=%d\n", ret); + goto end; + } + + dev->vuid = kmemdup_nul(vpd_data + start, kw_len, GFP_KERNEL); + ret = dev->vuid ? 0 : -ENOMEM; + +end: + kfree(vpd_data); + return ret; +} + +static void mlx5_data_direct_shutdown(struct pci_dev *pdev) +{ + pci_disable_device(pdev); +} + +static int mlx5_data_direct_set_dma_caps(struct pci_dev *pdev) +{ + int err; + + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + if (err) { + dev_warn(&pdev->dev, + "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err); + err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + if (err) { + dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err); + return err; + } + } + + dma_set_max_seg_size(&pdev->dev, SZ_2G); + return 0; +} + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid) +{ + struct mlx5_data_direct_registration *reg; + struct mlx5_data_direct_dev *dev; + + reg = kzalloc(sizeof(*reg), GFP_KERNEL); + if (!reg) + return -ENOMEM; + + reg->ibdev = ibdev; + strcpy(reg->vuid, vuid); + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(dev, &mlx5_data_direct_dev_list, list) { + if (strcmp(dev->vuid, vuid) == 0) { + mlx5_ib_data_direct_bind(ibdev, dev); + break; + } + } + + /* Add the registration to its global list, to be used upon bind/unbind + * of its affiliated data direct device + */ + list_add_tail(®->list, &mlx5_data_direct_reg_list); + mutex_unlock(&mlx5_data_direct_mutex); + return 0; +} + +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (reg->ibdev == ibdev) { + list_del(®->list); + kfree(reg); + goto end; + } + } + + WARN_ON(true); +end: + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_reg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_bind(reg->ibdev, dev); + } + + /* Add the data direct device to the global list, further IB devices may + * use it later as well + */ + list_add_tail(&dev->list, &mlx5_data_direct_dev_list); + mutex_unlock(&mlx5_data_direct_mutex); +} + +static void mlx5_data_direct_dev_unreg(struct mlx5_data_direct_dev *dev) +{ + struct mlx5_data_direct_registration *reg; + + mutex_lock(&mlx5_data_direct_mutex); + /* Prevent any further affiliations */ + list_del(&dev->list); + list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) { + if (strcmp(dev->vuid, reg->vuid) == 0) + mlx5_ib_data_direct_unbind(reg->ibdev); + } + mutex_unlock(&mlx5_data_direct_mutex); +} + +static int mlx5_data_direct_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct mlx5_data_direct_dev *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->device = &pdev->dev; + dev->pdev = pdev; + + pci_set_drvdata(dev->pdev, dev); + err = pci_enable_device(pdev); + if (err) { + dev_err(dev->device, "Cannot enable PCI device, err=%d\n", err); + goto err; + } + + pci_set_master(pdev); + err = mlx5_data_direct_set_dma_caps(pdev); + if (err) + goto err_disable; + + if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) && + pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128)) + dev_dbg(dev->device, "Enabling pci atomics failed\n"); + + err = mlx5_data_direct_vpd_get_vuid(dev); + if (err) + goto err_disable; + + mlx5_data_direct_dev_reg(dev); + return 0; + +err_disable: + pci_disable_device(pdev); +err: + kfree(dev); + return err; +} + +static void mlx5_data_direct_remove(struct pci_dev *pdev) +{ + struct mlx5_data_direct_dev *dev = pci_get_drvdata(pdev); + + mlx5_data_direct_dev_unreg(dev); + pci_disable_device(pdev); + kfree(dev->vuid); + kfree(dev); +} + +static struct pci_driver mlx5_data_direct_driver = { + .name = KBUILD_MODNAME, + .id_table = mlx5_data_direct_pci_table, + .probe = mlx5_data_direct_probe, + .remove = mlx5_data_direct_remove, + .shutdown = mlx5_data_direct_shutdown, +}; + +int mlx5_data_direct_driver_register(void) +{ + return pci_register_driver(&mlx5_data_direct_driver); +} + +void mlx5_data_direct_driver_unregister(void) +{ + pci_unregister_driver(&mlx5_data_direct_driver); +} diff --git a/drivers/infiniband/hw/mlx5/data_direct.h b/drivers/infiniband/hw/mlx5/data_direct.h new file mode 100644 index 000000000000..2fd2bdbe8f69 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _MLX5_IB_DATA_DIRECT_H +#define _MLX5_IB_DATA_DIRECT_H + +struct mlx5_ib_dev; + +struct mlx5_data_direct_dev { + struct device *device; + struct pci_dev *pdev; + char *vuid; + struct list_head list; +}; + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid); +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev); +int mlx5_data_direct_driver_register(void); +void mlx5_data_direct_driver_unregister(void); + +#endif diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index c7a4ee896121..49af1cfbe6d1 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -13,6 +13,7 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, int vport_index) { struct mlx5_ib_dev *ibdev; + struct net_device *ndev; ibdev = mlx5_eswitch_uplink_get_proto_dev(dev->priv.eswitch, REP_IB); if (!ibdev) @@ -20,12 +21,9 @@ mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, ibdev->port[vport_index].rep = rep; rep->rep_data[REP_IB].priv = ibdev; - write_lock(&ibdev->port[vport_index].roce.netdev_lock); - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - write_unlock(&ibdev->port[vport_index].roce.netdev_lock); + ndev = mlx5_ib_get_rep_netdev(rep->esw, rep->vport); - return 0; + return ib_device_set_netdev(&ibdev->ib_dev, ndev, vport_index + 1); } static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev); @@ -104,10 +102,15 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) ibdev->is_rep = true; vport_index = rep->vport_index; ibdev->port[vport_index].rep = rep; - ibdev->port[vport_index].roce.netdev = - mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, rep->vport); ibdev->mdev = lag_master; ibdev->num_ports = num_ports; + ibdev->ib_dev.phys_port_cnt = num_ports; + ret = ib_device_set_netdev(&ibdev->ib_dev, + mlx5_ib_get_rep_netdev(lag_master->priv.eswitch, + rep->vport), + vport_index + 1); + if (ret) + goto fail_add; ret = __mlx5_ib_add(ibdev, profile); if (ret) @@ -160,9 +163,8 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep) } port = &dev->port[vport_index]; - write_lock(&port->roce.netdev_lock); - port->roce.netdev = NULL; - write_unlock(&port->roce.netdev_lock); + + ib_device_set_netdev(&dev->ib_dev, NULL, vport_index + 1); rep->rep_data[REP_IB].priv = NULL; port->rep = NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 6048b9ad13bb..4999239c8f41 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -48,6 +48,7 @@ #include <rdma/mlx5_user_ioctl_verbs.h> #include <rdma/mlx5_user_ioctl_cmds.h> #include "macsec.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> @@ -146,16 +147,52 @@ static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev, if (upper && port->rep->vport == MLX5_VPORT_UPLINK) continue; - - read_lock(&port->roce.netdev_lock); - rep_ndev = mlx5_ib_get_rep_netdev(port->rep->esw, - port->rep->vport); - if (rep_ndev == ndev) { - read_unlock(&port->roce.netdev_lock); + rep_ndev = ib_device_get_netdev(&dev->ib_dev, i + 1); + if (rep_ndev && rep_ndev == ndev) { + dev_put(rep_ndev); *port_num = i + 1; return &port->roce; } - read_unlock(&port->roce.netdev_lock); + + dev_put(rep_ndev); + } + + return NULL; +} + +static bool mlx5_netdev_send_event(struct mlx5_ib_dev *dev, + struct net_device *ndev, + struct net_device *upper, + struct net_device *ib_ndev) +{ + if (!dev->ib_active) + return false; + + /* Event is about our upper device */ + if (upper == ndev) + return true; + + /* RDMA device is not in lag and not in switchdev */ + if (!dev->is_rep && !upper && ndev == ib_ndev) + return true; + + /* RDMA devie is in switchdev */ + if (dev->is_rep && ndev == ib_ndev) + return true; + + return false; +} + +static struct net_device *mlx5_ib_get_rep_uplink_netdev(struct mlx5_ib_dev *ibdev) +{ + struct mlx5_ib_port *port; + int i; + + for (i = 0; i < ibdev->num_ports; i++) { + port = &ibdev->port[i]; + if (port->rep && port->rep->vport == MLX5_VPORT_UPLINK) { + return ib_device_get_netdev(&ibdev->ib_dev, i + 1); + } } return NULL; @@ -167,6 +204,7 @@ static int mlx5_netdev_event(struct notifier_block *this, struct mlx5_roce *roce = container_of(this, struct mlx5_roce, nb); struct net_device *ndev = netdev_notifier_info_to_dev(ptr); u32 port_num = roce->native_port_num; + struct net_device *ib_ndev = NULL; struct mlx5_core_dev *mdev; struct mlx5_ib_dev *ibdev; @@ -180,47 +218,63 @@ static int mlx5_netdev_event(struct notifier_block *this, /* Should already be registered during the load */ if (ibdev->is_rep) break; - write_lock(&roce->netdev_lock); + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + /* Exit if already registered */ + if (ib_ndev) + goto put_ndev; + if (ndev->dev.parent == mdev->device) - roce->netdev = ndev; - write_unlock(&roce->netdev_lock); + ib_device_set_netdev(&ibdev->ib_dev, ndev, port_num); break; case NETDEV_UNREGISTER: /* In case of reps, ib device goes away before the netdevs */ - write_lock(&roce->netdev_lock); - if (roce->netdev == ndev) - roce->netdev = NULL; - write_unlock(&roce->netdev_lock); - break; + if (ibdev->is_rep) + break; + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + if (ib_ndev == ndev) + ib_device_set_netdev(&ibdev->ib_dev, NULL, port_num); + goto put_ndev; case NETDEV_CHANGE: case NETDEV_UP: case NETDEV_DOWN: { - struct net_device *lag_ndev = mlx5_lag_get_roce_netdev(mdev); struct net_device *upper = NULL; - if (lag_ndev) { - upper = netdev_master_upper_dev_get(lag_ndev); - dev_put(lag_ndev); + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { + struct net_device *lag_ndev; + + if(mlx5_lag_is_roce(mdev)) + lag_ndev = ib_device_get_netdev(&ibdev->ib_dev, 1); + else /* sriov lag */ + lag_ndev = mlx5_ib_get_rep_uplink_netdev(ibdev); + + if (lag_ndev) { + upper = netdev_master_upper_dev_get(lag_ndev); + dev_put(lag_ndev); + } else { + goto done; + } } if (ibdev->is_rep) roce = mlx5_get_rep_roce(ibdev, ndev, upper, &port_num); if (!roce) return NOTIFY_DONE; - if ((upper == ndev || - ((!upper || ibdev->is_rep) && ndev == roce->netdev)) && - ibdev->ib_active) { + + ib_ndev = ib_device_get_netdev(&ibdev->ib_dev, port_num); + + if (mlx5_netdev_send_event(ibdev, ndev, upper, ib_ndev)) { struct ib_event ibev = { }; enum ib_port_state port_state; if (get_port_state(&ibdev->ib_dev, port_num, &port_state)) - goto done; + goto put_ndev; if (roce->last_port_state == port_state) - goto done; + goto put_ndev; roce->last_port_state = port_state; ibev.device = &ibdev->ib_dev; @@ -229,7 +283,7 @@ static int mlx5_netdev_event(struct notifier_block *this, else if (port_state == IB_PORT_ACTIVE) ibev.event = IB_EVENT_PORT_ACTIVE; else - goto done; + goto put_ndev; ibev.element.port_num = port_num; ib_dispatch_event(&ibev); @@ -240,38 +294,13 @@ static int mlx5_netdev_event(struct notifier_block *this, default: break; } +put_ndev: + dev_put(ib_ndev); done: mlx5_ib_put_native_port_mdev(ibdev, port_num); return NOTIFY_DONE; } -static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, - u32 port_num) -{ - struct mlx5_ib_dev *ibdev = to_mdev(device); - struct net_device *ndev; - struct mlx5_core_dev *mdev; - - mdev = mlx5_ib_get_native_port_mdev(ibdev, port_num, NULL); - if (!mdev) - return NULL; - - ndev = mlx5_lag_get_roce_netdev(mdev); - if (ndev) - goto out; - - /* Ensure ndev does not disappear before we invoke dev_hold() - */ - read_lock(&ibdev->port[port_num - 1].roce.netdev_lock); - ndev = ibdev->port[port_num - 1].roce.netdev; - dev_hold(ndev); - read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock); - -out: - mlx5_ib_put_native_port_mdev(ibdev, port_num); - return ndev; -} - struct mlx5_core_dev *mlx5_ib_get_native_port_mdev(struct mlx5_ib_dev *ibdev, u32 ib_port_num, u32 *native_port_num) @@ -546,11 +575,11 @@ static int mlx5_query_port_roce(struct ib_device *device, u32 port_num, if (!put_mdev) goto out; - ndev = mlx5_ib_get_netdev(device, port_num); + ndev = ib_device_get_netdev(device, port_num); if (!ndev) goto out; - if (dev->lag_active) { + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { rcu_read_lock(); upper = netdev_master_upper_dev_get_rcu(ndev); if (upper) { @@ -3024,6 +3053,59 @@ static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) mutex_destroy(&devr->srq_lock); } +static int +mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + void *mkc; + u32 mkey; + u32 pdn; + u32 *in; + int err; + + err = mlx5_core_alloc_pd(mdev, &pdn); + if (err) + return err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; + } + + MLX5_SET(create_mkey_in, in, data_direct, 1); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + kvfree(in); + if (err) + goto err; + + dev->ddr.mkey = mkey; + dev->ddr.pdn = pdn; + return 0; + +err: + mlx5_core_dealloc_pd(mdev, pdn); + return err; +} + +static void +mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) +{ + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); + mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); +} + static u32 get_core_cap_flags(struct ib_device *ibdev, struct mlx5_hca_vport_context *rep) { @@ -3124,6 +3206,60 @@ static void get_dev_fw_str(struct ib_device *ibdev, char *str) fw_rev_sub(dev->mdev)); } +static int lag_event(struct notifier_block *nb, unsigned long event, void *data) +{ + struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev, + lag_events); + struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_port *port; + struct net_device *ndev; + int i, err; + int portnum; + + portnum = 0; + switch (event) { + case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: + ndev = data; + if (ndev) { + if (!mlx5_lag_is_roce(mdev)) { + // sriov lag + for (i = 0; i < dev->num_ports; i++) { + port = &dev->port[i]; + if (port->rep && port->rep->vport == + MLX5_VPORT_UPLINK) { + portnum = i; + break; + } + } + } + err = ib_device_set_netdev(&dev->ib_dev, ndev, + portnum + 1); + dev_put(ndev); + if (err) + return err; + /* Rescan gids after new netdev assignment */ + rdma_roce_rescan_device(&dev->ib_dev); + } + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) +{ + dev->lag_events.notifier_call = lag_event; + blocking_notifier_chain_register(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + +static void mlx5e_lag_event_unregister(struct mlx5_ib_dev *dev) +{ + blocking_notifier_chain_unregister(&dev->mdev->priv.lag_nh, + &dev->lag_events); +} + static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; @@ -3145,6 +3281,7 @@ static int mlx5_eth_lag_init(struct mlx5_ib_dev *dev) goto err_destroy_vport_lag; } + mlx5e_lag_event_register(dev); dev->flow_db->lag_demux_ft = ft; dev->lag_ports = mlx5_lag_get_num_ports(mdev); dev->lag_active = true; @@ -3162,6 +3299,7 @@ static void mlx5_eth_lag_cleanup(struct mlx5_ib_dev *dev) if (dev->lag_active) { dev->lag_active = false; + mlx5e_lag_event_unregister(dev); mlx5_destroy_flow_table(dev->flow_db->lag_demux_ft); dev->flow_db->lag_demux_ft = NULL; @@ -3420,6 +3558,41 @@ unbind: return false; } +static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) +{ + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {}; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return 0; + + ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid); + if (ret) + return ret; + + ret = mlx5_ib_create_data_direct_resources(dev); + if (ret) + return ret; + + INIT_LIST_HEAD(&dev->data_direct_mr_list); + ret = mlx5_data_direct_ib_reg(dev, vuid); + if (ret) + mlx5_ib_free_data_direct_resources(dev); + + return ret; +} + +static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, data_direct) || + !MLX5_CAP_GEN_2(dev->mdev, query_vuid)) + return; + + mlx5_data_direct_ib_unreg(dev); + mlx5_ib_free_data_direct_resources(dev); +} + static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) { u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; @@ -3796,6 +3969,14 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE( dump_fill_mkey), UA_MANDATORY)); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_reg_dmabuf_mr, + UVERBS_OBJECT_MR, + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum mlx5_ib_uapi_reg_dmabuf_flags, + UA_OPTIONAL)); + static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), @@ -3805,6 +3986,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_create_cq_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), @@ -3813,6 +3995,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { + mlx5_ib_data_direct_cleanup(dev); mlx5_ib_cleanup_multiport_master(dev); WARN_ON(!xa_empty(&dev->odp_mkeys)); mutex_destroy(&dev->cap_mask_mutex); @@ -3828,13 +4011,11 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; - dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.dev.parent = mdev->device; dev->ib_dev.lag_flags = RDMA_LAG_FLAGS_HASH_ALL_SLAVES; for (i = 0; i < dev->num_ports; i++) { spin_lock_init(&dev->port[i].mp.mpi_lock); - rwlock_init(&dev->port[i].roce.netdev_lock); dev->port[i].roce.dev = dev; dev->port[i].roce.native_port_num = i + 1; dev->port[i].roce.last_port_state = IB_PORT_DOWN; @@ -3866,6 +4047,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev); mutex_init(&dev->cap_mask_mutex); + mutex_init(&dev->data_direct_lock); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); xa_init(&dev->odp_mkeys); @@ -3874,6 +4056,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; + err = mlx5_ib_data_direct_init(dev); + if (err) + goto err_mp; + return 0; err_mp: mlx5_ib_cleanup_multiport_master(dev); @@ -4094,7 +4280,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = { .create_wq = mlx5_ib_create_wq, .destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table, .destroy_wq = mlx5_ib_destroy_wq, - .get_netdev = mlx5_ib_get_netdev, .modify_wq = mlx5_ib_modify_wq, INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mlx5_ib_rwq_ind_table, @@ -4293,6 +4478,22 @@ static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); } +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev) +{ + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = dev; + mutex_unlock(&ibdev->data_direct_lock); +} + +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev) +{ + mutex_lock(&ibdev->data_direct_lock); + mlx5_ib_revoke_data_direct_mrs(ibdev); + ibdev->data_direct_dev = NULL; + mutex_unlock(&ibdev->data_direct_lock); +} + void __mlx5_ib_remove(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile, int stage) @@ -4522,6 +4723,7 @@ static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, mplane->mdev = mparent->mdev; mplane->num_ports = mparent->num_plane; mplane->sub_dev_name = name; + mplane->ib_dev.phys_port_cnt = mplane->num_ports; ret = __mlx5_ib_add(mplane, &plane_profile); if (ret) @@ -4638,6 +4840,7 @@ static int mlx5r_probe(struct auxiliary_device *adev, dev->mdev = mdev; dev->num_ports = num_ports; + dev->ib_dev.phys_port_cnt = num_ports; if (ll == IB_LINK_LAYER_ETHERNET && !mlx5_get_roce_state(mdev)) profile = &raw_eth_profile; @@ -4715,17 +4918,23 @@ static int __init mlx5_ib_init(void) ret = mlx5r_rep_init(); if (ret) goto rep_err; + ret = mlx5_data_direct_driver_register(); + if (ret) + goto dd_err; ret = auxiliary_driver_register(&mlx5r_mp_driver); if (ret) goto mp_err; ret = auxiliary_driver_register(&mlx5r_driver); if (ret) goto drv_err; + return 0; drv_err: auxiliary_driver_unregister(&mlx5r_mp_driver); mp_err: + mlx5_data_direct_driver_unregister(); +dd_err: mlx5r_rep_cleanup(); rep_err: mlx5_ib_qp_event_cleanup(); @@ -4737,6 +4946,7 @@ qp_event_err: static void __exit mlx5_ib_cleanup(void) { + mlx5_data_direct_driver_unregister(); auxiliary_driver_unregister(&mlx5r_driver); auxiliary_driver_unregister(&mlx5r_mp_driver); mlx5r_rep_cleanup(); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index d5eb1b726675..23fd72f7f63d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -63,17 +63,6 @@ __mlx5_log_page_size_to_bitmap(unsigned int log_pgsz_bits, return GENMASK(largest_pg_shift, pgsz_shift); } -/* - * For mkc users, instead of a page_offset the command has a start_iova which - * specifies both the page_offset and the on-the-wire IOVA - */ -#define mlx5_umem_find_best_pgsz(umem, typ, log_pgsz_fld, pgsz_shift, iova) \ - ib_umem_find_best_pgsz(umem, \ - __mlx5_log_page_size_to_bitmap( \ - __mlx5_bit_sz(typ, log_pgsz_fld), \ - pgsz_shift), \ - iova) - static __always_inline unsigned long __mlx5_page_offset_to_bitmask(unsigned int page_offset_bits, unsigned int offset_shift) @@ -640,6 +629,8 @@ enum mlx5_mkey_type { MLX5_MKEY_MR = 1, MLX5_MKEY_MW, MLX5_MKEY_INDIRECT_DEVX, + MLX5_MKEY_NULL, + MLX5_MKEY_IMPLICIT_CHILD, }; struct mlx5r_cache_rb_key { @@ -682,6 +673,8 @@ struct mlx5_ib_mr { struct mlx5_ib_mkey mmkey; struct ib_umem *umem; + /* The mr is data direct related */ + u8 data_direct :1; union { /* Used only by kernel MRs (umem == NULL) */ @@ -719,6 +712,11 @@ struct mlx5_ib_mr { } odp_destroy; struct ib_odp_counters odp_stats; bool is_odp_implicit; + /* The affilated data direct crossed mr */ + struct mlx5_ib_mr *dd_crossed_mr; + struct list_head dd_node; + u8 revoked :1; + struct mlx5_ib_mkey null_mmkey; }; }; }; @@ -796,6 +794,7 @@ struct mlx5_cache_ent { u8 is_tmp:1; u8 disabled:1; u8 fill_to_high_water:1; + u8 tmp_cleanup_scheduled:1; /* * - limit is the low water mark for stored mkeys, 2* limit is the @@ -827,7 +826,6 @@ struct mlx5_mkey_cache { struct mutex rb_lock; struct dentry *fs_root; unsigned long last_add; - struct delayed_work remove_ent_dwork; }; struct mlx5_ib_port_resources { @@ -835,6 +833,11 @@ struct mlx5_ib_port_resources { struct work_struct pkey_change_work; }; +struct mlx5_data_direct_resources { + u32 pdn; + u32 mkey; +}; + struct mlx5_ib_resources { struct ib_cq *c0; struct mutex cq_lock; @@ -885,8 +888,6 @@ struct mlx5_roce { /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL * netdev pointer */ - rwlock_t netdev_lock; - struct net_device *netdev; struct notifier_block nb; struct netdev_net_notifier nn; struct notifier_block mdev_nb; @@ -1131,7 +1132,11 @@ struct mlx5_macsec { struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_data_direct_dev *data_direct_dev; + /* protect accessing data_direct_dev */ + struct mutex data_direct_lock; struct notifier_block mdev_events; + struct notifier_block lag_events; int num_ports; /* serialize update of capability mask */ @@ -1161,6 +1166,7 @@ struct mlx5_ib_dev { /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; + struct list_head data_direct_mr_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; @@ -1185,6 +1191,7 @@ struct mlx5_ib_dev { u16 pkey_table_len; u8 lag_ports; struct mlx5_special_mkeys mkeys; + struct mlx5_data_direct_resources ddr; #ifdef CONFIG_MLX5_MACSEC struct mlx5_macsec macsec; @@ -1345,7 +1352,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, @@ -1356,7 +1363,6 @@ int mlx5_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags); -void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr); struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, @@ -1425,6 +1431,10 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev); +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev); +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); @@ -1633,8 +1643,6 @@ static inline void mlx5r_deref_wait_odp_mkey(struct mlx5_ib_mkey *mmkey) wait_event(mmkey->wait, refcount_read(&mmkey->usecount) == 0); } -int mlx5_ib_test_wc(struct mlx5_ib_dev *dev); - static inline bool mlx5_ib_lag_should_assign_affinity(struct mlx5_ib_dev *dev) { /* @@ -1707,4 +1715,20 @@ static inline u32 smi_to_native_portnum(struct mlx5_ib_dev *dev, u32 port) return (port - 1) / dev->num_ports + 1; } +/* + * For mkc users, instead of a page_offset the command has a start_iova which + * specifies both the page_offset and the on-the-wire IOVA + */ +static __always_inline unsigned long +mlx5_umem_mkc_find_best_pgsz(struct mlx5_ib_dev *dev, struct ib_umem *umem, + u64 iova) +{ + int page_size_bits = + MLX5_CAP_GEN_2(dev->mdev, umr_log_entity_size_5) ? 6 : 5; + unsigned long bitmap = + __mlx5_log_page_size_to_bitmap(page_size_bits, 0); + + return ib_umem_find_best_pgsz(umem, bitmap, iova); +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 98bd8eaa393e..45d9dc9c6c8f 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -43,18 +43,22 @@ #include "dm.h" #include "mlx5_ib.h" #include "umr.h" +#include "data_direct.h" enum { MAX_PENDING_REG_MR = 8, }; +#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4 #define MLX5_UMR_ALIGN 2048 static void create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate); + unsigned int page_size, bool populate, + int access_mode); +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, struct ib_pd *pd) @@ -211,9 +215,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&ent->mkeys_queue.lock, flags); push_mkey_locked(ent, mkey_out->mkey); + ent->pending--; /* If we are doing fill_to_high_water then keep going. */ queue_adjust_cache_locked(ent); - ent->pending--; spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags); kfree(mkey_out); } @@ -527,6 +531,21 @@ static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) } } +static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) +{ + u32 mkey; + + spin_lock_irq(&ent->mkeys_queue.lock); + while (ent->mkeys_queue.ci) { + mkey = pop_mkey_locked(ent); + spin_unlock_irq(&ent->mkeys_queue.lock); + mlx5_core_destroy_mkey(dev->mdev, mkey); + spin_lock_irq(&ent->mkeys_queue.lock); + } + ent->tmp_cleanup_scheduled = false; + spin_unlock_irq(&ent->mkeys_queue.lock); +} + static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; @@ -598,7 +617,11 @@ static void delayed_cache_work_func(struct work_struct *work) struct mlx5_cache_ent *ent; ent = container_of(work, struct mlx5_cache_ent, dwork.work); - __cache_work_func(ent); + /* temp entries are never filled, only cleaned */ + if (ent->is_tmp) + clean_keys(ent->dev, ent); + else + __cache_work_func(ent); } static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, @@ -659,6 +682,7 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, { struct rb_node *node = dev->cache.rb_root.rb_node; struct mlx5_cache_ent *cur, *smallest = NULL; + u64 ndescs_limit; int cmp; /* @@ -677,10 +701,18 @@ mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, return cur; } + /* + * Limit the usage of mkeys larger than twice the required size while + * also allowing the usage of smallest cache entry for small MRs. + */ + ndescs_limit = max_t(u64, rb_key.ndescs * 2, + MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS); + return (smallest && smallest->rb_key.access_mode == rb_key.access_mode && smallest->rb_key.access_flags == rb_key.access_flags && - smallest->rb_key.ats == rb_key.ats) ? + smallest->rb_key.ats == rb_key.ats && + smallest->rb_key.ndescs <= ndescs_limit) ? smallest : NULL; } @@ -765,21 +797,6 @@ struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, return _mlx5_mr_cache_alloc(dev, ent, access_flags); } -static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) -{ - u32 mkey; - - cancel_delayed_work(&ent->dwork); - spin_lock_irq(&ent->mkeys_queue.lock); - while (ent->mkeys_queue.ci) { - mkey = pop_mkey_locked(ent); - spin_unlock_irq(&ent->mkeys_queue.lock); - mlx5_core_destroy_mkey(dev->mdev, mkey); - spin_lock_irq(&ent->mkeys_queue.lock); - } - spin_unlock_irq(&ent->mkeys_queue.lock); -} - static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) { if (!mlx5_debugfs_root || dev->is_rep) @@ -892,10 +909,6 @@ mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, ent->limit = 0; mlx5_mkey_cache_debugfs_add_ent(dev, ent); - } else { - mod_delayed_work(ent->dev->cache.wq, - &ent->dev->cache.remove_ent_dwork, - msecs_to_jiffies(30 * 1000)); } return ent; @@ -906,35 +919,6 @@ mkeys_err: return ERR_PTR(ret); } -static void remove_ent_work_func(struct work_struct *work) -{ - struct mlx5_mkey_cache *cache; - struct mlx5_cache_ent *ent; - struct rb_node *cur; - - cache = container_of(work, struct mlx5_mkey_cache, - remove_ent_dwork.work); - mutex_lock(&cache->rb_lock); - cur = rb_last(&cache->rb_root); - while (cur) { - ent = rb_entry(cur, struct mlx5_cache_ent, node); - cur = rb_prev(cur); - mutex_unlock(&cache->rb_lock); - - spin_lock_irq(&ent->mkeys_queue.lock); - if (!ent->is_tmp) { - spin_unlock_irq(&ent->mkeys_queue.lock); - mutex_lock(&cache->rb_lock); - continue; - } - spin_unlock_irq(&ent->mkeys_queue.lock); - - clean_keys(ent->dev, ent); - mutex_lock(&cache->rb_lock); - } - mutex_unlock(&cache->rb_lock); -} - int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mkey_cache *cache = &dev->cache; @@ -950,7 +934,6 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mutex_init(&dev->slow_path_mutex); mutex_init(&dev->cache.rb_lock); dev->cache.rb_root = RB_ROOT; - INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func); cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); if (!cache->wq) { mlx5_ib_warn(dev, "failed to create work queue\n"); @@ -962,7 +945,7 @@ int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) mlx5_mkey_cache_debugfs_init(dev); mutex_lock(&cache->rb_lock); for (i = 0; i <= mkey_cache_max_order(dev); i++) { - rb_key.ndescs = 1 << (i + 2); + rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i; ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); if (IS_ERR(ent)) { ret = PTR_ERR(ent); @@ -1001,7 +984,6 @@ void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) return; mutex_lock(&dev->cache.rb_lock); - cancel_delayed_work(&dev->cache.remove_ent_dwork); for (node = rb_first(root); node; node = rb_next(node)) { ent = rb_entry(node, struct mlx5_cache_ent, node); spin_lock_irq(&ent->mkeys_queue.lock); @@ -1062,6 +1044,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) MLX5_SET(mkc, mkc, length64, 1); set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, pd); + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); if (err) @@ -1126,12 +1109,10 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, - int access_flags) + int access_flags, int access_mode) { - struct mlx5r_cache_rb_key rb_key = { - .access_mode = MLX5_MKC_ACCESS_MODE_MTT, - }; struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5r_cache_rb_key rb_key = {}; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; unsigned int page_size; @@ -1139,11 +1120,11 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (umem->is_dmabuf) page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); else - page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, - 0, iova); + page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); + rb_key.access_mode = access_mode; rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); @@ -1154,7 +1135,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, */ if (!ent) { mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, false); + mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(mr)) return mr; @@ -1175,13 +1156,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } +static struct ib_mr * +reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags, + u32 crossed_lkey) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING; + struct mlx5_ib_mr *mr; + void *mkc; + int inlen; + u32 *in; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey)) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, crossing_target_vhca_id, + MLX5_CAP_GEN(dev->mdev, vhca_id)); + MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + + /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */ + set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd); + MLX5_SET64(mkc, mkc, len, iova + length); + + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_2; + + mr->mmkey.type = MLX5_MKEY_MR; + set_mr_fields(dev, mr, length, access_flags, iova); + mr->ibmr.pd = pd; + kvfree(in); + mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key); + + return &mr->ibmr; +err_2: + kvfree(in); +err_1: + kfree(mr); + return ERR_PTR(err); +} + /* * If ibmr is NULL it will be allocated by reg_create. * Else, the given ibmr will be used. */ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate) + unsigned int page_size, bool populate, + int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1190,7 +1229,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, int inlen; u32 *in; int err; - bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && + (access_mode == MLX5_MKC_ACCESS_MODE_MTT); + bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); if (!page_size) return ERR_PTR(-EINVAL); @@ -1213,7 +1254,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); if (populate) { - if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { + if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) { err = -EINVAL; goto err_2; } @@ -1229,14 +1270,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); set_mkc_access_pd_addr_fields(mkc, access_flags, iova, populate ? pd : dev->umrc.pd); + /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */ + if (umem->is_dmabuf && ksm_mode) + MLX5_SET(mkc, mkc, pd, dev->ddr.pdn); + MLX5_SET(mkc, mkc, free, !populate); - MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, len, umem->length); MLX5_SET(mkc, mkc, bsf_octword_size, 0); - MLX5_SET(mkc, mkc, translations_octword_size, - get_octo_len(iova, umem->length, mr->page_shift)); + if (ksm_mode) + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift) * 2); + else + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift)); MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); if (mlx5_umem_needs_ats(dev, umem, access_flags)) MLX5_SET(mkc, mkc, ma_translation_mode, 1); @@ -1373,13 +1422,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { - mr = alloc_cacheable_mr(pd, umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); } else { - unsigned int page_size = mlx5_umem_find_best_pgsz( - umem, mkc, log_page_size, 0, iova); + unsigned int page_size = + mlx5_umem_mkc_find_best_pgsz(dev, umem, iova); mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, true); + mr = reg_create(pd, umem, iova, access_flags, page_size, + true, MLX5_MKC_ACCESS_MODE_MTT); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { @@ -1442,7 +1493,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(odp)) return ERR_CAST(odp); - mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); if (IS_ERR(mr)) { ib_umem_release(&odp->umem); return ERR_CAST(mr); @@ -1510,35 +1562,31 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .move_notify = mlx5_ib_dmabuf_invalidate_cb, }; -struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, - u64 length, u64 virt_addr, - int fd, int access_flags, - struct ib_udata *udata) +static struct ib_mr * +reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, + u64 offset, u64 length, u64 virt_addr, + int fd, int access_flags, int access_mode) { + bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; struct ib_umem_dmabuf *umem_dmabuf; int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || - !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - return ERR_PTR(-EOPNOTSUPP); - - mlx5_ib_dbg(dev, - "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", - offset, virt_addr, length, fd, access_flags); - err = mlx5r_umr_resource_init(dev); if (err) return ERR_PTR(err); - /* dmabuf requires xlt update via umr to work. */ - if (!mlx5r_umr_can_load_pas(dev, length)) - return ERR_PTR(-EINVAL); + if (!pinned_mode) + umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, + offset, length, fd, + access_flags, + &mlx5_ib_dmabuf_attach_ops); + else + umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, + dma_device, offset, length, + fd, access_flags); - umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, - access_flags, - &mlx5_ib_dmabuf_attach_ops); if (IS_ERR(umem_dmabuf)) { mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", PTR_ERR(umem_dmabuf)); @@ -1546,7 +1594,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, } mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, - access_flags); + access_flags, access_mode); if (IS_ERR(mr)) { ib_umem_release(&umem_dmabuf->umem); return ERR_CAST(mr); @@ -1556,9 +1604,13 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); umem_dmabuf->private = mr; - err = mlx5r_store_odp_mkey(dev, &mr->mmkey); - if (err) - goto err_dereg_mr; + if (!pinned_mode) { + err = mlx5r_store_odp_mkey(dev, &mr->mmkey); + if (err) + goto err_dereg_mr; + } else { + mr->data_direct = true; + } err = mlx5_ib_init_dmabuf_mr(mr); if (err) @@ -1566,10 +1618,101 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, return &mr->ibmr; err_dereg_mr: - mlx5_ib_dereg_mr(&mr->ibmr, NULL); + __mlx5_ib_dereg_mr(&mr->ibmr); return ERR_PTR(err); } +static struct ib_mr * +reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_data_direct_dev *data_direct_dev; + struct ib_mr *crossing_mr; + struct ib_mr *crossed_mr; + int ret = 0; + + /* As of HW behaviour the IOVA must be page aligned in KSM mode */ + if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND)) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -EINVAL; + goto end; + } + + /* The device's 'data direct mkey' was created without RO flags to + * simplify things and allow for a single mkey per device. + * Since RO is not a must, mask it out accordingly. + */ + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, + offset, length, virt_addr, fd, + access_flags, MLX5_MKC_ACCESS_MODE_KSM); + if (IS_ERR(crossed_mr)) { + ret = PTR_ERR(crossed_mr); + goto end; + } + + mutex_lock(&dev->slow_path_mutex); + crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags, + crossed_mr->lkey); + mutex_unlock(&dev->slow_path_mutex); + if (IS_ERR(crossing_mr)) { + __mlx5_ib_dereg_mr(crossed_mr); + ret = PTR_ERR(crossing_mr); + goto end; + } + + list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list); + to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr); + to_mmr(crossing_mr)->data_direct = true; +end: + mutex_unlock(&dev->data_direct_lock); + return ret ? ERR_PTR(ret) : crossing_mr; +} + +struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int mlx5_access_flags = 0; + int err; + + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || + !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + return ERR_PTR(-EOPNOTSUPP); + + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { + err = uverbs_get_flags32(&mlx5_access_flags, attrs, + MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT); + if (err) + return ERR_PTR(err); + } + + mlx5_ib_dbg(dev, + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n", + offset, virt_addr, length, fd, access_flags, mlx5_access_flags); + + /* dmabuf requires xlt update via umr to work. */ + if (!mlx5r_umr_can_load_pas(dev, length)) + return ERR_PTR(-EINVAL); + + if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT) + return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, + fd, access_flags); + + return reg_user_mr_dmabuf(pd, pd->device->dma_device, + offset, length, virt_addr, + fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); +} + /* * True if the change in access flags can be done via UMR, only some access * flags can be updated. @@ -1601,8 +1744,7 @@ static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) return false; - *page_size = - mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); + *page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova); if (WARN_ON(!*page_size)) return false; return (mr->mmkey.cache_ent->rb_key.ndescs) >= @@ -1665,7 +1807,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct mlx5_ib_mr *mr = to_mmr(ib_mr); int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -1793,7 +1935,7 @@ err: static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && mr->descs) { + if (!mr->umem && !mr->data_direct && mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; struct mlx5_ib_dev *dev = to_mdev(device); @@ -1847,13 +1989,51 @@ end: return ret; } +static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + int err; + + lockdep_assert_held(&dev->data_direct_lock); + mr->revoked = true; + err = mlx5r_umr_revoke_mr(mr); + if (WARN_ON(err)) + return err; + + ib_umem_dmabuf_revoke(umem_dmabuf); + return 0; +} + +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_mr *mr, *next; + + lockdep_assert_held(&dev->data_direct_lock); + + list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) { + list_del(&mr->dd_node); + mlx5_ib_revoke_data_direct_mr(mr); + } +} + static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; - if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) + if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { + ent = mr->mmkey.cache_ent; + /* upon storing to a clean temp entry - schedule its cleanup */ + spin_lock_irq(&ent->mkeys_queue.lock); + if (ent->is_tmp && !ent->tmp_cleanup_scheduled) { + mod_delayed_work(ent->dev->cache.wq, &ent->dwork, + msecs_to_jiffies(30 * 1000)); + ent->tmp_cleanup_scheduled = true; + } + spin_unlock_irq(&ent->mkeys_queue.lock); return 0; + } if (ent) { spin_lock_irq(&ent->mkeys_queue.lock); @@ -1864,7 +2044,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) return destroy_mkey(dev, mr); } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); struct mlx5_ib_dev *dev = to_mdev(ibmr->device); @@ -1931,9 +2111,40 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } +static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr; + int ret; + + ret = __mlx5_ib_dereg_mr(&mr->ibmr); + if (ret) + return ret; + + mutex_lock(&dev->data_direct_lock); + if (!dd_crossed_mr->revoked) + list_del(&dd_crossed_mr->dd_node); + + ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr); + mutex_unlock(&dev->data_direct_lock); + return ret; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + + if (mr->data_direct) + return dereg_crossing_data_direct_mr(dev, mr); + + return __mlx5_ib_dereg_mr(ibmr); +} + static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, int access_mode, int page_shift) { + struct mlx5_ib_dev *dev = to_mdev(pd->device); void *mkc; mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); @@ -1946,6 +2157,9 @@ static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET(mkc, mkc, log_page_size, page_shift); + if (access_mode == MLX5_MKC_ACCESS_MODE_PA || + access_mode == MLX5_MKC_ACCESS_MODE_MTT) + MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats)); } static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index a524181f34df..4b37446758fd 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -45,7 +45,7 @@ /* Contains the details of a pagefault. */ struct mlx5_pagefault { u32 bytes_committed; - u32 token; + u64 token; u8 event_subtype; u8 type; union { @@ -74,6 +74,14 @@ struct mlx5_pagefault { u32 rdma_op_len; u64 rdma_va; } rdma; + struct { + u64 va; + u32 mkey; + u32 fault_byte_count; + u32 prefetch_before_byte_count; + u32 prefetch_after_byte_count; + u8 flags; + } memory; }; struct mlx5_ib_pf_eq *eq; @@ -99,13 +107,20 @@ static u64 mlx5_imr_ksm_entries; static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, struct mlx5_ib_mr *imr, int flags) { + struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev; struct mlx5_klm *end = pklm + nentries; + int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0; + __be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ? + cpu_to_be32(imr->null_mmkey.key) : + mr_to_mdev(imr)->mkeys.null_mkey; + u64 va = + MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } return; } @@ -129,7 +144,7 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, */ lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); - for (; pklm != end; pklm++, idx++) { + for (; pklm != end; pklm++, idx++, va += step) { struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); @@ -137,8 +152,8 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, pklm->key = cpu_to_be32(mtt->ibmr.lkey); pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = mr_to_mdev(imr)->mkeys.null_mkey; - pklm->va = 0; + pklm->key = key; + pklm->va = cpu_to_be64(va); } } } @@ -217,6 +232,9 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) return; xa_erase(&imr->implicit_children, idx); + if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault)) + xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key)); /* Freeing a MR is a sleeping operation, so bounce to a work queue */ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); @@ -332,46 +350,46 @@ static void internal_fill_odp_caps(struct mlx5_ib_dev *dev) else dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; - if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive)) + if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive)) caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && @@ -388,13 +406,29 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? pfault->wqe.wq_num : pfault->token; u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {}; + void *info; int err; MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME); - MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type); - MLX5_SET(page_fault_resume_in, in, token, pfault->token); - MLX5_SET(page_fault_resume_in, in, wq_number, wq_num); - MLX5_SET(page_fault_resume_in, in, error, !!error); + + if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.mem_page_fault_info); + MLX5_SET(mem_page_fault_info, info, fault_token_31_0, + pfault->token & 0xffffffff); + MLX5_SET(mem_page_fault_info, info, fault_token_47_32, + (pfault->token >> 32) & 0xffff); + MLX5_SET(mem_page_fault_info, info, error, !!error); + } else { + info = MLX5_ADDR_OF(page_fault_resume_in, in, + page_fault_info.trans_page_fault_info); + MLX5_SET(trans_page_fault_info, info, page_fault_type, + pfault->type); + MLX5_SET(trans_page_fault_info, info, fault_token, + pfault->token); + MLX5_SET(trans_page_fault_info, info, wq_number, wq_num); + MLX5_SET(trans_page_fault_info, info, error, !!error); + } err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in); if (err) @@ -468,6 +502,16 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, } xa_unlock(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_KERNEL); + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + xa_erase(&imr->implicit_children, idx); + goto out_mr; + } + mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD; + } mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr); return mr; @@ -478,6 +522,57 @@ out_mr: return ret; } +/* + * When using memory scheme ODP, implicit MRs can't use the reserved null mkey + * and each implicit MR needs to assign a private null mkey to get the page + * faults on. + * The null mkey is created with the properties to enable getting the page + * fault for every time it is accessed and having all relevant access flags. + */ +static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *imr, + struct mlx5_ib_pd *pd) +{ + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64; + void *mkc; + u32 *in; + int err; + + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4); + MLX5_SET(create_mkey_in, in, pg_access, 1); + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + + MLX5_SET(mkc, mkc, translations_octword_size, 4); + MLX5_SET(mkc, mkc, log_page_size, 61); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, pd, pd->pdn); + MLX5_SET64(mkc, mkc, start_addr, 0); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + + err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen); + if (err) + goto free_in; + + imr->null_mmkey.type = MLX5_MKEY_NULL; + +free_in: + kfree(in); + return err; +} + struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, int access_flags) { @@ -510,6 +605,16 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, imr->is_odp_implicit = true; xa_init(&imr->implicit_children); + if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { + err = alloc_implicit_mr_null_mkey(dev, imr, pd); + if (err) + goto out_mr; + + err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey); + if (err) + goto out_mr; + } + err = mlx5r_umr_update_xlt(imr, 0, mlx5_imr_ksm_entries, MLX5_KSM_PAGE_SHIFT, @@ -544,6 +649,14 @@ void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr) xa_erase(&mr->implicit_children, idx); mlx5_ib_dereg_mr(&mtt->ibmr, NULL); } + + if (mr->null_mmkey.key) { + xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->null_mmkey.key)); + + mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, + mr->null_mmkey.key); + } } #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) @@ -693,7 +806,7 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); u32 xlt_flags = 0; int err; - unsigned int page_size; + unsigned long page_size; if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; @@ -710,7 +823,10 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - err = mlx5r_umr_update_mr_pas(mr, xlt_flags); + if (mr->data_direct) + err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags); + else + err = mlx5r_umr_update_mr_pas(mr, xlt_flags); } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); @@ -733,24 +849,31 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, * >0: Number of pages mapped */ static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, - u32 *bytes_mapped, u32 flags) + u32 *bytes_mapped, u32 flags, bool permissive_fault) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); - if (unlikely(io_virt < mr->ibmr.iova)) + if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault) return -EFAULT; if (mr->umem->is_dmabuf) return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags); if (!odp->is_implicit_odp) { + u64 offset = io_virt < mr->ibmr.iova ? 0 : io_virt - mr->ibmr.iova; u64 user_va; - if (check_add_overflow(io_virt - mr->ibmr.iova, - (u64)odp->umem.address, &user_va)) + if (check_add_overflow(offset, (u64)odp->umem.address, + &user_va)) return -EFAULT; - if (unlikely(user_va >= ib_umem_end(odp) || - ib_umem_end(odp) - user_va < bcnt)) + + if (permissive_fault) { + if (user_va < ib_umem_start(odp)) + user_va = ib_umem_start(odp); + if ((user_va + bcnt) > ib_umem_end(odp)) + bcnt = ib_umem_end(odp) - user_va; + } else if (unlikely(user_va >= ib_umem_end(odp) || + ib_umem_end(odp) - user_va < bcnt)) return -EFAULT; return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped, flags); @@ -797,6 +920,27 @@ static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key) return mmkey->key == key; } +static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key) +{ + struct mlx5_ib_mkey *mmkey; + + xa_lock(&dev->odp_mkeys); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + mmkey = ERR_PTR(-ENOENT); + goto out; + } + if (!mkey_is_eq(mmkey, key)) { + mmkey = ERR_PTR(-EFAULT); + goto out; + } + refcount_inc(&mmkey->usecount); +out: + xa_unlock(&dev->odp_mkeys); + + return mmkey; +} + /* * Handle a single data segment in a page-fault WQE or RDMA region. * @@ -824,32 +968,24 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, io_virt += *bytes_committed; bcnt -= *bytes_committed; - next_mr: - xa_lock(&dev->odp_mkeys); - mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); - if (!mmkey) { - xa_unlock(&dev->odp_mkeys); - mlx5_ib_dbg( - dev, - "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); - if (bytes_mapped) - *bytes_mapped += bcnt; - /* - * The user could specify a SGL with multiple lkeys and only - * some of them are ODP. Treat the non-ODP ones as fully - * faulted. - */ - ret = 0; - goto end; - } - refcount_inc(&mmkey->usecount); - xa_unlock(&dev->odp_mkeys); - - if (!mkey_is_eq(mmkey, key)) { - mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); - ret = -EFAULT; + mmkey = find_odp_mkey(dev, key); + if (IS_ERR(mmkey)) { + ret = PTR_ERR(mmkey); + if (ret == -ENOENT) { + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and + * only some of them are ODP. Treat the non-ODP ones as + * fully faulted. + */ + ret = 0; + } goto end; } @@ -857,7 +993,7 @@ next_mr: case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false); if (ret < 0) goto end; @@ -944,7 +1080,7 @@ next_mr: } end: - if (mmkey) + if (!IS_ERR(mmkey)) mlx5r_deref_odp_mkey(mmkey); while (head) { frame = head; @@ -1266,7 +1402,7 @@ read_user: if (ret) mlx5_ib_err( dev, - "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n", + "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n", ret, wqe_index, pfault->token); resolve_page_fault: @@ -1325,13 +1461,13 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } else if (ret < 0 || pages_in_range(address, length) > ret) { mlx5_ib_page_fault_resume(dev, pfault, 1); if (ret != -ENOENT) - mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n", ret, pfault->token, pfault->type); return; } mlx5_ib_page_fault_resume(dev, pfault, 0); - mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%llx, type: 0x%x, prefetch_activated: %d\n", pfault->token, pfault->type, prefetch_activated); @@ -1347,12 +1483,80 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, prefetch_len, &bytes_committed, NULL); if (ret < 0 && ret != -EAGAIN) { - mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); } } } +#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7) +static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) +{ + u64 prefetch_va = + pfault->memory.va - pfault->memory.prefetch_before_byte_count; + size_t prefetch_size = pfault->memory.prefetch_before_byte_count + + pfault->memory.fault_byte_count + + pfault->memory.prefetch_after_byte_count; + struct mlx5_ib_mkey *mmkey; + struct mlx5_ib_mr *mr, *child_mr; + int ret = 0; + + mmkey = find_odp_mkey(dev, pfault->memory.mkey); + if (IS_ERR(mmkey)) + goto err; + + switch (mmkey->type) { + case MLX5_MKEY_IMPLICIT_CHILD: + child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + mr = child_mr->parent; + break; + case MLX5_MKEY_NULL: + mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey); + break; + default: + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + break; + } + + /* If prefetch fails, handle only demanded page fault */ + ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true); + if (ret < 0) { + ret = pagefault_mr(mr, pfault->memory.va, + pfault->memory.fault_byte_count, NULL, 0, + true); + if (ret < 0) + goto err; + } + + mlx5_update_odp_stats(mr, faults, ret); + mlx5r_deref_odp_mkey(mmkey); + + if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST) + mlx5_ib_page_fault_resume(dev, pfault, 0); + + mlx5_ib_dbg( + dev, + "PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n", + pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ? + "" : + "without resume cmd", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count); + + return; + +err: + if (!IS_ERR(mmkey)) + mlx5r_deref_odp_mkey(mmkey); + mlx5_ib_page_fault_resume(dev, pfault, 1); + mlx5_ib_dbg( + dev, + "PAGE FAULT error. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n", + pfault->token, pfault->memory.mkey, pfault->memory.va, + pfault->memory.fault_byte_count, ret); +} + static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { u8 event_subtype = pfault->event_subtype; @@ -1364,6 +1568,9 @@ static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfaul case MLX5_PFAULT_SUBTYPE_RDMA: mlx5_ib_mr_rdma_pfault_handler(dev, pfault); break; + case MLX5_PFAULT_SUBTYPE_MEMORY: + mlx5_ib_mr_memory_pfault_handler(dev, pfault); + break; default: mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", event_subtype); @@ -1382,6 +1589,7 @@ static void mlx5_ib_eqe_pf_action(struct work_struct *work) mempool_free(pfault, eq->pool); } +#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096 static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) { struct mlx5_eqe_page_fault *pf_eqe; @@ -1398,15 +1606,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) pf_eqe = &eqe->data.page_fault; pfault->event_subtype = eqe->sub_type; - pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed); - - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n", - eqe->sub_type, pfault->bytes_committed); switch (eqe->sub_type) { case MLX5_PFAULT_SUBTYPE_RDMA: /* RDMA based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->rdma.bytes_committed); pfault->type = be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24; pfault->token = @@ -1420,10 +1625,12 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be32_to_cpu(pf_eqe->rdma.rdma_op_len); pfault->rdma.rdma_va = be64_to_cpu(pf_eqe->rdma.rdma_va); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n", - pfault->type, pfault->token, - pfault->rdma.r_key); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, + pfault->rdma.r_key); mlx5_ib_dbg(eq->dev, "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n", pfault->rdma.rdma_op_len, @@ -1432,6 +1639,8 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) case MLX5_PFAULT_SUBTYPE_WQE: /* WQE based event */ + pfault->bytes_committed = + be32_to_cpu(pf_eqe->wqe.bytes_committed); pfault->type = (be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7; pfault->token = @@ -1443,11 +1652,47 @@ static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq) be16_to_cpu(pf_eqe->wqe.wqe_index); pfault->wqe.packet_size = be16_to_cpu(pf_eqe->wqe.packet_length); - mlx5_ib_dbg(eq->dev, - "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n", - pfault->type, pfault->token, - pfault->wqe.wq_num, - pfault->wqe.wqe_index); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n", + eqe->sub_type, pfault->bytes_committed, + pfault->type, pfault->token, pfault->wqe.wq_num, + pfault->wqe.wqe_index); + break; + + case MLX5_PFAULT_SUBTYPE_MEMORY: + /* Memory based event */ + pfault->bytes_committed = 0; + pfault->token = + be32_to_cpu(pf_eqe->memory.token31_0) | + ((u64)be16_to_cpu(pf_eqe->memory.token47_32) + << 32); + pfault->memory.va = be64_to_cpu(pf_eqe->memory.va); + pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey); + pfault->memory.fault_byte_count = (be32_to_cpu( + pf_eqe->memory.demand_fault_pages) >> 12) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_before_byte_count = + be16_to_cpu( + pf_eqe->memory.pre_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.prefetch_after_byte_count = + be16_to_cpu( + pf_eqe->memory.post_demand_fault_pages) * + MEMORY_SCHEME_PAGE_FAULT_GRANULARITY; + pfault->memory.flags = pf_eqe->memory.flags; + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n", + eqe->sub_type, pfault->token, + pfault->memory.mkey, + pfault->memory.fault_byte_count, + pfault->memory.va, pfault->memory.flags); + mlx5_ib_dbg( + eq->dev, + "PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n", + pfault->memory.prefetch_before_byte_count, + pfault->memory.prefetch_after_byte_count); break; default: @@ -1710,7 +1955,7 @@ static void mlx5_ib_prefetch_mr_work(struct work_struct *w) for (i = 0; i < work->num_sge; ++i) { ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, work->frags[i].length, &bytes_mapped, - work->pf_flags); + work->pf_flags, false); if (ret <= 0) continue; mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret); @@ -1761,7 +2006,7 @@ static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, if (IS_ERR(mr)) return PTR_ERR(mr); ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, - &bytes_mapped, pf_flags); + &bytes_mapped, pf_flags, false); if (ret < 0) { mlx5r_deref_odp_mkey(&mr->mmkey); return ret; diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bbfcce3bdc84..bdb568411091 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -10,6 +10,7 @@ #include <linux/mlx5/eswitch.h> #include <linux/mlx5/vport.h> #include "mlx5_ib.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include <rdma/uverbs_named_ioctl.h> @@ -111,6 +112,23 @@ out: return err; } +static int fill_multiport_info(struct mlx5_ib_dev *dev, u32 port_num, + struct mlx5_ib_uapi_query_port *info) +{ + struct mlx5_core_dev *mdev; + + mdev = mlx5_ib_get_native_port_mdev(dev, port_num, NULL); + if (!mdev) + return -EINVAL; + + info->vport_vhca_id = MLX5_CAP_GEN(mdev, vhca_id); + info->flags |= MLX5_IB_UAPI_QUERY_PORT_VPORT_VHCA_ID; + + mlx5_ib_put_native_port_mdev(dev, port_num); + + return 0; +} + static int fill_switchdev_info(struct mlx5_ib_dev *dev, u32 port_num, struct mlx5_ib_uapi_query_port *info) { @@ -177,12 +195,60 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)( ret = fill_switchdev_info(dev, port_num, &info); if (ret) return ret; + } else if (mlx5_core_mp_enabled(dev->mdev)) { + ret = fill_multiport_info(dev, port_num, &info); + if (ret) + return ret; } return uverbs_copy_to_struct_or_zero(attrs, MLX5_IB_ATTR_QUERY_PORT, &info, sizeof(info)); } +static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_data_direct_dev *data_direct_dev; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); + u32 dev_path_len; + char *dev_path; + int ret; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -ENODEV; + goto end; + } + + dev_path = kobject_get_path(&data_direct_dev->device->kobj, GFP_KERNEL); + if (!dev_path) { + ret = -ENOMEM; + goto end; + } + + dev_path_len = strlen(dev_path) + 1; + if (dev_path_len > out_len) { + ret = -ENOSPC; + goto end; + } + + ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, + dev_path_len); + kfree(dev_path); + +end: + mutex_unlock(&dev->data_direct_lock); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_QUERY_PORT, UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM, @@ -193,9 +259,17 @@ DECLARE_UVERBS_NAMED_METHOD( reg_c0), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_MIN_SIZE(0), + UA_MANDATORY)); + ADD_UVERBS_METHODS(mlx5_ib_device, UVERBS_OBJECT_DEVICE, - &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT)); + &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT), + &UVERBS_METHOD(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)); DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_PD_QUERY, diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index ffc31b01f690..887fd6fa3ba9 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -224,6 +224,9 @@ int mlx5r_umr_init(struct mlx5_ib_dev *dev) void mlx5r_umr_cleanup(struct mlx5_ib_dev *dev) { + if (!dev->umrc.pd) + return; + mutex_destroy(&dev->umrc.init_lock); ib_dealloc_pd(dev->umrc.pd); } @@ -632,44 +635,47 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, wqe->data_seg.byte_count = cpu_to_be32(sg->length); } -/* - * Send the DMA list to the HW for a normal MR using UMR. - * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP - * flag may be used. - */ -int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +static int +_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) { + size_t ent_size = dd ? sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; struct mlx5r_umr_wqe wqe = {}; struct ib_block_iter biter; + struct mlx5_ksm *cur_ksm; struct mlx5_mtt *cur_mtt; size_t orig_sg_length; - struct mlx5_mtt *mtt; size_t final_size; + void *curr_entry; struct ib_sge sg; + void *entry; u64 offset = 0; int err = 0; - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - mtt = mlx5r_umr_create_xlt( - dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), - sizeof(*mtt), flags); - if (!mtt) + entry = mlx5r_umr_create_xlt(dev, &sg, + ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), + ent_size, flags); + if (!entry) return -ENOMEM; orig_sg_length = sg.length; - mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, mr->page_shift); + if (dd) { + /* Use the data direct internal kernel PD */ + MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + cur_ksm = entry; + } else { + cur_mtt = entry; + } + mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); - cur_mtt = mtt; + curr_entry = entry; rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { - if (cur_mtt == (void *)mtt + sg.length) { + if (curr_entry == entry + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -681,23 +687,31 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) DMA_TO_DEVICE); offset += sg.length; mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); - - cur_mtt = mtt; + if (dd) + cur_ksm = entry; + else + cur_mtt = entry; } - cur_mtt->ptag = - cpu_to_be64(rdma_block_iter_dma_address(&biter) | - MLX5_IB_MTT_PRESENT); - - if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) - cur_mtt->ptag = 0; - - cur_mtt++; + if (dd) { + cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + cur_ksm++; + curr_entry = cur_ksm; + } else { + cur_mtt->ptag = + cpu_to_be64(rdma_block_iter_dma_address(&biter) | + MLX5_IB_MTT_PRESENT); + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) + cur_mtt->ptag = 0; + cur_mtt++; + curr_entry = cur_mtt; + } } - final_size = (void *)cur_mtt - (void *)mtt; + final_size = curr_entry - entry; sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT); - memset(cur_mtt, 0, sg.length - final_size); + memset(curr_entry, 0, sg.length - final_size); mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -705,10 +719,32 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) err: sg.length = orig_sg_length; - mlx5r_umr_unmap_free_xlt(dev, mtt, &sg); + mlx5r_umr_unmap_free_xlt(dev, entry, &sg); return err; } +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + /* No invalidation flow is expected */ + if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, true); +} + +/* + * Send the DMA list to the HW for a normal MR using UMR. + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP + * flag may be used. + */ +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, false); +} + static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 5f734dc72bef..4a02c9b5aad8 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -95,6 +95,7 @@ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags); int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags); int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index db3b25c8433a..4100656fe9a3 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -581,12 +581,9 @@ static int qib_create_workqueues(struct qib_devdata *dd) for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; if (!ppd->qib_wq) { - char wq_name[23]; - - snprintf(wq_name, sizeof(wq_name), "qib%d_%d", - dd->unit, pidx); - ppd->qib_wq = alloc_ordered_workqueue(wq_name, - WQ_MEM_RECLAIM); + ppd->qib_wq = alloc_ordered_workqueue("qib%d_%d", + WQ_MEM_RECLAIM, + dd->unit, pidx); if (!ppd->qib_wq) goto wq_error; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 07548fac1d8e..408fe1ba74b9 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -303,8 +303,6 @@ int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); -void qib_rc_rnr_retry(unsigned long arg); - void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr); int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); @@ -312,8 +310,6 @@ int qib_post_ud_send(struct rvt_qp *qp, const struct ib_send_wr *wr); void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, int has_grh, void *data, u32 tlen, struct rvt_qp *qp); -void mr_rcu_callback(struct rcu_head *list); - void qib_migrate_qp(struct rvt_qp *qp); int qib_ruc_check_hdr(struct qib_ibport *ibp, struct ib_header *hdr, diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 7a9afd5231d5..5ed5cfc2b280 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -348,13 +348,13 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, umem = ib_umem_get(pd->device, start, length, mr_access_flags); if (IS_ERR(umem)) - return (void *)umem; + return ERR_CAST(umem); n = ib_umem_num_pages(umem); mr = __rvt_alloc_mr(n, pd); if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; + ret = ERR_CAST(mr); goto bail_umem; } @@ -542,7 +542,7 @@ struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, mr = __rvt_alloc_mr(max_num_sg, pd); if (IS_ERR(mr)) - return (struct ib_mr *)mr; + return ERR_CAST(mr); return &mr->ibmr; } diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h index 46f82b27fcd2..1f0322491d8c 100644 --- a/drivers/infiniband/sw/rxe/rxe_hdr.h +++ b/drivers/infiniband/sw/rxe/rxe_hdr.h @@ -234,7 +234,7 @@ static inline void __bth_set_resv6a(void *arg) { struct rxe_bth *bth = arg; - bth->qpn = cpu_to_be32(~BTH_RESV6A_MASK); + bth->qpn &= cpu_to_be32(~BTH_RESV6A_MASK); } static inline int __bth_ack(void *arg) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index 6596a85723c9..c11ab280551a 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -341,7 +341,7 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, /* * See IBA C9-92 * For UD QPs we only check if the packet will fit in the - * receive buffer later. For rmda operations additional + * receive buffer later. For RDMA operations additional * length checks are performed in check_rkey. */ if ((qp_type(qp) == IB_QPT_GSI) || (qp_type(qp) == IB_QPT_UD)) { @@ -351,7 +351,7 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, for (i = 0; i < qp->resp.wqe->dma.num_sge; i++) recv_buffer_len += qp->resp.wqe->dma.sge[i].length; - if (payload + 40 > recv_buffer_len) { + if (payload + sizeof(union rdma_network_hdr) > recv_buffer_len) { rxe_dbg_qp(qp, "The receive buffer is too small for this UD packet.\n"); return RESPST_ERR_LENGTH; } diff --git a/drivers/infiniband/sw/siw/siw.h b/drivers/infiniband/sw/siw/siw.h index 75253f2b3e3d..86d4d6a2170e 100644 --- a/drivers/infiniband/sw/siw/siw.h +++ b/drivers/infiniband/sw/siw/siw.h @@ -94,8 +94,6 @@ struct siw_device { atomic_t num_mr; atomic_t num_srq; atomic_t num_ctx; - - struct work_struct netdev_down; }; struct siw_ucontext { diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c index b2b54242aa69..17abef48abcd 100644 --- a/drivers/infiniband/sw/siw/siw_main.c +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -364,39 +364,6 @@ error: return NULL; } -/* - * Network link becomes unavailable. Mark all - * affected QP's accordingly. - */ -static void siw_netdev_down(struct work_struct *work) -{ - struct siw_device *sdev = - container_of(work, struct siw_device, netdev_down); - - struct siw_qp_attrs qp_attrs; - struct list_head *pos, *tmp; - - memset(&qp_attrs, 0, sizeof(qp_attrs)); - qp_attrs.state = SIW_QP_STATE_ERROR; - - list_for_each_safe(pos, tmp, &sdev->qp_list) { - struct siw_qp *qp = list_entry(pos, struct siw_qp, devq); - - down_write(&qp->state_lock); - WARN_ON(siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE)); - up_write(&qp->state_lock); - } - ib_device_put(&sdev->base_dev); -} - -static void siw_device_goes_down(struct siw_device *sdev) -{ - if (ib_device_try_get(&sdev->base_dev)) { - INIT_WORK(&sdev->netdev_down, siw_netdev_down); - schedule_work(&sdev->netdev_down); - } -} - static int siw_netdev_event(struct notifier_block *nb, unsigned long event, void *arg) { @@ -418,10 +385,6 @@ static int siw_netdev_event(struct notifier_block *nb, unsigned long event, siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); break; - case NETDEV_GOING_DOWN: - siw_device_goes_down(sdev); - break; - case NETDEV_DOWN: sdev->state = IB_PORT_DOWN; siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 963e936da5e3..abe0522b7df4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -509,12 +509,10 @@ struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, const char *format); int ipoib_intf_init(struct ib_device *hca, u32 port, const char *format, struct net_device *dev); -void ipoib_ib_tx_timer_func(struct timer_list *t); void ipoib_ib_dev_flush_light(struct work_struct *work); void ipoib_ib_dev_flush_normal(struct work_struct *work); void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_ib_tx_timeout_work(struct work_struct *work); -void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); int ipoib_ib_dev_open_default(struct net_device *dev); @@ -533,7 +531,6 @@ void ipoib_mcast_restart_task(struct work_struct *work); void ipoib_mcast_start_thread(struct net_device *dev); void ipoib_mcast_stop_thread(struct net_device *dev); -void ipoib_mcast_dev_down(struct net_device *dev); void ipoib_mcast_dev_flush(struct net_device *dev); int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req); @@ -610,7 +607,6 @@ int ipoib_set_mode(struct net_device *dev, const char *buf); void ipoib_setup_common(struct net_device *dev); -void ipoib_pkey_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 68429a5f796d..1d7ac24c4c00 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -507,10 +507,6 @@ void iser_task_rdma_finalize(struct iscsi_iser_task *task); void iser_free_rx_descriptors(struct iser_conn *iser_conn); -void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, - struct iser_data_buf *mem, - enum iser_data_dir cmd_dir); - int iser_reg_mem_fastreg(struct iscsi_iser_task *task, enum iser_data_dir dir, bool all_imm); diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.c b/drivers/infiniband/ulp/rtrs/rtrs-clt.c index 88106cf5ce55..71387811b281 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.c @@ -331,7 +331,7 @@ static void rtrs_clt_fast_reg_done(struct ib_cq *cq, struct ib_wc *wc) struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context); if (wc->status != IB_WC_SUCCESS) { - rtrs_err(con->c.path, "Failed IB_WR_REG_MR: %s\n", + rtrs_err_rl(con->c.path, "Failed IB_WR_REG_MR: %s\n", ib_wc_status_msg(wc->status)); rtrs_rdma_error_recovery(con); } @@ -351,11 +351,11 @@ static void rtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc) struct rtrs_clt_con *con = to_clt_con(wc->qp->qp_context); if (wc->status != IB_WC_SUCCESS) { - rtrs_err(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n", + rtrs_err_rl(con->c.path, "Failed IB_WR_LOCAL_INV: %s\n", ib_wc_status_msg(wc->status)); rtrs_rdma_error_recovery(con); } - req->need_inv = false; + req->mr->need_inval = false; if (req->need_inv_comp) complete(&req->inv_comp); else @@ -391,12 +391,13 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, clt_path = to_clt_path(con->c.path); if (req->sg_cnt) { - if (req->dir == DMA_FROM_DEVICE && req->need_inv) { + if (req->mr->need_inval) { /* - * We are here to invalidate read requests + * We are here to invalidate read/write requests * ourselves. In normal scenario server should - * send INV for all read requests, but - * we are here, thus two things could happen: + * send INV for all read requests, we do local + * invalidate for write requests ourselves, but + * we are here, thus three things could happen: * * 1. this is failover, when errno != 0 * and can_wait == 1, @@ -404,6 +405,9 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, * 2. something totally bad happened and * server forgot to send INV, so we * should do that ourselves. + * + * 3. write request finishes, we need to do local + * invalidate */ if (can_wait) { @@ -418,18 +422,10 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, refcount_inc(&req->ref); err = rtrs_inv_rkey(req); if (err) { - rtrs_err(con->c.path, "Send INV WR key=%#x: %d\n", + rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %d\n", req->mr->rkey, err); } else if (can_wait) { wait_for_completion(&req->inv_comp); - } else { - /* - * Something went wrong, so request will be - * completed from INV callback. - */ - WARN_ON_ONCE(1); - - return; } if (!refcount_dec_and_test(&req->ref)) return; @@ -446,8 +442,10 @@ static void complete_rdma_req(struct rtrs_clt_io_req *req, int errno, req->con = NULL; if (errno) { - rtrs_err_rl(con->c.path, "IO request failed: error=%d path=%s [%s:%u] notify=%d\n", - errno, kobject_name(&clt_path->kobj), clt_path->hca_name, + rtrs_err_rl(con->c.path, + "IO %s request failed: error=%d path=%s [%s:%u] notify=%d\n", + req->dir == DMA_TO_DEVICE ? "write" : "read", errno, + kobject_name(&clt_path->kobj), clt_path->hca_name, clt_path->hca_port, notify); } @@ -501,7 +499,7 @@ static void process_io_rsp(struct rtrs_clt_path *clt_path, u32 msg_id, req = &clt_path->reqs[msg_id]; /* Drop need_inv if server responded with send with invalidation */ - req->need_inv &= !w_inval; + req->mr->need_inval &= !w_inval; complete_rdma_req(req, errno, true, false); } @@ -626,6 +624,7 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) */ if (WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done)) return; + clt_path->s.hb_missed_cnt = 0; rtrs_from_imm(be32_to_cpu(wc->ex.imm_data), &imm_type, &imm_payload); if (imm_type == RTRS_IO_RSP_IMM || @@ -643,7 +642,6 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) return rtrs_clt_recv_done(con, wc); } else if (imm_type == RTRS_HB_ACK_IMM) { WARN_ON(con->c.cid); - clt_path->s.hb_missed_cnt = 0; clt_path->s.hb_cur_latency = ktime_sub(ktime_get(), clt_path->s.hb_last_sent); if (clt_path->flags & RTRS_MSG_NEW_RKEY_F) @@ -670,6 +668,7 @@ static void rtrs_clt_rdma_done(struct ib_cq *cq, struct ib_wc *wc) /* * Key invalidations from server side */ + clt_path->s.hb_missed_cnt = 0; WARN_ON(!(wc->wc_flags & IB_WC_WITH_INVALIDATE || wc->wc_flags & IB_WC_WITH_IMM)); WARN_ON(wc->wr_cqe->done != rtrs_clt_rdma_done); @@ -967,7 +966,7 @@ static void rtrs_clt_init_req(struct rtrs_clt_io_req *req, req->dir = dir; req->con = rtrs_permit_to_clt_con(clt_path, permit); req->conf = conf; - req->need_inv = false; + req->mr->need_inval = false; req->need_inv_comp = false; req->inv_errno = 0; refcount_set(&req->ref, 1); @@ -1089,7 +1088,6 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) int ret, count = 0; u32 imm, buf_id; struct ib_reg_wr rwr; - struct ib_send_wr inv_wr; struct ib_send_wr *wr = NULL; bool fr_en = false; @@ -1130,13 +1128,6 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) req->sg_cnt, req->dir); return ret; } - inv_wr = (struct ib_send_wr) { - .opcode = IB_WR_LOCAL_INV, - .wr_cqe = &req->inv_cqe, - .send_flags = IB_SEND_SIGNALED, - .ex.invalidate_rkey = req->mr->rkey, - }; - req->inv_cqe.done = rtrs_clt_inv_rkey_done; rwr = (struct ib_reg_wr) { .wr.opcode = IB_WR_REG_MR, .wr.wr_cqe = &fast_reg_cqe, @@ -1146,7 +1137,7 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) }; wr = &rwr.wr; fr_en = true; - refcount_inc(&req->ref); + req->mr->need_inval = true; } /* * Update stats now, after request is successfully sent it is not @@ -1156,7 +1147,7 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) ret = rtrs_post_rdma_write_sg(req->con, req, rbuf, fr_en, count, req->usr_len + sizeof(*msg), - imm, wr, &inv_wr); + imm, wr, NULL); if (ret) { rtrs_err_rl(s, "Write request failed: error=%d path=%s [%s:%u]\n", @@ -1164,6 +1155,10 @@ static int rtrs_clt_write_req(struct rtrs_clt_io_req *req) clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); + if (req->mr->need_inval) { + req->mr->need_inval = false; + refcount_dec(&req->ref); + } if (req->sg_cnt) ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist, req->sg_cnt, req->dir); @@ -1213,7 +1208,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) ret = rtrs_map_sg_fr(req, count); if (ret < 0) { rtrs_err_rl(s, - "Read request failed, failed to map fast reg. data, err: %d\n", + "Read request failed, failed to map fast reg. data, err: %d\n", ret); ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, req->dir); @@ -1237,7 +1232,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) msg->desc[0].len = cpu_to_le32(req->mr->length); /* Further invalidation is required */ - req->need_inv = !!RTRS_MSG_NEED_INVAL_F; + req->mr->need_inval = !!RTRS_MSG_NEED_INVAL_F; } else { msg->sg_cnt = 0; @@ -1270,7 +1265,7 @@ static int rtrs_clt_read_req(struct rtrs_clt_io_req *req) clt_path->hca_port); if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) atomic_dec(&clt_path->stats->inflight); - req->need_inv = false; + req->mr->need_inval = false; if (req->sg_cnt) ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, req->dir); @@ -1494,7 +1489,9 @@ static bool rtrs_clt_change_state_get_old(struct rtrs_clt_path *clt_path, static void rtrs_clt_hb_err_handler(struct rtrs_con *c) { struct rtrs_clt_con *con = container_of(c, typeof(*con), c); + struct rtrs_clt_path *clt_path = to_clt_path(con->c.path); + rtrs_err(con->c.path, "HB err handler for path=%s\n", kobject_name(&clt_path->kobj)); rtrs_rdma_error_recovery(con); } @@ -2346,6 +2343,12 @@ static int init_conns(struct rtrs_clt_path *clt_path) if (err) goto destroy; } + + /* + * Set the cid to con_num - 1, since if we fail later, we want to stay in bounds. + */ + cid = clt_path->s.con_num - 1; + err = alloc_path_reqs(clt_path); if (err) goto destroy; @@ -3140,8 +3143,20 @@ close_path: return err; } +void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), + ibevent->event); +} + + static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev) { + INIT_IB_EVENT_HANDLER(&dev->event_handler, dev->ib_dev, + rtrs_clt_ib_event_handler); + ib_register_event_handler(&dev->event_handler); + if (!(dev->ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) { pr_err("Memory registrations not supported.\n"); @@ -3151,8 +3166,15 @@ static int rtrs_clt_ib_dev_init(struct rtrs_ib_dev *dev) return 0; } +static void rtrs_clt_ib_dev_deinit(struct rtrs_ib_dev *dev) +{ + ib_unregister_event_handler(&dev->event_handler); +} + + static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = { - .init = rtrs_clt_ib_dev_init + .init = rtrs_clt_ib_dev_init, + .deinit = rtrs_clt_ib_dev_deinit }; static int __init rtrs_client_init(void) diff --git a/drivers/infiniband/ulp/rtrs/rtrs-clt.h b/drivers/infiniband/ulp/rtrs/rtrs-clt.h index f848c0392d98..0f57759b3080 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-clt.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-clt.h @@ -115,7 +115,6 @@ struct rtrs_clt_io_req { struct completion inv_comp; int inv_errno; bool need_inv_comp; - bool need_inv; refcount_t ref; }; @@ -213,6 +212,8 @@ int rtrs_clt_remove_path_from_sysfs(struct rtrs_clt_path *path, void rtrs_clt_set_max_reconnect_attempts(struct rtrs_clt_sess *clt, int value); int rtrs_clt_get_max_reconnect_attempts(const struct rtrs_clt_sess *clt); void free_path(struct rtrs_clt_path *clt_path); +void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent); /* rtrs-clt-stats.c */ diff --git a/drivers/infiniband/ulp/rtrs/rtrs-pri.h b/drivers/infiniband/ulp/rtrs/rtrs-pri.h index ab25619261d2..ef29bd483b5a 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-pri.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-pri.h @@ -69,6 +69,7 @@ struct rtrs_ib_dev; struct rtrs_rdma_dev_pd_ops { int (*init)(struct rtrs_ib_dev *dev); + void (*deinit)(struct rtrs_ib_dev *dev); }; struct rtrs_rdma_dev_pd { @@ -84,6 +85,7 @@ struct rtrs_ib_dev { struct kref ref; struct list_head entry; struct rtrs_rdma_dev_pd *pool; + struct ib_event_handler event_handler; }; struct rtrs_con { diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c index 1d33efb8fb03..e83d95647852 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c @@ -26,7 +26,10 @@ MODULE_LICENSE("GPL"); #define DEFAULT_SESS_QUEUE_DEPTH 512 #define MAX_HDR_SIZE PAGE_SIZE -static struct rtrs_rdma_dev_pd dev_pd; +static const struct rtrs_rdma_dev_pd_ops dev_pd_ops; +static struct rtrs_rdma_dev_pd dev_pd = { + .ops = &dev_pd_ops +}; const struct class rtrs_dev_class = { .name = "rtrs-server", }; @@ -672,6 +675,10 @@ err: static void rtrs_srv_hb_err_handler(struct rtrs_con *c) { + struct rtrs_srv_con *con = container_of(c, typeof(*con), c); + struct rtrs_srv_path *srv_path = to_srv_path(con->c.path); + + rtrs_err(con->c.path, "HB err handler for path=%s\n", kobject_name(&srv_path->kobj)); close_path(to_srv_path(c->path)); } @@ -931,12 +938,11 @@ static void rtrs_srv_info_req_done(struct ib_cq *cq, struct ib_wc *wc) if (err) goto close; -out: rtrs_iu_free(iu, srv_path->s.dev->ib_dev, 1); return; close: + rtrs_iu_free(iu, srv_path->s.dev->ib_dev, 1); close_path(srv_path); - goto out; } static int post_recv_info_req(struct rtrs_srv_con *con) @@ -987,6 +993,16 @@ static int post_recv_path(struct rtrs_srv_path *srv_path) q_size = SERVICE_CON_QUEUE_DEPTH; else q_size = srv->queue_depth; + if (srv_path->state != RTRS_SRV_CONNECTING) { + rtrs_err(s, "Path state invalid. state %s\n", + rtrs_srv_state_str(srv_path->state)); + return -EIO; + } + + if (!srv_path->s.con[cid]) { + rtrs_err(s, "Conn not set for %d\n", cid); + return -EIO; + } err = post_recv_io(to_srv_con(srv_path->s.con[cid]), q_size); if (err) { @@ -1229,6 +1245,7 @@ static void rtrs_srv_rdma_done(struct ib_cq *cq, struct ib_wc *wc) */ if (WARN_ON(wc->wr_cqe != &io_comp_cqe)) return; + srv_path->s.hb_missed_cnt = 0; err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); if (err) { rtrs_err(s, "rtrs_post_recv(), err: %d\n", err); @@ -2255,6 +2272,34 @@ static int check_module_params(void) return 0; } +void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent) +{ + pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), + ibevent->event); +} + +static int rtrs_srv_ib_dev_init(struct rtrs_ib_dev *dev) +{ + INIT_IB_EVENT_HANDLER(&dev->event_handler, dev->ib_dev, + rtrs_srv_ib_event_handler); + ib_register_event_handler(&dev->event_handler); + + return 0; +} + +static void rtrs_srv_ib_dev_deinit(struct rtrs_ib_dev *dev) +{ + ib_unregister_event_handler(&dev->event_handler); +} + + +static const struct rtrs_rdma_dev_pd_ops dev_pd_ops = { + .init = rtrs_srv_ib_dev_init, + .deinit = rtrs_srv_ib_dev_deinit +}; + + static int __init rtrs_server_init(void) { int err; diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.h b/drivers/infiniband/ulp/rtrs/rtrs-srv.h index 5e325b82ff33..014f85681f37 100644 --- a/drivers/infiniband/ulp/rtrs/rtrs-srv.h +++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.h @@ -132,6 +132,8 @@ struct rtrs_srv_ib_ctx { extern const struct class rtrs_dev_class; void close_path(struct rtrs_srv_path *srv_path); +void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, + struct ib_event *ibevent); static inline void rtrs_srv_update_rdma_stats(struct rtrs_srv_stats *s, size_t size, int d) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c index cf8045b92689..8577db3308cc 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c @@ -445,6 +445,34 @@ static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports) return mlx5_cmd_modify_lag(dev0, ldev->ports, ports); } +static struct net_device *mlx5_lag_active_backup_get_netdev(struct mlx5_core_dev *dev) +{ + struct net_device *ndev = NULL; + struct mlx5_lag *ldev; + unsigned long flags; + int i; + + spin_lock_irqsave(&lag_lock, flags); + ldev = mlx5_lag_dev(dev); + + if (!ldev) + goto unlock; + + for (i = 0; i < ldev->ports; i++) + if (ldev->tracker.netdev_state[i].tx_enabled) + ndev = ldev->pf[i].netdev; + if (!ndev) + ndev = ldev->pf[ldev->ports - 1].netdev; + + if (ndev) + dev_hold(ndev); + +unlock: + spin_unlock_irqrestore(&lag_lock, flags); + + return ndev; +} + void mlx5_modify_lag(struct mlx5_lag *ldev, struct lag_tracker *tracker) { @@ -477,9 +505,18 @@ void mlx5_modify_lag(struct mlx5_lag *ldev, } } - if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP && - !(ldev->mode == MLX5_LAG_MODE_ROCE)) - mlx5_lag_drop_rule_setup(ldev, tracker); + if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { + struct net_device *ndev = mlx5_lag_active_backup_get_netdev(dev0); + + if(!(ldev->mode == MLX5_LAG_MODE_ROCE)) + mlx5_lag_drop_rule_setup(ldev, tracker); + /** Only sriov and roce lag should have tracker->tx_type set so + * no need to check the mode + */ + blocking_notifier_call_chain(&dev0->priv.lag_nh, + MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE, + ndev); + } } static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev, @@ -613,6 +650,7 @@ static int mlx5_create_lag(struct mlx5_lag *ldev, mlx5_core_err(dev0, "Failed to deactivate RoCE LAG; driver restart required\n"); } + BLOCKING_INIT_NOTIFIER_HEAD(&dev0->priv.lag_nh); return err; } @@ -1492,38 +1530,6 @@ void mlx5_lag_enable_change(struct mlx5_core_dev *dev) mlx5_queue_bond_work(ldev, 0); } -struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev) -{ - struct net_device *ndev = NULL; - struct mlx5_lag *ldev; - unsigned long flags; - int i; - - spin_lock_irqsave(&lag_lock, flags); - ldev = mlx5_lag_dev(dev); - - if (!(ldev && __mlx5_lag_is_roce(ldev))) - goto unlock; - - if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) { - for (i = 0; i < ldev->ports; i++) - if (ldev->tracker.netdev_state[i].tx_enabled) - ndev = ldev->pf[i].netdev; - if (!ndev) - ndev = ldev->pf[ldev->ports - 1].netdev; - } else { - ndev = ldev->pf[MLX5_LAG_P1].netdev; - } - if (ndev) - dev_hold(ndev); - -unlock: - spin_unlock_irqrestore(&lag_lock, flags); - - return ndev; -} -EXPORT_SYMBOL(mlx5_lag_get_roce_netdev); - u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev, struct net_device *slave) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 8b0abd61eca6..220a9ac75c8b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -454,8 +454,8 @@ static int handle_hca_cap_atomic(struct mlx5_core_dev *dev, void *set_ctx) static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) { + bool do_set = false, mem_page_fault = false; void *set_hca_cap; - bool do_set = false; int err; if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || @@ -470,6 +470,17 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) memcpy(set_hca_cap, dev->caps.hca[MLX5_CAP_ODP]->cur, MLX5_ST_SZ_BYTES(odp_cap)); + /* For best performance, enable memory scheme ODP only when + * it has page prefetch enabled. + */ + if (MLX5_CAP_ODP_MAX(dev, mem_page_fault) && + MLX5_CAP_ODP_MAX(dev, memory_page_fault_scheme_cap.page_prefetch)) { + mem_page_fault = true; + do_set = true; + MLX5_SET(odp_cap, set_hca_cap, mem_page_fault, mem_page_fault); + goto set; + } + #define ODP_CAP_SET_MAX(dev, field) \ do { \ u32 _res = MLX5_CAP_ODP_MAX(dev, field); \ @@ -479,25 +490,28 @@ static int handle_hca_cap_odp(struct mlx5_core_dev *dev, void *set_ctx) } \ } while (0) - ODP_CAP_SET_MAX(dev, ud_odp_caps.srq_receive); - ODP_CAP_SET_MAX(dev, rc_odp_caps.srq_receive); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.srq_receive); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.send); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.receive); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.write); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.read); - ODP_CAP_SET_MAX(dev, xrc_odp_caps.atomic); - ODP_CAP_SET_MAX(dev, dc_odp_caps.srq_receive); - ODP_CAP_SET_MAX(dev, dc_odp_caps.send); - ODP_CAP_SET_MAX(dev, dc_odp_caps.receive); - ODP_CAP_SET_MAX(dev, dc_odp_caps.write); - ODP_CAP_SET_MAX(dev, dc_odp_caps.read); - ODP_CAP_SET_MAX(dev, dc_odp_caps.atomic); - - if (!do_set) - return 0; - - return set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ODP); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.ud_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.rc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.send); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.write); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.read); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.xrc_odp_caps.atomic); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.srq_receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.send); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.receive); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.write); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.read); + ODP_CAP_SET_MAX(dev, transport_page_fault_scheme_cap.dc_odp_caps.atomic); + +set: + if (do_set) + err = set_caps(dev, set_ctx, MLX5_SET_HCA_CAP_OP_MOD_ODP); + + mlx5_core_dbg(dev, "Using ODP %s scheme\n", + mem_page_fault ? "memory" : "transport"); + return err; } static int max_uc_list_get_devlink_param(struct mlx5_core_dev *dev) |