diff options
author | Ingo Molnar <mingo@kernel.org> | 2017-03-01 09:02:26 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2017-03-01 09:02:26 +0100 |
commit | 0871d5a66da5c41151e0896a90298b163e42f2e0 (patch) | |
tree | 1ba71fab9016cb28bb9d18ffd62b6b744f2f761c /drivers/infiniband/hw/mlx5 | |
parent | x86/boot: Fix pr_debug() API braindamage (diff) | |
parent | Merge tag 'for-linus-4.11' of git://git.code.sf.net/p/openipmi/linux-ipmi (diff) | |
download | linux-0871d5a66da5c41151e0896a90298b163e42f2e0.tar.xz linux-0871d5a66da5c41151e0896a90298b163e42f2e0.zip |
Merge branch 'linus' into WIP.x86/boot, to fix up conflicts and to pick up updates
Conflicts:
arch/x86/xen/setup.c
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r-- | drivers/infiniband/hw/mlx5/Makefile | 2 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/cmd.c | 48 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/cmd.h | 40 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/cq.c | 10 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mad.c | 14 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/main.c | 676 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mem.c | 32 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mlx5_ib.h | 167 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c | 644 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 891 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/qp.c | 539 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/srq.c | 11 |
12 files changed, 1900 insertions, 1174 deletions
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 7493a83acd28..90ad2adc752f 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o -mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o +mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o gsi.o ib_virt.o cmd.o mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c new file mode 100644 index 000000000000..cdc2d3017da7 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "cmd.h" + +int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey) +{ + u32 out[MLX5_ST_SZ_DW(query_special_contexts_out)] = {}; + u32 in[MLX5_ST_SZ_DW(query_special_contexts_in)] = {}; + int err; + + MLX5_SET(query_special_contexts_in, in, opcode, + MLX5_CMD_OP_QUERY_SPECIAL_CONTEXTS); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *null_mkey = MLX5_GET(query_special_contexts_out, out, + null_mkey); + return err; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h new file mode 100644 index 000000000000..7ca8a7b6434d --- /dev/null +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2017, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef MLX5_IB_CMD_H +#define MLX5_IB_CMD_H + +#include <linux/kernel.h> +#include <linux/mlx5/driver.h> + +int mlx5_cmd_null_mkey(struct mlx5_core_dev *dev, u32 *null_mkey); +#endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index b3ef47c3ab73..31803b367104 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -689,7 +689,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev; struct mlx5_ib_cq *cq = to_mcq(ibcq); - void __iomem *uar_page = mdev->priv.uuari.uars[0].map; + void __iomem *uar_page = mdev->priv.uar->map; unsigned long irq_flags; int ret = 0; @@ -704,9 +704,7 @@ int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) mlx5_cq_arm(&cq->mcq, (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ? MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT, - uar_page, - MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock), - to_mcq(ibcq)->mcq.cons_index); + uar_page, to_mcq(ibcq)->mcq.cons_index); return ret; } @@ -790,7 +788,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, MLX5_SET(cqc, cqc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); - *index = to_mucontext(context)->uuari.uars[0].index; + *index = to_mucontext(context)->bfregi.sys_pages[0]; if (ucmd.cqe_comp_en == 1) { if (unlikely((*cqe_size != 64) || @@ -886,7 +884,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, MLX5_SET(cqc, cqc, log_page_size, cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT); - *index = dev->mdev->priv.uuari.uars[0].index; + *index = dev->mdev->priv.uar->index; return 0; diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 39e58489dcc2..8dacb49eabd9 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -42,12 +42,24 @@ enum { MLX5_IB_VENDOR_CLASS2 = 0xa }; +static bool can_do_mad_ifc(struct mlx5_ib_dev *dev, u8 port_num, + struct ib_mad *in_mad) +{ + if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED && + in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + return true; + return dev->mdev->port_caps[port_num - 1].has_smi; +} + int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const void *in_mad, void *response_mad) { u8 op_modifier = 0; + if (!can_do_mad_ifc(dev, port, (struct ib_mad *)in_mad)) + return -EPERM; + /* Key check traps can't be generated unless we have in_wc to * tell us where to send the trap. */ @@ -515,7 +527,7 @@ int mlx5_query_mad_ifc_port(struct ib_device *ibdev, u8 port, if (!in_mad || !out_mad) goto out; - memset(props, 0, sizeof(*props)); + /* props being zeroed by the caller, avoid zeroing it here */ init_query_mad(in_mad); in_mad->attr_id = IB_SMP_ATTR_PORT_INFO; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d566f6738833..5b3355268725 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -53,6 +53,7 @@ #include <linux/in.h> #include <linux/etherdevice.h> #include <linux/mlx5/fs.h> +#include <linux/mlx5/vport.h> #include "mlx5_ib.h" #define DRIVER_NAME "mlx5_ib" @@ -64,10 +65,6 @@ MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_VERSION(DRIVER_VERSION); -static int deprecated_prof_sel = 2; -module_param_named(prof_sel, deprecated_prof_sel, int, 0444); -MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core"); - static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; @@ -174,7 +171,7 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, enum ib_mtu ndev_ib_mtu; u16 qkey_viol_cntr; - memset(props, 0, sizeof(*props)); + /* props being zeroed by the caller, avoid zeroing it here */ props->port_cap_flags |= IB_PORT_CM_SUP; props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; @@ -325,6 +322,27 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); } +int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, + int index, enum ib_gid_type *gid_type) +{ + struct ib_gid_attr attr; + union ib_gid gid; + int ret; + + ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr); + if (ret) + return ret; + + if (!attr.ndev) + return -ENODEV; + + dev_put(attr.ndev); + + *gid_type = attr.gid_type; + + return 0; +} + static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB) @@ -564,8 +582,15 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) { - if (MLX5_CAP_ETH(mdev, csum_cap)) + if (MLX5_CAP_ETH(mdev, csum_cap)) { + /* Legacy bit to support old userspace libraries */ props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + props->raw_packet_caps |= IB_RAW_PACKET_CAP_IP_CSUM; + } + + if (MLX5_CAP_ETH(dev->mdev, vlan_cap)) + props->raw_packet_caps |= + IB_RAW_PACKET_CAP_CVLAN_STRIPPING; if (field_avail(typeof(resp), tso_caps, uhw->outlen)) { max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); @@ -604,8 +629,11 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && - MLX5_CAP_ETH(dev->mdev, scatter_fcs)) + MLX5_CAP_ETH(dev->mdev, scatter_fcs)) { + /* Legacy bit to support old userspace libraries */ props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS; + props->raw_packet_caps |= IB_RAW_PACKET_CAP_SCATTER_FCS; + } if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS)) props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING; @@ -672,17 +700,6 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, 1 << MLX5_CAP_GEN(dev->mdev, log_max_rq); } - if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, - uhw->outlen)) { - resp.mlx5_ib_support_multi_pkt_send_wqes = - MLX5_CAP_ETH(mdev, multi_pkt_send_wqe); - resp.response_length += - sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); - } - - if (field_avail(typeof(resp), reserved, uhw->outlen)) - resp.response_length += sizeof(resp.reserved); - if (field_avail(typeof(resp), cqe_comp_caps, uhw->outlen)) { resp.cqe_comp_caps.max_num = MLX5_CAP_GEN(dev->mdev, cqe_compression) ? @@ -706,6 +723,17 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.packet_pacing_caps); } + if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, + uhw->outlen)) { + resp.mlx5_ib_support_multi_pkt_send_wqes = + MLX5_CAP_ETH(mdev, multi_pkt_send_wqe); + resp.response_length += + sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); + } + + if (field_avail(typeof(resp), reserved, uhw->outlen)) + resp.response_length += sizeof(resp.reserved); + if (uhw->outlen) { err = ib_copy_to_udata(uhw, &resp, resp.response_length); @@ -830,7 +858,7 @@ static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port, goto out; } - memset(props, 0, sizeof(*props)); + /* props being zeroed by the caller, avoid zeroing it here */ err = mlx5_query_hca_vport_context(mdev, 0, port, 0, rep); if (err) @@ -968,6 +996,31 @@ static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask, return err; } +static int set_port_caps_atomic(struct mlx5_ib_dev *dev, u8 port_num, u32 mask, + u32 value) +{ + struct mlx5_hca_vport_context ctx = {}; + int err; + + err = mlx5_query_hca_vport_context(dev->mdev, 0, + port_num, 0, &ctx); + if (err) + return err; + + if (~ctx.cap_mask1_perm & mask) { + mlx5_ib_warn(dev, "trying to change bitmask 0x%X but change supported 0x%X\n", + mask, ctx.cap_mask1_perm); + return -EINVAL; + } + + ctx.cap_mask1 = value; + ctx.cap_mask1_perm = mask; + err = mlx5_core_modify_hca_vport_context(dev->mdev, 0, + port_num, 0, &ctx); + + return err; +} + static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_modify *props) { @@ -975,10 +1028,20 @@ static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask, struct ib_port_attr attr; u32 tmp; int err; + u32 change_mask; + u32 value; + bool is_ib = (mlx5_ib_port_link_layer(ibdev, port) == + IB_LINK_LAYER_INFINIBAND); + + if (MLX5_CAP_GEN(dev->mdev, ib_virt) && is_ib) { + change_mask = props->clr_port_cap_mask | props->set_port_cap_mask; + value = ~props->clr_port_cap_mask | props->set_port_cap_mask; + return set_port_caps_atomic(dev, port, change_mask, value); + } mutex_lock(&dev->cap_mask_mutex); - err = mlx5_ib_query_port(ibdev, port, &attr); + err = ib_query_port(ibdev, port, &attr); if (err) goto out; @@ -992,6 +1055,86 @@ out: return err; } +static void print_lib_caps(struct mlx5_ib_dev *dev, u64 caps) +{ + mlx5_ib_dbg(dev, "MLX5_LIB_CAP_4K_UAR = %s\n", + caps & MLX5_LIB_CAP_4K_UAR ? "y" : "n"); +} + +static int calc_total_bfregs(struct mlx5_ib_dev *dev, bool lib_uar_4k, + struct mlx5_ib_alloc_ucontext_req_v2 *req, + u32 *num_sys_pages) +{ + int uars_per_sys_page; + int bfregs_per_sys_page; + int ref_bfregs = req->total_num_bfregs; + + if (req->total_num_bfregs == 0) + return -EINVAL; + + BUILD_BUG_ON(MLX5_MAX_BFREGS % MLX5_NON_FP_BFREGS_IN_PAGE); + BUILD_BUG_ON(MLX5_MAX_BFREGS < MLX5_NON_FP_BFREGS_IN_PAGE); + + if (req->total_num_bfregs > MLX5_MAX_BFREGS) + return -ENOMEM; + + uars_per_sys_page = get_uars_per_sys_page(dev, lib_uar_4k); + bfregs_per_sys_page = uars_per_sys_page * MLX5_NON_FP_BFREGS_PER_UAR; + req->total_num_bfregs = ALIGN(req->total_num_bfregs, bfregs_per_sys_page); + *num_sys_pages = req->total_num_bfregs / bfregs_per_sys_page; + + if (req->num_low_latency_bfregs > req->total_num_bfregs - 1) + return -EINVAL; + + mlx5_ib_dbg(dev, "uar_4k: fw support %s, lib support %s, user requested %d bfregs, alloated %d, using %d sys pages\n", + MLX5_CAP_GEN(dev->mdev, uar_4k) ? "yes" : "no", + lib_uar_4k ? "yes" : "no", ref_bfregs, + req->total_num_bfregs, *num_sys_pages); + + return 0; +} + +static int allocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi; + int err; + int i; + + bfregi = &context->bfregi; + for (i = 0; i < bfregi->num_sys_pages; i++) { + err = mlx5_cmd_alloc_uar(dev->mdev, &bfregi->sys_pages[i]); + if (err) + goto error; + + mlx5_ib_dbg(dev, "allocated uar %d\n", bfregi->sys_pages[i]); + } + return 0; + +error: + for (--i; i >= 0; i--) + if (mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i])) + mlx5_ib_warn(dev, "failed to free uar %d\n", i); + + return err; +} + +static int deallocate_uars(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +{ + struct mlx5_bfreg_info *bfregi; + int err; + int i; + + bfregi = &context->bfregi; + for (i = 0; i < bfregi->num_sys_pages; i++) { + err = mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); + if (err) { + mlx5_ib_warn(dev, "failed to free uar %d\n", i); + return err; + } + } + return 0; +} + static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { @@ -999,17 +1142,13 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct mlx5_ib_alloc_ucontext_req_v2 req = {}; struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context; - struct mlx5_uuar_info *uuari; - struct mlx5_uar *uars; - int gross_uuars; - int num_uars; + struct mlx5_bfreg_info *bfregi; int ver; - int uuarn; int err; - int i; size_t reqlen; size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2, max_cqe_version); + bool lib_uar_4k; if (!dev->ib_active) return ERR_PTR(-EAGAIN); @@ -1032,27 +1171,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (req.flags) return ERR_PTR(-EINVAL); - if (req.total_num_uuars > MLX5_MAX_UUARS) - return ERR_PTR(-ENOMEM); - - if (req.total_num_uuars == 0) - return ERR_PTR(-EINVAL); - if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) return ERR_PTR(-EOPNOTSUPP); - if (reqlen > sizeof(req) && - !ib_is_udata_cleared(udata, sizeof(req), - reqlen - sizeof(req))) - return ERR_PTR(-EOPNOTSUPP); - - req.total_num_uuars = ALIGN(req.total_num_uuars, - MLX5_NON_FP_BF_REGS_PER_PAGE); - if (req.num_low_latency_uuars > req.total_num_uuars - 1) + req.total_num_bfregs = ALIGN(req.total_num_bfregs, + MLX5_NON_FP_BFREGS_PER_UAR); + if (req.num_low_latency_bfregs > req.total_num_bfregs - 1) return ERR_PTR(-EINVAL); - num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE; - gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE; resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp); if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf)) resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size); @@ -1065,6 +1191,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.cqe_version = min_t(__u8, (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), req.max_cqe_version); + resp.log_uar_size = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_ADAPTER_PAGE_SHIFT : PAGE_SHIFT; + resp.num_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_CAP_GEN(dev->mdev, num_of_uars_per_page) : 1; resp.response_length = min(offsetof(typeof(resp), response_length) + sizeof(resp.response_length), udata->outlen); @@ -1072,58 +1202,58 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (!context) return ERR_PTR(-ENOMEM); - uuari = &context->uuari; - mutex_init(&uuari->lock); - uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL); - if (!uars) { - err = -ENOMEM; + lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; + bfregi = &context->bfregi; + + /* updates req->total_num_bfregs */ + err = calc_total_bfregs(dev, lib_uar_4k, &req, &bfregi->num_sys_pages); + if (err) goto out_ctx; - } - uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars), - sizeof(*uuari->bitmap), + mutex_init(&bfregi->lock); + bfregi->lib_uar_4k = lib_uar_4k; + bfregi->count = kcalloc(req.total_num_bfregs, sizeof(*bfregi->count), GFP_KERNEL); - if (!uuari->bitmap) { + if (!bfregi->count) { err = -ENOMEM; - goto out_uar_ctx; - } - /* - * clear all fast path uuars - */ - for (i = 0; i < gross_uuars; i++) { - uuarn = i & 3; - if (uuarn == 2 || uuarn == 3) - set_bit(i, uuari->bitmap); + goto out_ctx; } - uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL); - if (!uuari->count) { + bfregi->sys_pages = kcalloc(bfregi->num_sys_pages, + sizeof(*bfregi->sys_pages), + GFP_KERNEL); + if (!bfregi->sys_pages) { err = -ENOMEM; - goto out_bitmap; + goto out_count; } - for (i = 0; i < num_uars; i++) { - err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index); - if (err) - goto out_count; - } + err = allocate_uars(dev, context); + if (err) + goto out_sys_pages; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif + context->upd_xlt_page = __get_free_page(GFP_KERNEL); + if (!context->upd_xlt_page) { + err = -ENOMEM; + goto out_uars; + } + mutex_init(&context->upd_xlt_page_mutex); + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { err = mlx5_core_alloc_transport_domain(dev->mdev, &context->tdn); if (err) - goto out_uars; + goto out_page; } INIT_LIST_HEAD(&context->vma_private_list); INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); - resp.tot_uuars = req.total_num_uuars; + resp.tot_bfregs = req.total_num_bfregs; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); if (field_avail(typeof(resp), cqe_version, udata->outlen)) @@ -1135,32 +1265,46 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.response_length += sizeof(resp.cmds_supp_uhw); } + if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) { + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { + mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline); + resp.eth_min_inline++; + } + resp.response_length += sizeof(resp.eth_min_inline); + } + /* * We don't want to expose information from the PCI bar that is located * after 4096 bytes, so if the arch only supports larger pages, let's * pretend we don't support reading the HCA's core clock. This is also * forced by mmap function. */ - if (PAGE_SIZE <= 4096 && - field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { - resp.comp_mask |= - MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; - resp.hca_core_clock_offset = - offsetof(struct mlx5_init_seg, internal_timer_h) % - PAGE_SIZE; + if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + if (PAGE_SIZE <= 4096) { + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = + offsetof(struct mlx5_init_seg, internal_timer_h) % PAGE_SIZE; + } resp.response_length += sizeof(resp.hca_core_clock_offset) + sizeof(resp.reserved2); } + if (field_avail(typeof(resp), log_uar_size, udata->outlen)) + resp.response_length += sizeof(resp.log_uar_size); + + if (field_avail(typeof(resp), num_uars_per_page, udata->outlen)) + resp.response_length += sizeof(resp.num_uars_per_page); + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) goto out_td; - uuari->ver = ver; - uuari->num_low_latency_uuars = req.num_low_latency_uuars; - uuari->uars = uars; - uuari->num_uars = num_uars; + bfregi->ver = ver; + bfregi->num_low_latency_bfregs = req.num_low_latency_bfregs; context->cqe_version = resp.cqe_version; + context->lib_caps = req.lib_caps; + print_lib_caps(dev, context->lib_caps); return &context->ibucontext; @@ -1168,20 +1312,21 @@ out_td: if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); +out_page: + free_page(context->upd_xlt_page); + out_uars: - for (i--; i >= 0; i--) - mlx5_cmd_free_uar(dev->mdev, uars[i].index); -out_count: - kfree(uuari->count); + deallocate_uars(dev, context); -out_bitmap: - kfree(uuari->bitmap); +out_sys_pages: + kfree(bfregi->sys_pages); -out_uar_ctx: - kfree(uars); +out_count: + kfree(bfregi->count); out_ctx: kfree(context); + return ERR_PTR(err); } @@ -1189,28 +1334,31 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) { struct mlx5_ib_ucontext *context = to_mucontext(ibcontext); struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); - struct mlx5_uuar_info *uuari = &context->uuari; - int i; + struct mlx5_bfreg_info *bfregi; + bfregi = &context->bfregi; if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); - for (i = 0; i < uuari->num_uars; i++) { - if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) - mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); - } - - kfree(uuari->count); - kfree(uuari->bitmap); - kfree(uuari->uars); + free_page(context->upd_xlt_page); + deallocate_uars(dev, context); + kfree(bfregi->sys_pages); + kfree(bfregi->count); kfree(context); return 0; } -static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index) +static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, + int idx) { - return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index; + int fw_uars_per_page; + + fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1; + + return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + + bfregi->sys_pages[idx] / fw_uars_per_page; } static int get_command(unsigned long offset) @@ -1365,11 +1513,23 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { - struct mlx5_uuar_info *uuari = &context->uuari; + struct mlx5_bfreg_info *bfregi = &context->bfregi; int err; unsigned long idx; phys_addr_t pfn, pa; pgprot_t prot; + int uars_per_page; + + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + uars_per_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k); + idx = get_index(vma->vm_pgoff); + if (idx % uars_per_page || + idx * uars_per_page >= bfregi->num_sys_pages) { + mlx5_ib_warn(dev, "invalid uar index %lu\n", idx); + return -EINVAL; + } switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: @@ -1392,14 +1552,7 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, return -EINVAL; } - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; - - idx = get_index(vma->vm_pgoff); - if (idx >= uuari->num_uars) - return -EINVAL; - - pfn = uar_index2pfn(dev, uuari->uars[idx].index); + pfn = uar_index2pfn(dev, bfregi, idx); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); vma->vm_page_prot = prot; @@ -1570,6 +1723,7 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) #define LAST_IPV6_FIELD traffic_class #define LAST_TCP_UDP_FIELD src_port #define LAST_TUNNEL_FIELD tunnel_id +#define LAST_FLOW_TAG_FIELD tag_id /* Field is the last supported field */ #define FIELDS_NOT_SUPPORTED(filter, field)\ @@ -1580,7 +1734,7 @@ static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val) sizeof(filter.field)) static int parse_flow_attr(u32 *match_c, u32 *match_v, - const union ib_flow_spec *ib_spec) + const union ib_flow_spec *ib_spec, u32 *tag_id) { void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c, misc_parameters); @@ -1604,7 +1758,7 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, switch (ib_spec->type & ~IB_FLOW_SPEC_INNER) { case IB_FLOW_SPEC_ETH: if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD)) - return -ENOTSUPP; + return -EOPNOTSUPP; ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c, dmac_47_16), @@ -1622,9 +1776,9 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, if (ib_spec->eth.mask.vlan_tag) { MLX5_SET(fte_match_set_lyr_2_4, headers_c, - vlan_tag, 1); + cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, headers_v, - vlan_tag, 1); + cvlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, ntohs(ib_spec->eth.mask.vlan_tag)); @@ -1652,7 +1806,7 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, break; case IB_FLOW_SPEC_IPV4: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD)) - return -ENOTSUPP; + return -EOPNOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, 0xffff); @@ -1684,7 +1838,7 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, break; case IB_FLOW_SPEC_IPV6: if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD)) - return -ENOTSUPP; + return -EOPNOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, headers_c, ethertype, 0xffff); @@ -1725,7 +1879,7 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, case IB_FLOW_SPEC_TCP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) - return -ENOTSUPP; + return -EOPNOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, 0xff); @@ -1745,7 +1899,7 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, case IB_FLOW_SPEC_UDP: if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask, LAST_TCP_UDP_FIELD)) - return -ENOTSUPP; + return -EOPNOTSUPP; MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol, 0xff); @@ -1765,13 +1919,22 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v, case IB_FLOW_SPEC_VXLAN_TUNNEL: if (FIELDS_NOT_SUPPORTED(ib_spec->tunnel.mask, LAST_TUNNEL_FIELD)) - return -ENOTSUPP; + return -EOPNOTSUPP; MLX5_SET(fte_match_set_misc, misc_params_c, vxlan_vni, ntohl(ib_spec->tunnel.mask.tunnel_id)); MLX5_SET(fte_match_set_misc, misc_params_v, vxlan_vni, ntohl(ib_spec->tunnel.val.tunnel_id)); break; + case IB_FLOW_SPEC_ACTION_TAG: + if (FIELDS_NOT_SUPPORTED(ib_spec->flow_tag, + LAST_FLOW_TAG_FIELD)) + return -EOPNOTSUPP; + if (ib_spec->flow_tag.tag_id >= BIT(24)) + return -EINVAL; + + *tag_id = ib_spec->flow_tag.tag_id; + break; default: return -EINVAL; } @@ -1955,6 +2118,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_flow_spec *spec; const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr); unsigned int spec_index; + u32 flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; int err = 0; if (!is_valid_attr(flow_attr)) @@ -1971,7 +2135,7 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) { err = parse_flow_attr(spec->match_criteria, - spec->match_value, ib_flow); + spec->match_value, ib_flow, &flow_tag); if (err < 0) goto free; @@ -1981,7 +2145,16 @@ static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev, spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria); flow_act.action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST : MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO; - flow_act.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG; + + if (flow_tag != MLX5_FS_DEFAULT_FLOW_TAG && + (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || + flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT)) { + mlx5_ib_warn(dev, "Flow tag %u and attribute type %x isn't allowed in leftovers\n", + flow_tag, flow_attr->type); + err = -EINVAL; + goto free; + } + flow_act.flow_tag = flow_tag; handler->rule = mlx5_add_flow_rules(ft, spec, &flow_act, dst, 1); @@ -2451,6 +2624,35 @@ static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context, ibdev->ib_active = false; } +static int set_has_smi_cap(struct mlx5_ib_dev *dev) +{ + struct mlx5_hca_vport_context vport_ctx; + int err; + int port; + + for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + dev->mdev->port_caps[port - 1].has_smi = false; + if (MLX5_CAP_GEN(dev->mdev, port_type) == + MLX5_CAP_PORT_TYPE_IB) { + if (MLX5_CAP_GEN(dev->mdev, ib_virt)) { + err = mlx5_query_hca_vport_context(dev->mdev, 0, + port, 0, + &vport_ctx); + if (err) { + mlx5_ib_err(dev, "query_hca_vport_context for port=%d failed %d\n", + port, err); + return err; + } + dev->mdev->port_caps[port - 1].has_smi = + vport_ctx.has_smi; + } else { + dev->mdev->port_caps[port - 1].has_smi = true; + } + } + } + return 0; +} + static void get_ext_port_caps(struct mlx5_ib_dev *dev) { int port; @@ -2475,6 +2677,10 @@ static int get_port_caps(struct mlx5_ib_dev *dev) if (!dprops) goto out; + err = set_has_smi_cap(dev); + if (err) + goto out; + err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw); if (err) { mlx5_ib_warn(dev, "query_device failed %d\n", err); @@ -2482,6 +2688,7 @@ static int get_port_caps(struct mlx5_ib_dev *dev) } for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) { + memset(pprops, 0, sizeof(*pprops)); err = mlx5_ib_query_port(&dev->ib_dev, port, pprops); if (err) { mlx5_ib_warn(dev, "query_port %d failed %d\n", @@ -2776,11 +2983,13 @@ static u32 get_core_cap_flags(struct ib_device *ibdev) if (ll == IB_LINK_LAYER_INFINIBAND) return RDMA_CORE_PORT_IBA_IB; + ret = RDMA_CORE_PORT_RAW_PACKET; + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) - return 0; + return ret; if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) - return 0; + return ret; if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) ret |= RDMA_CORE_PORT_IBA_ROCE; @@ -2799,7 +3008,9 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, port_num); int err; - err = mlx5_ib_query_port(ibdev, port_num, &attr); + immutable->core_cap_flags = get_core_cap_flags(ibdev); + + err = ib_query_port(ibdev, port_num, &attr); if (err) return err; @@ -2920,13 +3131,102 @@ static void mlx5_disable_eth(struct mlx5_ib_dev *dev) mlx5_nic_vport_disable_roce(dev->mdev); } +struct mlx5_ib_q_counter { + const char *name; + size_t offset; +}; + +#define INIT_Q_COUNTER(_name) \ + { .name = #_name, .offset = MLX5_BYTE_OFF(query_q_counter_out, _name)} + +static const struct mlx5_ib_q_counter basic_q_cnts[] = { + INIT_Q_COUNTER(rx_write_requests), + INIT_Q_COUNTER(rx_read_requests), + INIT_Q_COUNTER(rx_atomic_requests), + INIT_Q_COUNTER(out_of_buffer), +}; + +static const struct mlx5_ib_q_counter out_of_seq_q_cnts[] = { + INIT_Q_COUNTER(out_of_sequence), +}; + +static const struct mlx5_ib_q_counter retrans_q_cnts[] = { + INIT_Q_COUNTER(duplicate_request), + INIT_Q_COUNTER(rnr_nak_retry_err), + INIT_Q_COUNTER(packet_seq_err), + INIT_Q_COUNTER(implied_nak_seq_err), + INIT_Q_COUNTER(local_ack_timeout_err), +}; + static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev) { unsigned int i; - for (i = 0; i < dev->num_ports; i++) + for (i = 0; i < dev->num_ports; i++) { mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].q_cnt_id); + dev->port[i].q_cnts.set_id); + kfree(dev->port[i].q_cnts.names); + kfree(dev->port[i].q_cnts.offsets); + } +} + +static int __mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev, + const char ***names, + size_t **offsets, + u32 *num) +{ + u32 num_counters; + + num_counters = ARRAY_SIZE(basic_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) + num_counters += ARRAY_SIZE(out_of_seq_q_cnts); + + if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) + num_counters += ARRAY_SIZE(retrans_q_cnts); + + *names = kcalloc(num_counters, sizeof(**names), GFP_KERNEL); + if (!*names) + return -ENOMEM; + + *offsets = kcalloc(num_counters, sizeof(**offsets), GFP_KERNEL); + if (!*offsets) + goto err_names; + + *num = num_counters; + + return 0; + +err_names: + kfree(*names); + return -ENOMEM; +} + +static void mlx5_ib_fill_q_counters(struct mlx5_ib_dev *dev, + const char **names, + size_t *offsets) +{ + int i; + int j = 0; + + for (i = 0; i < ARRAY_SIZE(basic_q_cnts); i++, j++) { + names[j] = basic_q_cnts[i].name; + offsets[j] = basic_q_cnts[i].offset; + } + + if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt)) { + for (i = 0; i < ARRAY_SIZE(out_of_seq_q_cnts); i++, j++) { + names[j] = out_of_seq_q_cnts[i].name; + offsets[j] = out_of_seq_q_cnts[i].offset; + } + } + + if (MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + for (i = 0; i < ARRAY_SIZE(retrans_q_cnts); i++, j++) { + names[j] = retrans_q_cnts[i].name; + offsets[j] = retrans_q_cnts[i].offset; + } + } } static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) @@ -2935,14 +3235,26 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) int ret; for (i = 0; i < dev->num_ports; i++) { + struct mlx5_ib_port *port = &dev->port[i]; + ret = mlx5_core_alloc_q_counter(dev->mdev, - &dev->port[i].q_cnt_id); + &port->q_cnts.set_id); if (ret) { mlx5_ib_warn(dev, "couldn't allocate queue counter for port %d, err %d\n", i + 1, ret); goto dealloc_counters; } + + ret = __mlx5_ib_alloc_q_counters(dev, + &port->q_cnts.names, + &port->q_cnts.offsets, + &port->q_cnts.num_counters); + if (ret) + goto dealloc_counters; + + mlx5_ib_fill_q_counters(dev, port->q_cnts.names, + port->q_cnts.offsets); } return 0; @@ -2950,62 +3262,39 @@ static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev) dealloc_counters: while (--i >= 0) mlx5_core_dealloc_q_counter(dev->mdev, - dev->port[i].q_cnt_id); + dev->port[i].q_cnts.set_id); return ret; } -static const char * const names[] = { - "rx_write_requests", - "rx_read_requests", - "rx_atomic_requests", - "out_of_buffer", - "out_of_sequence", - "duplicate_request", - "rnr_nak_retry_err", - "packet_seq_err", - "implied_nak_seq_err", - "local_ack_timeout_err", -}; - -static const size_t stats_offsets[] = { - MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests), - MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests), - MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests), - MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer), - MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence), - MLX5_BYTE_OFF(query_q_counter_out, duplicate_request), - MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err), - MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err), - MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err), - MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err), -}; - static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev, u8 port_num) { - BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets)); + struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_port *port = &dev->port[port_num - 1]; /* We support only per port stats */ if (port_num == 0) return NULL; - return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names), + return rdma_alloc_hw_stats_struct(port->q_cnts.names, + port->q_cnts.num_counters, RDMA_HW_STATS_DEFAULT_LIFESPAN); } static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, - u8 port, int index) + u8 port_num, int index) { struct mlx5_ib_dev *dev = to_mdev(ibdev); + struct mlx5_ib_port *port = &dev->port[port_num - 1]; int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out); void *out; __be32 val; int ret; int i; - if (!port || !stats) + if (!stats) return -ENOSYS; out = mlx5_vzalloc(outlen); @@ -3013,18 +3302,19 @@ static int mlx5_ib_get_hw_stats(struct ib_device *ibdev, return -ENOMEM; ret = mlx5_core_query_q_counter(dev->mdev, - dev->port[port - 1].q_cnt_id, 0, + port->q_cnts.set_id, 0, out, outlen); if (ret) goto free; - for (i = 0; i < ARRAY_SIZE(names); i++) { - val = *(__be32 *)(out + stats_offsets[i]); + for (i = 0; i < port->q_cnts.num_counters; i++) { + val = *(__be32 *)(out + port->q_cnts.offsets[i]); stats->value[i] = (u64)be32_to_cpu(val); } + free: kvfree(out); - return ARRAY_SIZE(names); + return port->q_cnts.num_counters; } static void *mlx5_ib_add(struct mlx5_core_dev *mdev) @@ -3060,8 +3350,6 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock); - if (!mlx5_lag_is_active(mdev)) name = "mlx5_%d"; else @@ -3075,7 +3363,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.phys_port_cnt = dev->num_ports; dev->ib_dev.num_comp_vectors = dev->mdev->priv.eq_table.num_comp_vectors; - dev->ib_dev.dma_device = &mdev->pdev->dev; + dev->ib_dev.dev.parent = &mdev->pdev->dev; dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION; dev->ib_dev.uverbs_cmd_mask = @@ -3178,8 +3466,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_DEALLOC_MW); } - if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) && - MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) { + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { dev->ib_dev.get_hw_stats = mlx5_ib_get_hw_stats; dev->ib_dev.alloc_hw_stats = mlx5_ib_alloc_hw_stats; } @@ -3233,13 +3520,27 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) if (err) goto err_rsrc; - err = mlx5_ib_alloc_q_counters(dev); + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) { + err = mlx5_ib_alloc_q_counters(dev); + if (err) + goto err_odp; + } + + dev->mdev->priv.uar = mlx5_get_uars_page(dev->mdev); + if (!dev->mdev->priv.uar) + goto err_q_cnt; + + err = mlx5_alloc_bfreg(dev->mdev, &dev->bfreg, false, false); + if (err) + goto err_uar_page; + + err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); if (err) - goto err_odp; + goto err_bfreg; err = ib_register_device(&dev->ib_dev, NULL); if (err) - goto err_q_cnt; + goto err_fp_bfreg; err = create_umr_res(dev); if (err) @@ -3262,8 +3563,18 @@ err_umrc: err_dev: ib_unregister_device(&dev->ib_dev); +err_fp_bfreg: + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + +err_bfreg: + mlx5_free_bfreg(dev->mdev, &dev->bfreg); + +err_uar_page: + mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); + err_q_cnt: - mlx5_ib_dealloc_q_counters(dev); + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + mlx5_ib_dealloc_q_counters(dev); err_odp: mlx5_ib_odp_remove_one(dev); @@ -3293,7 +3604,11 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) mlx5_remove_netdev_notifier(dev); ib_unregister_device(&dev->ib_dev); - mlx5_ib_dealloc_q_counters(dev); + mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + mlx5_free_bfreg(dev->mdev, &dev->bfreg); + mlx5_put_uars_page(dev->mdev, mdev->priv.uar); + if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) + mlx5_ib_dealloc_q_counters(dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); @@ -3307,6 +3622,9 @@ static struct mlx5_interface mlx5_ib_interface = { .add = mlx5_ib_add, .remove = mlx5_ib_remove, .event = mlx5_ib_event, +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + .pfault = mlx5_ib_pfault, +#endif .protocol = MLX5_INTERFACE_PROTOCOL_IB, }; @@ -3314,28 +3632,16 @@ static int __init mlx5_ib_init(void) { int err; - if (deprecated_prof_sel != 2) - pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n"); - - err = mlx5_ib_odp_init(); - if (err) - return err; + mlx5_ib_odp_init(); err = mlx5_register_interface(&mlx5_ib_interface); - if (err) - goto clean_odp; - - return err; -clean_odp: - mlx5_ib_odp_cleanup(); return err; } static void __exit mlx5_ib_cleanup(void) { mlx5_unregister_interface(&mlx5_ib_interface); - mlx5_ib_odp_cleanup(); } module_init(mlx5_ib_init); diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index 6851357c16f4..778d8a18925f 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -159,7 +159,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, unsigned long umem_page_shift = ilog2(umem->page_size); int shift = page_shift - umem_page_shift; int mask = (1 << shift) - 1; - int i, k; + int i, k, idx; u64 cur = 0; u64 base; int len; @@ -185,18 +185,36 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) { len = sg_dma_len(sg) >> umem_page_shift; base = sg_dma_address(sg); - for (k = 0; k < len; k++) { + + /* Skip elements below offset */ + if (i + len < offset << shift) { + i += len; + continue; + } + + /* Skip pages below offset */ + if (i < offset << shift) { + k = (offset << shift) - i; + i = offset << shift; + } else { + k = 0; + } + + for (; k < len; k++) { if (!(i & mask)) { cur = base + (k << umem_page_shift); cur |= access_flags; + idx = (i >> shift) - offset; - pas[i >> shift] = cpu_to_be64(cur); + pas[idx] = cpu_to_be64(cur); mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n", - i >> shift, be64_to_cpu(pas[i >> shift])); - } else - mlx5_ib_dbg(dev, "=====> 0x%llx\n", - base + (k << umem_page_shift)); + i >> shift, be64_to_cpu(pas[idx])); + } i++; + + /* Stop after num_pages reached */ + if (i >> shift >= offset + num_pages) + return; } } } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 6c6057eb60ea..3cd064b5f0bf 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -90,7 +90,6 @@ enum mlx5_ib_latency_class { MLX5_IB_LATENCY_CLASS_LOW, MLX5_IB_LATENCY_CLASS_MEDIUM, MLX5_IB_LATENCY_CLASS_HIGH, - MLX5_IB_LATENCY_CLASS_FAST_PATH }; enum mlx5_ib_mad_ifc_flags { @@ -100,7 +99,7 @@ enum mlx5_ib_mad_ifc_flags { }; enum { - MLX5_CROSS_CHANNEL_UUAR = 0, + MLX5_CROSS_CHANNEL_BFREG = 0, }; enum { @@ -120,11 +119,16 @@ struct mlx5_ib_ucontext { /* protect doorbell record alloc/free */ struct mutex db_page_mutex; - struct mlx5_uuar_info uuari; + struct mlx5_bfreg_info bfregi; u8 cqe_version; /* Transport Domain number */ u32 tdn; struct list_head vma_private_list; + + unsigned long upd_xlt_page; + /* protect ODP/KSM */ + struct mutex upd_xlt_page_mutex; + u64 lib_caps; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -174,13 +178,12 @@ struct mlx5_ib_flow_db { * enum ib_send_flags and enum ib_qp_type for low-level driver */ -#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START -#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1) -#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2) - -#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 3) -#define MLX5_IB_SEND_UMR_UPDATE_PD (IB_SEND_RESERVED_START << 4) -#define MLX5_IB_SEND_UMR_UPDATE_ACCESS IB_SEND_RESERVED_END +#define MLX5_IB_SEND_UMR_ENABLE_MR (IB_SEND_RESERVED_START << 0) +#define MLX5_IB_SEND_UMR_DISABLE_MR (IB_SEND_RESERVED_START << 1) +#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 2) +#define MLX5_IB_SEND_UMR_UPDATE_XLT (IB_SEND_RESERVED_START << 3) +#define MLX5_IB_SEND_UMR_UPDATE_TRANSLATION (IB_SEND_RESERVED_START << 4) +#define MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS IB_SEND_RESERVED_END #define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1 /* @@ -190,6 +193,17 @@ struct mlx5_ib_flow_db { #define MLX5_IB_QPT_HW_GSI IB_QPT_RESERVED2 #define MLX5_IB_WR_UMR IB_WR_RESERVED1 +#define MLX5_IB_UMR_OCTOWORD 16 +#define MLX5_IB_UMR_XLT_ALIGNMENT 64 + +#define MLX5_IB_UPD_XLT_ZAP BIT(0) +#define MLX5_IB_UPD_XLT_ENABLE BIT(1) +#define MLX5_IB_UPD_XLT_ATOMIC BIT(2) +#define MLX5_IB_UPD_XLT_ADDR BIT(3) +#define MLX5_IB_UPD_XLT_PD BIT(4) +#define MLX5_IB_UPD_XLT_ACCESS BIT(5) +#define MLX5_IB_UPD_XLT_INDIRECT BIT(6) + /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. * * These flags are intended for internal use by the mlx5_ib driver, and they @@ -207,6 +221,10 @@ struct wr_list { u16 next; }; +enum mlx5_ib_rq_flags { + MLX5_IB_RQ_CVLAN_STRIPPING = 1 << 0, +}; + struct mlx5_ib_wq { u64 *wrid; u32 *wr_data; @@ -264,29 +282,6 @@ struct mlx5_ib_rwq_ind_table { u32 rqtn; }; -/* - * Connect-IB can trigger up to four concurrent pagefaults - * per-QP. - */ -enum mlx5_ib_pagefault_context { - MLX5_IB_PAGEFAULT_RESPONDER_READ, - MLX5_IB_PAGEFAULT_REQUESTOR_READ, - MLX5_IB_PAGEFAULT_RESPONDER_WRITE, - MLX5_IB_PAGEFAULT_REQUESTOR_WRITE, - MLX5_IB_PAGEFAULT_CONTEXTS -}; - -static inline enum mlx5_ib_pagefault_context - mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault) -{ - return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE); -} - -struct mlx5_ib_pfault { - struct work_struct work; - struct mlx5_pagefault mpfault; -}; - struct mlx5_ib_ubuffer { struct ib_umem *umem; int buf_size; @@ -318,6 +313,7 @@ struct mlx5_ib_rq { struct mlx5_db *doorbell; u32 tirn; u8 state; + u32 flags; }; struct mlx5_ib_sq { @@ -334,6 +330,12 @@ struct mlx5_ib_raw_packet_qp { struct mlx5_ib_rq rq; }; +struct mlx5_bf { + int buf_size; + unsigned long offset; + struct mlx5_sq_bfreg *bfreg; +}; + struct mlx5_ib_qp { struct ib_qp ibqp; union { @@ -359,33 +361,19 @@ struct mlx5_ib_qp { int wq_sig; int scat_cqe; int max_inline_data; - struct mlx5_bf *bf; + struct mlx5_bf bf; int has_rq; /* only for user space QPs. For kernel * we have it from the bf object */ - int uuarn; + int bfregn; int create_type; /* Store signature errors */ bool signature_en; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* - * A flag that is true for QP's that are in a state that doesn't - * allow page faults, and shouldn't schedule any more faults. - */ - int disable_page_faults; - /* - * The disable_page_faults_lock protects a QP's disable_page_faults - * field, allowing for a thread to atomically check whether the QP - * allows page faults, and if so schedule a page fault. - */ - spinlock_t disable_page_faults_lock; - struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS]; -#endif struct list_head qps_list; struct list_head cq_recv_list; struct list_head cq_send_list; @@ -410,17 +398,16 @@ enum mlx5_ib_qp_flags { MLX5_IB_QP_SQPN_QP1 = 1 << 6, MLX5_IB_QP_CAP_SCATTER_FCS = 1 << 7, MLX5_IB_QP_RSS = 1 << 8, + MLX5_IB_QP_CVLAN_STRIPPING = 1 << 9, }; struct mlx5_umr_wr { struct ib_send_wr wr; - union { - u64 virt_addr; - u64 offset; - } target; + u64 virt_addr; + u64 offset; struct ib_pd *pd; unsigned int page_shift; - unsigned int npages; + unsigned int xlt_size; u64 length; int access_flags; u32 mkey; @@ -517,6 +504,10 @@ struct mlx5_ib_mr { int live; void *descs_alloc; int access_flags; /* Needed for rereg MR */ + + struct mlx5_ib_mr *parent; + atomic_t num_leaf_free; + wait_queue_head_t q_leaf_free; }; struct mlx5_ib_mw { @@ -555,6 +546,10 @@ struct mlx5_cache_ent { struct dentry *dir; char name[4]; u32 order; + u32 xlt; + u32 access_mode; + u32 page; + u32 size; u32 cur; u32 miss; @@ -569,6 +564,7 @@ struct mlx5_cache_ent { struct work_struct work; struct delayed_work dwork; int pending; + struct completion compl; }; struct mlx5_mr_cache { @@ -599,8 +595,15 @@ struct mlx5_ib_resources { struct mutex mutex; }; +struct mlx5_ib_q_counters { + const char **names; + size_t *offsets; + u32 num_counters; + u16 set_id; +}; + struct mlx5_ib_port { - u16 q_cnt_id; + struct mlx5_ib_q_counters q_cnts; }; struct mlx5_roce { @@ -617,7 +620,6 @@ struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; struct mlx5_roce roce; - MLX5_DECLARE_DOORBELL_LOCK(uar_lock); int num_ports; /* serialize update of capability mask */ @@ -634,11 +636,13 @@ struct mlx5_ib_dev { int fill_delay; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_odp_caps odp_caps; + u64 odp_max_size; /* * Sleepable RCU that prevents destruction of MRs while they are still * being used by a page fault handler. */ struct srcu_struct mr_srcu; + u32 null_mkey; #endif struct mlx5_ib_flow_db flow_db; /* protect resources needed as part of reset flow */ @@ -646,6 +650,8 @@ struct mlx5_ib_dev { struct list_head qp_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; + struct mlx5_sq_bfreg bfreg; + struct mlx5_sq_bfreg fp_bfreg; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -787,8 +793,11 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, struct ib_udata *udata); int mlx5_ib_dealloc_mw(struct ib_mw *mw); -int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, - int npages, int zap); +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags); +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags); +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_pd *pd, struct ib_udata *udata); @@ -842,7 +851,9 @@ void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); -int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift); + +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry); +void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, @@ -857,33 +868,32 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -extern struct workqueue_struct *mlx5_ib_page_fault_wq; - void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev); -void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault); -void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp); +void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, + struct mlx5_pagefault *pfault); int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp); -void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp); void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end); +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, int flags); #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) { return; } -static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {} static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; } -static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} +static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {} static inline int mlx5_ib_odp_init(void) { return 0; } -static inline void mlx5_ib_odp_cleanup(void) {} -static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {} -static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} +static inline void mlx5_ib_odp_cleanup(void) {} +static inline void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) {} +static inline void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, + int flags) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ @@ -898,6 +908,8 @@ int mlx5_ib_set_vf_guid(struct ib_device *device, int vf, u8 port, __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, int index); +int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num, + int index, enum ib_gid_type *gid_type); /* GSI QP helper functions */ struct ib_qp *mlx5_ib_gsi_create_qp(struct ib_pd *pd, @@ -1001,4 +1013,17 @@ static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); } + +static inline int get_uars_per_sys_page(struct mlx5_ib_dev *dev, bool lib_support) +{ + return lib_support && MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_UARS_IN_PAGE : 1; +} + +static inline int get_num_uars(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) +{ + return get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * bfregi->num_sys_pages; +} + #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 8f608debe141..b8f9382a8b7d 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -46,14 +46,10 @@ enum { }; #define MLX5_UMR_ALIGN 2048 -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -static __be64 mlx5_ib_update_mtt_emergency_buffer[ - MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)] - __aligned(MLX5_UMR_ALIGN); -static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex); -#endif static int clean_mr(struct mlx5_ib_mr *mr); +static int use_umr(struct mlx5_ib_dev *dev, int order); +static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { @@ -134,6 +130,7 @@ static void reg_mr_callback(int status, void *context) return; } + mr->mmkey.type = MLX5_MKEY_MR; spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags); key = dev->mdev->priv.mkey_key++; spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags); @@ -153,6 +150,9 @@ static void reg_mr_callback(int status, void *context) if (err) pr_err("Error inserting to mkey tree. 0x%x\n", -err); write_unlock_irqrestore(&table->lock, flags); + + if (!completion_done(&ent->compl)) + complete(&ent->compl); } static int add_keys(struct mlx5_ib_dev *dev, int c, int num) @@ -161,7 +161,6 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) struct mlx5_cache_ent *ent = &cache->ent[c]; int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mr *mr; - int npages = 1 << ent->order; void *mkc; u32 *in; int err = 0; @@ -189,11 +188,11 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) MLX5_SET(mkc, mkc, free, 1); MLX5_SET(mkc, mkc, umr_en, 1); - MLX5_SET(mkc, mkc, access_mode, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode, ent->access_mode); MLX5_SET(mkc, mkc, qpn, 0xffffff); - MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2); - MLX5_SET(mkc, mkc, log_page_size, 12); + MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); + MLX5_SET(mkc, mkc, log_page_size, ent->page); spin_lock_irq(&ent->lock); ent->pending++; @@ -451,6 +450,42 @@ static void cache_work_func(struct work_struct *work) __cache_work_func(ent); } +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry) +{ + struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_cache_ent *ent; + struct mlx5_ib_mr *mr; + int err; + + if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) { + mlx5_ib_err(dev, "cache entry %d is out of range\n", entry); + return NULL; + } + + ent = &cache->ent[entry]; + while (1) { + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + + err = add_keys(dev, entry, 1); + if (err && err != -EAGAIN) + return ERR_PTR(err); + + wait_for_completion(&ent->compl); + } else { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, + list); + list_del(&mr->list); + ent->cur--; + spin_unlock_irq(&ent->lock); + if (ent->cur < ent->limit) + queue_work(cache->wq, &ent->work); + return mr; + } + } +} + static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) { struct mlx5_mr_cache *cache = &dev->cache; @@ -460,12 +495,12 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) int i; c = order2idx(dev, order); - if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) { + if (c < 0 || c > MAX_UMR_CACHE_ENTRY) { mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); return NULL; } - for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) { + for (i = c; i < MAX_UMR_CACHE_ENTRY; i++) { ent = &cache->ent[i]; mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); @@ -492,7 +527,7 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) return mr; } -static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; @@ -504,6 +539,10 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c); return; } + + if (unreg_umr(dev, mr)) + return; + ent = &cache->ent[c]; spin_lock_irq(&ent->lock); list_add_tail(&mr->list, &ent->head); @@ -606,7 +645,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; - int limit; int err; int i; @@ -619,25 +657,35 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { - INIT_LIST_HEAD(&cache->ent[i].head); - spin_lock_init(&cache->ent[i].lock); - ent = &cache->ent[i]; INIT_LIST_HEAD(&ent->head); spin_lock_init(&ent->lock); ent->order = i + 2; ent->dev = dev; + ent->limit = 0; - if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) && - (mlx5_core_is_pf(dev->mdev))) - limit = dev->mdev->profile->mr_cache[i].limit; - else - limit = 0; - + init_completion(&ent->compl); INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); - ent->limit = limit; queue_work(cache->wq, &ent->work); + + if (i > MAX_UMR_CACHE_ENTRY) { + mlx5_odp_init_mr_cache_entry(ent); + continue; + } + + if (!use_umr(dev, ent->order)) + continue; + + ent->page = PAGE_SHIFT; + ent->xlt = (1 << ent->order) * sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) && + mlx5_core_is_pf(dev->mdev)) + ent->limit = dev->mdev->profile->mr_cache[i].limit; + else + ent->limit = 0; } err = mlx5_mr_cache_debugfs_init(dev); @@ -732,6 +780,7 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) goto err_in; kfree(in); + mr->mmkey.type = MLX5_MKEY_MR; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; @@ -757,94 +806,13 @@ static int get_octo_len(u64 addr, u64 len, int page_size) return (npages + 1) / 2; } -static int use_umr(int order) +static int use_umr(struct mlx5_ib_dev *dev, int order) { + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + return order <= MAX_UMR_CACHE_ENTRY + 2; return order <= MLX5_MAX_UMR_SHIFT; } -static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, - int npages, int page_shift, int *size, - __be64 **mr_pas, dma_addr_t *dma) -{ - __be64 *pas; - struct device *ddev = dev->ib_dev.dma_device; - - /* - * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes. - * To avoid copying garbage after the pas array, we allocate - * a little more. - */ - *size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT); - *mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL); - if (!(*mr_pas)) - return -ENOMEM; - - pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN); - mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT); - /* Clear padding after the actual pages. */ - memset(pas + npages, 0, *size - npages * sizeof(u64)); - - *dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE); - if (dma_mapping_error(ddev, *dma)) { - kfree(*mr_pas); - return -ENOMEM; - } - - return 0; -} - -static void prep_umr_wqe_common(struct ib_pd *pd, struct ib_send_wr *wr, - struct ib_sge *sg, u64 dma, int n, u32 key, - int page_shift) -{ - struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_umr_wr *umrwr = umr_wr(wr); - - sg->addr = dma; - sg->length = ALIGN(sizeof(u64) * n, 64); - sg->lkey = dev->umrc.pd->local_dma_lkey; - - wr->next = NULL; - wr->sg_list = sg; - if (n) - wr->num_sge = 1; - else - wr->num_sge = 0; - - wr->opcode = MLX5_IB_WR_UMR; - - umrwr->npages = n; - umrwr->page_shift = page_shift; - umrwr->mkey = key; -} - -static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr, - struct ib_sge *sg, u64 dma, int n, u32 key, - int page_shift, u64 virt_addr, u64 len, - int access_flags) -{ - struct mlx5_umr_wr *umrwr = umr_wr(wr); - - prep_umr_wqe_common(pd, wr, sg, dma, n, key, page_shift); - - wr->send_flags = 0; - - umrwr->target.virt_addr = virt_addr; - umrwr->length = len; - umrwr->access_flags = access_flags; - umrwr->pd = pd; -} - -static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev, - struct ib_send_wr *wr, u32 key) -{ - struct mlx5_umr_wr *umrwr = umr_wr(wr); - - wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE; - wr->opcode = MLX5_IB_WR_UMR; - umrwr->mkey = key; -} - static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length, int access_flags, struct ib_umem **umem, int *npages, int *page_shift, int *ncont, @@ -891,21 +859,39 @@ static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context) init_completion(&context->done); } +static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, + struct mlx5_umr_wr *umrwr) +{ + struct umr_common *umrc = &dev->umrc; + struct ib_send_wr *bad; + int err; + struct mlx5_ib_umr_context umr_context; + + mlx5_ib_init_umr_context(&umr_context); + umrwr->wr.wr_cqe = &umr_context.cqe; + + down(&umrc->sem); + err = ib_post_send(umrc->qp, &umrwr->wr, &bad); + if (err) { + mlx5_ib_warn(dev, "UMR post send failed, err %d\n", err); + } else { + wait_for_completion(&umr_context.done); + if (umr_context.status != IB_WC_SUCCESS) { + mlx5_ib_warn(dev, "reg umr failed (%u)\n", + umr_context.status); + err = -EFAULT; + } + } + up(&umrc->sem); + return err; +} + static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr, u64 len, int npages, int page_shift, int order, int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct device *ddev = dev->ib_dev.dma_device; - struct umr_common *umrc = &dev->umrc; - struct mlx5_ib_umr_context umr_context; - struct mlx5_umr_wr umrwr = {}; - struct ib_send_wr *bad; struct mlx5_ib_mr *mr; - struct ib_sge sg; - int size; - __be64 *mr_pas; - dma_addr_t dma; int err = 0; int i; @@ -924,173 +910,180 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, if (!mr) return ERR_PTR(-EAGAIN); - err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas, - &dma); - if (err) - goto free_mr; - - mlx5_ib_init_umr_context(&umr_context); - - umrwr.wr.wr_cqe = &umr_context.cqe; - prep_umr_reg_wqe(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, - page_shift, virt_addr, len, access_flags); - - down(&umrc->sem); - err = ib_post_send(umrc->qp, &umrwr.wr, &bad); - if (err) { - mlx5_ib_warn(dev, "post send failed, err %d\n", err); - goto unmap_dma; - } else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed\n"); - err = -EFAULT; - } - } - + mr->ibmr.pd = pd; + mr->umem = umem; + mr->access_flags = access_flags; + mr->desc_size = sizeof(struct mlx5_mtt); mr->mmkey.iova = virt_addr; mr->mmkey.size = len; mr->mmkey.pd = to_mpd(pd)->pdn; - mr->live = 1; - -unmap_dma: - up(&umrc->sem); - dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); + err = mlx5_ib_update_xlt(mr, 0, npages, page_shift, + MLX5_IB_UPD_XLT_ENABLE); - kfree(mr_pas); - -free_mr: if (err) { - free_cached_mr(dev, mr); + mlx5_mr_cache_free(dev, mr); return ERR_PTR(err); } + mr->live = 1; + return mr; } -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages, - int zap) +static inline int populate_xlt(struct mlx5_ib_mr *mr, int idx, int npages, + void *xlt, int page_shift, size_t size, + int flags) { struct mlx5_ib_dev *dev = mr->dev; - struct device *ddev = dev->ib_dev.dma_device; - struct umr_common *umrc = &dev->umrc; - struct mlx5_ib_umr_context umr_context; struct ib_umem *umem = mr->umem; + if (flags & MLX5_IB_UPD_XLT_INDIRECT) { + mlx5_odp_populate_klm(xlt, idx, npages, mr, flags); + return npages; + } + + npages = min_t(size_t, npages, ib_umem_num_pages(umem) - idx); + + if (!(flags & MLX5_IB_UPD_XLT_ZAP)) { + __mlx5_ib_populate_pas(dev, umem, page_shift, + idx, npages, xlt, + MLX5_IB_MTT_PRESENT); + /* Clear padding after the pages + * brought from the umem. + */ + memset(xlt + (npages * sizeof(struct mlx5_mtt)), 0, + size - npages * sizeof(struct mlx5_mtt)); + } + + return npages; +} + +#define MLX5_MAX_UMR_CHUNK ((1 << (MLX5_MAX_UMR_SHIFT + 4)) - \ + MLX5_UMR_MTT_ALIGNMENT) +#define MLX5_SPARE_UMR_CHUNK 0x10000 + +int mlx5_ib_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, + int page_shift, int flags) +{ + struct mlx5_ib_dev *dev = mr->dev; + struct device *ddev = dev->ib_dev.dev.parent; + struct mlx5_ib_ucontext *uctx = NULL; int size; - __be64 *pas; + void *xlt; dma_addr_t dma; - struct ib_send_wr *bad; struct mlx5_umr_wr wr; struct ib_sge sg; int err = 0; - const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64); - const int page_index_mask = page_index_alignment - 1; + int desc_size = (flags & MLX5_IB_UPD_XLT_INDIRECT) + ? sizeof(struct mlx5_klm) + : sizeof(struct mlx5_mtt); + const int page_align = MLX5_UMR_MTT_ALIGNMENT / desc_size; + const int page_mask = page_align - 1; size_t pages_mapped = 0; size_t pages_to_map = 0; size_t pages_iter = 0; - int use_emergency_buf = 0; + gfp_t gfp; /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes, - * so we need to align the offset and length accordingly */ - if (start_page_index & page_index_mask) { - npages += start_page_index & page_index_mask; - start_page_index &= ~page_index_mask; + * so we need to align the offset and length accordingly + */ + if (idx & page_mask) { + npages += idx & page_mask; + idx &= ~page_mask; } - pages_to_map = ALIGN(npages, page_index_alignment); + gfp = flags & MLX5_IB_UPD_XLT_ATOMIC ? GFP_ATOMIC : GFP_KERNEL; + gfp |= __GFP_ZERO | __GFP_NOWARN; - if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES) - return -EINVAL; + pages_to_map = ALIGN(npages, page_align); + size = desc_size * pages_to_map; + size = min_t(int, size, MLX5_MAX_UMR_CHUNK); + + xlt = (void *)__get_free_pages(gfp, get_order(size)); + if (!xlt && size > MLX5_SPARE_UMR_CHUNK) { + mlx5_ib_dbg(dev, "Failed to allocate %d bytes of order %d. fallback to spare UMR allocation od %d bytes\n", + size, get_order(size), MLX5_SPARE_UMR_CHUNK); - size = sizeof(u64) * pages_to_map; - size = min_t(int, PAGE_SIZE, size); - /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim - * code, when we are called from an invalidation. The pas buffer must - * be 2k-aligned for Connect-IB. */ - pas = (__be64 *)get_zeroed_page(GFP_ATOMIC); - if (!pas) { - mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n"); - pas = mlx5_ib_update_mtt_emergency_buffer; - size = MLX5_UMR_MTT_MIN_CHUNK_SIZE; - use_emergency_buf = 1; - mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex); - memset(pas, 0, size); + size = MLX5_SPARE_UMR_CHUNK; + xlt = (void *)__get_free_pages(gfp, get_order(size)); } - pages_iter = size / sizeof(u64); - dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE); + + if (!xlt) { + uctx = to_mucontext(mr->ibmr.uobject->context); + mlx5_ib_warn(dev, "Using XLT emergency buffer\n"); + size = PAGE_SIZE; + xlt = (void *)uctx->upd_xlt_page; + mutex_lock(&uctx->upd_xlt_page_mutex); + memset(xlt, 0, size); + } + pages_iter = size / desc_size; + dma = dma_map_single(ddev, xlt, size, DMA_TO_DEVICE); if (dma_mapping_error(ddev, dma)) { - mlx5_ib_err(dev, "unable to map DMA during MTT update.\n"); + mlx5_ib_err(dev, "unable to map DMA during XLT update.\n"); err = -ENOMEM; - goto free_pas; + goto free_xlt; } + sg.addr = dma; + sg.lkey = dev->umrc.pd->local_dma_lkey; + + memset(&wr, 0, sizeof(wr)); + wr.wr.send_flags = MLX5_IB_SEND_UMR_UPDATE_XLT; + if (!(flags & MLX5_IB_UPD_XLT_ENABLE)) + wr.wr.send_flags |= MLX5_IB_SEND_UMR_FAIL_IF_FREE; + wr.wr.sg_list = &sg; + wr.wr.num_sge = 1; + wr.wr.opcode = MLX5_IB_WR_UMR; + + wr.pd = mr->ibmr.pd; + wr.mkey = mr->mmkey.key; + wr.length = mr->mmkey.size; + wr.virt_addr = mr->mmkey.iova; + wr.access_flags = mr->access_flags; + wr.page_shift = page_shift; + for (pages_mapped = 0; pages_mapped < pages_to_map && !err; - pages_mapped += pages_iter, start_page_index += pages_iter) { + pages_mapped += pages_iter, idx += pages_iter) { dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE); - - npages = min_t(size_t, - pages_iter, - ib_umem_num_pages(umem) - start_page_index); - - if (!zap) { - __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT, - start_page_index, npages, pas, - MLX5_IB_MTT_PRESENT); - /* Clear padding after the pages brought from the - * umem. */ - memset(pas + npages, 0, size - npages * sizeof(u64)); - } + npages = populate_xlt(mr, idx, pages_iter, xlt, + page_shift, size, flags); dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE); - mlx5_ib_init_umr_context(&umr_context); - - memset(&wr, 0, sizeof(wr)); - wr.wr.wr_cqe = &umr_context.cqe; - - sg.addr = dma; - sg.length = ALIGN(npages * sizeof(u64), - MLX5_UMR_MTT_ALIGNMENT); - sg.lkey = dev->umrc.pd->local_dma_lkey; + sg.length = ALIGN(npages * desc_size, + MLX5_UMR_MTT_ALIGNMENT); + + if (pages_mapped + pages_iter >= pages_to_map) { + if (flags & MLX5_IB_UPD_XLT_ENABLE) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_ENABLE_MR | + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS | + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + if (flags & MLX5_IB_UPD_XLT_PD || + flags & MLX5_IB_UPD_XLT_ACCESS) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; + if (flags & MLX5_IB_UPD_XLT_ADDR) + wr.wr.send_flags |= + MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; + } - wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE | - MLX5_IB_SEND_UMR_UPDATE_MTT; - wr.wr.sg_list = &sg; - wr.wr.num_sge = 1; - wr.wr.opcode = MLX5_IB_WR_UMR; - wr.npages = sg.length / sizeof(u64); - wr.page_shift = PAGE_SHIFT; - wr.mkey = mr->mmkey.key; - wr.target.offset = start_page_index; + wr.offset = idx * desc_size; + wr.xlt_size = sg.length; - down(&umrc->sem); - err = ib_post_send(umrc->qp, &wr.wr, &bad); - if (err) { - mlx5_ib_err(dev, "UMR post send failed, err %d\n", err); - } else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_err(dev, "UMR completion failed, code %d\n", - umr_context.status); - err = -EFAULT; - } - } - up(&umrc->sem); + err = mlx5_ib_post_send_wait(dev, &wr); } dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); -free_pas: - if (!use_emergency_buf) - free_page((unsigned long)pas); +free_xlt: + if (uctx) + mutex_unlock(&uctx->upd_xlt_page_mutex); else - mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex); + free_pages((unsigned long)xlt, get_order(size)); return err; } -#endif /* * If ibmr is NULL it will be allocated by reg_create. @@ -1122,8 +1115,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, goto err_1; } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - mlx5_ib_populate_pas(dev, umem, page_shift, pas, - pg_cap ? MLX5_IB_MTT_PRESENT : 0); + if (!(access_flags & IB_ACCESS_ON_DEMAND)) + mlx5_ib_populate_pas(dev, umem, page_shift, pas, + pg_cap ? MLX5_IB_MTT_PRESENT : 0); /* The pg_access bit allows setting the access flags * in the page list submitted with the command. */ @@ -1153,6 +1147,8 @@ static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd, mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; } + mr->mmkey.type = MLX5_MKEY_MR; + mr->desc_size = sizeof(struct mlx5_mtt); mr->umem = umem; mr->dev = dev; mr->live = 1; @@ -1198,20 +1194,33 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + if (!start && length == U64_MAX) { + if (!(access_flags & IB_ACCESS_ON_DEMAND) || + !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return ERR_PTR(-EINVAL); + + mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); + return &mr->ibmr; + } +#endif + err = mr_umem_get(pd, start, length, access_flags, &umem, &npages, &page_shift, &ncont, &order); if (err < 0) return ERR_PTR(err); - if (use_umr(order)) { + if (use_umr(dev, order)) { mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift, order, access_flags); if (PTR_ERR(mr) == -EAGAIN) { mlx5_ib_dbg(dev, "cache empty for order %d", order); mr = NULL; } - } else if (access_flags & IB_ACCESS_ON_DEMAND) { + } else if (access_flags & IB_ACCESS_ON_DEMAND && + !MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) { err = -EINVAL; pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB"); goto error; @@ -1248,106 +1257,39 @@ error: static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { struct mlx5_core_dev *mdev = dev->mdev; - struct umr_common *umrc = &dev->umrc; - struct mlx5_ib_umr_context umr_context; struct mlx5_umr_wr umrwr = {}; - struct ib_send_wr *bad; - int err; if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) return 0; - mlx5_ib_init_umr_context(&umr_context); + umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | + MLX5_IB_SEND_UMR_FAIL_IF_FREE; + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.mkey = mr->mmkey.key; - umrwr.wr.wr_cqe = &umr_context.cqe; - prep_umr_unreg_wqe(dev, &umrwr.wr, mr->mmkey.key); - - down(&umrc->sem); - err = ib_post_send(umrc->qp, &umrwr.wr, &bad); - if (err) { - up(&umrc->sem); - mlx5_ib_dbg(dev, "err %d\n", err); - goto error; - } else { - wait_for_completion(&umr_context.done); - up(&umrc->sem); - } - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "unreg umr failed\n"); - err = -EFAULT; - goto error; - } - return 0; - -error: - return err; + return mlx5_ib_post_send_wait(dev, &umrwr); } -static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr, - u64 length, int npages, int page_shift, int order, +static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, int access_flags, int flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct device *ddev = dev->ib_dev.dma_device; - struct mlx5_ib_umr_context umr_context; - struct ib_send_wr *bad; struct mlx5_umr_wr umrwr = {}; - struct ib_sge sg; - struct umr_common *umrc = &dev->umrc; - dma_addr_t dma = 0; - __be64 *mr_pas = NULL; - int size; int err; - mlx5_ib_init_umr_context(&umr_context); - - umrwr.wr.wr_cqe = &umr_context.cqe; umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE; - if (flags & IB_MR_REREG_TRANS) { - err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size, - &mr_pas, &dma); - if (err) - return err; - - umrwr.target.virt_addr = virt_addr; - umrwr.length = length; - umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION; - } - - prep_umr_wqe_common(pd, &umrwr.wr, &sg, dma, npages, mr->mmkey.key, - page_shift); + umrwr.wr.opcode = MLX5_IB_WR_UMR; + umrwr.mkey = mr->mmkey.key; - if (flags & IB_MR_REREG_PD) { + if (flags & IB_MR_REREG_PD || flags & IB_MR_REREG_ACCESS) { umrwr.pd = pd; - umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD; - } - - if (flags & IB_MR_REREG_ACCESS) { umrwr.access_flags = access_flags; - umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS; + umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; } - /* post send request to UMR QP */ - down(&umrc->sem); - err = ib_post_send(umrc->qp, &umrwr.wr, &bad); - - if (err) { - mlx5_ib_warn(dev, "post send failed, err %d\n", err); - } else { - wait_for_completion(&umr_context.done); - if (umr_context.status != IB_WC_SUCCESS) { - mlx5_ib_warn(dev, "reg umr failed (%u)\n", - umr_context.status); - err = -EFAULT; - } - } + err = mlx5_ib_post_send_wait(dev, &umrwr); - up(&umrc->sem); - if (flags & IB_MR_REREG_TRANS) { - dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE); - kfree(mr_pas); - } return err; } @@ -1364,6 +1306,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address; u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length; int page_shift = 0; + int upd_flags = 0; int npages = 0; int ncont = 0; int order = 0; @@ -1372,6 +1315,8 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n", start, virt_addr, length, access_flags); + atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); + if (flags != IB_MR_REREG_PD) { /* * Replace umem. This needs to be done whether or not UMR is @@ -1382,7 +1327,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, err = mr_umem_get(pd, addr, len, access_flags, &mr->umem, &npages, &page_shift, &ncont, &order); if (err < 0) { - mr->umem = NULL; + clean_mr(mr); return err; } } @@ -1414,32 +1359,37 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, /* * Send a UMR WQE */ - err = rereg_umr(pd, mr, addr, len, npages, page_shift, - order, access_flags, flags); + mr->ibmr.pd = pd; + mr->access_flags = access_flags; + mr->mmkey.iova = addr; + mr->mmkey.size = len; + mr->mmkey.pd = to_mpd(pd)->pdn; + + if (flags & IB_MR_REREG_TRANS) { + upd_flags = MLX5_IB_UPD_XLT_ADDR; + if (flags & IB_MR_REREG_PD) + upd_flags |= MLX5_IB_UPD_XLT_PD; + if (flags & IB_MR_REREG_ACCESS) + upd_flags |= MLX5_IB_UPD_XLT_ACCESS; + err = mlx5_ib_update_xlt(mr, 0, npages, page_shift, + upd_flags); + } else { + err = rereg_umr(pd, mr, access_flags, flags); + } + if (err) { mlx5_ib_warn(dev, "Failed to rereg UMR\n"); + ib_umem_release(mr->umem); + clean_mr(mr); return err; } } - if (flags & IB_MR_REREG_PD) { - ib_mr->pd = pd; - mr->mmkey.pd = to_mpd(pd)->pdn; - } + set_mr_fileds(dev, mr, npages, len, access_flags); - if (flags & IB_MR_REREG_ACCESS) - mr->access_flags = access_flags; - - if (flags & IB_MR_REREG_TRANS) { - atomic_sub(mr->npages, &dev->mdev->priv.reg_pages); - set_mr_fileds(dev, mr, npages, len, access_flags); - mr->mmkey.iova = addr; - mr->mmkey.size = len; - } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING update_odp_mr(mr); #endif - return 0; } @@ -1461,9 +1411,9 @@ mlx5_alloc_priv_descs(struct ib_device *device, mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); - mr->desc_map = dma_map_single(device->dma_device, mr->descs, + mr->desc_map = dma_map_single(device->dev.parent, mr->descs, size, DMA_TO_DEVICE); - if (dma_mapping_error(device->dma_device, mr->desc_map)) { + if (dma_mapping_error(device->dev.parent, mr->desc_map)) { ret = -ENOMEM; goto err; } @@ -1482,7 +1432,7 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; - dma_unmap_single(device->dma_device, mr->desc_map, + dma_unmap_single(device->dev.parent, mr->desc_map, size, DMA_TO_DEVICE); kfree(mr->descs_alloc); mr->descs = NULL; @@ -1518,12 +1468,7 @@ static int clean_mr(struct mlx5_ib_mr *mr) return err; } } else { - err = unreg_umr(dev, mr); - if (err) { - mlx5_ib_warn(dev, "failed unregister\n"); - return err; - } - free_cached_mr(dev, mr); + mlx5_mr_cache_free(dev, mr); } if (!umred) @@ -1546,8 +1491,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr) /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ - mlx5_ib_invalidate_range(umem, ib_umem_start(umem), - ib_umem_end(umem)); + if (umem->odp_data->page_list) + mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + ib_umem_end(umem)); + else + mlx5_ib_free_implicit_mr(mr); /* * We kill the umem before the MR for ODP, * so that there will not be any invalidations in @@ -1603,11 +1551,11 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, mr->access_mode = MLX5_MKC_ACCESS_MODE_MTT; MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); err = mlx5_alloc_priv_descs(pd->device, mr, - ndescs, sizeof(u64)); + ndescs, sizeof(struct mlx5_mtt)); if (err) goto err_free_in; - mr->desc_size = sizeof(u64); + mr->desc_size = sizeof(struct mlx5_mtt); mr->max_descs = ndescs; } else if (mr_type == IB_MR_TYPE_SG_GAPS) { mr->access_mode = MLX5_MKC_ACCESS_MODE_KLMS; @@ -1656,6 +1604,7 @@ struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, if (err) goto err_destroy_psv; + mr->mmkey.type = MLX5_MKEY_MR; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; mr->umem = NULL; @@ -1736,6 +1685,7 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, if (err) goto free; + mw->mmkey.type = MLX5_MKEY_MW; mw->ibmw.rkey = mw->mmkey.key; resp.response_length = min(offsetof(typeof(resp), response_length) + diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index cacb631a7b0a..d7b12f0750e2 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -34,6 +34,7 @@ #include <rdma/ib_umem_odp.h> #include "mlx5_ib.h" +#include "cmd.h" #define MAX_PREFETCH_LEN (4*1024*1024U) @@ -41,13 +42,146 @@ * a pagefault. */ #define MMU_NOTIFIER_TIMEOUT 1000 -struct workqueue_struct *mlx5_ib_page_fault_wq; +#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT) +#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT) +#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS) +#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT) +#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1)) + +#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT + +static u64 mlx5_imr_ksm_entries; + +static int check_parent(struct ib_umem_odp *odp, + struct mlx5_ib_mr *parent) +{ + struct mlx5_ib_mr *mr = odp->private; + + return mr && mr->parent == parent; +} + +static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) +{ + struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; + struct ib_ucontext *ctx = odp->umem->context; + struct rb_node *rb; + + down_read(&ctx->umem_rwsem); + while (1) { + rb = rb_next(&odp->interval_tree.rb); + if (!rb) + goto not_found; + odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); + if (check_parent(odp, parent)) + goto end; + } +not_found: + odp = NULL; +end: + up_read(&ctx->umem_rwsem); + return odp; +} + +static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, + u64 start, u64 length, + struct mlx5_ib_mr *parent) +{ + struct ib_umem_odp *odp; + struct rb_node *rb; + + down_read(&ctx->umem_rwsem); + odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); + if (!odp) + goto end; + + while (1) { + if (check_parent(odp, parent)) + goto end; + rb = rb_next(&odp->interval_tree.rb); + if (!rb) + goto not_found; + odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); + if (ib_umem_start(odp->umem) > start + length) + goto not_found; + } +not_found: + odp = NULL; +end: + up_read(&ctx->umem_rwsem); + return odp; +} + +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, + size_t nentries, struct mlx5_ib_mr *mr, int flags) +{ + struct ib_pd *pd = mr->ibmr.pd; + struct ib_ucontext *ctx = pd->uobject->context; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_odp *odp; + unsigned long va; + int i; + + if (flags & MLX5_IB_UPD_XLT_ZAP) { + for (i = 0; i < nentries; i++, pklm++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + pklm->key = cpu_to_be32(dev->null_mkey); + pklm->va = 0; + } + return; + } + + odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, + nentries * MLX5_IMR_MTT_SIZE, mr); + + for (i = 0; i < nentries; i++, pklm++) { + pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); + va = (offset + i) * MLX5_IMR_MTT_SIZE; + if (odp && odp->umem->address == va) { + struct mlx5_ib_mr *mtt = odp->private; + + pklm->key = cpu_to_be32(mtt->ibmr.lkey); + odp = odp_next(odp); + } else { + pklm->key = cpu_to_be32(dev->null_mkey); + } + mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", + i, va, be32_to_cpu(pklm->key)); + } +} + +static void mr_leaf_free_action(struct work_struct *work) +{ + struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); + int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; + struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; + + mr->parent = NULL; + synchronize_srcu(&mr->dev->mr_srcu); + + if (!READ_ONCE(odp->dying)) { + mr->parent = imr; + if (atomic_dec_and_test(&imr->num_leaf_free)) + wake_up(&imr->q_leaf_free); + return; + } + + ib_umem_release(odp->umem); + if (imr->live) + mlx5_ib_update_xlt(imr, idx, 1, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + mlx5_mr_cache_free(mr->dev, mr); + + if (atomic_dec_and_test(&imr->num_leaf_free)) + wake_up(&imr->q_leaf_free); +} void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, unsigned long end) { struct mlx5_ib_mr *mr; - const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1; + const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / + sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; int in_block = 0; u64 addr; @@ -90,16 +224,21 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, u64 umr_offset = idx & umr_block_mask; if (in_block && umr_offset == 0) { - mlx5_ib_update_mtt(mr, blk_start_idx, - idx - blk_start_idx, 1); + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); in_block = 0; } } } if (in_block) - mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1, - 1); - + mlx5_ib_update_xlt(mr, blk_start_idx, + idx - blk_start_idx + 1, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ATOMIC); /* * We are now sure that the device will not access the * memory. We can safely unmap it, and mark it as dirty if @@ -107,6 +246,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, */ ib_umem_odp_unmap_dma_pages(umem, start, end); + + if (unlikely(!umem->npages && mr->parent && + !umem->odp_data->dying)) { + WRITE_ONCE(umem->odp_data->dying, 1); + atomic_inc(&mr->parent->num_leaf_free); + schedule_work(&umem->odp_data->work); + } } void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) @@ -120,6 +266,11 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) caps->general_caps = IB_ODP_SUPPORT; + if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + dev->odp_max_size = U64_MAX; + else + dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT); + if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send)) caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; @@ -135,6 +286,14 @@ void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read)) caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; + if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic)) + caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; + + if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) && + MLX5_CAP_GEN(dev->mdev, null_mkey) && + MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) + caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT; + return; } @@ -143,151 +302,386 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, { u32 base_key = mlx5_base_mkey(key); struct mlx5_core_mkey *mmkey = __mlx5_mr_lookup(dev->mdev, base_key); - struct mlx5_ib_mr *mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + struct mlx5_ib_mr *mr; + + if (!mmkey || mmkey->key != key || mmkey->type != MLX5_MKEY_MR) + return NULL; - if (!mmkey || mmkey->key != key || !mr->live) + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + + if (!mr->live) return NULL; return container_of(mmkey, struct mlx5_ib_mr, mmkey); } -static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault, +static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, int error) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); - u32 qpn = qp->trans_qp.base.mqp.qpn; + int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ? + pfault->wqe.wq_num : pfault->token; int ret = mlx5_core_page_fault_resume(dev->mdev, - qpn, - pfault->mpfault.flags, + pfault->token, + wq_num, + pfault->type, error); if (ret) - pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn); + mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x\n", + wq_num); +} + +static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, + struct ib_umem *umem, + bool ksm, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr; + int err; + + mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : + MLX5_IMR_MTT_CACHE_ENTRY); + + if (IS_ERR(mr)) + return mr; + + mr->ibmr.pd = pd; + + mr->dev = dev; + mr->access_flags = access_flags; + mr->mmkey.iova = 0; + mr->umem = umem; + + if (ksm) { + err = mlx5_ib_update_xlt(mr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + + } else { + err = mlx5_ib_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE | + MLX5_IB_UPD_XLT_ATOMIC); + } + + if (err) + goto fail; + + mr->ibmr.lkey = mr->mmkey.key; + mr->ibmr.rkey = mr->mmkey.key; + + mr->live = 1; + + mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", + mr->mmkey.key, dev->mdev, mr); + + return mr; + +fail: + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); + mlx5_mr_cache_free(dev, mr); + + return ERR_PTR(err); +} + +static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, + u64 io_virt, size_t bcnt) +{ + struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); + struct ib_umem_odp *odp, *result = NULL; + u64 addr = io_virt & MLX5_IMR_MTT_MASK; + int nentries = 0, start_idx = 0, ret; + struct mlx5_ib_mr *mtt; + struct ib_umem *umem; + + mutex_lock(&mr->umem->odp_data->umem_mutex); + odp = odp_lookup(ctx, addr, 1, mr); + + mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", + io_virt, bcnt, addr, odp); + +next_mr: + if (likely(odp)) { + if (nentries) + nentries++; + } else { + umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + if (IS_ERR(umem)) { + mutex_unlock(&mr->umem->odp_data->umem_mutex); + return ERR_CAST(umem); + } + + mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); + if (IS_ERR(mtt)) { + mutex_unlock(&mr->umem->odp_data->umem_mutex); + ib_umem_release(umem); + return ERR_CAST(mtt); + } + + odp = umem->odp_data; + odp->private = mtt; + mtt->umem = umem; + mtt->mmkey.iova = addr; + mtt->parent = mr; + INIT_WORK(&odp->work, mr_leaf_free_action); + + if (!nentries) + start_idx = addr >> MLX5_IMR_MTT_SHIFT; + nentries++; + } + + odp->dying = 0; + + /* Return first odp if region not covered by single one */ + if (likely(!result)) + result = odp; + + addr += MLX5_IMR_MTT_SIZE; + if (unlikely(addr < io_virt + bcnt)) { + odp = odp_next(odp); + if (odp && odp->umem->address != addr) + odp = NULL; + goto next_mr; + } + + if (unlikely(nentries)) { + ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + if (ret) { + mlx5_ib_err(dev, "Failed to update PAS\n"); + result = ERR_PTR(ret); + } + } + + mutex_unlock(&mr->umem->odp_data->umem_mutex); + return result; +} + +struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, + int access_flags) +{ + struct ib_ucontext *ctx = pd->ibpd.uobject->context; + struct mlx5_ib_mr *imr; + struct ib_umem *umem; + + umem = ib_umem_get(ctx, 0, 0, IB_ACCESS_ON_DEMAND, 0); + if (IS_ERR(umem)) + return ERR_CAST(umem); + + imr = implicit_mr_alloc(&pd->ibpd, umem, 1, access_flags); + if (IS_ERR(imr)) { + ib_umem_release(umem); + return ERR_CAST(imr); + } + + imr->umem = umem; + init_waitqueue_head(&imr->q_leaf_free); + atomic_set(&imr->num_leaf_free, 0); + + return imr; +} + +static int mr_leaf_free(struct ib_umem *umem, u64 start, + u64 end, void *cookie) +{ + struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; + + if (mr->parent != imr) + return 0; + + ib_umem_odp_unmap_dma_pages(umem, + ib_umem_start(umem), + ib_umem_end(umem)); + + if (umem->odp_data->dying) + return 0; + + WRITE_ONCE(umem->odp_data->dying, 1); + atomic_inc(&imr->num_leaf_free); + schedule_work(&umem->odp_data->work); + + return 0; +} + +void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) +{ + struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; + + down_read(&ctx->umem_rwsem); + rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, + mr_leaf_free, imr); + up_read(&ctx->umem_rwsem); + + wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); } /* - * Handle a single data segment in a page-fault WQE. + * Handle a single data segment in a page-fault WQE or RDMA region. * - * Returns number of pages retrieved on success. The caller will continue to + * Returns number of pages retrieved on success. The caller may continue to * the next data segment. * Can return the following error codes: * -EAGAIN to designate a temporary error. The caller will abort handling the * page fault and resolve it. * -EFAULT when there's an error mapping the requested pages. The caller will - * abort the page fault handling and possibly move the QP to an error state. - * On other errors the QP should also be closed with an error. + * abort the page fault handling. */ -static int pagefault_single_data_segment(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault, +static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 key, u64 io_virt, size_t bcnt, + u32 *bytes_committed, u32 *bytes_mapped) { - struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device); int srcu_key; - unsigned int current_seq; + unsigned int current_seq = 0; u64 start_idx; int npages = 0, ret = 0; struct mlx5_ib_mr *mr; u64 access_mask = ODP_READ_ALLOWED_BIT; + struct ib_umem_odp *odp; + int implicit = 0; + size_t size; - srcu_key = srcu_read_lock(&mib_dev->mr_srcu); - mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key); + srcu_key = srcu_read_lock(&dev->mr_srcu); + mr = mlx5_ib_odp_find_mr_lkey(dev, key); /* * If we didn't find the MR, it means the MR was closed while we were * handling the ODP event. In this case we return -EFAULT so that the * QP will be closed. */ if (!mr || !mr->ibmr.pd) { - pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", - key); + mlx5_ib_dbg(dev, "Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n", + key); ret = -EFAULT; goto srcu_unlock; } if (!mr->umem->odp_data) { - pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); + mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); if (bytes_mapped) *bytes_mapped += - (bcnt - pfault->mpfault.bytes_committed); + (bcnt - *bytes_committed); goto srcu_unlock; } - if (mr->ibmr.pd != qp->ibqp.pd) { - pr_err("Page-fault with different PDs for QP and MR.\n"); - ret = -EFAULT; - goto srcu_unlock; + + /* + * Avoid branches - this code will perform correctly + * in all iterations (in iteration 2 and above, + * bytes_committed == 0). + */ + io_virt += *bytes_committed; + bcnt -= *bytes_committed; + + if (!mr->umem->odp_data->page_list) { + odp = implicit_mr_get_data(mr, io_virt, bcnt); + + if (IS_ERR(odp)) { + ret = PTR_ERR(odp); + goto srcu_unlock; + } + mr = odp->private; + implicit = 1; + + } else { + odp = mr->umem->odp_data; } - current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq); +next_mr: + current_seq = READ_ONCE(odp->notifiers_seq); /* * Ensure the sequence number is valid for some time before we call * gup. */ smp_rmb(); - /* - * Avoid branches - this code will perform correctly - * in all iterations (in iteration 2 and above, - * bytes_committed == 0). - */ - io_virt += pfault->mpfault.bytes_committed; - bcnt -= pfault->mpfault.bytes_committed; - + size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); start_idx = (io_virt - (mr->mmkey.iova & PAGE_MASK)) >> PAGE_SHIFT; if (mr->umem->writable) access_mask |= ODP_WRITE_ALLOWED_BIT; - npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt, - access_mask, current_seq); - if (npages < 0) { - ret = npages; + + ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, + access_mask, current_seq); + + if (ret < 0) goto srcu_unlock; - } - if (npages > 0) { - mutex_lock(&mr->umem->odp_data->umem_mutex); + if (ret > 0) { + int np = ret; + + mutex_lock(&odp->umem_mutex); if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already * checks this. */ - ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0); + ret = mlx5_ib_update_xlt(mr, start_idx, np, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ATOMIC); } else { ret = -EAGAIN; } - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp->umem_mutex); if (ret < 0) { if (ret != -EAGAIN) - pr_err("Failed to update mkey page tables\n"); + mlx5_ib_err(dev, "Failed to update mkey page tables\n"); goto srcu_unlock; } if (bytes_mapped) { - u32 new_mappings = npages * PAGE_SIZE - + u32 new_mappings = np * PAGE_SIZE - (io_virt - round_down(io_virt, PAGE_SIZE)); - *bytes_mapped += min_t(u32, new_mappings, bcnt); + *bytes_mapped += min_t(u32, new_mappings, size); } + + npages += np; + } + + bcnt -= size; + if (unlikely(bcnt)) { + struct ib_umem_odp *next; + + io_virt += size; + next = odp_next(odp); + if (unlikely(!next || next->umem->address != io_virt)) { + mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", + io_virt, next); + ret = -EAGAIN; + goto srcu_unlock_no_wait; + } + odp = next; + mr = odp->private; + goto next_mr; } srcu_unlock: if (ret == -EAGAIN) { - if (!mr->umem->odp_data->dying) { - struct ib_umem_odp *odp_data = mr->umem->odp_data; + if (implicit || !odp->dying) { unsigned long timeout = msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT); if (!wait_for_completion_timeout( - &odp_data->notifier_completion, + &odp->notifier_completion, timeout)) { - pr_warn("timeout waiting for mmu notifier completion\n"); + mlx5_ib_warn(dev, "timeout waiting for mmu notifier. seq %d against %d\n", + current_seq, odp->notifiers_seq); } } else { /* The MR is being killed, kill the QP as well. */ ret = -EFAULT; } } - srcu_read_unlock(&mib_dev->mr_srcu, srcu_key); - pfault->mpfault.bytes_committed = 0; + +srcu_unlock_no_wait: + srcu_read_unlock(&dev->mr_srcu, srcu_key); + *bytes_committed = 0; return ret ? ret : npages; } @@ -309,8 +703,9 @@ srcu_unlock: * Returns the number of pages loaded if positive, zero for an empty WQE, or a * negative error code. */ -static int pagefault_data_segments(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault, void *wqe, +static int pagefault_data_segments(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void *wqe, void *wqe_end, u32 *bytes_mapped, u32 *total_wqe_bytes, int receive_queue) { @@ -354,22 +749,23 @@ static int pagefault_data_segments(struct mlx5_ib_qp *qp, if (!inline_segment && total_wqe_bytes) { *total_wqe_bytes += bcnt - min_t(size_t, bcnt, - pfault->mpfault.bytes_committed); + pfault->bytes_committed); } /* A zero length data segment designates a length of 2GB. */ if (bcnt == 0) bcnt = 1U << 31; - if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) { - pfault->mpfault.bytes_committed -= + if (inline_segment || bcnt <= pfault->bytes_committed) { + pfault->bytes_committed -= min_t(size_t, bcnt, - pfault->mpfault.bytes_committed); + pfault->bytes_committed); continue; } - ret = pagefault_single_data_segment(qp, pfault, key, io_virt, - bcnt, bytes_mapped); + ret = pagefault_single_data_segment(dev, key, io_virt, bcnt, + &pfault->bytes_committed, + bytes_mapped); if (ret < 0) break; npages += ret; @@ -378,17 +774,29 @@ static int pagefault_data_segments(struct mlx5_ib_qp *qp, return ret < 0 ? ret : npages; } +static const u32 mlx5_ib_odp_opcode_cap[] = { + [MLX5_OPCODE_SEND] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_SEND_IMM] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_SEND_INVAL] = IB_ODP_SUPPORT_SEND, + [MLX5_OPCODE_RDMA_WRITE] = IB_ODP_SUPPORT_WRITE, + [MLX5_OPCODE_RDMA_WRITE_IMM] = IB_ODP_SUPPORT_WRITE, + [MLX5_OPCODE_RDMA_READ] = IB_ODP_SUPPORT_READ, + [MLX5_OPCODE_ATOMIC_CS] = IB_ODP_SUPPORT_ATOMIC, + [MLX5_OPCODE_ATOMIC_FA] = IB_ODP_SUPPORT_ATOMIC, +}; + /* * Parse initiator WQE. Advances the wqe pointer to point at the * scatter-gather list, and set wqe_end to the end of the WQE. */ static int mlx5_ib_mr_initiator_pfault_handler( - struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, - void **wqe, void **wqe_end, int wqe_length) + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); struct mlx5_wqe_ctrl_seg *ctrl = *wqe; - u16 wqe_index = pfault->mpfault.wqe.wqe_index; + u16 wqe_index = pfault->wqe.wqe_index; + u32 transport_caps; + struct mlx5_base_av *av; unsigned ds, opcode; #if defined(DEBUG) u32 ctrl_wqe_index, ctrl_qpn; @@ -434,53 +842,49 @@ static int mlx5_ib_mr_initiator_pfault_handler( opcode = be32_to_cpu(ctrl->opmod_idx_opcode) & MLX5_WQE_CTRL_OPCODE_MASK; + switch (qp->ibqp.qp_type) { case IB_QPT_RC: - switch (opcode) { - case MLX5_OPCODE_SEND: - case MLX5_OPCODE_SEND_IMM: - case MLX5_OPCODE_SEND_INVAL: - if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & - IB_ODP_SUPPORT_SEND)) - goto invalid_transport_or_opcode; - break; - case MLX5_OPCODE_RDMA_WRITE: - case MLX5_OPCODE_RDMA_WRITE_IMM: - if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & - IB_ODP_SUPPORT_WRITE)) - goto invalid_transport_or_opcode; - *wqe += sizeof(struct mlx5_wqe_raddr_seg); - break; - case MLX5_OPCODE_RDMA_READ: - if (!(dev->odp_caps.per_transport_caps.rc_odp_caps & - IB_ODP_SUPPORT_READ)) - goto invalid_transport_or_opcode; - *wqe += sizeof(struct mlx5_wqe_raddr_seg); - break; - default: - goto invalid_transport_or_opcode; - } + transport_caps = dev->odp_caps.per_transport_caps.rc_odp_caps; break; case IB_QPT_UD: - switch (opcode) { - case MLX5_OPCODE_SEND: - case MLX5_OPCODE_SEND_IMM: - if (!(dev->odp_caps.per_transport_caps.ud_odp_caps & - IB_ODP_SUPPORT_SEND)) - goto invalid_transport_or_opcode; - *wqe += sizeof(struct mlx5_wqe_datagram_seg); - break; - default: - goto invalid_transport_or_opcode; - } + transport_caps = dev->odp_caps.per_transport_caps.ud_odp_caps; break; default: -invalid_transport_or_opcode: - mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n", - qp->ibqp.qp_type, opcode); + mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport 0x%x\n", + qp->ibqp.qp_type); + return -EFAULT; + } + + if (unlikely(opcode >= sizeof(mlx5_ib_odp_opcode_cap) / + sizeof(mlx5_ib_odp_opcode_cap[0]) || + !(transport_caps & mlx5_ib_odp_opcode_cap[opcode]))) { + mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode 0x%x\n", + opcode); return -EFAULT; } + if (qp->ibqp.qp_type != IB_QPT_RC) { + av = *wqe; + if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT)) + *wqe += sizeof(struct mlx5_av); + else + *wqe += sizeof(struct mlx5_base_av); + } + + switch (opcode) { + case MLX5_OPCODE_RDMA_WRITE: + case MLX5_OPCODE_RDMA_WRITE_IMM: + case MLX5_OPCODE_RDMA_READ: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + break; + case MLX5_OPCODE_ATOMIC_CS: + case MLX5_OPCODE_ATOMIC_FA: + *wqe += sizeof(struct mlx5_wqe_raddr_seg); + *wqe += sizeof(struct mlx5_wqe_atomic_seg); + break; + } + return 0; } @@ -489,10 +893,9 @@ invalid_transport_or_opcode: * scatter-gather list, and set wqe_end to the end of the WQE. */ static int mlx5_ib_mr_responder_pfault_handler( - struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, - void **wqe, void **wqe_end, int wqe_length) + struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, + struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); struct mlx5_ib_wq *wq = &qp->rq; int wqe_size = 1 << wq->wqe_shift; @@ -529,70 +932,83 @@ invalid_transport_or_opcode: return 0; } -static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault) +static struct mlx5_ib_qp *mlx5_ib_odp_find_qp(struct mlx5_ib_dev *dev, + u32 wq_num) +{ + struct mlx5_core_qp *mqp = __mlx5_qp_lookup(dev->mdev, wq_num); + + if (!mqp) { + mlx5_ib_err(dev, "QPN 0x%6x not found\n", wq_num); + return NULL; + } + + return to_mibqp(mqp); +} + +static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) { - struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); int ret; void *wqe, *wqe_end; u32 bytes_mapped, total_wqe_bytes; char *buffer = NULL; - int resume_with_error = 0; - u16 wqe_index = pfault->mpfault.wqe.wqe_index; - int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; - u32 qpn = qp->trans_qp.base.mqp.qpn; + int resume_with_error = 1; + u16 wqe_index = pfault->wqe.wqe_index; + int requestor = pfault->type & MLX5_PFAULT_REQUESTOR; + struct mlx5_ib_qp *qp; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) { mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n"); - resume_with_error = 1; goto resolve_page_fault; } + qp = mlx5_ib_odp_find_qp(dev, pfault->wqe.wq_num); + if (!qp) + goto resolve_page_fault; + ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, PAGE_SIZE, &qp->trans_qp.base); if (ret < 0) { - mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", - -ret, wqe_index, qpn); - resume_with_error = 1; + mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n", + ret, wqe_index, pfault->token); goto resolve_page_fault; } wqe = buffer; if (requestor) - ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe, + ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp, &wqe, &wqe_end, ret); else - ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe, + ret = mlx5_ib_mr_responder_pfault_handler(dev, pfault, qp, &wqe, &wqe_end, ret); - if (ret < 0) { - resume_with_error = 1; + if (ret < 0) goto resolve_page_fault; - } if (wqe >= wqe_end) { mlx5_ib_err(dev, "ODP fault on invalid WQE.\n"); - resume_with_error = 1; goto resolve_page_fault; } - ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped, - &total_wqe_bytes, !requestor); + ret = pagefault_data_segments(dev, pfault, qp, wqe, wqe_end, + &bytes_mapped, &total_wqe_bytes, + !requestor); if (ret == -EAGAIN) { + resume_with_error = 0; goto resolve_page_fault; } else if (ret < 0 || total_wqe_bytes > bytes_mapped) { - mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n", - -ret); - resume_with_error = 1; + if (ret != -ENOENT) + mlx5_ib_err(dev, "PAGE FAULT error: %d. QP 0x%x. type: 0x%x\n", + ret, pfault->wqe.wq_num, pfault->type); goto resolve_page_fault; } + resume_with_error = 0; resolve_page_fault: - mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); - mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n", - qpn, resume_with_error, - pfault->mpfault.flags); - + mlx5_ib_page_fault_resume(dev, pfault, resume_with_error); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n", + pfault->wqe.wq_num, resume_with_error, + pfault->type); free_page((unsigned long)buffer); } @@ -602,15 +1018,14 @@ static int pages_in_range(u64 address, u32 length) (address & PAGE_MASK)) >> PAGE_SHIFT; } -static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault) +static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, + struct mlx5_pagefault *pfault) { - struct mlx5_pagefault *mpfault = &pfault->mpfault; u64 address; u32 length; - u32 prefetch_len = mpfault->bytes_committed; + u32 prefetch_len = pfault->bytes_committed; int prefetch_activated = 0; - u32 rkey = mpfault->rdma.r_key; + u32 rkey = pfault->rdma.r_key; int ret; /* The RDMA responder handler handles the page fault in two parts. @@ -619,38 +1034,40 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, * prefetches more pages. The second operation cannot use the pfault * context and therefore uses the dummy_pfault context allocated on * the stack */ - struct mlx5_ib_pfault dummy_pfault = {}; - - dummy_pfault.mpfault.bytes_committed = 0; - - mpfault->rdma.rdma_va += mpfault->bytes_committed; - mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed, - mpfault->rdma.rdma_op_len); - mpfault->bytes_committed = 0; + pfault->rdma.rdma_va += pfault->bytes_committed; + pfault->rdma.rdma_op_len -= min(pfault->bytes_committed, + pfault->rdma.rdma_op_len); + pfault->bytes_committed = 0; - address = mpfault->rdma.rdma_va; - length = mpfault->rdma.rdma_op_len; + address = pfault->rdma.rdma_va; + length = pfault->rdma.rdma_op_len; /* For some operations, the hardware cannot tell the exact message * length, and in those cases it reports zero. Use prefetch * logic. */ if (length == 0) { prefetch_activated = 1; - length = mpfault->rdma.packet_size; + length = pfault->rdma.packet_size; prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len); } - ret = pagefault_single_data_segment(qp, pfault, rkey, address, length, - NULL); + ret = pagefault_single_data_segment(dev, rkey, address, length, + &pfault->bytes_committed, NULL); if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; } else if (ret < 0 || pages_in_range(address, length) > ret) { - mlx5_ib_page_fault_resume(qp, pfault, 1); + mlx5_ib_page_fault_resume(dev, pfault, 1); + if (ret != -ENOENT) + mlx5_ib_warn(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n", + ret, pfault->token, pfault->type); return; } - mlx5_ib_page_fault_resume(qp, pfault, 0); + mlx5_ib_page_fault_resume(dev, pfault, 0); + mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n", + pfault->token, pfault->type, + prefetch_activated); /* At this point, there might be a new pagefault already arriving in * the eq, switch to the dummy pagefault for the rest of the @@ -658,139 +1075,93 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp, * work-queue is being fenced. */ if (prefetch_activated) { - ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey, - address, + u32 bytes_committed = 0; + + ret = pagefault_single_data_segment(dev, rkey, address, prefetch_len, - NULL); - if (ret < 0) { - pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n", - ret, prefetch_activated, - qp->ibqp.qp_num, address, prefetch_len); + &bytes_committed, NULL); + if (ret < 0 && ret != -EAGAIN) { + mlx5_ib_warn(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", + ret, pfault->token, address, prefetch_len); } } } -void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp, - struct mlx5_ib_pfault *pfault) +void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, + struct mlx5_pagefault *pfault) { - u8 event_subtype = pfault->mpfault.event_subtype; + struct mlx5_ib_dev *dev = context; + u8 event_subtype = pfault->event_subtype; switch (event_subtype) { case MLX5_PFAULT_SUBTYPE_WQE: - mlx5_ib_mr_wqe_pfault_handler(qp, pfault); + mlx5_ib_mr_wqe_pfault_handler(dev, pfault); break; case MLX5_PFAULT_SUBTYPE_RDMA: - mlx5_ib_mr_rdma_pfault_handler(qp, pfault); + mlx5_ib_mr_rdma_pfault_handler(dev, pfault); break; default: - pr_warn("Invalid page fault event subtype: 0x%x\n", - event_subtype); - mlx5_ib_page_fault_resume(qp, pfault, 1); - break; + mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n", + event_subtype); + mlx5_ib_page_fault_resume(dev, pfault, 1); } } -static void mlx5_ib_qp_pfault_action(struct work_struct *work) +void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent) { - struct mlx5_ib_pfault *pfault = container_of(work, - struct mlx5_ib_pfault, - work); - enum mlx5_ib_pagefault_context context = - mlx5_ib_get_pagefault_context(&pfault->mpfault); - struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp, - pagefaults[context]); - mlx5_ib_mr_pfault_handler(qp, pfault); -} - -void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) -{ - unsigned long flags; - - spin_lock_irqsave(&qp->disable_page_faults_lock, flags); - qp->disable_page_faults = 1; - spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); - - /* - * Note that at this point, we are guarenteed that no more - * work queue elements will be posted to the work queue with - * the QP we are closing. - */ - flush_workqueue(mlx5_ib_page_fault_wq); -} - -void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) -{ - unsigned long flags; - - spin_lock_irqsave(&qp->disable_page_faults_lock, flags); - qp->disable_page_faults = 0; - spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags); -} - -static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp, - struct mlx5_pagefault *pfault) -{ - /* - * Note that we will only get one fault event per QP per context - * (responder/initiator, read/write), until we resolve the page fault - * with the mlx5_ib_page_fault_resume command. Since this function is - * called from within the work element, there is no risk of missing - * events. - */ - struct mlx5_ib_qp *mibqp = to_mibqp(qp); - enum mlx5_ib_pagefault_context context = - mlx5_ib_get_pagefault_context(pfault); - struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context]; - - qp_pfault->mpfault = *pfault; - - /* No need to stop interrupts here since we are in an interrupt */ - spin_lock(&mibqp->disable_page_faults_lock); - if (!mibqp->disable_page_faults) - queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work); - spin_unlock(&mibqp->disable_page_faults_lock); -} - -void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) -{ - int i; - - qp->disable_page_faults = 1; - spin_lock_init(&qp->disable_page_faults_lock); + if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) + return; - qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler; + switch (ent->order - 2) { + case MLX5_IMR_MTT_CACHE_ENTRY: + ent->page = PAGE_SHIFT; + ent->xlt = MLX5_IMR_MTT_ENTRIES * + sizeof(struct mlx5_mtt) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; + ent->limit = 0; + break; - for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) - INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); + case MLX5_IMR_KSM_CACHE_ENTRY: + ent->page = MLX5_KSM_PAGE_SHIFT; + ent->xlt = mlx5_imr_ksm_entries * + sizeof(struct mlx5_klm) / + MLX5_IB_UMR_OCTOWORD; + ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM; + ent->limit = 0; + break; + } } -int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) +int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev) { int ret; - ret = init_srcu_struct(&ibdev->mr_srcu); + ret = init_srcu_struct(&dev->mr_srcu); if (ret) return ret; + if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) { + ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey); + if (ret) { + mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret); + return ret; + } + } + return 0; } -void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) +void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *dev) { - cleanup_srcu_struct(&ibdev->mr_srcu); + cleanup_srcu_struct(&dev->mr_srcu); } -int __init mlx5_ib_odp_init(void) +int mlx5_ib_odp_init(void) { - mlx5_ib_page_fault_wq = alloc_ordered_workqueue("mlx5_ib_page_faults", - WQ_MEM_RECLAIM); - if (!mlx5_ib_page_fault_wq) - return -ENOMEM; + mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) - + MLX5_IMR_MTT_BITS); return 0; } -void mlx5_ib_odp_cleanup(void) -{ - destroy_workqueue(mlx5_ib_page_fault_wq); -} diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a1b3125f0a6e..ad8a2638e339 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -475,60 +475,53 @@ static int qp_has_rq(struct ib_qp_init_attr *attr) return 1; } -static int first_med_uuar(void) +static int first_med_bfreg(void) { return 1; } -static int next_uuar(int n) -{ - n++; - - while (((n % 4) & 2)) - n++; +enum { + /* this is the first blue flame register in the array of bfregs assigned + * to a processes. Since we do not use it for blue flame but rather + * regular 64 bit doorbells, we do not need a lock for maintaiing + * "odd/even" order + */ + NUM_NON_BLUE_FLAME_BFREGS = 1, +}; - return n; +static int max_bfregs(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi) +{ + return get_num_uars(dev, bfregi) * MLX5_NON_FP_BFREGS_PER_UAR; } -static int num_med_uuar(struct mlx5_uuar_info *uuari) +static int num_med_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) { int n; - n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE - - uuari->num_low_latency_uuars - 1; + n = max_bfregs(dev, bfregi) - bfregi->num_low_latency_bfregs - + NUM_NON_BLUE_FLAME_BFREGS; return n >= 0 ? n : 0; } -static int max_uuari(struct mlx5_uuar_info *uuari) -{ - return uuari->num_uars * 4; -} - -static int first_hi_uuar(struct mlx5_uuar_info *uuari) +static int first_hi_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) { int med; - int i; - int t; - - med = num_med_uuar(uuari); - for (t = 0, i = first_med_uuar();; i = next_uuar(i)) { - t++; - if (t == med) - return next_uuar(i); - } - return 0; + med = num_med_bfreg(dev, bfregi); + return ++med; } -static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) +static int alloc_high_class_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) { int i; - for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) { - if (!test_bit(i, uuari->bitmap)) { - set_bit(i, uuari->bitmap); - uuari->count[i]++; + for (i = first_hi_bfreg(dev, bfregi); i < max_bfregs(dev, bfregi); i++) { + if (!bfregi->count[i]) { + bfregi->count[i]++; return i; } } @@ -536,87 +529,61 @@ static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari) return -ENOMEM; } -static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari) +static int alloc_med_class_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi) { - int minidx = first_med_uuar(); + int minidx = first_med_bfreg(); int i; - for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) { - if (uuari->count[i] < uuari->count[minidx]) + for (i = first_med_bfreg(); i < first_hi_bfreg(dev, bfregi); i++) { + if (bfregi->count[i] < bfregi->count[minidx]) minidx = i; + if (!bfregi->count[minidx]) + break; } - uuari->count[minidx]++; + bfregi->count[minidx]++; return minidx; } -static int alloc_uuar(struct mlx5_uuar_info *uuari, - enum mlx5_ib_latency_class lat) +static int alloc_bfreg(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, + enum mlx5_ib_latency_class lat) { - int uuarn = -EINVAL; + int bfregn = -EINVAL; - mutex_lock(&uuari->lock); + mutex_lock(&bfregi->lock); switch (lat) { case MLX5_IB_LATENCY_CLASS_LOW: - uuarn = 0; - uuari->count[uuarn]++; + BUILD_BUG_ON(NUM_NON_BLUE_FLAME_BFREGS != 1); + bfregn = 0; + bfregi->count[bfregn]++; break; case MLX5_IB_LATENCY_CLASS_MEDIUM: - if (uuari->ver < 2) - uuarn = -ENOMEM; + if (bfregi->ver < 2) + bfregn = -ENOMEM; else - uuarn = alloc_med_class_uuar(uuari); + bfregn = alloc_med_class_bfreg(dev, bfregi); break; case MLX5_IB_LATENCY_CLASS_HIGH: - if (uuari->ver < 2) - uuarn = -ENOMEM; + if (bfregi->ver < 2) + bfregn = -ENOMEM; else - uuarn = alloc_high_class_uuar(uuari); - break; - - case MLX5_IB_LATENCY_CLASS_FAST_PATH: - uuarn = 2; + bfregn = alloc_high_class_bfreg(dev, bfregi); break; } - mutex_unlock(&uuari->lock); + mutex_unlock(&bfregi->lock); - return uuarn; + return bfregn; } -static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) +static void free_bfreg(struct mlx5_ib_dev *dev, struct mlx5_bfreg_info *bfregi, int bfregn) { - clear_bit(uuarn, uuari->bitmap); - --uuari->count[uuarn]; -} - -static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn) -{ - clear_bit(uuarn, uuari->bitmap); - --uuari->count[uuarn]; -} - -static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn) -{ - int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE; - int high_uuar = nuuars - uuari->num_low_latency_uuars; - - mutex_lock(&uuari->lock); - if (uuarn == 0) { - --uuari->count[uuarn]; - goto out; - } - - if (uuarn < high_uuar) { - free_med_class_uuar(uuari, uuarn); - goto out; - } - - free_high_class_uuar(uuari, uuarn); - -out: - mutex_unlock(&uuari->lock); + mutex_lock(&bfregi->lock); + bfregi->count[bfregn]--; + mutex_unlock(&bfregi->lock); } static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state) @@ -657,9 +624,20 @@ static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq); -static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) +static int bfregn_to_uar_index(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, int bfregn) { - return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; + int bfregs_per_sys_page; + int index_of_sys_page; + int offset; + + bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * + MLX5_NON_FP_BFREGS_PER_UAR; + index_of_sys_page = bfregn / bfregs_per_sys_page; + + offset = bfregn % bfregs_per_sys_page / MLX5_NON_FP_BFREGS_PER_UAR; + + return bfregi->sys_pages[index_of_sys_page] + offset; } static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, @@ -762,6 +740,13 @@ err_umem: return err; } +static int adjust_bfregn(struct mlx5_ib_dev *dev, + struct mlx5_bfreg_info *bfregi, int bfregn) +{ + return bfregn / MLX5_NON_FP_BFREGS_PER_UAR * MLX5_BFREGS_PER_UAR + + bfregn % MLX5_NON_FP_BFREGS_PER_UAR; +} + static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, struct ib_qp_init_attr *attr, @@ -776,7 +761,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, int uar_index; int npages; u32 offset = 0; - int uuarn; + int bfregn; int ncont = 0; __be64 *pas; void *qpc; @@ -794,27 +779,27 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, */ if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) /* In CROSS_CHANNEL CQ and QP must use the same UAR */ - uuarn = MLX5_CROSS_CHANNEL_UUAR; + bfregn = MLX5_CROSS_CHANNEL_BFREG; else { - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); - if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_HIGH); + if (bfregn < 0) { + mlx5_ib_dbg(dev, "failed to allocate low latency BFREG\n"); mlx5_ib_dbg(dev, "reverting to medium latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); - if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (bfregn < 0) { + mlx5_ib_dbg(dev, "failed to allocate medium latency BFREG\n"); mlx5_ib_dbg(dev, "reverting to high latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); - if (uuarn < 0) { - mlx5_ib_warn(dev, "uuar allocation failed\n"); - return uuarn; + bfregn = alloc_bfreg(dev, &context->bfregi, MLX5_IB_LATENCY_CLASS_LOW); + if (bfregn < 0) { + mlx5_ib_warn(dev, "bfreg allocation failed\n"); + return bfregn; } } } } - uar_index = uuarn_to_uar_index(&context->uuari, uuarn); - mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index); + uar_index = bfregn_to_uar_index(dev, &context->bfregi, bfregn); + mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); qp->rq.offset = 0; qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); @@ -822,7 +807,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = set_user_buf_size(dev, qp, &ucmd, base, attr); if (err) - goto err_uuar; + goto err_bfreg; if (ucmd.buf_addr && ubuffer->buf_size) { ubuffer->buf_addr = ucmd.buf_addr; @@ -831,7 +816,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, &ubuffer->umem, &npages, &page_shift, &ncont, &offset); if (err) - goto err_uuar; + goto err_bfreg; } else { ubuffer->umem = NULL; } @@ -854,8 +839,8 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_SET(qpc, qpc, page_offset, offset); MLX5_SET(qpc, qpc, uar_page, uar_index); - resp->uuar_index = uuarn; - qp->uuarn = uuarn; + resp->bfreg_index = adjust_bfregn(dev, &context->bfregi, bfregn); + qp->bfregn = bfregn; err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db); if (err) { @@ -882,13 +867,13 @@ err_umem: if (ubuffer->umem) ib_umem_release(ubuffer->umem); -err_uuar: - free_uuar(&context->uuari, uuarn); +err_bfreg: + free_bfreg(dev, &context->bfregi, bfregn); return err; } -static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp, - struct mlx5_ib_qp_base *base) +static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd, + struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; @@ -896,7 +881,7 @@ static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp, mlx5_ib_db_unmap_user(context, &qp->db); if (base->ubuffer.umem) ib_umem_release(base->ubuffer.umem); - free_uuar(&context->uuari, qp->uuarn); + free_bfreg(dev, &context->bfregi, qp->bfregn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, @@ -905,14 +890,10 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, u32 **in, int *inlen, struct mlx5_ib_qp_base *base) { - enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; - struct mlx5_uuar_info *uuari; int uar_index; void *qpc; - int uuarn; int err; - uuari = &dev->mdev->priv.uuari; if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | IB_QP_CREATE_IPOIB_UD_LSO | @@ -920,21 +901,20 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, return -EINVAL; if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR) - lc = MLX5_IB_LATENCY_CLASS_FAST_PATH; - - uuarn = alloc_uuar(uuari, lc); - if (uuarn < 0) { - mlx5_ib_dbg(dev, "\n"); - return -ENOMEM; - } + qp->bf.bfreg = &dev->fp_bfreg; + else + qp->bf.bfreg = &dev->bfreg; - qp->bf = &uuari->bfs[uuarn]; - uar_index = qp->bf->uar->index; + /* We need to divide by two since each register is comprised of + * two buffers of identical size, namely odd and even + */ + qp->bf.buf_size = (1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size)) / 2; + uar_index = qp->bf.bfreg->index; err = calc_sq_size(dev, init_attr, qp); if (err < 0) { mlx5_ib_dbg(dev, "err %d\n", err); - goto err_uuar; + return err; } qp->rq.offset = 0; @@ -944,7 +924,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); - goto err_uuar; + return err; } qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt); @@ -994,34 +974,30 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, return 0; err_wrid: - mlx5_db_free(dev->mdev, &qp->db); kfree(qp->sq.wqe_head); kfree(qp->sq.w_list); kfree(qp->sq.wrid); kfree(qp->sq.wr_data); kfree(qp->rq.wrid); + mlx5_db_free(dev->mdev, &qp->db); err_free: kvfree(*in); err_buf: mlx5_buf_free(dev->mdev, &qp->buf); - -err_uuar: - free_uuar(&dev->mdev->priv.uuari, uuarn); return err; } static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { - mlx5_db_free(dev->mdev, &qp->db); kfree(qp->sq.wqe_head); kfree(qp->sq.w_list); kfree(qp->sq.wrid); kfree(qp->sq.wr_data); kfree(qp->rq.wrid); + mlx5_db_free(dev->mdev, &qp->db); mlx5_buf_free(dev->mdev, &qp->buf); - free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn); } static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) @@ -1168,7 +1144,8 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, return -ENOMEM; rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); - MLX5_SET(rqc, rqc, vsd, 1); + if (!(rq->flags & MLX5_IB_RQ_CVLAN_STRIPPING)) + MLX5_SET(rqc, rqc, vsd, 1); MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); MLX5_SET(rqc, rqc, flush_in_error_en, 1); @@ -1265,6 +1242,8 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (qp->rq.wqe_cnt) { rq->base.container_mibqp = qp; + if (qp->flags & MLX5_IB_QP_CVLAN_STRIPPING) + rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING; err = create_raw_packet_qp_rq(dev, rq, in); if (err) goto err_destroy_sq; @@ -1353,7 +1332,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (init_attr->create_flags || init_attr->send_cq) return -EINVAL; - min_resp_len = offsetof(typeof(resp), uuar_index) + sizeof(resp.uuar_index); + min_resp_len = offsetof(typeof(resp), bfreg_index) + sizeof(resp.bfreg_index); if (udata->outlen < min_resp_len) return -EINVAL; @@ -1526,9 +1505,6 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, &qp->raw_packet_qp.rq.base : &qp->trans_qp.base; - if (init_attr->qp_type != IB_QPT_RAW_PACKET) - mlx5_ib_odp_create_qp(qp); - mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); spin_lock_init(&qp->rq.lock); @@ -1589,6 +1565,14 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; + if (init_attr->create_flags & IB_QP_CREATE_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap)) || + (init_attr->qp_type != IB_QPT_RAW_PACKET)) + return -EOPNOTSUPP; + qp->flags |= MLX5_IB_QP_CVLAN_STRIPPING; + } + if (pd && pd->uobject) { if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) { mlx5_ib_dbg(dev, "copy failed\n"); @@ -1795,7 +1779,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, err_create: if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(pd, qp, base); + destroy_qp_user(dev, pd, qp, base); else if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); @@ -1923,7 +1907,6 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) if (qp->state != IB_QPS_RESET) { if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { - mlx5_ib_qp_disable_pagefaults(qp); err = mlx5_core_qp_modify(dev->mdev, MLX5_CMD_OP_2RST_QP, 0, NULL, &base->mqp); @@ -1974,7 +1957,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); else if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(&get_pd(qp)->ibpd, qp, base); + destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base); } static const char *ib_qp_type_str(enum ib_qp_type type) @@ -2229,6 +2212,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, { enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port); int err; + enum ib_gid_type gid_type; if (attr_mask & IB_QP_PKEY_INDEX) path->pkey_index = cpu_to_be16(alt ? attr->alt_pkey_index : @@ -2247,10 +2231,16 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (ll == IB_LINK_LAYER_ETHERNET) { if (!(ah->ah_flags & IB_AH_GRH)) return -EINVAL; + err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index, + &gid_type); + if (err) + return err; memcpy(path->rmac, ah->dmac, sizeof(ah->dmac)); path->udp_sport = mlx5_get_roce_udp_sport(dev, port, ah->grh.sgid_index); path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4; + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + path->ecn_dscp = (ah->grh.traffic_class >> 2) & 0x3f; } else { path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; path->fl_free_ar |= @@ -2453,7 +2443,7 @@ static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, if (raw_qp_param->set_mask & MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID) { if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { MLX5_SET64(modify_rq_in, in, modify_bitmask, - MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_MODIFY_RQ_COUNTER_SET_ID); + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); } else pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n", @@ -2808,7 +2798,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, qp->port) - 1; mibport = &dev->port[port_num]; context->qp_counter_set_usr_page |= - cpu_to_be32((u32)(mibport->q_cnt_id) << 24); + cpu_to_be32((u32)(mibport->q_cnts.set_id) << 24); } if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) @@ -2823,16 +2813,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (mlx5_st < 0) goto out; - /* If moving to a reset or error state, we must disable page faults on - * this QP and flush all current page faults. Otherwise a stale page - * fault may attempt to work on this QP after it is reset and moved - * again to RTS, and may cause the driver and the device to get out of - * sync. */ - if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && - (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) && - (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) - mlx5_ib_qp_disable_pagefaults(qp); - if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || !optab[mlx5_cur][mlx5_new]) goto out; @@ -2846,7 +2826,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, raw_qp_param.operation = op; if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { - raw_qp_param.rq_q_ctr_id = mibport->q_cnt_id; + raw_qp_param.rq_q_ctr_id = mibport->q_cnts.set_id; raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID; } @@ -2864,10 +2844,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (err) goto out; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT && - (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) - mlx5_ib_qp_enable_pagefaults(qp); - qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) @@ -3029,20 +3005,20 @@ static void *set_eth_seg(struct mlx5_wqe_eth_seg *eseg, if (wr->opcode == IB_WR_LSO) { struct ib_ud_wr *ud_wr = container_of(wr, struct ib_ud_wr, wr); - int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start); + int size_of_inl_hdr_start = sizeof(eseg->inline_hdr.start); u64 left, leftlen, copysz; void *pdata = ud_wr->header; left = ud_wr->hlen; eseg->mss = cpu_to_be16(ud_wr->mss); - eseg->inline_hdr_sz = cpu_to_be16(left); + eseg->inline_hdr.sz = cpu_to_be16(left); /* * check if there is space till the end of queue, if yes, * copy all in one shot, otherwise copy till the end of queue, * rollback and than the copy the left */ - leftlen = qend - (void *)eseg->inline_hdr_start; + leftlen = qend - (void *)eseg->inline_hdr.start; copysz = min_t(u64, leftlen, left); memcpy(seg - size_of_inl_hdr_start, pdata, copysz); @@ -3080,9 +3056,10 @@ static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg) dseg->addr = cpu_to_be64(sg->addr); } -static __be16 get_klm_octo(int npages) +static u64 get_xlt_octo(u64 bytes) { - return cpu_to_be16(ALIGN(npages, 8) / 2); + return ALIGN(bytes, MLX5_IB_UMR_XLT_ALIGNMENT) / + MLX5_IB_UMR_OCTOWORD; } static __be64 frwr_mkey_mask(void) @@ -3127,18 +3104,14 @@ static __be64 sig_mkey_mask(void) } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr) + struct mlx5_ib_mr *mr) { - int ndescs = mr->ndescs; + int size = mr->ndescs * mr->desc_size; memset(umr, 0, sizeof(*umr)); - if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS) - /* KLMs take twice the size of MTTs */ - ndescs *= 2; - umr->flags = MLX5_UMR_CHECK_NOT_FREE; - umr->klm_octowords = get_klm_octo(ndescs); + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->mkey_mask = frwr_mkey_mask(); } @@ -3149,37 +3122,17 @@ static void set_linv_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr) umr->flags = MLX5_UMR_INLINE; } -static __be64 get_umr_reg_mr_mask(int atomic) +static __be64 get_umr_enable_mr_mask(void) { u64 result; - result = MLX5_MKEY_MASK_LEN | - MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR | - MLX5_MKEY_MASK_PD | - MLX5_MKEY_MASK_LR | - MLX5_MKEY_MASK_LW | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW | + result = MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_FREE; - if (atomic) - result |= MLX5_MKEY_MASK_A; - - return cpu_to_be64(result); -} - -static __be64 get_umr_unreg_mr_mask(void) -{ - u64 result; - - result = MLX5_MKEY_MASK_FREE; - return cpu_to_be64(result); } -static __be64 get_umr_update_mtt_mask(void) +static __be64 get_umr_disable_mr_mask(void) { u64 result; @@ -3194,23 +3147,22 @@ static __be64 get_umr_update_translation_mask(void) result = MLX5_MKEY_MASK_LEN | MLX5_MKEY_MASK_PAGE_SIZE | - MLX5_MKEY_MASK_START_ADDR | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_FREE; + MLX5_MKEY_MASK_START_ADDR; return cpu_to_be64(result); } -static __be64 get_umr_update_access_mask(void) +static __be64 get_umr_update_access_mask(int atomic) { u64 result; - result = MLX5_MKEY_MASK_LW | + result = MLX5_MKEY_MASK_LR | + MLX5_MKEY_MASK_LW | MLX5_MKEY_MASK_RR | - MLX5_MKEY_MASK_RW | - MLX5_MKEY_MASK_A | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_FREE; + MLX5_MKEY_MASK_RW; + + if (atomic) + result |= MLX5_MKEY_MASK_A; return cpu_to_be64(result); } @@ -3219,9 +3171,7 @@ static __be64 get_umr_update_pd_mask(void) { u64 result; - result = MLX5_MKEY_MASK_PD | - MLX5_MKEY_MASK_KEY | - MLX5_MKEY_MASK_FREE; + result = MLX5_MKEY_MASK_PD; return cpu_to_be64(result); } @@ -3238,24 +3188,24 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, else umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */ - if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) { - umr->klm_octowords = get_klm_octo(umrwr->npages); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) { - umr->mkey_mask = get_umr_update_mtt_mask(); - umr->bsf_octowords = get_klm_octo(umrwr->target.offset); - umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; - } - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) - umr->mkey_mask |= get_umr_update_translation_mask(); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_ACCESS) - umr->mkey_mask |= get_umr_update_access_mask(); - if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD) - umr->mkey_mask |= get_umr_update_pd_mask(); - if (!umr->mkey_mask) - umr->mkey_mask = get_umr_reg_mr_mask(atomic); - } else { - umr->mkey_mask = get_umr_unreg_mr_mask(); + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(umrwr->xlt_size)); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_XLT) { + u64 offset = get_xlt_octo(umrwr->offset); + + umr->xlt_offset = cpu_to_be16(offset & 0xffff); + umr->xlt_offset_47_16 = cpu_to_be32(offset >> 16); + umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN; + } + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION) + umr->mkey_mask |= get_umr_update_translation_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS) { + umr->mkey_mask |= get_umr_update_access_mask(atomic); + umr->mkey_mask |= get_umr_update_pd_mask(); } + if (wr->send_flags & MLX5_IB_SEND_UMR_ENABLE_MR) + umr->mkey_mask |= get_umr_enable_mr_mask(); + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) + umr->mkey_mask |= get_umr_disable_mr_mask(); if (!wr->num_sge) umr->flags |= MLX5_UMR_INLINE; @@ -3303,17 +3253,17 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w struct mlx5_umr_wr *umrwr = umr_wr(wr); memset(seg, 0, sizeof(*seg)); - if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) { + if (wr->send_flags & MLX5_IB_SEND_UMR_DISABLE_MR) seg->status = MLX5_MKEY_STATUS_FREE; - return; - } seg->flags = convert_access(umrwr->access_flags); - if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) { - if (umrwr->pd) - seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); - seg->start_addr = cpu_to_be64(umrwr->target.virt_addr); - } + if (umrwr->pd) + seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn); + if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_TRANSLATION && + !umrwr->length) + seg->flags_pd |= cpu_to_be32(MLX5_MKEY_LEN64); + + seg->start_addr = cpu_to_be64(umrwr->virt_addr); seg->len = cpu_to_be64(umrwr->length); seg->log2_page_size = umrwr->page_shift; seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | @@ -3611,7 +3561,7 @@ static int set_sig_data_segment(struct ib_sig_handover_wr *wr, } static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, - struct ib_sig_handover_wr *wr, u32 nelements, + struct ib_sig_handover_wr *wr, u32 size, u32 length, u32 pdn) { struct ib_mr *sig_mr = wr->sig_mr; @@ -3626,17 +3576,17 @@ static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg, seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 | MLX5_MKEY_BSF_EN | pdn); seg->len = cpu_to_be64(length); - seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements))); + seg->xlt_oct_size = cpu_to_be32(get_xlt_octo(size)); seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE); } static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, - u32 nelements) + u32 size) { memset(umr, 0, sizeof(*umr)); umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE; - umr->klm_octowords = get_klm_octo(nelements); + umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE); umr->mkey_mask = sig_mkey_mask(); } @@ -3648,7 +3598,7 @@ static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, struct ib_sig_handover_wr *wr = sig_handover_wr(send_wr); struct mlx5_ib_mr *sig_mr = to_mmr(wr->sig_mr); u32 pdn = get_pd(qp)->pdn; - u32 klm_oct_size; + u32 xlt_size; int region_len, ret; if (unlikely(wr->wr.num_sge != 1) || @@ -3670,15 +3620,15 @@ static int set_sig_umr_wr(struct ib_send_wr *send_wr, struct mlx5_ib_qp *qp, * then we use strided block format (3 octowords), * else we use single KLM (1 octoword) **/ - klm_oct_size = wr->prot ? 3 : 1; + xlt_size = wr->prot ? 0x30 : sizeof(struct mlx5_klm); - set_sig_umr_segment(*seg, klm_oct_size); + set_sig_umr_segment(*seg, xlt_size); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); - set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn); + set_sig_mkey_segment(*seg, wr, xlt_size, region_len, pdn); *seg += sizeof(struct mlx5_mkey_seg); *size += sizeof(struct mlx5_mkey_seg) / 16; if (unlikely((*seg == qp->sq.qend))) @@ -3708,8 +3658,9 @@ static int set_psv_wr(struct ib_sig_domain *domain, psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag); break; default: - pr_err("Bad signature type given.\n"); - return 1; + pr_err("Bad signature type (%d) is given.\n", + domain->sig_type); + return -EINVAL; } *seg += sizeof(*psv_seg); @@ -3784,24 +3735,6 @@ static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16) } } -static void mlx5_bf_copy(u64 __iomem *dst, u64 *src, - unsigned bytecnt, struct mlx5_ib_qp *qp) -{ - while (bytecnt > 0) { - __iowrite64_copy(dst++, src++, 8); - __iowrite64_copy(dst++, src++, 8); - __iowrite64_copy(dst++, src++, 8); - __iowrite64_copy(dst++, src++, 8); - __iowrite64_copy(dst++, src++, 8); - __iowrite64_copy(dst++, src++, 8); - __iowrite64_copy(dst++, src++, 8); - __iowrite64_copy(dst++, src++, 8); - bytecnt -= 64; - if (unlikely(src == qp->sq.qend)) - src = mlx5_get_send_wqe(qp, 0); - } -} - static u8 get_fence(u8 fence, struct ib_send_wr *wr) { if (unlikely(wr->opcode == IB_WR_LOCAL_INV && @@ -3897,7 +3830,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); qp = to_mqp(ibqp); - bf = qp->bf; + bf = &qp->bf; qend = qp->sq.qend; spin_lock_irqsave(&qp->sq.lock, flags); @@ -4067,6 +4000,12 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, break; case IB_QPT_SMI: + if (unlikely(!mdev->port_caps[qp->port - 1].has_smi)) { + mlx5_ib_warn(dev, "Send SMP MADs is not allowed\n"); + err = -EPERM; + *bad_wr = wr; + goto out; + } case MLX5_IB_QPT_HW_GSI: set_datagram_seg(seg, wr); seg += sizeof(struct mlx5_wqe_datagram_seg); @@ -4170,28 +4109,13 @@ out: * we hit doorbell */ wmb(); - if (bf->need_lock) - spin_lock(&bf->lock); - else - __acquire(&bf->lock); - - /* TBD enable WC */ - if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) { - mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp); - /* wc_wmb(); */ - } else { - mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset, - MLX5_GET_DOORBELL_LOCK(&bf->lock32)); - /* Make sure doorbells don't leak out of SQ spinlock - * and reach the HCA out of order. - */ - mmiowb(); - } + /* currently we support only regular doorbells */ + mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset, NULL); + /* Make sure doorbells don't leak out of SQ spinlock + * and reach the HCA out of order. + */ + mmiowb(); bf->offset ^= bf->buf_size; - if (bf->need_lock) - spin_unlock(&bf->lock); - else - __release(&bf->lock); } spin_unlock_irqrestore(&qp->sq.lock, flags); @@ -4559,14 +4483,6 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, return mlx5_ib_gsi_query_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr); -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* - * Wait for any outstanding page faults, in case the user frees memory - * based upon this query's result. - */ - flush_workqueue(mlx5_ib_page_fault_wq); -#endif - mutex_lock(&qp->mutex); if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { @@ -4691,6 +4607,7 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, struct ib_wq_init_attr *init_attr) { struct mlx5_ib_dev *dev; + int has_net_offloads; __be64 *rq_pas0; void *in; void *rqc; @@ -4722,9 +4639,28 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, MLX5_SET(wq, wq, log_wq_pg_sz, rwq->log_page_size); MLX5_SET(wq, wq, wq_signature, rwq->wq_sig); MLX5_SET64(wq, wq, dbr_addr, rwq->db.dma); + has_net_offloads = MLX5_CAP_GEN(dev->mdev, eth_net_offloads); + if (init_attr->create_flags & IB_WQ_FLAGS_CVLAN_STRIPPING) { + if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, vlan_cap))) { + mlx5_ib_dbg(dev, "VLAN offloads are not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + } else { + MLX5_SET(rqc, rqc, vsd, 1); + } + if (init_attr->create_flags & IB_WQ_FLAGS_SCATTER_FCS) { + if (!(has_net_offloads && MLX5_CAP_ETH(dev->mdev, scatter_fcs))) { + mlx5_ib_dbg(dev, "Scatter FCS is not supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET(rqc, rqc, scatter_fcs, 1); + } rq_pas0 = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); mlx5_ib_populate_pas(dev, rwq->umem, rwq->page_shift, rq_pas0, 0); err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rwq->core_qp); +out: kvfree(in); return err; } @@ -5008,10 +4944,37 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); MLX5_SET(rqc, rqc, state, wq_state); + if (wq_attr_mask & IB_WQ_FLAGS) { + if (wq_attr->flags_mask & IB_WQ_FLAGS_CVLAN_STRIPPING) { + if (!(MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + MLX5_CAP_ETH(dev->mdev, vlan_cap))) { + mlx5_ib_dbg(dev, "VLAN offloads are not " + "supported\n"); + err = -EOPNOTSUPP; + goto out; + } + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD); + MLX5_SET(rqc, rqc, vsd, + (wq_attr->flags & IB_WQ_FLAGS_CVLAN_STRIPPING) ? 0 : 1); + } + } + + if (curr_wq_state == IB_WQS_RESET && wq_state == IB_WQS_RDY) { + if (MLX5_CAP_GEN(dev->mdev, modify_rq_counter_set_id)) { + MLX5_SET64(modify_rq_in, in, modify_bitmask, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); + MLX5_SET(rqc, rqc, counter_set_id, dev->port->q_cnts.set_id); + } else + pr_info_once("%s: Receive WQ counters are not supported on current FW\n", + dev->ib_dev.name); + } + err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen); - kvfree(in); if (!err) rwq->ibwq.state = (wq_state == MLX5_RQC_STATE_ERR) ? IB_WQS_ERR : wq_state; +out: + kvfree(in); return err; } diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 6f4397ee1ed6..7cb145f9a6db 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -165,8 +165,6 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, int err; int i; struct mlx5_wqe_srq_next_seg *next; - int page_shift; - int npages; err = mlx5_db_alloc(dev->mdev, &srq->db); if (err) { @@ -179,7 +177,6 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, err = -ENOMEM; goto err_db; } - page_shift = srq->buf.page_shift; srq->head = 0; srq->tail = srq->msrq.max - 1; @@ -191,10 +188,8 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, cpu_to_be16((i + 1) & (srq->msrq.max - 1)); } - npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT)); - mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n", - buf_size, page_shift, srq->buf.npages, npages); - in->pas = mlx5_vzalloc(sizeof(*in->pas) * npages); + mlx5_ib_dbg(dev, "srq->buf.page_shift = %d\n", srq->buf.page_shift); + in->pas = mlx5_vzalloc(sizeof(*in->pas) * srq->buf.npages); if (!in->pas) { err = -ENOMEM; goto err_buf; @@ -208,7 +203,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } srq->wq_sig = !!srq_signature; - in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + in->log_page_size = srq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && in->type == IB_SRQT_XRC) in->user_index = MLX5_IB_DEFAULT_UIDX; |