diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-02 03:18:18 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-04-02 03:18:18 +0200 |
commit | 919dce24701f7b34681a6a1d3ef95c9f6c4fb1cc (patch) | |
tree | f8636f88b76cf96103aca9276c848b30b3e18baa /drivers/infiniband/hw/mlx5 | |
parent | Merge tag 'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rd... (diff) | |
parent | RDMA/bnxt_re: make bnxt_re_ib_init static (diff) | |
download | linux-919dce24701f7b34681a6a1d3ef95c9f6c4fb1cc.tar.xz linux-919dce24701f7b34681a6a1d3ef95c9f6c4fb1cc.zip |
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"The majority of the patches are cleanups, refactorings and clarity
improvements.
This cycle saw some more activity from Syzkaller, I think we are now
clean on all but one of those bugs, including the long standing and
obnoxious rdma_cm locking design defect. Continue to see many drivers
getting cleanups, with a few new user visible features.
Summary:
- Various driver updates for siw, bnxt_re, rxe, efa, mlx5, hfi1
- Lots of cleanup patches for hns
- Convert more places to use refcount
- Aggressively lock the RDMA CM code that syzkaller says isn't
working
- Work to clarify ib_cm
- Use the new ib_device lifecycle model in bnxt_re
- Fix mlx5's MR cache which seems to be failing more often with the
new ODP code
- mlx5 'dynamic uar' and 'tx steering' user interfaces"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (144 commits)
RDMA/bnxt_re: make bnxt_re_ib_init static
IB/qib: Delete struct qib_ivdev.qp_rnd
RDMA/hns: Fix uninitialized variable bug
RDMA/hns: Modify the mask of QP number for CQE of hip08
RDMA/hns: Reduce the maximum number of extend SGE per WQE
RDMA/hns: Reduce PFC frames in congestion scenarios
RDMA/mlx5: Add support for RDMA TX flow table
net/mlx5: Add support for RDMA TX steering
IB/hfi1: Call kobject_put() when kobject_init_and_add() fails
IB/hfi1: Fix memory leaks in sysfs registration and unregistration
IB/mlx5: Move to fully dynamic UAR mode once user space supports it
IB/mlx5: Limit the scope of struct mlx5_bfreg_info to mlx5_ib
IB/mlx5: Extend QP creation to get uar page index from user space
IB/mlx5: Extend CQ creation to get uar page index from user space
IB/mlx5: Expose UAR object and its alloc/destroy commands
IB/hfi1: Get rid of a warning
RDMA/hns: Remove redundant judgment of qp_type
RDMA/hns: Remove redundant assignment of wc->smac when polling cq
RDMA/hns: Remove redundant qpc setup operations
RDMA/hns: Remove meaningless prints
...
Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r-- | drivers/infiniband/hw/mlx5/Makefile | 1 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/cong.c | 20 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/cq.c | 21 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/flow.c | 3 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/main.c | 265 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mem.c | 2 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mlx5_ib.h | 89 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c | 608 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c | 2 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/qos.c | 136 | ||||
-rw-r--r-- | drivers/infiniband/hw/mlx5/qp.c | 35 |
11 files changed, 822 insertions, 360 deletions
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index d0a043ccbe58..2a334800f109 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -8,3 +8,4 @@ mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o mlx5_ib-$(CONFIG_MLX5_ESWITCH) += ib_rep.o mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += devx.o mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += flow.o +mlx5_ib-$(CONFIG_INFINIBAND_USER_ACCESS) += qos.o diff --git a/drivers/infiniband/hw/mlx5/cong.c b/drivers/infiniband/hw/mlx5/cong.c index 8ba439fabf7f..de4da92b81a6 100644 --- a/drivers/infiniband/hw/mlx5/cong.c +++ b/drivers/infiniband/hw/mlx5/cong.c @@ -47,6 +47,7 @@ static const char * const mlx5_ib_dbg_cc_name[] = { "rp_byte_reset", "rp_threshold", "rp_ai_rate", + "rp_max_rate", "rp_hai_rate", "rp_min_dec_fac", "rp_min_rate", @@ -56,6 +57,7 @@ static const char * const mlx5_ib_dbg_cc_name[] = { "rp_rate_reduce_monitor_period", "rp_initial_alpha_value", "rp_gd", + "np_min_time_between_cnps", "np_cnp_dscp", "np_cnp_prio_mode", "np_cnp_prio", @@ -66,6 +68,7 @@ static const char * const mlx5_ib_dbg_cc_name[] = { #define MLX5_IB_RP_TIME_RESET_ATTR BIT(3) #define MLX5_IB_RP_BYTE_RESET_ATTR BIT(4) #define MLX5_IB_RP_THRESHOLD_ATTR BIT(5) +#define MLX5_IB_RP_MAX_RATE_ATTR BIT(6) #define MLX5_IB_RP_AI_RATE_ATTR BIT(7) #define MLX5_IB_RP_HAI_RATE_ATTR BIT(8) #define MLX5_IB_RP_MIN_DEC_FAC_ATTR BIT(9) @@ -77,6 +80,7 @@ static const char * const mlx5_ib_dbg_cc_name[] = { #define MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR BIT(15) #define MLX5_IB_RP_GD_ATTR BIT(16) +#define MLX5_IB_NP_MIN_TIME_BETWEEN_CNPS_ATTR BIT(2) #define MLX5_IB_NP_CNP_DSCP_ATTR BIT(3) #define MLX5_IB_NP_CNP_PRIO_MODE_ATTR BIT(4) @@ -111,6 +115,9 @@ static u32 mlx5_get_cc_param_val(void *field, int offset) case MLX5_IB_DBG_CC_RP_AI_RATE: return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_ai_rate); + case MLX5_IB_DBG_CC_RP_MAX_RATE: + return MLX5_GET(cong_control_r_roce_ecn_rp, field, + rpg_max_rate); case MLX5_IB_DBG_CC_RP_HAI_RATE: return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_hai_rate); @@ -138,6 +145,9 @@ static u32 mlx5_get_cc_param_val(void *field, int offset) case MLX5_IB_DBG_CC_RP_GD: return MLX5_GET(cong_control_r_roce_ecn_rp, field, rpg_gd); + case MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS: + return MLX5_GET(cong_control_r_roce_ecn_np, field, + min_time_between_cnps); case MLX5_IB_DBG_CC_NP_CNP_DSCP: return MLX5_GET(cong_control_r_roce_ecn_np, field, cnp_dscp); @@ -186,6 +196,11 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, MLX5_SET(cong_control_r_roce_ecn_rp, field, rpg_ai_rate, var); break; + case MLX5_IB_DBG_CC_RP_MAX_RATE: + *attr_mask |= MLX5_IB_RP_MAX_RATE_ATTR; + MLX5_SET(cong_control_r_roce_ecn_rp, field, + rpg_max_rate, var); + break; case MLX5_IB_DBG_CC_RP_HAI_RATE: *attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR; MLX5_SET(cong_control_r_roce_ecn_rp, field, @@ -231,6 +246,11 @@ static void mlx5_ib_set_cc_param_mask_val(void *field, int offset, MLX5_SET(cong_control_r_roce_ecn_rp, field, rpg_gd, var); break; + case MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS: + *attr_mask |= MLX5_IB_NP_MIN_TIME_BETWEEN_CNPS_ATTR; + MLX5_SET(cong_control_r_roce_ecn_np, field, + min_time_between_cnps, var); + break; case MLX5_IB_DBG_CC_NP_CNP_DSCP: *attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR; MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_dscp, var); diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 3dec3de903b7..146ba2966744 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -715,17 +715,19 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context( udata, struct mlx5_ib_ucontext, ibucontext); - ucmdlen = udata->inlen < sizeof(ucmd) ? - (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd); + ucmdlen = min(udata->inlen, sizeof(ucmd)); + if (ucmdlen < offsetof(struct mlx5_ib_create_cq, flags)) + return -EINVAL; if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) return -EFAULT; - if (ucmdlen == sizeof(ucmd) && - (ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD))) + if ((ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_CQE_128B_PAD | + MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX))) return -EINVAL; - if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128) + if ((ucmd.cqe_size != 64 && ucmd.cqe_size != 128) || + ucmd.reserved0 || ucmd.reserved1) return -EINVAL; *cqe_size = ucmd.cqe_size; @@ -762,7 +764,14 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, MLX5_SET(cqc, cqc, log_page_size, page_shift - MLX5_ADAPTER_PAGE_SHIFT); - *index = context->bfregi.sys_pages[0]; + if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) { + *index = ucmd.uar_page_index; + } else if (context->bfregi.lib_uar_dyn) { + err = -EINVAL; + goto err_cqb; + } else { + *index = context->bfregi.sys_pages[0]; + } if (ucmd.cqe_comp_en == 1) { int mini_cqe_format; diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index dbee17d22d50..862b7bf3e646 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -35,6 +35,9 @@ mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type, case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_RX: *namespace = MLX5_FLOW_NAMESPACE_RDMA_RX; break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_RDMA_TX: + *namespace = MLX5_FLOW_NAMESPACE_RDMA_TX; + break; default: return -EINVAL; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3efa7493456b..6679756506e6 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -39,9 +39,6 @@ #include <linux/dma-mapping.h> #include <linux/slab.h> #include <linux/bitmap.h> -#if defined(CONFIG_X86) -#include <asm/memtype.h> -#endif #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> @@ -898,7 +895,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->raw_packet_caps |= IB_RAW_PACKET_CAP_CVLAN_STRIPPING; - if (field_avail(typeof(resp), tso_caps, uhw_outlen)) { + if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) { max_tso = MLX5_CAP_ETH(mdev, max_lso_cap); if (max_tso) { resp.tso_caps.max_tso = 1 << max_tso; @@ -908,7 +905,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), rss_caps, uhw_outlen)) { + if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) { resp.rss_caps.rx_hash_function = MLX5_RX_HASH_FUNC_TOEPLITZ; resp.rss_caps.rx_hash_fields_mask = @@ -928,9 +925,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.rss_caps); } } else { - if (field_avail(typeof(resp), tso_caps, uhw_outlen)) + if (offsetofend(typeof(resp), tso_caps) <= uhw_outlen) resp.response_length += sizeof(resp.tso_caps); - if (field_avail(typeof(resp), rss_caps, uhw_outlen)) + if (offsetofend(typeof(resp), rss_caps) <= uhw_outlen) resp.response_length += sizeof(resp.rss_caps); } @@ -1072,7 +1069,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_MAX_CQ_PERIOD; } - if (field_avail(typeof(resp), cqe_comp_caps, uhw_outlen)) { + if (offsetofend(typeof(resp), cqe_comp_caps) <= uhw_outlen) { resp.response_length += sizeof(resp.cqe_comp_caps); if (MLX5_CAP_GEN(dev->mdev, cqe_compression)) { @@ -1090,7 +1087,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), packet_pacing_caps, uhw_outlen) && + if (offsetofend(typeof(resp), packet_pacing_caps) <= uhw_outlen && raw_support) { if (MLX5_CAP_QOS(mdev, packet_pacing) && MLX5_CAP_GEN(mdev, qos)) { @@ -1108,8 +1105,8 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.response_length += sizeof(resp.packet_pacing_caps); } - if (field_avail(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes, - uhw_outlen)) { + if (offsetofend(typeof(resp), mlx5_ib_support_multi_pkt_send_wqes) <= + uhw_outlen) { if (MLX5_CAP_ETH(mdev, multi_pkt_send_wqe)) resp.mlx5_ib_support_multi_pkt_send_wqes = MLX5_IB_ALLOW_MPW; @@ -1122,7 +1119,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, sizeof(resp.mlx5_ib_support_multi_pkt_send_wqes); } - if (field_avail(typeof(resp), flags, uhw_outlen)) { + if (offsetofend(typeof(resp), flags) <= uhw_outlen) { resp.response_length += sizeof(resp.flags); if (MLX5_CAP_GEN(mdev, cqe_compression_128)) @@ -1138,7 +1135,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; } - if (field_avail(typeof(resp), sw_parsing_caps, uhw_outlen)) { + if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) { resp.response_length += sizeof(resp.sw_parsing_caps); if (MLX5_CAP_ETH(mdev, swp)) { resp.sw_parsing_caps.sw_parsing_offloads |= @@ -1158,7 +1155,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), striding_rq_caps, uhw_outlen) && + if (offsetofend(typeof(resp), striding_rq_caps) <= uhw_outlen && raw_support) { resp.response_length += sizeof(resp.striding_rq_caps); if (MLX5_CAP_GEN(mdev, striding_rq)) { @@ -1181,7 +1178,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, } } - if (field_avail(typeof(resp), tunnel_offloads_caps, uhw_outlen)) { + if (offsetofend(typeof(resp), tunnel_offloads_caps) <= uhw_outlen) { resp.response_length += sizeof(resp.tunnel_offloads_caps); if (MLX5_CAP_ETH(mdev, tunnel_stateless_vxlan)) resp.tunnel_offloads_caps |= @@ -1192,12 +1189,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_ETH(mdev, tunnel_stateless_gre)) resp.tunnel_offloads_caps |= MLX5_IB_TUNNELED_OFFLOADS_GRE; - if (MLX5_CAP_GEN(mdev, flex_parser_protocols) & - MLX5_FLEX_PROTO_CW_MPLS_GRE) + if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_gre)) resp.tunnel_offloads_caps |= MLX5_IB_TUNNELED_OFFLOADS_MPLS_GRE; - if (MLX5_CAP_GEN(mdev, flex_parser_protocols) & - MLX5_FLEX_PROTO_CW_MPLS_UDP) + if (MLX5_CAP_ETH(mdev, tunnel_stateless_mpls_over_udp)) resp.tunnel_offloads_caps |= MLX5_IB_TUNNELED_OFFLOADS_MPLS_UDP; } @@ -1791,6 +1786,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, max_cqe_version); u32 dump_fill_mkey; bool lib_uar_4k; + bool lib_uar_dyn; if (!dev->ib_active) return -EAGAIN; @@ -1849,8 +1845,14 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, } lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR; + lib_uar_dyn = req.lib_caps & MLX5_LIB_CAP_DYN_UAR; bfregi = &context->bfregi; + if (lib_uar_dyn) { + bfregi->lib_uar_dyn = lib_uar_dyn; + goto uar_done; + } + /* updates req->total_num_bfregs */ err = calc_total_bfregs(dev, lib_uar_4k, &req, bfregi); if (err) @@ -1877,6 +1879,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, if (err) goto out_sys_pages; +uar_done: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { err = mlx5_ib_devx_create(dev, true); if (err < 0) @@ -1898,19 +1901,19 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); - resp.tot_bfregs = req.total_num_bfregs; + resp.tot_bfregs = lib_uar_dyn ? 0 : req.total_num_bfregs; resp.num_ports = dev->num_ports; - if (field_avail(typeof(resp), cqe_version, udata->outlen)) + if (offsetofend(typeof(resp), cqe_version) <= udata->outlen) resp.response_length += sizeof(resp.cqe_version); - if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) { + if (offsetofend(typeof(resp), cmds_supp_uhw) <= udata->outlen) { resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE | MLX5_USER_CMDS_SUPP_UHW_CREATE_AH; resp.response_length += sizeof(resp.cmds_supp_uhw); } - if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) { + if (offsetofend(typeof(resp), eth_min_inline) <= udata->outlen) { if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) { mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline); resp.eth_min_inline++; @@ -1918,7 +1921,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, resp.response_length += sizeof(resp.eth_min_inline); } - if (field_avail(typeof(resp), clock_info_versions, udata->outlen)) { + if (offsetofend(typeof(resp), clock_info_versions) <= udata->outlen) { if (mdev->clock_info) resp.clock_info_versions = BIT(MLX5_IB_CLOCK_INFO_V1); resp.response_length += sizeof(resp.clock_info_versions); @@ -1930,7 +1933,7 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, * pretend we don't support reading the HCA's core clock. This is also * forced by mmap function. */ - if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + if (offsetofend(typeof(resp), hca_core_clock_offset) <= udata->outlen) { if (PAGE_SIZE <= 4096) { resp.comp_mask |= MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; @@ -1940,18 +1943,18 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx, resp.response_length += sizeof(resp.hca_core_clock_offset); } - if (field_avail(typeof(resp), log_uar_size, udata->outlen)) + if (offsetofend(typeof(resp), log_uar_size) <= udata->outlen) resp.response_length += sizeof(resp.log_uar_size); - if (field_avail(typeof(resp), num_uars_per_page, udata->outlen)) + if (offsetofend(typeof(resp), num_uars_per_page) <= udata->outlen) resp.response_length += sizeof(resp.num_uars_per_page); - if (field_avail(typeof(resp), num_dyn_bfregs, udata->outlen)) { + if (offsetofend(typeof(resp), num_dyn_bfregs) <= udata->outlen) { resp.num_dyn_bfregs = bfregi->num_dyn_bfregs; resp.response_length += sizeof(resp.num_dyn_bfregs); } - if (field_avail(typeof(resp), dump_fill_mkey, udata->outlen)) { + if (offsetofend(typeof(resp), dump_fill_mkey) <= udata->outlen) { if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { resp.dump_fill_mkey = dump_fill_mkey; resp.comp_mask |= @@ -2026,6 +2029,17 @@ static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page; } +static u64 uar_index2paddress(struct mlx5_ib_dev *dev, + int uar_idx) +{ + unsigned int fw_uars_per_page; + + fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? + MLX5_UARS_IN_PAGE : 1; + + return (dev->mdev->bar_addr + (uar_idx / fw_uars_per_page) * PAGE_SIZE); +} + static int get_command(unsigned long offset) { return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK; @@ -2110,6 +2124,11 @@ static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry) mutex_unlock(&var_table->bitmap_lock); kfree(mentry); break; + case MLX5_IB_MMAP_TYPE_UAR_WC: + case MLX5_IB_MMAP_TYPE_UAR_NC: + mlx5_cmd_free_uar(dev->mdev, mentry->page_idx); + kfree(mentry); + break; default: WARN_ON(true); } @@ -2130,6 +2149,9 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, int max_valid_idx = dyn_uar ? bfregi->num_sys_pages : bfregi->num_static_sys_pages; + if (bfregi->lib_uar_dyn) + return -EINVAL; + if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; @@ -2147,14 +2169,6 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, switch (cmd) { case MLX5_IB_MMAP_WC_PAGE: case MLX5_IB_MMAP_ALLOC_WC: -/* Some architectures don't support WC memory */ -#if defined(CONFIG_X86) - if (!pat_enabled()) - return -EPERM; -#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU))) - return -EPERM; -#endif - /* fall through */ case MLX5_IB_MMAP_REGULAR_PAGE: /* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */ prot = pgprot_writecombine(vma->vm_page_prot); @@ -2269,7 +2283,8 @@ static int mlx5_ib_mmap_offset(struct mlx5_ib_dev *dev, mentry = to_mmmap(entry); pfn = (mentry->address >> PAGE_SHIFT); - if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR) + if (mentry->mmap_flag == MLX5_IB_MMAP_TYPE_VAR || + mentry->mmap_flag == MLX5_IB_MMAP_TYPE_UAR_NC) prot = pgprot_noncached(vma->vm_page_prot); else prot = pgprot_writecombine(vma->vm_page_prot); @@ -2300,9 +2315,12 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm command = get_command(vma->vm_pgoff); switch (command) { case MLX5_IB_MMAP_WC_PAGE: + case MLX5_IB_MMAP_ALLOC_WC: + if (!dev->wc_support) + return -EPERM; + fallthrough; case MLX5_IB_MMAP_NC_PAGE: case MLX5_IB_MMAP_REGULAR_PAGE: - case MLX5_IB_MMAP_ALLOC_WC: return uar_mmap(dev, command, vma, context); case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: @@ -4046,6 +4064,11 @@ _get_flow_table(struct mlx5_ib_dev *dev, BIT(MLX5_CAP_FLOWTABLE_RDMA_RX(dev->mdev, log_max_ft_size)); priority = fs_matcher->priority; + } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) { + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_RDMA_TX(dev->mdev, + log_max_ft_size)); + priority = fs_matcher->priority; } max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES); @@ -4062,6 +4085,8 @@ _get_flow_table(struct mlx5_ib_dev *dev, prio = &dev->flow_db->fdb; else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_RX) prio = &dev->flow_db->rdma_rx[priority]; + else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_RDMA_TX) + prio = &dev->flow_db->rdma_tx[priority]; if (!prio) return ERR_PTR(-EINVAL); @@ -6090,9 +6115,9 @@ static void mlx5_ib_cleanup_multiport_master(struct mlx5_ib_dev *dev) mlx5_nic_vport_disable_roce(dev->mdev); } -static int var_obj_cleanup(struct ib_uobject *uobject, - enum rdma_remove_reason why, - struct uverbs_attr_bundle *attrs) +static int mmap_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) { struct mlx5_user_mmap_entry *obj = uobject->object; @@ -6100,6 +6125,16 @@ static int var_obj_cleanup(struct ib_uobject *uobject, return 0; } +static int mlx5_rdma_user_mmap_entry_insert(struct mlx5_ib_ucontext *c, + struct mlx5_user_mmap_entry *entry, + size_t length) +{ + return rdma_user_mmap_entry_insert_range( + &c->ibucontext, &entry->rdma_entry, length, + (MLX5_IB_MMAP_OFFSET_START << 16), + ((MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1)); +} + static struct mlx5_user_mmap_entry * alloc_var_entry(struct mlx5_ib_ucontext *c) { @@ -6130,10 +6165,8 @@ alloc_var_entry(struct mlx5_ib_ucontext *c) entry->page_idx = page_idx; entry->mmap_flag = MLX5_IB_MMAP_TYPE_VAR; - err = rdma_user_mmap_entry_insert_range( - &c->ibucontext, &entry->rdma_entry, var_table->stride_size, - MLX5_IB_MMAP_OFFSET_START << 16, - (MLX5_IB_MMAP_OFFSET_END << 16) + (1UL << 16) - 1); + err = mlx5_rdma_user_mmap_entry_insert(c, entry, + var_table->stride_size); if (err) goto err_insert; @@ -6217,7 +6250,7 @@ DECLARE_UVERBS_NAMED_METHOD_DESTROY( UA_MANDATORY)); DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_VAR, - UVERBS_TYPE_ALLOC_IDR(var_obj_cleanup), + UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup), &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_ALLOC), &UVERBS_METHOD(MLX5_IB_METHOD_VAR_OBJ_DESTROY)); @@ -6229,6 +6262,134 @@ static bool var_is_supported(struct ib_device *device) MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q); } +static struct mlx5_user_mmap_entry * +alloc_uar_entry(struct mlx5_ib_ucontext *c, + enum mlx5_ib_uapi_uar_alloc_type alloc_type) +{ + struct mlx5_user_mmap_entry *entry; + struct mlx5_ib_dev *dev; + u32 uar_index; + int err; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return ERR_PTR(-ENOMEM); + + dev = to_mdev(c->ibucontext.device); + err = mlx5_cmd_alloc_uar(dev->mdev, &uar_index); + if (err) + goto end; + + entry->page_idx = uar_index; + entry->address = uar_index2paddress(dev, uar_index); + if (alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) + entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_WC; + else + entry->mmap_flag = MLX5_IB_MMAP_TYPE_UAR_NC; + + err = mlx5_rdma_user_mmap_entry_insert(c, entry, PAGE_SIZE); + if (err) + goto err_insert; + + return entry; + +err_insert: + mlx5_cmd_free_uar(dev->mdev, uar_index); +end: + kfree(entry); + return ERR_PTR(err); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_UAR_OBJ_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE); + enum mlx5_ib_uapi_uar_alloc_type alloc_type; + struct mlx5_ib_ucontext *c; + struct mlx5_user_mmap_entry *entry; + u64 mmap_offset; + u32 length; + int err; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + + err = uverbs_get_const(&alloc_type, attrs, + MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE); + if (err) + return err; + + if (alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF && + alloc_type != MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC) + return -EOPNOTSUPP; + + if (!to_mdev(c->ibucontext.device)->wc_support && + alloc_type == MLX5_IB_UAPI_UAR_ALLOC_TYPE_BF) + return -EOPNOTSUPP; + + entry = alloc_uar_entry(c, alloc_type); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + mmap_offset = mlx5_entry_to_mmap_offset(entry); + length = entry->rdma_entry.npages * PAGE_SIZE; + uobj->object = entry; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, + &mmap_offset, sizeof(mmap_offset)); + if (err) + goto err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, + &entry->page_idx, sizeof(entry->page_idx)); + if (err) + goto err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, + &length, sizeof(length)); + if (err) + goto err; + + return 0; + +err: + rdma_user_mmap_entry_remove(&entry->rdma_entry); + return err; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_UAR_OBJ_ALLOC, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_ALLOC_HANDLE, + MLX5_IB_OBJECT_UAR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_UAR_OBJ_ALLOC_TYPE, + enum mlx5_ib_uapi_uar_alloc_type, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_PAGE_ID, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_LENGTH, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_UAR_OBJ_ALLOC_MMAP_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_UAR_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_UAR_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_UAR, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_UAR, + UVERBS_TYPE_ALLOC_IDR(mmap_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_ALLOC), + &UVERBS_METHOD(MLX5_IB_METHOD_UAR_OBJ_DESTROY)); + ADD_UVERBS_ATTRIBUTES_SIMPLE( mlx5_ib_dm, UVERBS_OBJECT_DM, @@ -6253,12 +6414,14 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE( static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), + UAPI_DEF_CHAIN(mlx5_ib_qos_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_FLOW_ACTION, &mlx5_ib_flow_action), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DM, &mlx5_ib_dm), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), {} }; @@ -6392,7 +6555,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->reset_flow_resource_lock); xa_init(&dev->odp_mkeys); xa_init(&dev->sig_mrs); - spin_lock_init(&dev->mkey_lock); + atomic_set(&dev->mkey_var, 0); spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; @@ -6548,7 +6711,8 @@ static int mlx5_ib_init_var_table(struct mlx5_ib_dev *dev) doorbell_bar_offset); bar_size = (1ULL << log_doorbell_bar_size) * 4096; var_table->stride_size = 1ULL << log_doorbell_stride; - var_table->num_var_hw_entries = div64_u64(bar_size, var_table->stride_size); + var_table->num_var_hw_entries = div_u64(bar_size, + var_table->stride_size); mutex_init(&var_table->bitmap_lock); var_table->bitmap = bitmap_zalloc(var_table->num_var_hw_entries, GFP_KERNEL); @@ -7080,6 +7244,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, mlx5_ib_stage_counters_init, mlx5_ib_stage_counters_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_CONG_DEBUGFS, + mlx5_ib_stage_cong_debugfs_init, + mlx5_ib_stage_cong_debugfs_cleanup), STAGE_CREATE(MLX5_IB_STAGE_UAR, mlx5_ib_stage_uar_init, mlx5_ib_stage_uar_cleanup), diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index b90a3649e7d1..c19ec9fd8a63 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -316,7 +316,7 @@ int mlx5_ib_test_wc(struct mlx5_ib_dev *dev) if (!dev->mdev->roce.roce_en && port_type_cap == MLX5_CAP_PORT_TYPE_ETH) { if (mlx5_core_is_pf(dev->mdev)) - dev->wc_support = true; + dev->wc_support = arch_can_pci_mmap_wc(); return 0; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index fc19dc1cf2e1..a4e522385de0 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -64,8 +64,6 @@ dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ __LINE__, current->pid, ##arg) -#define field_avail(type, fld, sz) (offsetof(type, fld) + \ - sizeof(((type *)0)->fld) <= (sz)) #define MLX5_IB_DEFAULT_UIDX 0xffffff #define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) @@ -126,11 +124,27 @@ enum { enum mlx5_ib_mmap_type { MLX5_IB_MMAP_TYPE_MEMIC = 1, MLX5_IB_MMAP_TYPE_VAR = 2, + MLX5_IB_MMAP_TYPE_UAR_WC = 3, + MLX5_IB_MMAP_TYPE_UAR_NC = 4, }; -#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) \ - (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity)) -#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) +struct mlx5_bfreg_info { + u32 *sys_pages; + int num_low_latency_bfregs; + unsigned int *count; + + /* + * protect bfreg allocation data structs + */ + struct mutex lock; + u32 ver; + u8 lib_uar_4k : 1; + u8 lib_uar_dyn : 1; + u32 num_sys_pages; + u32 num_static_sys_pages; + u32 total_num_bfregs; + u32 num_dyn_bfregs; +}; struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; @@ -203,6 +217,11 @@ struct mlx5_ib_flow_matcher { u8 match_criteria_enable; }; +struct mlx5_ib_pp { + u16 index; + struct mlx5_core_dev *mdev; +}; + struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio egress_prios[MLX5_IB_NUM_FLOW_FT]; @@ -210,6 +229,7 @@ struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio egress[MLX5_IB_NUM_EGRESS_FTS]; struct mlx5_ib_flow_prio fdb; struct mlx5_ib_flow_prio rdma_rx[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio rdma_tx[MLX5_IB_NUM_FLOW_FT]; struct mlx5_flow_table *lag_demux_ft; /* Protect flow steering bypass flow tables * when add/del flow rules. @@ -618,8 +638,8 @@ struct mlx5_ib_mr { struct ib_umem *umem; struct mlx5_shared_mr_info *smr_info; struct list_head list; - int order; - bool allocated_from_cache; + unsigned int order; + struct mlx5_cache_ent *cache_ent; int npages; struct mlx5_ib_dev *dev; u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; @@ -701,22 +721,34 @@ struct mlx5_cache_ent { u32 access_mode; u32 page; - u32 size; - u32 cur; + u8 disabled:1; + u8 fill_to_high_water:1; + + /* + * - available_mrs is the length of list head, ie the number of MRs + * available for immediate allocation. + * - total_mrs is available_mrs plus all in use MRs that could be + * returned to the cache. + * - limit is the low water mark for available_mrs, 2* limit is the + * upper water mark. + * - pending is the number of MRs currently being created + */ + u32 total_mrs; + u32 available_mrs; + u32 limit; + u32 pending; + + /* Statistics */ u32 miss; - u32 limit; struct mlx5_ib_dev *dev; struct work_struct work; struct delayed_work dwork; - int pending; - struct completion compl; }; struct mlx5_mr_cache { struct workqueue_struct *wq; struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES]; - int stopped; struct dentry *root; unsigned long last_add; }; @@ -794,6 +826,7 @@ enum mlx5_ib_dbg_cc_types { MLX5_IB_DBG_CC_RP_BYTE_RESET, MLX5_IB_DBG_CC_RP_THRESHOLD, MLX5_IB_DBG_CC_RP_AI_RATE, + MLX5_IB_DBG_CC_RP_MAX_RATE, MLX5_IB_DBG_CC_RP_HAI_RATE, MLX5_IB_DBG_CC_RP_MIN_DEC_FAC, MLX5_IB_DBG_CC_RP_MIN_RATE, @@ -803,6 +836,7 @@ enum mlx5_ib_dbg_cc_types { MLX5_IB_DBG_CC_RP_RATE_REDUCE_MONITOR_PERIOD, MLX5_IB_DBG_CC_RP_INITIAL_ALPHA_VALUE, MLX5_IB_DBG_CC_RP_GD, + MLX5_IB_DBG_CC_NP_MIN_TIME_BETWEEN_CNPS, MLX5_IB_DBG_CC_NP_CNP_DSCP, MLX5_IB_DBG_CC_NP_CNP_PRIO_MODE, MLX5_IB_DBG_CC_NP_CNP_PRIO, @@ -986,19 +1020,16 @@ struct mlx5_ib_dev { */ struct mutex cap_mask_mutex; u8 ib_active:1; - u8 fill_delay:1; u8 is_rep:1; u8 lag_active:1; u8 wc_support:1; + u8 fill_delay; struct umr_common umrc; /* sync used page count stats */ struct mlx5_ib_resources devr; - /* protect mkey key part */ - spinlock_t mkey_lock; - u8 mkey_key; - + atomic_t mkey_var; struct mlx5_mr_cache cache; struct timer_list delay_timer; /* Prevents soft lock on massive reg MRs */ @@ -1268,7 +1299,8 @@ int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); -struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry); +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + unsigned int entry); void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr); @@ -1388,6 +1420,7 @@ int mlx5_ib_fill_stat_entry(struct sk_buff *msg, extern const struct uapi_definition mlx5_ib_devx_defs[]; extern const struct uapi_definition mlx5_ib_flow_defs[]; +extern const struct uapi_definition mlx5_ib_qos_defs[]; #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); @@ -1477,12 +1510,11 @@ static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, { u8 cqe_version = ucontext->cqe_version; - if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && - !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version && + (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) return 0; - if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != - !!cqe_version)) + if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version) return -EINVAL; return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); @@ -1495,12 +1527,11 @@ static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, { u8 cqe_version = ucontext->cqe_version; - if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && - !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + if ((offsetofend(typeof(*ucmd), uidx) <= inlen) && !cqe_version && + (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) return 0; - if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != - !!cqe_version)) + if ((offsetofend(typeof(*ucmd), uidx) <= inlen) != !!cqe_version) return -EINVAL; return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); @@ -1539,7 +1570,9 @@ static inline bool mlx5_ib_can_use_umr(struct mlx5_ib_dev *dev, MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) return false; - if (access_flags & IB_ACCESS_RELAXED_ORDERING) + if (access_flags & IB_ACCESS_RELAXED_ORDERING && + (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) || + MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))) return false; return true; diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 8508af500972..a401931189b7 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -54,12 +54,8 @@ static void assign_mkey_variant(struct mlx5_ib_dev *dev, struct mlx5_core_mkey *mkey, u32 *in) { + u8 key = atomic_inc_return(&dev->mkey_var); void *mkc; - u8 key; - - spin_lock_irq(&dev->mkey_lock); - key = dev->mkey_key++; - spin_unlock_irq(&dev->mkey_lock); mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, mkey_7_0, key); @@ -90,6 +86,7 @@ mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev, static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); +static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { @@ -103,16 +100,6 @@ static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } -static int order2idx(struct mlx5_ib_dev *dev, int order) -{ - struct mlx5_mr_cache *cache = &dev->cache; - - if (order < cache->ent[0].order) - return 0; - else - return order - cache->ent[0].order; -} - static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) { return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >= @@ -124,18 +111,16 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) struct mlx5_ib_mr *mr = container_of(context, struct mlx5_ib_mr, cb_work); struct mlx5_ib_dev *dev = mr->dev; - struct mlx5_mr_cache *cache = &dev->cache; - int c = order2idx(dev, mr->order); - struct mlx5_cache_ent *ent = &cache->ent[c]; + struct mlx5_cache_ent *ent = mr->cache_ent; unsigned long flags; - spin_lock_irqsave(&ent->lock, flags); - ent->pending--; - spin_unlock_irqrestore(&ent->lock, flags); if (status) { mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); kfree(mr); - dev->fill_delay = 1; + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + WRITE_ONCE(dev->fill_delay, 1); + spin_unlock_irqrestore(&ent->lock, flags); mod_timer(&dev->delay_timer, jiffies + HZ); return; } @@ -144,23 +129,44 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context) mr->mmkey.key |= mlx5_idx_to_mkey( MLX5_GET(create_mkey_out, mr->out, mkey_index)); - cache->last_add = jiffies; + WRITE_ONCE(dev->cache.last_add, jiffies); spin_lock_irqsave(&ent->lock, flags); list_add_tail(&mr->list, &ent->head); - ent->cur++; - ent->size++; + ent->available_mrs++; + ent->total_mrs++; + /* If we are doing fill_to_high_water then keep going. */ + queue_adjust_cache_locked(ent); + ent->pending--; spin_unlock_irqrestore(&ent->lock, flags); +} - if (!completion_done(&ent->compl)) - complete(&ent->compl); +static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) +{ + struct mlx5_ib_mr *mr; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return NULL; + mr->order = ent->order; + mr->cache_ent = ent; + mr->dev = ent->dev; + + MLX5_SET(mkc, mkc, free, 1); + MLX5_SET(mkc, mkc, umr_en, 1); + MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); + + MLX5_SET(mkc, mkc, qpn, 0xffffff); + MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); + MLX5_SET(mkc, mkc, log_page_size, ent->page); + return mr; } -static int add_keys(struct mlx5_ib_dev *dev, int c, int num) +/* Asynchronously schedule new MRs to be populated in the cache. */ +static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) { - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[c]; - int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mr *mr; void *mkc; u32 *in; @@ -173,42 +179,29 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); for (i = 0; i < num; i++) { - if (ent->pending >= MAX_PENDING_REG_MR) { - err = -EAGAIN; - break; - } - - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = alloc_cache_mr(ent, mkc); if (!mr) { err = -ENOMEM; break; } - mr->order = ent->order; - mr->allocated_from_cache = true; - mr->dev = dev; - - MLX5_SET(mkc, mkc, free, 1); - MLX5_SET(mkc, mkc, umr_en, 1); - MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); - MLX5_SET(mkc, mkc, access_mode_4_2, - (ent->access_mode >> 2) & 0x7); - - MLX5_SET(mkc, mkc, qpn, 0xffffff); - MLX5_SET(mkc, mkc, translations_octword_size, ent->xlt); - MLX5_SET(mkc, mkc, log_page_size, ent->page); - spin_lock_irq(&ent->lock); + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + spin_unlock_irq(&ent->lock); + kfree(mr); + break; + } ent->pending++; spin_unlock_irq(&ent->lock); - err = mlx5_ib_create_mkey_cb(dev, &mr->mmkey, - &dev->async_ctx, in, inlen, - mr->out, sizeof(mr->out), - &mr->cb_work); + err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, + &ent->dev->async_ctx, in, inlen, + mr->out, sizeof(mr->out), + &mr->cb_work); if (err) { spin_lock_irq(&ent->lock); ent->pending--; spin_unlock_irq(&ent->lock); - mlx5_ib_warn(dev, "create mkey failed %d\n", err); + mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); kfree(mr); break; } @@ -218,70 +211,128 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) return err; } -static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) +/* Synchronously create a MR in the cache */ +static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) { - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent = &cache->ent[c]; - struct mlx5_ib_mr *tmp_mr; + size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); struct mlx5_ib_mr *mr; - LIST_HEAD(del_list); - int i; + void *mkc; + u32 *in; + int err; - for (i = 0; i < num; i++) { - spin_lock_irq(&ent->lock); - if (list_empty(&ent->head)) { - spin_unlock_irq(&ent->lock); - break; - } - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); - list_move(&mr->list, &del_list); - ent->cur--; - ent->size--; - spin_unlock_irq(&ent->lock); - mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); - } + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return ERR_PTR(-ENOMEM); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); - list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { - list_del(&mr->list); - kfree(mr); + mr = alloc_cache_mr(ent, mkc); + if (!mr) { + err = -ENOMEM; + goto free_in; } + + err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey, in, inlen); + if (err) + goto free_mr; + + mr->mmkey.type = MLX5_MKEY_MR; + WRITE_ONCE(ent->dev->cache.last_add, jiffies); + spin_lock_irq(&ent->lock); + ent->total_mrs++; + spin_unlock_irq(&ent->lock); + kfree(in); + return mr; +free_mr: + kfree(mr); +free_in: + kfree(in); + return ERR_PTR(err); } -static ssize_t size_write(struct file *filp, const char __user *buf, - size_t count, loff_t *pos) +static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) +{ + struct mlx5_ib_mr *mr; + + lockdep_assert_held(&ent->lock); + if (list_empty(&ent->head)) + return; + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->available_mrs--; + ent->total_mrs--; + spin_unlock_irq(&ent->lock); + mlx5_core_destroy_mkey(ent->dev->mdev, &mr->mmkey); + kfree(mr); + spin_lock_irq(&ent->lock); +} + +static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, + bool limit_fill) { - struct mlx5_cache_ent *ent = filp->private_data; - struct mlx5_ib_dev *dev = ent->dev; - char lbuf[20] = {0}; - u32 var; int err; - int c; - count = min(count, sizeof(lbuf) - 1); - if (copy_from_user(lbuf, buf, count)) - return -EFAULT; + lockdep_assert_held(&ent->lock); - c = order2idx(dev, ent->order); + while (true) { + if (limit_fill) + target = ent->limit * 2; + if (target == ent->available_mrs + ent->pending) + return 0; + if (target > ent->available_mrs + ent->pending) { + u32 todo = target - (ent->available_mrs + ent->pending); - if (sscanf(lbuf, "%u", &var) != 1) - return -EINVAL; + spin_unlock_irq(&ent->lock); + err = add_keys(ent, todo); + if (err == -EAGAIN) + usleep_range(3000, 5000); + spin_lock_irq(&ent->lock); + if (err) { + if (err != -EAGAIN) + return err; + } else + return 0; + } else { + remove_cache_mr_locked(ent); + } + } +} - if (var < ent->limit) - return -EINVAL; +static ssize_t size_write(struct file *filp, const char __user *buf, + size_t count, loff_t *pos) +{ + struct mlx5_cache_ent *ent = filp->private_data; + u32 target; + int err; - if (var > ent->size) { - do { - err = add_keys(dev, c, var - ent->size); - if (err && err != -EAGAIN) - return err; + err = kstrtou32_from_user(buf, count, 0, &target); + if (err) + return err; - usleep_range(3000, 5000); - } while (err); - } else if (var < ent->size) { - remove_keys(dev, c, ent->size - var); + /* + * Target is the new value of total_mrs the user requests, however we + * cannot free MRs that are in use. Compute the target value for + * available_mrs. + */ + spin_lock_irq(&ent->lock); + if (target < ent->total_mrs - ent->available_mrs) { + err = -EINVAL; + goto err_unlock; } + target = target - (ent->total_mrs - ent->available_mrs); + if (target < ent->limit || target > ent->limit*2) { + err = -EINVAL; + goto err_unlock; + } + err = resize_available_mrs(ent, target, false); + if (err) + goto err_unlock; + spin_unlock_irq(&ent->lock); return count; + +err_unlock: + spin_unlock_irq(&ent->lock); + return err; } static ssize_t size_read(struct file *filp, char __user *buf, size_t count, @@ -291,7 +342,7 @@ static ssize_t size_read(struct file *filp, char __user *buf, size_t count, char lbuf[20]; int err; - err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size); + err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs); if (err < 0) return err; @@ -309,32 +360,23 @@ static ssize_t limit_write(struct file *filp, const char __user *buf, size_t count, loff_t *pos) { struct mlx5_cache_ent *ent = filp->private_data; - struct mlx5_ib_dev *dev = ent->dev; - char lbuf[20] = {0}; u32 var; int err; - int c; - - count = min(count, sizeof(lbuf) - 1); - if (copy_from_user(lbuf, buf, count)) - return -EFAULT; - - c = order2idx(dev, ent->order); - - if (sscanf(lbuf, "%u", &var) != 1) - return -EINVAL; - if (var > ent->size) - return -EINVAL; + err = kstrtou32_from_user(buf, count, 0, &var); + if (err) + return err; + /* + * Upon set we immediately fill the cache to high water mark implied by + * the limit. + */ + spin_lock_irq(&ent->lock); ent->limit = var; - - if (ent->cur < ent->limit) { - err = add_keys(dev, c, 2 * ent->limit - ent->cur); - if (err) - return err; - } - + err = resize_available_mrs(ent, 0, true); + spin_unlock_irq(&ent->lock); + if (err) + return err; return count; } @@ -359,68 +401,119 @@ static const struct file_operations limit_fops = { .read = limit_read, }; -static int someone_adding(struct mlx5_mr_cache *cache) +static bool someone_adding(struct mlx5_mr_cache *cache) { - int i; + unsigned int i; for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { - if (cache->ent[i].cur < cache->ent[i].limit) - return 1; + struct mlx5_cache_ent *ent = &cache->ent[i]; + bool ret; + + spin_lock_irq(&ent->lock); + ret = ent->available_mrs < ent->limit; + spin_unlock_irq(&ent->lock); + if (ret) + return true; } + return false; +} - return 0; +/* + * Check if the bucket is outside the high/low water mark and schedule an async + * update. The cache refill has hysteresis, once the low water mark is hit it is + * refilled up to the high mark. + */ +static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) +{ + lockdep_assert_held(&ent->lock); + + if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) + return; + if (ent->available_mrs < ent->limit) { + ent->fill_to_high_water = true; + queue_work(ent->dev->cache.wq, &ent->work); + } else if (ent->fill_to_high_water && + ent->available_mrs + ent->pending < 2 * ent->limit) { + /* + * Once we start populating due to hitting a low water mark + * continue until we pass the high water mark. + */ + queue_work(ent->dev->cache.wq, &ent->work); + } else if (ent->available_mrs == 2 * ent->limit) { + ent->fill_to_high_water = false; + } else if (ent->available_mrs > 2 * ent->limit) { + /* Queue deletion of excess entries */ + ent->fill_to_high_water = false; + if (ent->pending) + queue_delayed_work(ent->dev->cache.wq, &ent->dwork, + msecs_to_jiffies(1000)); + else + queue_work(ent->dev->cache.wq, &ent->work); + } } static void __cache_work_func(struct mlx5_cache_ent *ent) { struct mlx5_ib_dev *dev = ent->dev; struct mlx5_mr_cache *cache = &dev->cache; - int i = order2idx(dev, ent->order); int err; - if (cache->stopped) - return; + spin_lock_irq(&ent->lock); + if (ent->disabled) + goto out; - ent = &dev->cache.ent[i]; - if (ent->cur < 2 * ent->limit && !dev->fill_delay) { - err = add_keys(dev, i, 1); - if (ent->cur < 2 * ent->limit) { - if (err == -EAGAIN) { - mlx5_ib_dbg(dev, "returned eagain, order %d\n", - i + 2); - queue_delayed_work(cache->wq, &ent->dwork, - msecs_to_jiffies(3)); - } else if (err) { - mlx5_ib_warn(dev, "command failed order %d, err %d\n", - i + 2, err); + if (ent->fill_to_high_water && + ent->available_mrs + ent->pending < 2 * ent->limit && + !READ_ONCE(dev->fill_delay)) { + spin_unlock_irq(&ent->lock); + err = add_keys(ent, 1); + spin_lock_irq(&ent->lock); + if (ent->disabled) + goto out; + if (err) { + /* + * EAGAIN only happens if pending is positive, so we + * will be rescheduled from reg_mr_callback(). The only + * failure path here is ENOMEM. + */ + if (err != -EAGAIN) { + mlx5_ib_warn( + dev, + "command failed order %d, err %d\n", + ent->order, err); queue_delayed_work(cache->wq, &ent->dwork, msecs_to_jiffies(1000)); - } else { - queue_work(cache->wq, &ent->work); } } - } else if (ent->cur > 2 * ent->limit) { + } else if (ent->available_mrs > 2 * ent->limit) { + bool need_delay; + /* - * The remove_keys() logic is performed as garbage collection - * task. Such task is intended to be run when no other active - * processes are running. + * The remove_cache_mr() logic is performed as garbage + * collection task. Such task is intended to be run when no + * other active processes are running. * * The need_resched() will return TRUE if there are user tasks * to be activated in near future. * - * In such case, we don't execute remove_keys() and postpone - * the garbage collection work to try to run in next cycle, - * in order to free CPU resources to other tasks. + * In such case, we don't execute remove_cache_mr() and postpone + * the garbage collection work to try to run in next cycle, in + * order to free CPU resources to other tasks. */ - if (!need_resched() && !someone_adding(cache) && - time_after(jiffies, cache->last_add + 300 * HZ)) { - remove_keys(dev, i, 1); - if (ent->cur > ent->limit) - queue_work(cache->wq, &ent->work); - } else { + spin_unlock_irq(&ent->lock); + need_delay = need_resched() || someone_adding(cache) || + time_after(jiffies, + READ_ONCE(cache->last_add) + 300 * HZ); + spin_lock_irq(&ent->lock); + if (ent->disabled) + goto out; + if (need_delay) queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); - } + remove_cache_mr_locked(ent); + queue_adjust_cache_locked(ent); } +out: + spin_unlock_irq(&ent->lock); } static void delayed_cache_work_func(struct work_struct *work) @@ -439,117 +532,95 @@ static void cache_work_func(struct work_struct *work) __cache_work_func(ent); } -struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry) +/* Allocate a special entry from the cache */ +struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, + unsigned int entry) { struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; - int err; - if (entry < 0 || entry >= MAX_MR_CACHE_ENTRIES) { - mlx5_ib_err(dev, "cache entry %d is out of range\n", entry); + if (WARN_ON(entry <= MR_CACHE_LAST_STD_ENTRY || + entry >= ARRAY_SIZE(cache->ent))) return ERR_PTR(-EINVAL); - } ent = &cache->ent[entry]; - while (1) { - spin_lock_irq(&ent->lock); - if (list_empty(&ent->head)) { - spin_unlock_irq(&ent->lock); - - err = add_keys(dev, entry, 1); - if (err && err != -EAGAIN) - return ERR_PTR(err); - - wait_for_completion(&ent->compl); - } else { - mr = list_first_entry(&ent->head, struct mlx5_ib_mr, - list); - list_del(&mr->list); - ent->cur--; - spin_unlock_irq(&ent->lock); - if (ent->cur < ent->limit) - queue_work(cache->wq, &ent->work); + spin_lock_irq(&ent->lock); + if (list_empty(&ent->head)) { + spin_unlock_irq(&ent->lock); + mr = create_cache_mr(ent); + if (IS_ERR(mr)) return mr; - } + } else { + mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); + list_del(&mr->list); + ent->available_mrs--; + queue_adjust_cache_locked(ent); + spin_unlock_irq(&ent->lock); } + return mr; } -static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) +/* Return a MR already available in the cache */ +static struct mlx5_ib_mr *get_cache_mr(struct mlx5_cache_ent *req_ent) { - struct mlx5_mr_cache *cache = &dev->cache; + struct mlx5_ib_dev *dev = req_ent->dev; struct mlx5_ib_mr *mr = NULL; - struct mlx5_cache_ent *ent; - int last_umr_cache_entry; - int c; - int i; + struct mlx5_cache_ent *ent = req_ent; - c = order2idx(dev, order); - last_umr_cache_entry = order2idx(dev, mr_cache_max_order(dev)); - if (c < 0 || c > last_umr_cache_entry) { - mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c); - return NULL; - } - - for (i = c; i <= last_umr_cache_entry; i++) { - ent = &cache->ent[i]; - - mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); + /* Try larger MR pools from the cache to satisfy the allocation */ + for (; ent != &dev->cache.ent[MR_CACHE_LAST_STD_ENTRY + 1]; ent++) { + mlx5_ib_dbg(dev, "order %u, cache index %zu\n", ent->order, + ent - dev->cache.ent); spin_lock_irq(&ent->lock); if (!list_empty(&ent->head)) { mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); - ent->cur--; + ent->available_mrs--; + queue_adjust_cache_locked(ent); spin_unlock_irq(&ent->lock); - if (ent->cur < ent->limit) - queue_work(cache->wq, &ent->work); break; } + queue_adjust_cache_locked(ent); spin_unlock_irq(&ent->lock); - - queue_work(cache->wq, &ent->work); } if (!mr) - cache->ent[c].miss++; + req_ent->miss++; return mr; } +static void detach_mr_from_cache(struct mlx5_ib_mr *mr) +{ + struct mlx5_cache_ent *ent = mr->cache_ent; + + mr->cache_ent = NULL; + spin_lock_irq(&ent->lock); + ent->total_mrs--; + spin_unlock_irq(&ent->lock); +} + void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - struct mlx5_mr_cache *cache = &dev->cache; - struct mlx5_cache_ent *ent; - int shrink = 0; - int c; + struct mlx5_cache_ent *ent = mr->cache_ent; - if (!mr->allocated_from_cache) + if (!ent) return; - c = order2idx(dev, mr->order); - WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES); - if (mlx5_mr_cache_invalidate(mr)) { - mr->allocated_from_cache = false; + detach_mr_from_cache(mr); destroy_mkey(dev, mr); - ent = &cache->ent[c]; - if (ent->cur < ent->limit) - queue_work(cache->wq, &ent->work); return; } - ent = &cache->ent[c]; spin_lock_irq(&ent->lock); list_add_tail(&mr->list, &ent->head); - ent->cur++; - if (ent->cur > 2 * ent->limit) - shrink = 1; + ent->available_mrs++; + queue_adjust_cache_locked(ent); spin_unlock_irq(&ent->lock); - - if (shrink) - queue_work(cache->wq, &ent->work); } static void clean_keys(struct mlx5_ib_dev *dev, int c) @@ -569,8 +640,8 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_move(&mr->list, &del_list); - ent->cur--; - ent->size--; + ent->available_mrs--; + ent->total_mrs--; spin_unlock_irq(&ent->lock); mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } @@ -608,7 +679,7 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) dir = debugfs_create_dir(ent->name, cache->root); debugfs_create_file("size", 0600, dir, ent, &size_fops); debugfs_create_file("limit", 0600, dir, ent, &limit_fops); - debugfs_create_u32("cur", 0400, dir, &ent->cur); + debugfs_create_u32("cur", 0400, dir, &ent->available_mrs); debugfs_create_u32("miss", 0600, dir, &ent->miss); } } @@ -617,7 +688,7 @@ static void delay_time_func(struct timer_list *t) { struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); - dev->fill_delay = 0; + WRITE_ONCE(dev->fill_delay, 0); } int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) @@ -643,7 +714,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->dev = dev; ent->limit = 0; - init_completion(&ent->compl); INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); @@ -665,7 +735,9 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->limit = dev->mdev->profile->mr_cache[i].limit; else ent->limit = 0; - queue_work(cache->wq, &ent->work); + spin_lock_irq(&ent->lock); + queue_adjust_cache_locked(ent); + spin_unlock_irq(&ent->lock); } mlx5_mr_cache_debugfs_init(dev); @@ -675,13 +747,20 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) { - int i; + unsigned int i; if (!dev->cache.wq) return 0; - dev->cache.stopped = 1; - flush_workqueue(dev->cache.wq); + for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { + struct mlx5_cache_ent *ent = &dev->cache.ent[i]; + + spin_lock_irq(&ent->lock); + ent->disabled = true; + spin_unlock_irq(&ent->lock); + cancel_work_sync(&ent->work); + cancel_delayed_work_sync(&ent->dwork); + } mlx5_mr_cache_debugfs_cleanup(dev); mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); @@ -876,31 +955,37 @@ static int mlx5_ib_post_send_wait(struct mlx5_ib_dev *dev, return err; } -static struct mlx5_ib_mr *alloc_mr_from_cache( - struct ib_pd *pd, struct ib_umem *umem, - u64 virt_addr, u64 len, int npages, - int page_shift, int order, int access_flags) +static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev, + unsigned int order) +{ + struct mlx5_mr_cache *cache = &dev->cache; + + if (order < cache->ent[0].order) + return &cache->ent[0]; + order = order - cache->ent[0].order; + if (order > MR_CACHE_LAST_STD_ENTRY) + return NULL; + return &cache->ent[order]; +} + +static struct mlx5_ib_mr * +alloc_mr_from_cache(struct ib_pd *pd, struct ib_umem *umem, u64 virt_addr, + u64 len, int npages, int page_shift, unsigned int order, + int access_flags) { struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_cache_ent *ent = mr_cache_ent_from_order(dev, order); struct mlx5_ib_mr *mr; - int err = 0; - int i; - - for (i = 0; i < 1; i++) { - mr = alloc_cached_mr(dev, order); - if (mr) - break; - err = add_keys(dev, order2idx(dev, order), 1); - if (err && err != -EAGAIN) { - mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); - break; - } + if (!ent) + return ERR_PTR(-E2BIG); + mr = get_cache_mr(ent); + if (!mr) { + mr = create_cache_mr(ent); + if (IS_ERR(mr)) + return mr; } - if (!mr) - return ERR_PTR(-EAGAIN); - mr->ibmr.pd = pd; mr->umem = umem; mr->access_flags = access_flags; @@ -1474,10 +1559,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, /* * UMR can't be used - MKey needs to be replaced. */ - if (mr->allocated_from_cache) - err = mlx5_mr_cache_invalidate(mr); - else - err = destroy_mkey(dev, mr); + if (mr->cache_ent) + detach_mr_from_cache(mr); + err = destroy_mkey(dev, mr); if (err) goto err; @@ -1489,8 +1573,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, mr = to_mmr(ib_mr); goto err; } - - mr->allocated_from_cache = false; } else { /* * Send a UMR WQE @@ -1577,8 +1659,6 @@ mlx5_free_priv_descs(struct mlx5_ib_mr *mr) static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - int allocated_from_cache = mr->allocated_from_cache; - if (mr->sig) { if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) @@ -1593,7 +1673,7 @@ static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) mr->sig = NULL; } - if (!allocated_from_cache) { + if (!mr->cache_ent) { destroy_mkey(dev, mr); mlx5_free_priv_descs(mr); } @@ -1610,7 +1690,7 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) else clean_mr(dev, mr); - if (mr->allocated_from_cache) + if (mr->cache_ent) mlx5_mr_cache_free(dev, mr); else kfree(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index bf50cd91f472..3de7606d4a1a 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -197,7 +197,7 @@ static void dma_fence_odp_mr(struct mlx5_ib_mr *mr) odp->private = NULL; mutex_unlock(&odp->umem_mutex); - if (!mr->allocated_from_cache) { + if (!mr->cache_ent) { mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey); WARN_ON(mr->descs); } diff --git a/drivers/infiniband/hw/mlx5/qos.c b/drivers/infiniband/hw/mlx5/qos.c new file mode 100644 index 000000000000..cac878a70edb --- /dev/null +++ b/drivers/infiniband/hw/mlx5/qos.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2020, Mellanox Technologies inc. All rights reserved. + */ + +#include <rdma/uverbs_ioctl.h> +#include <rdma/mlx5_user_ioctl_cmds.h> +#include <rdma/mlx5_user_ioctl_verbs.h> +#include <linux/mlx5/driver.h> +#include "mlx5_ib.h" + +#define UVERBS_MODULE_NAME mlx5_ib +#include <rdma/uverbs_named_ioctl.h> + +static bool pp_is_supported(struct ib_device *device) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + + return (MLX5_CAP_GEN(dev->mdev, qos) && + MLX5_CAP_QOS(dev->mdev, packet_pacing) && + MLX5_CAP_QOS(dev->mdev, packet_pacing_uid)); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_PP_OBJ_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + u8 rl_raw[MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)] = {}; + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE); + struct mlx5_ib_dev *dev; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_pp *pp_entry; + void *in_ctx; + u16 uid; + int inlen; + u32 flags; + int err; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + + /* The allocated entry can be used only by a DEVX context */ + if (!c->devx_uid) + return -EINVAL; + + dev = to_mdev(c->ibucontext.device); + pp_entry = kzalloc(sizeof(*pp_entry), GFP_KERNEL); + if (!pp_entry) + return -ENOMEM; + + in_ctx = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX); + inlen = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX); + memcpy(rl_raw, in_ctx, inlen); + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS, + MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX); + if (err) + goto err; + + uid = (flags & MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX) ? + c->devx_uid : MLX5_SHARED_RESOURCE_UID; + + err = mlx5_rl_add_rate_raw(dev->mdev, rl_raw, uid, + (flags & MLX5_IB_UAPI_PP_ALLOC_FLAGS_DEDICATED_INDEX), + &pp_entry->index); + if (err) + goto err; + + err = uverbs_copy_to(attrs, MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX, + &pp_entry->index, sizeof(pp_entry->index)); + if (err) + goto clean; + + pp_entry->mdev = dev->mdev; + uobj->object = pp_entry; + return 0; + +clean: + mlx5_rl_remove_rate_raw(dev->mdev, pp_entry->index); +err: + kfree(pp_entry); + return err; +} + +static int pp_obj_cleanup(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_ib_pp *pp_entry = uobject->object; + + mlx5_rl_remove_rate_raw(pp_entry->mdev, pp_entry->index); + kfree(pp_entry); + return 0; +} + +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_PP_OBJ_ALLOC, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_PP_OBJ_ALLOC_HANDLE, + MLX5_IB_OBJECT_PP, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN( + MLX5_IB_ATTR_PP_OBJ_ALLOC_CTX, + UVERBS_ATTR_SIZE(1, + MLX5_ST_SZ_BYTES(set_pp_rate_limit_context)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_PP_OBJ_ALLOC_FLAGS, + enum mlx5_ib_uapi_pp_alloc_flags, + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_PP_OBJ_ALLOC_INDEX, + UVERBS_ATTR_TYPE(u16), + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + MLX5_IB_METHOD_PP_OBJ_DESTROY, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_PP_OBJ_DESTROY_HANDLE, + MLX5_IB_OBJECT_PP, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_PP, + UVERBS_TYPE_ALLOC_IDR(pp_obj_cleanup), + &UVERBS_METHOD(MLX5_IB_METHOD_PP_OBJ_ALLOC), + &UVERBS_METHOD(MLX5_IB_METHOD_PP_OBJ_DESTROY)); + + +const struct uapi_definition mlx5_ib_qos_defs[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED( + MLX5_IB_OBJECT_PP, + UAPI_DEF_IS_OBJ_SUPPORTED(pp_is_supported)), + {}, +}; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8fe149e808af..1456db4b6295 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -697,6 +697,9 @@ static int alloc_bfreg(struct mlx5_ib_dev *dev, { int bfregn = -ENOMEM; + if (bfregi->lib_uar_dyn) + return -EINVAL; + mutex_lock(&bfregi->lock); if (bfregi->ver >= 2) { bfregn = alloc_high_class_bfreg(dev, bfregi); @@ -768,6 +771,9 @@ int bfregn_to_uar_index(struct mlx5_ib_dev *dev, u32 index_of_sys_page; u32 offset; + if (bfregi->lib_uar_dyn) + return -EINVAL; + bfregs_per_sys_page = get_uars_per_sys_page(dev, bfregi->lib_uar_4k) * MLX5_NON_FP_BFREGS_PER_UAR; index_of_sys_page = bfregn / bfregs_per_sys_page; @@ -919,6 +925,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, void *qpc; int err; u16 uid; + u32 uar_flags; err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)); if (err) { @@ -928,24 +935,29 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext, ibucontext); - if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) { + uar_flags = ucmd.flags & (MLX5_QP_FLAG_UAR_PAGE_INDEX | + MLX5_QP_FLAG_BFREG_INDEX); + switch (uar_flags) { + case MLX5_QP_FLAG_UAR_PAGE_INDEX: + uar_index = ucmd.bfreg_index; + bfregn = MLX5_IB_INVALID_BFREG; + break; + case MLX5_QP_FLAG_BFREG_INDEX: uar_index = bfregn_to_uar_index(dev, &context->bfregi, ucmd.bfreg_index, true); if (uar_index < 0) return uar_index; - bfregn = MLX5_IB_INVALID_BFREG; - } else if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) { - /* - * TBD: should come from the verbs when we have the API - */ - /* In CROSS_CHANNEL CQ and QP must use the same UAR */ - bfregn = MLX5_CROSS_CHANNEL_BFREG; - } - else { + break; + case 0: + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + return -EINVAL; bfregn = alloc_bfreg(dev, &context->bfregi); if (bfregn < 0) return bfregn; + break; + default: + return -EINVAL; } mlx5_ib_dbg(dev, "bfregn 0x%x, uar_index 0x%x\n", bfregn, uar_index); @@ -2100,6 +2112,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC | MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | MLX5_QP_FLAG_TUNNEL_OFFLOADS | + MLX5_QP_FLAG_UAR_PAGE_INDEX | MLX5_QP_FLAG_TYPE_DCI | MLX5_QP_FLAG_TYPE_DCT)) return -EINVAL; @@ -2789,7 +2802,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, mlx5_ib_dbg(dev, "unsupported qp type %d\n", init_attr->qp_type); /* Don't support raw QPs */ - return ERR_PTR(-EINVAL); + return ERR_PTR(-EOPNOTSUPP); } if (verbs_init_attr->qp_type == IB_QPT_DRIVER) |