Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.c      |  93
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.h      |   4
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.c        |  37
-rw-r--r--  drivers/infiniband/hw/mlx5/mad.c       |   8
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c      |  90
-rw-r--r--  drivers/infiniband/hw/mlx5/mlx5_ib.h   |   9
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c        |  17
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c       |  70
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c        |  51
-rw-r--r--  drivers/infiniband/hw/mlx5/restrack.c  |   9
10 files changed, 304 insertions(+), 84 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 69999d8d24f3..4186884c66e1 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -27,6 +27,19 @@ enum devx_obj_flags { DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0, DEVX_OBJ_FLAGS_DCT = 1 << 1, DEVX_OBJ_FLAGS_CQ = 1 << 2, + DEVX_OBJ_FLAGS_HW_FREED = 1 << 3, +}; + +#define MAX_ASYNC_CMDS 8 + +struct mlx5_async_cmd { + struct ib_uobject *uobject; + void *in; + int in_size; + u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)]; + int err; + struct mlx5_async_work cb_work; + struct completion comp; }; struct devx_async_data { @@ -1405,7 +1418,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, */ mlx5r_deref_wait_odp_mkey(&obj->mkey); - if (obj->flags & DEVX_OBJ_FLAGS_DCT) + if (obj->flags & DEVX_OBJ_FLAGS_HW_FREED) + ret = 0; + else if (obj->flags & DEVX_OBJ_FLAGS_DCT) ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct); else if (obj->flags & DEVX_OBJ_FLAGS_CQ) ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq); @@ -2595,6 +2610,82 @@ void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) } } +static void devx_async_destroy_cb(int status, struct mlx5_async_work *context) +{ + struct mlx5_async_cmd *devx_out = container_of(context, + struct mlx5_async_cmd, cb_work); + struct devx_obj *obj = devx_out->uobject->object; + + if (!status) + obj->flags |= DEVX_OBJ_FLAGS_HW_FREED; + + complete(&devx_out->comp); +} + +static void devx_async_destroy(struct mlx5_ib_dev *dev, + struct mlx5_async_cmd *cmd) +{ + init_completion(&cmd->comp); + cmd->err = mlx5_cmd_exec_cb(&dev->async_ctx, cmd->in, cmd->in_size, + &cmd->out, sizeof(cmd->out), + devx_async_destroy_cb, &cmd->cb_work); +} + +static void devx_wait_async_destroy(struct mlx5_async_cmd *cmd) +{ + if (!cmd->err) + wait_for_completion(&cmd->comp); + atomic_set(&cmd->uobject->usecnt, 0); +} + +void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile) +{ + struct mlx5_async_cmd async_cmd[MAX_ASYNC_CMDS]; + struct ib_ucontext *ucontext = ufile->ucontext; + struct ib_device *device = ucontext->device; + struct mlx5_ib_dev *dev = to_mdev(device); + struct ib_uobject *uobject; + struct devx_obj *obj; + int head = 0; + int tail = 0; + + list_for_each_entry(uobject, &ufile->uobjects, list) { + WARN_ON(uverbs_try_lock_object(uobject, UVERBS_LOOKUP_WRITE)); + + /* + * Currently we only support QP destruction, if other objects + * are to be destroyed need to add type synchronization to the + * cleanup algorithm and handle pre/post FW cleanup for the + * new types if needed. 
+ */ + if (uobj_get_object_id(uobject) != MLX5_IB_OBJECT_DEVX_OBJ || + (get_dec_obj_type(uobject->object, MLX5_EVENT_TYPE_MAX) != + MLX5_OBJ_TYPE_QP)) { + atomic_set(&uobject->usecnt, 0); + continue; + } + + obj = uobject->object; + + async_cmd[tail % MAX_ASYNC_CMDS].in = obj->dinbox; + async_cmd[tail % MAX_ASYNC_CMDS].in_size = obj->dinlen; + async_cmd[tail % MAX_ASYNC_CMDS].uobject = uobject; + + devx_async_destroy(dev, &async_cmd[tail % MAX_ASYNC_CMDS]); + tail++; + + if (tail - head == MAX_ASYNC_CMDS) { + devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]); + head++; + } + } + + while (head != tail) { + devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]); + head++; + } +} + static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf, size_t count, loff_t *pos) { diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h index ee2213275fd6..1344bf4c9d21 100644 --- a/drivers/infiniband/hw/mlx5/devx.h +++ b/drivers/infiniband/hw/mlx5/devx.h @@ -28,6 +28,7 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user); void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); int mlx5_ib_devx_init(struct mlx5_ib_dev *dev); void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev); +void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile); #else static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user) { @@ -41,5 +42,8 @@ static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev) static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev) { } +static inline void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile) +{ +} #endif #endif /* _MLX5_IB_DEVX_H */ diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c index 520034acf73a..162814ae8cb4 100644 --- a/drivers/infiniband/hw/mlx5/fs.c +++ b/drivers/infiniband/hw/mlx5/fs.c @@ -943,7 +943,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num, } dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst.counter_id = mlx5_fc_id(opfc->fc); + dst.counter = opfc->fc; flow_act.action = MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW; @@ -1113,8 +1113,8 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, handler->ibcounters = flow_act.counters; dest_arr[dest_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dest_arr[dest_num].counter_id = - mlx5_fc_id(mcounters->hw_cntrs_hndl); + dest_arr[dest_num].counter = + mcounters->hw_cntrs_hndl; dest_num++; } @@ -1603,7 +1603,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, static struct mlx5_ib_flow_handler *raw_fs_rule_add( struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act, - u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type) + struct mlx5_fc *counter, void *cmd_in, int inlen, int dest_id, int dest_type) { struct mlx5_flow_destination *dst; struct mlx5_ib_flow_prio *ft_prio; @@ -1652,8 +1652,12 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add( } if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + if (WARN_ON(!counter)) { + err = -EINVAL; + goto unlock; + } dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; - dst[dst_num].counter_id = counter_id; + dst[dst_num].counter = counter; dst_num++; } @@ -1878,7 +1882,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs, return 0; } -static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) +static bool +is_flow_counter(void *obj, u32 
offset, u32 *counter_id, u32 *fc_bulk_size) { struct devx_obj *devx_obj = obj; u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode); @@ -1888,6 +1893,7 @@ static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id) if (offset && offset >= devx_obj->flow_counter_bulk_size) return false; + *fc_bulk_size = devx_obj->flow_counter_bulk_size; *counter_id = MLX5_GET(dealloc_flow_counter_in, devx_obj->dinbox, flow_counter_id); @@ -1904,13 +1910,13 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( { struct mlx5_flow_context flow_context = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; - u32 *offset_attr, offset = 0, counter_id = 0; int dest_id, dest_type = -1, inlen, len, ret, i; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; struct ib_uobject **arr_flow_actions; struct ib_uflow_resources *uflow_res; struct mlx5_flow_act flow_act = {}; + struct mlx5_fc *counter = NULL; struct ib_qp *qp = NULL; void *devx_obj, *cmd_in; struct ib_uobject *uobj; @@ -1937,6 +1943,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions); if (len) { + u32 *offset_attr, fc_bulk_size, offset = 0, counter_id = 0; devx_obj = arr_flow_actions[0]->object; if (uverbs_attr_is_valid(attrs, @@ -1956,8 +1963,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( offset = *offset_attr; } - if (!is_flow_counter(devx_obj, offset, &counter_id)) + if (!is_flow_counter(devx_obj, offset, &counter_id, &fc_bulk_size)) return -EINVAL; + counter = mlx5_fc_local_create(counter_id, offset, fc_bulk_size); + if (IS_ERR(counter)) + return PTR_ERR(counter); flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; } @@ -1968,8 +1978,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); - if (!uflow_res) - return -ENOMEM; + if (!uflow_res) { + ret = -ENOMEM; + goto destroy_counter; + } len = uverbs_attr_get_uobjs_arr(attrs, MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); @@ -1996,7 +2008,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( flow_handler = raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act, - counter_id, cmd_in, inlen, dest_id, dest_type); + counter, cmd_in, inlen, dest_id, dest_type); if (IS_ERR(flow_handler)) { ret = PTR_ERR(flow_handler); goto err_out; @@ -2007,6 +2019,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( return 0; err_out: ib_uverbs_flow_resources_free(uflow_res); +destroy_counter: + if (counter) + mlx5_fc_local_destroy(counter); return ret; } diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c index 1b6c5e37d169..2453ae4384a7 100644 --- a/drivers/infiniband/hw/mlx5/mad.c +++ b/drivers/infiniband/hw/mlx5/mad.c @@ -278,7 +278,13 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num, goto done; } - err = query_ib_ppcnt(mdev, mdev_port_num, 0, out_cnt, sz, 0); + if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI) + err = query_ib_ppcnt(mdev, mdev_port_num, port_num, + out_cnt, sz, 0); + else + err = query_ib_ppcnt(mdev, mdev_port_num, 0, + out_cnt, sz, 0); + if (!err) pma_cnt_assign(pma_cnt, out_cnt); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 4999239c8f41..81849eb671a1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -242,6 +242,10 @@ static int mlx5_netdev_event(struct 
notifier_block *this, case NETDEV_DOWN: { struct net_device *upper = NULL; + if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev) && + !mlx5_core_mp_enabled(mdev)) + return NOTIFY_DONE; + if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) { struct net_device *lag_ndev; @@ -1182,6 +1186,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE; resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT; + + if (MLX5_CAP_GEN_2(mdev, dp_ordering_force) && + (MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud) || + MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc))) + resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP; } if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) { @@ -2831,7 +2843,7 @@ static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) int err; *num_plane = 0; - if (!MLX5_CAP_GEN(mdev, ib_virt)) + if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane)) return 0; err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx); @@ -2997,7 +3009,6 @@ unlock: static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - int port; int ret; if (!MLX5_CAP_GEN(dev->mdev, xrc)) @@ -3013,10 +3024,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) return ret; } - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - INIT_WORK(&devr->ports[port].pkey_change_work, - pkey_change_handler); - mutex_init(&devr->cq_lock); mutex_init(&devr->srq_lock); @@ -3026,16 +3033,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev) static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) { struct mlx5_ib_resources *devr = &dev->devr; - int port; - - /* - * Make sure no change P_Key work items are still executing. - * - * At this stage, the mlx5_ib_event should be unregistered - * and it ensures that no new works are added. - */ - for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) - cancel_work_sync(&devr->ports[port].pkey_change_work); /* After s0/s1 init, they are not unset during the device lifetime. 
*/ if (devr->s1) { @@ -3211,12 +3208,14 @@ static int lag_event(struct notifier_block *nb, unsigned long event, void *data) struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev, lag_events); struct mlx5_core_dev *mdev = dev->mdev; + struct ib_device *ibdev = &dev->ib_dev; + struct net_device *old_ndev = NULL; struct mlx5_ib_port *port; struct net_device *ndev; - int i, err; - int portnum; + u32 portnum = 0; + int ret = 0; + int i; - portnum = 0; switch (event) { case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE: ndev = data; @@ -3232,19 +3231,24 @@ static int lag_event(struct notifier_block *nb, unsigned long event, void *data) } } } - err = ib_device_set_netdev(&dev->ib_dev, ndev, - portnum + 1); - dev_put(ndev); - if (err) - return err; - /* Rescan gids after new netdev assignment */ - rdma_roce_rescan_device(&dev->ib_dev); + old_ndev = ib_device_get_netdev(ibdev, portnum + 1); + ret = ib_device_set_netdev(ibdev, ndev, portnum + 1); + if (ret) + goto out; + + if (old_ndev) + roce_del_all_netdev_gids(ibdev, portnum + 1, + old_ndev); + rdma_roce_rescan_port(ibdev, portnum + 1); } break; default: return NOTIFY_DONE; } - return NOTIFY_OK; + +out: + dev_put(old_ndev); + return notifier_from_errno(ret); } static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev) @@ -3639,7 +3643,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list, list) { if (dev->sys_image_guid == mpi->sys_image_guid && - (mlx5_core_native_port_num(mpi->mdev) - 1) == i) { + (mlx5_core_native_port_num(mpi->mdev) - 1) == i && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) { bound = mlx5_ib_bind_slave_port(dev, mpi); } @@ -4134,6 +4139,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = { .req_notify_cq = mlx5_ib_arm_cq, .rereg_user_mr = mlx5_ib_rereg_user_mr, .resize_cq = mlx5_ib_resize_cq, + .ufile_hw_cleanup = mlx5_ib_ufile_hw_cleanup, INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs), @@ -4464,6 +4470,13 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + INIT_WORK(&devr->ports[port].pkey_change_work, + pkey_change_handler); + dev->mdev_events.notifier_call = mlx5_ib_event; mlx5_notifier_register(dev->mdev, &dev->mdev_events); @@ -4474,8 +4487,14 @@ static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev) static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) { + struct mlx5_ib_resources *devr = &dev->devr; + int port; + mlx5r_macsec_event_unregister(dev); mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); + + for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) + cancel_work_sync(&devr->ports[port].pkey_change_work); } void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, @@ -4565,9 +4584,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_ODP, mlx5_ib_odp_init_one, mlx5_ib_odp_cleanup_one), @@ -4592,6 +4608,9 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + 
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), @@ -4628,9 +4647,6 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES, mlx5_ib_dev_res_init, mlx5_ib_dev_res_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, - mlx5_ib_stage_dev_notifier_init, - mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_COUNTERS, mlx5_ib_counters_init, mlx5_ib_counters_cleanup), @@ -4652,6 +4668,9 @@ const struct mlx5_ib_profile raw_eth_profile = { STAGE_CREATE(MLX5_IB_STAGE_IB_REG, mlx5_ib_stage_ib_reg_init, mlx5_ib_stage_ib_reg_cleanup), + STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER, + mlx5_ib_stage_dev_notifier_init, + mlx5_ib_stage_dev_notifier_cleanup), STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), @@ -4771,7 +4790,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev, mutex_lock(&mlx5_ib_multiport_mutex); list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) { - if (dev->sys_image_guid == mpi->sys_image_guid) + if (dev->sys_image_guid == mpi->sys_image_guid && + mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) bound = mlx5_ib_bind_slave_port(dev, mpi); if (bound) { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 23fd72f7f63d..974a45c92fbb 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -521,6 +521,7 @@ struct mlx5_ib_qp { struct mlx5_bf bf; u8 has_rq:1; u8 is_rss:1; + u8 is_ooo_rq:1; /* only for user space QPs. For kernel * we have it from the bf object @@ -668,6 +669,12 @@ struct mlx5_ib_mkey { #define mlx5_update_odp_stats(mr, counter_name, value) \ atomic64_add(value, &((mr)->odp_stats.counter_name)) +#define mlx5_update_odp_stats_with_handled(mr, counter_name, value) \ + do { \ + mlx5_update_odp_stats(mr, counter_name, value); \ + atomic64_add(1, &((mr)->odp_stats.counter_name##_handled)); \ + } while (0) + struct mlx5_ib_mr { struct ib_mr ibmr; struct mlx5_ib_mkey mmkey; @@ -972,7 +979,6 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_QP, MLX5_IB_STAGE_SRQ, MLX5_IB_STAGE_DEVICE_RESOURCES, - MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_ODP, MLX5_IB_STAGE_COUNTERS, MLX5_IB_STAGE_CONG_DEBUGFS, @@ -981,6 +987,7 @@ enum mlx5_ib_stages { MLX5_IB_STAGE_PRE_IB_REG_UMR, MLX5_IB_STAGE_WHITELIST_UID, MLX5_IB_STAGE_IB_REG, + MLX5_IB_STAGE_DEVICE_NOTIFIER, MLX5_IB_STAGE_POST_IB_REG_UMR, MLX5_IB_STAGE_DELAY_DROP, MLX5_IB_STAGE_RESTRACK, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 45d9dc9c6c8f..bb02b6adbf2c 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -2021,6 +2021,11 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); struct mlx5_cache_ent *ent = mr->mmkey.cache_ent; + bool is_odp = is_odp_mr(mr); + int ret = 0; + + if (is_odp) + mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex); if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) { ent = mr->mmkey.cache_ent; @@ -2032,7 +2037,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) ent->tmp_cleanup_scheduled = true; } spin_unlock_irq(&ent->mkeys_queue.lock); - return 0; + goto out; } if (ent) { @@ -2041,7 +2046,15 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) mr->mmkey.cache_ent = NULL; spin_unlock_irq(&ent->mkeys_queue.lock); } - return 
destroy_mkey(dev, mr); + ret = destroy_mkey(dev, mr); +out: + if (is_odp) { + if (!ret) + to_ib_umem_odp(mr->umem)->private = NULL; + mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex); + } + + return ret; } static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4b37446758fd..f1e23583e6c0 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -228,13 +228,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *imr = mr->parent; + /* + * If userspace is racing freeing the parent implicit ODP MR then we can + * loose the race with parent destruction. In this case + * mlx5_ib_free_odp_mr() will free everything in the implicit_children + * xarray so NOP is fine. This child MR cannot be destroyed here because + * we are under its umem_mutex. + */ if (!refcount_inc_not_zero(&imr->mmkey.usecount)) return; - xa_erase(&imr->implicit_children, idx); + xa_lock(&imr->implicit_children); + if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) != + mr) { + xa_unlock(&imr->implicit_children); + return; + } + if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault)) - xa_erase(&mr_to_mdev(mr)->odp_mkeys, - mlx5_base_mkey(mr->mmkey.key)); + __xa_erase(&mr_to_mdev(mr)->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key)); + xa_unlock(&imr->implicit_children); /* Freeing a MR is a sleeping operation, so bounce to a work queue */ INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); @@ -268,6 +282,8 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, if (!umem_odp->npages) goto out; mr = umem_odp->private; + if (!mr) + goto out; start = max_t(u64, ib_umem_start(umem_odp), range->start); end = min_t(u64, ib_umem_end(umem_odp), range->end); @@ -313,7 +329,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); - mlx5_update_odp_stats(mr, invalidations, invalidations); + mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations); /* * We are now sure that the device will not access the @@ -500,18 +516,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, refcount_inc(&ret->mmkey.usecount); goto out_lock; } - xa_unlock(&imr->implicit_children); if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) { - ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), - &mr->mmkey, GFP_KERNEL); + ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key), + &mr->mmkey, GFP_KERNEL); if (xa_is_err(ret)) { ret = ERR_PTR(xa_err(ret)); - xa_erase(&imr->implicit_children, idx); - goto out_mr; + __xa_erase(&imr->implicit_children, idx); + goto out_lock; } mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD; } + xa_unlock(&imr->implicit_children); mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr); return mr; @@ -944,8 +960,7 @@ out: /* * Handle a single data segment in a page-fault WQE or RDMA region. * - * Returns number of OS pages retrieved on success. The caller may continue to - * the next data segment. + * Returns zero on success. The caller may continue to the next data segment. * Can return the following error codes: * -EAGAIN to designate a temporary error. The caller will abort handling the * page fault and resolve it. 
@@ -958,7 +973,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, u32 *bytes_committed, u32 *bytes_mapped) { - int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0; + int ret, i, outlen, cur_outlen = 0, depth = 0, pages_in_range; struct pf_frame *head = NULL, *frame; struct mlx5_ib_mkey *mmkey; struct mlx5_ib_mr *mr; @@ -993,13 +1008,20 @@ next_mr: case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + pages_in_range = (ALIGN(io_virt + bcnt, PAGE_SIZE) - + (io_virt & PAGE_MASK)) >> + PAGE_SHIFT; ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false); if (ret < 0) goto end; - mlx5_update_odp_stats(mr, faults, ret); + mlx5_update_odp_stats_with_handled(mr, faults, ret); + + if (ret < pages_in_range) { + ret = -EFAULT; + goto end; + } - npages += ret; ret = 0; break; @@ -1090,7 +1112,7 @@ end: kfree(out); *bytes_committed = 0; - return ret ? ret : npages; + return ret; } /* @@ -1109,8 +1131,7 @@ end: * the committed bytes). * @receive_queue: receive WQE end of sg list * - * Returns the number of pages loaded if positive, zero for an empty WQE, or a - * negative error code. + * Returns zero for success or a negative error code. */ static int pagefault_data_segments(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault, @@ -1118,7 +1139,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, void *wqe_end, u32 *bytes_mapped, u32 *total_wqe_bytes, bool receive_queue) { - int ret = 0, npages = 0; + int ret = 0; u64 io_virt; __be32 key; u32 byte_count; @@ -1175,10 +1196,9 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, bytes_mapped); if (ret < 0) break; - npages += ret; } - return ret < 0 ? ret : npages; + return ret; } /* @@ -1414,12 +1434,6 @@ resolve_page_fault: free_page((unsigned long)wqe_start); } -static int pages_in_range(u64 address, u32 length) -{ - return (ALIGN(address + length, PAGE_SIZE) - - (address & PAGE_MASK)) >> PAGE_SHIFT; -} - static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault) { @@ -1458,7 +1472,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; - } else if (ret < 0 || pages_in_range(address, length) > ret) { + } else if (ret < 0) { mlx5_ib_page_fault_resume(dev, pfault, 1); if (ret != -ENOENT) mlx5_ib_dbg(dev, "PAGE FAULT error %d. 
QP 0x%llx, type: 0x%x\n", @@ -1529,7 +1543,7 @@ static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev, goto err; } - mlx5_update_odp_stats(mr, faults, ret); + mlx5_update_odp_stats_with_handled(mr, faults, ret); mlx5r_deref_odp_mkey(mmkey); if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 10ce3b44f645..a43eba9d3572 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1960,7 +1960,7 @@ static int atomic_size_to_mode(int size_mask) } static int get_atomic_mode(struct mlx5_ib_dev *dev, - enum ib_qp_type qp_type) + struct mlx5_ib_qp *qp) { u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); @@ -1970,7 +1970,7 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, if (!atomic) return -EOPNOTSUPP; - if (qp_type == MLX5_IB_QPT_DCT) + if (qp->type == MLX5_IB_QPT_DCT) atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); else atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); @@ -1984,6 +1984,10 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev, atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; + /* OOO DP QPs do not support larger than 8-Bytes atomic operations */ + if (atomic_mode > MLX5_ATOMIC_MODE_8B && qp->is_ooo_rq) + atomic_mode = MLX5_ATOMIC_MODE_8B; + return atomic_mode; } @@ -2839,6 +2843,29 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd, return 0; } +static bool get_dp_ooo_cap(struct mlx5_core_dev *mdev, enum ib_qp_type qp_type) +{ + if (!MLX5_CAP_GEN_2(mdev, dp_ordering_force)) + return false; + + switch (qp_type) { + case IB_QPT_RC: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc); + case IB_QPT_XRC_INI: + case IB_QPT_XRC_TGT: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc); + case IB_QPT_UC: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc); + case IB_QPT_UD: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud); + case MLX5_IB_QPT_DCI: + case MLX5_IB_QPT_DCT: + return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc); + default: + return false; + } +} + static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag, bool cond, struct mlx5_ib_qp *qp) { @@ -3365,7 +3392,7 @@ static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp, if (access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; - atomic_mode = get_atomic_mode(dev, qp->type); + atomic_mode = get_atomic_mode(dev, qp); if (atomic_mode < 0) return -EOPNOTSUPP; @@ -4316,6 +4343,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1) MLX5_SET(qpc, qpc, deth_sqpn, 1); + if (qp->is_ooo_rq && cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) { + MLX5_SET(qpc, qpc, dp_ordering_1, 1); + MLX5_SET(qpc, qpc, dp_ordering_force, 1); + } + mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); @@ -4531,7 +4563,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { int atomic_mode; - atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT); + atomic_mode = get_atomic_mode(dev, qp); if (atomic_mode < 0) return -EOPNOTSUPP; @@ -4573,6 +4605,10 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit); if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE) MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7); + if 
(qp->is_ooo_rq) { + MLX5_SET(dctc, dctc, dp_ordering_1, 1); + MLX5_SET(dctc, dctc, dp_ordering_force, 1); + } err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in, MLX5_ST_SZ_BYTES(create_dct_in), out, @@ -4676,11 +4712,16 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, min(udata->inlen, sizeof(ucmd)))) return -EFAULT; - if (ucmd.comp_mask || + if (ucmd.comp_mask & ~MLX5_IB_MODIFY_QP_OOO_DP || memchr_inv(&ucmd.burst_info.reserved, 0, sizeof(ucmd.burst_info.reserved))) return -EOPNOTSUPP; + if (ucmd.comp_mask & MLX5_IB_MODIFY_QP_OOO_DP) { + if (!get_dp_ooo_cap(dev->mdev, qp->type)) + return -EOPNOTSUPP; + qp->is_ooo_rq = 1; + } } if (qp->type == IB_QPT_GSI) diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c index affcf8fe943c..67841922c7b8 100644 --- a/drivers/infiniband/hw/mlx5/restrack.c +++ b/drivers/infiniband/hw/mlx5/restrack.c @@ -96,9 +96,18 @@ static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr) atomic64_read(&mr->odp_stats.faults))) goto err_table; if (rdma_nl_stat_hwcounter_entry( + msg, "page_faults_handled", + atomic64_read(&mr->odp_stats.faults_handled))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry( msg, "page_invalidations", atomic64_read(&mr->odp_stats.invalidations))) goto err_table; + if (rdma_nl_stat_hwcounter_entry( + msg, "page_invalidations_handled", + atomic64_read(&mr->odp_stats.invalidations_handled))) + goto err_table; + if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch", atomic64_read(&mr->odp_stats.prefetch))) goto err_table; |
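
Note on the devx.c change above: mlx5_ib_ufile_hw_cleanup() bounds the number of in-flight asynchronous destroy commands with a head/tail window over a fixed-size array of MAX_ASYNC_CMDS entries. The user-space C sketch below illustrates only that windowing pattern; submit_async_destroy()/wait_async_destroy() and the async_cmd struct are hypothetical stand-ins, not mlx5 APIs, and this is not the driver code itself.

/*
 * Minimal user-space sketch (not driver code) of the bounded "head/tail
 * window" used by mlx5_ib_ufile_hw_cleanup(): keep at most MAX_ASYNC_CMDS
 * destroy commands in flight, retire the oldest when the window fills up,
 * and drain the remainder at the end.  The submit/wait helpers are
 * hypothetical stand-ins, not mlx5 APIs.
 */
#include <stdio.h>

#define MAX_ASYNC_CMDS 8

struct async_cmd {
	int id;		/* stand-in for the uobject/inbox a real command carries */
};

static void submit_async_destroy(struct async_cmd *cmd, int id)
{
	cmd->id = id;
	printf("submit destroy #%d\n", id);	/* fire-and-forget submission */
}

static void wait_async_destroy(struct async_cmd *cmd)
{
	printf("retire destroy #%d\n", cmd->id);	/* block until completion */
}

int main(void)
{
	struct async_cmd window[MAX_ASYNC_CMDS];
	int head = 0, tail = 0;
	int nobjs = 20;		/* pretend the ufile holds 20 destroyable objects */
	int i;

	for (i = 0; i < nobjs; i++) {
		submit_async_destroy(&window[tail % MAX_ASYNC_CMDS], i);
		tail++;

		/* window full: wait for the oldest command before issuing more */
		if (tail - head == MAX_ASYNC_CMDS) {
			wait_async_destroy(&window[head % MAX_ASYNC_CMDS]);
			head++;
		}
	}

	/* drain whatever is still outstanding */
	while (head != tail) {
		wait_async_destroy(&window[head % MAX_ASYNC_CMDS]);
		head++;
	}

	return 0;
}

Bounding the window keeps the cleanup path's memory footprint constant regardless of how many objects the ufile holds, while still overlapping up to eight destroy commands at a time.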