Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.c      |  93
-rw-r--r--  drivers/infiniband/hw/mlx5/devx.h      |   4
-rw-r--r--  drivers/infiniband/hw/mlx5/fs.c        |  37
-rw-r--r--  drivers/infiniband/hw/mlx5/mad.c       |   8
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c      |  90
-rw-r--r--  drivers/infiniband/hw/mlx5/mlx5_ib.h   |   9
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c        |  17
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c       |  70
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c        |  51
-rw-r--r--  drivers/infiniband/hw/mlx5/restrack.c  |   9
10 files changed, 304 insertions(+), 84 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 69999d8d24f3..4186884c66e1 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -27,6 +27,19 @@ enum devx_obj_flags {
DEVX_OBJ_FLAGS_INDIRECT_MKEY = 1 << 0,
DEVX_OBJ_FLAGS_DCT = 1 << 1,
DEVX_OBJ_FLAGS_CQ = 1 << 2,
+ DEVX_OBJ_FLAGS_HW_FREED = 1 << 3,
+};
+
+#define MAX_ASYNC_CMDS 8
+
+struct mlx5_async_cmd {
+ struct ib_uobject *uobject;
+ void *in;
+ int in_size;
+ u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
+ int err;
+ struct mlx5_async_work cb_work;
+ struct completion comp;
};
struct devx_async_data {
@@ -1405,7 +1418,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
*/
mlx5r_deref_wait_odp_mkey(&obj->mkey);
- if (obj->flags & DEVX_OBJ_FLAGS_DCT)
+ if (obj->flags & DEVX_OBJ_FLAGS_HW_FREED)
+ ret = 0;
+ else if (obj->flags & DEVX_OBJ_FLAGS_DCT)
ret = mlx5_core_destroy_dct(obj->ib_dev, &obj->core_dct);
else if (obj->flags & DEVX_OBJ_FLAGS_CQ)
ret = mlx5_core_destroy_cq(obj->ib_dev->mdev, &obj->core_cq);
@@ -2595,6 +2610,82 @@ void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev)
}
}
+static void devx_async_destroy_cb(int status, struct mlx5_async_work *context)
+{
+ struct mlx5_async_cmd *devx_out = container_of(context,
+ struct mlx5_async_cmd, cb_work);
+ struct devx_obj *obj = devx_out->uobject->object;
+
+ if (!status)
+ obj->flags |= DEVX_OBJ_FLAGS_HW_FREED;
+
+ complete(&devx_out->comp);
+}
+
+static void devx_async_destroy(struct mlx5_ib_dev *dev,
+ struct mlx5_async_cmd *cmd)
+{
+ init_completion(&cmd->comp);
+ cmd->err = mlx5_cmd_exec_cb(&dev->async_ctx, cmd->in, cmd->in_size,
+ &cmd->out, sizeof(cmd->out),
+ devx_async_destroy_cb, &cmd->cb_work);
+}
+
+static void devx_wait_async_destroy(struct mlx5_async_cmd *cmd)
+{
+ if (!cmd->err)
+ wait_for_completion(&cmd->comp);
+ atomic_set(&cmd->uobject->usecnt, 0);
+}
+
+void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile)
+{
+ struct mlx5_async_cmd async_cmd[MAX_ASYNC_CMDS];
+ struct ib_ucontext *ucontext = ufile->ucontext;
+ struct ib_device *device = ucontext->device;
+ struct mlx5_ib_dev *dev = to_mdev(device);
+ struct ib_uobject *uobject;
+ struct devx_obj *obj;
+ int head = 0;
+ int tail = 0;
+
+ list_for_each_entry(uobject, &ufile->uobjects, list) {
+ WARN_ON(uverbs_try_lock_object(uobject, UVERBS_LOOKUP_WRITE));
+
+ /*
+ * Currently we only support QP destruction; if other object types
+ * are to be destroyed, type synchronization must be added to the
+ * cleanup algorithm and pre/post FW cleanup handled for the new
+ * types as needed.
+ */
+ if (uobj_get_object_id(uobject) != MLX5_IB_OBJECT_DEVX_OBJ ||
+ (get_dec_obj_type(uobject->object, MLX5_EVENT_TYPE_MAX) !=
+ MLX5_OBJ_TYPE_QP)) {
+ atomic_set(&uobject->usecnt, 0);
+ continue;
+ }
+
+ obj = uobject->object;
+
+ async_cmd[tail % MAX_ASYNC_CMDS].in = obj->dinbox;
+ async_cmd[tail % MAX_ASYNC_CMDS].in_size = obj->dinlen;
+ async_cmd[tail % MAX_ASYNC_CMDS].uobject = uobject;
+
+ devx_async_destroy(dev, &async_cmd[tail % MAX_ASYNC_CMDS]);
+ tail++;
+
+ if (tail - head == MAX_ASYNC_CMDS) {
+ devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]);
+ head++;
+ }
+ }
+
+ while (head != tail) {
+ devx_wait_async_destroy(&async_cmd[head % MAX_ASYNC_CMDS]);
+ head++;
+ }
+}
+
static ssize_t devx_async_cmd_event_read(struct file *filp, char __user *buf,
size_t count, loff_t *pos)
{
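
The new mlx5_ib_ufile_hw_cleanup() pipelines the firmware destroy commands: async_cmd[] is used as a ring of MAX_ASYNC_CMDS slots, with tail indexing the next command to submit and head the oldest one not yet reaped, so at most eight destroys are in flight at any time. A minimal standalone sketch of that sliding-window pattern follows; submit_async() and struct slot are placeholders rather than driver APIs, and the handling of failed submissions (the cmd->err check above) is elided.

    #include <linux/completion.h>

    #define WINDOW 8

    struct slot {
            struct completion comp;
            int err;
    };

    /* Placeholder: kicks off one async command whose callback calls complete(). */
    void submit_async(struct slot *s);

    static void submit_all(struct slot *ring, int nr_cmds)
    {
            int head = 0, tail = 0;

            while (tail < nr_cmds) {
                    init_completion(&ring[tail % WINDOW].comp);
                    submit_async(&ring[tail % WINDOW]);
                    tail++;

                    /* Cap the number of outstanding commands at WINDOW. */
                    if (tail - head == WINDOW)
                            wait_for_completion(&ring[head++ % WINDOW].comp);
            }

            /* Drain whatever is still in flight. */
            while (head != tail)
                    wait_for_completion(&ring[head++ % WINDOW].comp);
    }
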
diff --git a/drivers/infiniband/hw/mlx5/devx.h b/drivers/infiniband/hw/mlx5/devx.h
index ee2213275fd6..1344bf4c9d21 100644
--- a/drivers/infiniband/hw/mlx5/devx.h
+++ b/drivers/infiniband/hw/mlx5/devx.h
@@ -28,6 +28,7 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user);
void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid);
int mlx5_ib_devx_init(struct mlx5_ib_dev *dev);
void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev);
+void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile);
#else
static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
{
@@ -41,5 +42,8 @@ static inline int mlx5_ib_devx_init(struct mlx5_ib_dev *dev)
static inline void mlx5_ib_devx_cleanup(struct mlx5_ib_dev *dev)
{
}
+static inline void mlx5_ib_ufile_hw_cleanup(struct ib_uverbs_file *ufile)
+{
+}
#endif
#endif /* _MLX5_IB_DEVX_H */
diff --git a/drivers/infiniband/hw/mlx5/fs.c b/drivers/infiniband/hw/mlx5/fs.c
index 520034acf73a..162814ae8cb4 100644
--- a/drivers/infiniband/hw/mlx5/fs.c
+++ b/drivers/infiniband/hw/mlx5/fs.c
@@ -943,7 +943,7 @@ int mlx5_ib_fs_add_op_fc(struct mlx5_ib_dev *dev, u32 port_num,
}
dst.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dst.counter_id = mlx5_fc_id(opfc->fc);
+ dst.counter = opfc->fc;
flow_act.action =
MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_ALLOW;
@@ -1113,8 +1113,8 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
handler->ibcounters = flow_act.counters;
dest_arr[dest_num].type =
MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dest_arr[dest_num].counter_id =
- mlx5_fc_id(mcounters->hw_cntrs_hndl);
+ dest_arr[dest_num].counter =
+ mcounters->hw_cntrs_hndl;
dest_num++;
}
@@ -1603,7 +1603,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher,
static struct mlx5_ib_flow_handler *raw_fs_rule_add(
struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher,
struct mlx5_flow_context *flow_context, struct mlx5_flow_act *flow_act,
- u32 counter_id, void *cmd_in, int inlen, int dest_id, int dest_type)
+ struct mlx5_fc *counter, void *cmd_in, int inlen, int dest_id, int dest_type)
{
struct mlx5_flow_destination *dst;
struct mlx5_ib_flow_prio *ft_prio;
@@ -1652,8 +1652,12 @@ static struct mlx5_ib_flow_handler *raw_fs_rule_add(
}
if (flow_act->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) {
+ if (WARN_ON(!counter)) {
+ err = -EINVAL;
+ goto unlock;
+ }
dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
- dst[dst_num].counter_id = counter_id;
+ dst[dst_num].counter = counter;
dst_num++;
}
@@ -1878,7 +1882,8 @@ static int get_dests(struct uverbs_attr_bundle *attrs,
return 0;
}
-static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id)
+static bool
+is_flow_counter(void *obj, u32 offset, u32 *counter_id, u32 *fc_bulk_size)
{
struct devx_obj *devx_obj = obj;
u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, devx_obj->dinbox, opcode);
@@ -1888,6 +1893,7 @@ static bool is_flow_counter(void *obj, u32 offset, u32 *counter_id)
if (offset && offset >= devx_obj->flow_counter_bulk_size)
return false;
+ *fc_bulk_size = devx_obj->flow_counter_bulk_size;
*counter_id = MLX5_GET(dealloc_flow_counter_in,
devx_obj->dinbox,
flow_counter_id);
@@ -1904,13 +1910,13 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
{
struct mlx5_flow_context flow_context = {.flow_tag =
MLX5_FS_DEFAULT_FLOW_TAG};
- u32 *offset_attr, offset = 0, counter_id = 0;
int dest_id, dest_type = -1, inlen, len, ret, i;
struct mlx5_ib_flow_handler *flow_handler;
struct mlx5_ib_flow_matcher *fs_matcher;
struct ib_uobject **arr_flow_actions;
struct ib_uflow_resources *uflow_res;
struct mlx5_flow_act flow_act = {};
+ struct mlx5_fc *counter = NULL;
struct ib_qp *qp = NULL;
void *devx_obj, *cmd_in;
struct ib_uobject *uobj;
@@ -1937,6 +1943,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
len = uverbs_attr_get_uobjs_arr(attrs,
MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, &arr_flow_actions);
if (len) {
+ u32 *offset_attr, fc_bulk_size, offset = 0, counter_id = 0;
devx_obj = arr_flow_actions[0]->object;
if (uverbs_attr_is_valid(attrs,
@@ -1956,8 +1963,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
offset = *offset_attr;
}
- if (!is_flow_counter(devx_obj, offset, &counter_id))
+ if (!is_flow_counter(devx_obj, offset, &counter_id, &fc_bulk_size))
return -EINVAL;
+ counter = mlx5_fc_local_create(counter_id, offset, fc_bulk_size);
+ if (IS_ERR(counter))
+ return PTR_ERR(counter);
flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
}
@@ -1968,8 +1978,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE);
uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS);
- if (!uflow_res)
- return -ENOMEM;
+ if (!uflow_res) {
+ ret = -ENOMEM;
+ goto destroy_counter;
+ }
len = uverbs_attr_get_uobjs_arr(attrs,
MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions);
@@ -1996,7 +2008,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
flow_handler =
raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act,
- counter_id, cmd_in, inlen, dest_id, dest_type);
+ counter, cmd_in, inlen, dest_id, dest_type);
if (IS_ERR(flow_handler)) {
ret = PTR_ERR(flow_handler);
goto err_out;
@@ -2007,6 +2019,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
return 0;
err_out:
ib_uverbs_flow_resources_free(uflow_res);
+destroy_counter:
+ if (counter)
+ mlx5_fc_local_destroy(counter);
return ret;
}
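
With this series a flow destination carries a struct mlx5_fc pointer instead of a bare counter id, so the CREATE_FLOW handler now owns a local counter object that must be released on every failure path after creation. Reduced to its essentials (a sketch built from the calls added above, with the surrounding attribute parsing elided):

    counter = mlx5_fc_local_create(counter_id, offset, fc_bulk_size);
    if (IS_ERR(counter))
            return PTR_ERR(counter);

    dst[dst_num].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
    dst[dst_num].counter = counter;         /* was: dst[dst_num].counter_id */

    flow_handler = raw_fs_rule_add(dev, fs_matcher, &flow_context, &flow_act,
                                   counter, cmd_in, inlen, dest_id, dest_type);
    if (IS_ERR(flow_handler)) {
            mlx5_fc_local_destroy(counter); /* undo on any failure after create */
            return PTR_ERR(flow_handler);
    }
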
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
index 1b6c5e37d169..2453ae4384a7 100644
--- a/drivers/infiniband/hw/mlx5/mad.c
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -278,7 +278,13 @@ static int process_pma_cmd(struct mlx5_ib_dev *dev, u32 port_num,
goto done;
}
- err = query_ib_ppcnt(mdev, mdev_port_num, 0, out_cnt, sz, 0);
+ if (dev->ib_dev.type == RDMA_DEVICE_TYPE_SMI)
+ err = query_ib_ppcnt(mdev, mdev_port_num, port_num,
+ out_cnt, sz, 0);
+ else
+ err = query_ib_ppcnt(mdev, mdev_port_num, 0,
+ out_cnt, sz, 0);
+
if (!err)
pma_cnt_assign(pma_cnt, out_cnt);
}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 4999239c8f41..81849eb671a1 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -242,6 +242,10 @@ static int mlx5_netdev_event(struct notifier_block *this,
case NETDEV_DOWN: {
struct net_device *upper = NULL;
+ if (!netif_is_lag_master(ndev) && !netif_is_lag_port(ndev) &&
+ !mlx5_core_mp_enabled(mdev))
+ return NOTIFY_DONE;
+
if (mlx5_lag_is_roce(mdev) || mlx5_lag_is_sriov(mdev)) {
struct net_device *lag_ndev;
@@ -1182,6 +1186,14 @@ static int mlx5_ib_query_device(struct ib_device *ibdev,
MLX5_IB_QUERY_DEV_RESP_PACKET_BASED_CREDIT_MODE;
resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_SCAT2CQE_DCT;
+
+ if (MLX5_CAP_GEN_2(mdev, dp_ordering_force) &&
+ (MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc) ||
+ MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc) ||
+ MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc) ||
+ MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud) ||
+ MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc)))
+ resp.flags |= MLX5_IB_QUERY_DEV_RESP_FLAGS_OOO_DP;
}
if (offsetofend(typeof(resp), sw_parsing_caps) <= uhw_outlen) {
@@ -2831,7 +2843,7 @@ static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane)
int err;
*num_plane = 0;
- if (!MLX5_CAP_GEN(mdev, ib_virt))
+ if (!MLX5_CAP_GEN(mdev, ib_virt) || !MLX5_CAP_GEN_2(mdev, multiplane))
return 0;
err = mlx5_query_hca_vport_context(mdev, 0, 1, 0, &vport_ctx);
@@ -2997,7 +3009,6 @@ unlock:
static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
{
struct mlx5_ib_resources *devr = &dev->devr;
- int port;
int ret;
if (!MLX5_CAP_GEN(dev->mdev, xrc))
@@ -3013,10 +3024,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
return ret;
}
- for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
- INIT_WORK(&devr->ports[port].pkey_change_work,
- pkey_change_handler);
-
mutex_init(&devr->cq_lock);
mutex_init(&devr->srq_lock);
@@ -3026,16 +3033,6 @@ static int mlx5_ib_dev_res_init(struct mlx5_ib_dev *dev)
static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev)
{
struct mlx5_ib_resources *devr = &dev->devr;
- int port;
-
- /*
- * Make sure no change P_Key work items are still executing.
- *
- * At this stage, the mlx5_ib_event should be unregistered
- * and it ensures that no new works are added.
- */
- for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
- cancel_work_sync(&devr->ports[port].pkey_change_work);
/* After s0/s1 init, they are not unset during the device lifetime. */
if (devr->s1) {
@@ -3211,12 +3208,14 @@ static int lag_event(struct notifier_block *nb, unsigned long event, void *data)
struct mlx5_ib_dev *dev = container_of(nb, struct mlx5_ib_dev,
lag_events);
struct mlx5_core_dev *mdev = dev->mdev;
+ struct ib_device *ibdev = &dev->ib_dev;
+ struct net_device *old_ndev = NULL;
struct mlx5_ib_port *port;
struct net_device *ndev;
- int i, err;
- int portnum;
+ u32 portnum = 0;
+ int ret = 0;
+ int i;
- portnum = 0;
switch (event) {
case MLX5_DRIVER_EVENT_ACTIVE_BACKUP_LAG_CHANGE_LOWERSTATE:
ndev = data;
@@ -3232,19 +3231,24 @@ static int lag_event(struct notifier_block *nb, unsigned long event, void *data)
}
}
}
- err = ib_device_set_netdev(&dev->ib_dev, ndev,
- portnum + 1);
- dev_put(ndev);
- if (err)
- return err;
- /* Rescan gids after new netdev assignment */
- rdma_roce_rescan_device(&dev->ib_dev);
+ old_ndev = ib_device_get_netdev(ibdev, portnum + 1);
+ ret = ib_device_set_netdev(ibdev, ndev, portnum + 1);
+ if (ret)
+ goto out;
+
+ if (old_ndev)
+ roce_del_all_netdev_gids(ibdev, portnum + 1,
+ old_ndev);
+ rdma_roce_rescan_port(ibdev, portnum + 1);
}
break;
default:
return NOTIFY_DONE;
}
- return NOTIFY_OK;
+
+out:
+ dev_put(old_ndev);
+ return notifier_from_errno(ret);
}
static void mlx5e_lag_event_register(struct mlx5_ib_dev *dev)
@@ -3639,7 +3643,8 @@ static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev)
list_for_each_entry(mpi, &mlx5_ib_unaffiliated_port_list,
list) {
if (dev->sys_image_guid == mpi->sys_image_guid &&
- (mlx5_core_native_port_num(mpi->mdev) - 1) == i) {
+ (mlx5_core_native_port_num(mpi->mdev) - 1) == i &&
+ mlx5_core_same_coredev_type(dev->mdev, mpi->mdev)) {
bound = mlx5_ib_bind_slave_port(dev, mpi);
}
@@ -4134,6 +4139,7 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.req_notify_cq = mlx5_ib_arm_cq,
.rereg_user_mr = mlx5_ib_rereg_user_mr,
.resize_cq = mlx5_ib_resize_cq,
+ .ufile_hw_cleanup = mlx5_ib_ufile_hw_cleanup,
INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
INIT_RDMA_OBJ_SIZE(ib_counters, mlx5_ib_mcounters, ibcntrs),
@@ -4464,6 +4470,13 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev)
static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
{
+ struct mlx5_ib_resources *devr = &dev->devr;
+ int port;
+
+ for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
+ INIT_WORK(&devr->ports[port].pkey_change_work,
+ pkey_change_handler);
+
dev->mdev_events.notifier_call = mlx5_ib_event;
mlx5_notifier_register(dev->mdev, &dev->mdev_events);
@@ -4474,8 +4487,14 @@ static int mlx5_ib_stage_dev_notifier_init(struct mlx5_ib_dev *dev)
static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev)
{
+ struct mlx5_ib_resources *devr = &dev->devr;
+ int port;
+
mlx5r_macsec_event_unregister(dev);
mlx5_notifier_unregister(dev->mdev, &dev->mdev_events);
+
+ for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
+ cancel_work_sync(&devr->ports[port].pkey_change_work);
}
void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev,
@@ -4565,9 +4584,6 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_dev_res_init,
mlx5_ib_dev_res_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
- mlx5_ib_stage_dev_notifier_init,
- mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_ODP,
mlx5_ib_odp_init_one,
mlx5_ib_odp_cleanup_one),
@@ -4592,6 +4608,9 @@ static const struct mlx5_ib_profile pf_profile = {
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
+ mlx5_ib_stage_dev_notifier_init,
+ mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
mlx5_ib_stage_post_ib_reg_umr_init,
NULL),
@@ -4628,9 +4647,6 @@ const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
mlx5_ib_dev_res_init,
mlx5_ib_dev_res_cleanup),
- STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
- mlx5_ib_stage_dev_notifier_init,
- mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
mlx5_ib_counters_init,
mlx5_ib_counters_cleanup),
@@ -4652,6 +4668,9 @@ const struct mlx5_ib_profile raw_eth_profile = {
STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
mlx5_ib_stage_ib_reg_init,
mlx5_ib_stage_ib_reg_cleanup),
+ STAGE_CREATE(MLX5_IB_STAGE_DEVICE_NOTIFIER,
+ mlx5_ib_stage_dev_notifier_init,
+ mlx5_ib_stage_dev_notifier_cleanup),
STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
mlx5_ib_stage_post_ib_reg_umr_init,
NULL),
@@ -4771,7 +4790,8 @@ static int mlx5r_mp_probe(struct auxiliary_device *adev,
mutex_lock(&mlx5_ib_multiport_mutex);
list_for_each_entry(dev, &mlx5_ib_dev_list, ib_dev_list) {
- if (dev->sys_image_guid == mpi->sys_image_guid)
+ if (dev->sys_image_guid == mpi->sys_image_guid &&
+ mlx5_core_same_coredev_type(dev->mdev, mpi->mdev))
bound = mlx5_ib_bind_slave_port(dev, mpi);
if (bound) {
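
The reworked lag_event() no longer just assigns the new lower netdev and rescans the whole device: it looks up the previous netdev, drops the GIDs derived from it, rescans only the affected port, and converts any error into a notifier return value. Condensed from the hunk above (error handling abbreviated):

    old_ndev = ib_device_get_netdev(ibdev, portnum + 1);    /* takes a reference */
    ret = ib_device_set_netdev(ibdev, ndev, portnum + 1);
    if (!ret) {
            if (old_ndev)
                    roce_del_all_netdev_gids(ibdev, portnum + 1, old_ndev);
            rdma_roce_rescan_port(ibdev, portnum + 1);
    }
    dev_put(old_ndev);                      /* dev_put(NULL) is a no-op */
    return notifier_from_errno(ret);        /* 0 -> NOTIFY_OK, <0 -> encoded error */
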
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 23fd72f7f63d..974a45c92fbb 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -521,6 +521,7 @@ struct mlx5_ib_qp {
struct mlx5_bf bf;
u8 has_rq:1;
u8 is_rss:1;
+ u8 is_ooo_rq:1;
/* only for user space QPs. For kernel
* we have it from the bf object
@@ -668,6 +669,12 @@ struct mlx5_ib_mkey {
#define mlx5_update_odp_stats(mr, counter_name, value) \
atomic64_add(value, &((mr)->odp_stats.counter_name))
+#define mlx5_update_odp_stats_with_handled(mr, counter_name, value) \
+ do { \
+ mlx5_update_odp_stats(mr, counter_name, value); \
+ atomic64_add(1, &((mr)->odp_stats.counter_name##_handled)); \
+ } while (0)
+
struct mlx5_ib_mr {
struct ib_mr ibmr;
struct mlx5_ib_mkey mmkey;
@@ -972,7 +979,6 @@ enum mlx5_ib_stages {
MLX5_IB_STAGE_QP,
MLX5_IB_STAGE_SRQ,
MLX5_IB_STAGE_DEVICE_RESOURCES,
- MLX5_IB_STAGE_DEVICE_NOTIFIER,
MLX5_IB_STAGE_ODP,
MLX5_IB_STAGE_COUNTERS,
MLX5_IB_STAGE_CONG_DEBUGFS,
@@ -981,6 +987,7 @@ enum mlx5_ib_stages {
MLX5_IB_STAGE_PRE_IB_REG_UMR,
MLX5_IB_STAGE_WHITELIST_UID,
MLX5_IB_STAGE_IB_REG,
+ MLX5_IB_STAGE_DEVICE_NOTIFIER,
MLX5_IB_STAGE_POST_IB_REG_UMR,
MLX5_IB_STAGE_DELAY_DROP,
MLX5_IB_STAGE_RESTRACK,
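
mlx5_update_odp_stats_with_handled() bumps the existing per-MR counter by value and a parallel "<name>_handled" counter by one event; both fields must exist in the MR's odp_stats for the token pasting to compile, and the handled variants are what restrack.c exports below. For illustration (npages is a stand-in for the value passed by the fault handlers), the expansion for counter_name == faults is:

    /* mlx5_update_odp_stats_with_handled(mr, faults, npages); expands to: */
    atomic64_add(npages, &mr->odp_stats.faults);
    atomic64_add(1, &mr->odp_stats.faults_handled);
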
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 45d9dc9c6c8f..bb02b6adbf2c 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -2021,6 +2021,11 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;
+ bool is_odp = is_odp_mr(mr);
+ int ret = 0;
+
+ if (is_odp)
+ mutex_lock(&to_ib_umem_odp(mr->umem)->umem_mutex);
if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
ent = mr->mmkey.cache_ent;
@@ -2032,7 +2037,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
ent->tmp_cleanup_scheduled = true;
}
spin_unlock_irq(&ent->mkeys_queue.lock);
- return 0;
+ goto out;
}
if (ent) {
@@ -2041,7 +2046,15 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
mr->mmkey.cache_ent = NULL;
spin_unlock_irq(&ent->mkeys_queue.lock);
}
- return destroy_mkey(dev, mr);
+ ret = destroy_mkey(dev, mr);
+out:
+ if (is_odp) {
+ if (!ret)
+ to_ib_umem_odp(mr->umem)->private = NULL;
+ mutex_unlock(&to_ib_umem_odp(mr->umem)->umem_mutex);
+ }
+
+ return ret;
}
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
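
Taking the ODP umem_mutex across the revoke and clearing umem_odp->private only when the mkey was actually destroyed pairs with the new NULL check in mlx5_ib_invalidate_range() (odp.c below): an invalidation racing with deregistration serializes on the same mutex and then bails out instead of touching a freed MR. A sketch of the intended ordering (not the full call chains):

    mlx5_revoke_mr()                        mlx5_ib_invalidate_range()
    ----------------                        --------------------------
    mutex_lock(&odp->umem_mutex);
    destroy_mkey(dev, mr);                  mutex_lock(&odp->umem_mutex);   <- blocks
    odp->private = NULL;
    mutex_unlock(&odp->umem_mutex);
                                            mr = umem_odp->private;         <- now NULL
                                            if (!mr)
                                                    goto out;               <- nothing to zap
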
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 4b37446758fd..f1e23583e6c0 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -228,13 +228,27 @@ static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
struct mlx5_ib_mr *imr = mr->parent;
+ /*
+ * If userspace is racing freeing the parent implicit ODP MR then we can
+ * lose the race with parent destruction. In this case
+ * mlx5_ib_free_odp_mr() will free everything in the implicit_children
+ * xarray so NOP is fine. This child MR cannot be destroyed here because
+ * we are under its umem_mutex.
+ */
if (!refcount_inc_not_zero(&imr->mmkey.usecount))
return;
- xa_erase(&imr->implicit_children, idx);
+ xa_lock(&imr->implicit_children);
+ if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_KERNEL) !=
+ mr) {
+ xa_unlock(&imr->implicit_children);
+ return;
+ }
+
if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
- xa_erase(&mr_to_mdev(mr)->odp_mkeys,
- mlx5_base_mkey(mr->mmkey.key));
+ __xa_erase(&mr_to_mdev(mr)->odp_mkeys,
+ mlx5_base_mkey(mr->mmkey.key));
+ xa_unlock(&imr->implicit_children);
/* Freeing a MR is a sleeping operation, so bounce to a work queue */
INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
@@ -268,6 +282,8 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
if (!umem_odp->npages)
goto out;
mr = umem_odp->private;
+ if (!mr)
+ goto out;
start = max_t(u64, ib_umem_start(umem_odp), range->start);
end = min_t(u64, ib_umem_end(umem_odp), range->end);
@@ -313,7 +329,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
MLX5_IB_UPD_XLT_ZAP |
MLX5_IB_UPD_XLT_ATOMIC);
- mlx5_update_odp_stats(mr, invalidations, invalidations);
+ mlx5_update_odp_stats_with_handled(mr, invalidations, invalidations);
/*
* We are now sure that the device will not access the
@@ -500,18 +516,18 @@ static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
refcount_inc(&ret->mmkey.usecount);
goto out_lock;
}
- xa_unlock(&imr->implicit_children);
if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
- ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
- &mr->mmkey, GFP_KERNEL);
+ ret = __xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
+ &mr->mmkey, GFP_KERNEL);
if (xa_is_err(ret)) {
ret = ERR_PTR(xa_err(ret));
- xa_erase(&imr->implicit_children, idx);
- goto out_mr;
+ __xa_erase(&imr->implicit_children, idx);
+ goto out_lock;
}
mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
}
+ xa_unlock(&imr->implicit_children);
mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
return mr;
@@ -944,8 +960,7 @@ out:
/*
* Handle a single data segment in a page-fault WQE or RDMA region.
*
- * Returns number of OS pages retrieved on success. The caller may continue to
- * the next data segment.
+ * Returns zero on success. The caller may continue to the next data segment.
* Can return the following error codes:
* -EAGAIN to designate a temporary error. The caller will abort handling the
* page fault and resolve it.
@@ -958,7 +973,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
u32 *bytes_committed,
u32 *bytes_mapped)
{
- int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
+ int ret, i, outlen, cur_outlen = 0, depth = 0, pages_in_range;
struct pf_frame *head = NULL, *frame;
struct mlx5_ib_mkey *mmkey;
struct mlx5_ib_mr *mr;
@@ -993,13 +1008,20 @@ next_mr:
case MLX5_MKEY_MR:
mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
+ pages_in_range = (ALIGN(io_virt + bcnt, PAGE_SIZE) -
+ (io_virt & PAGE_MASK)) >>
+ PAGE_SHIFT;
ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false);
if (ret < 0)
goto end;
- mlx5_update_odp_stats(mr, faults, ret);
+ mlx5_update_odp_stats_with_handled(mr, faults, ret);
+
+ if (ret < pages_in_range) {
+ ret = -EFAULT;
+ goto end;
+ }
- npages += ret;
ret = 0;
break;
@@ -1090,7 +1112,7 @@ end:
kfree(out);
*bytes_committed = 0;
- return ret ? ret : npages;
+ return ret;
}
/*
@@ -1109,8 +1131,7 @@ end:
* the committed bytes).
* @receive_queue: receive WQE end of sg list
*
- * Returns the number of pages loaded if positive, zero for an empty WQE, or a
- * negative error code.
+ * Returns zero for success or a negative error code.
*/
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
struct mlx5_pagefault *pfault,
@@ -1118,7 +1139,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
void *wqe_end, u32 *bytes_mapped,
u32 *total_wqe_bytes, bool receive_queue)
{
- int ret = 0, npages = 0;
+ int ret = 0;
u64 io_virt;
__be32 key;
u32 byte_count;
@@ -1175,10 +1196,9 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
bytes_mapped);
if (ret < 0)
break;
- npages += ret;
}
- return ret < 0 ? ret : npages;
+ return ret;
}
/*
@@ -1414,12 +1434,6 @@ resolve_page_fault:
free_page((unsigned long)wqe_start);
}
-static int pages_in_range(u64 address, u32 length)
-{
- return (ALIGN(address + length, PAGE_SIZE) -
- (address & PAGE_MASK)) >> PAGE_SHIFT;
-}
-
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
struct mlx5_pagefault *pfault)
{
@@ -1458,7 +1472,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
if (ret == -EAGAIN) {
/* We're racing with an invalidation, don't prefetch */
prefetch_activated = 0;
- } else if (ret < 0 || pages_in_range(address, length) > ret) {
+ } else if (ret < 0) {
mlx5_ib_page_fault_resume(dev, pfault, 1);
if (ret != -ENOENT)
mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n",
@@ -1529,7 +1543,7 @@ static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev,
goto err;
}
- mlx5_update_odp_stats(mr, faults, ret);
+ mlx5_update_odp_stats_with_handled(mr, faults, ret);
mlx5r_deref_odp_mkey(mmkey);
if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST)
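
pagefault_mr() still returns the number of pages it mapped, but the callers no longer sum page counts; pagefault_single_data_segment() now computes how many pages the faulting range spans and treats a short mapping as -EFAULT. The span rounds the end up and the start down to page boundaries; a worked example with 4 KiB pages:

    /* io_virt = 0x1ff0, bcnt = 0x20: the range 0x1ff0..0x200f touches two pages.
     *
     * pages_in_range = (ALIGN(0x1ff0 + 0x20, 0x1000) - (0x1ff0 & PAGE_MASK)) >> 12
     *                = (0x3000 - 0x1000) >> 12
     *                = 2
     *
     * A pagefault_mr() return of 1 here is a partial mapping; it is now surfaced
     * as -EFAULT instead of a page count that only the RDMA fault path used to
     * check via the removed pages_in_range() helper.
     */
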
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 10ce3b44f645..a43eba9d3572 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1960,7 +1960,7 @@ static int atomic_size_to_mode(int size_mask)
}
static int get_atomic_mode(struct mlx5_ib_dev *dev,
- enum ib_qp_type qp_type)
+ struct mlx5_ib_qp *qp)
{
u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic);
@@ -1970,7 +1970,7 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev,
if (!atomic)
return -EOPNOTSUPP;
- if (qp_type == MLX5_IB_QPT_DCT)
+ if (qp->type == MLX5_IB_QPT_DCT)
atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
else
atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
@@ -1984,6 +1984,10 @@ static int get_atomic_mode(struct mlx5_ib_dev *dev,
atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD))
atomic_mode = MLX5_ATOMIC_MODE_IB_COMP;
+ /* OOO DP QPs do not support atomic operations larger than 8 bytes */
+ if (atomic_mode > MLX5_ATOMIC_MODE_8B && qp->is_ooo_rq)
+ atomic_mode = MLX5_ATOMIC_MODE_8B;
+
return atomic_mode;
}
@@ -2839,6 +2843,29 @@ static int check_valid_flow(struct mlx5_ib_dev *dev, struct ib_pd *pd,
return 0;
}
+static bool get_dp_ooo_cap(struct mlx5_core_dev *mdev, enum ib_qp_type qp_type)
+{
+ if (!MLX5_CAP_GEN_2(mdev, dp_ordering_force))
+ return false;
+
+ switch (qp_type) {
+ case IB_QPT_RC:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_rc);
+ case IB_QPT_XRC_INI:
+ case IB_QPT_XRC_TGT:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_xrc);
+ case IB_QPT_UC:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_uc);
+ case IB_QPT_UD:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_ud);
+ case MLX5_IB_QPT_DCI:
+ case MLX5_IB_QPT_DCT:
+ return MLX5_CAP_GEN(mdev, dp_ordering_ooo_all_dc);
+ default:
+ return false;
+ }
+}
+
static void process_vendor_flag(struct mlx5_ib_dev *dev, int *flags, int flag,
bool cond, struct mlx5_ib_qp *qp)
{
@@ -3365,7 +3392,7 @@ static int set_qpc_atomic_flags(struct mlx5_ib_qp *qp,
if (access_flags & IB_ACCESS_REMOTE_ATOMIC) {
int atomic_mode;
- atomic_mode = get_atomic_mode(dev, qp->type);
+ atomic_mode = get_atomic_mode(dev, qp);
if (atomic_mode < 0)
return -EOPNOTSUPP;
@@ -4316,6 +4343,11 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
if (qp->flags & MLX5_IB_QP_CREATE_SQPN_QP1)
MLX5_SET(qpc, qpc, deth_sqpn, 1);
+ if (qp->is_ooo_rq && cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+ MLX5_SET(qpc, qpc, dp_ordering_1, 1);
+ MLX5_SET(qpc, qpc, dp_ordering_force, 1);
+ }
+
mlx5_cur = to_mlx5_state(cur_state);
mlx5_new = to_mlx5_state(new_state);
@@ -4531,7 +4563,7 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) {
int atomic_mode;
- atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT);
+ atomic_mode = get_atomic_mode(dev, qp);
if (atomic_mode < 0)
return -EOPNOTSUPP;
@@ -4573,6 +4605,10 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr,
MLX5_SET(dctc, dctc, hop_limit, attr->ah_attr.grh.hop_limit);
if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE)
MLX5_SET(dctc, dctc, eth_prio, attr->ah_attr.sl & 0x7);
+ if (qp->is_ooo_rq) {
+ MLX5_SET(dctc, dctc, dp_ordering_1, 1);
+ MLX5_SET(dctc, dctc, dp_ordering_force, 1);
+ }
err = mlx5_core_create_dct(dev, &qp->dct.mdct, qp->dct.in,
MLX5_ST_SZ_BYTES(create_dct_in), out,
@@ -4676,11 +4712,16 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
min(udata->inlen, sizeof(ucmd))))
return -EFAULT;
- if (ucmd.comp_mask ||
+ if (ucmd.comp_mask & ~MLX5_IB_MODIFY_QP_OOO_DP ||
memchr_inv(&ucmd.burst_info.reserved, 0,
sizeof(ucmd.burst_info.reserved)))
return -EOPNOTSUPP;
+ if (ucmd.comp_mask & MLX5_IB_MODIFY_QP_OOO_DP) {
+ if (!get_dp_ooo_cap(dev->mdev, qp->type))
+ return -EOPNOTSUPP;
+ qp->is_ooo_rq = 1;
+ }
}
if (qp->type == IB_QPT_GSI)
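
Out-of-order data placement is opt-in per QP: userspace sets MLX5_IB_MODIFY_QP_OOO_DP in the modify-QP comp_mask, the driver validates the per-QP-type firmware capability, and the dp_ordering bits are only programmed on the INIT->RTR transition (or in the DCT context). Condensed from the hunks above:

    if (ucmd.comp_mask & MLX5_IB_MODIFY_QP_OOO_DP) {
            if (!get_dp_ooo_cap(dev->mdev, qp->type))       /* e.g. dp_ordering_ooo_all_rc */
                    return -EOPNOTSUPP;
            qp->is_ooo_rq = 1;                              /* remembered until RTR */
    }

    /* later, while building the QPC for the INIT->RTR transition: */
    if (qp->is_ooo_rq && cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
            MLX5_SET(qpc, qpc, dp_ordering_1, 1);
            MLX5_SET(qpc, qpc, dp_ordering_force, 1);
    }

Passing the QP into get_atomic_mode() is part of the same feature: when is_ooo_rq is set, the atomic mode is capped at MLX5_ATOMIC_MODE_8B because such QPs cannot execute larger atomic operations.
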
diff --git a/drivers/infiniband/hw/mlx5/restrack.c b/drivers/infiniband/hw/mlx5/restrack.c
index affcf8fe943c..67841922c7b8 100644
--- a/drivers/infiniband/hw/mlx5/restrack.c
+++ b/drivers/infiniband/hw/mlx5/restrack.c
@@ -96,9 +96,18 @@ static int fill_stat_mr_entry(struct sk_buff *msg, struct ib_mr *ibmr)
atomic64_read(&mr->odp_stats.faults)))
goto err_table;
if (rdma_nl_stat_hwcounter_entry(
+ msg, "page_faults_handled",
+ atomic64_read(&mr->odp_stats.faults_handled)))
+ goto err_table;
+ if (rdma_nl_stat_hwcounter_entry(
msg, "page_invalidations",
atomic64_read(&mr->odp_stats.invalidations)))
goto err_table;
+ if (rdma_nl_stat_hwcounter_entry(
+ msg, "page_invalidations_handled",
+ atomic64_read(&mr->odp_stats.invalidations_handled)))
+ goto err_table;
+
if (rdma_nl_stat_hwcounter_entry(msg, "page_prefetch",
atomic64_read(&mr->odp_stats.prefetch)))
goto err_table;