Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r--   drivers/infiniband/hw/mlx5/devx.c      |  58
-rw-r--r--   drivers/infiniband/hw/mlx5/mlx5_ib.h   |   3
-rw-r--r--   drivers/infiniband/hw/mlx5/mr.c        |  68
-rw-r--r--   drivers/infiniband/hw/mlx5/odp.c       |  58
-rw-r--r--   drivers/infiniband/hw/mlx5/qp.c        |   8
5 files changed, 96 insertions(+), 99 deletions(-)
diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c
index 59022b744144..d609f4659afb 100644
--- a/drivers/infiniband/hw/mlx5/devx.c
+++ b/drivers/infiniband/hw/mlx5/devx.c
@@ -1298,29 +1298,6 @@ static int devx_handle_mkey_create(struct mlx5_ib_dev *dev,
 	return 0;
 }
 
-static void devx_free_indirect_mkey(struct rcu_head *rcu)
-{
-	kfree(container_of(rcu, struct devx_obj, devx_mr.rcu));
-}
-
-/* This function to delete from the radix tree needs to be called before
- * destroying the underlying mkey. Otherwise a race might occur in case that
- * other thread will get the same mkey before this one will be deleted,
- * in that case it will fail via inserting to the tree its own data.
- *
- * Note:
- * An error in the destroy is not expected unless there is some other indirect
- * mkey which points to this one. In a kernel cleanup flow it will be just
- * destroyed in the iterative destruction call. In a user flow, in case
- * the application didn't close in the expected order it's its own problem,
- * the mkey won't be part of the tree, in both cases the kernel is safe.
- */
-static void devx_cleanup_mkey(struct devx_obj *obj)
-{
-	xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
-		 mlx5_base_mkey(obj->devx_mr.mmkey.key));
-}
-
 static void devx_cleanup_subscription(struct mlx5_ib_dev *dev,
 				      struct devx_event_subscription *sub)
 {
@@ -1362,8 +1339,16 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
 	int ret;
 
 	dev = mlx5_udata_to_mdev(&attrs->driver_udata);
-	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
-		devx_cleanup_mkey(obj);
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+		/*
+		 * The pagefault_single_data_segment() does commands against
+		 * the mmkey, we must wait for that to stop before freeing the
+		 * mkey, as another allocation could get the same mkey #.
+		 */
+		xa_erase(&obj->ib_dev->mdev->priv.mkey_table,
+			 mlx5_base_mkey(obj->devx_mr.mmkey.key));
+		synchronize_srcu(&dev->mr_srcu);
+	}
 
 	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
 		ret = mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
@@ -1382,12 +1367,6 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
 		devx_cleanup_subscription(dev, sub_entry);
 	mutex_unlock(&devx_event_table->event_xa_lock);
 
-	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
-		call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
-			  devx_free_indirect_mkey);
-		return ret;
-	}
-
 	kfree(obj);
 	return ret;
 }
@@ -1491,26 +1470,21 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 				   &obj_id);
 	WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
 
-	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
-		err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
-		if (err)
-			goto obj_destroy;
-	}
-
 	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out,
 			     cmd_out_len);
 	if (err)
-		goto err_copy;
+		goto obj_destroy;
 
 	if (opcode == MLX5_CMD_OP_CREATE_GENERAL_OBJECT)
 		obj_type = MLX5_GET(general_obj_in_cmd_hdr, cmd_in, obj_type);
-
 	obj->obj_id = get_enc_obj_id(opcode | obj_type << 16, obj_id);
 
+	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
+		err = devx_handle_mkey_indirect(obj, dev, cmd_in, cmd_out);
+		if (err)
+			goto obj_destroy;
+	}
+
 	return 0;
 
-err_copy:
-	if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY)
-		devx_cleanup_mkey(obj);
 obj_destroy:
 	if (obj->flags & DEVX_OBJ_FLAGS_DCT)
 		mlx5_core_destroy_dct(obj->ib_dev->mdev, &obj->core_dct);
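The devx.c changes above replace the deferred call_srcu() free with a synchronous unpublish-then-wait sequence: erase the mkey from the xarray, synchronize_srcu(), then destroy. A minimal sketch of that general pattern follows; the names (my_obj, my_table, my_srcu, my_obj_use, my_obj_destroy) are illustrative stand-ins, not code from the patch.

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/types.h>
#include <linux/xarray.h>

struct my_obj {
	u32 key;
	/* ... resource state ... */
};

static DEFINE_XARRAY(my_table);	/* readers look objects up here */
DEFINE_STATIC_SRCU(my_srcu);	/* readers hold this across lookup + use */

/* Reader side: look up and use an object inside one SRCU section. */
static int my_obj_use(u32 key)
{
	struct my_obj *obj;
	int idx, ret = -ENOENT;

	idx = srcu_read_lock(&my_srcu);
	obj = xa_load(&my_table, key);
	if (obj) {
		/* ... issue commands against obj ... */
		ret = 0;
	}
	srcu_read_unlock(&my_srcu, idx);
	return ret;
}

/* Destroy side: unpublish, wait out readers, then free. */
static void my_obj_destroy(struct my_obj *obj)
{
	/* 1) New readers can no longer find the object. */
	xa_erase(&my_table, obj->key);

	/*
	 * 2) Readers that found it before the erase may still be using
	 * it; wait for them. After this returns, the key can safely be
	 * reused by a new allocation.
	 */
	synchronize_srcu(&my_srcu);

	/* 3) The object is now unreachable and idle. */
	kfree(obj);
}

Because destruction is now synchronous, the rcu_head in struct mlx5_ib_devx_mr has no remaining user, which is why the mlx5_ib.h hunk below deletes it.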
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index 2ceaef3ea3fb..1a98ee2e01c4 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -606,7 +606,7 @@ struct mlx5_ib_mr {
 	struct mlx5_ib_dev     *dev;
 	u32 out[MLX5_ST_SZ_DW(create_mkey_out)];
 	struct mlx5_core_sig_ctx    *sig;
-	int			live;
+	unsigned int		live;
 	void			*descs_alloc;
 	int			access_flags; /* Needed for rereg MR */
 
@@ -639,7 +639,6 @@ struct mlx5_ib_mw {
 struct mlx5_ib_devx_mr {
 	struct mlx5_core_mkey	mmkey;
 	int			ndescs;
-	struct rcu_head		rcu;
 };
 
 struct mlx5_ib_umr_context {
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 1eff031ef048..7019c12005f4 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -84,32 +84,6 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
 		length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
 }
 
-static void update_odp_mr(struct mlx5_ib_mr *mr)
-{
-	if (is_odp_mr(mr)) {
-		/*
-		 * This barrier prevents the compiler from moving the
-		 * setting of umem->odp_data->private to point to our
-		 * MR, before reg_umr finished, to ensure that the MR
-		 * initialization have finished before starting to
-		 * handle invalidations.
-		 */
-		smp_wmb();
-		to_ib_umem_odp(mr->umem)->private = mr;
-		/*
-		 * Make sure we will see the new
-		 * umem->odp_data->private value in the invalidation
-		 * routines, before we can get page faults on the
-		 * MR. Page faults can happen once we put the MR in
-		 * the tree, below this line. Without the barrier,
-		 * there can be a fault handling and an invalidation
-		 * before umem->odp_data->private == mr is visible to
-		 * the invalidation handler.
-		 */
-		smp_wmb();
-	}
-}
-
 static void reg_mr_callback(int status, struct mlx5_async_work *context)
 {
 	struct mlx5_ib_mr *mr =
@@ -1346,8 +1320,6 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	mr->umem = umem;
 	set_mr_fields(dev, mr, npages, length, access_flags);
 
-	update_odp_mr(mr);
-
 	if (use_umr) {
 		int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
 
@@ -1363,10 +1335,12 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 		}
 	}
 
-	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
-		mr->live = 1;
+	if (is_odp_mr(mr)) {
+		to_ib_umem_odp(mr->umem)->private = mr;
 		atomic_set(&mr->num_pending_prefetch, 0);
 	}
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+		smp_store_release(&mr->live, 1);
 
 	return &mr->ibmr;
 error:
@@ -1441,6 +1415,9 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 	if (!mr->umem)
 		return -EINVAL;
 
+	if (is_odp_mr(mr))
+		return -EOPNOTSUPP;
+
 	if (flags & IB_MR_REREG_TRANS) {
 		addr = virt_addr;
 		len = length;
@@ -1486,8 +1463,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 		}
 
 		mr->allocated_from_cache = 0;
-		if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
-			mr->live = 1;
 	} else {
 		/*
 		 * Send a UMR WQE
@@ -1516,7 +1491,6 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
 
 	set_mr_fields(dev, mr, npages, len, access_flags);
 
-	update_odp_mr(mr);
 	return 0;
 
 err:
@@ -1607,15 +1581,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
 		/* Prevent new page faults and
 		 * prefetch requests from succeeding
 		 */
-		mr->live = 0;
+		WRITE_ONCE(mr->live, 0);
+
+		/* Wait for all running page-fault handlers to finish. */
+		synchronize_srcu(&dev->mr_srcu);
 
 		/* dequeue pending prefetch requests for the mr */
 		if (atomic_read(&mr->num_pending_prefetch))
 			flush_workqueue(system_unbound_wq);
 		WARN_ON(atomic_read(&mr->num_pending_prefetch));
 
-		/* Wait for all running page-fault handlers to finish. */
-		synchronize_srcu(&dev->mr_srcu);
 		/* Destroy all page mappings */
 		if (!umem_odp->is_implicit_odp)
 			mlx5_ib_invalidate_range(umem_odp,
@@ -1987,14 +1962,25 @@ free:
 
 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
 {
+	struct mlx5_ib_dev *dev = to_mdev(mw->device);
 	struct mlx5_ib_mw *mmw = to_mmw(mw);
 	int err;
 
-	err = mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
-				     &mmw->mmkey);
-	if (!err)
-		kfree(mmw);
-	return err;
+	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+		xa_erase_irq(&dev->mdev->priv.mkey_table,
+			     mlx5_base_mkey(mmw->mmkey.key));
+		/*
+		 * pagefault_single_data_segment() may be accessing mmw under
+		 * SRCU if the user bound an ODP MR to this MW.
+		 */
+		synchronize_srcu(&dev->mr_srcu);
+	}
+
+	err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey);
+	if (err)
+		return err;
+	kfree(mmw);
+	return 0;
}
 
 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
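The mr.c and mlx5_ib.h changes turn mr->live into a publish flag with acquire/release semantics: everything a page-fault handler may touch is initialized before the release store of live = 1, and readers check the flag with an acquire load first. That ordering is what lets the patch delete the open-coded smp_wmb() pairs in update_odp_mr(). A minimal sketch of the protocol, with illustrative demo_* names:

#include <asm/barrier.h>
#include <linux/compiler.h>
#include <linux/errno.h>

struct demo_mr {
	void *private;		/* set before 'live' is published */
	unsigned int live;	/* 0 = dead/unpublished, 1 = usable */
};

/* Registration path: initialize everything, then publish. */
static void demo_mr_publish(struct demo_mr *mr, void *priv)
{
	mr->private = priv;
	/* Pairs with the smp_load_acquire() in demo_mr_fault(). */
	smp_store_release(&mr->live, 1);
}

/* Fault path: only touch the MR if the acquire load sees live != 0. */
static int demo_mr_fault(struct demo_mr *mr)
{
	if (!smp_load_acquire(&mr->live))
		return -EFAULT;		/* "got dead MR", as in the patch */
	/* mr->private is guaranteed to be visible and valid here. */
	return 0;
}

/* Teardown path: unpublish; the caller then waits out readers. */
static void demo_mr_kill(struct demo_mr *mr)
{
	WRITE_ONCE(mr->live, 0);	/* no new faults or prefetches succeed */
	/* followed by synchronize_srcu() before freeing, as dereg_mr() does */
}

Note also that dereg_mr() now does the synchronize_srcu() immediately after clearing live, before looking at the prefetch counter: once the grace period ends, no fault handler can still be queueing prefetch work, so the counter can only fall.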
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 2e9b43061797..3f9478d19376 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -178,6 +178,29 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset,
 		return;
 	}
 
+	/*
+	 * The locking here is pretty subtle. Ideally the implicit children
+	 * list would be protected by the umem_mutex, however that is not
+	 * possible. Instead this uses a weaker update-then-lock pattern:
+	 *
+	 *  srcu_read_lock()
+	 *    <change children list>
+	 *  mutex_lock(umem_mutex)
+	 *    mlx5_ib_update_xlt()
+	 *  mutex_unlock(umem_mutex)
+	 *  destroy lkey
+	 *
+	 * ie any change to the children list must be followed by the locked
+	 * update_xlt before destroying.
+	 *
+	 * The umem_mutex provides the acquire/release semantic needed to make
+	 * the children list visible to a racing thread. While SRCU is not
+	 * technically required, using it gives consistent use of the SRCU
+	 * locking around the children list.
+	 */
+	lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex);
+	lockdep_assert_held(&mr->dev->mr_srcu);
+
 	odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE,
 			 nentries * MLX5_IMR_MTT_SIZE, mr);
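The comment added above prescribes an update-then-lock discipline for the implicit children list. Spelled out as code, the sequence looks roughly like the sketch below; all demo_* names are hypothetical stand-ins, and the real driver spreads these steps across its implicit-MR paths (mr_leaf_free_action() in the next hunk is one example).

#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(demo_srcu);

struct demo_imr {
	struct mutex umem_mutex;	/* stands in for the umem_mutex */
	struct list_head children;	/* the implicit children list */
};

struct demo_child {
	struct list_head entry;
};

/* Hypothetical hooks standing in for mlx5_ib_update_xlt() and mkey
 * destruction in the real driver. */
void demo_update_xlt(struct demo_imr *imr);
void demo_destroy_lkey(struct demo_child *child);

static void demo_remove_child(struct demo_imr *imr, struct demo_child *child)
{
	int key = srcu_read_lock(&demo_srcu);

	/* <change children list> */
	list_del_rcu(&child->entry);

	/*
	 * The locked update_xlt is what makes the list change visible:
	 * the mutex's acquire/release ordering publishes it to racing
	 * threads, and update_xlt publishes it to the device.
	 */
	mutex_lock(&imr->umem_mutex);
	demo_update_xlt(imr);
	mutex_unlock(&imr->umem_mutex);

	srcu_read_unlock(&demo_srcu, key);

	/* Only now is it safe to destroy the child's lkey. */
	demo_destroy_lkey(child);
}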
@@ -202,15 +225,22 @@ static void mr_leaf_free_action(struct work_struct *work)
 	struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work);
 	int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
 	struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent;
+	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
+	int srcu_key;
 
 	mr->parent = NULL;
 	synchronize_srcu(&mr->dev->mr_srcu);
 
-	ib_umem_odp_release(odp);
-	if (imr->live)
+	if (smp_load_acquire(&imr->live)) {
+		srcu_key = srcu_read_lock(&mr->dev->mr_srcu);
+		mutex_lock(&odp_imr->umem_mutex);
 		mlx5_ib_update_xlt(imr, idx, 1, 0,
 				   MLX5_IB_UPD_XLT_INDIRECT |
 				   MLX5_IB_UPD_XLT_ATOMIC);
+		mutex_unlock(&odp_imr->umem_mutex);
+		srcu_read_unlock(&mr->dev->mr_srcu, srcu_key);
+	}
+	ib_umem_odp_release(odp);
 
 	mlx5_mr_cache_free(mr->dev, mr);
 
 	if (atomic_dec_and_test(&imr->num_leaf_free))
@@ -278,7 +308,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 				   idx - blk_start_idx + 1, 0,
 				   MLX5_IB_UPD_XLT_ZAP |
 				   MLX5_IB_UPD_XLT_ATOMIC);
-	mutex_unlock(&umem_odp->umem_mutex);
 	/*
 	 * We are now sure that the device will not access the
 	 * memory. We can safely unmap it, and mark it as dirty if
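The two mlx5_ib_invalidate_range() hunks here (the early unlock removed above, the unlock re-added after the dying check below) grow the umem_mutex critical section so that the emptiness test, the live = 0 store, and the dying mark happen as one atomic step. Roughly, under illustrative demo_* names:

#include <linux/compiler.h>
#include <linux/mutex.h>
#include <linux/types.h>
#include <linux/workqueue.h>

struct demo_leaf {
	struct mutex umem_mutex;
	unsigned long npages;		/* pages still mapped */
	int dying;			/* teardown already queued? */
	unsigned int mr_live;		/* the leaf MR's 'live' flag */
	struct work_struct work;	/* leaf free work */
	bool has_parent;
};

void demo_zap_mappings(struct demo_leaf *leaf);	/* hypothetical stand-in */

static void demo_invalidate(struct demo_leaf *leaf)
{
	mutex_lock(&leaf->umem_mutex);
	demo_zap_mappings(leaf);

	/*
	 * Check and mark under the same lock: two racing invalidations
	 * cannot both (or neither) queue the work, and the MR is marked
	 * dead before anyone can observe dying == 1.
	 */
	if (!leaf->npages && leaf->has_parent && !leaf->dying) {
		WRITE_ONCE(leaf->mr_live, 0);
		leaf->dying = 1;
		schedule_work(&leaf->work);
	}
	mutex_unlock(&leaf->umem_mutex);
}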
@@ -289,10 +318,12 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
 	if (unlikely(!umem_odp->npages && mr->parent &&
 		     !umem_odp->dying)) {
-		WRITE_ONCE(umem_odp->dying, 1);
+		WRITE_ONCE(mr->live, 0);
+		umem_odp->dying = 1;
 		atomic_inc(&mr->parent->num_leaf_free);
 		schedule_work(&umem_odp->work);
 	}
+	mutex_unlock(&umem_odp->umem_mutex);
 }
 
 void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
@@ -429,8 +460,6 @@ static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd,
 	mr->ibmr.lkey = mr->mmkey.key;
 	mr->ibmr.rkey = mr->mmkey.key;
 
-	mr->live = 1;
-
 	mlx5_ib_dbg(dev, "key %x dev %p mr %p\n",
 		    mr->mmkey.key, dev->mdev, mr);
 
@@ -484,6 +513,8 @@ next_mr:
 		mtt->parent = mr;
 		INIT_WORK(&odp->work, mr_leaf_free_action);
 
+		smp_store_release(&mtt->live, 1);
+
 		if (!nentries)
 			start_idx = addr >> MLX5_IMR_MTT_SHIFT;
 		nentries++;
@@ -536,6 +567,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	init_waitqueue_head(&imr->q_leaf_free);
 	atomic_set(&imr->num_leaf_free, 0);
 	atomic_set(&imr->num_pending_prefetch, 0);
+	smp_store_release(&imr->live, 1);
 
 	return imr;
 }
@@ -555,15 +587,19 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 		if (mr->parent != imr)
 			continue;
 
+		mutex_lock(&umem_odp->umem_mutex);
 		ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
 					    ib_umem_end(umem_odp));
 
-		if (umem_odp->dying)
+		if (umem_odp->dying) {
+			mutex_unlock(&umem_odp->umem_mutex);
 			continue;
+		}
 
-		WRITE_ONCE(umem_odp->dying, 1);
+		umem_odp->dying = 1;
 		atomic_inc(&imr->num_leaf_free);
 		schedule_work(&umem_odp->work);
+		mutex_unlock(&umem_odp->umem_mutex);
 	}
 
 	up_read(&per_mm->umem_rwsem);
@@ -773,7 +809,7 @@ next_mr:
 	switch (mmkey->type) {
 	case MLX5_MKEY_MR:
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
-		if (!mr->live || !mr->ibmr.pd) {
+		if (!smp_load_acquire(&mr->live) || !mr->ibmr.pd) {
 			mlx5_ib_dbg(dev, "got dead MR\n");
 			ret = -EFAULT;
 			goto srcu_unlock;
@@ -1641,12 +1677,12 @@ static bool num_pending_prefetch_inc(struct ib_pd *pd,
 
 		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
 
-		if (mr->ibmr.pd != pd) {
+		if (!smp_load_acquire(&mr->live)) {
 			ret = false;
 			break;
 		}
 
-		if (!mr->live) {
+		if (mr->ibmr.pd != pd) {
 			ret = false;
 			break;
 		}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 8937d72ddcf6..5fd071c05944 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -3249,10 +3249,12 @@ static int modify_raw_packet_qp_sq(
 	}
 
 	/* Only remove the old rate after new rate was set */
-	if ((old_rl.rate &&
-	     !mlx5_rl_are_equal(&old_rl, &new_rl)) ||
-	    (new_state != MLX5_SQC_STATE_RDY))
+	if ((old_rl.rate && !mlx5_rl_are_equal(&old_rl, &new_rl)) ||
+	    (new_state != MLX5_SQC_STATE_RDY)) {
 		mlx5_rl_remove_rate(dev, &old_rl);
+		if (new_state != MLX5_SQC_STATE_RDY)
+			memset(&new_rl, 0, sizeof(new_rl));
+	}
 
 	ibqp->rl = new_rl;
 	sq->state = new_state;
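Finally, the qp.c hunk fixes the rate-limit bookkeeping in modify_raw_packet_qp_sq(): when the SQ does not reach RDY, the old rate reference is dropped, but new_rl must also be zeroed so a stale rate is not cached in ibqp->rl. Reduced to a sketch with illustrative demo_* names:

#include <linux/string.h>
#include <linux/types.h>

struct demo_rl {
	u32 rate;	/* 0 means "no rate limit active" */
};

void demo_rl_remove_rate(struct demo_rl *rl);	/* hypothetical stand-in */

static void demo_commit_rate(struct demo_rl *cached, struct demo_rl *new_rl,
			     bool rate_changed, bool reached_rdy)
{
	/* Only remove the old rate after the new rate was set. */
	if ((cached->rate && rate_changed) || !reached_rdy) {
		demo_rl_remove_rate(cached);
		/*
		 * If the SQ never reached RDY, no new rate took effect;
		 * clear it so a stale value is not cached below.
		 */
		if (!reached_rdy)
			memset(new_rl, 0, sizeof(*new_rl));
	}
	*cached = *new_rl;	/* what ibqp->rl = new_rl does in the patch */
}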