-rw-r--r-- | drivers/infiniband/core/umem_odp.c             |  38
-rw-r--r-- | drivers/infiniband/hw/mlx5/cq.c                |  33
-rw-r--r-- | drivers/infiniband/hw/mlx5/devx.c              |   8
-rw-r--r-- | drivers/infiniband/hw/mlx5/main.c              |  17
-rw-r--r-- | drivers/infiniband/hw/mlx5/mlx5_ib.h           |  24
-rw-r--r-- | drivers/infiniband/hw/mlx5/mr.c                | 139
-rw-r--r-- | drivers/infiniband/hw/mlx5/odp.c               | 974
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/main.c |   4
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/mr.c   |  28
-rw-r--r-- | include/linux/mlx5/driver.h                    |   4
-rw-r--r-- | include/rdma/ib_umem_odp.h                     |  18
11 files changed, 624 insertions, 663 deletions
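The diff below replaces the global mlx5_core mkey_table with a per-IB-device xarray (dev->odp_mkeys) holding only ODP-related mkeys, with readers protected by dev->odp_srcu instead of the old mr_srcu. For orientation only, here is a minimal sketch of the reader-side pattern the series converges on; with_odp_mkey() is a hypothetical helper, error handling and the num_deferred_work refcount used by the prefetch path are omitted.

/* Illustrative sketch only -- not part of this patch. */
static int with_odp_mkey(struct mlx5_ib_dev *dev, u32 key)
{
	struct mlx5_core_mkey *mmkey;
	int srcu_key, ret = -ENOENT;

	/* Readers pin the mkey with SRCU rather than a refcount. */
	srcu_key = srcu_read_lock(&dev->odp_srcu);
	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
	if (mmkey && mmkey->key == key && mmkey->type == MLX5_MKEY_MR) {
		struct mlx5_ib_mr *mr =
			container_of(mmkey, struct mlx5_ib_mr, mmkey);

		/* Page-fault/prefetch work on mr happens here; the MR is
		 * only guaranteed valid while the SRCU read lock is held.
		 */
		ret = is_odp_mr(mr) ? 0 : -EINVAL;
	}
	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	return ret;
}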
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 163ff7ba92b7..d7d5fadf0899 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -508,7 +508,6 @@ static int ib_umem_odp_map_dma_single_page( { struct ib_device *dev = umem_odp->umem.ibdev; dma_addr_t dma_addr; - int remove_existing_mapping = 0; int ret = 0; /* @@ -534,28 +533,29 @@ static int ib_umem_odp_map_dma_single_page( } else if (umem_odp->page_list[page_index] == page) { umem_odp->dma_list[page_index] |= access_mask; } else { - pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem_odp->page_list[page_index], page); - /* Better remove the mapping now, to prevent any further - * damage. */ - remove_existing_mapping = 1; + /* + * This is a race here where we could have done: + * + * CPU0 CPU1 + * get_user_pages() + * invalidate() + * page_fault() + * mutex_lock(umem_mutex) + * page from GUP != page in ODP + * + * It should be prevented by the retry test above as reading + * the seq number should be reliable under the + * umem_mutex. Thus something is really not working right if + * things get here. + */ + WARN(true, + "Got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", + umem_odp->page_list[page_index], page); + ret = -EAGAIN; } out: put_user_page(page); - - if (remove_existing_mapping) { - ib_umem_notifier_start_account(umem_odp); - dev->ops.invalidate_range( - umem_odp, - ib_umem_start(umem_odp) + - (page_index << umem_odp->page_shift), - ib_umem_start(umem_odp) + - ((page_index + 1) << umem_odp->page_shift)); - ib_umem_notifier_end_account(umem_odp); - ret = -EAGAIN; - } - return ret; } diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 45f48cde6b9d..cc938d27f913 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -423,9 +423,6 @@ static int mlx5_poll_one(struct mlx5_ib_cq *cq, struct mlx5_cqe64 *cqe64; struct mlx5_core_qp *mqp; struct mlx5_ib_wq *wq; - struct mlx5_sig_err_cqe *sig_err_cqe; - struct mlx5_core_mkey *mmkey; - struct mlx5_ib_mr *mr; uint8_t opcode; uint32_t qpn; u16 wqe_ctr; @@ -519,27 +516,29 @@ repoll: } } break; - case MLX5_CQE_SIG_ERR: - sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64; + case MLX5_CQE_SIG_ERR: { + struct mlx5_sig_err_cqe *sig_err_cqe = + (struct mlx5_sig_err_cqe *)cqe64; + struct mlx5_core_sig_ctx *sig; - xa_lock(&dev->mdev->priv.mkey_table); - mmkey = xa_load(&dev->mdev->priv.mkey_table, + xa_lock(&dev->sig_mrs); + sig = xa_load(&dev->sig_mrs, mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey))); - mr = to_mibmr(mmkey); - get_sig_err_item(sig_err_cqe, &mr->sig->err_item); - mr->sig->sig_err_exists = true; - mr->sig->sigerr_count++; + get_sig_err_item(sig_err_cqe, &sig->err_item); + sig->sig_err_exists = true; + sig->sigerr_count++; mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n", - cq->mcq.cqn, mr->sig->err_item.key, - mr->sig->err_item.err_type, - mr->sig->err_item.sig_err_offset, - mr->sig->err_item.expected, - mr->sig->err_item.actual); + cq->mcq.cqn, sig->err_item.key, + sig->err_item.err_type, + sig->err_item.sig_err_offset, + sig->err_item.expected, + sig->err_item.actual); - xa_unlock(&dev->mdev->priv.mkey_table); + xa_unlock(&dev->sig_mrs); goto repoll; } + } return 0; } diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index d609f4659afb..6b1fca91d7d3 
100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -1265,8 +1265,8 @@ static int devx_handle_mkey_indirect(struct devx_obj *obj, mkey->pd = MLX5_GET(mkc, mkc, pd); devx_mr->ndescs = MLX5_GET(mkc, mkc, translations_octword_size); - return xa_err(xa_store(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(mkey->key), mkey, GFP_KERNEL)); + return xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(mkey->key), mkey, + GFP_KERNEL)); } static int devx_handle_mkey_create(struct mlx5_ib_dev *dev, @@ -1345,9 +1345,9 @@ static int devx_obj_cleanup(struct ib_uobject *uobject, * the mmkey, we must wait for that to stop before freeing the * mkey, as another allocation could get the same mkey #. */ - xa_erase(&obj->ib_dev->mdev->priv.mkey_table, + xa_erase(&obj->ib_dev->odp_mkeys, mlx5_base_mkey(obj->devx_mr.mmkey.key)); - synchronize_srcu(&dev->mr_srcu); + synchronize_srcu(&dev->odp_srcu); } if (obj->flags & DEVX_OBJ_FLAGS_DCT) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 20fabb368f61..0a8a1a129f17 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6133,11 +6133,10 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device, static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { mlx5_ib_cleanup_multiport_master(dev); - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { - srcu_barrier(&dev->mr_srcu); - cleanup_srcu_struct(&dev->mr_srcu); - } + WARN_ON(!xa_empty(&dev->odp_mkeys)); + cleanup_srcu_struct(&dev->odp_srcu); + WARN_ON(!xa_empty(&dev->sig_mrs)); WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES)); } @@ -6189,15 +6188,15 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) mutex_init(&dev->cap_mask_mutex); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); + xa_init(&dev->odp_mkeys); + xa_init(&dev->sig_mrs); spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { - err = init_srcu_struct(&dev->mr_srcu); - if (err) - goto err_mp; - } + err = init_srcu_struct(&dev->odp_srcu); + if (err) + goto err_mp; return 0; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 3a97c2da632a..5b4c5751a98f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -604,7 +604,6 @@ struct mlx5_ib_mr { struct mlx5_ib_dev *dev; u32 out[MLX5_ST_SZ_DW(create_mkey_out)]; struct mlx5_core_sig_ctx *sig; - unsigned int live; void *descs_alloc; int access_flags; /* Needed for rereg MR */ @@ -616,12 +615,18 @@ struct mlx5_ib_mr { u64 data_iova; u64 pi_iova; - atomic_t num_leaf_free; - wait_queue_head_t q_leaf_free; - struct mlx5_async_work cb_work; - atomic_t num_pending_prefetch; + /* For ODP and implicit */ + atomic_t num_deferred_work; + struct xarray implicit_children; + union { + struct rcu_head rcu; + struct list_head elm; + struct work_struct work; + } odp_destroy; struct ib_odp_counters odp_stats; bool is_odp_implicit; + + struct mlx5_async_work cb_work; }; static inline bool is_odp_mr(struct mlx5_ib_mr *mr) @@ -977,7 +982,9 @@ struct mlx5_ib_dev { * Sleepable RCU that prevents destruction of MRs while they are still * being used by a page fault handler. 
*/ - struct srcu_struct mr_srcu; + struct srcu_struct odp_srcu; + struct xarray odp_mkeys; + u32 null_mkey; struct mlx5_ib_flow_db *flow_db; /* protect resources needed as part of reset flow */ @@ -999,6 +1006,8 @@ struct mlx5_ib_dev { struct mlx5_srq_table srq_table; struct mlx5_async_ctx async_ctx; struct mlx5_devx_event_table devx_event_table; + + struct xarray sig_mrs; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1162,6 +1171,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct ib_udata *udata, int access_flags); void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); +void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr); int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_pd *pd, struct ib_udata *udata); @@ -1223,6 +1233,8 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, int entry); void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); +int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr); + int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, struct ib_mr_status *mr_status); struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 986a7722ab27..f583476f5c8b 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -50,7 +50,6 @@ enum { static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static int mr_cache_max_order(struct mlx5_ib_dev *dev); -static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { @@ -59,13 +58,9 @@ static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) { - int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); + WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - /* Wait until all page fault handlers using the mr complete. */ - synchronize_srcu(&dev->mr_srcu); - - return err; + return mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } static int order2idx(struct mlx5_ib_dev *dev, int order) @@ -94,8 +89,6 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context) struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; unsigned long flags; - struct xarray *mkeys = &dev->mdev->priv.mkey_table; - int err; spin_lock_irqsave(&ent->lock, flags); ent->pending--; @@ -122,13 +115,6 @@ static void reg_mr_callback(int status, struct mlx5_async_work *context) ent->size++; spin_unlock_irqrestore(&ent->lock, flags); - xa_lock_irqsave(mkeys, flags); - err = xa_err(__xa_store(mkeys, mlx5_base_mkey(mr->mmkey.key), - &mr->mmkey, GFP_ATOMIC)); - xa_unlock_irqrestore(mkeys, flags); - if (err) - pr_err("Error inserting to mkey tree. 
0x%x\n", -err); - if (!completion_done(&ent->compl)) complete(&ent->compl); } @@ -218,9 +204,6 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - synchronize_srcu(&dev->mr_srcu); - list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { list_del(&mr->list); kfree(mr); @@ -511,7 +494,7 @@ void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) c = order2idx(dev, mr->order); WARN_ON(c < 0 || c >= MAX_MR_CACHE_ENTRIES); - if (unreg_umr(dev, mr)) { + if (mlx5_mr_cache_invalidate(mr)) { mr->allocated_from_cache = false; destroy_mkey(dev, mr); ent = &cache->ent[c]; @@ -555,10 +538,6 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey); } -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - synchronize_srcu(&dev->mr_srcu); -#endif - list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { list_del(&mr->list); kfree(mr); @@ -1335,10 +1314,15 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (is_odp_mr(mr)) { to_ib_umem_odp(mr->umem)->private = mr; - atomic_set(&mr->num_pending_prefetch, 0); + atomic_set(&mr->num_deferred_work, 0); + err = xa_err(xa_store(&dev->odp_mkeys, + mlx5_base_mkey(mr->mmkey.key), &mr->mmkey, + GFP_KERNEL)); + if (err) { + dereg_mr(dev, mr); + return ERR_PTR(err); + } } - if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) - smp_store_release(&mr->live, 1); return &mr->ibmr; error: @@ -1346,22 +1330,29 @@ error: return ERR_PTR(err); } -static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) +/** + * mlx5_mr_cache_invalidate - Fence all DMA on the MR + * @mr: The MR to fence + * + * Upon return the NIC will not be doing any DMA to the pages under the MR, + * and any DMA inprogress will be completed. Failure of this function + * indicates the HW has failed catastrophically. + */ +int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr) { - struct mlx5_core_dev *mdev = dev->mdev; struct mlx5_umr_wr umrwr = {}; - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) + if (mr->dev->mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) return 0; umrwr.wr.send_flags = MLX5_IB_SEND_UMR_DISABLE_MR | MLX5_IB_SEND_UMR_UPDATE_PD_ACCESS; umrwr.wr.opcode = MLX5_IB_WR_UMR; - umrwr.pd = dev->umrc.pd; + umrwr.pd = mr->dev->umrc.pd; umrwr.mkey = mr->mmkey.key; umrwr.ignore_free_state = 1; - return mlx5_ib_post_send_wait(dev, &umrwr); + return mlx5_ib_post_send_wait(mr->dev, &umrwr); } static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, @@ -1445,7 +1436,7 @@ int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, * UMR can't be used - MKey needs to be replaced. 
*/ if (mr->allocated_from_cache) - err = unreg_umr(dev, mr); + err = mlx5_mr_cache_invalidate(mr); else err = destroy_mkey(dev, mr); if (err) @@ -1558,6 +1549,7 @@ static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) mr->sig->psv_wire.psv_idx)) mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", mr->sig->psv_wire.psv_idx); + xa_erase(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key)); kfree(mr->sig); mr->sig = NULL; } @@ -1573,54 +1565,20 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) int npages = mr->npages; struct ib_umem *umem = mr->umem; - if (is_odp_mr(mr)) { - struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem); - - /* Prevent new page faults and - * prefetch requests from succeeding - */ - WRITE_ONCE(mr->live, 0); - - /* Wait for all running page-fault handlers to finish. */ - synchronize_srcu(&dev->mr_srcu); - - /* dequeue pending prefetch requests for the mr */ - if (atomic_read(&mr->num_pending_prefetch)) - flush_workqueue(system_unbound_wq); - WARN_ON(atomic_read(&mr->num_pending_prefetch)); - - /* Destroy all page mappings */ - if (!umem_odp->is_implicit_odp) - mlx5_ib_invalidate_range(umem_odp, - ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); - else - mlx5_ib_free_implicit_mr(mr); - /* - * We kill the umem before the MR for ODP, - * so that there will not be any invalidations in - * flight, looking at the *mr struct. - */ - ib_umem_odp_release(umem_odp); - atomic_sub(npages, &dev->mdev->priv.reg_pages); - - /* Avoid double-freeing the umem. */ - umem = NULL; - } + /* Stop all DMA */ + if (is_odp_mr(mr)) + mlx5_ib_fence_odp_mr(mr); + else + clean_mr(dev, mr); - clean_mr(dev, mr); + if (mr->allocated_from_cache) + mlx5_mr_cache_free(dev, mr); + else + kfree(mr); - /* - * We should unregister the DMA address from the HCA before - * remove the DMA mapping. 
- */ - mlx5_mr_cache_free(dev, mr); ib_umem_release(umem); - if (umem) - atomic_sub(npages, &dev->mdev->priv.reg_pages); + atomic_sub(npages, &dev->mdev->priv.reg_pages); - if (!mr->allocated_from_cache) - kfree(mr); } int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) @@ -1632,6 +1590,11 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr); } + if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) { + mlx5_ib_free_implicit_mr(mmr); + return 0; + } + dereg_mr(to_mdev(ibmr->device), mmr); return 0; @@ -1795,8 +1758,15 @@ static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, if (err) goto err_free_mtt_mr; + err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), + mr->sig, GFP_KERNEL)); + if (err) + goto err_free_descs; return 0; +err_free_descs: + destroy_mkey(dev, mr); + mlx5_free_priv_descs(mr); err_free_mtt_mr: dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr); mr->mtt_mr = NULL; @@ -1949,9 +1919,19 @@ struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type, } } + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { + err = xa_err(xa_store(&dev->odp_mkeys, + mlx5_base_mkey(mw->mmkey.key), &mw->mmkey, + GFP_KERNEL)); + if (err) + goto free_mkey; + } + kfree(in); return &mw->ibmw; +free_mkey: + mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey); free: kfree(mw); kfree(in); @@ -1965,13 +1945,12 @@ int mlx5_ib_dealloc_mw(struct ib_mw *mw) int err; if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { - xa_erase(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(mmw->mmkey.key)); + xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)); /* * pagefault_single_data_segment() may be accessing mmw under * SRCU if the user bound an ODP MR to this MW. 
*/ - synchronize_srcu(&dev->mr_srcu); + synchronize_srcu(&dev->odp_srcu); } err = mlx5_core_destroy_mkey(dev->mdev, &mmw->mmkey); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index b332117bca97..45ee40c2f36e 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -93,158 +93,152 @@ struct mlx5_pagefault { static u64 mlx5_imr_ksm_entries; -static int check_parent(struct ib_umem_odp *odp, - struct mlx5_ib_mr *parent) +void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, + struct mlx5_ib_mr *imr, int flags) { - struct mlx5_ib_mr *mr = odp->private; - - return mr && mr->parent == parent && !odp->dying; -} - -static struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) -{ - if (WARN_ON(!mr || !is_odp_mr(mr))) - return NULL; - - return to_ib_umem_odp(mr->umem)->per_mm; -} - -static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) -{ - struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext_per_mm *per_mm = odp->per_mm; - struct rb_node *rb; - - down_read(&per_mm->umem_rwsem); - while (1) { - rb = rb_next(&odp->interval_tree.rb); - if (!rb) - goto not_found; - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (check_parent(odp, parent)) - goto end; - } -not_found: - odp = NULL; -end: - up_read(&per_mm->umem_rwsem); - return odp; -} - -static struct ib_umem_odp *odp_lookup(u64 start, u64 length, - struct mlx5_ib_mr *parent) -{ - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); - struct ib_umem_odp *odp; - struct rb_node *rb; - - down_read(&per_mm->umem_rwsem); - odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); - if (!odp) - goto end; - - while (1) { - if (check_parent(odp, parent)) - goto end; - rb = rb_next(&odp->interval_tree.rb); - if (!rb) - goto not_found; - odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(odp) > start + length) - goto not_found; - } -not_found: - odp = NULL; -end: - up_read(&per_mm->umem_rwsem); - return odp; -} - -void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, - size_t nentries, struct mlx5_ib_mr *mr, int flags) -{ - struct ib_pd *pd = mr->ibmr.pd; - struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct ib_umem_odp *odp; - unsigned long va; - int i; + struct mlx5_klm *end = pklm + nentries; if (flags & MLX5_IB_UPD_XLT_ZAP) { - for (i = 0; i < nentries; i++, pklm++) { + for (; pklm != end; pklm++, idx++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - pklm->key = cpu_to_be32(dev->null_mkey); + pklm->key = cpu_to_be32(imr->dev->null_mkey); pklm->va = 0; } return; } /* - * The locking here is pretty subtle. Ideally the implicit children - * list would be protected by the umem_mutex, however that is not + * The locking here is pretty subtle. Ideally the implicit_children + * xarray would be protected by the umem_mutex, however that is not * possible. Instead this uses a weaker update-then-lock pattern: * * srcu_read_lock() - * <change children list> + * xa_store() * mutex_lock(umem_mutex) * mlx5_ib_update_xlt() * mutex_unlock(umem_mutex) * destroy lkey * - * ie any change the children list must be followed by the locked - * update_xlt before destroying. + * ie any change the xarray must be followed by the locked update_xlt + * before destroying. * * The umem_mutex provides the acquire/release semantic needed to make - * the children list visible to a racing thread. While SRCU is not + * the xa_store() visible to a racing thread. 
While SRCU is not * technically required, using it gives consistent use of the SRCU - * locking around the children list. + * locking around the xarray. */ - lockdep_assert_held(&to_ib_umem_odp(mr->umem)->umem_mutex); - lockdep_assert_held(&mr->dev->mr_srcu); + lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex); + lockdep_assert_held(&imr->dev->odp_srcu); - odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, - nentries * MLX5_IMR_MTT_SIZE, mr); + for (; pklm != end; pklm++, idx++) { + struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx); - for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); - va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && ib_umem_start(odp) == va) { - struct mlx5_ib_mr *mtt = odp->private; - + if (mtt) { pklm->key = cpu_to_be32(mtt->ibmr.lkey); - odp = odp_next(odp); + pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE); } else { - pklm->key = cpu_to_be32(dev->null_mkey); + pklm->key = cpu_to_be32(imr->dev->null_mkey); + pklm->va = 0; } - mlx5_ib_dbg(dev, "[%d] va %lx key %x\n", - i, va, be32_to_cpu(pklm->key)); } } -static void mr_leaf_free_action(struct work_struct *work) +static void dma_fence_odp_mr(struct mlx5_ib_mr *mr) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + + /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */ + mutex_lock(&odp->umem_mutex); + if (odp->npages) { + mlx5_mr_cache_invalidate(mr); + ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp), + ib_umem_end(odp)); + WARN_ON(odp->npages); + } + odp->private = NULL; + mutex_unlock(&odp->umem_mutex); + + if (!mr->allocated_from_cache) { + mlx5_core_destroy_mkey(mr->dev->mdev, &mr->mmkey); + WARN_ON(mr->descs); + } +} + +/* + * This must be called after the mr has been removed from implicit_children + * and the SRCU synchronized. NOTE: The MR does not necessarily have to be + * empty here, parallel page faults could have raced with the free process and + * added pages to it. 
+ */ +static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt) { - struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; - struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; + struct mlx5_ib_mr *imr = mr->parent; struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; int srcu_key; - mr->parent = NULL; - synchronize_srcu(&mr->dev->mr_srcu); + /* implicit_child_mr's are not allowed to have deferred work */ + WARN_ON(atomic_read(&mr->num_deferred_work)); - if (smp_load_acquire(&imr->live)) { - srcu_key = srcu_read_lock(&mr->dev->mr_srcu); + if (need_imr_xlt) { + srcu_key = srcu_read_lock(&mr->dev->odp_srcu); mutex_lock(&odp_imr->umem_mutex); - mlx5_ib_update_xlt(imr, idx, 1, 0, + mlx5_ib_update_xlt(mr->parent, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); mutex_unlock(&odp_imr->umem_mutex); - srcu_read_unlock(&mr->dev->mr_srcu, srcu_key); + srcu_read_unlock(&mr->dev->odp_srcu, srcu_key); } - ib_umem_odp_release(odp); + + dma_fence_odp_mr(mr); + + mr->parent = NULL; mlx5_mr_cache_free(mr->dev, mr); + ib_umem_odp_release(odp); + atomic_dec(&imr->num_deferred_work); +} + +static void free_implicit_child_mr_work(struct work_struct *work) +{ + struct mlx5_ib_mr *mr = + container_of(work, struct mlx5_ib_mr, odp_destroy.work); + + free_implicit_child_mr(mr, true); +} + +static void free_implicit_child_mr_rcu(struct rcu_head *head) +{ + struct mlx5_ib_mr *mr = + container_of(head, struct mlx5_ib_mr, odp_destroy.rcu); - if (atomic_dec_and_test(&imr->num_leaf_free)) - wake_up(&imr->q_leaf_free); + /* Freeing a MR is a sleeping operation, so bounce to a work queue */ + INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work); + queue_work(system_unbound_wq, &mr->odp_destroy.work); +} + +static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; + struct mlx5_ib_mr *imr = mr->parent; + + xa_lock(&imr->implicit_children); + /* + * This can race with mlx5_ib_free_implicit_mr(), the first one to + * reach the xa lock wins the race and destroys the MR. + */ + if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) != + mr) + goto out_unlock; + + atomic_inc(&imr->num_deferred_work); + call_srcu(&mr->dev->odp_srcu, &mr->odp_destroy.rcu, + free_implicit_child_mr_rcu); + +out_unlock: + xa_unlock(&imr->implicit_children); } void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, @@ -258,16 +252,15 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, int in_block = 0; u64 addr; - if (!umem_odp) { - pr_err("invalidation called on NULL umem or non-ODP umem\n"); - return; - } - + mutex_lock(&umem_odp->umem_mutex); + /* + * If npages is zero then umem_odp->private may not be setup yet. This + * does not complete until after the first page is mapped for DMA. + */ + if (!umem_odp->npages) + goto out; mr = umem_odp->private; - if (!mr || !mr->ibmr.pd) - return; - start = max_t(u64, ib_umem_start(umem_odp), start); end = min_t(u64, ib_umem_end(umem_odp), end); @@ -277,7 +270,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, * overwrite the same MTTs. 
Concurent invalidations might race us, * but they will write 0s as well, so no difference in the end result. */ - mutex_lock(&umem_odp->umem_mutex); for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) { idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; /* @@ -323,14 +315,9 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, ib_umem_odp_unmap_dma_pages(umem_odp, start, end); - - if (unlikely(!umem_odp->npages && mr->parent && - !umem_odp->dying)) { - WRITE_ONCE(mr->live, 0); - umem_odp->dying = 1; - atomic_inc(&mr->parent->num_leaf_free); - schedule_work(&umem_odp->work); - } + if (unlikely(!umem_odp->npages && mr->parent)) + destroy_unused_implicit_child_mr(mr); +out: mutex_unlock(&umem_odp->umem_mutex); } @@ -422,239 +409,213 @@ static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev, wq_num, err); } -static struct mlx5_ib_mr *implicit_mr_alloc(struct ib_pd *pd, - struct ib_umem_odp *umem_odp, - bool ksm, int access_flags) +static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr, + unsigned long idx) { - struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct ib_umem_odp *odp; struct mlx5_ib_mr *mr; + struct mlx5_ib_mr *ret; int err; - mr = mlx5_mr_cache_alloc(dev, ksm ? MLX5_IMR_KSM_CACHE_ENTRY : - MLX5_IMR_MTT_CACHE_ENTRY); + odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem), + idx * MLX5_IMR_MTT_SIZE, + MLX5_IMR_MTT_SIZE); + if (IS_ERR(odp)) + return ERR_CAST(odp); + ret = mr = mlx5_mr_cache_alloc(imr->dev, MLX5_IMR_MTT_CACHE_ENTRY); if (IS_ERR(mr)) - return mr; - - mr->ibmr.pd = pd; - - mr->dev = dev; - mr->access_flags = access_flags; - mr->mmkey.iova = 0; - mr->umem = &umem_odp->umem; - - if (ksm) { - err = mlx5_ib_update_xlt(mr, 0, - mlx5_imr_ksm_entries, - MLX5_KSM_PAGE_SHIFT, - MLX5_IB_UPD_XLT_INDIRECT | - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE); - - } else { - err = mlx5_ib_update_xlt(mr, 0, - MLX5_IMR_MTT_ENTRIES, - PAGE_SHIFT, - MLX5_IB_UPD_XLT_ZAP | - MLX5_IB_UPD_XLT_ENABLE | - MLX5_IB_UPD_XLT_ATOMIC); - } - - if (err) - goto fail; + goto out_umem; + mr->ibmr.pd = imr->ibmr.pd; + mr->access_flags = imr->access_flags; + mr->umem = &odp->umem; mr->ibmr.lkey = mr->mmkey.key; mr->ibmr.rkey = mr->mmkey.key; - - mlx5_ib_dbg(dev, "key %x dev %p mr %p\n", - mr->mmkey.key, dev->mdev, mr); - - return mr; - -fail: - mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); - mlx5_mr_cache_free(dev, mr); - - return ERR_PTR(err); -} - -static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, - u64 io_virt, size_t bcnt) -{ - struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); - struct ib_umem_odp *odp, *result = NULL; - struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); - u64 addr = io_virt & MLX5_IMR_MTT_MASK; - int nentries = 0, start_idx = 0, ret; - struct mlx5_ib_mr *mtt; - - mutex_lock(&odp_mr->umem_mutex); - odp = odp_lookup(addr, 1, mr); - - mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", - io_virt, bcnt, addr, odp); - -next_mr: - if (likely(odp)) { - if (nentries) - nentries++; - } else { - odp = ib_umem_odp_alloc_child(odp_mr, addr, MLX5_IMR_MTT_SIZE); - if (IS_ERR(odp)) { - mutex_unlock(&odp_mr->umem_mutex); - return ERR_CAST(odp); - } - - mtt = implicit_mr_alloc(mr->ibmr.pd, odp, 0, - mr->access_flags); - if (IS_ERR(mtt)) { - mutex_unlock(&odp_mr->umem_mutex); - ib_umem_odp_release(odp); - return ERR_CAST(mtt); - } - - odp->private = mtt; - mtt->umem = &odp->umem; - mtt->mmkey.iova = addr; - mtt->parent = mr; - INIT_WORK(&odp->work, mr_leaf_free_action); 
- - smp_store_release(&mtt->live, 1); - - if (!nentries) - start_idx = addr >> MLX5_IMR_MTT_SHIFT; - nentries++; - } - - /* Return first odp if region not covered by single one */ - if (likely(!result)) - result = odp; - - addr += MLX5_IMR_MTT_SIZE; - if (unlikely(addr < io_virt + bcnt)) { - odp = odp_next(odp); - if (odp && ib_umem_start(odp) != addr) - odp = NULL; - goto next_mr; + mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE; + mr->parent = imr; + odp->private = mr; + + err = mlx5_ib_update_xlt(mr, 0, + MLX5_IMR_MTT_ENTRIES, + PAGE_SHIFT, + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + if (err) { + ret = ERR_PTR(err); + goto out_mr; } - if (unlikely(nentries)) { - ret = mlx5_ib_update_xlt(mr, start_idx, nentries, 0, - MLX5_IB_UPD_XLT_INDIRECT | - MLX5_IB_UPD_XLT_ATOMIC); - if (ret) { - mlx5_ib_err(dev, "Failed to update PAS\n"); - result = ERR_PTR(ret); + /* + * Once the store to either xarray completes any error unwind has to + * use synchronize_srcu(). Avoid this with xa_reserve() + */ + ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr, + GFP_KERNEL); + if (unlikely(ret)) { + if (xa_is_err(ret)) { + ret = ERR_PTR(xa_err(ret)); + goto out_mr; } + /* + * Another thread beat us to creating the child mr, use + * theirs. + */ + goto out_mr; } - mutex_unlock(&odp_mr->umem_mutex); - return result; + mlx5_ib_dbg(imr->dev, "key %x mr %p\n", mr->mmkey.key, mr); + return mr; + +out_mr: + mlx5_mr_cache_free(imr->dev, mr); +out_umem: + ib_umem_odp_release(odp); + return ret; } struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, struct ib_udata *udata, int access_flags) { - struct mlx5_ib_mr *imr; + struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device); struct ib_umem_odp *umem_odp; + struct mlx5_ib_mr *imr; + int err; umem_odp = ib_umem_odp_alloc_implicit(udata, access_flags); if (IS_ERR(umem_odp)) return ERR_CAST(umem_odp); - imr = implicit_mr_alloc(&pd->ibpd, umem_odp, 1, access_flags); + imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY); if (IS_ERR(imr)) { - ib_umem_odp_release(umem_odp); - return ERR_CAST(imr); + err = PTR_ERR(imr); + goto out_umem; } + imr->ibmr.pd = &pd->ibpd; + imr->access_flags = access_flags; + imr->mmkey.iova = 0; + imr->umem = &umem_odp->umem; + imr->ibmr.lkey = imr->mmkey.key; + imr->ibmr.rkey = imr->mmkey.key; imr->umem = &umem_odp->umem; - init_waitqueue_head(&imr->q_leaf_free); - atomic_set(&imr->num_leaf_free, 0); - atomic_set(&imr->num_pending_prefetch, 0); - smp_store_release(&imr->live, 1); - imr->is_odp_implicit = true; + atomic_set(&imr->num_deferred_work, 0); + xa_init(&imr->implicit_children); + + err = mlx5_ib_update_xlt(imr, 0, + mlx5_imr_ksm_entries, + MLX5_KSM_PAGE_SHIFT, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ZAP | + MLX5_IB_UPD_XLT_ENABLE); + if (err) + goto out_mr; + + err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key), + &imr->mmkey, GFP_KERNEL)); + if (err) + goto out_mr; + mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr); return imr; +out_mr: + mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); + mlx5_mr_cache_free(dev, imr); +out_umem: + ib_umem_odp_release(umem_odp); + return ERR_PTR(err); } void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); - struct rb_node *node; + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); + struct mlx5_ib_dev *dev = imr->dev; + struct list_head destroy_list; + struct mlx5_ib_mr *mtt; + struct mlx5_ib_mr *tmp; + unsigned long idx; - down_read(&per_mm->umem_rwsem); - for (node = 
rb_first_cached(&per_mm->umem_tree); node; - node = rb_next(node)) { - struct ib_umem_odp *umem_odp = - rb_entry(node, struct ib_umem_odp, interval_tree.rb); - struct mlx5_ib_mr *mr = umem_odp->private; + INIT_LIST_HEAD(&destroy_list); - if (mr->parent != imr) - continue; + xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key)); + /* + * This stops the SRCU protected page fault path from touching either + * the imr or any children. The page fault path can only reach the + * children xarray via the imr. + */ + synchronize_srcu(&dev->odp_srcu); - mutex_lock(&umem_odp->umem_mutex); - ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), - ib_umem_end(umem_odp)); + xa_lock(&imr->implicit_children); + xa_for_each (&imr->implicit_children, idx, mtt) { + __xa_erase(&imr->implicit_children, idx); + list_add(&mtt->odp_destroy.elm, &destroy_list); + } + xa_unlock(&imr->implicit_children); - if (umem_odp->dying) { - mutex_unlock(&umem_odp->umem_mutex); - continue; - } + /* + * num_deferred_work can only be incremented inside the odp_srcu, or + * under xa_lock while the child is in the xarray. Thus at this point + * it is only decreasing, and all work holding it is now on the wq. + */ + if (atomic_read(&imr->num_deferred_work)) { + flush_workqueue(system_unbound_wq); + WARN_ON(atomic_read(&imr->num_deferred_work)); + } + + /* + * Fence the imr before we destroy the children. This allows us to + * skip updating the XLT of the imr during destroy of the child mkey + * the imr points to. + */ + mlx5_mr_cache_invalidate(imr); + + list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm) + free_implicit_child_mr(mtt, false); + + mlx5_mr_cache_free(dev, imr); + ib_umem_odp_release(odp_imr); +} + +/** + * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR + * @mr: to fence + * + * On return no parallel threads will be touching this MR and no DMA will be + * active. + */ +void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr) +{ + /* Prevent new page faults and prefetch requests from succeeding */ + xa_erase(&mr->dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); + + /* Wait for all running page-fault handlers to finish. 
*/ + synchronize_srcu(&mr->dev->odp_srcu); - umem_odp->dying = 1; - atomic_inc(&imr->num_leaf_free); - schedule_work(&umem_odp->work); - mutex_unlock(&umem_odp->umem_mutex); + if (atomic_read(&mr->num_deferred_work)) { + flush_workqueue(system_unbound_wq); + WARN_ON(atomic_read(&mr->num_deferred_work)); } - up_read(&per_mm->umem_rwsem); - wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); + dma_fence_odp_mr(mr); } -#define MLX5_PF_FLAGS_PREFETCH BIT(0) #define MLX5_PF_FLAGS_DOWNGRADE BIT(1) -static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, - u64 io_virt, size_t bcnt, u32 *bytes_mapped, - u32 flags) +static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, + u64 user_va, size_t bcnt, u32 *bytes_mapped, + u32 flags) { - int npages = 0, current_seq, page_shift, ret, np; - struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); + int current_seq, page_shift, ret, np; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; - bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; u64 access_mask; u64 start_idx, page_mask; - struct ib_umem_odp *odp; - size_t size; - - if (odp_mr->is_implicit_odp) { - odp = implicit_mr_get_data(mr, io_virt, bcnt); - - if (IS_ERR(odp)) - return PTR_ERR(odp); - mr = odp->private; - } else { - odp = odp_mr; - } - -next_mr: - size = min_t(size_t, bcnt, ib_umem_end(odp) - io_virt); page_shift = odp->page_shift; page_mask = ~(BIT(page_shift) - 1); - start_idx = (io_virt - (mr->mmkey.iova & page_mask)) >> page_shift; + start_idx = (user_va - (mr->mmkey.iova & page_mask)) >> page_shift; access_mask = ODP_READ_ALLOWED_BIT; - if (prefetch && !downgrade && !odp->umem.writable) { - /* prefetch with write-access must - * be supported by the MR - */ - ret = -EINVAL; - goto out; - } - if (odp->umem.writable && !downgrade) access_mask |= ODP_WRITE_ALLOWED_BIT; @@ -665,13 +626,10 @@ next_mr: */ smp_rmb(); - ret = ib_umem_odp_map_dma_pages(odp, io_virt, size, access_mask, - current_seq); - - if (ret < 0) - goto out; - - np = ret; + np = ib_umem_odp_map_dma_pages(odp, user_va, bcnt, access_mask, + current_seq); + if (np < 0) + return np; mutex_lock(&odp->umem_mutex); if (!ib_umem_mmu_notifier_retry(odp, current_seq)) { @@ -689,35 +647,19 @@ next_mr: if (ret < 0) { if (ret != -EAGAIN) - mlx5_ib_err(dev, "Failed to update mkey page tables\n"); + mlx5_ib_err(mr->dev, + "Failed to update mkey page tables\n"); goto out; } if (bytes_mapped) { u32 new_mappings = (np << page_shift) - - (io_virt - round_down(io_virt, 1 << page_shift)); - *bytes_mapped += min_t(u32, new_mappings, size); - } - - npages += np << (page_shift - PAGE_SHIFT); - bcnt -= size; + (user_va - round_down(user_va, 1 << page_shift)); - if (unlikely(bcnt)) { - struct ib_umem_odp *next; - - io_virt += size; - next = odp_next(odp); - if (unlikely(!next || ib_umem_start(next) != io_virt)) { - mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. got %p\n", - io_virt, next); - return -EAGAIN; - } - odp = next; - mr = odp->private; - goto next_mr; + *bytes_mapped += min_t(u32, new_mappings, bcnt); } - return npages; + return np << (page_shift - PAGE_SHIFT); out: if (ret == -EAGAIN) { @@ -726,7 +668,7 @@ out: if (!wait_for_completion_timeout(&odp->notifier_completion, timeout)) { mlx5_ib_warn( - dev, + mr->dev, "timeout waiting for mmu notifier. seq %d against %d. 
notifiers_count=%d\n", current_seq, odp->notifiers_seq, odp->notifiers_count); @@ -736,6 +678,109 @@ out: return ret; } +static int pagefault_implicit_mr(struct mlx5_ib_mr *imr, + struct ib_umem_odp *odp_imr, u64 user_va, + size_t bcnt, u32 *bytes_mapped, u32 flags) +{ + unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT; + unsigned long upd_start_idx = end_idx + 1; + unsigned long upd_len = 0; + unsigned long npages = 0; + int err; + int ret; + + if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE || + mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt)) + return -EFAULT; + + /* Fault each child mr that intersects with our interval. */ + while (bcnt) { + unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT; + struct ib_umem_odp *umem_odp; + struct mlx5_ib_mr *mtt; + u64 len; + + mtt = xa_load(&imr->implicit_children, idx); + if (unlikely(!mtt)) { + mtt = implicit_get_child_mr(imr, idx); + if (IS_ERR(mtt)) { + ret = PTR_ERR(mtt); + goto out; + } + upd_start_idx = min(upd_start_idx, idx); + upd_len = idx - upd_start_idx + 1; + } + + umem_odp = to_ib_umem_odp(mtt->umem); + len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) - + user_va; + + ret = pagefault_real_mr(mtt, umem_odp, user_va, len, + bytes_mapped, flags); + if (ret < 0) + goto out; + user_va += len; + bcnt -= len; + npages += ret; + } + + ret = npages; + + /* + * Any time the implicit_children are changed we must perform an + * update of the xlt before exiting to ensure the HW and the + * implicit_children remains synchronized. + */ +out: + if (likely(!upd_len)) + return ret; + + /* + * Notice this is not strictly ordered right, the KSM is updated after + * the implicit_children is updated, so a parallel page fault could + * see a MR that is not yet visible in the KSM. This is similar to a + * parallel page fault seeing a MR that is being concurrently removed + * from the KSM. Both of these improbable situations are resolved + * safely by resuming the HW and then taking another page fault. The + * next pagefault handler will see the new information. + */ + mutex_lock(&odp_imr->umem_mutex); + err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0, + MLX5_IB_UPD_XLT_INDIRECT | + MLX5_IB_UPD_XLT_ATOMIC); + mutex_unlock(&odp_imr->umem_mutex); + if (err) { + mlx5_ib_err(imr->dev, "Failed to update PAS\n"); + return err; + } + return ret; +} + +/* + * Returns: + * -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are + * not accessible, or the MR is no longer valid. 
+ * -EAGAIN/-ENOMEM: The operation should be retried + * + * -EINVAL/others: General internal malfunction + * >0: Number of pages mapped + */ +static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, + u32 *bytes_mapped, u32 flags) +{ + struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + + if (!odp->is_implicit_odp) { + if (unlikely(io_virt < ib_umem_start(odp) || + ib_umem_end(odp) - io_virt < bcnt)) + return -EFAULT; + return pagefault_real_mr(mr, odp, io_virt, bcnt, bytes_mapped, + flags); + } + return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped, + flags); +} + struct pf_frame { struct pf_frame *next; u32 key; @@ -783,10 +828,9 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, struct ib_pd *pd, u32 key, u64 io_virt, size_t bcnt, u32 *bytes_committed, - u32 *bytes_mapped, u32 flags) + u32 *bytes_mapped) { int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0; - bool prefetch = flags & MLX5_PF_FLAGS_PREFETCH; struct pf_frame *head = NULL, *frame; struct mlx5_core_mkey *mmkey; struct mlx5_ib_mr *mr; @@ -795,55 +839,39 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev, size_t offset; int ndescs; - srcu_key = srcu_read_lock(&dev->mr_srcu); + srcu_key = srcu_read_lock(&dev->odp_srcu); io_virt += *bytes_committed; bcnt -= *bytes_committed; next_mr: - mmkey = xa_load(&dev->mdev->priv.mkey_table, mlx5_base_mkey(key)); + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key)); + if (!mmkey) { + mlx5_ib_dbg( + dev, + "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", + key); + if (bytes_mapped) + *bytes_mapped += bcnt; + /* + * The user could specify a SGL with multiple lkeys and only + * some of them are ODP. Treat the non-ODP ones as fully + * faulted. + */ + ret = 0; + goto srcu_unlock; + } if (!mkey_is_eq(mmkey, key)) { mlx5_ib_dbg(dev, "failed to find mkey %x\n", key); ret = -EFAULT; goto srcu_unlock; } - if (prefetch && mmkey->type != MLX5_MKEY_MR) { - mlx5_ib_dbg(dev, "prefetch is allowed only for MR\n"); - ret = -EINVAL; - goto srcu_unlock; - } - switch (mmkey->type) { case MLX5_MKEY_MR: mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - if (!smp_load_acquire(&mr->live) || !mr->ibmr.pd) { - mlx5_ib_dbg(dev, "got dead MR\n"); - ret = -EFAULT; - goto srcu_unlock; - } - - if (prefetch) { - if (!is_odp_mr(mr) || - mr->ibmr.pd != pd) { - mlx5_ib_dbg(dev, "Invalid prefetch request: %s\n", - is_odp_mr(mr) ? "MR is not ODP" : - "PD is not of the MR"); - ret = -EINVAL; - goto srcu_unlock; - } - } - if (!is_odp_mr(mr)) { - mlx5_ib_dbg(dev, "skipping non ODP MR (lkey=0x%06x) in page fault handler.\n", - key); - if (bytes_mapped) - *bytes_mapped += bcnt; - ret = 0; - goto srcu_unlock; - } - - ret = pagefault_mr(dev, mr, io_virt, bcnt, bytes_mapped, flags); + ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0); if (ret < 0) goto srcu_unlock; @@ -943,7 +971,7 @@ srcu_unlock: } kfree(out); - srcu_read_unlock(&dev->mr_srcu, srcu_key); + srcu_read_unlock(&dev->odp_srcu, srcu_key); *bytes_committed = 0; return ret ? 
ret : npages; } @@ -1024,7 +1052,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, NULL, key, io_virt, bcnt, &pfault->bytes_committed, - bytes_mapped, 0); + bytes_mapped); if (ret < 0) break; npages += ret; @@ -1307,8 +1335,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, } ret = pagefault_single_data_segment(dev, NULL, rkey, address, length, - &pfault->bytes_committed, NULL, - 0); + &pfault->bytes_committed, NULL); if (ret == -EAGAIN) { /* We're racing with an invalidation, don't prefetch */ prefetch_activated = 0; @@ -1335,8 +1362,7 @@ static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev, ret = pagefault_single_data_segment(dev, NULL, rkey, address, prefetch_len, - &bytes_committed, NULL, - 0); + &bytes_committed, NULL); if (ret < 0 && ret != -EAGAIN) { mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n", ret, pfault->token, address, prefetch_len); @@ -1639,114 +1665,128 @@ int mlx5_ib_odp_init(void) struct prefetch_mr_work { struct work_struct work; - struct ib_pd *pd; u32 pf_flags; u32 num_sge; - struct ib_sge sg_list[0]; + struct { + u64 io_virt; + struct mlx5_ib_mr *mr; + size_t length; + } frags[]; }; -static void num_pending_prefetch_dec(struct mlx5_ib_dev *dev, - struct ib_sge *sg_list, u32 num_sge, - u32 from) +static void destroy_prefetch_work(struct prefetch_mr_work *work) { u32 i; - int srcu_key; - - srcu_key = srcu_read_lock(&dev->mr_srcu); - - for (i = from; i < num_sge; ++i) { - struct mlx5_core_mkey *mmkey; - struct mlx5_ib_mr *mr; - - mmkey = xa_load(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(sg_list[i].lkey)); - mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - atomic_dec(&mr->num_pending_prefetch); - } - srcu_read_unlock(&dev->mr_srcu, srcu_key); + for (i = 0; i < work->num_sge; ++i) + atomic_dec(&work->frags[i].mr->num_deferred_work); + kvfree(work); } -static bool num_pending_prefetch_inc(struct ib_pd *pd, - struct ib_sge *sg_list, u32 num_sge) +static struct mlx5_ib_mr * +get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, + u32 lkey) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - bool ret = true; - u32 i; + struct mlx5_core_mkey *mmkey; + struct ib_umem_odp *odp; + struct mlx5_ib_mr *mr; - for (i = 0; i < num_sge; ++i) { - struct mlx5_core_mkey *mmkey; - struct mlx5_ib_mr *mr; + lockdep_assert_held(&dev->odp_srcu); - mmkey = xa_load(&dev->mdev->priv.mkey_table, - mlx5_base_mkey(sg_list[i].lkey)); - if (!mmkey || mmkey->key != sg_list[i].lkey) { - ret = false; - break; - } + mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey)); + if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR) + return NULL; - if (mmkey->type != MLX5_MKEY_MR) { - ret = false; - break; - } + mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); - mr = container_of(mmkey, struct mlx5_ib_mr, mmkey); + if (mr->ibmr.pd != pd) + return NULL; - if (!smp_load_acquire(&mr->live)) { - ret = false; - break; - } + odp = to_ib_umem_odp(mr->umem); - if (mr->ibmr.pd != pd) { - ret = false; - break; - } + /* prefetch with write-access must be supported by the MR */ + if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && + !odp->umem.writable) + return NULL; - atomic_inc(&mr->num_pending_prefetch); - } + return mr; +} - if (!ret) - num_pending_prefetch_dec(dev, sg_list, i, 0); +static void mlx5_ib_prefetch_mr_work(struct work_struct *w) +{ + struct prefetch_mr_work *work = + container_of(w, struct prefetch_mr_work, work); + 
u32 bytes_mapped = 0; + u32 i; - return ret; + for (i = 0; i < work->num_sge; ++i) + pagefault_mr(work->frags[i].mr, work->frags[i].io_virt, + work->frags[i].length, &bytes_mapped, + work->pf_flags); + + destroy_prefetch_work(work); } -static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, u32 pf_flags, - struct ib_sge *sg_list, u32 num_sge) +static bool init_prefetch_work(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct prefetch_mr_work *work, + struct ib_sge *sg_list, u32 num_sge) { u32 i; - int ret = 0; - struct mlx5_ib_dev *dev = to_mdev(pd->device); + + INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); + work->pf_flags = pf_flags; for (i = 0; i < num_sge; ++i) { - struct ib_sge *sg = &sg_list[i]; - int bytes_committed = 0; + work->frags[i].io_virt = sg_list[i].addr; + work->frags[i].length = sg_list[i].length; + work->frags[i].mr = + get_prefetchable_mr(pd, advice, sg_list[i].lkey); + if (!work->frags[i].mr) { + work->num_sge = i - 1; + if (i) + destroy_prefetch_work(work); + return false; + } - ret = pagefault_single_data_segment(dev, pd, sg->lkey, sg->addr, - sg->length, - &bytes_committed, NULL, - pf_flags); - if (ret < 0) - break; + /* Keep the MR pointer will valid outside the SRCU */ + atomic_inc(&work->frags[i].mr->num_deferred_work); } - - return ret < 0 ? ret : 0; + work->num_sge = num_sge; + return true; } -static void mlx5_ib_prefetch_mr_work(struct work_struct *work) +static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd, + enum ib_uverbs_advise_mr_advice advice, + u32 pf_flags, struct ib_sge *sg_list, + u32 num_sge) { - struct prefetch_mr_work *w = - container_of(work, struct prefetch_mr_work, work); + struct mlx5_ib_dev *dev = to_mdev(pd->device); + u32 bytes_mapped = 0; + int srcu_key; + int ret = 0; + u32 i; + + srcu_key = srcu_read_lock(&dev->odp_srcu); + for (i = 0; i < num_sge; ++i) { + struct mlx5_ib_mr *mr; - if (ib_device_try_get(w->pd->device)) { - mlx5_ib_prefetch_sg_list(w->pd, w->pf_flags, w->sg_list, - w->num_sge); - ib_device_put(w->pd->device); + mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey); + if (!mr) { + ret = -ENOENT; + goto out; + } + ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length, + &bytes_mapped, pf_flags); + if (ret < 0) + goto out; } + ret = 0; - num_pending_prefetch_dec(to_mdev(w->pd->device), w->sg_list, - w->num_sge, 0); - kvfree(w); +out: + srcu_read_unlock(&dev->odp_srcu, srcu_key); + return ret; } int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, @@ -1754,43 +1794,27 @@ int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd, u32 flags, struct ib_sge *sg_list, u32 num_sge) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - u32 pf_flags = MLX5_PF_FLAGS_PREFETCH; + u32 pf_flags = 0; struct prefetch_mr_work *work; - bool valid_req; int srcu_key; if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH) pf_flags |= MLX5_PF_FLAGS_DOWNGRADE; if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH) - return mlx5_ib_prefetch_sg_list(pd, pf_flags, sg_list, + return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list, num_sge); - work = kvzalloc(struct_size(work, sg_list, num_sge), GFP_KERNEL); + work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL); if (!work) return -ENOMEM; - memcpy(work->sg_list, sg_list, num_sge * sizeof(struct ib_sge)); - - /* It is guaranteed that the pd when work is executed is the pd when - * work was queued since pd can't be destroyed while it holds MRs and - * destroying a MR leads to flushing the workquque - */ - work->pd = pd; - work->pf_flags = pf_flags; - work->num_sge = num_sge; - - 
INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work); - - srcu_key = srcu_read_lock(&dev->mr_srcu); - - valid_req = num_pending_prefetch_inc(pd, sg_list, num_sge); - if (valid_req) - queue_work(system_unbound_wq, &work->work); - else - kvfree(work); - - srcu_read_unlock(&dev->mr_srcu, srcu_key); - - return valid_req ? 0 : -EINVAL; + srcu_key = srcu_read_lock(&dev->odp_srcu); + if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) { + srcu_read_unlock(&dev->odp_srcu, srcu_key); + return -EINVAL; + } + queue_work(system_unbound_wq, &work->work); + srcu_read_unlock(&dev->odp_srcu, srcu_key); + return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index e47dd7c1b909..d10051f39b9c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -837,8 +837,6 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) mlx5_init_qp_table(dev); - mlx5_init_mkey_table(dev); - mlx5_init_reserved_gids(dev); mlx5_init_clock(dev); @@ -896,7 +894,6 @@ err_rl_cleanup: err_tables_cleanup: mlx5_geneve_destroy(dev->geneve); mlx5_vxlan_destroy(dev->vxlan); - mlx5_cleanup_mkey_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_events_cleanup(dev); @@ -924,7 +921,6 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_vxlan_destroy(dev->vxlan); mlx5_cleanup_clock(dev); mlx5_cleanup_reserved_gids(dev); - mlx5_cleanup_mkey_table(dev); mlx5_cleanup_qp_table(dev); mlx5_cq_debugfs_cleanup(dev); mlx5_events_cleanup(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index c501bf2a0252..42cc3c7ac5b6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -36,16 +36,6 @@ #include <linux/mlx5/cmd.h> #include "mlx5_core.h" -void mlx5_init_mkey_table(struct mlx5_core_dev *dev) -{ - xa_init_flags(&dev->priv.mkey_table, XA_FLAGS_LOCK_IRQ); -} - -void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev) -{ - WARN_ON(!xa_empty(&dev->priv.mkey_table)); -} - int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_async_ctx *async_ctx, u32 *in, @@ -54,7 +44,6 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, struct mlx5_async_work *context) { u32 lout[MLX5_ST_SZ_DW(create_mkey_out)] = {0}; - struct xarray *mkeys = &dev->priv.mkey_table; u32 mkey_index; void *mkc; int err; @@ -84,16 +73,7 @@ int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", mkey_index, key, mkey->key); - - err = xa_err(xa_store_irq(mkeys, mlx5_base_mkey(mkey->key), mkey, - GFP_KERNEL)); - if (err) { - mlx5_core_warn(dev, "failed xarray insert of mkey 0x%x, %d\n", - mlx5_base_mkey(mkey->key), err); - mlx5_core_destroy_mkey(dev, mkey); - } - - return err; + return 0; } EXPORT_SYMBOL(mlx5_core_create_mkey_cb); @@ -111,12 +91,6 @@ int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, { u32 out[MLX5_ST_SZ_DW(destroy_mkey_out)] = {0}; u32 in[MLX5_ST_SZ_DW(destroy_mkey_in)] = {0}; - struct xarray *mkeys = &dev->priv.mkey_table; - unsigned long flags; - - xa_lock_irqsave(mkeys, flags); - __xa_erase(mkeys, mlx5_base_mkey(mkey->key)); - xa_unlock_irqrestore(mkeys, flags); MLX5_SET(destroy_mkey_in, in, opcode, MLX5_CMD_OP_DESTROY_MKEY); MLX5_SET(destroy_mkey_in, in, mkey_index, mlx5_mkey_to_idx(mkey->key)); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 
3e80f03a387f..8288b62b8f37 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -556,8 +556,6 @@ struct mlx5_priv { struct dentry *cmdif_debugfs; /* end: qp staff */ - struct xarray mkey_table; - /* start: alloc staff */ /* protect buffer alocation according to numa node */ struct mutex alloc_mutex; @@ -942,8 +940,6 @@ struct mlx5_cmd_mailbox *mlx5_alloc_cmd_mailbox_chain(struct mlx5_core_dev *dev, gfp_t flags, int npages); void mlx5_free_cmd_mailbox_chain(struct mlx5_core_dev *dev, struct mlx5_cmd_mailbox *head); -void mlx5_init_mkey_table(struct mlx5_core_dev *dev); -void mlx5_cleanup_mkey_table(struct mlx5_core_dev *dev); int mlx5_core_create_mkey_cb(struct mlx5_core_dev *dev, struct mlx5_core_mkey *mkey, struct mlx5_async_ctx *async_ctx, u32 *in, diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 253df1a1fa54..09b0e4494986 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -78,9 +78,7 @@ struct ib_umem_odp { bool is_implicit_odp; struct completion notifier_completion; - int dying; unsigned int page_shift; - struct work_struct work; }; static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) @@ -156,22 +154,6 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, umem_call_back cb, bool blockable, void *cookie); -/* - * Find first region intersecting with address range. - * Return NULL if not found - */ -static inline struct ib_umem_odp * -rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length) -{ - struct interval_tree_node *node; - - node = interval_tree_iter_first(root, addr, addr + length - 1); - if (!node) - return NULL; - return container_of(node, struct ib_umem_odp, interval_tree); - -} - static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) { |