author     Linus Torvalds <torvalds@linux-foundation.org>  2023-02-25 00:11:03 +0100
committer  Linus Torvalds <torvalds@linux-foundation.org>  2023-02-25 00:11:03 +0100
commit     8cbd92339db08b19b93d1637e5799ff2a8dddfd2 (patch)
tree       7e62d961f32e8a2a96271029b376f1e8bbd70a7c /drivers/infiniband/sw
parent     Merge tag 'for-linus-iommufd' of git://git.kernel.org/pub/scm/linux/kernel/gi... (diff)
parent     IB/mlx5: Extend debug control for CC parameters (diff)
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
Pull rdma updates from Jason Gunthorpe:
"Quite a small cycle this time, even with the rc8. I suppose everyone
went to sleep over xmas.
- Minor driver updates for hfi1, cxgb4, erdma, hns, irdma, mlx5, siw,
mana
- inline CQE support for hns
- Have mlx5 display device error codes
- Pinned DMABUF support for irdma
- Continued rxe cleanups, particularly converting the MRs to use
xarray
- Improvements to what can be cached in the mlx5 mkey cache"
* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (61 commits)
IB/mlx5: Extend debug control for CC parameters
IB/hfi1: Fix sdma.h tx->num_descs off-by-one errors
IB/hfi1: Fix math bugs in hfi1_can_pin_pages()
RDMA/irdma: Add support for dmabuf pin memory regions
RDMA/mlx5: Use query_special_contexts for mkeys
net/mlx5e: Use query_special_contexts for mkeys
net/mlx5: Change define name for 0x100 lkey value
net/mlx5: Expose bits for querying special mkeys
RDMA/rxe: Fix missing memory barriers in rxe_queue.h
RDMA/mana_ib: Fix a bug when the PF indicates more entries for registering memory on first packet
RDMA/rxe: Remove rxe_alloc()
RDMA/cma: Distinguish between sockaddr_in and sockaddr_in6 by size
Subject: RDMA/rxe: Handle zero length rdma
iw_cxgb4: Fix potential NULL dereference in c4iw_fill_res_cm_id_entry()
RDMA/mlx5: Use rdma_umem_for_each_dma_block()
RDMA/umem: Remove unused 'work' member from struct ib_umem
RDMA/irdma: Cap MSIX used to online CPUs + 1
RDMA/mlx5: Check reg_create() create for errors
RDMA/restrack: Correct spelling
RDMA/cxgb4: Fix potential null-ptr-deref in pass_establish()
...
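
As background for the rxe hunks below: the "converting the MRs to use xarray" item in the summary above refers to replacing the old two-level map/buf tables in rxe_mr with an xarray of struct page pointers, indexed by page number relative to the MR's starting iova. The following is a minimal, illustrative sketch of that lookup pattern only; the demo_* names are hypothetical and are not the merged code.

#include <linux/xarray.h>
#include <linux/mm.h>

/* Hypothetical cut-down MR: pages live in an xarray keyed by the page
 * index of the iova relative to the start of the registered region.
 */
struct demo_mr {
	struct xarray page_list;
	u64 iova;			/* start of the registered range */
	unsigned int page_shift;	/* log2 of the MR page size */
};

static void demo_mr_init(struct demo_mr *mr)
{
	xa_init(&mr->page_list);
}

static unsigned long demo_iova_to_index(struct demo_mr *mr, u64 iova)
{
	return (iova >> mr->page_shift) - (mr->iova >> mr->page_shift);
}

/* Store a page; xa_store() may return an error-encoded pointer. */
static int demo_store_page(struct demo_mr *mr, unsigned long index,
			   struct page *page)
{
	return xa_err(xa_store(&mr->page_list, index, page, GFP_KERNEL));
}

/* Translate an iova to its backing page with a single xa_load(). */
static struct page *demo_lookup_page(struct demo_mr *mr, u64 iova)
{
	return xa_load(&mr->page_list, demo_iova_to_index(mr, iova));
}

The per-page offset is then just iova & (page_size - 1), which is how the new rxe_mr_copy_xarray() and the atomic helpers in the diff locate data inside the returned page.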
Diffstat (limited to 'drivers/infiniband/sw')
-rw-r--r--  drivers/infiniband/sw/rxe/rxe.h       |  38
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_loc.h   |  12
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_mr.c    | 606
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_pool.c  |  46
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_pool.h  |   3
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_queue.h | 108
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_resp.c  | 202
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_verbs.c | 115
-rw-r--r--  drivers/infiniband/sw/rxe/rxe_verbs.h |  32
-rw-r--r--  drivers/infiniband/sw/siw/siw_mem.c   |  23
10 files changed, 600 insertions, 585 deletions
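
The largest hunk below rewrites rxe_mr.c; among other things it moves atomic-op emulation into rxe_mr_do_atomic_op(), which maps the target page with kmap_local_page() and serializes all emulated atomics on a single spinlock. A simplified sketch of that pattern follows, assuming the page has already been looked up and page_offset is 8-byte aligned; the demo_* names are illustrative, not the merged code.

#include <linux/highmem.h>
#include <linux/spinlock.h>

/* One global lock serializes all emulated atomics, as in the driver. */
static DEFINE_SPINLOCK(demo_atomic_lock);

/* Emulate compare-and-swap or fetch-and-add on a u64 located at
 * @page_offset inside @page; the original value is returned via @orig.
 */
static void demo_atomic_op(struct page *page, unsigned int page_offset,
			   bool cmp_swap, u64 compare, u64 swap_add,
			   u64 *orig)
{
	u64 *va = kmap_local_page(page);	/* short-lived CPU-local map */

	spin_lock_bh(&demo_atomic_lock);
	*orig = va[page_offset >> 3];
	if (cmp_swap) {
		if (*orig == compare)
			va[page_offset >> 3] = swap_add;
	} else {
		va[page_offset >> 3] = *orig + swap_add;
	}
	spin_unlock_bh(&demo_atomic_lock);

	kunmap_local(va);
}

The atomic-write path in the diff uses the same page lookup but publishes the 64-bit value with smp_store_release() instead of taking the lock.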
diff --git a/drivers/infiniband/sw/rxe/rxe.h b/drivers/infiniband/sw/rxe/rxe.h index ab334900fcc3..2415f3704f57 100644 --- a/drivers/infiniband/sw/rxe/rxe.h +++ b/drivers/infiniband/sw/rxe/rxe.h @@ -57,6 +57,44 @@ #define rxe_dbg_mw(mw, fmt, ...) ibdev_dbg((mw)->ibmw.device, \ "mw#%d %s: " fmt, (mw)->elem.index, __func__, ##__VA_ARGS__) +/* responder states */ +enum resp_states { + RESPST_NONE, + RESPST_GET_REQ, + RESPST_CHK_PSN, + RESPST_CHK_OP_SEQ, + RESPST_CHK_OP_VALID, + RESPST_CHK_RESOURCE, + RESPST_CHK_LENGTH, + RESPST_CHK_RKEY, + RESPST_EXECUTE, + RESPST_READ_REPLY, + RESPST_ATOMIC_REPLY, + RESPST_ATOMIC_WRITE_REPLY, + RESPST_PROCESS_FLUSH, + RESPST_COMPLETE, + RESPST_ACKNOWLEDGE, + RESPST_CLEANUP, + RESPST_DUPLICATE_REQUEST, + RESPST_ERR_MALFORMED_WQE, + RESPST_ERR_UNSUPPORTED_OPCODE, + RESPST_ERR_MISALIGNED_ATOMIC, + RESPST_ERR_PSN_OUT_OF_SEQ, + RESPST_ERR_MISSING_OPCODE_FIRST, + RESPST_ERR_MISSING_OPCODE_LAST_C, + RESPST_ERR_MISSING_OPCODE_LAST_D1E, + RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, + RESPST_ERR_RNR, + RESPST_ERR_RKEY_VIOLATION, + RESPST_ERR_INVALIDATE_RKEY, + RESPST_ERR_LENGTH, + RESPST_ERR_CQ_OVERFLOW, + RESPST_ERROR, + RESPST_RESET, + RESPST_DONE, + RESPST_EXIT, +}; + void rxe_set_mtu(struct rxe_dev *rxe, unsigned int dev_mtu); int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name); diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 948ce4902b10..1bb0cb479eb1 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -64,12 +64,16 @@ void rxe_mr_init_dma(int access, struct rxe_mr *mr); int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr); int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr); -int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length); -int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, - enum rxe_mr_copy_dir dir); +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length); +int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir); int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma, void *addr, int length, enum rxe_mr_copy_dir dir); -void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length); +int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, + int sg_nents, unsigned int *sg_offset); +int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val); +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value); struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type); int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length); diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index 072eac4b65d2..b10aa1580a64 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -26,22 +26,22 @@ u8 rxe_get_next_key(u32 last_key) int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length) { - - switch (mr->ibmr.type) { case IB_MR_TYPE_DMA: return 0; case IB_MR_TYPE_USER: case IB_MR_TYPE_MEM_REG: - if (iova < mr->ibmr.iova || length > mr->ibmr.length || - iova > mr->ibmr.iova + mr->ibmr.length - length) - return -EFAULT; + if (iova < mr->ibmr.iova || + iova + length > mr->ibmr.iova + mr->ibmr.length) { + rxe_dbg_mr(mr, "iova/length out of range"); + return -EINVAL; + } return 0; default: - rxe_dbg_mr(mr, "type (%d) not supported\n", 
mr->ibmr.type); - return -EFAULT; + rxe_dbg_mr(mr, "mr type not supported\n"); + return -EINVAL; } } @@ -62,57 +62,31 @@ static void rxe_mr_init(int access, struct rxe_mr *mr) mr->lkey = mr->ibmr.lkey = lkey; mr->rkey = mr->ibmr.rkey = rkey; + mr->access = access; + mr->ibmr.page_size = PAGE_SIZE; + mr->page_mask = PAGE_MASK; + mr->page_shift = PAGE_SHIFT; mr->state = RXE_MR_STATE_INVALID; } -static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) -{ - int i; - int num_map; - struct rxe_map **map = mr->map; - - num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP; - - mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL); - if (!mr->map) - goto err1; - - for (i = 0; i < num_map; i++) { - mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL); - if (!mr->map[i]) - goto err2; - } - - BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP)); - - mr->map_shift = ilog2(RXE_BUF_PER_MAP); - mr->map_mask = RXE_BUF_PER_MAP - 1; - - mr->num_buf = num_buf; - mr->num_map = num_map; - mr->max_buf = num_map * RXE_BUF_PER_MAP; - - return 0; - -err2: - for (i--; i >= 0; i--) - kfree(mr->map[i]); - - kfree(mr->map); - mr->map = NULL; -err1: - return -ENOMEM; -} - void rxe_mr_init_dma(int access, struct rxe_mr *mr) { rxe_mr_init(access, mr); - mr->access = access; mr->state = RXE_MR_STATE_VALID; mr->ibmr.type = IB_MR_TYPE_DMA; } +static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) +{ + return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); +} + +static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) +{ + return iova & (mr_page_size(mr) - 1); +} + static bool is_pmem_page(struct page *pg) { unsigned long paddr = page_to_phys(pg); @@ -122,86 +96,98 @@ static bool is_pmem_page(struct page *pg) IORES_DESC_PERSISTENT_MEMORY); } +static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) +{ + XA_STATE(xas, &mr->page_list, 0); + struct sg_page_iter sg_iter; + struct page *page; + bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); + + __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); + if (!__sg_page_iter_next(&sg_iter)) + return 0; + + do { + xas_lock(&xas); + while (true) { + page = sg_page_iter_page(&sg_iter); + + if (persistent && !is_pmem_page(page)) { + rxe_dbg_mr(mr, "Page can't be persistent\n"); + xas_set_err(&xas, -EINVAL); + break; + } + + xas_store(&xas, page); + if (xas_error(&xas)) + break; + xas_next(&xas); + if (!__sg_page_iter_next(&sg_iter)) + break; + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + return xas_error(&xas); +} + int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova, int access, struct rxe_mr *mr) { - struct rxe_map **map; - struct rxe_phys_buf *buf = NULL; - struct ib_umem *umem; - struct sg_page_iter sg_iter; - int num_buf; - void *vaddr; + struct ib_umem *umem; int err; + rxe_mr_init(access, mr); + + xa_init(&mr->page_list); + umem = ib_umem_get(&rxe->ib_dev, start, length, access); if (IS_ERR(umem)) { rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", (int)PTR_ERR(umem)); - err = PTR_ERR(umem); - goto err_out; + return PTR_ERR(umem); } - num_buf = ib_umem_num_pages(umem); - - rxe_mr_init(access, mr); - - err = rxe_mr_alloc(mr, num_buf); + err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); if (err) { - rxe_dbg_mr(mr, "Unable to allocate memory for map\n"); - goto err_release_umem; + ib_umem_release(umem); + return err; } - mr->page_shift = PAGE_SHIFT; - mr->page_mask = PAGE_SIZE - 1; - - num_buf = 0; - map = mr->map; - if (length > 0) 
{ - bool persistent_access = access & IB_ACCESS_FLUSH_PERSISTENT; - - buf = map[0]->buf; - for_each_sgtable_page (&umem->sgt_append.sgt, &sg_iter, 0) { - struct page *pg = sg_page_iter_page(&sg_iter); + mr->umem = umem; + mr->ibmr.type = IB_MR_TYPE_USER; + mr->state = RXE_MR_STATE_VALID; - if (persistent_access && !is_pmem_page(pg)) { - rxe_dbg_mr(mr, "Unable to register persistent access to non-pmem device\n"); - err = -EINVAL; - goto err_release_umem; - } + return 0; +} - if (num_buf >= RXE_BUF_PER_MAP) { - map++; - buf = map[0]->buf; - num_buf = 0; - } +static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) +{ + XA_STATE(xas, &mr->page_list, 0); + int i = 0; + int err; - vaddr = page_address(pg); - if (!vaddr) { - rxe_dbg_mr(mr, "Unable to get virtual address\n"); - err = -ENOMEM; - goto err_release_umem; - } - buf->addr = (uintptr_t)vaddr; - buf->size = PAGE_SIZE; - num_buf++; - buf++; + xa_init(&mr->page_list); + do { + xas_lock(&xas); + while (i != num_buf) { + xas_store(&xas, XA_ZERO_ENTRY); + if (xas_error(&xas)) + break; + xas_next(&xas); + i++; } - } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); - mr->umem = umem; - mr->access = access; - mr->offset = ib_umem_offset(umem); - mr->state = RXE_MR_STATE_VALID; - mr->ibmr.type = IB_MR_TYPE_USER; - mr->ibmr.page_size = PAGE_SIZE; + err = xas_error(&xas); + if (err) + return err; - return 0; + mr->num_buf = num_buf; -err_release_umem: - ib_umem_release(umem); -err_out: - return err; + return 0; } int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) @@ -215,7 +201,6 @@ int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) if (err) goto err1; - mr->max_buf = max_pages; mr->state = RXE_MR_STATE_FREE; mr->ibmr.type = IB_MR_TYPE_MEM_REG; @@ -225,187 +210,125 @@ err1: return err; } -static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out, - size_t *offset_out) +static int rxe_set_page(struct ib_mr *ibmr, u64 iova) { - size_t offset = iova - mr->ibmr.iova + mr->offset; - int map_index; - int buf_index; - u64 length; - - if (likely(mr->page_shift)) { - *offset_out = offset & mr->page_mask; - offset >>= mr->page_shift; - *n_out = offset & mr->map_mask; - *m_out = offset >> mr->map_shift; - } else { - map_index = 0; - buf_index = 0; + struct rxe_mr *mr = to_rmr(ibmr); + struct page *page = virt_to_page(iova & mr->page_mask); + bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); + int err; - length = mr->map[map_index]->buf[buf_index].size; + if (persistent && !is_pmem_page(page)) { + rxe_dbg_mr(mr, "Page cannot be persistent\n"); + return -EINVAL; + } - while (offset >= length) { - offset -= length; - buf_index++; + if (unlikely(mr->nbuf == mr->num_buf)) + return -ENOMEM; - if (buf_index == RXE_BUF_PER_MAP) { - map_index++; - buf_index = 0; - } - length = mr->map[map_index]->buf[buf_index].size; - } + err = xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); + if (err) + return err; - *m_out = map_index; - *n_out = buf_index; - *offset_out = offset; - } + mr->nbuf++; + return 0; } -void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length) +int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sgl, + int sg_nents, unsigned int *sg_offset) { - size_t offset; - int m, n; - void *addr; - - if (mr->state != RXE_MR_STATE_VALID) { - rxe_dbg_mr(mr, "Not in valid state\n"); - addr = NULL; - goto out; - } - - if (!mr->map) { - addr = (void *)(uintptr_t)iova; - goto out; - } - - if (mr_check_range(mr, iova, length)) { - rxe_dbg_mr(mr, "Range violation\n"); - addr = NULL; - goto out; - } - - 
lookup_iova(mr, iova, &m, &n, &offset); - - if (offset + length > mr->map[m]->buf[n].size) { - rxe_dbg_mr(mr, "Crosses page boundary\n"); - addr = NULL; - goto out; - } + struct rxe_mr *mr = to_rmr(ibmr); + unsigned int page_size = mr_page_size(mr); - addr = (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset; + mr->nbuf = 0; + mr->page_shift = ilog2(page_size); + mr->page_mask = ~((u64)page_size - 1); + mr->page_offset = mr->ibmr.iova & (page_size - 1); -out: - return addr; + return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page); } -int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length) +static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) { - size_t offset; - - if (length == 0) - return 0; - - if (mr->ibmr.type == IB_MR_TYPE_DMA) - return -EFAULT; + unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); + unsigned long index = rxe_mr_iova_to_index(mr, iova); + unsigned int bytes; + struct page *page; + void *va; - offset = (iova - mr->ibmr.iova + mr->offset) & mr->page_mask; - while (length > 0) { - u8 *va; - int bytes; - - bytes = mr->ibmr.page_size - offset; - if (bytes > length) - bytes = length; - - va = iova_to_vaddr(mr, iova, length); - if (!va) + while (length) { + page = xa_load(&mr->page_list, index); + if (!page) return -EFAULT; - arch_wb_cache_pmem(va, bytes); - + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + va = kmap_local_page(page); + if (dir == RXE_FROM_MR_OBJ) + memcpy(addr, va + page_offset, bytes); + else + memcpy(va + page_offset, addr, bytes); + kunmap_local(va); + + page_offset = 0; + addr += bytes; length -= bytes; - iova += bytes; - offset = 0; + index++; } return 0; } -/* copy data from a range (vaddr, vaddr+length-1) to or from - * a mr object starting at iova. - */ -int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length, - enum rxe_mr_copy_dir dir) +static void rxe_mr_copy_dma(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) { - int err; - int bytes; - u8 *va; - struct rxe_map **map; - struct rxe_phys_buf *buf; - int m; - int i; - size_t offset; + unsigned int page_offset = iova & (PAGE_SIZE - 1); + unsigned int bytes; + struct page *page; + u8 *va; - if (length == 0) - return 0; - - if (mr->ibmr.type == IB_MR_TYPE_DMA) { - u8 *src, *dest; + while (length) { + page = virt_to_page(iova & mr->page_mask); + bytes = min_t(unsigned int, length, + PAGE_SIZE - page_offset); + va = kmap_local_page(page); + + if (dir == RXE_TO_MR_OBJ) + memcpy(va + page_offset, addr, bytes); + else + memcpy(addr, va + page_offset, bytes); + + kunmap_local(va); + page_offset = 0; + iova += bytes; + addr += bytes; + length -= bytes; + } +} - src = (dir == RXE_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova); +int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, + unsigned int length, enum rxe_mr_copy_dir dir) +{ + int err; - dest = (dir == RXE_TO_MR_OBJ) ? ((void *)(uintptr_t)iova) : addr; + if (length == 0) + return 0; - memcpy(dest, src, length); + if (WARN_ON(!mr)) + return -EINVAL; + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + rxe_mr_copy_dma(mr, iova, addr, length, dir); return 0; } - WARN_ON_ONCE(!mr->map); - err = mr_check_range(mr, iova, length); - if (err) { - err = -EFAULT; - goto err1; - } - - lookup_iova(mr, iova, &m, &i, &offset); - - map = mr->map + m; - buf = map[0]->buf + i; - - while (length > 0) { - u8 *src, *dest; - - va = (u8 *)(uintptr_t)buf->addr + offset; - src = (dir == RXE_TO_MR_OBJ) ? 
addr : va; - dest = (dir == RXE_TO_MR_OBJ) ? va : addr; - - bytes = buf->size - offset; - - if (bytes > length) - bytes = length; - - memcpy(dest, src, bytes); - - length -= bytes; - addr += bytes; - - offset = 0; - buf++; - i++; - - if (i == RXE_BUF_PER_MAP) { - i = 0; - map++; - buf = map[0]->buf; - } + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range"); + return err; } - return 0; - -err1: - return err; + return rxe_mr_copy_xarray(mr, iova, addr, length, dir); } /* copy data in or out of a wqe, i.e. sg list @@ -477,7 +400,6 @@ int copy_data( if (bytes > 0) { iova = sge->addr + offset; - err = rxe_mr_copy(mr, iova, addr, bytes, dir); if (err) goto err2; @@ -504,6 +426,165 @@ err1: return err; } +int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) +{ + unsigned int page_offset; + unsigned long index; + struct page *page; + unsigned int bytes; + int err; + u8 *va; + + /* mr must be valid even if length is zero */ + if (WARN_ON(!mr)) + return -EINVAL; + + if (length == 0) + return 0; + + if (mr->ibmr.type == IB_MR_TYPE_DMA) + return -EFAULT; + + err = mr_check_range(mr, iova, length); + if (err) + return err; + + while (length > 0) { + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + if (!page) + return -EFAULT; + bytes = min_t(unsigned int, length, + mr_page_size(mr) - page_offset); + + va = kmap_local_page(page); + arch_wb_cache_pmem(va + page_offset, bytes); + kunmap_local(va); + + length -= bytes; + iova += bytes; + page_offset = 0; + } + + return 0; +} + +/* Guarantee atomicity of atomic operations at the machine level. */ +static DEFINE_SPINLOCK(atomic_ops_lock); + +int rxe_mr_do_atomic_op(struct rxe_mr *mr, u64 iova, int opcode, + u64 compare, u64 swap_add, u64 *orig_val) +{ + unsigned int page_offset; + struct page *page; + u64 value; + u64 *va; + + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + page_offset = iova & (PAGE_SIZE - 1); + page = virt_to_page(iova & PAGE_MASK); + } else { + unsigned long index; + int err; + + err = mr_check_range(mr, iova, sizeof(value)); + if (err) { + rxe_dbg_mr(mr, "iova out of range"); + return RESPST_ERR_RKEY_VIOLATION; + } + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + } + + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "iova not aligned"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + spin_lock_bh(&atomic_ops_lock); + value = *orig_val = va[page_offset >> 3]; + + if (opcode == IB_OPCODE_RC_COMPARE_SWAP) { + if (value == compare) + va[page_offset >> 3] = swap_add; + } else { + value += swap_add; + va[page_offset >> 3] = value; + } + spin_unlock_bh(&atomic_ops_lock); + + kunmap_local(va); + + return 0; +} + +#if defined CONFIG_64BIT +/* only implemented or called for 64 bit architectures */ +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + unsigned int page_offset; + struct page *page; + u64 *va; + + /* See IBA oA19-28 */ + if (unlikely(mr->state != RXE_MR_STATE_VALID)) { + rxe_dbg_mr(mr, "mr not in valid state"); + return RESPST_ERR_RKEY_VIOLATION; + } + + if (mr->ibmr.type == IB_MR_TYPE_DMA) { + page_offset = iova & (PAGE_SIZE - 1); + page = virt_to_page(iova & PAGE_MASK); + } else { + unsigned long 
index; + int err; + + /* See IBA oA19-28 */ + err = mr_check_range(mr, iova, sizeof(value)); + if (unlikely(err)) { + rxe_dbg_mr(mr, "iova out of range"); + return RESPST_ERR_RKEY_VIOLATION; + } + page_offset = rxe_mr_iova_to_page_offset(mr, iova); + index = rxe_mr_iova_to_index(mr, iova); + page = xa_load(&mr->page_list, index); + if (!page) + return RESPST_ERR_RKEY_VIOLATION; + } + + /* See IBA A19.4.2 */ + if (unlikely(page_offset & 0x7)) { + rxe_dbg_mr(mr, "misaligned address"); + return RESPST_ERR_MISALIGNED_ATOMIC; + } + + va = kmap_local_page(page); + + /* Do atomic write after all prior operations have completed */ + smp_store_release(&va[page_offset >> 3], value); + + kunmap_local(va); + + return 0; +} +#else +int rxe_mr_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value) +{ + return RESPST_ERR_UNSUPPORTED_OPCODE; +} +#endif + int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) { struct rxe_sge *sge = &dma->sge[dma->cur_sge]; @@ -537,12 +618,6 @@ int advance_dma_data(struct rxe_dma_info *dma, unsigned int length) return 0; } -/* (1) find the mr corresponding to lkey/rkey - * depending on lookup_type - * (2) verify that the (qp) pd matches the mr pd - * (3) verify that the mr can support the requested access - * (4) verify that mr state is valid - */ struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key, enum rxe_mr_lookup_type type) { @@ -656,22 +731,17 @@ int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return -EINVAL; rxe_cleanup(mr); - + kfree_rcu(mr); return 0; } void rxe_mr_cleanup(struct rxe_pool_elem *elem) { struct rxe_mr *mr = container_of(elem, typeof(*mr), elem); - int i; rxe_put(mr_pd(mr)); ib_umem_release(mr->umem); - if (mr->map) { - for (i = 0; i < mr->num_map; i++) - kfree(mr->map[i]); - - kfree(mr->map); - } + if (mr->ibmr.type != IB_MR_TYPE_DMA) + xa_destroy(&mr->page_list); } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index 1151c0b5ccea..6215c6de3a84 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -116,55 +116,12 @@ void rxe_pool_cleanup(struct rxe_pool *pool) WARN_ON(!xa_empty(&pool->xa)); } -void *rxe_alloc(struct rxe_pool *pool) -{ - struct rxe_pool_elem *elem; - void *obj; - int err; - - if (WARN_ON(!(pool->type == RXE_TYPE_MR))) - return NULL; - - if (atomic_inc_return(&pool->num_elem) > pool->max_elem) - goto err_cnt; - - obj = kzalloc(pool->elem_size, GFP_KERNEL); - if (!obj) - goto err_cnt; - - elem = (struct rxe_pool_elem *)((u8 *)obj + pool->elem_offset); - - elem->pool = pool; - elem->obj = obj; - kref_init(&elem->ref_cnt); - init_completion(&elem->complete); - - /* allocate index in array but leave pointer as NULL so it - * can't be looked up until rxe_finalize() is called - */ - err = xa_alloc_cyclic(&pool->xa, &elem->index, NULL, pool->limit, - &pool->next, GFP_KERNEL); - if (err < 0) - goto err_free; - - return obj; - -err_free: - kfree(obj); -err_cnt: - atomic_dec(&pool->num_elem); - return NULL; -} - int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, bool sleepable) { int err; gfp_t gfp_flags; - if (WARN_ON(pool->type == RXE_TYPE_MR)) - return -EINVAL; - if (atomic_inc_return(&pool->num_elem) > pool->max_elem) goto err_cnt; @@ -275,9 +232,6 @@ int __rxe_cleanup(struct rxe_pool_elem *elem, bool sleepable) if (pool->cleanup) pool->cleanup(elem); - if (pool->type == RXE_TYPE_MR) - kfree_rcu(elem->obj); - atomic_dec(&pool->num_elem); return err; diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h 
b/drivers/infiniband/sw/rxe/rxe_pool.h index 9d83cb32092f..b42e26427a70 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -54,9 +54,6 @@ void rxe_pool_init(struct rxe_dev *rxe, struct rxe_pool *pool, /* free resources from object pool */ void rxe_pool_cleanup(struct rxe_pool *pool); -/* allocate an object from pool */ -void *rxe_alloc(struct rxe_pool *pool); - /* connect already allocated object to pool */ int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_elem *elem, bool sleepable); diff --git a/drivers/infiniband/sw/rxe/rxe_queue.h b/drivers/infiniband/sw/rxe/rxe_queue.h index ed44042782fa..c711cb98b949 100644 --- a/drivers/infiniband/sw/rxe/rxe_queue.h +++ b/drivers/infiniband/sw/rxe/rxe_queue.h @@ -35,19 +35,26 @@ /** * enum queue_type - type of queue * @QUEUE_TYPE_TO_CLIENT: Queue is written by rxe driver and - * read by client. Used by rxe driver only. + * read by client which may be a user space + * application or a kernel ulp. + * Used by rxe internals only. * @QUEUE_TYPE_FROM_CLIENT: Queue is written by client and - * read by rxe driver. Used by rxe driver only. - * @QUEUE_TYPE_TO_DRIVER: Queue is written by client and - * read by rxe driver. Used by kernel client only. - * @QUEUE_TYPE_FROM_DRIVER: Queue is written by rxe driver and - * read by client. Used by kernel client only. + * read by rxe driver. + * Used by rxe internals only. + * @QUEUE_TYPE_FROM_ULP: Queue is written by kernel ulp and + * read by rxe driver. + * Used by kernel verbs APIs only on + * behalf of ulps. + * @QUEUE_TYPE_TO_ULP: Queue is written by rxe driver and + * read by kernel ulp. + * Used by kernel verbs APIs only on + * behalf of ulps. */ enum queue_type { QUEUE_TYPE_TO_CLIENT, QUEUE_TYPE_FROM_CLIENT, - QUEUE_TYPE_TO_DRIVER, - QUEUE_TYPE_FROM_DRIVER, + QUEUE_TYPE_FROM_ULP, + QUEUE_TYPE_TO_ULP, }; struct rxe_queue_buf; @@ -62,9 +69,9 @@ struct rxe_queue { u32 index_mask; enum queue_type type; /* private copy of index for shared queues between - * kernel space and user space. Kernel reads and writes + * driver and clients. Driver reads and writes * this copy and then replicates to rxe_queue_buf - * for read access by user space. + * for read access by clients. 
*/ u32 index; }; @@ -97,19 +104,21 @@ static inline u32 queue_get_producer(const struct rxe_queue *q, switch (type) { case QUEUE_TYPE_FROM_CLIENT: - /* protect user index */ + /* used by rxe, client owns the index */ prod = smp_load_acquire(&q->buf->producer_index); break; case QUEUE_TYPE_TO_CLIENT: + /* used by rxe which owns the index */ prod = q->index; break; - case QUEUE_TYPE_FROM_DRIVER: - /* protect driver index */ - prod = smp_load_acquire(&q->buf->producer_index); - break; - case QUEUE_TYPE_TO_DRIVER: + case QUEUE_TYPE_FROM_ULP: + /* used by ulp which owns the index */ prod = q->buf->producer_index; break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp, rxe owns the index */ + prod = smp_load_acquire(&q->buf->producer_index); + break; } return prod; @@ -122,19 +131,21 @@ static inline u32 queue_get_consumer(const struct rxe_queue *q, switch (type) { case QUEUE_TYPE_FROM_CLIENT: + /* used by rxe which owns the index */ cons = q->index; break; case QUEUE_TYPE_TO_CLIENT: - /* protect user index */ + /* used by rxe, client owns the index */ cons = smp_load_acquire(&q->buf->consumer_index); break; - case QUEUE_TYPE_FROM_DRIVER: - cons = q->buf->consumer_index; - break; - case QUEUE_TYPE_TO_DRIVER: - /* protect driver index */ + case QUEUE_TYPE_FROM_ULP: + /* used by ulp, rxe owns the index */ cons = smp_load_acquire(&q->buf->consumer_index); break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp which owns the index */ + cons = q->buf->consumer_index; + break; } return cons; @@ -172,24 +183,31 @@ static inline void queue_advance_producer(struct rxe_queue *q, switch (type) { case QUEUE_TYPE_FROM_CLIENT: - pr_warn("%s: attempt to advance client index\n", - __func__); + /* used by rxe, client owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance client index\n", + __func__); break; case QUEUE_TYPE_TO_CLIENT: + /* used by rxe which owns the index */ prod = q->index; prod = (prod + 1) & q->index_mask; q->index = prod; - /* protect user index */ + /* release so client can read it safely */ smp_store_release(&q->buf->producer_index, prod); break; - case QUEUE_TYPE_FROM_DRIVER: - pr_warn("%s: attempt to advance driver index\n", - __func__); - break; - case QUEUE_TYPE_TO_DRIVER: + case QUEUE_TYPE_FROM_ULP: + /* used by ulp which owns the index */ prod = q->buf->producer_index; prod = (prod + 1) & q->index_mask; - q->buf->producer_index = prod; + /* release so rxe can read it safely */ + smp_store_release(&q->buf->producer_index, prod); + break; + case QUEUE_TYPE_TO_ULP: + /* used by ulp, rxe owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance driver index\n", + __func__); break; } } @@ -201,24 +219,30 @@ static inline void queue_advance_consumer(struct rxe_queue *q, switch (type) { case QUEUE_TYPE_FROM_CLIENT: - cons = q->index; - cons = (cons + 1) & q->index_mask; + /* used by rxe which owns the index */ + cons = (q->index + 1) & q->index_mask; q->index = cons; - /* protect user index */ + /* release so client can read it safely */ smp_store_release(&q->buf->consumer_index, cons); break; case QUEUE_TYPE_TO_CLIENT: - pr_warn("%s: attempt to advance client index\n", - __func__); + /* used by rxe, client owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance client index\n", + __func__); + break; + case QUEUE_TYPE_FROM_ULP: + /* used by ulp, rxe owns the index */ + if (WARN_ON(1)) + pr_warn("%s: attempt to advance driver index\n", + __func__); break; - case QUEUE_TYPE_FROM_DRIVER: + case QUEUE_TYPE_TO_ULP: + /* used by ulp which owns the index */ cons = 
q->buf->consumer_index; cons = (cons + 1) & q->index_mask; - q->buf->consumer_index = cons; - break; - case QUEUE_TYPE_TO_DRIVER: - pr_warn("%s: attempt to advance driver index\n", - __func__); + /* release so rxe can read it safely */ + smp_store_release(&q->buf->consumer_index, cons); break; } } diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c74972244f08..0cc1ba91d48c 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -10,43 +10,6 @@ #include "rxe_loc.h" #include "rxe_queue.h" -enum resp_states { - RESPST_NONE, - RESPST_GET_REQ, - RESPST_CHK_PSN, - RESPST_CHK_OP_SEQ, - RESPST_CHK_OP_VALID, - RESPST_CHK_RESOURCE, - RESPST_CHK_LENGTH, - RESPST_CHK_RKEY, - RESPST_EXECUTE, - RESPST_READ_REPLY, - RESPST_ATOMIC_REPLY, - RESPST_ATOMIC_WRITE_REPLY, - RESPST_PROCESS_FLUSH, - RESPST_COMPLETE, - RESPST_ACKNOWLEDGE, - RESPST_CLEANUP, - RESPST_DUPLICATE_REQUEST, - RESPST_ERR_MALFORMED_WQE, - RESPST_ERR_UNSUPPORTED_OPCODE, - RESPST_ERR_MISALIGNED_ATOMIC, - RESPST_ERR_PSN_OUT_OF_SEQ, - RESPST_ERR_MISSING_OPCODE_FIRST, - RESPST_ERR_MISSING_OPCODE_LAST_C, - RESPST_ERR_MISSING_OPCODE_LAST_D1E, - RESPST_ERR_TOO_MANY_RDMA_ATM_REQ, - RESPST_ERR_RNR, - RESPST_ERR_RKEY_VIOLATION, - RESPST_ERR_INVALIDATE_RKEY, - RESPST_ERR_LENGTH, - RESPST_ERR_CQ_OVERFLOW, - RESPST_ERROR, - RESPST_RESET, - RESPST_DONE, - RESPST_EXIT, -}; - static char *resp_state_name[] = { [RESPST_NONE] = "NONE", [RESPST_GET_REQ] = "GET_REQ", @@ -457,13 +420,23 @@ static enum resp_states rxe_resp_check_length(struct rxe_qp *qp, return RESPST_CHK_RKEY; } +/* if the reth length field is zero we can assume nothing + * about the rkey value and should not validate or use it. + * Instead set qp->resp.rkey to 0 which is an invalid rkey + * value since the minimum index part is 1. + */ static void qp_resp_from_reth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { + unsigned int length = reth_len(pkt); + qp->resp.va = reth_va(pkt); qp->resp.offset = 0; - qp->resp.rkey = reth_rkey(pkt); - qp->resp.resid = reth_len(pkt); - qp->resp.length = reth_len(pkt); + qp->resp.resid = length; + qp->resp.length = length; + if (pkt->mask & RXE_READ_OR_WRITE_MASK && length == 0) + qp->resp.rkey = 0; + else + qp->resp.rkey = reth_rkey(pkt); } static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) @@ -474,6 +447,10 @@ static void qp_resp_from_atmeth(struct rxe_qp *qp, struct rxe_pkt_info *pkt) qp->resp.resid = sizeof(u64); } +/* resolve the packet rkey to qp->resp.mr or set qp->resp.mr to NULL + * if an invalid rkey is received or the rdma length is zero. For middle + * or last packets use the stored value of mr. + */ static enum resp_states check_rkey(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { @@ -510,10 +487,12 @@ static enum resp_states check_rkey(struct rxe_qp *qp, return RESPST_EXECUTE; } - /* A zero-byte op is not required to set an addr or rkey. See C9-88 */ + /* A zero-byte read or write op is not required to + * set an addr or rkey. See C9-88 + */ if ((pkt->mask & RXE_READ_OR_WRITE_MASK) && - (pkt->mask & RXE_RETH_MASK) && - reth_len(pkt) == 0) { + (pkt->mask & RXE_RETH_MASK) && reth_len(pkt) == 0) { + qp->resp.mr = NULL; return RESPST_EXECUTE; } @@ -592,6 +571,7 @@ skip_check_range: return RESPST_EXECUTE; err: + qp->resp.mr = NULL; if (mr) rxe_put(mr); if (mw) @@ -725,17 +705,12 @@ static enum resp_states process_flush(struct rxe_qp *qp, return RESPST_ACKNOWLEDGE; } -/* Guarantee atomicity of atomic operations at the machine level. 
*/ -static DEFINE_SPINLOCK(atomic_ops_lock); - static enum resp_states atomic_reply(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) + struct rxe_pkt_info *pkt) { - u64 *vaddr; - enum resp_states ret; struct rxe_mr *mr = qp->resp.mr; struct resp_res *res = qp->resp.res; - u64 value; + int err; if (!res) { res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_MASK); @@ -743,32 +718,14 @@ static enum resp_states atomic_reply(struct rxe_qp *qp, } if (!res->replay) { - if (mr->state != RXE_MR_STATE_VALID) { - ret = RESPST_ERR_RKEY_VIOLATION; - goto out; - } - - vaddr = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, - sizeof(u64)); - - /* check vaddr is 8 bytes aligned. */ - if (!vaddr || (uintptr_t)vaddr & 7) { - ret = RESPST_ERR_MISALIGNED_ATOMIC; - goto out; - } + u64 iova = qp->resp.va + qp->resp.offset; - spin_lock_bh(&atomic_ops_lock); - res->atomic.orig_val = value = *vaddr; - - if (pkt->opcode == IB_OPCODE_RC_COMPARE_SWAP) { - if (value == atmeth_comp(pkt)) - value = atmeth_swap_add(pkt); - } else { - value += atmeth_swap_add(pkt); - } - - *vaddr = value; - spin_unlock_bh(&atomic_ops_lock); + err = rxe_mr_do_atomic_op(mr, iova, pkt->opcode, + atmeth_comp(pkt), + atmeth_swap_add(pkt), + &res->atomic.orig_val); + if (err) + return err; qp->resp.msn++; @@ -780,35 +737,35 @@ static enum resp_states atomic_reply(struct rxe_qp *qp, qp->resp.status = IB_WC_SUCCESS; } - ret = RESPST_ACKNOWLEDGE; -out: - return ret; + return RESPST_ACKNOWLEDGE; } -#ifdef CONFIG_64BIT -static enum resp_states do_atomic_write(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) +static enum resp_states atomic_write_reply(struct rxe_qp *qp, + struct rxe_pkt_info *pkt) { - struct rxe_mr *mr = qp->resp.mr; - int payload = payload_size(pkt); - u64 src, *dst; - - if (mr->state != RXE_MR_STATE_VALID) - return RESPST_ERR_RKEY_VIOLATION; + struct resp_res *res = qp->resp.res; + struct rxe_mr *mr; + u64 value; + u64 iova; + int err; - memcpy(&src, payload_addr(pkt), payload); + if (!res) { + res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK); + qp->resp.res = res; + } - dst = iova_to_vaddr(mr, qp->resp.va + qp->resp.offset, payload); - /* check vaddr is 8 bytes aligned. 
*/ - if (!dst || (uintptr_t)dst & 7) - return RESPST_ERR_MISALIGNED_ATOMIC; + if (res->replay) + return RESPST_ACKNOWLEDGE; - /* Do atomic write after all prior operations have completed */ - smp_store_release(dst, src); + mr = qp->resp.mr; + value = *(u64 *)payload_addr(pkt); + iova = qp->resp.va + qp->resp.offset; - /* decrease resp.resid to zero */ - qp->resp.resid -= sizeof(payload); + err = rxe_mr_do_atomic_write(mr, iova, value); + if (err) + return err; + qp->resp.resid = 0; qp->resp.msn++; /* next expected psn, read handles this separately */ @@ -817,29 +774,8 @@ static enum resp_states do_atomic_write(struct rxe_qp *qp, qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; - return RESPST_ACKNOWLEDGE; -} -#else -static enum resp_states do_atomic_write(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) -{ - return RESPST_ERR_UNSUPPORTED_OPCODE; -} -#endif /* CONFIG_64BIT */ -static enum resp_states atomic_write_reply(struct rxe_qp *qp, - struct rxe_pkt_info *pkt) -{ - struct resp_res *res = qp->resp.res; - - if (!res) { - res = rxe_prepare_res(qp, pkt, RXE_ATOMIC_WRITE_MASK); - qp->resp.res = res; - } - - if (res->replay) - return RESPST_ACKNOWLEDGE; - return do_atomic_write(qp, pkt); + return RESPST_ACKNOWLEDGE; } static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, @@ -966,7 +902,11 @@ static enum resp_states read_reply(struct rxe_qp *qp, } if (res->state == rdatm_res_state_new) { - if (!res->replay) { + if (!res->replay || qp->resp.length == 0) { + /* if length == 0 mr will be NULL (is ok) + * otherwise qp->resp.mr holds a ref on mr + * which we transfer to mr and drop below. + */ mr = qp->resp.mr; qp->resp.mr = NULL; } else { @@ -980,6 +920,10 @@ static enum resp_states read_reply(struct rxe_qp *qp, else opcode = IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST; } else { + /* re-lookup mr from rkey on all later packets. + * length will be non-zero. This can fail if someone + * modifies or destroys the mr since the first packet. 
+ */ mr = rxe_recheck_mr(qp, res->read.rkey); if (!mr) return RESPST_ERR_RKEY_VIOLATION; @@ -997,18 +941,16 @@ static enum resp_states read_reply(struct rxe_qp *qp, skb = prepare_ack_packet(qp, &ack_pkt, opcode, payload, res->cur_psn, AETH_ACK_UNLIMITED); if (!skb) { - if (mr) - rxe_put(mr); - return RESPST_ERR_RNR; + state = RESPST_ERR_RNR; + goto err_out; } err = rxe_mr_copy(mr, res->read.va, payload_addr(&ack_pkt), payload, RXE_FROM_MR_OBJ); - if (mr) - rxe_put(mr); if (err) { kfree_skb(skb); - return RESPST_ERR_RKEY_VIOLATION; + state = RESPST_ERR_RKEY_VIOLATION; + goto err_out; } if (bth_pad(&ack_pkt)) { @@ -1017,9 +959,12 @@ static enum resp_states read_reply(struct rxe_qp *qp, memset(pad, 0, bth_pad(&ack_pkt)); } + /* rxe_xmit_packet always consumes the skb */ err = rxe_xmit_packet(qp, &ack_pkt, skb); - if (err) - return RESPST_ERR_RNR; + if (err) { + state = RESPST_ERR_RNR; + goto err_out; + } res->read.va += payload; res->read.resid -= payload; @@ -1036,6 +981,9 @@ static enum resp_states read_reply(struct rxe_qp *qp, state = RESPST_CLEANUP; } +err_out: + if (mr) + rxe_put(mr); return state; } diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 025b35bf014e..e14050a69276 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -245,7 +245,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) int num_sge = ibwr->num_sge; int full; - full = queue_full(rq->queue, QUEUE_TYPE_TO_DRIVER); + full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) return -ENOMEM; @@ -256,7 +256,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) for (i = 0; i < num_sge; i++) length += ibwr->sg_list[i].length; - recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_TO_DRIVER); + recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); recv_wqe->wr_id = ibwr->wr_id; memcpy(recv_wqe->dma.sge, ibwr->sg_list, @@ -268,7 +268,7 @@ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) recv_wqe->dma.cur_sge = 0; recv_wqe->dma.sge_offset = 0; - queue_advance_producer(rq->queue, QUEUE_TYPE_TO_DRIVER); + queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); return 0; } @@ -623,17 +623,17 @@ static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr, spin_lock_irqsave(&qp->sq.sq_lock, flags); - full = queue_full(sq->queue, QUEUE_TYPE_TO_DRIVER); + full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { spin_unlock_irqrestore(&qp->sq.sq_lock, flags); return -ENOMEM; } - send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_TO_DRIVER); + send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP); init_send_wqe(qp, ibwr, mask, length, send_wqe); - queue_advance_producer(sq->queue, QUEUE_TYPE_TO_DRIVER); + queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); spin_unlock_irqrestore(&qp->sq.sq_lock, flags); @@ -821,12 +821,12 @@ static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) spin_lock_irqsave(&cq->cq_lock, flags); for (i = 0; i < num_entries; i++) { - cqe = queue_head(cq->queue, QUEUE_TYPE_FROM_DRIVER); + cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP); if (!cqe) break; memcpy(wc++, &cqe->ibwc, sizeof(*wc)); - queue_advance_consumer(cq->queue, QUEUE_TYPE_FROM_DRIVER); + queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP); } spin_unlock_irqrestore(&cq->cq_lock, flags); @@ -838,7 +838,7 @@ static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) struct rxe_cq *cq = to_rcq(ibcq); 
int count; - count = queue_count(cq->queue, QUEUE_TYPE_FROM_DRIVER); + count = queue_count(cq->queue, QUEUE_TYPE_TO_ULP); return (count > wc_cnt) ? wc_cnt : count; } @@ -854,7 +854,7 @@ static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) if (cq->notify != IB_CQ_NEXT_COMP) cq->notify = flags & IB_CQ_SOLICITED_MASK; - empty = queue_empty(cq->queue, QUEUE_TYPE_FROM_DRIVER); + empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) ret = 1; @@ -869,10 +869,17 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; + int err; - mr = rxe_alloc(&rxe->mr_pool); - if (!mr) - return ERR_PTR(-ENOMEM); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + goto err_out; + } + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) + goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; @@ -880,8 +887,12 @@ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) rxe_mr_init_dma(access, mr); rxe_finalize(mr); - return &mr->ibmr; + +err_free: + kfree(mr); +err_out: + return ERR_PTR(err); } static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, @@ -895,9 +906,15 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; - mr = rxe_alloc(&rxe->mr_pool); - if (!mr) - return ERR_PTR(-ENOMEM); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + goto err_out; + } + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) + goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; @@ -905,14 +922,16 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, err = rxe_mr_init_user(rxe, start, length, iova, access, mr); if (err) - goto err1; + goto err_cleanup; rxe_finalize(mr); - return &mr->ibmr; -err1: +err_cleanup: rxe_cleanup(mr); +err_free: + kfree(mr); +err_out: return ERR_PTR(err); } @@ -927,9 +946,15 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, if (mr_type != IB_MR_TYPE_MEM_REG) return ERR_PTR(-EINVAL); - mr = rxe_alloc(&rxe->mr_pool); - if (!mr) - return ERR_PTR(-ENOMEM); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + err = -ENOMEM; + goto err_out; + } + + err = rxe_add_to_pool(&rxe->mr_pool, mr); + if (err) + goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; @@ -937,53 +962,19 @@ static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, err = rxe_mr_init_fast(max_num_sg, mr); if (err) - goto err1; + goto err_cleanup; rxe_finalize(mr); - return &mr->ibmr; -err1: +err_cleanup: rxe_cleanup(mr); +err_free: + kfree(mr); +err_out: return ERR_PTR(err); } -static int rxe_set_page(struct ib_mr *ibmr, u64 addr) -{ - struct rxe_mr *mr = to_rmr(ibmr); - struct rxe_map *map; - struct rxe_phys_buf *buf; - - if (unlikely(mr->nbuf == mr->num_buf)) - return -ENOMEM; - - map = mr->map[mr->nbuf / RXE_BUF_PER_MAP]; - buf = &map->buf[mr->nbuf % RXE_BUF_PER_MAP]; - - buf->addr = addr; - buf->size = ibmr->page_size; - mr->nbuf++; - - return 0; -} - -static int rxe_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, - int sg_nents, unsigned int *sg_offset) -{ - struct rxe_mr *mr = to_rmr(ibmr); - int n; - - mr->nbuf = 0; - - n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, rxe_set_page); - - mr->page_shift = ilog2(ibmr->page_size); - mr->page_mask = ibmr->page_size - 1; - mr->offset = ibmr->iova & mr->page_mask; - - return n; -} - static ssize_t parent_show(struct device *device, struct 
device_attribute *attr, char *buf) { diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 19ddfa890480..c269ae2a3224 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -283,17 +283,6 @@ enum rxe_mr_lookup_type { RXE_LOOKUP_REMOTE, }; -#define RXE_BUF_PER_MAP (PAGE_SIZE / sizeof(struct rxe_phys_buf)) - -struct rxe_phys_buf { - u64 addr; - u64 size; -}; - -struct rxe_map { - struct rxe_phys_buf buf[RXE_BUF_PER_MAP]; -}; - static inline int rkey_is_mw(u32 rkey) { u32 index = rkey >> 8; @@ -310,25 +299,24 @@ struct rxe_mr { u32 lkey; u32 rkey; enum rxe_mr_state state; - u32 offset; int access; + atomic_t num_mw; - int page_shift; - int page_mask; - int map_shift; - int map_mask; + unsigned int page_offset; + unsigned int page_shift; + u64 page_mask; u32 num_buf; u32 nbuf; - u32 max_buf; - u32 num_map; - - atomic_t num_mw; - - struct rxe_map **map; + struct xarray page_list; }; +static inline unsigned int mr_page_size(struct rxe_mr *mr) +{ + return mr ? mr->ibmr.page_size : PAGE_SIZE; +} + enum rxe_mw_state { RXE_MW_STATE_INVALID = RXE_MR_STATE_INVALID, RXE_MW_STATE_FREE = RXE_MR_STATE_FREE, diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index b2b33dd3b4fa..f51ab2ccf151 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -398,7 +398,7 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) mlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (num_pages + atomic64_read(&mm_s->pinned_vm) > mlock_limit) { + if (atomic64_add_return(num_pages, &mm_s->pinned_vm) > mlock_limit) { rv = -ENOMEM; goto out_sem_up; } @@ -411,30 +411,27 @@ struct siw_umem *siw_umem_get(u64 start, u64 len, bool writable) goto out_sem_up; } for (i = 0; num_pages; i++) { - int got, nents = min_t(int, num_pages, PAGES_PER_CHUNK); - - umem->page_chunk[i].plist = + int nents = min_t(int, num_pages, PAGES_PER_CHUNK); + struct page **plist = kcalloc(nents, sizeof(struct page *), GFP_KERNEL); - if (!umem->page_chunk[i].plist) { + + if (!plist) { rv = -ENOMEM; goto out_sem_up; } - got = 0; + umem->page_chunk[i].plist = plist; while (nents) { - struct page **plist = &umem->page_chunk[i].plist[got]; - rv = pin_user_pages(first_page_va, nents, foll_flags, plist, NULL); if (rv < 0) goto out_sem_up; umem->num_pages += rv; - atomic64_add(rv, &mm_s->pinned_vm); first_page_va += rv * PAGE_SIZE; + plist += rv; nents -= rv; - got += rv; + num_pages -= rv; } - num_pages -= got; } out_sem_up: mmap_read_unlock(mm_s); @@ -442,6 +439,10 @@ out_sem_up: if (rv > 0) return umem; + /* Adjust accounting for pages not pinned */ + if (num_pages) + atomic64_sub(num_pages, &mm_s->pinned_vm); + siw_umem_release(umem, false); return ERR_PTR(rv); |
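
A note on the rxe_queue.h hunks above: the queue types are renamed so that it is explicit which side (rxe or the kernel ulp) owns each index. The owner advances its index privately and publishes it with smp_store_release(), while the other side reads it with smp_load_acquire(). Below is a minimal single-producer/single-consumer sketch of that convention using the same masked-index scheme (one slot is sacrificed to distinguish full from empty); the demo_* names are illustrative and not kernel API.

#include <linux/types.h>
#include <asm/barrier.h>

struct demo_queue {
	u32 producer_index;	/* owned by the producer */
	u32 consumer_index;	/* owned by the consumer */
	u32 index_mask;		/* depth - 1, depth is a power of two */
	u64 slots[];
};

/* Producer: owns producer_index, publishes it with release semantics. */
static bool demo_enqueue(struct demo_queue *q, u64 val)
{
	u32 prod = q->producer_index;				/* our index */
	u32 cons = smp_load_acquire(&q->consumer_index);	/* peer index */

	if (((prod + 1) & q->index_mask) == cons)
		return false;					/* full */

	q->slots[prod] = val;
	/* release: the slot write is visible before the new index */
	smp_store_release(&q->producer_index, (prod + 1) & q->index_mask);
	return true;
}

/* Consumer: owns consumer_index, sees producer_index with acquire. */
static bool demo_dequeue(struct demo_queue *q, u64 *val)
{
	u32 cons = q->consumer_index;				/* our index */
	u32 prod = smp_load_acquire(&q->producer_index);	/* peer index */

	if (cons == prod)
		return false;					/* empty */

	*val = q->slots[cons];
	smp_store_release(&q->consumer_index, (cons + 1) & q->index_mask);
	return true;
}

In the patch itself this split shows up as the QUEUE_TYPE_FROM_ULP and QUEUE_TYPE_TO_ULP cases in queue_get_producer(), queue_get_consumer(), and the matching advance helpers.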