diff options
author | Idan Burstein <idanb@mellanox.com> | 2018-05-02 12:16:39 +0200 |
---|---|---|
committer | Doug Ledford <dledford@redhat.com> | 2018-05-09 18:08:21 +0200 |
commit | 064e526247070c79aa3063d93384db378649a640 (patch) | |
tree | a0eb985ac7ff5e0dc471de0623f9945ee5ab32c6 /drivers/infiniband/hw/mlx5 | |
parent | RDMA/hns: Drop local zgid in favor of core defined variable (diff) | |
download | linux-064e526247070c79aa3063d93384db378649a640.tar.xz linux-064e526247070c79aa3063d93384db378649a640.zip |
IB/mlx5: posting klm/mtt list inline in the send queue for reg_wr
As most kernel RDMA ULPs, (e.g. NVMe over Fabrics in its default
"register_always=Y" mode) registers and invalidates user buffer
upon each IO.
Today the mlx5 driver is posting the registration work
request using scatter/gather entry for the MTT/KLM list.
The fetch of the MTT/KLM list becomes the bottleneck in
number of IO operation could be done by NVMe over Fabrics
host driver on a single adapter as shown below.
This patch is adding the support for inline registration
work request upon MTT/KLM list of size <=64B.
The result for NVMe over Fabrics is increase of > x3.5 for small
IOs as shown below, I expect other ULPs (e.g iSER, SRP, NFS over RDMA)
performance to be enhanced as well.
The following results were taken against a single NVMe-oF (RoCE link layer)
subsystem with a single namespace backed by null_blk using fio benchmark
(with rw=randread, numjobs=48, iodepth={16,64}, ioengine=libaio direct=1):
ConnectX-5 (pci Width x16)
---------------------------
Block Size s/g reg_wr inline reg_wr
++++++++++ +++++++++++++++ ++++++++++++++++
512B 1302.8K/34.82% 4951.9K/99.02%
1KB 1284.3K/33.86% 4232.7K/98.09%
2KB 1238.6K/34.1% 2797.5K/80.04%
4KB 1169.3K/32.46% 1941.3K/61.35%
8KB 1013.4K/30.08% 1236.6K/39.47%
16KB 695.7K/20.19% 696.9K/20.59%
32KB 350.3K/9.64% 350.6K/10.3%
64KB 175.86K/5.27% 175.9K/5.28%
ConnectX-4 (pci Width x8)
---------------------------
Block Size s/g reg_wr inline reg_wr
++++++++++ +++++++++++++++ ++++++++++++++++
512B 1285.8K/42.66% 4242.7K/98.18%
1KB 1254.1K/41.74% 3569.2K/96.00%
2KB 1185.9K/39.83% 2173.9K/75.58%
4KB 1069.4K/36.46% 1343.3K/47.47%
8KB 755.1K/27.77% 748.7K/29.14%
Tested-by: Nitzan Carmi <nitzanc@mellanox.com>
Signed-off-by: Idan Burstein <idanb@mellanox.com>
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Doug Ledford <dledford@redhat.com>
Diffstat (limited to 'drivers/infiniband/hw/mlx5')
-rw-r--r-- | drivers/infiniband/hw/mlx5/qp.c | 43 |
1 files changed, 36 insertions, 7 deletions
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 7ed4b70f6447..7a9870a4823f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -54,6 +54,7 @@ enum { enum { MLX5_IB_SQ_STRIDE = 6, + MLX5_IB_SQ_UMR_INLINE_THRESHOLD = 64, }; static const u32 mlx5_ib_opcode[] = { @@ -298,7 +299,9 @@ static int sq_overhead(struct ib_qp_init_attr *attr) max(sizeof(struct mlx5_wqe_atomic_seg) + sizeof(struct mlx5_wqe_raddr_seg), sizeof(struct mlx5_wqe_umr_ctrl_seg) + - sizeof(struct mlx5_mkey_seg)); + sizeof(struct mlx5_mkey_seg) + + MLX5_IB_SQ_UMR_INLINE_THRESHOLD / + MLX5_IB_UMR_OCTOWORD); break; case IB_QPT_XRC_TGT: @@ -3633,13 +3636,15 @@ static __be64 sig_mkey_mask(void) } static void set_reg_umr_seg(struct mlx5_wqe_umr_ctrl_seg *umr, - struct mlx5_ib_mr *mr) + struct mlx5_ib_mr *mr, bool umr_inline) { int size = mr->ndescs * mr->desc_size; memset(umr, 0, sizeof(*umr)); umr->flags = MLX5_UMR_CHECK_NOT_FREE; + if (umr_inline) + umr->flags |= MLX5_UMR_INLINE; umr->xlt_octowords = cpu_to_be16(get_xlt_octo(size)); umr->mkey_mask = frwr_mkey_mask(); } @@ -3823,6 +3828,24 @@ static void set_reg_data_seg(struct mlx5_wqe_data_seg *dseg, dseg->lkey = cpu_to_be32(pd->ibpd.local_dma_lkey); } +static void set_reg_umr_inline_seg(void *seg, struct mlx5_ib_qp *qp, + struct mlx5_ib_mr *mr, int mr_list_size) +{ + void *qend = qp->sq.qend; + void *addr = mr->descs; + int copy; + + if (unlikely(seg + mr_list_size > qend)) { + copy = qend - seg; + memcpy(seg, addr, copy); + addr += copy; + mr_list_size -= copy; + seg = mlx5_get_send_wqe(qp, 0); + } + memcpy(seg, addr, mr_list_size); + seg += mr_list_size; +} + static __be32 send_ieth(struct ib_send_wr *wr) { switch (wr->opcode) { @@ -4217,6 +4240,8 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, { struct mlx5_ib_mr *mr = to_mmr(wr->mr); struct mlx5_ib_pd *pd = to_mpd(qp->ibqp.pd); + int mr_list_size = mr->ndescs * mr->desc_size; + bool umr_inline = mr_list_size <= MLX5_IB_SQ_UMR_INLINE_THRESHOLD; if (unlikely(wr->wr.send_flags & IB_SEND_INLINE)) { mlx5_ib_warn(to_mdev(qp->ibqp.device), @@ -4224,7 +4249,7 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, return -EINVAL; } - set_reg_umr_seg(*seg, mr); + set_reg_umr_seg(*seg, mr, umr_inline); *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg); *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16; if (unlikely((*seg == qp->sq.qend))) @@ -4236,10 +4261,14 @@ static int set_reg_wr(struct mlx5_ib_qp *qp, if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); - set_reg_data_seg(*seg, mr, pd); - *seg += sizeof(struct mlx5_wqe_data_seg); - *size += (sizeof(struct mlx5_wqe_data_seg) / 16); - + if (umr_inline) { + set_reg_umr_inline_seg(*seg, qp, mr, mr_list_size); + *size += get_xlt_octo(mr_list_size); + } else { + set_reg_data_seg(*seg, mr, pd); + *seg += sizeof(struct mlx5_wqe_data_seg); + *size += (sizeof(struct mlx5_wqe_data_seg) / 16); + } return 0; } |