diff options
author | Daniel Borkmann <daniel@iogearbox.net> | 2020-11-14 02:29:00 +0100 |
---|---|---|
committer | Daniel Borkmann <daniel@iogearbox.net> | 2020-11-14 02:30:03 +0100 |
commit | c14d61fca0d10498bf267c0ab1f381dd0b35d96b (patch) | |
tree | 710de0411e07d3ebe0233c8eeb5f80245cf3b63b | |
parent | bpf: Expose bpf_d_path helper to sleepable LSM hooks (diff) | |
parent | net: mlx5: Add xdp tx return bulking support (diff) | |
download | linux-c14d61fca0d10498bf267c0ab1f381dd0b35d96b.tar.xz linux-c14d61fca0d10498bf267c0ab1f381dd0b35d96b.zip |
Merge branch 'xdp-redirect-bulk'
Lorenzo Bianconi says:
====================
XDP bulk APIs introduce a defer/flush mechanism to return
pages belonging to the same xdp_mem_allocator object
(identified via the mem.id field) in bulk to optimize
I-cache and D-cache since xdp_return_frame is usually
run inside the driver NAPI tx completion loop.
Convert mvneta, mvpp2 and mlx5 drivers to xdp_return_frame_bulk APIs.
More details on benchmarks run on mlx5 can be found here:
https://github.com/xdp-project/xdp-project/blob/master/areas/mem/xdp_bulk_return01.org
Changes since v5:
- do not keep looping over ptr_ring if the cache is full but release leftover
pages running page_pool_return_page
Changes since v4:
- fix comments
- introduce xdp_frame_bulk_init utility routine
- compiler annotations for I-cache code layout
- move rcu_read_lock outside fast-path
- mlx5 xdp bulking code optimization
Changes since v3:
- align DEV_MAP_BULK_SIZE to XDP_BULK_QUEUE_SIZE
- refactor page_pool_put_page_bulk to avoid code duplication
Changes since v2:
- move mvneta changes in a dedicated patch
Changes since v1:
- improve comments
- rework xdp_return_frame_bulk routine logic
- move count and xa fields at the beginning of xdp_frame_bulk struct
- invert logic in page_pool_put_page_bulk for loop
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
-rw-r--r-- | drivers/net/ethernet/marvell/mvneta.c | 10 | ||||
-rw-r--r-- | drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c | 10 | ||||
-rw-r--r-- | drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c | 22 | ||||
-rw-r--r-- | include/net/page_pool.h | 26 | ||||
-rw-r--r-- | include/net/xdp.h | 17 | ||||
-rw-r--r-- | net/core/page_pool.c | 70 | ||||
-rw-r--r-- | net/core/xdp.c | 54 |
7 files changed, 192 insertions, 17 deletions
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 54b0bf574c05..183530ed4d1d 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -1834,8 +1834,13 @@ static void mvneta_txq_bufs_free(struct mvneta_port *pp, struct netdev_queue *nq, bool napi) { unsigned int bytes_compl = 0, pkts_compl = 0; + struct xdp_frame_bulk bq; int i; + xdp_frame_bulk_init(&bq); + + rcu_read_lock(); /* need for xdp_return_frame_bulk */ + for (i = 0; i < num; i++) { struct mvneta_tx_buf *buf = &txq->buf[txq->txq_get_index]; struct mvneta_tx_desc *tx_desc = txq->descs + @@ -1857,9 +1862,12 @@ static void mvneta_txq_bufs_free(struct mvneta_port *pp, if (napi && buf->type == MVNETA_TYPE_XDP_TX) xdp_return_frame_rx_napi(buf->xdpf); else - xdp_return_frame(buf->xdpf); + xdp_return_frame_bulk(buf->xdpf, &bq); } } + xdp_flush_frame_bulk(&bq); + + rcu_read_unlock(); netdev_tx_completed_queue(nq, pkts_compl, bytes_compl); } diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index f6616c8933ca..3069e192d773 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -2440,8 +2440,13 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port, struct mvpp2_tx_queue *txq, struct mvpp2_txq_pcpu *txq_pcpu, int num) { + struct xdp_frame_bulk bq; int i; + xdp_frame_bulk_init(&bq); + + rcu_read_lock(); /* need for xdp_return_frame_bulk */ + for (i = 0; i < num; i++) { struct mvpp2_txq_pcpu_buf *tx_buf = txq_pcpu->buffs + txq_pcpu->txq_get_index; @@ -2454,10 +2459,13 @@ static void mvpp2_txq_bufs_free(struct mvpp2_port *port, dev_kfree_skb_any(tx_buf->skb); else if (tx_buf->type == MVPP2_TYPE_XDP_TX || tx_buf->type == MVPP2_TYPE_XDP_NDO) - xdp_return_frame(tx_buf->xdpf); + xdp_return_frame_bulk(tx_buf->xdpf, &bq); mvpp2_txq_inc_get(txq_pcpu); } + xdp_flush_frame_bulk(&bq); + + rcu_read_unlock(); } static inline struct mvpp2_rx_queue *mvpp2_get_rx_queue(struct mvpp2_port *port, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c index ae90d533a350..2e3e78b0f333 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c @@ -366,7 +366,8 @@ mlx5e_xmit_xdp_frame(struct mlx5e_xdpsq *sq, struct mlx5e_xmit_data *xdptxd, static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, struct mlx5e_xdp_wqe_info *wi, u32 *xsk_frames, - bool recycle) + bool recycle, + struct xdp_frame_bulk *bq) { struct mlx5e_xdp_info_fifo *xdpi_fifo = &sq->db.xdpi_fifo; u16 i; @@ -379,7 +380,7 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, /* XDP_TX from the XSK RQ and XDP_REDIRECT */ dma_unmap_single(sq->pdev, xdpi.frame.dma_addr, xdpi.frame.xdpf->len, DMA_TO_DEVICE); - xdp_return_frame(xdpi.frame.xdpf); + xdp_return_frame_bulk(xdpi.frame.xdpf, bq); break; case MLX5E_XDP_XMIT_MODE_PAGE: /* XDP_TX from the regular RQ */ @@ -397,12 +398,15 @@ static void mlx5e_free_xdpsq_desc(struct mlx5e_xdpsq *sq, bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) { + struct xdp_frame_bulk bq; struct mlx5e_xdpsq *sq; struct mlx5_cqe64 *cqe; u32 xsk_frames = 0; u16 sqcc; int i; + xdp_frame_bulk_init(&bq); + sq = container_of(cq, struct mlx5e_xdpsq, cq); if (unlikely(!test_bit(MLX5E_SQ_STATE_ENABLED, &sq->state))) @@ -434,7 +438,7 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) sqcc += wi->num_wqebbs; - mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true); + mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, true, &bq); } while (!last_wqe); if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_REQ)) { @@ -447,6 +451,8 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) } } while ((++i < MLX5E_TX_CQ_POLL_BUDGET) && (cqe = mlx5_cqwq_get_cqe(&cq->wq))); + xdp_flush_frame_bulk(&bq); + if (xsk_frames) xsk_tx_completed(sq->xsk_pool, xsk_frames); @@ -463,8 +469,13 @@ bool mlx5e_poll_xdpsq_cq(struct mlx5e_cq *cq) void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) { + struct xdp_frame_bulk bq; u32 xsk_frames = 0; + xdp_frame_bulk_init(&bq); + + rcu_read_lock(); /* need for xdp_return_frame_bulk */ + while (sq->cc != sq->pc) { struct mlx5e_xdp_wqe_info *wi; u16 ci; @@ -474,9 +485,12 @@ void mlx5e_free_xdpsq_descs(struct mlx5e_xdpsq *sq) sq->cc += wi->num_wqebbs; - mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, false); + mlx5e_free_xdpsq_desc(sq, wi, &xsk_frames, false, &bq); } + xdp_flush_frame_bulk(&bq); + rcu_read_unlock(); + if (xsk_frames) xsk_tx_completed(sq->xsk_pool, xsk_frames); } diff --git a/include/net/page_pool.h b/include/net/page_pool.h index 81d7773f96cd..b5b195305346 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -152,6 +152,8 @@ struct page_pool *page_pool_create(const struct page_pool_params *params); void page_pool_destroy(struct page_pool *pool); void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *)); void page_pool_release_page(struct page_pool *pool, struct page *page); +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count); #else static inline void page_pool_destroy(struct page_pool *pool) { @@ -165,6 +167,11 @@ static inline void page_pool_release_page(struct page_pool *pool, struct page *page) { } + +static inline void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ +} #endif void page_pool_put_page(struct page_pool *pool, struct page *page, @@ -215,4 +222,23 @@ static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } + +static inline void page_pool_ring_lock(struct page_pool *pool) + __acquires(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_lock(&pool->ring.producer_lock); + else + spin_lock_bh(&pool->ring.producer_lock); +} + +static inline void page_pool_ring_unlock(struct page_pool *pool) + __releases(&pool->ring.producer_lock) +{ + if (in_serving_softirq()) + spin_unlock(&pool->ring.producer_lock); + else + spin_unlock_bh(&pool->ring.producer_lock); +} + #endif /* _NET_PAGE_POOL_H */ diff --git a/include/net/xdp.h b/include/net/xdp.h index 3814fb631d52..7d48b2ae217a 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -104,6 +104,18 @@ struct xdp_frame { struct net_device *dev_rx; /* used by cpumap */ }; +#define XDP_BULK_QUEUE_SIZE 16 +struct xdp_frame_bulk { + int count; + void *xa; + void *q[XDP_BULK_QUEUE_SIZE]; +}; + +static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) +{ + /* bq->count will be zero'ed when bq->xa gets updated */ + bq->xa = NULL; +} static inline struct skb_shared_info * xdp_get_shared_info_from_frame(struct xdp_frame *frame) @@ -194,6 +206,9 @@ struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq); /* When sending xdp_frame into the network stack, then there is no * return point callback, which is needed to release e.g. DMA-mapping @@ -245,6 +260,6 @@ bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); -#define DEV_MAP_BULK_SIZE 16 +#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE #endif /* __LINUX_NET_XDP_H__ */ diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ef98372facf6..f3c690b8c8e3 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -11,6 +11,8 @@ #include <linux/device.h> #include <net/page_pool.h> +#include <net/xdp.h> + #include <linux/dma-direction.h> #include <linux/dma-mapping.h> #include <linux/page-flags.h> @@ -362,8 +364,9 @@ static bool pool_page_reusable(struct page_pool *pool, struct page *page) * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -void page_pool_put_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static __always_inline struct page * +__page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) { /* This allocator is optimized for the XDP mode that uses * one-frame-per-page, but have fallbacks that act like the @@ -379,15 +382,12 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, page_pool_dma_sync_for_device(pool, page, dma_sync_size); - if (allow_direct && in_serving_softirq()) - if (page_pool_recycle_in_cache(page, pool)) - return; + if (allow_direct && in_serving_softirq() && + page_pool_recycle_in_cache(page, pool)) + return NULL; - if (!page_pool_recycle_in_ring(pool, page)) { - /* Cache full, fallback to free pages */ - page_pool_return_page(pool, page); - } - return; + /* Page found as candidate for recycling */ + return page; } /* Fallback/non-XDP mode: API user have elevated refcnt. * @@ -405,9 +405,59 @@ void page_pool_put_page(struct page_pool *pool, struct page *page, /* Do not replace this with page_pool_return_page() */ page_pool_release_page(pool, page); put_page(page); + + return NULL; +} + +void page_pool_put_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); + if (page && !page_pool_recycle_in_ring(pool, page)) { + /* Cache full, fallback to free pages */ + page_pool_return_page(pool, page); + } } EXPORT_SYMBOL(page_pool_put_page); +/* Caller must not use data area after call, as this function overwrites it */ +void page_pool_put_page_bulk(struct page_pool *pool, void **data, + int count) +{ + int i, bulk_len = 0; + + for (i = 0; i < count; i++) { + struct page *page = virt_to_head_page(data[i]); + + page = __page_pool_put_page(pool, page, -1, false); + /* Approved for bulk recycling in ptr_ring cache */ + if (page) + data[bulk_len++] = page; + } + + if (unlikely(!bulk_len)) + return; + + /* Bulk producer into ptr_ring page_pool cache */ + page_pool_ring_lock(pool); + for (i = 0; i < bulk_len; i++) { + if (__ptr_ring_produce(&pool->ring, data[i])) + break; /* ring full */ + } + page_pool_ring_unlock(pool); + + /* Hopefully all pages was return into ptr_ring */ + if (likely(i == bulk_len)) + return; + + /* ptr_ring cache full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation + */ + for (; i < bulk_len; i++) + page_pool_return_page(pool, data[i]); +} +EXPORT_SYMBOL(page_pool_put_page_bulk); + static void page_pool_empty_ring(struct page_pool *pool) { struct page *page; diff --git a/net/core/xdp.c b/net/core/xdp.c index 48aba933a5a8..3d330ebda893 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -380,6 +380,60 @@ void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); +/* XDP bulk APIs introduce a defer/flush mechanism to return + * pages belonging to the same xdp_mem_allocator object + * (identified via the mem.id field) in bulk to optimize + * I-cache and D-cache. + * The bulk queue size is set to 16 to be aligned to how + * XDP_REDIRECT bulking works. The bulk is flushed when + * it is full or when mem.id changes. + * xdp_frame_bulk is usually stored/allocated on the function + * call-stack to avoid locking penalties. + */ +void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) +{ + struct xdp_mem_allocator *xa = bq->xa; + + if (unlikely(!xa || !bq->count)) + return; + + page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count); + /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */ + bq->count = 0; +} +EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk); + +/* Must be called with rcu_read_lock held */ +void xdp_return_frame_bulk(struct xdp_frame *xdpf, + struct xdp_frame_bulk *bq) +{ + struct xdp_mem_info *mem = &xdpf->mem; + struct xdp_mem_allocator *xa; + + if (mem->type != MEM_TYPE_PAGE_POOL) { + __xdp_return(xdpf->data, &xdpf->mem, false); + return; + } + + xa = bq->xa; + if (unlikely(!xa)) { + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + bq->count = 0; + bq->xa = xa; + } + + if (bq->count == XDP_BULK_QUEUE_SIZE) + xdp_flush_frame_bulk(bq); + + if (unlikely(mem->id != xa->mem.id)) { + xdp_flush_frame_bulk(bq); + bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + } + + bq->q[bq->count++] = xdpf->data; +} +EXPORT_SYMBOL_GPL(xdp_return_frame_bulk); + void xdp_return_buff(struct xdp_buff *xdp) { __xdp_return(xdp->data, &xdp->rxq->mem, true); |