diff options
-rw-r--r-- | drivers/net/ethernet/intel/i40e/i40e_txrx.c | 11 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/i40e/i40e_xsk.c | 123 | ||||
-rw-r--r-- | drivers/net/ethernet/intel/i40e/i40e_xsk.h | 16 | ||||
-rw-r--r-- | include/net/xdp_sock_drv.h | 7 | ||||
-rw-r--r-- | net/xdp/xsk.c | 57 | ||||
-rw-r--r-- | net/xdp/xsk_queue.h | 93 | ||||
-rw-r--r-- | samples/bpf/xdpsock_user.c | 6 |
8 files changed, 258 insertions, 56 deletions
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index d43ce13a93c9..c21548c71bb1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -676,6 +676,8 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring) i40e_clean_tx_ring(tx_ring); kfree(tx_ring->tx_bi); tx_ring->tx_bi = NULL; + kfree(tx_ring->xsk_descs); + tx_ring->xsk_descs = NULL; if (tx_ring->desc) { dma_free_coherent(tx_ring->dev, tx_ring->size, @@ -1277,6 +1279,13 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring) if (!tx_ring->tx_bi) goto err; + if (ring_is_xdp(tx_ring)) { + tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs), + GFP_KERNEL); + if (!tx_ring->xsk_descs) + goto err; + } + u64_stats_init(&tx_ring->syncp); /* round up to nearest 4K */ @@ -1300,6 +1309,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring) return 0; err: + kfree(tx_ring->xsk_descs); + tx_ring->xsk_descs = NULL; kfree(tx_ring->tx_bi); tx_ring->tx_bi = NULL; return -ENOMEM; diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 2feed920ef8a..5f531b195959 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -389,6 +389,7 @@ struct i40e_ring { struct i40e_channel *ch; struct xdp_rxq_info xdp_rxq; struct xsk_buff_pool *xsk_pool; + struct xdp_desc *xsk_descs; /* For storing descriptors in the AF_XDP ZC path */ } ____cacheline_internodealigned_in_smp; static inline bool ring_uses_build_skb(struct i40e_ring *ring) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 567fd67e900e..4c44f499fd49 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -2,6 +2,7 @@ /* Copyright(c) 2018 Intel Corporation. */ #include <linux/bpf_trace.h> +#include <linux/stringify.h> #include <net/xdp_sock_drv.h> #include <net/xdp.h> @@ -381,58 +382,102 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) return failure ? budget : (int)total_rx_packets; } -/** - * i40e_xmit_zc - Performs zero-copy Tx AF_XDP - * @xdp_ring: XDP Tx ring - * @budget: NAPI budget - * - * Returns true if the work is finished. - **/ -static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) +static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc, + unsigned int *total_bytes) { - unsigned int sent_frames = 0, total_bytes = 0; - struct i40e_tx_desc *tx_desc = NULL; - struct i40e_tx_buffer *tx_bi; - struct xdp_desc desc; + struct i40e_tx_desc *tx_desc; dma_addr_t dma; - while (budget-- > 0) { - if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc)) - break; + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len); - dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr); - xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, - desc.len); + tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP, + 0, desc->len, 0); - tx_bi = &xdp_ring->tx_bi[xdp_ring->next_to_use]; - tx_bi->bytecount = desc.len; + *total_bytes += desc->len; +} - tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use); - tx_desc->buffer_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = - build_ctob(I40E_TX_DESC_CMD_ICRC - | I40E_TX_DESC_CMD_EOP, - 0, desc.len, 0); +static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc, + unsigned int *total_bytes) +{ + u16 ntu = xdp_ring->next_to_use; + struct i40e_tx_desc *tx_desc; + dma_addr_t dma; + u32 i; - sent_frames++; - total_bytes += tx_bi->bytecount; + loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len); - xdp_ring->next_to_use++; - if (xdp_ring->next_to_use == xdp_ring->count) - xdp_ring->next_to_use = 0; + tx_desc = I40E_TX_DESC(xdp_ring, ntu++); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | + I40E_TX_DESC_CMD_EOP, + 0, desc[i].len, 0); + + *total_bytes += desc[i].len; } - if (tx_desc) { - /* Request an interrupt for the last frame and bump tail ptr. */ - tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << - I40E_TXD_QW1_CMD_SHIFT); - i40e_xdp_ring_update_tail(xdp_ring); + xdp_ring->next_to_use = ntu; +} + +static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts, + unsigned int *total_bytes) +{ + u32 batched, leftover, i; + + batched = nb_pkts & ~(PKTS_PER_BATCH - 1); + leftover = nb_pkts & (PKTS_PER_BATCH - 1); + for (i = 0; i < batched; i += PKTS_PER_BATCH) + i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes); + for (i = batched; i < batched + leftover; i++) + i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes); +} - xsk_tx_release(xdp_ring->xsk_pool); - i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes); +static void i40e_set_rs_bit(struct i40e_ring *xdp_ring) +{ + u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1; + struct i40e_tx_desc *tx_desc; + + tx_desc = I40E_TX_DESC(xdp_ring, ntu); + tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT); +} + +/** + * i40e_xmit_zc - Performs zero-copy Tx AF_XDP + * @xdp_ring: XDP Tx ring + * @budget: NAPI budget + * + * Returns true if the work is finished. + **/ +static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget) +{ + struct xdp_desc *descs = xdp_ring->xsk_descs; + u32 nb_pkts, nb_processed = 0; + unsigned int total_bytes = 0; + + nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget); + if (!nb_pkts) + return false; + + if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) { + nb_processed = xdp_ring->count - xdp_ring->next_to_use; + i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes); + xdp_ring->next_to_use = 0; } - return !!budget; + i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed, + &total_bytes); + + /* Request an interrupt for the last frame and bump tail ptr. */ + i40e_set_rs_bit(xdp_ring); + i40e_xdp_ring_update_tail(xdp_ring); + + i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes); + + return true; } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.h b/drivers/net/ethernet/intel/i40e/i40e_xsk.h index 7adfd8539247..ea88f4597a07 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.h +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.h @@ -4,6 +4,22 @@ #ifndef _I40E_XSK_H_ #define _I40E_XSK_H_ +/* This value should match the pragma in the loop_unrolled_for + * macro. Why 4? It is strictly empirical. It seems to be a good + * compromise between the advantage of having simultaneous outstanding + * reads to the DMA array that can hide each others latency and the + * disadvantage of having a larger code path. + */ +#define PKTS_PER_BATCH 4 + +#ifdef __clang__ +#define loop_unrolled_for _Pragma("clang loop unroll_count(4)") for +#elif __GNUC__ >= 8 +#define loop_unrolled_for _Pragma("GCC unroll 4") for +#else +#define loop_unrolled_for for +#endif + struct i40e_vsi; struct xsk_buff_pool; struct zero_copy_allocator; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 5b1ee8a9976d..4e295541e396 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -13,6 +13,7 @@ void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries); bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, u32 max); void xsk_tx_release(struct xsk_buff_pool *pool); struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev, u16 queue_id); @@ -128,6 +129,12 @@ static inline bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, return false; } +static inline u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *desc, + u32 max) +{ + return 0; +} + static inline void xsk_tx_release(struct xsk_buff_pool *pool) { } diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index cfbec3989a76..b0141973f23e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -332,6 +332,63 @@ out: } EXPORT_SYMBOL(xsk_tx_peek_desc); +static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + u32 nb_pkts = 0; + + while (nb_pkts < max_entries && xsk_tx_peek_desc(pool, &descs[nb_pkts])) + nb_pkts++; + + xsk_tx_release(pool); + return nb_pkts; +} + +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, struct xdp_desc *descs, + u32 max_entries) +{ + struct xdp_sock *xs; + u32 nb_pkts; + + rcu_read_lock(); + if (!list_is_singular(&pool->xsk_tx_list)) { + /* Fallback to the non-batched version */ + rcu_read_unlock(); + return xsk_tx_peek_release_fallback(pool, descs, max_entries); + } + + xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); + if (!xs) { + nb_pkts = 0; + goto out; + } + + nb_pkts = xskq_cons_peek_desc_batch(xs->tx, descs, pool, max_entries); + if (!nb_pkts) { + xs->tx->queue_empty_descs++; + goto out; + } + + /* This is the backpressure mechanism for the Tx path. Try to + * reserve space in the completion queue for all packets, but + * if there are fewer slots available, just process that many + * packets. This avoids having to implement any buffering in + * the Tx path. + */ + nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, descs, nb_pkts); + if (!nb_pkts) + goto out; + + xskq_cons_release_n(xs->tx, nb_pkts); + __xskq_cons_release(xs->tx); + xs->sk.sk_write_space(&xs->sk); + +out: + rcu_read_unlock(); + return nb_pkts; +} +EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch); + static int xsk_wakeup(struct xdp_sock *xs, u8 flags) { struct net_device *dev = xs->dev; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index cdb9cf3cd136..b936c46b1e16 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -18,9 +18,11 @@ struct xdp_ring { /* Hinder the adjacent cache prefetcher to prefetch the consumer * pointer if the producer pointer is touched and vice versa. */ - u32 pad ____cacheline_aligned_in_smp; + u32 pad1 ____cacheline_aligned_in_smp; u32 consumer ____cacheline_aligned_in_smp; + u32 pad2 ____cacheline_aligned_in_smp; u32 flags; + u32 pad3 ____cacheline_aligned_in_smp; }; /* Used for the RX and TX queues for packets */ @@ -197,6 +199,30 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, + struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 cached_cons = q->cached_cons, nb_entries = 0; + + while (cached_cons != q->cached_prod && nb_entries < max) { + struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; + u32 idx = cached_cons & q->ring_mask; + + descs[nb_entries] = ring->desc[idx]; + if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { + /* Skip the entry */ + cached_cons++; + continue; + } + + nb_entries++; + cached_cons++; + } + + return nb_entries; +} + /* Functions for consumers */ static inline void __xskq_cons_release(struct xsk_queue *q) @@ -218,17 +244,22 @@ static inline void xskq_cons_get_entries(struct xsk_queue *q) __xskq_cons_peek(q); } -static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +static inline u32 xskq_cons_nb_entries(struct xsk_queue *q, u32 max) { u32 entries = q->cached_prod - q->cached_cons; - if (entries >= cnt) - return true; + if (entries >= max) + return max; __xskq_cons_peek(q); entries = q->cached_prod - q->cached_cons; - return entries >= cnt; + return entries >= max ? max : entries; +} + +static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) +{ + return xskq_cons_nb_entries(q, cnt) >= cnt ? true : false; } static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) @@ -247,16 +278,28 @@ static inline bool xskq_cons_peek_desc(struct xsk_queue *q, return xskq_cons_read_desc(q, desc, pool); } +static inline u32 xskq_cons_peek_desc_batch(struct xsk_queue *q, struct xdp_desc *descs, + struct xsk_buff_pool *pool, u32 max) +{ + u32 entries = xskq_cons_nb_entries(q, max); + + return xskq_cons_read_desc_batch(q, descs, pool, entries); +} + +/* To improve performance in the xskq_cons_release functions, only update local state here. + * Reflect this to global state when we get new entries from the ring in + * xskq_cons_get_entries() and whenever Rx or Tx processing are completed in the NAPI loop. + */ static inline void xskq_cons_release(struct xsk_queue *q) { - /* To improve performance, only update local state here. - * Reflect this to global state when we get new entries - * from the ring in xskq_cons_get_entries() and whenever - * Rx or Tx processing are completed in the NAPI loop. - */ q->cached_cons++; } +static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons += cnt; +} + static inline bool xskq_cons_is_full(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -266,18 +309,23 @@ static inline bool xskq_cons_is_full(struct xsk_queue *q) /* Functions for producers */ -static inline bool xskq_prod_is_full(struct xsk_queue *q) +static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); - if (free_entries) - return false; + if (free_entries >= max) + return max; /* Refresh the local tail pointer */ q->cached_cons = READ_ONCE(q->ring->consumer); free_entries = q->nentries - (q->cached_prod - q->cached_cons); - return !free_entries; + return free_entries >= max ? max : free_entries; +} + +static inline bool xskq_prod_is_full(struct xsk_queue *q) +{ + return xskq_prod_nb_free(q, 1) ? false : true; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -302,6 +350,23 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, + u32 max) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + u32 nb_entries, i, cached_prod; + + nb_entries = xskq_prod_nb_free(q, max); + + /* A, matches D */ + cached_prod = q->cached_prod; + for (i = 0; i < nb_entries; i++) + ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; + q->cached_prod = cached_prod; + + return nb_entries; +} + static inline int xskq_prod_reserve_desc(struct xsk_queue *q, u64 addr, u32 len) { diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index 1149e94ca32f..2567f0db5aca 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -1146,7 +1146,6 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk, xsk_ring_prod__submit(&xsk->umem->fq, rcvd); xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->ring_stats.tx_npkts += rcvd; } } @@ -1168,7 +1167,6 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk, if (rcvd > 0) { xsk_ring_cons__release(&xsk->umem->cq, rcvd); xsk->outstanding_tx -= rcvd; - xsk->ring_stats.tx_npkts += rcvd; } } @@ -1260,6 +1258,7 @@ static void tx_only(struct xsk_socket_info *xsk, u32 *frame_nb, int batch_size) } xsk_ring_prod__submit(&xsk->tx, batch_size); + xsk->ring_stats.tx_npkts += batch_size; xsk->outstanding_tx += batch_size; *frame_nb += batch_size; *frame_nb %= NUM_FRAMES; @@ -1348,6 +1347,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) } return; } + xsk->ring_stats.rx_npkts += rcvd; ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx); while (ret != rcvd) { @@ -1379,7 +1379,7 @@ static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds) xsk_ring_prod__submit(&xsk->tx, rcvd); xsk_ring_cons__release(&xsk->rx, rcvd); - xsk->ring_stats.rx_npkts += rcvd; + xsk->ring_stats.tx_npkts += rcvd; xsk->outstanding_tx += rcvd; } |