From 4a74dc65e3ad825a66dfbcb256f98c550f96445b Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Tue, 5 Mar 2013 20:13:54 +0000
Subject: sfc: Allow efx_channel_type::receive_skb() to reject a packet

Instead of having efx_ptp_rx() call netif_receive_skb() for an invalid
PTP packet, make it return false for rejected packets and have
efx_rx_deliver() pass them up.

Signed-off-by: Ben Hutchings
---
 drivers/net/ethernet/sfc/ptp.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'drivers/net/ethernet/sfc/ptp.c')

diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index 3f93624fc273..faf4baf36861 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -1006,7 +1006,7 @@ bool efx_ptp_is_ptp_tx(struct efx_nic *efx, struct sk_buff *skb)
  * the receive timestamp from the MC - this will probably occur after the
  * packet arrival because of the processing in the MC.
  */
-static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
+static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
 {
        struct efx_nic *efx = channel->efx;
        struct efx_ptp_data *ptp = efx->ptp_data;
@@ -1019,18 +1019,15 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)

        /* Correct version? */
        if (ptp->mode == MC_CMD_PTP_MODE_V1) {
                if (skb->len < PTP_V1_MIN_LENGTH) {
-                       netif_receive_skb(skb);
-                       return;
+                       return false;
                }
                version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);
                if (version != PTP_VERSION_V1) {
-                       netif_receive_skb(skb);
-                       return;
+                       return false;
                }
        } else {
                if (skb->len < PTP_V2_MIN_LENGTH) {
-                       netif_receive_skb(skb);
-                       return;
+                       return false;
                }
                version = skb->data[PTP_V2_VERSION_OFFSET];
@@ -1041,8 +1038,7 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
                BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);

                if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
-                       netif_receive_skb(skb);
-                       return;
+                       return false;
                }
        }
@@ -1073,6 +1069,8 @@ static void efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)

        skb_queue_tail(&ptp->rxq, skb);
        queue_work(ptp->workwq, &ptp->work);
+
+       return true;
 }

 /* Transmit a PTP packet. This has to be transmitted by the MC
--
cgit v1.2.3

From c939a316459783e5cd6c6bd9dc90ea11b18ecd7f Mon Sep 17 00:00:00 2001
From: Laurence Evans
Date: Thu, 15 Nov 2012 10:56:07 +0000
Subject: sfc: PTP changes to support improved UUID filtering mode

There is a long-standing problem with the packet-timestamp matching in
the driver. When a PTP packet is received by the MC, the FPGA timestamps
the packet and the MC sends the timestamp and 6 bytes of the UUID to the
driver. The driver then matches the timestamp against received packets
using the same 6 bytes of UUID.

The problem comes from the choice of which 6 bytes to use. The PTP spec
is slightly contradictory and misleading in one of the two places where
the UUIDs are discussed. From section 7.2.2.2 of the spec, a PTPD2 UUID
can be either an EUI-64, or an EUI-64 constructed from an EUI-48. The
typical Ethernet-based implementation uses an EUI-64 constructed from an
EUI-48.
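[note: the next paragraph spells out this EUI-48 to EUI-64 construction
byte by byte; as a minimal kernel-style sketch (u8 from <linux/types.h>),
with a hypothetical helper name that is not part of the patch:

        static void mac_to_clock_id(const u8 mac[6], u8 uuid[8])
        {
                uuid[0] = mac[0];       /* OUI */
                uuid[1] = mac[1];
                uuid[2] = mac[2];
                uuid[3] = 0xFF;         /* fixed bytes inserted by the mapping */
                uuid[4] = 0xFE;
                uuid[5] = mac[3];       /* NIC-specific part */
                uuid[6] = mac[4];
                uuid[7] = mac[5];
        }
]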
This works by taking the first 3 bytes of the MAC address of the NIC
being used for PTP (the OUI), then inserting 0xFF, 0xFE, then taking the
last 3 bytes of the MAC address, giving:

    MAC[0], MAC[1], MAC[2], 0xFF, 0xFE, MAC[3], MAC[4], MAC[5]

The current MC firmware and driver discard the first two bytes of this
UUID, and packets are matched against timestamps using bytes 2 to 7, so
there is a small risk that in a deployment of Solarflare PTP NICs used
with other vendors' NICs, a PTP packet could be matched against the
wrong timestamp. This applies to all other organisations whose OUI has
0x53 as its third byte. It's a long list, but I notice that it includes
Cisco.

The necessary modifications to match against bytes 0-2 and 5-7 of the
UUID are quite small, but they introduce an incompatibility between
older versions of the firmware and the driver. When PTP is enabled via
SO_TIMESTAMPING specifying PTP V2, the driver will try to enable PTP in
the firmware using the enhanced mode (above). If the firmware returns an
error, the driver will enable PTP in the firmware using the old mode.

[bwh: Fix some style errors; remove private ioctl bits]
Signed-off-by: Ben Hutchings
---
 drivers/net/ethernet/sfc/mcdi_pcol.h |  1 +
 drivers/net/ethernet/sfc/ptp.c       | 61 ++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 16 deletions(-)

(limited to 'drivers/net/ethernet/sfc/ptp.c')

diff --git a/drivers/net/ethernet/sfc/mcdi_pcol.h b/drivers/net/ethernet/sfc/mcdi_pcol.h
index 9d426d0457bd..c5c9747861ba 100644
--- a/drivers/net/ethernet/sfc/mcdi_pcol.h
+++ b/drivers/net/ethernet/sfc/mcdi_pcol.h
@@ -553,6 +553,7 @@
 #define MC_CMD_PTP_MODE_V1_VLAN 0x1 /* enum */
 #define MC_CMD_PTP_MODE_V2 0x2 /* enum */
 #define MC_CMD_PTP_MODE_V2_VLAN 0x3 /* enum */
+#define MC_CMD_PTP_MODE_V2_ENHANCED 0x4 /* enum */

 /* MC_CMD_PTP_IN_DISABLE msgrequest */
 #define MC_CMD_PTP_IN_DISABLE_LEN 8
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index faf4baf36861..2b40cbd6667b 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -99,6 +99,9 @@
 #define PTP_V2_VERSION_LENGTH 1
 #define PTP_V2_VERSION_OFFSET 29

+#define PTP_V2_UUID_LENGTH 8
+#define PTP_V2_UUID_OFFSET 48
+
 /* Although PTP V2 UUIDs are comprised a ClockIdentity (8) and PortNumber (2),
  * the MC only captures the last six bytes of the clock identity. These values
  * reflect those, not the ones used in the standard.  The standard permits
@@ -1011,7 +1014,7 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
        struct efx_nic *efx = channel->efx;
        struct efx_ptp_data *ptp = efx->ptp_data;
        struct efx_ptp_match *match = (struct efx_ptp_match *)skb->cb;
-       u8 *data;
+       u8 *match_data_012, *match_data_345;
        unsigned int version;

        match->expiry = jiffies + msecs_to_jiffies(PKT_EVENT_LIFETIME_MS);
@@ -1025,21 +1028,35 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
                if (version != PTP_VERSION_V1) {
                        return false;
                }
+
+               /* PTP V1 uses all six bytes of the UUID to match the packet
+                * to the timestamp
+                */
+               match_data_012 = skb->data + PTP_V1_UUID_OFFSET;
+               match_data_345 = skb->data + PTP_V1_UUID_OFFSET + 3;
        } else {
                if (skb->len < PTP_V2_MIN_LENGTH) {
                        return false;
                }
                version = skb->data[PTP_V2_VERSION_OFFSET];
-
-               BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2);
-               BUILD_BUG_ON(PTP_V1_UUID_OFFSET != PTP_V2_MC_UUID_OFFSET);
-               BUILD_BUG_ON(PTP_V1_UUID_LENGTH != PTP_V2_MC_UUID_LENGTH);
-               BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
-               BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
-
                if ((version & PTP_VERSION_V2_MASK) != PTP_VERSION_V2) {
                        return false;
                }
+
+               /* The original V2 implementation uses bytes 2-7 of
+                * the UUID to match the packet to the timestamp. This
+                * discards two of the bytes of the MAC address used
+                * to create the UUID (SF bug 33070). The PTP V2
+                * enhanced mode fixes this issue and uses bytes 0-2
+                * and bytes 5-7 of the UUID.
+                */
+               match_data_345 = skb->data + PTP_V2_UUID_OFFSET + 5;
+               if (ptp->mode == MC_CMD_PTP_MODE_V2) {
+                       match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 2;
+               } else {
+                       match_data_012 = skb->data + PTP_V2_UUID_OFFSET + 0;
+                       BUG_ON(ptp->mode != MC_CMD_PTP_MODE_V2_ENHANCED);
+               }
        }
@@ -1052,14 +1069,19 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
                timestamps = skb_hwtstamps(skb);
                memset(timestamps, 0, sizeof(*timestamps));

+               /* We expect the sequence number to be in the same position in
+                * the packet for PTP V1 and V2
+                */
+               BUILD_BUG_ON(PTP_V1_SEQUENCE_OFFSET != PTP_V2_SEQUENCE_OFFSET);
+               BUILD_BUG_ON(PTP_V1_SEQUENCE_LENGTH != PTP_V2_SEQUENCE_LENGTH);
+
                /* Extract UUID/Sequence information */
-               data = skb->data + PTP_V1_UUID_OFFSET;
-               match->words[0] = (data[0]         |
-                                  (data[1] << 8)  |
-                                  (data[2] << 16) |
-                                  (data[3] << 24));
-               match->words[1] = (data[4]         |
-                                  (data[5] << 8)  |
+               match->words[0] = (match_data_012[0]         |
+                                  (match_data_012[1] << 8)  |
+                                  (match_data_012[2] << 16) |
+                                  (match_data_345[0] << 24));
+               match->words[1] = (match_data_345[1]         |
+                                  (match_data_345[2] << 8)  |
                                   (skb->data[PTP_V1_SEQUENCE_OFFSET
                                              + PTP_V1_SEQUENCE_LENGTH - 1] <<
                                    16));
@@ -1165,7 +1187,7 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
                 * timestamped */
                init->rx_filter = HWTSTAMP_FILTER_PTP_V2_L4_EVENT;
-               new_mode = MC_CMD_PTP_MODE_V2;
+               new_mode = MC_CMD_PTP_MODE_V2_ENHANCED;
                enable_wanted = true;
                break;
        case HWTSTAMP_FILTER_PTP_V2_EVENT:
@@ -1184,7 +1206,14 @@ static int efx_ptp_ts_init(struct efx_nic *efx, struct hwtstamp_config *init)
        if (init->tx_type != HWTSTAMP_TX_OFF)
                enable_wanted = true;

+       /* Old versions of the firmware do not support the improved
+        * UUID filtering option (SF bug 33070). If the firmware does
+        * not accept the enhanced mode, fall back to the standard PTP
+        * v2 UUID filtering.
+        */
        rc = efx_ptp_change_mode(efx, enable_wanted, new_mode);
+       if ((rc != 0) && (new_mode == MC_CMD_PTP_MODE_V2_ENHANCED))
+               rc = efx_ptp_change_mode(efx, enable_wanted, MC_CMD_PTP_MODE_V2);
        if (rc != 0)
                return rc;
--
cgit v1.2.3

From 9230451af9efcf5e3d60ce7f4fec2468e8ce54b1 Mon Sep 17 00:00:00 2001
From: Laurence Evans
Date: Mon, 11 Feb 2013 13:55:08 +0000
Subject: sfc: tidy up PTP synchronize function efx_ptp_process_times()

Signed-off-by: Ben Hutchings
---
 drivers/net/ethernet/sfc/ptp.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

(limited to 'drivers/net/ethernet/sfc/ptp.c')

diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index 2b40cbd6667b..d1858c0e0827 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -432,13 +432,10 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
        unsigned number_readings = (response_length /
                                    MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN);
        unsigned i;
-       unsigned min;
-       unsigned min_set = 0;
        unsigned total;
        unsigned ngood = 0;
        unsigned last_good = 0;
        struct efx_ptp_data *ptp = efx->ptp_data;
-       bool min_valid = false;
        u32 last_sec;
        u32 start_sec;
        struct timespec delta;
@@ -446,35 +443,17 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,
        if (number_readings == 0)
                return -EAGAIN;

-       /* Find minimum value in this set of results, discarding clearly
-        * erroneous results.
+       /* Read the set of results and increment stats for any results that
+        * appear to be erroneous.
         */
        for (i = 0; i < number_readings; i++) {
                efx_ptp_read_timeset(synch_buf, &ptp->timeset[i]);
                synch_buf += MC_CMD_PTP_OUT_SYNCHRONIZE_TIMESET_LEN;
-               if (ptp->timeset[i].window > SYNCHRONISATION_GRANULARITY_NS) {
-                       if (min_valid) {
-                               if (ptp->timeset[i].window < min_set)
-                                       min_set = ptp->timeset[i].window;
-                       } else {
-                               min_valid = true;
-                               min_set = ptp->timeset[i].window;
-                       }
-               }
-       }
-
-       if (min_valid) {
-               if (ptp->base_sync_valid && (min_set > ptp->base_sync_ns))
-                       min = ptp->base_sync_ns;
-               else
-                       min = min_set;
-       } else {
-               min = SYNCHRONISATION_GRANULARITY_NS;
-       }
+       }

-       /* Discard excessively long synchronise durations. The MC times
-        * when it finishes reading the host time so the corrected window
-        * time should be fairly constant for a given platform.
+       /* Find the last good host-MC synchronization result. The MC times
+        * when it finishes reading the host time so the corrected window time
+        * should be fairly constant for a given platform.
         */
        total = 0;
        for (i = 0; i < number_readings; i++)
@@ -492,8 +471,8 @@ static int efx_ptp_process_times(struct efx_nic *efx, u8 *synch_buf,

        if (ngood == 0) {
                netif_warn(efx, drv, efx->net_dev,
-                          "PTP no suitable synchronisations %dns %dns\n",
-                          ptp->base_sync_ns, min_set);
+                          "PTP no suitable synchronisations %dns\n",
+                          ptp->base_sync_ns);
                return -EAGAIN;
        }
--
cgit v1.2.3

From 97d48a10c670f87bba9e5b2241e32f2eccd3fef0 Mon Sep 17 00:00:00 2001
From: Alexandre Rames
Date: Fri, 11 Jan 2013 12:26:21 +0000
Subject: sfc: Remove rx_alloc_method SKB

[bwh: Remove more dead code, and make efx_ptp_rx() pull the data it
 needs into the header area.]
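[note on the bwh change above: skb->len also counts data held in unmapped
page fragments, not just the linear area that skb->data can address, so a
bare length check does not guarantee the header bytes are readable.
pskb_may_pull() checks the length and, if necessary, copies the requested
bytes into the linear header area. A minimal sketch of the pattern used in
the ptp.c hunks below, not the driver code verbatim:

        #include <linux/skbuff.h>

        static bool hdr_readable(struct sk_buff *skb, unsigned int len)
        {
                /* On success, skb->data[0 .. len - 1] may be
                 * dereferenced safely.
                 */
                return pskb_may_pull(skb, len);
        }
]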
Signed-off-by: Ben Hutchings
---
 drivers/net/ethernet/sfc/efx.c        |   8 +-
 drivers/net/ethernet/sfc/efx.h        |   1 -
 drivers/net/ethernet/sfc/net_driver.h |  23 +--
 drivers/net/ethernet/sfc/ptp.c        |   4 +-
 drivers/net/ethernet/sfc/rx.c         | 330 ++++++++++------------------------
 5 files changed, 101 insertions(+), 265 deletions(-)

(limited to 'drivers/net/ethernet/sfc/ptp.c')

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 0bc00991d310..11a81084bec4 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -247,10 +247,8 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
                        __efx_rx_packet(channel, channel->rx_pkt);
                        channel->rx_pkt = NULL;
                }
-               if (rx_queue->enabled) {
-                       efx_rx_strategy(channel);
+               if (rx_queue->enabled)
                        efx_fast_push_rx_descriptors(rx_queue);
-               }
        }

        return spent;
@@ -655,16 +653,12 @@ static void efx_start_datapath(struct efx_nic *efx)
                efx_for_each_channel_tx_queue(tx_queue, channel)
                        efx_init_tx_queue(tx_queue);

-               /* The rx buffer allocation strategy is MTU dependent */
-               efx_rx_strategy(channel);
-
                efx_for_each_channel_rx_queue(rx_queue, channel) {
                        efx_init_rx_queue(rx_queue);
                        efx_nic_generate_fill_event(rx_queue);
                }

                WARN_ON(channel->rx_pkt != NULL);
-               efx_rx_strategy(channel);
        }

        if (netif_device_present(efx->net_dev))
diff --git a/drivers/net/ethernet/sfc/efx.h b/drivers/net/ethernet/sfc/efx.h
index d2f790df6dcb..64c555e493be 100644
--- a/drivers/net/ethernet/sfc/efx.h
+++ b/drivers/net/ethernet/sfc/efx.h
@@ -37,7 +37,6 @@ extern int efx_probe_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_remove_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_init_rx_queue(struct efx_rx_queue *rx_queue);
 extern void efx_fini_rx_queue(struct efx_rx_queue *rx_queue);
-extern void efx_rx_strategy(struct efx_channel *channel);
 extern void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue);
 extern void efx_rx_slow_fill(unsigned long context);
 extern void __efx_rx_packet(struct efx_channel *channel,
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index cdcf510311c3..c83fe090406d 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -206,25 +206,19 @@ struct efx_tx_queue {
 /**
  * struct efx_rx_buffer - An Efx RX data buffer
  * @dma_addr: DMA base address of the buffer
- * @skb: The associated socket buffer. Valid iff !(@flags & %EFX_RX_BUF_PAGE).
+ * @page: The associated page buffer.
  *     Will be %NULL if the buffer slot is currently free.
- * @page: The associated page buffer. Valif iff @flags & %EFX_RX_BUF_PAGE.
- *     Will be %NULL if the buffer slot is currently free.
- * @page_offset: Offset within page. Valid iff @flags & %EFX_RX_BUF_PAGE.
+ * @page_offset: Offset within page
  * @len: Buffer length, in bytes.
  * @flags: Flags for buffer and packet state.
  */
 struct efx_rx_buffer {
        dma_addr_t dma_addr;
-       union {
-               struct sk_buff *skb;
-               struct page *page;
-       } u;
+       struct page *page;
        u16 page_offset;
        u16 len;
        u16 flags;
 };
-#define EFX_RX_BUF_PAGE 0x0001
 #define EFX_RX_PKT_CSUMMED 0x0002
 #define EFX_RX_PKT_DISCARD 0x0004
@@ -266,8 +260,6 @@ struct efx_rx_page_state {
  * @min_fill: RX descriptor minimum non-zero fill level.
  *     This records the minimum fill level observed when a ring
  *     refill was triggered.
- * @alloc_page_count: RX allocation strategy counter.
- * @alloc_skb_count: RX allocation strategy counter.
  * @slow_fill: Timer used to defer efx_nic_generate_fill_event().
 */
 struct efx_rx_queue {
@@ -286,8 +278,6 @@ struct efx_rx_queue {
        unsigned int fast_fill_trigger;
        unsigned int min_fill;
        unsigned int min_overfill;
-       unsigned int alloc_page_count;
-       unsigned int alloc_skb_count;
        struct timer_list slow_fill;
        unsigned int slow_fill_count;
 };
@@ -336,10 +326,6 @@ enum efx_rx_alloc_method {
  * @event_test_cpu: Last CPU to handle interrupt or test event for this channel
  * @irq_count: Number of IRQs since last adaptive moderation decision
  * @irq_mod_score: IRQ moderation score
- * @rx_alloc_level: Watermark based heuristic counter for pushing descriptors
- *     and diagnostic counters
- * @rx_alloc_push_pages: RX allocation method currently in use for pushing
- *     descriptors
  * @n_rx_tobe_disc: Count of RX_TOBE_DISC errors
  * @n_rx_ip_hdr_chksum_err: Count of RX IP header checksum errors
  * @n_rx_tcp_udp_chksum_err: Count of RX TCP and UDP checksum errors
@@ -371,9 +357,6 @@ struct efx_channel {
        unsigned int rfs_filters_added;
 #endif

-       int rx_alloc_level;
-       int rx_alloc_push_pages;
-
        unsigned n_rx_tobe_disc;
        unsigned n_rx_ip_hdr_chksum_err;
        unsigned n_rx_tcp_udp_chksum_err;
diff --git a/drivers/net/ethernet/sfc/ptp.c b/drivers/net/ethernet/sfc/ptp.c
index d1858c0e0827..07f6baa15c0c 100644
--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -1000,7 +1000,7 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)

        /* Correct version? */
        if (ptp->mode == MC_CMD_PTP_MODE_V1) {
-               if (skb->len < PTP_V1_MIN_LENGTH) {
+               if (!pskb_may_pull(skb, PTP_V1_MIN_LENGTH)) {
                        return false;
                }
                version = ntohs(*(__be16 *)&skb->data[PTP_V1_VERSION_OFFSET]);
@@ -1014,7 +1014,7 @@ static bool efx_ptp_rx(struct efx_channel *channel, struct sk_buff *skb)
                match_data_012 = skb->data + PTP_V1_UUID_OFFSET;
                match_data_345 = skb->data + PTP_V1_UUID_OFFSET + 3;
        } else {
-               if (skb->len < PTP_V2_MIN_LENGTH) {
+               if (!pskb_may_pull(skb, PTP_V2_MIN_LENGTH)) {
                        return false;
                }
                version = skb->data[PTP_V2_VERSION_OFFSET];
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index f31c23ea2a07..e7aa28eb9327 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -33,46 +33,6 @@
 /* Size of buffer allocated for skb header area. */
 #define EFX_SKB_HEADERS 64u

-/*
- * rx_alloc_method - RX buffer allocation method
- *
- * This driver supports two methods for allocating and using RX buffers:
- * each RX buffer may be backed by an skb or by an order-n page.
- *
- * When GRO is in use then the second method has a lower overhead,
- * since we don't have to allocate then free skbs on reassembled frames.
- *
- * Values:
- *   - RX_ALLOC_METHOD_AUTO = 0
- *   - RX_ALLOC_METHOD_SKB = 1
- *   - RX_ALLOC_METHOD_PAGE = 2
- *
- * The heuristic for %RX_ALLOC_METHOD_AUTO is a simple hysteresis count
- * controlled by the parameters below.
- *
- *   - Since pushing and popping descriptors are separated by the rx_queue
- *     size, so the watermarks should be ~rxd_size.
- *   - The performance win by using page-based allocation for GRO is less
- *     than the performance hit of using page-based allocation of non-GRO,
- *     so the watermarks should reflect this.
- *
- * Per channel we maintain a single variable, updated by each channel:
- *
- *   rx_alloc_level += (gro_performed ? RX_ALLOC_FACTOR_GRO :
- *                      RX_ALLOC_FACTOR_SKB)
- * Per NAPI poll interval, we constrain rx_alloc_level to 0..MAX (which
- * limits the hysteresis), and update the allocation strategy:
- *
- *   rx_alloc_method = (rx_alloc_level > RX_ALLOC_LEVEL_GRO ?
- *                      RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB)
- */
-static int rx_alloc_method = RX_ALLOC_METHOD_AUTO;
-
-#define RX_ALLOC_LEVEL_GRO 0x2000
-#define RX_ALLOC_LEVEL_MAX 0x3000
-#define RX_ALLOC_FACTOR_GRO 1
-#define RX_ALLOC_FACTOR_SKB (-2)
-
 /* This is the percentage fill level below which new RX descriptors
  * will be added to the RX descriptor ring.
  */
@@ -99,10 +59,7 @@ static inline unsigned int efx_rx_buf_size(struct efx_nic *efx)
 static u8 *efx_rx_buf_eh(struct efx_nic *efx, struct efx_rx_buffer *buf)
 {
-       if (buf->flags & EFX_RX_BUF_PAGE)
-               return page_address(buf->u.page) + efx_rx_buf_offset(efx, buf);
-       else
-               return (u8 *)buf->u.skb->data + efx->type->rx_buffer_hash_size;
+       return page_address(buf->page) + efx_rx_buf_offset(efx, buf);
 }

 static inline u32 efx_rx_buf_hash(const u8 *eh)
 {
@@ -120,56 +77,7 @@ static inline u32 efx_rx_buf_hash(const u8 *eh)
 }

 /**
- * efx_init_rx_buffers_skb - create EFX_RX_BATCH skb-based RX buffers
- *
- * @rx_queue:          Efx RX queue
- *
- * This allocates EFX_RX_BATCH skbs, maps them for DMA, and populates a
- * struct efx_rx_buffer for each one. Return a negative error code or 0
- * on success. May fail having only inserted fewer than EFX_RX_BATCH
- * buffers.
- */
-static int efx_init_rx_buffers_skb(struct efx_rx_queue *rx_queue)
-{
-       struct efx_nic *efx = rx_queue->efx;
-       struct net_device *net_dev = efx->net_dev;
-       struct efx_rx_buffer *rx_buf;
-       struct sk_buff *skb;
-       int skb_len = efx->rx_buffer_len;
-       unsigned index, count;
-
-       for (count = 0; count < EFX_RX_BATCH; ++count) {
-               index = rx_queue->added_count & rx_queue->ptr_mask;
-               rx_buf = efx_rx_buffer(rx_queue, index);
-
-               rx_buf->u.skb = skb = netdev_alloc_skb(net_dev, skb_len);
-               if (unlikely(!skb))
-                       return -ENOMEM;
-
-               /* Adjust the SKB for padding */
-               skb_reserve(skb, NET_IP_ALIGN);
-               rx_buf->len = skb_len - NET_IP_ALIGN;
-               rx_buf->flags = 0;
-
-               rx_buf->dma_addr = dma_map_single(&efx->pci_dev->dev,
-                                                 skb->data, rx_buf->len,
-                                                 DMA_FROM_DEVICE);
-               if (unlikely(dma_mapping_error(&efx->pci_dev->dev,
-                                              rx_buf->dma_addr))) {
-                       dev_kfree_skb_any(skb);
-                       rx_buf->u.skb = NULL;
-                       return -EIO;
-               }
-
-               ++rx_queue->added_count;
-               ++rx_queue->alloc_skb_count;
-       }
-
-       return 0;
-}
-
-/**
- * efx_init_rx_buffers_page - create EFX_RX_BATCH page-based RX buffers
+ * efx_init_rx_buffers - create EFX_RX_BATCH page-based RX buffers
  *
  * @rx_queue:          Efx RX queue
  *
  * This allocates memory for EFX_RX_BATCH receive buffers, maps them for DMA,
  * and populates struct efx_rx_buffers for each one. Return a negative error
  * code or 0 on success. If a single page can be split between two buffers,
  * then the page will either be inserted fully, or not at at all.
  */
-static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
+static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue)
 {
        struct efx_nic *efx = rx_queue->efx;
        struct efx_rx_buffer *rx_buf;
@@ -214,12 +122,11 @@ static int efx_init_rx_buffers_page(struct efx_rx_queue *rx_queue)
                index = rx_queue->added_count & rx_queue->ptr_mask;
                rx_buf = efx_rx_buffer(rx_queue, index);
                rx_buf->dma_addr = dma_addr + EFX_PAGE_IP_ALIGN;
-               rx_buf->u.page = page;
+               rx_buf->page = page;
                rx_buf->page_offset = page_offset + EFX_PAGE_IP_ALIGN;
                rx_buf->len = efx->rx_buffer_len - EFX_PAGE_IP_ALIGN;
-               rx_buf->flags = EFX_RX_BUF_PAGE;
+               rx_buf->flags = 0;
                ++rx_queue->added_count;
-               ++rx_queue->alloc_page_count;
                ++state->refcnt;

                if ((~count & 1) && (efx->rx_buffer_len <= EFX_RX_HALF_PAGE)) {
@@ -239,10 +146,10 @@ static void efx_unmap_rx_buffer(struct efx_nic *efx,
                                struct efx_rx_buffer *rx_buf,
                                unsigned int used_len)
 {
-       if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) {
+       if (rx_buf->page) {
                struct efx_rx_page_state *state;

-               state = page_address(rx_buf->u.page);
+               state = page_address(rx_buf->page);
                if (--state->refcnt == 0) {
                        dma_unmap_page(&efx->pci_dev->dev,
                                       state->dma_addr,
@@ -253,21 +160,15 @@ static void efx_unmap_rx_buffer(struct efx_nic *efx,
                                rx_buf->dma_addr, used_len,
                                DMA_FROM_DEVICE);
                }
-       } else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) {
-               dma_unmap_single(&efx->pci_dev->dev, rx_buf->dma_addr,
-                                rx_buf->len, DMA_FROM_DEVICE);
        }
 }

 static void efx_free_rx_buffer(struct efx_nic *efx,
                               struct efx_rx_buffer *rx_buf)
 {
-       if ((rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.page) {
-               __free_pages(rx_buf->u.page, efx->rx_buffer_order);
-               rx_buf->u.page = NULL;
-       } else if (!(rx_buf->flags & EFX_RX_BUF_PAGE) && rx_buf->u.skb) {
-               dev_kfree_skb_any(rx_buf->u.skb);
-               rx_buf->u.skb = NULL;
+       if (rx_buf->page) {
+               __free_pages(rx_buf->page, efx->rx_buffer_order);
+               rx_buf->page = NULL;
        }
 }
@@ -283,7 +184,7 @@ static void efx_fini_rx_buffer(struct efx_rx_queue *rx_queue,
 static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
                                    struct efx_rx_buffer *rx_buf)
 {
-       struct efx_rx_page_state *state = page_address(rx_buf->u.page);
+       struct efx_rx_page_state *state = page_address(rx_buf->page);
        struct efx_rx_buffer *new_buf;
        unsigned fill_level, index;
@@ -298,14 +199,13 @@ static void efx_resurrect_rx_buffer(struct efx_rx_queue *rx_queue,
        }

        ++state->refcnt;
-       get_page(rx_buf->u.page);
+       get_page(rx_buf->page);

        index = rx_queue->added_count & rx_queue->ptr_mask;
        new_buf = efx_rx_buffer(rx_queue, index);
        new_buf->dma_addr = rx_buf->dma_addr ^ (PAGE_SIZE >> 1);
-       new_buf->u.page = rx_buf->u.page;
+       new_buf->page = rx_buf->page;
        new_buf->len = rx_buf->len;
-       new_buf->flags = EFX_RX_BUF_PAGE;
        ++rx_queue->added_count;
 }
@@ -319,18 +219,17 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel,
        struct efx_rx_buffer *new_buf;
        unsigned index;

-       rx_buf->flags &= EFX_RX_BUF_PAGE;
+       rx_buf->flags = 0;

-       if ((rx_buf->flags & EFX_RX_BUF_PAGE) &&
-           efx->rx_buffer_len <= EFX_RX_HALF_PAGE &&
-           page_count(rx_buf->u.page) == 1)
+       if (efx->rx_buffer_len <= EFX_RX_HALF_PAGE &&
+           page_count(rx_buf->page) == 1)
                efx_resurrect_rx_buffer(rx_queue, rx_buf);

        index = rx_queue->added_count & rx_queue->ptr_mask;
        new_buf = efx_rx_buffer(rx_queue, index);

        memcpy(new_buf, rx_buf, sizeof(*new_buf));
-       rx_buf->u.page = NULL;
+       rx_buf->page = NULL;
        ++rx_queue->added_count;
 }
@@ -348,7 +247,6 @@ static void efx_recycle_rx_buffer(struct efx_channel *channel,
  */
 void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)
 {
-       struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
        unsigned fill_level;
        int space, rc = 0;
@@ -369,16 +267,13 @@ void efx_fast_push_rx_descriptors(struct efx_rx_queue *rx_queue)

        netif_vdbg(rx_queue->efx, rx_status, rx_queue->efx->net_dev,
                   "RX queue %d fast-filling descriptor ring from"
-                  " level %d to level %d using %s allocation\n",
+                  " level %d to level %d\n",
                   efx_rx_queue_index(rx_queue), fill_level,
-                  rx_queue->max_fill,
-                  channel->rx_alloc_push_pages ? "page" : "skb");
+                  rx_queue->max_fill);

        do {
-               if (channel->rx_alloc_push_pages)
-                       rc = efx_init_rx_buffers_page(rx_queue);
-               else
-                       rc = efx_init_rx_buffers_skb(rx_queue);
+               rc = efx_init_rx_buffers(rx_queue);
                if (unlikely(rc)) {
                        /* Ensure that we don't leave the rx queue empty */
                        if (rx_queue->added_count == rx_queue->removed_count)
@@ -408,7 +303,7 @@ void efx_rx_slow_fill(unsigned long context)

 static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
                                     struct efx_rx_buffer *rx_buf,
-                                    int len, bool *leak_packet)
+                                    int len)
 {
        struct efx_nic *efx = rx_queue->efx;
        unsigned max_len = rx_buf->len - efx->type->rx_buffer_padding;
@@ -428,11 +323,6 @@ static void efx_rx_packet__check_len(struct efx_rx_queue *rx_queue,
                          "RX event (0x%x > 0x%x+0x%x). Leaking\n",
                          efx_rx_queue_index(rx_queue), len, max_len,
                          efx->type->rx_buffer_padding);
-               /* If this buffer was skb-allocated, then the meta
-                * data at the end of the skb will be trashed. So
-                * we have no choice but to leak the fragment.
-                */
-               *leak_packet = !(rx_buf->flags & EFX_RX_BUF_PAGE);
                efx_schedule_reset(efx, RESET_TYPE_RX_RECOVERY);
        } else {
                if (net_ratelimit())
@@ -454,51 +344,78 @@ static void efx_rx_packet_gro(struct efx_channel *channel,
 {
        struct napi_struct *napi = &channel->napi_str;
        gro_result_t gro_result;
+       struct efx_nic *efx = channel->efx;
+       struct page *page = rx_buf->page;
+       struct sk_buff *skb;

-       if (rx_buf->flags & EFX_RX_BUF_PAGE) {
-               struct efx_nic *efx = channel->efx;
-               struct page *page = rx_buf->u.page;
-               struct sk_buff *skb;
-
-               rx_buf->u.page = NULL;
+       rx_buf->page = NULL;

-               skb = napi_get_frags(napi);
-               if (!skb) {
-                       put_page(page);
-                       return;
-               }
+       skb = napi_get_frags(napi);
+       if (!skb) {
+               put_page(page);
+               return;
+       }

-               if (efx->net_dev->features & NETIF_F_RXHASH)
-                       skb->rxhash = efx_rx_buf_hash(eh);
+       if (efx->net_dev->features & NETIF_F_RXHASH)
+               skb->rxhash = efx_rx_buf_hash(eh);

-               skb_fill_page_desc(skb, 0, page,
-                                  efx_rx_buf_offset(efx, rx_buf), rx_buf->len);
+       skb_fill_page_desc(skb, 0, page,
+                          efx_rx_buf_offset(efx, rx_buf), rx_buf->len);

-               skb->len = rx_buf->len;
-               skb->data_len = rx_buf->len;
-               skb->truesize += rx_buf->len;
-               skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
-                                 CHECKSUM_UNNECESSARY : CHECKSUM_NONE);
+       skb->len = rx_buf->len;
+       skb->data_len = rx_buf->len;
+       skb->truesize += rx_buf->len;
+       skb->ip_summed = ((rx_buf->flags & EFX_RX_PKT_CSUMMED) ?
+                         CHECKSUM_UNNECESSARY : CHECKSUM_NONE);

-               skb_record_rx_queue(skb, channel->rx_queue.core_index);
+       skb_record_rx_queue(skb, channel->rx_queue.core_index);

        gro_result = napi_gro_frags(napi);
-       } else {
-               struct sk_buff *skb = rx_buf->u.skb;
-
-               EFX_BUG_ON_PARANOID(!(rx_buf->flags & EFX_RX_PKT_CSUMMED));
-               rx_buf->u.skb = NULL;
-               skb->ip_summed = CHECKSUM_UNNECESSARY;
+       if (gro_result != GRO_DROP)
+               channel->irq_mod_score += 2;
+}

-               gro_result = napi_gro_receive(napi, skb);
-       }
+/* Allocate and construct an SKB around a struct page. */
+static struct sk_buff *efx_rx_mk_skb(struct efx_channel *channel,
+                                    struct efx_rx_buffer *rx_buf,
+                                    u8 *eh, int hdr_len)
+{
+       struct efx_nic *efx = channel->efx;
+       struct sk_buff *skb;

-       if (gro_result == GRO_NORMAL) {
-               channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
-       } else if (gro_result != GRO_DROP) {
-               channel->rx_alloc_level += RX_ALLOC_FACTOR_GRO;
-               channel->irq_mod_score += 2;
+       /* Allocate an SKB to store the headers */
+       skb = netdev_alloc_skb(efx->net_dev, hdr_len + EFX_PAGE_SKB_ALIGN);
+       if (unlikely(skb == NULL))
+               return NULL;
+
+       EFX_BUG_ON_PARANOID(rx_buf->len < hdr_len);
+
+       skb_reserve(skb, EFX_PAGE_SKB_ALIGN);
+
+       skb->len = rx_buf->len;
+       skb->truesize = rx_buf->len + sizeof(struct sk_buff);
+       memcpy(skb->data, eh, hdr_len);
+       skb->tail += hdr_len;
+
+       /* Append the remaining page onto the frag list */
+       if (rx_buf->len > hdr_len) {
+               skb->data_len = skb->len - hdr_len;
+               skb_fill_page_desc(skb, 0, rx_buf->page,
+                                  efx_rx_buf_offset(efx, rx_buf) + hdr_len,
+                                  skb->data_len);
+       } else {
+               __free_pages(rx_buf->page, efx->rx_buffer_order);
+               skb->data_len = 0;
        }
+
+       /* Ownership has transferred from the rx_buf to skb */
+       rx_buf->page = NULL;
+
+       /* Move past the ethernet header */
+       skb->protocol = eth_type_trans(skb, efx->net_dev);
+
+       return skb;
 }

 void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
@@ -507,7 +424,6 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
        struct efx_nic *efx = rx_queue->efx;
        struct efx_channel *channel = efx_rx_queue_channel(rx_queue);
        struct efx_rx_buffer *rx_buf;
-       bool leak_packet = false;

        rx_buf = efx_rx_buffer(rx_queue, index);
        rx_buf->flags |= flags;
@@ -519,7 +435,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,
        rx_queue->removed_count++;

        /* Validate the length encoded in the event vs the descriptor pushed */
-       efx_rx_packet__check_len(rx_queue, rx_buf, len, &leak_packet);
+       efx_rx_packet__check_len(rx_queue, rx_buf, len);

        netif_vdbg(efx, rx_status, efx->net_dev,
                   "RX queue %d received id %x at %llx+%x %s%s\n",
@@ -530,10 +446,7 @@ void efx_rx_packet(struct efx_rx_queue *rx_queue, unsigned int index,

        /* Discard packet, if instructed to do so */
        if (unlikely(rx_buf->flags & EFX_RX_PKT_DISCARD)) {
-               if (unlikely(leak_packet))
-                       channel->n_skbuff_leaks++;
-               else
-                       efx_recycle_rx_buffer(channel, rx_buf);
+               efx_recycle_rx_buffer(channel, rx_buf);

                /* Don't hold off the previous receive */
                rx_buf = NULL;
@@ -560,31 +473,28 @@ out:
        channel->rx_pkt = rx_buf;
 }

-static void efx_rx_deliver(struct efx_channel *channel,
+static void efx_rx_deliver(struct efx_channel *channel, u8 *eh,
                           struct efx_rx_buffer *rx_buf)
 {
        struct sk_buff *skb;
+       u16 hdr_len = min_t(u16, rx_buf->len, EFX_SKB_HEADERS);

-       /* We now own the SKB */
-       skb = rx_buf->u.skb;
-       rx_buf->u.skb = NULL;
+       skb = efx_rx_mk_skb(channel, rx_buf, eh, hdr_len);
+       if (unlikely(skb == NULL)) {
+               efx_free_rx_buffer(channel->efx, rx_buf);
+               return;
+       }
+       skb_record_rx_queue(skb, channel->rx_queue.core_index);
        /* Set the SKB flags */
        skb_checksum_none_assert(skb);

-       /* Record the rx_queue */
-       skb_record_rx_queue(skb, channel->rx_queue.core_index);
-
        if (channel->type->receive_skb)
                if (channel->type->receive_skb(channel, skb))
-                       goto handled;
+                       return;

        /* Pass the packet up */
        netif_receive_skb(skb);
-
-handled:
-       /* Update allocation strategy method */
-       channel->rx_alloc_level += RX_ALLOC_FACTOR_SKB;
 }

 /* Handle a received packet. Second half: Touches packet payload. */
@@ -602,60 +512,13 @@ void __efx_rx_packet(struct efx_channel *channel, struct efx_rx_buffer *rx_buf)
                return;
        }

-       if (!(rx_buf->flags & EFX_RX_BUF_PAGE)) {
-               struct sk_buff *skb = rx_buf->u.skb;
-
-               prefetch(skb_shinfo(skb));
-
-               skb_reserve(skb, efx->type->rx_buffer_hash_size);
-               skb_put(skb, rx_buf->len);
-
-               if (efx->net_dev->features & NETIF_F_RXHASH)
-                       skb->rxhash = efx_rx_buf_hash(eh);
-
-               /* Move past the ethernet header. rx_buf->data still points
-                * at the ethernet header */
-               skb->protocol = eth_type_trans(skb, efx->net_dev);
-
-               skb_record_rx_queue(skb, channel->rx_queue.core_index);
-       }
-
        if (unlikely(!(efx->net_dev->features & NETIF_F_RXCSUM)))
                rx_buf->flags &= ~EFX_RX_PKT_CSUMMED;

-       if (likely(rx_buf->flags & (EFX_RX_BUF_PAGE | EFX_RX_PKT_CSUMMED)) &&
-           !channel->type->receive_skb)
+       if (!channel->type->receive_skb)
                efx_rx_packet_gro(channel, rx_buf, eh);
        else
-               efx_rx_deliver(channel, rx_buf);
+               efx_rx_deliver(channel, eh, rx_buf);
 }

-void efx_rx_strategy(struct efx_channel *channel)
-{
-       enum efx_rx_alloc_method method = rx_alloc_method;
-
-       if (channel->type->receive_skb) {
-               channel->rx_alloc_push_pages = false;
-               return;
-       }
-
-       /* Only makes sense to use page based allocation if GRO is enabled */
-       if (!(channel->efx->net_dev->features & NETIF_F_GRO)) {
-               method = RX_ALLOC_METHOD_SKB;
-       } else if (method == RX_ALLOC_METHOD_AUTO) {
-               /* Constrain the rx_alloc_level */
-               if (channel->rx_alloc_level < 0)
-                       channel->rx_alloc_level = 0;
-               else if (channel->rx_alloc_level > RX_ALLOC_LEVEL_MAX)
-                       channel->rx_alloc_level = RX_ALLOC_LEVEL_MAX;
-
-               /* Decide on the allocation method */
-               method = ((channel->rx_alloc_level > RX_ALLOC_LEVEL_GRO) ?
-                         RX_ALLOC_METHOD_PAGE : RX_ALLOC_METHOD_SKB);
-       }
-
-       /* Push the option */
-       channel->rx_alloc_push_pages = (method == RX_ALLOC_METHOD_PAGE);
-}
-
 int efx_probe_rx_queue(struct efx_rx_queue *rx_queue)
@@ -756,9 +619,6 @@ void efx_remove_rx_queue(struct efx_rx_queue *rx_queue)
 }

-module_param(rx_alloc_method, int, 0644);
-MODULE_PARM_DESC(rx_alloc_method, "Allocation method used for RX buffers");
-
 module_param(rx_refill_threshold, uint, 0444);
 MODULE_PARM_DESC(rx_refill_threshold,
                 "RX descriptor ring refill threshold (%)");
--
cgit v1.2.3
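[note: throughout this series, a packet is paired with its MC timestamp
event by packing six UUID bytes plus the final sequence-number byte into
two 32-bit match words. A sketch of that packing, where m012 and m345
stand for the match_data_012/match_data_345 pointers chosen in
efx_ptp_rx() and seq_low is the last byte of the sequence field;
illustrative only, not the driver code verbatim:

        u32 words[2];

        words[0] = m012[0] | (m012[1] << 8) | (m012[2] << 16) |
                   (m345[0] << 24);
        words[1] = m345[1] | (m345[2] << 8) | (seq_low << 16);

The event-handling path builds the same two words from the MC timestamp
event, so matching a timestamp to a packet reduces to comparing
words[0] and words[1].]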