From 7e01c7f7046efc2c7c192c3619db43292b98e997 Mon Sep 17 00:00:00 2001
From: Tudor Ambarus <tudor.ambarus@linaro.org>
Date: Wed, 17 May 2023 13:38:08 +0000
Subject: net: cdc_ncm: Deal with too low values of dwNtbOutMaxSize

Currently in cdc_ncm_check_tx_max(), if dwNtbOutMaxSize is lower than
the calculated "min" value, but greater than zero, the logic sets
tx_max to dwNtbOutMaxSize. This is then used to allocate a new SKB in
cdc_ncm_fill_tx_frame() where all the data is handled.

For small values of dwNtbOutMaxSize the memory allocated during
alloc_skb(dwNtbOutMaxSize, GFP_ATOMIC) will have the same size, due to
how size is aligned at alloc time:
	size = SKB_DATA_ALIGN(size);
        size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
Thus we hit the same bug that we tried to squash with
commit 2be6d4d16a084 ("net: cdc_ncm: Allow for dwNtbOutMaxSize to be unset or zero")

Low values of dwNtbOutMaxSize do not cause an issue presently because at
alloc_skb() time more memory (512b) is allocated than required for the
SKB headers alone (320b), leaving some space (512b - 320b = 192b)
for CDC data (172b).

However, if more elements (for example 3 x u64 = [24b]) were added to
one of the SKB header structs, say 'struct skb_shared_info',
increasing its original size (320b [320b aligned]) to something larger
(344b [384b aligned]), then suddenly the CDC data (172b) no longer
fits in the spare SKB data area (512b - 384b = 128b).

Consequently the SKB bounds checking semantics fails and panics:

skbuff: skb_over_panic: text:ffffffff831f755b len:184 put:172 head:ffff88811f1c6c00 data:ffff88811f1c6c00 tail:0xb8 end:0x80 dev:<NULL>
------------[ cut here ]------------
kernel BUG at net/core/skbuff.c:113!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 57 Comm: kworker/0:2 Not tainted 5.15.106-syzkaller-00249-g19c0ed55a470 #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/14/2023
Workqueue: mld mld_ifc_work
RIP: 0010:skb_panic net/core/skbuff.c:113 [inline]
RIP: 0010:skb_over_panic+0x14c/0x150 net/core/skbuff.c:118
[snip]
Call Trace:
 <TASK>
 skb_put+0x151/0x210 net/core/skbuff.c:2047
 skb_put_zero include/linux/skbuff.h:2422 [inline]
 cdc_ncm_ndp16 drivers/net/usb/cdc_ncm.c:1131 [inline]
 cdc_ncm_fill_tx_frame+0x11ab/0x3da0 drivers/net/usb/cdc_ncm.c:1308
 cdc_ncm_tx_fixup+0xa3/0x100

Deal with too low values of dwNtbOutMaxSize, clamp it in the range
[USB_CDC_NCM_NTB_MIN_OUT_SIZE, CDC_NCM_NTB_MAX_SIZE_TX]. We ensure
enough data space is allocated to handle CDC data by making sure
dwNtbOutMaxSize is not smaller than USB_CDC_NCM_NTB_MIN_OUT_SIZE.

Fixes: 289507d3364f ("net: cdc_ncm: use sysfs for rx/tx aggregation tuning")
Cc: stable@vger.kernel.org
Reported-by: syzbot+9f575a1f15fc0c01ed69@syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?extid=b982f1059506db48409d
Link: https://lore.kernel.org/all/20211202143437.1411410-1-lee.jones@linaro.org/
Signed-off-by: Tudor Ambarus <tudor.ambarus@linaro.org>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Link: https://lore.kernel.org/r/20230517133808.1873695-2-tudor.ambarus@linaro.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/usb/cdc_ncm.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

(limited to 'drivers')
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 6ce8f4f0c70e..db05622f1f70 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -181,9 +181,12 @@ static u32 cdc_ncm_check_tx_max(struct usbnet *dev, u32 new_tx)
 	else
 		min = ctx->max_datagram_size + ctx->max_ndp_size + sizeof(struct usb_cdc_ncm_nth32);
 
-	max = min_t(u32, CDC_NCM_NTB_MAX_SIZE_TX, le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize));
-	if (max == 0)
+	if (le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize) == 0)
 		max = CDC_NCM_NTB_MAX_SIZE_TX; /* dwNtbOutMaxSize not set */
+	else
+		max = clamp_t(u32, le32_to_cpu(ctx->ncm_parm.dwNtbOutMaxSize),
+			      USB_CDC_NCM_NTB_MIN_OUT_SIZE,
+			      CDC_NCM_NTB_MAX_SIZE_TX);
 
 	/* some devices set dwNtbOutMaxSize too low for the above default */
 	min = min(min, max);
@@ -1244,6 +1247,9 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 			 * further.
 			 */
 			if (skb_out == NULL) {
+				/* If even the smallest allocation fails, abort. */
+				if (ctx->tx_curr_size == USB_CDC_NCM_NTB_MIN_OUT_SIZE)
+					goto alloc_failed;
 				ctx->tx_low_mem_max_cnt = min(ctx->tx_low_mem_max_cnt + 1,
 							      (unsigned)CDC_NCM_LOW_MEM_MAX_CNT);
 				ctx->tx_low_mem_val = ctx->tx_low_mem_max_cnt;
@@ -1262,13 +1268,8 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 			skb_out = alloc_skb(ctx->tx_curr_size, GFP_ATOMIC);
 
 			/* No allocation possible so we will abort */
-			if (skb_out == NULL) {
-				if (skb != NULL) {
-					dev_kfree_skb_any(skb);
-					dev->net->stats.tx_dropped++;
-				}
-				goto exit_no_skb;
-			}
+			if (!skb_out)
+				goto alloc_failed;
 			ctx->tx_low_mem_val--;
 		}
 		if (ctx->is_ndp16) {
@@ -1461,6 +1462,11 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 
 	return skb_out;
 
+alloc_failed:
+	if (skb) {
+		dev_kfree_skb_any(skb);
+		dev->net->stats.tx_dropped++;
+	}
 exit_no_skb:
 	/* Start timer, if there is a remaining non-empty skb */
 	if (ctx->tx_curr_skb != NULL && n > 0)
-- 
cgit v1.2.3


From afbed3f74830163f9559579dee382cac3cff82da Mon Sep 17 00:00:00 2001
From: Jakub Kicinski <kuba@kernel.org>
Date: Tue, 16 May 2023 18:59:35 -0700
Subject: net/mlx5e: do as little as possible in napi poll when budget is 0

NAPI gets called with budget of 0 from netpoll, which has interrupts
disabled. We should try to free some space on Tx rings and nothing
else.

Specifically do not try to handle XDP TX or try to refill Rx buffers -
we can't use the page pool from IRQ context. Don't check if IRQs moved,
either, that makes no sense in netpoll. Netpoll calls _all_ the rings
from whatever CPU it happens to be invoked on.

In general do as little as possible, the work quickly adds up when
there's tens of rings to poll.

The immediate stack trace I was seeing is:

    __do_softirq+0xd1/0x2c0
    __local_bh_enable_ip+0xc7/0x120
    </IRQ>
    <TASK>
    page_pool_put_defragged_page+0x267/0x320
    mlx5e_free_xdpsq_desc+0x99/0xd0
    mlx5e_poll_xdpsq_cq+0x138/0x3b0
    mlx5e_napi_poll+0xc3/0x8b0
    netpoll_poll_dev+0xce/0x150

AFAIU page pool takes a BH lock, releases it and since BH is now
enabled tries to run softirqs.

Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Fixes: 60bbf7eeef10 ("mlx5: use page_pool for xdp_return_frame call")
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index a50bfda18e96..fbb2d963fb7e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -161,20 +161,22 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 		}
 	}
 
+	/* budget=0 means we may be in IRQ context, do as little as possible */
+	if (unlikely(!budget))
+		goto out;
+
 	busy |= mlx5e_poll_xdpsq_cq(&c->xdpsq.cq);
 
 	if (c->xdp)
 		busy |= mlx5e_poll_xdpsq_cq(&c->rq_xdpsq.cq);
 
-	if (likely(budget)) { /* budget=0 means: don't poll rx rings */
-		if (xsk_open)
-			work_done = mlx5e_poll_rx_cq(&xskrq->cq, budget);
+	if (xsk_open)
+		work_done = mlx5e_poll_rx_cq(&xskrq->cq, budget);
 
-		if (likely(budget - work_done))
-			work_done += mlx5e_poll_rx_cq(&rq->cq, budget - work_done);
+	if (likely(budget - work_done))
+		work_done += mlx5e_poll_rx_cq(&rq->cq, budget - work_done);
 
-		busy |= work_done == budget;
-	}
+	busy |= work_done == budget;
 
 	mlx5e_poll_ico_cq(&c->icosq.cq);
 	if (mlx5e_poll_ico_cq(&c->async_icosq.cq))
-- 
cgit v1.2.3


From cfcb942863f6fce9266e1957a021e6c7295dee42 Mon Sep 17 00:00:00 2001
From: Alejandro Lucero <alejandro.lucero-palau@amd.com>
Date: Thu, 18 May 2023 06:48:22 +0100
Subject: sfc: fix devlink info error handling

Avoid an early devlink info return if errors arise with the MCDI commands
executed for getting the required info from the device. The rationale
is that some commands can fail but later ones could still give useful
data. Moreover, some nvram partitions may not be present, which needs to
be handled as a non-error.

The specific errors are reported through system messages and if any
error appears, it will be reported generically through extack.

Fixes: 14743ddd2495 ("sfc: add devlink info support for ef100")
Signed-off-by: Alejandro Lucero <alejandro.lucero-palau@amd.com>
Acked-by: Martin Habets <habetsm.xilinx@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/sfc/efx_devlink.c | 95 ++++++++++++++++------------------
 1 file changed, 45 insertions(+), 50 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/sfc/efx_devlink.c b/drivers/net/ethernet/sfc/efx_devlink.c
index 381b805659d3..ef9971cbb695 100644
--- a/drivers/net/ethernet/sfc/efx_devlink.c
+++ b/drivers/net/ethernet/sfc/efx_devlink.c
@@ -171,9 +171,14 @@ static int efx_devlink_info_nvram_partition(struct efx_nic *efx,
 
 	rc = efx_mcdi_nvram_metadata(efx, partition_type, NULL, version, NULL,
 				     0);
+
+	/* If the partition does not exist, that is not an error. */
+	if (rc == -ENOENT)
+		return 0;
+
 	if (rc) {
-		netif_err(efx, drv, efx->net_dev, "mcdi nvram %s: failed\n",
-			  version_name);
+		netif_err(efx, drv, efx->net_dev, "mcdi nvram %s: failed (rc=%d)\n",
+			  version_name, rc);
 		return rc;
 	}
 
@@ -187,36 +192,33 @@ static int efx_devlink_info_nvram_partition(struct efx_nic *efx,
 static int efx_devlink_info_stored_versions(struct efx_nic *efx,
 					    struct devlink_info_req *req)
 {
-	int rc;
-
-	rc = efx_devlink_info_nvram_partition(efx, req,
-					      NVRAM_PARTITION_TYPE_BUNDLE,
-					      DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID);
-	if (rc)
-		return rc;
-
-	rc = efx_devlink_info_nvram_partition(efx, req,
-					      NVRAM_PARTITION_TYPE_MC_FIRMWARE,
-					      DEVLINK_INFO_VERSION_GENERIC_FW_MGMT);
-	if (rc)
-		return rc;
-
-	rc = efx_devlink_info_nvram_partition(efx, req,
-					      NVRAM_PARTITION_TYPE_SUC_FIRMWARE,
-					      EFX_DEVLINK_INFO_VERSION_FW_MGMT_SUC);
-	if (rc)
-		return rc;
-
-	rc = efx_devlink_info_nvram_partition(efx, req,
-					      NVRAM_PARTITION_TYPE_EXPANSION_ROM,
-					      EFX_DEVLINK_INFO_VERSION_FW_EXPROM);
-	if (rc)
-		return rc;
+	int err;
 
-	rc = efx_devlink_info_nvram_partition(efx, req,
-					      NVRAM_PARTITION_TYPE_EXPANSION_UEFI,
-					      EFX_DEVLINK_INFO_VERSION_FW_UEFI);
-	return rc;
+	/* We do not care here about the specific error but just if an error
+	 * happened. The specific error will be reported inside the call
+	 * through system messages, and if any error happened in any call
+	 * below, we report it through extack.
+	 */
+	err = efx_devlink_info_nvram_partition(efx, req,
+					       NVRAM_PARTITION_TYPE_BUNDLE,
+					       DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID);
+
+	err |= efx_devlink_info_nvram_partition(efx, req,
+						NVRAM_PARTITION_TYPE_MC_FIRMWARE,
+						DEVLINK_INFO_VERSION_GENERIC_FW_MGMT);
+
+	err |= efx_devlink_info_nvram_partition(efx, req,
+						NVRAM_PARTITION_TYPE_SUC_FIRMWARE,
+						EFX_DEVLINK_INFO_VERSION_FW_MGMT_SUC);
+
+	err |= efx_devlink_info_nvram_partition(efx, req,
+						NVRAM_PARTITION_TYPE_EXPANSION_ROM,
+						EFX_DEVLINK_INFO_VERSION_FW_EXPROM);
+
+	err |= efx_devlink_info_nvram_partition(efx, req,
+						NVRAM_PARTITION_TYPE_EXPANSION_UEFI,
+						EFX_DEVLINK_INFO_VERSION_FW_UEFI);
+	return err;
 }
 
 #define EFX_VER_FLAG(_f)	\
@@ -587,27 +589,20 @@ static int efx_devlink_info_get(struct devlink *devlink,
 {
 	struct efx_devlink *devlink_private = devlink_priv(devlink);
 	struct efx_nic *efx = devlink_private->efx;
-	int rc;
+	int err;
 
-	/* Several different MCDI commands are used. We report first error
-	 * through extack returning at that point. Specific error
-	 * information via system messages.
+	/* Several different MCDI commands are used. We report if errors
+	 * happened through extack. Specific error information via system
+	 * messages inside the calls.
 	 */
-	rc = efx_devlink_info_board_cfg(efx, req);
-	if (rc) {
-		NL_SET_ERR_MSG_MOD(extack, "Getting board info failed");
-		return rc;
-	}
-	rc = efx_devlink_info_stored_versions(efx, req);
-	if (rc) {
-		NL_SET_ERR_MSG_MOD(extack, "Getting stored versions failed");
-		return rc;
-	}
-	rc = efx_devlink_info_running_versions(efx, req);
-	if (rc) {
-		NL_SET_ERR_MSG_MOD(extack, "Getting running versions failed");
-		return rc;
-	}
+	err = efx_devlink_info_board_cfg(efx, req);
+
+	err |= efx_devlink_info_stored_versions(efx, req);
+
+	err |= efx_devlink_info_running_versions(efx, req);
+
+	if (err)
+		NL_SET_ERR_MSG_MOD(extack, "Errors when getting device info. Check system messages");
 
 	return 0;
 }
-- 
cgit v1.2.3


From de678ca38861f2eb58814048076dcf95ed1b5bf9 Mon Sep 17 00:00:00 2001
From: Sunil Goutham <sgoutham@marvell.com>
Date: Thu, 18 May 2023 12:10:42 +0530
Subject: octeontx2-pf: Fix TSOv6 offload

HW adds segment size to the payload length
in the IPv6 header. Fix payload length to
just TCP header length instead of 'TCP header
size + IPv6 header size'.

Fixes: 86d7476078b8 ("octeontx2-pf: TCP segmentation offload support")
Signed-off-by: Sunil Goutham <sgoutham@marvell.com>
Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
index 7045fedfd73a..7af223b0a37f 100644
--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
+++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
@@ -652,9 +652,7 @@ static void otx2_sqe_add_ext(struct otx2_nic *pfvf, struct otx2_snd_queue *sq,
 				htons(ext->lso_sb - skb_network_offset(skb));
 		} else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) {
 			ext->lso_format = pfvf->hw.lso_tsov6_idx;
-
-			ipv6_hdr(skb)->payload_len =
-				htons(ext->lso_sb - skb_network_offset(skb));
+			ipv6_hdr(skb)->payload_len = htons(tcp_hdrlen(skb));
 		} else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) {
 			__be16 l3_proto = vlan_get_protocol(skb);
 			struct udphdr *udph = udp_hdr(skb);
-- 
cgit v1.2.3


From 9025944fddfed5966c8f102f1fe921ab3aee2c12 Mon Sep 17 00:00:00 2001
From: Shenwei Wang <shenwei.wang@nxp.com>
Date: Thu, 18 May 2023 10:02:02 -0500
Subject: net: fec: add dma_wmb to ensure correct descriptor values

Two dma_wmb() are added in the XDP TX path to ensure proper ordering of
descriptor and buffer updates, and the transmit trigger is moved:
1. A dma_wmb() is added after updating the last BD to make sure
   the updates to the rest of the descriptor are visible before
   transferring ownership to FEC.
2. A dma_wmb() is also added after updating the bdp to ensure these
   updates are visible before updating txq->bd.cur.
3. The xmit of the frame is now started immediately after configuring
   the tx descriptor.

Fixes: 6d6b39f180b8 ("net: fec: add initial XDP support")
Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
Reviewed-by: Wei Fang <wei.fang@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/freescale/fec_main.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index 577d94821b3e..38e5b5abe067 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3834,6 +3834,11 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
 	index = fec_enet_get_bd_index(last_bdp, &txq->bd);
 	txq->tx_skbuff[index] = NULL;
 
+	/* Make sure the updates to rest of the descriptor are performed before
+	 * transferring ownership.
+	 */
+	dma_wmb();
+
 	/* Send it on its way.  Tell FEC it's ready, interrupt when done,
 	 * it's the last BD of the frame, and to put the CRC on the end.
 	 */
@@ -3843,8 +3848,14 @@ static int fec_enet_txq_xmit_frame(struct fec_enet_private *fep,
 	/* If this was the last BD in the ring, start at the beginning again. */
 	bdp = fec_enet_get_nextdesc(last_bdp, &txq->bd);
 
+	/* Make sure the update to bdp are performed before txq->bd.cur. */
+	dma_wmb();
+
 	txq->bd.cur = bdp;
 
+	/* Trigger transmission start */
+	writel(0, txq->bd.reg_desc_active);
+
 	return 0;
 }
 
@@ -3873,12 +3884,6 @@ static int fec_enet_xdp_xmit(struct net_device *dev,
 		sent_frames++;
 	}
 
-	/* Make sure the update to bdp and tx_skbuff are performed. */
-	wmb();
-
-	/* Trigger transmission start */
-	writel(0, txq->bd.reg_desc_active);
-
 	__netif_tx_unlock(nq);
 
 	return sent_frames;
-- 
cgit v1.2.3


From 6ce5169e05aa5360a49a574cc1490ceea6b651a6 Mon Sep 17 00:00:00 2001
From: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Date: Thu, 18 May 2023 22:13:47 +0530
Subject: Bluetooth: btnxpuart: Fix compiler warnings

This fixes the following compiler warning reported by kernel test robot:

  drivers/bluetooth/btnxpuart.c:1332:34: warning: unused variable
  'nxpuart_of_match_table' [-Wunused-const-variable]

Signed-off-by: Neeraj Sanjay Kale <neeraj.sanjaykale@nxp.com>
Reported-by: kernel test robot <lkp@intel.com>
Link: https://lore.kernel.org/oe-kbuild-all/202305161345.eClvTYQ9-lkp@intel.com/
Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
---
 drivers/bluetooth/btnxpuart.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c
index 3a34d7c1475b..52ef44688d38 100644
--- a/drivers/bluetooth/btnxpuart.c
+++ b/drivers/bluetooth/btnxpuart.c
@@ -1319,17 +1319,17 @@ static void nxp_serdev_remove(struct serdev_device *serdev)
 	hci_free_dev(hdev);
 }
 
-static struct btnxpuart_data w8987_data = {
+static struct btnxpuart_data w8987_data __maybe_unused = {
 	.helper_fw_name = NULL,
 	.fw_name = FIRMWARE_W8987,
 };
 
-static struct btnxpuart_data w8997_data = {
+static struct btnxpuart_data w8997_data __maybe_unused = {
 	.helper_fw_name = FIRMWARE_HELPER,
 	.fw_name = FIRMWARE_W8997,
 };
 
-static const struct of_device_id nxpuart_of_match_table[] = {
+static const struct of_device_id nxpuart_of_match_table[] __maybe_unused = {
 	{ .compatible = "nxp,88w8987-bt", .data = &w8987_data },
 	{ .compatible = "nxp,88w8997-bt", .data = &w8997_data },
 	{ }
-- 
cgit v1.2.3


From ae9b15fbe63447bc1d3bba3769f409d17ca6fdf6 Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Wed, 17 May 2023 14:30:10 +0000
Subject: net: fix stack overflow when LRO is disabled for virtual interfaces

When a virtual interface's features are updated, it synchronizes the
updated features to its own lower interfaces.
This propagation logic should work iteratively, not recursively.
But it unexpectedly works recursively due to the netdev notification.
This problem occurs when LRO is disabled, and only for the team and
bonding interface types.

       team0
         |
  +------+------+-----+-----+
  |      |      |     |     |
team1  team2  team3  ...  team200

If team0's LRO feature is updated, it generates the NETDEV_FEAT_CHANGE
event for its own lower interfaces (team1 ~ team200).
This is done by netdev_sync_lower_features().
So, the NETDEV_FEAT_CHANGE notification logic of each lower interface
works iteratively.
But the generated NETDEV_FEAT_CHANGE event is also sent to the upper
interface.
The upper interface (team0) generates the NETDEV_FEAT_CHANGE event for
its own lower interfaces again.
The lower and upper interfaces receive this event and generate this
event again and again.
So, the stack overflow occurs.

But it is not an infinite loop issue,
because netdev_sync_lower_features() updates features before
generating the NETDEV_FEAT_CHANGE event.
Already synchronized lower interfaces skip the notification logic.
So, the problem is just that the iterative logic unexpectedly becomes
recursive due to the notification mechanism.

Reproducer:

ip link add team0 type team
ethtool -K team0 lro on
for i in {1..200}
do
        ip link add team$i master team0 type team
        ethtool -K team$i lro on
done

ethtool -K team0 lro off

In order to fix it, the notifier_ctx member of bonding/team is introduced.

Reported-by: syzbot+60748c96cf5c6df8e581@syzkaller.appspotmail.com
Fixes: fd867d51f889 ("net/core: generic support for disabling netdev features down stack")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://lore.kernel.org/r/20230517143010.3596250-1-ap420073@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/bonding/bond_main.c | 8 +++++++-
 drivers/net/team/team.c         | 7 ++++++-
 include/linux/if_team.h         | 1 +
 include/net/bonding.h           | 1 +
 4 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3fed888629f7..edbaa1444f8e 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3947,7 +3947,11 @@ static int bond_slave_netdev_event(unsigned long event,
 		unblock_netpoll_tx();
 		break;
 	case NETDEV_FEAT_CHANGE:
-		bond_compute_features(bond);
+		if (!bond->notifier_ctx) {
+			bond->notifier_ctx = true;
+			bond_compute_features(bond);
+			bond->notifier_ctx = false;
+		}
 		break;
 	case NETDEV_RESEND_IGMP:
 		/* Propagate to master device */
@@ -6342,6 +6346,8 @@ static int bond_init(struct net_device *bond_dev)
 	if (!bond->wq)
 		return -ENOMEM;
 
+	bond->notifier_ctx = false;
+
 	spin_lock_init(&bond->stats_lock);
 	netdev_lockdep_set_classes(bond_dev);
 
diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c
index d10606f257c4..555b0b1e9a78 100644
--- a/drivers/net/team/team.c
+++ b/drivers/net/team/team.c
@@ -1629,6 +1629,7 @@ static int team_init(struct net_device *dev)
 
 	team->dev = dev;
 	team_set_no_mode(team);
+	team->notifier_ctx = false;
 
 	team->pcpu_stats = netdev_alloc_pcpu_stats(struct team_pcpu_stats);
 	if (!team->pcpu_stats)
@@ -3022,7 +3023,11 @@ static int team_device_event(struct notifier_block *unused,
 		team_del_slave(port->team->dev, dev);
 		break;
 	case NETDEV_FEAT_CHANGE:
-		team_compute_features(port->team);
+		if (!port->team->notifier_ctx) {
+			port->team->notifier_ctx = true;
+			team_compute_features(port->team);
+			port->team->notifier_ctx = false;
+		}
 		break;
 	case NETDEV_PRECHANGEMTU:
 		/* Forbid to change mtu of underlaying device */
diff --git a/include/linux/if_team.h b/include/linux/if_team.h
index fc985e5c739d..8de6b6e67829 100644
--- a/include/linux/if_team.h
+++ b/include/linux/if_team.h
@@ -208,6 +208,7 @@ struct team {
 	bool queue_override_enabled;
 	struct list_head *qom_lists; /* array of queue override mapping lists */
 	bool port_mtu_change_allowed;
+	bool notifier_ctx;
 	struct {
 		unsigned int count;
 		unsigned int interval; /* in ms */
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 0efef2a952b7..59955ac33157 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -221,6 +221,7 @@ struct bonding {
 	struct   bond_up_slave __rcu *usable_slaves;
 	struct   bond_up_slave __rcu *all_slaves;
 	bool     force_primary;
+	bool     notifier_ctx;
 	s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
 	int     (*recv_probe)(const struct sk_buff *, struct bonding *,
 			      struct slave *);
-- 
cgit v1.2.3


From 5b17a4971d3b2a073f4078dd65331efbe35baa2d Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 20 May 2023 10:30:17 +0200
Subject: forcedeth: Fix an error handling path in nv_probe()

If an error occurs after calling nv_mgmt_acquire_sema(), it should be
undone with a corresponding nv_mgmt_release_sema() call.

Add it in the error handling path of the probe as already done in the
remove function.

Fixes: cac1c52c3621 ("forcedeth: mgmt unit interface")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Acked-by: Zhu Yanjun <zyjzyj2000@gmail.com>
Link: https://lore.kernel.org/r/355e9a7d351b32ad897251b6f81b5886fcdc6766.1684571393.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/nvidia/forcedeth.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c
index 0605d1ee490d..7a549b834e97 100644
--- a/drivers/net/ethernet/nvidia/forcedeth.c
+++ b/drivers/net/ethernet/nvidia/forcedeth.c
@@ -6138,6 +6138,7 @@ static int nv_probe(struct pci_dev *pci_dev, const struct pci_device_id *id)
 	return 0;
 
 out_error:
+	nv_mgmt_release_sema(dev);
 	if (phystate_orig)
 		writel(phystate|NVREG_ADAPTCTL_RUNNING, base + NvRegAdapterControl);
 out_freering:
-- 
cgit v1.2.3


From 640bf95b2c7c2981fb471acdafbd3e0458f8390d Mon Sep 17 00:00:00 2001
From: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Date: Sat, 20 May 2023 11:48:55 +0200
Subject: 3c589_cs: Fix an error handling path in tc589_probe()

Should tc589_config() fail, some resources need to be released as already
done in the remove function.

Fixes: 15b99ac17295 ("[PATCH] pcmcia: add return value to _config() functions")
Signed-off-by: Christophe JAILLET <christophe.jaillet@wanadoo.fr>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Link: https://lore.kernel.org/r/d8593ae867b24c79063646e36f9b18b0790107cb.1684575975.git.christophe.jaillet@wanadoo.fr
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/3com/3c589_cs.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/3com/3c589_cs.c b/drivers/net/ethernet/3com/3c589_cs.c
index 82f94b1635bf..5267e9dcd87e 100644
--- a/drivers/net/ethernet/3com/3c589_cs.c
+++ b/drivers/net/ethernet/3com/3c589_cs.c
@@ -195,6 +195,7 @@ static int tc589_probe(struct pcmcia_device *link)
 {
 	struct el3_private *lp;
 	struct net_device *dev;
+	int ret;
 
 	dev_dbg(&link->dev, "3c589_attach()\n");
 
@@ -218,7 +219,15 @@ static int tc589_probe(struct pcmcia_device *link)
 
 	dev->ethtool_ops = &netdev_ethtool_ops;
 
-	return tc589_config(link);
+	ret = tc589_config(link);
+	if (ret)
+		goto err_free_netdev;
+
+	return 0;
+
+err_free_netdev:
+	free_netdev(dev);
+	return ret;
 }
 
 static void tc589_detach(struct pcmcia_device *link)
-- 
cgit v1.2.3


From 2a0a935fb64ee8af253b9c6133bb6702fb152ac2 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Tue, 2 May 2023 11:03:53 +0300
Subject: net/mlx5: Collect command failures data only for known commands

DEVX can issue a general command which is not used by the mlx5 driver.
If such a command fails, mlx5 tries to collect the failure data.
However, mlx5 doesn't create storage for this command, since mlx5
doesn't use it. This leads to an array-index-out-of-bounds error.

Fix it by checking whether the command is known before collecting the
failure data.

Fixes: 34f46ae0d4b3 ("net/mlx5: Add command failures data to debugfs")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
index d53de39539a8..d532883b42d7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
@@ -1920,9 +1920,10 @@ static void mlx5_cmd_err_trace(struct mlx5_core_dev *dev, u16 opcode, u16 op_mod
 static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
 			   u32 syndrome, int err)
 {
+	const char *namep = mlx5_command_str(opcode);
 	struct mlx5_cmd_stats *stats;
 
-	if (!err)
+	if (!err || !(strcmp(namep, "unknown command opcode")))
 		return;
 
 	stats = &dev->cmd.stats[opcode];
-- 
cgit v1.2.3


From 2be5bd42a5bba1a05daedc86cf0e248210009669 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Mon, 20 Mar 2023 13:07:53 +0200
Subject: net/mlx5: Handle pairing of E-switch via uplink un/load APIs

When the user switches a device from switchdev mode to legacy mode, mlx5
first unpairs the E-switch and afterwards unloads the uplink vport.
On the other hand, when the user removes or reloads a device, mlx5
first unloads the uplink vport and afterwards unpairs the E-switch.

The latter causes a bug[1]; hence, handle pairing of the E-switch as
part of the uplink un/load APIs.

[1]
In case VF_LAG is used, every tc fdb flow is duplicated to the peer
esw. However, the original esw keeps a pointer to this duplicated
flow, not the peer esw.
E.g.: if the user creates a tc fdb flow over esw0, the flow is
duplicated over esw1 in FW/HW, but in SW, esw0 keeps a pointer to the
duplicated flow.
During module unload while a peer tc fdb flow is still offloaded, in
case the first device to be removed is the peer device (esw1 in the
example above), the peer net-dev is destroyed, and so the mlx5e_priv
is memset to 0.
Afterwards, the peer device tries to unpair itself from the
original device (esw0 in the example above). The unpair API invokes the
original device to clear the peer flow from its eswitch (esw0), but the
peer flow, which is stored over the original eswitch (esw0),
tries to use the peer mlx5e_priv, which is memset to 0, resulting in
the kernel oops below.

[  157.964081 ] BUG: unable to handle page fault for address: 000000000002ce60
[  157.964662 ] #PF: supervisor read access in kernel mode
[  157.965123 ] #PF: error_code(0x0000) - not-present page
[  157.965582 ] PGD 0 P4D 0
[  157.965866 ] Oops: 0000 [#1] SMP
[  157.967670 ] RIP: 0010:mlx5e_tc_del_fdb_flow+0x48/0x460 [mlx5_core]
[  157.976164 ] Call Trace:
[  157.976437 ]  <TASK>
[  157.976690 ]  __mlx5e_tc_del_fdb_peer_flow+0xe6/0x100 [mlx5_core]
[  157.977230 ]  mlx5e_tc_clean_fdb_peer_flows+0x67/0x90 [mlx5_core]
[  157.977767 ]  mlx5_esw_offloads_unpair+0x2d/0x1e0 [mlx5_core]
[  157.984653 ]  mlx5_esw_offloads_devcom_event+0xbf/0x130 [mlx5_core]
[  157.985212 ]  mlx5_devcom_send_event+0xa3/0xb0 [mlx5_core]
[  157.985714 ]  esw_offloads_disable+0x5a/0x110 [mlx5_core]
[  157.986209 ]  mlx5_eswitch_disable_locked+0x152/0x170 [mlx5_core]
[  157.986757 ]  mlx5_eswitch_disable+0x51/0x80 [mlx5_core]
[  157.987248 ]  mlx5_unload+0x2a/0xb0 [mlx5_core]
[  157.987678 ]  mlx5_uninit_one+0x5f/0xd0 [mlx5_core]
[  157.988127 ]  remove_one+0x64/0xe0 [mlx5_core]
[  157.988549 ]  pci_device_remove+0x31/0xa0
[  157.988933 ]  device_release_driver_internal+0x18f/0x1f0
[  157.989402 ]  driver_detach+0x3f/0x80
[  157.989754 ]  bus_remove_driver+0x70/0xf0
[  157.990129 ]  pci_unregister_driver+0x34/0x90
[  157.990537 ]  mlx5_cleanup+0xc/0x1c [mlx5_core]
[  157.990972 ]  __x64_sys_delete_module+0x15a/0x250
[  157.991398 ]  ? exit_to_user_mode_prepare+0xea/0x110
[  157.991840 ]  do_syscall_64+0x3d/0x90
[  157.992198 ]  entry_SYSCALL_64_after_hwframe+0x46/0xb0

Fixes: 04de7dda7394 ("net/mlx5e: Infrastructure for duplicated offloading of TC flows")
Fixes: 1418ddd96afd ("net/mlx5e: Duplicate offloaded TC eswitch rules under uplink LAG")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c            | 4 +++-
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 4 ++++
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 7 ++-----
 3 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 728b82ce4031..65fe40f55d84 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -5301,6 +5301,8 @@ int mlx5e_tc_esw_init(struct mlx5_rep_uplink_priv *uplink_priv)
 		goto err_action_counter;
 	}
 
+	mlx5_esw_offloads_devcom_init(esw);
+
 	return 0;
 
 err_action_counter:
@@ -5329,7 +5331,7 @@ void mlx5e_tc_esw_cleanup(struct mlx5_rep_uplink_priv *uplink_priv)
 	priv = netdev_priv(rpriv->netdev);
 	esw = priv->mdev->priv.eswitch;
 
-	mlx5e_tc_clean_fdb_peer_flows(esw);
+	mlx5_esw_offloads_devcom_cleanup(esw);
 
 	mlx5e_tc_tun_cleanup(uplink_priv->encap);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 1a042c981713..9f007c5438ee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -369,6 +369,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs);
 void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf);
 void mlx5_eswitch_disable_locked(struct mlx5_eswitch *esw);
 void mlx5_eswitch_disable(struct mlx5_eswitch *esw);
+void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw);
+void mlx5_esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw);
 int mlx5_eswitch_set_vport_mac(struct mlx5_eswitch *esw,
 			       u16 vport, const u8 *mac);
 int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw,
@@ -767,6 +769,8 @@ static inline void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw) {}
 static inline int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs) { return 0; }
 static inline void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf) {}
 static inline void mlx5_eswitch_disable(struct mlx5_eswitch *esw) {}
+static inline void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw) {}
+static inline void mlx5_esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw) {}
 static inline bool mlx5_eswitch_is_funcs_handler(struct mlx5_core_dev *dev) { return false; }
 static inline
 int mlx5_eswitch_set_vport_state(struct mlx5_eswitch *esw, u16 vport, int link_state) { return 0; }
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 69215ffb9999..7c34c7cf506f 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2779,7 +2779,7 @@ err_out:
 	return err;
 }
 
-static void esw_offloads_devcom_init(struct mlx5_eswitch *esw)
+void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw)
 {
 	struct mlx5_devcom *devcom = esw->dev->priv.devcom;
 
@@ -2802,7 +2802,7 @@ static void esw_offloads_devcom_init(struct mlx5_eswitch *esw)
 			       ESW_OFFLOADS_DEVCOM_PAIR, esw);
 }
 
-static void esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw)
+void mlx5_esw_offloads_devcom_cleanup(struct mlx5_eswitch *esw)
 {
 	struct mlx5_devcom *devcom = esw->dev->priv.devcom;
 
@@ -3250,8 +3250,6 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
 	if (err)
 		goto err_vports;
 
-	esw_offloads_devcom_init(esw);
-
 	return 0;
 
 err_vports:
@@ -3292,7 +3290,6 @@ static int esw_offloads_stop(struct mlx5_eswitch *esw,
 
 void esw_offloads_disable(struct mlx5_eswitch *esw)
 {
-	esw_offloads_devcom_cleanup(esw);
 	mlx5_eswitch_disable_pf_vf_vports(esw);
 	esw_offloads_unload_rep(esw, MLX5_VPORT_UPLINK);
 	esw_set_passing_vport_metadata(esw, false);
-- 
cgit v1.2.3


From 1e5daf5565b61a96e570865091589afc9156e3d3 Mon Sep 17 00:00:00 2001
From: Erez Shitrit <erezsh@nvidia.com>
Date: Thu, 9 Mar 2023 16:43:15 +0200
Subject: net/mlx5: DR, Fix crc32 calculation to work on big-endian (BE) CPUs

When calculating crc for hash index we use the function crc32 that
calculates for little-endian (LE) arch.
Then we convert it to network endianness using htonl(), but it's wrong
to do the conversion in BE archs since the crc32 value is already LE.

The solution is to switch the bytes from the crc result for all types
of arch.

Fixes: 40416d8ede65 ("net/mlx5: DR, Replace CRC32 implementation to use kernel lib")
Signed-off-by: Erez Shitrit <erezsh@nvidia.com>
Reviewed-by: Alex Vesker <valex@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c
index 9413aaf51251..e94fbb015efa 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste.c
@@ -15,7 +15,8 @@ static u32 dr_ste_crc32_calc(const void *input_data, size_t length)
 {
 	u32 crc = crc32(0, input_data, length);
 
-	return (__force u32)htonl(crc);
+	return (__force u32)((crc >> 24) & 0xff) | ((crc << 8) & 0xff0000) |
+			    ((crc >> 8) & 0xff00) | ((crc << 24) & 0xff000000);
 }
 
 bool mlx5dr_ste_supp_ttl_cs_recalc(struct mlx5dr_cmd_caps *caps)
-- 
cgit v1.2.3


From c7dd225bc224726c22db08e680bf787f60ebdee3 Mon Sep 17 00:00:00 2001
From: Yevgeny Kliteynik <kliteyn@nvidia.com>
Date: Sun, 2 Apr 2023 17:14:10 +0300
Subject: net/mlx5: DR, Check force-loopback RC QP capability independently
 from RoCE

SW Steering uses RC QP for writing STEs to ICM. This writing is done in LB
(loopback), and FL (force-loopback) QP is preferred for performance. FL is
available when RoCE is enabled or disabled based on RoCE caps.
This patch adds reading of FL capability from HCA caps in addition to the
existing reading from RoCE caps, thus fixing the case where we didn't
have loopback enabled when RoCE was disabled.

Fixes: 7304d603a57a ("net/mlx5: DR, Add support for force-loopback QP")
Signed-off-by: Itamar Gozlan <igozlan@nvidia.com>
Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c | 4 +++-
 include/linux/mlx5/mlx5_ifc.h                             | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
index 3835ba3f4dda..1aa525e509f1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_cmd.c
@@ -117,6 +117,8 @@ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev,
 	caps->gvmi		= MLX5_CAP_GEN(mdev, vhca_id);
 	caps->flex_protocols	= MLX5_CAP_GEN(mdev, flex_parser_protocols);
 	caps->sw_format_ver	= MLX5_CAP_GEN(mdev, steering_format_version);
+	caps->roce_caps.fl_rc_qp_when_roce_disabled =
+		MLX5_CAP_GEN(mdev, fl_rc_qp_when_roce_disabled);
 
 	if (MLX5_CAP_GEN(mdev, roce)) {
 		err = dr_cmd_query_nic_vport_roce_en(mdev, 0, &roce_en);
@@ -124,7 +126,7 @@ int mlx5dr_cmd_query_device(struct mlx5_core_dev *mdev,
 			return err;
 
 		caps->roce_caps.roce_en = roce_en;
-		caps->roce_caps.fl_rc_qp_when_roce_disabled =
+		caps->roce_caps.fl_rc_qp_when_roce_disabled |=
 			MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_disabled);
 		caps->roce_caps.fl_rc_qp_when_roce_enabled =
 			MLX5_CAP_ROCE(mdev, fl_rc_qp_when_roce_enabled);
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index dc5e2cb302a5..b89778d0d326 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -1705,7 +1705,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
 	u8         rc[0x1];
 
 	u8         uar_4k[0x1];
-	u8         reserved_at_241[0x9];
+	u8         reserved_at_241[0x7];
+	u8         fl_rc_qp_when_roce_disabled[0x1];
+	u8         regexp_params[0x1];
 	u8         uar_sz[0x6];
 	u8         port_selection_cap[0x1];
 	u8         reserved_at_248[0x1];
-- 
cgit v1.2.3


From be071cdb167fc3e25fe81922166b3d499d23e8ac Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Mon, 3 Apr 2023 22:26:00 +0200
Subject: net/mlx5e: Use correct encap attribute during invalidation

With introduction of post action infrastructure most of the users of encap
attribute had been modified in order to obtain the correct attribute by
calling mlx5e_tc_get_encap_attr() helper instead of assuming encap action
is always on default attribute. However, the cited commit didn't modify
mlx5e_invalidate_encap() which prevents it from destroying correct modify
header action which leads to a warning [0]. Fix the issue by using correct
attribute.

[0]:

Feb 21 09:47:35 c-237-177-40-045 kernel: WARNING: CPU: 17 PID: 654 at drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:684 mlx5e_tc_attach_mod_hdr+0x1cc/0x230 [mlx5_core]
Feb 21 09:47:35 c-237-177-40-045 kernel: RIP: 0010:mlx5e_tc_attach_mod_hdr+0x1cc/0x230 [mlx5_core]
Feb 21 09:47:35 c-237-177-40-045 kernel: Call Trace:
Feb 21 09:47:35 c-237-177-40-045 kernel:  <TASK>
Feb 21 09:47:35 c-237-177-40-045 kernel:  mlx5e_tc_fib_event_work+0x8e3/0x1f60 [mlx5_core]
Feb 21 09:47:35 c-237-177-40-045 kernel:  ? mlx5e_take_all_encap_flows+0xe0/0xe0 [mlx5_core]
Feb 21 09:47:35 c-237-177-40-045 kernel:  ? lock_downgrade+0x6d0/0x6d0
Feb 21 09:47:35 c-237-177-40-045 kernel:  ? lockdep_hardirqs_on_prepare+0x273/0x3f0
Feb 21 09:47:35 c-237-177-40-045 kernel:  ? lockdep_hardirqs_on_prepare+0x273/0x3f0
Feb 21 09:47:35 c-237-177-40-045 kernel:  process_one_work+0x7c2/0x1310
Feb 21 09:47:35 c-237-177-40-045 kernel:  ? lockdep_hardirqs_on_prepare+0x3f0/0x3f0
Feb 21 09:47:35 c-237-177-40-045 kernel:  ? pwq_dec_nr_in_flight+0x230/0x230
Feb 21 09:47:35 c-237-177-40-045 kernel:  ? rwlock_bug.part.0+0x90/0x90
Feb 21 09:47:35 c-237-177-40-045 kernel:  worker_thread+0x59d/0xec0
Feb 21 09:47:35 c-237-177-40-045 kernel:  ? __kthread_parkme+0xd9/0x1d0

Fixes: 8300f225268b ("net/mlx5e: Create new flow attr for multi table actions")
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
index 20c2d2ecaf93..6a052c6cfc15 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
@@ -1369,11 +1369,13 @@ static void mlx5e_invalidate_encap(struct mlx5e_priv *priv,
 	struct mlx5e_tc_flow *flow;
 
 	list_for_each_entry(flow, encap_flows, tmp_list) {
-		struct mlx5_flow_attr *attr = flow->attr;
 		struct mlx5_esw_flow_attr *esw_attr;
+		struct mlx5_flow_attr *attr;
 
 		if (!mlx5e_is_offloaded_flow(flow))
 			continue;
+
+		attr = mlx5e_tc_get_encap_attr(flow);
 		esw_attr = attr->esw_attr;
 
 		if (flow_flag_test(flow, SLOW))
-- 
cgit v1.2.3


From a65735148e0328f80c0f72f9f8d2f609bfcf4aff Mon Sep 17 00:00:00 2001
From: Roi Dayan <roid@nvidia.com>
Date: Mon, 1 May 2023 14:37:56 +0300
Subject: net/mlx5: Fix error message when failing to allocate device memory

Fix the spacing in the error message and print the error code taken from
the correct pointer (PTR_ERR(dev->dm) instead of the unrelated 'err').

Fixes: c9b9dcb430b3 ("net/mlx5: Move device memory management to mlx5_core")
Signed-off-by: Roi Dayan <roid@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 995eb2d5ace0..a7eb65cd0bdd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1049,7 +1049,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
 
 	dev->dm = mlx5_dm_create(dev);
 	if (IS_ERR(dev->dm))
-		mlx5_core_warn(dev, "Failed to init device memory%d\n", err);
+		mlx5_core_warn(dev, "Failed to init device memory %ld\n", PTR_ERR(dev->dm));
 
 	dev->tracer = mlx5_fw_tracer_create(dev);
 	dev->hv_vhca = mlx5_hv_vhca_create(dev);
-- 
cgit v1.2.3


From 691c041bf20899fc13c793f92ba61ab660fa3a30 Mon Sep 17 00:00:00 2001
From: Vlad Buslov <vladbu@nvidia.com>
Date: Fri, 31 Mar 2023 14:20:51 +0200
Subject: net/mlx5e: Fix deadlock in tc route query code

Cited commit causes ABBA deadlock[0] when peer flows are created while
holding the devcom rw semaphore. Due to peer flows offload implementation
the lock is taken much higher up the call chain and there is no obvious way
to easily fix the deadlock. Instead, since tc route query code needs the
peer eswitch structure only to perform a lookup in xarray and doesn't
perform any sleeping operations with it, refactor the code for lockless
execution in following ways:

- RCUify the devcom 'data' pointer. When resetting the pointer
synchronously wait for RCU grace period before returning. This is fine
since devcom is currently only used for synchronization of
pairing/unpairing of eswitches which is rare and already expensive as-is.

- Wrap all usages of 'paired' boolean in {READ|WRITE}_ONCE(). The flag has
already been used in some unlocked contexts without proper
annotations (e.g. users of mlx5_devcom_is_paired() function), but it wasn't
an issue since all relevant code paths checked it again after obtaining the
devcom semaphore. Now it is also used by mlx5_devcom_get_peer_data_rcu() as
"best effort" check to return NULL when devcom is being unpaired. Note that
while RCU read lock doesn't prevent the 'paired' flag from being changed
concurrently, it still guarantees that the reader can continue to use 'data'.

- Refactor mlx5e_tc_query_route_vport() function to use new
mlx5_devcom_get_peer_data_rcu() API which fixes the deadlock.

[0]:

[  164.599612] ======================================================
[  164.600142] WARNING: possible circular locking dependency detected
[  164.600667] 6.3.0-rc3+ #1 Not tainted
[  164.601021] ------------------------------------------------------
[  164.601557] handler1/3456 is trying to acquire lock:
[  164.601998] ffff88811f1714b0 (&esw->offloads.encap_tbl_lock){+.+.}-{3:3}, at: mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[  164.603078]
               but task is already holding lock:
[  164.603617] ffff88810137fc98 (&comp->sem){++++}-{3:3}, at: mlx5_devcom_get_peer_data+0x37/0x80 [mlx5_core]
[  164.604459]
               which lock already depends on the new lock.

[  164.605190]
               the existing dependency chain (in reverse order) is:
[  164.605848]
               -> #1 (&comp->sem){++++}-{3:3}:
[  164.606380]        down_read+0x39/0x50
[  164.606772]        mlx5_devcom_get_peer_data+0x37/0x80 [mlx5_core]
[  164.607336]        mlx5e_tc_query_route_vport+0x86/0xc0 [mlx5_core]
[  164.607914]        mlx5e_tc_tun_route_lookup+0x1a4/0x1d0 [mlx5_core]
[  164.608495]        mlx5e_attach_decap_route+0xc6/0x1e0 [mlx5_core]
[  164.609063]        mlx5e_tc_add_fdb_flow+0x1ea/0x360 [mlx5_core]
[  164.609627]        __mlx5e_add_fdb_flow+0x2d2/0x430 [mlx5_core]
[  164.610175]        mlx5e_configure_flower+0x952/0x1a20 [mlx5_core]
[  164.610741]        tc_setup_cb_add+0xd4/0x200
[  164.611146]        fl_hw_replace_filter+0x14c/0x1f0 [cls_flower]
[  164.611661]        fl_change+0xc95/0x18a0 [cls_flower]
[  164.612116]        tc_new_tfilter+0x3fc/0xd20
[  164.612516]        rtnetlink_rcv_msg+0x418/0x5b0
[  164.612936]        netlink_rcv_skb+0x54/0x100
[  164.613339]        netlink_unicast+0x190/0x250
[  164.613746]        netlink_sendmsg+0x245/0x4a0
[  164.614150]        sock_sendmsg+0x38/0x60
[  164.614522]        ____sys_sendmsg+0x1d0/0x1e0
[  164.614934]        ___sys_sendmsg+0x80/0xc0
[  164.615320]        __sys_sendmsg+0x51/0x90
[  164.615701]        do_syscall_64+0x3d/0x90
[  164.616083]        entry_SYSCALL_64_after_hwframe+0x46/0xb0
[  164.616568]
               -> #0 (&esw->offloads.encap_tbl_lock){+.+.}-{3:3}:
[  164.617210]        __lock_acquire+0x159e/0x26e0
[  164.617638]        lock_acquire+0xc2/0x2a0
[  164.618018]        __mutex_lock+0x92/0xcd0
[  164.618401]        mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[  164.618943]        post_process_attr+0x153/0x2d0 [mlx5_core]
[  164.619471]        mlx5e_tc_add_fdb_flow+0x164/0x360 [mlx5_core]
[  164.620021]        __mlx5e_add_fdb_flow+0x2d2/0x430 [mlx5_core]
[  164.620564]        mlx5e_configure_flower+0xe33/0x1a20 [mlx5_core]
[  164.621125]        tc_setup_cb_add+0xd4/0x200
[  164.621531]        fl_hw_replace_filter+0x14c/0x1f0 [cls_flower]
[  164.622047]        fl_change+0xc95/0x18a0 [cls_flower]
[  164.622500]        tc_new_tfilter+0x3fc/0xd20
[  164.622906]        rtnetlink_rcv_msg+0x418/0x5b0
[  164.623324]        netlink_rcv_skb+0x54/0x100
[  164.623727]        netlink_unicast+0x190/0x250
[  164.624138]        netlink_sendmsg+0x245/0x4a0
[  164.624544]        sock_sendmsg+0x38/0x60
[  164.624919]        ____sys_sendmsg+0x1d0/0x1e0
[  164.625340]        ___sys_sendmsg+0x80/0xc0
[  164.625731]        __sys_sendmsg+0x51/0x90
[  164.626117]        do_syscall_64+0x3d/0x90
[  164.626502]        entry_SYSCALL_64_after_hwframe+0x46/0xb0
[  164.626995]
               other info that might help us debug this:

[  164.627725]  Possible unsafe locking scenario:

[  164.628268]        CPU0                    CPU1
[  164.628683]        ----                    ----
[  164.629098]   lock(&comp->sem);
[  164.629421]                                lock(&esw->offloads.encap_tbl_lock);
[  164.630066]                                lock(&comp->sem);
[  164.630555]   lock(&esw->offloads.encap_tbl_lock);
[  164.630993]
                *** DEADLOCK ***

[  164.631575] 3 locks held by handler1/3456:
[  164.631962]  #0: ffff888124b75130 (&block->cb_lock){++++}-{3:3}, at: tc_setup_cb_add+0x5b/0x200
[  164.632703]  #1: ffff888116e512b8 (&esw->mode_lock){++++}-{3:3}, at: mlx5_esw_hold+0x39/0x50 [mlx5_core]
[  164.633552]  #2: ffff88810137fc98 (&comp->sem){++++}-{3:3}, at: mlx5_devcom_get_peer_data+0x37/0x80 [mlx5_core]
[  164.634435]
               stack backtrace:
[  164.634883] CPU: 17 PID: 3456 Comm: handler1 Not tainted 6.3.0-rc3+ #1
[  164.635431] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[  164.636340] Call Trace:
[  164.636616]  <TASK>
[  164.636863]  dump_stack_lvl+0x47/0x70
[  164.637217]  check_noncircular+0xfe/0x110
[  164.637601]  __lock_acquire+0x159e/0x26e0
[  164.637977]  ? mlx5_cmd_set_fte+0x5b0/0x830 [mlx5_core]
[  164.638472]  lock_acquire+0xc2/0x2a0
[  164.638828]  ? mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[  164.639339]  ? lock_is_held_type+0x98/0x110
[  164.639728]  __mutex_lock+0x92/0xcd0
[  164.640074]  ? mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[  164.640576]  ? __lock_acquire+0x382/0x26e0
[  164.640958]  ? mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[  164.641468]  ? mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[  164.641965]  mlx5e_attach_encap+0xd8/0x8b0 [mlx5_core]
[  164.642454]  ? lock_release+0xbf/0x240
[  164.642819]  post_process_attr+0x153/0x2d0 [mlx5_core]
[  164.643318]  mlx5e_tc_add_fdb_flow+0x164/0x360 [mlx5_core]
[  164.643835]  __mlx5e_add_fdb_flow+0x2d2/0x430 [mlx5_core]
[  164.644340]  mlx5e_configure_flower+0xe33/0x1a20 [mlx5_core]
[  164.644862]  ? lock_acquire+0xc2/0x2a0
[  164.645219]  tc_setup_cb_add+0xd4/0x200
[  164.645588]  fl_hw_replace_filter+0x14c/0x1f0 [cls_flower]
[  164.646067]  fl_change+0xc95/0x18a0 [cls_flower]
[  164.646488]  tc_new_tfilter+0x3fc/0xd20
[  164.646861]  ? tc_del_tfilter+0x810/0x810
[  164.647236]  rtnetlink_rcv_msg+0x418/0x5b0
[  164.647621]  ? rtnl_setlink+0x160/0x160
[  164.647982]  netlink_rcv_skb+0x54/0x100
[  164.648348]  netlink_unicast+0x190/0x250
[  164.648722]  netlink_sendmsg+0x245/0x4a0
[  164.649090]  sock_sendmsg+0x38/0x60
[  164.649434]  ____sys_sendmsg+0x1d0/0x1e0
[  164.649804]  ? copy_msghdr_from_user+0x6d/0xa0
[  164.650213]  ___sys_sendmsg+0x80/0xc0
[  164.650563]  ? lock_acquire+0xc2/0x2a0
[  164.650926]  ? lock_acquire+0xc2/0x2a0
[  164.651286]  ? __fget_files+0x5/0x190
[  164.651644]  ? find_held_lock+0x2b/0x80
[  164.652006]  ? __fget_files+0xb9/0x190
[  164.652365]  ? lock_release+0xbf/0x240
[  164.652723]  ? __fget_files+0xd3/0x190
[  164.653079]  __sys_sendmsg+0x51/0x90
[  164.653435]  do_syscall_64+0x3d/0x90
[  164.653784]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
[  164.654229] RIP: 0033:0x7f378054f8bd
[  164.654577] Code: 28 89 54 24 1c 48 89 74 24 10 89 7c 24 08 e8 6a c3 f4 ff 8b 54 24 1c 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 33 44 89 c7 48 89 44 24 08 e8 be c3 f4 ff 48
[  164.656041] RSP: 002b:00007f377fa114b0 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
[  164.656701] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f378054f8bd
[  164.657297] RDX: 0000000000000000 RSI: 00007f377fa11540 RDI: 0000000000000014
[  164.657885] RBP: 00007f377fa12278 R08: 0000000000000000 R09: 000000000000015c
[  164.658472] R10: 00007f377fa123d0 R11: 0000000000000293 R12: 0000560962d99bd0
[  164.665317] R13: 0000000000000000 R14: 0000560962d99bd0 R15: 00007f377fa11540

Fixes: f9d196bd632b ("net/mlx5e: Use correct eswitch for stack devices with lag")
Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
Reviewed-by: Roi Dayan <roid@nvidia.com>
Reviewed-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    | 19 +++++----
 .../net/ethernet/mellanox/mlx5/core/lib/devcom.c   | 48 +++++++++++++++++-----
 .../net/ethernet/mellanox/mlx5/core/lib/devcom.h   |  1 +
 3 files changed, 48 insertions(+), 20 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 65fe40f55d84..416ab6b6da97 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -1665,11 +1665,9 @@ bool mlx5e_tc_is_vf_tunnel(struct net_device *out_dev, struct net_device *route_
 int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev, u16 *vport)
 {
 	struct mlx5e_priv *out_priv, *route_priv;
-	struct mlx5_devcom *devcom = NULL;
 	struct mlx5_core_dev *route_mdev;
 	struct mlx5_eswitch *esw;
 	u16 vhca_id;
-	int err;
 
 	out_priv = netdev_priv(out_dev);
 	esw = out_priv->mdev->priv.eswitch;
@@ -1678,6 +1676,9 @@ int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *ro
 
 	vhca_id = MLX5_CAP_GEN(route_mdev, vhca_id);
 	if (mlx5_lag_is_active(out_priv->mdev)) {
+		struct mlx5_devcom *devcom;
+		int err;
+
 		/* In lag case we may get devices from different eswitch instances.
 		 * If we failed to get vport num, it means, mostly, that we on the wrong
 		 * eswitch.
@@ -1686,16 +1687,16 @@ int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *ro
 		if (err != -ENOENT)
 			return err;
 
+		rcu_read_lock();
 		devcom = out_priv->mdev->priv.devcom;
-		esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
-		if (!esw)
-			return -ENODEV;
+		esw = mlx5_devcom_get_peer_data_rcu(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
+		err = esw ? mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport) : -ENODEV;
+		rcu_read_unlock();
+
+		return err;
 	}
 
-	err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport);
-	if (devcom)
-		mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
-	return err;
+	return mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport);
 }
 
 static int
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
index adefde3ea941..070d55f13419 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
@@ -13,7 +13,7 @@ static LIST_HEAD(devcom_list);
 
 struct mlx5_devcom_component {
 	struct {
-		void *data;
+		void __rcu *data;
 	} device[MLX5_DEVCOM_PORTS_SUPPORTED];
 
 	mlx5_devcom_event_handler_t handler;
@@ -162,7 +162,7 @@ void mlx5_devcom_register_component(struct mlx5_devcom *devcom,
 	comp = &devcom->priv->components[id];
 	down_write(&comp->sem);
 	comp->handler = handler;
-	comp->device[devcom->idx].data = data;
+	rcu_assign_pointer(comp->device[devcom->idx].data, data);
 	up_write(&comp->sem);
 }
 
@@ -176,8 +176,9 @@ void mlx5_devcom_unregister_component(struct mlx5_devcom *devcom,
 
 	comp = &devcom->priv->components[id];
 	down_write(&comp->sem);
-	comp->device[devcom->idx].data = NULL;
+	RCU_INIT_POINTER(comp->device[devcom->idx].data, NULL);
 	up_write(&comp->sem);
+	synchronize_rcu();
 }
 
 int mlx5_devcom_send_event(struct mlx5_devcom *devcom,
@@ -193,12 +194,15 @@ int mlx5_devcom_send_event(struct mlx5_devcom *devcom,
 
 	comp = &devcom->priv->components[id];
 	down_write(&comp->sem);
-	for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++)
-		if (i != devcom->idx && comp->device[i].data) {
-			err = comp->handler(event, comp->device[i].data,
-					    event_data);
+	for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++) {
+		void *data = rcu_dereference_protected(comp->device[i].data,
+						       lockdep_is_held(&comp->sem));
+
+		if (i != devcom->idx && data) {
+			err = comp->handler(event, data, event_data);
 			break;
 		}
+	}
 
 	up_write(&comp->sem);
 	return err;
@@ -213,7 +217,7 @@ void mlx5_devcom_set_paired(struct mlx5_devcom *devcom,
 	comp = &devcom->priv->components[id];
 	WARN_ON(!rwsem_is_locked(&comp->sem));
 
-	comp->paired = paired;
+	WRITE_ONCE(comp->paired, paired);
 }
 
 bool mlx5_devcom_is_paired(struct mlx5_devcom *devcom,
@@ -222,7 +226,7 @@ bool mlx5_devcom_is_paired(struct mlx5_devcom *devcom,
 	if (IS_ERR_OR_NULL(devcom))
 		return false;
 
-	return devcom->priv->components[id].paired;
+	return READ_ONCE(devcom->priv->components[id].paired);
 }
 
 void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom,
@@ -236,7 +240,7 @@ void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom,
 
 	comp = &devcom->priv->components[id];
 	down_read(&comp->sem);
-	if (!comp->paired) {
+	if (!READ_ONCE(comp->paired)) {
 		up_read(&comp->sem);
 		return NULL;
 	}
@@ -245,7 +249,29 @@ void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom,
 		if (i != devcom->idx)
 			break;
 
-	return comp->device[i].data;
+	return rcu_dereference_protected(comp->device[i].data, lockdep_is_held(&comp->sem));
+}
+
+void *mlx5_devcom_get_peer_data_rcu(struct mlx5_devcom *devcom, enum mlx5_devcom_components id)
+{
+	struct mlx5_devcom_component *comp;
+	int i;
+
+	if (IS_ERR_OR_NULL(devcom))
+		return NULL;
+
+	for (i = 0; i < MLX5_DEVCOM_PORTS_SUPPORTED; i++)
+		if (i != devcom->idx)
+			break;
+
+	comp = &devcom->priv->components[id];
+	/* This can change concurrently, however 'data' pointer will remain
+	 * valid for the duration of RCU read section.
+	 */
+	if (!READ_ONCE(comp->paired))
+		return NULL;
+
+	return rcu_dereference(comp->device[i].data);
 }
 
 void mlx5_devcom_release_peer_data(struct mlx5_devcom *devcom,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
index 94313c18bb64..9a496f4722da 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.h
@@ -41,6 +41,7 @@ bool mlx5_devcom_is_paired(struct mlx5_devcom *devcom,
 
 void *mlx5_devcom_get_peer_data(struct mlx5_devcom *devcom,
 				enum mlx5_devcom_components id);
+void *mlx5_devcom_get_peer_data_rcu(struct mlx5_devcom *devcom, enum mlx5_devcom_components id);
 void mlx5_devcom_release_peer_data(struct mlx5_devcom *devcom,
 				   enum mlx5_devcom_components id);
 
-- 
cgit v1.2.3


From 7aa50380191635e5897a773f272829cc961a2be5 Mon Sep 17 00:00:00 2001
From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Date: Tue, 21 Feb 2023 16:18:48 -0800
Subject: net/mlx5e: Fix SQ wake logic in ptp napi_poll context

Check in the mlx5e_ptp_poll_ts_cq context if the ptp tx sq should be woken
up. Before this change, the ptp tx sq may never wake up if the ptp tx ts skb
fifo is full when mlx5e_poll_tx_cq checks if the queue should be woken up.

Fixes: 1880bc4e4a96 ("net/mlx5e: Add TX port timestamp support")
Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c  |  2 ++
 drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h |  2 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_tx.c   | 19 ++++++++++++-------
 3 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
index eb5abd0e55d9..3cbebfba582b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
@@ -175,6 +175,8 @@ static bool mlx5e_ptp_poll_ts_cq(struct mlx5e_cq *cq, int budget)
 	/* ensure cq space is freed before enabling more cqes */
 	wmb();
 
+	mlx5e_txqsq_wake(&ptpsq->txqsq);
+
 	return work_done == budget;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
index 47381e949f1f..879d698b6119 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/txrx.h
@@ -193,6 +193,8 @@ static inline u16 mlx5e_txqsq_get_next_pi(struct mlx5e_txqsq *sq, u16 size)
 	return pi;
 }
 
+void mlx5e_txqsq_wake(struct mlx5e_txqsq *sq);
+
 static inline u16 mlx5e_shampo_get_cqe_header_index(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe)
 {
 	return be16_to_cpu(cqe->shampo.header_entry_index) & (rq->mpwqe.shampo->hd_per_wq - 1);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
index df5e780e8e6a..c7eb6b238c2b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
@@ -762,6 +762,17 @@ static void mlx5e_tx_wi_consume_fifo_skbs(struct mlx5e_txqsq *sq, struct mlx5e_t
 	}
 }
 
+void mlx5e_txqsq_wake(struct mlx5e_txqsq *sq)
+{
+	if (netif_tx_queue_stopped(sq->txq) &&
+	    mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, sq->stop_room) &&
+	    mlx5e_ptpsq_fifo_has_room(sq) &&
+	    !test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) {
+		netif_tx_wake_queue(sq->txq);
+		sq->stats->wake++;
+	}
+}
+
 bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 {
 	struct mlx5e_sq_stats *stats;
@@ -861,13 +872,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget)
 
 	netdev_tx_completed_queue(sq->txq, npkts, nbytes);
 
-	if (netif_tx_queue_stopped(sq->txq) &&
-	    mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, sq->stop_room) &&
-	    mlx5e_ptpsq_fifo_has_room(sq) &&
-	    !test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) {
-		netif_tx_wake_queue(sq->txq);
-		stats->wake++;
-	}
+	mlx5e_txqsq_wake(sq);
 
 	return (i == MLX5E_TX_CQ_POLL_BUDGET);
 }
-- 
cgit v1.2.3


From dfa1e46d6093831b9d49f0f350227a1d13644a2f Mon Sep 17 00:00:00 2001
From: Paul Blakey <paulb@nvidia.com>
Date: Wed, 26 Apr 2023 16:04:48 +0300
Subject: net/mlx5e: TC, Fix using eswitch mapping in nic mode

The cited patch uses the eswitch object mapping pool while
in nic mode, where it isn't initialized. This results in the
trace below [0].

Fix that by using either the nic or the eswitch object mapping pool,
depending on whether eswitch is enabled or not.

[0]:
[  826.446057] ==================================================================
[  826.446729] BUG: KASAN: slab-use-after-free in mlx5_add_flow_rules+0x30/0x490 [mlx5_core]
[  826.447515] Read of size 8 at addr ffff888194485830 by task tc/6233

[  826.448243] CPU: 16 PID: 6233 Comm: tc Tainted: G        W          6.3.0-rc6+ #1
[  826.448890] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[  826.449785] Call Trace:
[  826.450052]  <TASK>
[  826.450302]  dump_stack_lvl+0x33/0x50
[  826.450650]  print_report+0xc2/0x610
[  826.450998]  ? __virt_addr_valid+0xb1/0x130
[  826.451385]  ? mlx5_add_flow_rules+0x30/0x490 [mlx5_core]
[  826.451935]  kasan_report+0xae/0xe0
[  826.452276]  ? mlx5_add_flow_rules+0x30/0x490 [mlx5_core]
[  826.452829]  mlx5_add_flow_rules+0x30/0x490 [mlx5_core]
[  826.453368]  ? __kmalloc_node+0x5a/0x120
[  826.453733]  esw_add_restore_rule+0x20f/0x270 [mlx5_core]
[  826.454288]  ? mlx5_eswitch_add_send_to_vport_meta_rule+0x260/0x260 [mlx5_core]
[  826.455011]  ? mutex_unlock+0x80/0xd0
[  826.455361]  ? __mutex_unlock_slowpath.constprop.0+0x210/0x210
[  826.455862]  ? mapping_add+0x2cb/0x440 [mlx5_core]
[  826.456425]  mlx5e_tc_action_miss_mapping_get+0x139/0x180 [mlx5_core]
[  826.457058]  ? mlx5e_tc_update_skb_nic+0xb0/0xb0 [mlx5_core]
[  826.457636]  ? __kasan_kmalloc+0x77/0x90
[  826.458000]  ? __kmalloc+0x57/0x120
[  826.458336]  mlx5_tc_ct_flow_offload+0x325/0xe40 [mlx5_core]
[  826.458916]  ? ct_kernel_enter.constprop.0+0x48/0xa0
[  826.459360]  ? mlx5_tc_ct_parse_action+0xf0/0xf0 [mlx5_core]
[  826.459933]  ? mlx5e_mod_hdr_attach+0x491/0x520 [mlx5_core]
[  826.460507]  ? mlx5e_mod_hdr_get+0x12/0x20 [mlx5_core]
[  826.461046]  ? mlx5e_tc_attach_mod_hdr+0x154/0x170 [mlx5_core]
[  826.461635]  mlx5e_configure_flower+0x969/0x2110 [mlx5_core]
[  826.462217]  ? _raw_spin_lock_bh+0x85/0xe0
[  826.462597]  ? __mlx5e_add_fdb_flow+0x750/0x750 [mlx5_core]
[  826.463163]  ? kasan_save_stack+0x2e/0x40
[  826.463534]  ? down_read+0x115/0x1b0
[  826.463878]  ? down_write_killable+0x110/0x110
[  826.464288]  ? tc_setup_action.part.0+0x9f/0x3b0
[  826.464701]  ? mlx5e_is_uplink_rep+0x4c/0x90 [mlx5_core]
[  826.465253]  ? mlx5e_tc_reoffload_flows_work+0x130/0x130 [mlx5_core]
[  826.465878]  tc_setup_cb_add+0x112/0x250
[  826.466247]  fl_hw_replace_filter+0x230/0x310 [cls_flower]
[  826.466724]  ? fl_hw_destroy_filter+0x1a0/0x1a0 [cls_flower]
[  826.467212]  fl_change+0x14e1/0x2030 [cls_flower]
[  826.467636]  ? sock_def_readable+0x89/0x120
[  826.468019]  ? fl_tmplt_create+0x2d0/0x2d0 [cls_flower]
[  826.468509]  ? kasan_unpoison+0x23/0x50
[  826.468873]  ? get_random_u16+0x180/0x180
[  826.469244]  ? __radix_tree_lookup+0x2b/0x130
[  826.469640]  ? fl_get+0x7b/0x140 [cls_flower]
[  826.470042]  ? fl_mask_put+0x200/0x200 [cls_flower]
[  826.470478]  ? __mutex_unlock_slowpath.constprop.0+0x210/0x210
[  826.470973]  ? fl_tmplt_create+0x2d0/0x2d0 [cls_flower]
[  826.471427]  tc_new_tfilter+0x644/0x1050
[  826.471795]  ? tc_get_tfilter+0x860/0x860
[  826.472170]  ? __thaw_task+0x130/0x130
[  826.472525]  ? arch_stack_walk+0x98/0xf0
[  826.472892]  ? cap_capable+0x9f/0xd0
[  826.473235]  ? security_capable+0x47/0x60
[  826.473608]  rtnetlink_rcv_msg+0x1d5/0x550
[  826.473985]  ? rtnl_calcit.isra.0+0x1f0/0x1f0
[  826.474383]  ? __stack_depot_save+0x35/0x4c0
[  826.474779]  ? kasan_save_stack+0x2e/0x40
[  826.475149]  ? kasan_save_stack+0x1e/0x40
[  826.475518]  ? __kasan_record_aux_stack+0x9f/0xb0
[  826.475939]  ? task_work_add+0x77/0x1c0
[  826.476305]  netlink_rcv_skb+0xe0/0x210
[  826.476661]  ? rtnl_calcit.isra.0+0x1f0/0x1f0
[  826.477057]  ? netlink_ack+0x7c0/0x7c0
[  826.477412]  ? rhashtable_jhash2+0xef/0x150
[  826.477796]  ? _copy_from_iter+0x105/0x770
[  826.484386]  netlink_unicast+0x346/0x490
[  826.484755]  ? netlink_attachskb+0x400/0x400
[  826.485145]  ? kernel_text_address+0xc2/0xd0
[  826.485535]  netlink_sendmsg+0x3b0/0x6c0
[  826.485902]  ? kernel_text_address+0xc2/0xd0
[  826.486296]  ? netlink_unicast+0x490/0x490
[  826.486671]  ? iovec_from_user.part.0+0x7a/0x1a0
[  826.487083]  ? netlink_unicast+0x490/0x490
[  826.487461]  sock_sendmsg+0x73/0xc0
[  826.487803]  ____sys_sendmsg+0x364/0x380
[  826.488186]  ? import_iovec+0x7/0x10
[  826.488531]  ? kernel_sendmsg+0x30/0x30
[  826.488893]  ? __copy_msghdr+0x180/0x180
[  826.489258]  ? kasan_save_stack+0x2e/0x40
[  826.489629]  ? kasan_save_stack+0x1e/0x40
[  826.490002]  ? __kasan_record_aux_stack+0x9f/0xb0
[  826.490424]  ? __call_rcu_common.constprop.0+0x46/0x580
[  826.490876]  ___sys_sendmsg+0xdf/0x140
[  826.491231]  ? copy_msghdr_from_user+0x110/0x110
[  826.491649]  ? fget_raw+0x120/0x120
[  826.491988]  ? ___sys_recvmsg+0xd9/0x130
[  826.492355]  ? folio_batch_add_and_move+0x80/0xa0
[  826.492776]  ? _raw_spin_lock+0x7a/0xd0
[  826.493137]  ? _raw_spin_lock+0x7a/0xd0
[  826.493500]  ? _raw_read_lock_irq+0x30/0x30
[  826.493880]  ? kasan_set_track+0x21/0x30
[  826.494249]  ? kasan_save_free_info+0x2a/0x40
[  826.494650]  ? do_sys_openat2+0xff/0x270
[  826.495016]  ? __fget_light+0x1b5/0x200
[  826.495377]  ? __virt_addr_valid+0xb1/0x130
[  826.495763]  __sys_sendmsg+0xb2/0x130
[  826.496118]  ? __sys_sendmsg_sock+0x20/0x20
[  826.496501]  ? __x64_sys_rseq+0x2e0/0x2e0
[  826.496874]  ? do_user_addr_fault+0x276/0x820
[  826.497273]  ? fpregs_assert_state_consistent+0x52/0x60
[  826.497727]  ? exit_to_user_mode_prepare+0x30/0x120
[  826.498158]  do_syscall_64+0x3d/0x90
[  826.498502]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
[  826.498949] RIP: 0033:0x7f9b67f4f887
[  826.499294] Code: 0a 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10
[  826.500742] RSP: 002b:00007fff5d1a5498 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
[  826.501395] RAX: ffffffffffffffda RBX: 0000000064413ce6 RCX: 00007f9b67f4f887
[  826.501975] RDX: 0000000000000000 RSI: 00007fff5d1a5500 RDI: 0000000000000003
[  826.502556] RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001
[  826.503135] R10: 00007f9b67e08708 R11: 0000000000000246 R12: 0000000000000001
[  826.503714] R13: 0000000000000001 R14: 00007fff5d1a9800 R15: 0000000000485400
[  826.504304]  </TASK>

[  826.504753] Allocated by task 3764:
[  826.505090]  kasan_save_stack+0x1e/0x40
[  826.505453]  kasan_set_track+0x21/0x30
[  826.505810]  __kasan_kmalloc+0x77/0x90
[  826.506164]  __mlx5_create_flow_table+0x16d/0xbb0 [mlx5_core]
[  826.506742]  esw_offloads_enable+0x60d/0xfb0 [mlx5_core]
[  826.507292]  mlx5_eswitch_enable_locked+0x4d3/0x680 [mlx5_core]
[  826.507885]  mlx5_devlink_eswitch_mode_set+0x2a3/0x580 [mlx5_core]
[  826.508513]  devlink_nl_cmd_eswitch_set_doit+0xdf/0x1f0
[  826.508969]  genl_family_rcv_msg_doit.isra.0+0x146/0x1c0
[  826.509427]  genl_rcv_msg+0x28d/0x3e0
[  826.509772]  netlink_rcv_skb+0xe0/0x210
[  826.510133]  genl_rcv+0x24/0x40
[  826.510448]  netlink_unicast+0x346/0x490
[  826.510810]  netlink_sendmsg+0x3b0/0x6c0
[  826.511179]  sock_sendmsg+0x73/0xc0
[  826.511519]  __sys_sendto+0x18d/0x220
[  826.511867]  __x64_sys_sendto+0x72/0x80
[  826.512232]  do_syscall_64+0x3d/0x90
[  826.512576]  entry_SYSCALL_64_after_hwframe+0x46/0xb0

[  826.513220] Freed by task 5674:
[  826.513535]  kasan_save_stack+0x1e/0x40
[  826.513893]  kasan_set_track+0x21/0x30
[  826.514245]  kasan_save_free_info+0x2a/0x40
[  826.514629]  ____kasan_slab_free+0x11a/0x1b0
[  826.515021]  __kmem_cache_free+0x14d/0x280
[  826.515399]  tree_put_node+0x109/0x1c0 [mlx5_core]
[  826.515907]  mlx5_destroy_flow_table+0x119/0x630 [mlx5_core]
[  826.516481]  esw_offloads_steering_cleanup+0xe7/0x150 [mlx5_core]
[  826.517084]  esw_offloads_disable+0xe0/0x160 [mlx5_core]
[  826.517632]  mlx5_eswitch_disable_locked+0x26c/0x290 [mlx5_core]
[  826.518225]  mlx5_devlink_eswitch_mode_set+0x128/0x580 [mlx5_core]
[  826.518834]  devlink_nl_cmd_eswitch_set_doit+0xdf/0x1f0
[  826.519286]  genl_family_rcv_msg_doit.isra.0+0x146/0x1c0
[  826.519748]  genl_rcv_msg+0x28d/0x3e0
[  826.520101]  netlink_rcv_skb+0xe0/0x210
[  826.520458]  genl_rcv+0x24/0x40
[  826.520771]  netlink_unicast+0x346/0x490
[  826.521137]  netlink_sendmsg+0x3b0/0x6c0
[  826.521505]  sock_sendmsg+0x73/0xc0
[  826.521842]  __sys_sendto+0x18d/0x220
[  826.522191]  __x64_sys_sendto+0x72/0x80
[  826.522554]  do_syscall_64+0x3d/0x90
[  826.522894]  entry_SYSCALL_64_after_hwframe+0x46/0xb0

[  826.523540] Last potentially related work creation:
[  826.523969]  kasan_save_stack+0x1e/0x40
[  826.524331]  __kasan_record_aux_stack+0x9f/0xb0
[  826.524739]  insert_work+0x30/0x130
[  826.525078]  __queue_work+0x34b/0x690
[  826.525426]  queue_work_on+0x48/0x50
[  826.525766]  __rhashtable_remove_fast_one+0x4af/0x4d0 [mlx5_core]
[  826.526365]  del_sw_flow_group+0x1b5/0x270 [mlx5_core]
[  826.526898]  tree_put_node+0x109/0x1c0 [mlx5_core]
[  826.527407]  esw_offloads_steering_cleanup+0xd3/0x150 [mlx5_core]
[  826.528009]  esw_offloads_disable+0xe0/0x160 [mlx5_core]
[  826.528616]  mlx5_eswitch_disable_locked+0x26c/0x290 [mlx5_core]
[  826.529218]  mlx5_devlink_eswitch_mode_set+0x128/0x580 [mlx5_core]
[  826.529823]  devlink_nl_cmd_eswitch_set_doit+0xdf/0x1f0
[  826.530276]  genl_family_rcv_msg_doit.isra.0+0x146/0x1c0
[  826.530733]  genl_rcv_msg+0x28d/0x3e0
[  826.531079]  netlink_rcv_skb+0xe0/0x210
[  826.531439]  genl_rcv+0x24/0x40
[  826.531755]  netlink_unicast+0x346/0x490
[  826.532123]  netlink_sendmsg+0x3b0/0x6c0
[  826.532487]  sock_sendmsg+0x73/0xc0
[  826.532825]  __sys_sendto+0x18d/0x220
[  826.533175]  __x64_sys_sendto+0x72/0x80
[  826.533533]  do_syscall_64+0x3d/0x90
[  826.533877]  entry_SYSCALL_64_after_hwframe+0x46/0xb0

[  826.534521] The buggy address belongs to the object at ffff888194485800
                which belongs to the cache kmalloc-512 of size 512
[  826.535506] The buggy address is located 48 bytes inside of
                freed 512-byte region [ffff888194485800, ffff888194485a00)

[  826.536666] The buggy address belongs to the physical page:
[  826.537138] page:00000000d75841dd refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x194480
[  826.537915] head:00000000d75841dd order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0
[  826.538595] flags: 0x200000000010200(slab|head|node=0|zone=2)
[  826.539089] raw: 0200000000010200 ffff888100042c80 ffffea0004523800 dead000000000002
[  826.539755] raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000
[  826.540417] page dumped because: kasan: bad access detected

[  826.541095] Memory state around the buggy address:
[  826.541519]  ffff888194485700: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  826.542149]  ffff888194485780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  826.542773] >ffff888194485800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  826.543400]                                      ^
[  826.543822]  ffff888194485880: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  826.544452]  ffff888194485900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  826.545079] ==================================================================

Fixes: 6702782845a5 ("net/mlx5e: TC, Set CT miss to the specific ct action instance")
Signed-off-by: Paul Blakey <paulb@nvidia.com>
Reviewed-by: Vlad Buslov <vladbu@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 34 ++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 7 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
index 416ab6b6da97..e95414ef1f04 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
@@ -5646,22 +5646,43 @@ bool mlx5e_tc_update_skb_nic(struct mlx5_cqe64 *cqe, struct sk_buff *skb)
 				   0, NULL);
 }
 
+static struct mapping_ctx *
+mlx5e_get_priv_obj_mapping(struct mlx5e_priv *priv)
+{
+	struct mlx5e_tc_table *tc;
+	struct mlx5_eswitch *esw;
+	struct mapping_ctx *ctx;
+
+	if (is_mdev_switchdev_mode(priv->mdev)) {
+		esw = priv->mdev->priv.eswitch;
+		ctx = esw->offloads.reg_c0_obj_pool;
+	} else {
+		tc = mlx5e_fs_get_tc(priv->fs);
+		ctx = tc->mapping;
+	}
+
+	return ctx;
+}
+
 int mlx5e_tc_action_miss_mapping_get(struct mlx5e_priv *priv, struct mlx5_flow_attr *attr,
 				     u64 act_miss_cookie, u32 *act_miss_mapping)
 {
-	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
 	struct mlx5_mapped_obj mapped_obj = {};
+	struct mlx5_eswitch *esw;
 	struct mapping_ctx *ctx;
 	int err;
 
-	ctx = esw->offloads.reg_c0_obj_pool;
-
+	ctx = mlx5e_get_priv_obj_mapping(priv);
 	mapped_obj.type = MLX5_MAPPED_OBJ_ACT_MISS;
 	mapped_obj.act_miss_cookie = act_miss_cookie;
 	err = mapping_add(ctx, &mapped_obj, act_miss_mapping);
 	if (err)
 		return err;
 
+	if (!is_mdev_switchdev_mode(priv->mdev))
+		return 0;
+
+	esw = priv->mdev->priv.eswitch;
 	attr->act_id_restore_rule = esw_add_restore_rule(esw, *act_miss_mapping);
 	if (IS_ERR(attr->act_id_restore_rule))
 		goto err_rule;
@@ -5676,10 +5697,9 @@ err_rule:
 void mlx5e_tc_action_miss_mapping_put(struct mlx5e_priv *priv, struct mlx5_flow_attr *attr,
 				      u32 act_miss_mapping)
 {
-	struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
-	struct mapping_ctx *ctx;
+	struct mapping_ctx *ctx = mlx5e_get_priv_obj_mapping(priv);
 
-	ctx = esw->offloads.reg_c0_obj_pool;
-	mlx5_del_flow_rules(attr->act_id_restore_rule);
+	if (is_mdev_switchdev_mode(priv->mdev))
+		mlx5_del_flow_rules(attr->act_id_restore_rule);
 	mapping_remove(ctx, act_miss_mapping);
 }
-- 
cgit v1.2.3


From 8c253dfc89efde6b5faddf9e7400e5d17884e042 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Mon, 6 Feb 2023 11:52:02 +0200
Subject: net/mlx5: E-switch, Devcom, sync devcom events and devcom comp
 register

devcom events are sent to all registered components. Following the
cited patch, it is possible for two components, e.g. two eswitches,
to send devcom events while both components are registered. This
means the eswitch layer will do double un/pairing, i.e. double
allocation and freeing of resources, even though only one un/pairing
is needed. Flow example:

	cpu0					cpu1
	----					----

 mlx5_devlink_eswitch_mode_set(dev0)
  esw_offloads_devcom_init()
   mlx5_devcom_register_component(esw0)
                                         mlx5_devlink_eswitch_mode_set(dev1)
                                          esw_offloads_devcom_init()
                                           mlx5_devcom_register_component(esw1)
                                           mlx5_devcom_send_event()
   mlx5_devcom_send_event()

Hence, check whether the eswitches are already un/paired before
free/allocation of resources.

Fixes: 09b278462f16 ("net: devlink: enable parallel ops on netlink interface")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eswitch.h          | 1 +
 drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index 9f007c5438ee..add6cfa432a5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -342,6 +342,7 @@ struct mlx5_eswitch {
 		u32             large_group_num;
 	}  params;
 	struct blocking_notifier_head n_head;
+	bool paired[MLX5_MAX_PORTS];
 };
 
 void esw_offloads_disable(struct mlx5_eswitch *esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 7c34c7cf506f..8d19c20d3447 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2742,6 +2742,9 @@ static int mlx5_esw_offloads_devcom_event(int event,
 		    mlx5_eswitch_vport_match_metadata_enabled(peer_esw))
 			break;
 
+		if (esw->paired[mlx5_get_dev_index(peer_esw->dev)])
+			break;
+
 		err = mlx5_esw_offloads_set_ns_peer(esw, peer_esw, true);
 		if (err)
 			goto err_out;
@@ -2753,14 +2756,18 @@ static int mlx5_esw_offloads_devcom_event(int event,
 		if (err)
 			goto err_pair;
 
+		esw->paired[mlx5_get_dev_index(peer_esw->dev)] = true;
+		peer_esw->paired[mlx5_get_dev_index(esw->dev)] = true;
 		mlx5_devcom_set_paired(devcom, MLX5_DEVCOM_ESW_OFFLOADS, true);
 		break;
 
 	case ESW_OFFLOADS_DEVCOM_UNPAIR:
-		if (!mlx5_devcom_is_paired(devcom, MLX5_DEVCOM_ESW_OFFLOADS))
+		if (!esw->paired[mlx5_get_dev_index(peer_esw->dev)])
 			break;
 
 		mlx5_devcom_set_paired(devcom, MLX5_DEVCOM_ESW_OFFLOADS, false);
+		esw->paired[mlx5_get_dev_index(peer_esw->dev)] = false;
+		peer_esw->paired[mlx5_get_dev_index(esw->dev)] = false;
 		mlx5_esw_offloads_unpair(peer_esw);
 		mlx5_esw_offloads_unpair(esw);
 		mlx5_esw_offloads_set_ns_peer(esw, peer_esw, false);
-- 
cgit v1.2.3


From af87194352cad882d787d06fb7efa714acd95427 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Tue, 2 May 2023 13:35:11 +0300
Subject: net/mlx5: Devcom, fix error flow in mlx5_devcom_register_device

In case devcom allocation fails, mlx5 always frees the priv.
However, this priv might have been allocated by a different thread,
and freeing it might lead to use-after-free bugs.
Fix it by freeing the priv only in case it was allocated by the
running thread.

Fixes: fadd59fc50d0 ("net/mlx5: Introduce inter-device communication mechanism")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
index 070d55f13419..8f978491dd32 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
@@ -112,7 +112,8 @@ struct mlx5_devcom *mlx5_devcom_register_device(struct mlx5_core_dev *dev)
 	priv->devs[idx] = dev;
 	devcom = mlx5_devcom_alloc(priv, idx);
 	if (!devcom) {
-		kfree(priv);
+		if (new_priv)
+			kfree(priv);
 		return ERR_PTR(-ENOMEM);
 	}
 
-- 
cgit v1.2.3


From 1f893f57a3bf9fe1f4bcb25b55aea7f7f9712fe7 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Tue, 2 May 2023 13:36:42 +0300
Subject: net/mlx5: Devcom, serialize devcom registration

On the one hand, the mlx5 driver allows probing PFs in parallel.
On the other hand, devcom, which is a shared resource between PFs, is
registered without any lock. This might result in memory problems.

Hence, use the global mlx5_dev_list_lock in order to serialize devcom
registration.

Fixes: fadd59fc50d0 ("net/mlx5: Introduce inter-device communication mechanism")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
index 8f978491dd32..b7d779d08d83 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/devcom.c
@@ -3,6 +3,7 @@
 
 #include <linux/mlx5/vport.h>
 #include "lib/devcom.h"
+#include "mlx5_core.h"
 
 static LIST_HEAD(devcom_list);
 
@@ -77,6 +78,7 @@ struct mlx5_devcom *mlx5_devcom_register_device(struct mlx5_core_dev *dev)
 	if (MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_DEVCOM_PORTS_SUPPORTED)
 		return NULL;
 
+	mlx5_dev_list_lock();
 	sguid0 = mlx5_query_nic_system_image_guid(dev);
 	list_for_each_entry(iter, &devcom_list, list) {
 		struct mlx5_core_dev *tmp_dev = NULL;
@@ -102,8 +104,10 @@ struct mlx5_devcom *mlx5_devcom_register_device(struct mlx5_core_dev *dev)
 
 	if (!priv) {
 		priv = mlx5_devcom_list_alloc();
-		if (!priv)
-			return ERR_PTR(-ENOMEM);
+		if (!priv) {
+			devcom = ERR_PTR(-ENOMEM);
+			goto out;
+		}
 
 		idx = 0;
 		new_priv = true;
@@ -114,12 +118,14 @@ struct mlx5_devcom *mlx5_devcom_register_device(struct mlx5_core_dev *dev)
 	if (!devcom) {
 		if (new_priv)
 			kfree(priv);
-		return ERR_PTR(-ENOMEM);
+		devcom = ERR_PTR(-ENOMEM);
+		goto out;
 	}
 
 	if (new_priv)
 		list_add(&priv->list, &devcom_list);
-
+out:
+	mlx5_dev_list_unlock();
 	return devcom;
 }
 
@@ -132,6 +138,7 @@ void mlx5_devcom_unregister_device(struct mlx5_devcom *devcom)
 	if (IS_ERR_OR_NULL(devcom))
 		return;
 
+	mlx5_dev_list_lock();
 	priv = devcom->priv;
 	priv->devs[devcom->idx] = NULL;
 
@@ -142,10 +149,12 @@ void mlx5_devcom_unregister_device(struct mlx5_devcom *devcom)
 			break;
 
 	if (i != MLX5_DEVCOM_PORTS_SUPPORTED)
-		return;
+		goto out;
 
 	list_del(&priv->list);
 	kfree(priv);
+out:
+	mlx5_dev_list_unlock();
 }
 
 void mlx5_devcom_register_component(struct mlx5_devcom *devcom,
-- 
cgit v1.2.3


From 9c2d08010963a61a171e8cb2852d3ce015b60cb4 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Thu, 13 Apr 2023 22:15:31 +0300
Subject: net/mlx5: Free irqs only on shutdown callback

Whenever a shutdown is invoked, free only the irqs and keep the mlx5_irq
synthetic wrapper intact in order to avoid use-after-free on
system shutdown.

For example:
==================================================================
BUG: KASAN: use-after-free in _find_first_bit+0x66/0x80
Read of size 8 at addr ffff88823fc0d318 by task kworker/u192:0/13608

CPU: 25 PID: 13608 Comm: kworker/u192:0 Tainted: G    B   W  O  6.1.21-cloudflare-kasan-2023.3.21 #1
Hardware name: GIGABYTE R162-R2-GEN0/MZ12-HD2-CD, BIOS R14 05/03/2021
Workqueue: mlx5e mlx5e_tx_timeout_work [mlx5_core]
Call Trace:
  <TASK>
  dump_stack_lvl+0x34/0x48
  print_report+0x170/0x473
  ? _find_first_bit+0x66/0x80
  kasan_report+0xad/0x130
  ? _find_first_bit+0x66/0x80
  _find_first_bit+0x66/0x80
  mlx5e_open_channels+0x3c5/0x3a10 [mlx5_core]
  ? console_unlock+0x2fa/0x430
  ? _raw_spin_lock_irqsave+0x8d/0xf0
  ? _raw_spin_unlock_irqrestore+0x42/0x80
  ? preempt_count_add+0x7d/0x150
  ? __wake_up_klogd.part.0+0x7d/0xc0
  ? vprintk_emit+0xfe/0x2c0
  ? mlx5e_trigger_napi_sched+0x40/0x40 [mlx5_core]
  ? dev_attr_show.cold+0x35/0x35
  ? devlink_health_do_dump.part.0+0x174/0x340
  ? devlink_health_report+0x504/0x810
  ? mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core]
  ? mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core]
  ? process_one_work+0x680/0x1050
  mlx5e_safe_switch_params+0x156/0x220 [mlx5_core]
  ? mlx5e_switch_priv_channels+0x310/0x310 [mlx5_core]
  ? mlx5_eq_poll_irq_disabled+0xb6/0x100 [mlx5_core]
  mlx5e_tx_reporter_timeout_recover+0x123/0x240 [mlx5_core]
  ? __mutex_unlock_slowpath.constprop.0+0x2b0/0x2b0
  devlink_health_reporter_recover+0xa6/0x1f0
  devlink_health_report+0x2f7/0x810
  ? vsnprintf+0x854/0x15e0
  mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core]
  ? mlx5e_reporter_tx_err_cqe+0x1a0/0x1a0 [mlx5_core]
  ? mlx5e_tx_reporter_timeout_dump+0x50/0x50 [mlx5_core]
  ? mlx5e_tx_reporter_dump_sq+0x260/0x260 [mlx5_core]
  ? newidle_balance+0x9b7/0xe30
  ? psi_group_change+0x6a7/0xb80
  ? mutex_lock+0x96/0xf0
  ? __mutex_lock_slowpath+0x10/0x10
  mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core]
  process_one_work+0x680/0x1050
  worker_thread+0x5a0/0xeb0
  ? process_one_work+0x1050/0x1050
  kthread+0x2a2/0x340
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x22/0x30
  </TASK>

Freed by task 1:
  kasan_save_stack+0x23/0x50
  kasan_set_track+0x21/0x30
  kasan_save_free_info+0x2a/0x40
  ____kasan_slab_free+0x169/0x1d0
  slab_free_freelist_hook+0xd2/0x190
  __kmem_cache_free+0x1a1/0x2f0
  irq_pool_free+0x138/0x200 [mlx5_core]
  mlx5_irq_table_destroy+0xf6/0x170 [mlx5_core]
  mlx5_core_eq_free_irqs+0x74/0xf0 [mlx5_core]
  shutdown+0x194/0x1aa [mlx5_core]
  pci_device_shutdown+0x75/0x120
  device_shutdown+0x35c/0x620
  kernel_restart+0x60/0xa0
  __do_sys_reboot+0x1cb/0x2c0
  do_syscall_64+0x3b/0x90
  entry_SYSCALL_64_after_hwframe+0x4b/0xb5

The buggy address belongs to the object at ffff88823fc0d300
  which belongs to the cache kmalloc-192 of size 192
The buggy address is located 24 bytes inside of
  192-byte region [ffff88823fc0d300, ffff88823fc0d3c0)

The buggy address belongs to the physical page:
page:0000000010139587 refcount:1 mapcount:0 mapping:0000000000000000
index:0x0 pfn:0x23fc0c
head:0000000010139587 order:1 compound_mapcount:0 compound_pincount:0
flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff)
raw: 002ffff800010200 0000000000000000 dead000000000122 ffff88810004ca00
raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
  ffff88823fc0d200: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
  ffff88823fc0d280: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
 >ffff88823fc0d300: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
                             ^
  ffff88823fc0d380: fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc fc
  ffff88823fc0d400: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================
general protection fault, probably for non-canonical address
0xdffffc005c40d7ac: 0000 [#1] PREEMPT SMP KASAN NOPTI
KASAN: probably user-memory-access in range [0x00000002e206bd60-0x00000002e206bd67]
CPU: 25 PID: 13608 Comm: kworker/u192:0 Tainted: G    B   W  O  6.1.21-cloudflare-kasan-2023.3.21 #1
Hardware name: GIGABYTE R162-R2-GEN0/MZ12-HD2-CD, BIOS R14 05/03/2021
Workqueue: mlx5e mlx5e_tx_timeout_work [mlx5_core]
RIP: 0010:__alloc_pages+0x141/0x5c0
Call Trace:
  <TASK>
  ? sysvec_apic_timer_interrupt+0xa0/0xc0
  ? asm_sysvec_apic_timer_interrupt+0x16/0x20
  ? __alloc_pages_slowpath.constprop.0+0x1ec0/0x1ec0
  ? _raw_spin_unlock_irqrestore+0x3d/0x80
  __kmalloc_large_node+0x80/0x120
  ? kvmalloc_node+0x4e/0x170
  __kmalloc_node+0xd4/0x150
  kvmalloc_node+0x4e/0x170
  mlx5e_open_channels+0x631/0x3a10 [mlx5_core]
  ? console_unlock+0x2fa/0x430
  ? _raw_spin_lock_irqsave+0x8d/0xf0
  ? _raw_spin_unlock_irqrestore+0x42/0x80
  ? preempt_count_add+0x7d/0x150
  ? __wake_up_klogd.part.0+0x7d/0xc0
  ? vprintk_emit+0xfe/0x2c0
  ? mlx5e_trigger_napi_sched+0x40/0x40 [mlx5_core]
  ? dev_attr_show.cold+0x35/0x35
  ? devlink_health_do_dump.part.0+0x174/0x340
  ? devlink_health_report+0x504/0x810
  ? mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core]
  ? mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core]
  ? process_one_work+0x680/0x1050
  mlx5e_safe_switch_params+0x156/0x220 [mlx5_core]
  ? mlx5e_switch_priv_channels+0x310/0x310 [mlx5_core]
  ? mlx5_eq_poll_irq_disabled+0xb6/0x100 [mlx5_core]
  mlx5e_tx_reporter_timeout_recover+0x123/0x240 [mlx5_core]
  ? __mutex_unlock_slowpath.constprop.0+0x2b0/0x2b0
  devlink_health_reporter_recover+0xa6/0x1f0
  devlink_health_report+0x2f7/0x810
  ? vsnprintf+0x854/0x15e0
  mlx5e_reporter_tx_timeout+0x29d/0x3a0 [mlx5_core]
  ? mlx5e_reporter_tx_err_cqe+0x1a0/0x1a0 [mlx5_core]
  ? mlx5e_tx_reporter_timeout_dump+0x50/0x50 [mlx5_core]
  ? mlx5e_tx_reporter_dump_sq+0x260/0x260 [mlx5_core]
  ? newidle_balance+0x9b7/0xe30
  ? psi_group_change+0x6a7/0xb80
  ? mutex_lock+0x96/0xf0
  ? __mutex_lock_slowpath+0x10/0x10
  mlx5e_tx_timeout_work+0x17c/0x230 [mlx5_core]
  process_one_work+0x680/0x1050
  worker_thread+0x5a0/0xeb0
  ? process_one_work+0x1050/0x1050
  kthread+0x2a2/0x340
  ? kthread_complete_and_exit+0x20/0x20
  ret_from_fork+0x22/0x30
  </TASK>
---[ end trace 0000000000000000  ]---
RIP: 0010:__alloc_pages+0x141/0x5c0
Code: e0 39 a3 96 89 e9 b8 22 01 32 01 83 e1 0f 48 89 fa 01 c9 48 c1 ea
03 d3 f8 83 e0 03 89 44 24 6c 48 b8 00 00 00 00 00 fc ff df <80> 3c 02
00 0f 85 fc 03 00 00 89 e8 4a 8b 14 f5 e0 39 a3 96 4c 89
RSP: 0018:ffff888251f0f438 EFLAGS: 00010202
RAX: dffffc0000000000 RBX: 1ffff1104a3e1e8b RCX: 0000000000000000
RDX: 000000005c40d7ac RSI: 0000000000000003 RDI: 00000002e206bd60
RBP: 0000000000052dc0 R08: ffff8882b0044218 R09: ffff8882b0045e8a
R10: fffffbfff300fefc R11: ffff888167af4000 R12: 0000000000000003
R13: 0000000000000000 R14: 00000000696c7070 R15: ffff8882373f4380
FS:  0000000000000000(0000) GS:ffff88bf2be80000(0000)
knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005641d031eee8 CR3: 0000002e7ca14000 CR4: 0000000000350ee0
Kernel panic - not syncing: Fatal exception
Kernel Offset: 0x11000000 from 0xffffffff81000000 (relocation range:
0xffffffff80000000-0xffffffffbfffffff)
---[ end Kernel panic - not syncing: Fatal exception  ]---]

Reported-by: Frederick Lawler <fred@cloudflare.com>
Link: https://lore.kernel.org/netdev/be5b9271-7507-19c5-ded1-fa78f1980e69@cloudflare.com
Signed-off-by: Shay Drory <shayd@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/eq.c       |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h |  1 +
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c  | 29 ++++++++++++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 1c35d721a31d..fe698c79616c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -1104,7 +1104,7 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev)
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 
 	mutex_lock(&table->lock); /* sync with create/destroy_async_eq */
-	mlx5_irq_table_destroy(dev);
+	mlx5_irq_table_free_irqs(dev);
 	mutex_unlock(&table->lock);
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
index efd0c299c5c7..aa403a5ea34e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
@@ -15,6 +15,7 @@ int mlx5_irq_table_init(struct mlx5_core_dev *dev);
 void mlx5_irq_table_cleanup(struct mlx5_core_dev *dev);
 int mlx5_irq_table_create(struct mlx5_core_dev *dev);
 void mlx5_irq_table_destroy(struct mlx5_core_dev *dev);
+void mlx5_irq_table_free_irqs(struct mlx5_core_dev *dev);
 int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table);
 int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table);
 struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 2245d3b2f393..ac1304c2d205 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -691,6 +691,24 @@ static void irq_pools_destroy(struct mlx5_irq_table *table)
 	irq_pool_free(table->pcif_pool);
 }
 
+static void mlx5_irq_pool_free_irqs(struct mlx5_irq_pool *pool)
+{
+	struct mlx5_irq *irq;
+	unsigned long index;
+
+	xa_for_each(&pool->irqs, index, irq)
+		free_irq(irq->map.virq, &irq->nh);
+}
+
+static void mlx5_irq_pools_free_irqs(struct mlx5_irq_table *table)
+{
+	if (table->sf_ctrl_pool) {
+		mlx5_irq_pool_free_irqs(table->sf_comp_pool);
+		mlx5_irq_pool_free_irqs(table->sf_ctrl_pool);
+	}
+	mlx5_irq_pool_free_irqs(table->pcif_pool);
+}
+
 /* irq_table API */
 
 int mlx5_irq_table_init(struct mlx5_core_dev *dev)
@@ -774,6 +792,17 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
 	pci_free_irq_vectors(dev->pdev);
 }
 
+void mlx5_irq_table_free_irqs(struct mlx5_core_dev *dev)
+{
+	struct mlx5_irq_table *table = dev->priv.irq_table;
+
+	if (mlx5_core_is_sf(dev))
+		return;
+
+	mlx5_irq_pools_free_irqs(table);
+	pci_free_irq_vectors(dev->pdev);
+}
+
 int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table)
 {
 	if (table->sf_comp_pool)
-- 
cgit v1.2.3


From ef8c063cf88e1a3d99ab4ada1cbab5ba7248a4f2 Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Sun, 16 Apr 2023 08:54:04 +0300
Subject: net/mlx5: Fix irq affinity management

The cited patch denies the user the ability to change the affinity of
mlx5 irqs, which breaks backward compatibility.
Hence, allow the user to change the affinity of mlx5 irqs.

Fixes: bbac70c74183 ("net/mlx5: Use newer affinity descriptor")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index ac1304c2d205..86b528aae6d4 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -567,7 +567,7 @@ int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
 	struct mlx5_irq *irq;
 	int i;
 
-	af_desc.is_managed = 1;
+	af_desc.is_managed = false;
 	for (i = 0; i < nirqs; i++) {
 		cpumask_set_cpu(cpus[i], &af_desc.mask);
 		irq = mlx5_irq_request(dev, i + 1, &af_desc, rmap);
-- 
cgit v1.2.3


From 1da438c0ae02396dc5018b63237492cb5908608d Mon Sep 17 00:00:00 2001
From: Shay Drory <shayd@nvidia.com>
Date: Mon, 17 Apr 2023 10:57:50 +0300
Subject: net/mlx5: Fix indexing of mlx5_irq

After the cited patch, the mlx5_irq xarray index can be different from
the mlx5_irq MSIX table index.
Fix it by storing both mlx5_irq xarray index and MSIX table index.

Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation")
Signed-off-by: Shay Drory <shayd@nvidia.com>
Reviewed-by: Eli Cohen <elic@nvidia.com>
Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 86b528aae6d4..db5687d9fec9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -32,6 +32,7 @@ struct mlx5_irq {
 	struct mlx5_irq_pool *pool;
 	int refcount;
 	struct msi_map map;
+	u32 pool_index;
 };
 
 struct mlx5_irq_table {
@@ -132,7 +133,7 @@ static void irq_release(struct mlx5_irq *irq)
 	struct cpu_rmap *rmap;
 #endif
 
-	xa_erase(&pool->irqs, irq->map.index);
+	xa_erase(&pool->irqs, irq->pool_index);
 	/* free_irq requires that affinity_hint and rmap will be cleared before
 	 * calling it. To satisfy this requirement, we call
 	 * irq_cpu_rmap_remove() to remove the notifier
@@ -276,11 +277,11 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
 	}
 	irq->pool = pool;
 	irq->refcount = 1;
-	irq->map.index = i;
-	err = xa_err(xa_store(&pool->irqs, irq->map.index, irq, GFP_KERNEL));
+	irq->pool_index = i;
+	err = xa_err(xa_store(&pool->irqs, irq->pool_index, irq, GFP_KERNEL));
 	if (err) {
 		mlx5_core_err(dev, "Failed to alloc xa entry for irq(%u). err = %d\n",
-			      irq->map.index, err);
+			      irq->pool_index, err);
 		goto err_xa;
 	}
 	return irq;
-- 
cgit v1.2.3


From 600761245952d7f70280add6ce02894f1528992b Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Mon, 22 May 2023 14:00:38 +0200
Subject: lan966x: Fix unloading/loading of the driver

It was noticed that after a while, when unloading/loading the driver and
sending traffic through the switch, it would stop working. It would stop
forwarding any traffic and the only way to get out of this was to do a
power cycle of the board. The root cause seems to be that the switch
core is initialized twice. Apparently initializing the switch core twice
disturbs the pointers in the queue systems in the HW, so after a while
it would stop sending the traffic.
Unfortunately, it is not possible to use a reset of the switch here,
because the reset line is connected to multiple devices like MDIO,
SGPIO, FAN, etc. So then all the devices would get reset when the
network driver is loaded.
So the fix is to check if the core is initialized already and if that is
the case don't initialize it again.

Fixes: db8bcaad5393 ("net: lan966x: add the basic lan966x driver")
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Link: https://lore.kernel.org/r/20230522120038.3749026-1-horatiu.vultur@microchip.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 drivers/net/ethernet/microchip/lan966x/lan966x_main.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
index 2b6e046e1d10..ee2698698d71 100644
--- a/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
+++ b/drivers/net/ethernet/microchip/lan966x/lan966x_main.c
@@ -1039,6 +1039,16 @@ static int lan966x_reset_switch(struct lan966x *lan966x)
 
 	reset_control_reset(switch_reset);
 
+	/* Don't reinitialize the switch core, if it is already initialized. In
+	 * case it is initialized twice, some pointers inside the queue system
+	 * in HW will get corrupted and then after a while the queue system gets
+	 * full and no traffic is passing through the switch. The issue is seen
+	 * when loading and unloading the driver and sending traffic through the
+	 * switch.
+	 */
+	if (lan_rd(lan966x, SYS_RESET_CFG) & SYS_RESET_CFG_CORE_ENA)
+		return 0;
+
 	lan_wr(SYS_RESET_CFG_CORE_ENA_SET(0), lan966x, SYS_RESET_CFG);
 	lan_wr(SYS_RAM_INIT_RAM_INIT_SET(1), lan966x, SYS_RAM_INIT);
 	ret = readx_poll_timeout(lan966x_ram_init, lan966x,
-- 
cgit v1.2.3


From d6c36cbc5e533f48bd89a7b5f339bd82b8b4378a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 22 May 2023 15:41:21 +0200
Subject: r8169: Use a raw_spinlock_t for the register locks.

The driver's interrupt service routine is requested with the
IRQF_NO_THREAD if MSI is available. This means that the routine is
invoked in hardirq context even on PREEMPT_RT. The routine itself is
relatively short and schedules a worker, performs register access and
schedules NAPI. On PREEMPT_RT, scheduling NAPI from hardirq results in
waking ksoftirqd for further processing so using NAPI threads with this
driver is highly recommended since it NULL routes the threaded-IRQ
efforts.

Adding rtl_hw_aspm_clkreq_enable() to the ISR is problematic on
PREEMPT_RT because the function uses spinlock_t locks which become
sleeping locks on PREEMPT_RT. The locks are only used to protect
register access and don't nest into other functions or locks. They are
also not used for an unbounded period of time. Therefore it looks okay to
convert them to raw_spinlock_t.

Convert the three locks which are used from the interrupt service
routine to raw_spinlock_t.

Fixes: e1ed3e4d9111 ("r8169: disable ASPM during NAPI poll")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
Link: https://lore.kernel.org/r/20230522134121.uxjax0F5@linutronix.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/ethernet/realtek/r8169_main.c | 44 +++++++++++++++----------------
 1 file changed, 22 insertions(+), 22 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index a7e376e7e689..4b19803a7dd0 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -616,10 +616,10 @@ struct rtl8169_private {
 		struct work_struct work;
 	} wk;
 
-	spinlock_t config25_lock;
-	spinlock_t mac_ocp_lock;
+	raw_spinlock_t config25_lock;
+	raw_spinlock_t mac_ocp_lock;
 
-	spinlock_t cfg9346_usage_lock;
+	raw_spinlock_t cfg9346_usage_lock;
 	int cfg9346_usage_count;
 
 	unsigned supports_gmii:1;
@@ -671,20 +671,20 @@ static void rtl_lock_config_regs(struct rtl8169_private *tp)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&tp->cfg9346_usage_lock, flags);
+	raw_spin_lock_irqsave(&tp->cfg9346_usage_lock, flags);
 	if (!--tp->cfg9346_usage_count)
 		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
-	spin_unlock_irqrestore(&tp->cfg9346_usage_lock, flags);
+	raw_spin_unlock_irqrestore(&tp->cfg9346_usage_lock, flags);
 }
 
 static void rtl_unlock_config_regs(struct rtl8169_private *tp)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&tp->cfg9346_usage_lock, flags);
+	raw_spin_lock_irqsave(&tp->cfg9346_usage_lock, flags);
 	if (!tp->cfg9346_usage_count++)
 		RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
-	spin_unlock_irqrestore(&tp->cfg9346_usage_lock, flags);
+	raw_spin_unlock_irqrestore(&tp->cfg9346_usage_lock, flags);
 }
 
 static void rtl_pci_commit(struct rtl8169_private *tp)
@@ -698,10 +698,10 @@ static void rtl_mod_config2(struct rtl8169_private *tp, u8 clear, u8 set)
 	unsigned long flags;
 	u8 val;
 
-	spin_lock_irqsave(&tp->config25_lock, flags);
+	raw_spin_lock_irqsave(&tp->config25_lock, flags);
 	val = RTL_R8(tp, Config2);
 	RTL_W8(tp, Config2, (val & ~clear) | set);
-	spin_unlock_irqrestore(&tp->config25_lock, flags);
+	raw_spin_unlock_irqrestore(&tp->config25_lock, flags);
 }
 
 static void rtl_mod_config5(struct rtl8169_private *tp, u8 clear, u8 set)
@@ -709,10 +709,10 @@ static void rtl_mod_config5(struct rtl8169_private *tp, u8 clear, u8 set)
 	unsigned long flags;
 	u8 val;
 
-	spin_lock_irqsave(&tp->config25_lock, flags);
+	raw_spin_lock_irqsave(&tp->config25_lock, flags);
 	val = RTL_R8(tp, Config5);
 	RTL_W8(tp, Config5, (val & ~clear) | set);
-	spin_unlock_irqrestore(&tp->config25_lock, flags);
+	raw_spin_unlock_irqrestore(&tp->config25_lock, flags);
 }
 
 static bool rtl_is_8125(struct rtl8169_private *tp)
@@ -899,9 +899,9 @@ static void r8168_mac_ocp_write(struct rtl8169_private *tp, u32 reg, u32 data)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&tp->mac_ocp_lock, flags);
+	raw_spin_lock_irqsave(&tp->mac_ocp_lock, flags);
 	__r8168_mac_ocp_write(tp, reg, data);
-	spin_unlock_irqrestore(&tp->mac_ocp_lock, flags);
+	raw_spin_unlock_irqrestore(&tp->mac_ocp_lock, flags);
 }
 
 static u16 __r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg)
@@ -919,9 +919,9 @@ static u16 r8168_mac_ocp_read(struct rtl8169_private *tp, u32 reg)
 	unsigned long flags;
 	u16 val;
 
-	spin_lock_irqsave(&tp->mac_ocp_lock, flags);
+	raw_spin_lock_irqsave(&tp->mac_ocp_lock, flags);
 	val = __r8168_mac_ocp_read(tp, reg);
-	spin_unlock_irqrestore(&tp->mac_ocp_lock, flags);
+	raw_spin_unlock_irqrestore(&tp->mac_ocp_lock, flags);
 
 	return val;
 }
@@ -932,10 +932,10 @@ static void r8168_mac_ocp_modify(struct rtl8169_private *tp, u32 reg, u16 mask,
 	unsigned long flags;
 	u16 data;
 
-	spin_lock_irqsave(&tp->mac_ocp_lock, flags);
+	raw_spin_lock_irqsave(&tp->mac_ocp_lock, flags);
 	data = __r8168_mac_ocp_read(tp, reg);
 	__r8168_mac_ocp_write(tp, reg, (data & ~mask) | set);
-	spin_unlock_irqrestore(&tp->mac_ocp_lock, flags);
+	raw_spin_unlock_irqrestore(&tp->mac_ocp_lock, flags);
 }
 
 /* Work around a hw issue with RTL8168g PHY, the quirk disables
@@ -1420,14 +1420,14 @@ static void __rtl8169_set_wol(struct rtl8169_private *tp, u32 wolopts)
 			r8168_mac_ocp_modify(tp, 0xc0b6, BIT(0), 0);
 	}
 
-	spin_lock_irqsave(&tp->config25_lock, flags);
+	raw_spin_lock_irqsave(&tp->config25_lock, flags);
 	for (i = 0; i < tmp; i++) {
 		options = RTL_R8(tp, cfg[i].reg) & ~cfg[i].mask;
 		if (wolopts & cfg[i].opt)
 			options |= cfg[i].mask;
 		RTL_W8(tp, cfg[i].reg, options);
 	}
-	spin_unlock_irqrestore(&tp->config25_lock, flags);
+	raw_spin_unlock_irqrestore(&tp->config25_lock, flags);
 
 	switch (tp->mac_version) {
 	case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_06:
@@ -5179,9 +5179,9 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	tp->eee_adv = -1;
 	tp->ocp_base = OCP_STD_PHY_BASE;
 
-	spin_lock_init(&tp->cfg9346_usage_lock);
-	spin_lock_init(&tp->config25_lock);
-	spin_lock_init(&tp->mac_ocp_lock);
+	raw_spin_lock_init(&tp->cfg9346_usage_lock);
+	raw_spin_lock_init(&tp->config25_lock);
+	raw_spin_lock_init(&tp->mac_ocp_lock);
 
 	dev->tstats = devm_netdev_alloc_pcpu_stats(&pdev->dev,
 						   struct pcpu_sw_netstats);
-- 
cgit v1.2.3


From 04910d8cbfed65dad21c31723c6c1a8d9f990fb6 Mon Sep 17 00:00:00 2001
From: Arınç ÜNAL <arinc.unal@arinc9.com>
Date: Mon, 22 May 2023 13:57:43 +0300
Subject: net: ethernet: mtk_eth_soc: fix QoS on DSA MAC on non MTK_NETSYS_V2
 SoCs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The commit c6d96df9fa2c ("net: ethernet: mtk_eth_soc: drop generic vlan rx
offload, only use DSA untagging") makes VLAN RX offloading to be only used
on the SoCs without the MTK_NETSYS_V2 ability (which are not just MT7621
and MT7622). The commit disables the proper handling of special tagged
(DSA) frames, added with commit 87e3df4961f4 ("net-next: ethernet:
mediatek: add CDM able to recognize the tag for DSA"), for non
MTK_NETSYS_V2 SoCs when it finds a MAC that does not use DSA. So if the
other MAC uses DSA, the CDMQ component transmits DSA tagged frames to the
CPU improperly. This issue can be observed on frames with TCP, for example,
a TCP speed test using iperf3 won't work.

The commit disables the proper handling of special tagged (DSA) frames
because it assumes that these SoCs don't use more than one MAC, which is
wrong. Although I made Frank address this false assumption on the patch log
when they sent the patch on behalf of Felix, the code still made changes
with this assumption.

Therefore, the proper handling of special tagged (DSA) frames must be kept
enabled in all circumstances as it doesn't affect non DSA tagged frames.

Hardware DSA untagging, introduced with the commit 2d7605a72906 ("net:
ethernet: mtk_eth_soc: enable hardware DSA untagging"), and VLAN RX
offloading are operations on the two CDM components of the frame engine,
CDMP and CDMQ, which connect to Packet DMA (PDMA) and QoS DMA (QDMA) and
are between the MACs and the CPU. These operations apply to all MACs of the
SoC so if one MAC uses DSA and the other doesn't, the hardware DSA
untagging operation will cause the CDMP component to transmit non DSA
tagged frames to the CPU improperly.

Since the VLAN RX offloading feature configuration was dropped, VLAN RX
offloading can only be used along with hardware DSA untagging. So, for the
case above, we need to disable both features and leave it to the CPU,
therefore software, to untag the DSA and VLAN tags.

So the correct way to handle this is:

For all SoCs:

Enable the proper handling of special tagged (DSA) frames
(MTK_CDMQ_IG_CTRL).

For non MTK_NETSYS_V2 SoCs:

Enable hardware DSA untagging (MTK_CDMP_IG_CTRL).
Enable VLAN RX offloading (MTK_CDMP_EG_CTRL).

When a non MTK_NETSYS_V2 SoC MAC does not use DSA:

Disable hardware DSA untagging (MTK_CDMP_IG_CTRL).
Disable VLAN RX offloading (MTK_CDMP_EG_CTRL).

Fixes: c6d96df9fa2c ("net: ethernet: mtk_eth_soc: drop generic vlan rx offload, only use DSA untagging")
Signed-off-by: Arınç ÜNAL <arinc.unal@arinc9.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index a75fd072082c..834c644b67db 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -3269,18 +3269,14 @@ static int mtk_open(struct net_device *dev)
 			eth->dsa_meta[i] = md_dst;
 		}
 	} else {
-		/* Hardware special tag parsing needs to be disabled if at least
-		 * one MAC does not use DSA.
+		/* Hardware DSA untagging and VLAN RX offloading need to be
+		 * disabled if at least one MAC does not use DSA.
 		 */
 		u32 val = mtk_r32(eth, MTK_CDMP_IG_CTRL);
 
 		val &= ~MTK_CDMP_STAG_EN;
 		mtk_w32(eth, val, MTK_CDMP_IG_CTRL);
 
-		val = mtk_r32(eth, MTK_CDMQ_IG_CTRL);
-		val &= ~MTK_CDMQ_STAG_EN;
-		mtk_w32(eth, val, MTK_CDMQ_IG_CTRL);
-
 		mtk_w32(eth, 0, MTK_CDMP_EG_CTRL);
 	}
 
-- 
cgit v1.2.3


From 57fb54ab9f6945e204740b696bd4cee61ee04e5e Mon Sep 17 00:00:00 2001
From: David Epping <david.epping@missinglinkelectronics.com>
Date: Tue, 23 May 2023 17:31:05 +0200
Subject: net: phy: mscc: add VSC8502 to MODULE_DEVICE_TABLE

The mscc driver implements support for VSC8502, so its ID should be in
the MODULE_DEVICE_TABLE for automatic loading.

Signed-off-by: David Epping <david.epping@missinglinkelectronics.com>
Fixes: d3169863310d ("net: phy: mscc: add support for VSC8502")
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/mscc/mscc_main.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers')

diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index 62bf99e45af1..bd81a4b041e5 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -2656,6 +2656,7 @@ static struct phy_driver vsc85xx_driver[] = {
 module_phy_driver(vsc85xx_driver);
 
 static struct mdio_device_id __maybe_unused vsc85xx_tbl[] = {
+	{ PHY_ID_VSC8502, 0xfffffff0, },
 	{ PHY_ID_VSC8504, 0xfffffff0, },
 	{ PHY_ID_VSC8514, 0xfffffff0, },
 	{ PHY_ID_VSC8530, 0xfffffff0, },
-- 
cgit v1.2.3


From fb055ce4a9e3a115f5dc42011a97cf0cfc7820e4 Mon Sep 17 00:00:00 2001
From: David Epping <david.epping@missinglinkelectronics.com>
Date: Tue, 23 May 2023 17:31:06 +0200
Subject: net: phy: mscc: add support for VSC8501

The VSC8501 PHY can use the same driver implementation as the VSC8502.
Adding the PHY ID and copying the handler functions of VSC8502 is
sufficient to operate it.

Signed-off-by: David Epping <david.epping@missinglinkelectronics.com>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/mscc/mscc.h      |  1 +
 drivers/net/phy/mscc/mscc_main.c | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+)

(limited to 'drivers')

diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h
index a50235fdf7d9..79cbb2418664 100644
--- a/drivers/net/phy/mscc/mscc.h
+++ b/drivers/net/phy/mscc/mscc.h
@@ -276,6 +276,7 @@ enum rgmii_clock_delay {
 /* Microsemi PHY ID's
  *   Code assumes lowest nibble is 0
  */
+#define PHY_ID_VSC8501			  0x00070530
 #define PHY_ID_VSC8502			  0x00070630
 #define PHY_ID_VSC8504			  0x000704c0
 #define PHY_ID_VSC8514			  0x00070670
diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index bd81a4b041e5..29fc27a16805 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -2316,6 +2316,30 @@ static int vsc85xx_probe(struct phy_device *phydev)
 
 /* Microsemi VSC85xx PHYs */
 static struct phy_driver vsc85xx_driver[] = {
+{
+	.phy_id		= PHY_ID_VSC8501,
+	.name		= "Microsemi GE VSC8501 SyncE",
+	.phy_id_mask	= 0xfffffff0,
+	/* PHY_BASIC_FEATURES */
+	.soft_reset	= &genphy_soft_reset,
+	.config_init	= &vsc85xx_config_init,
+	.config_aneg    = &vsc85xx_config_aneg,
+	.read_status	= &vsc85xx_read_status,
+	.handle_interrupt = vsc85xx_handle_interrupt,
+	.config_intr	= &vsc85xx_config_intr,
+	.suspend	= &genphy_suspend,
+	.resume		= &genphy_resume,
+	.probe		= &vsc85xx_probe,
+	.set_wol	= &vsc85xx_wol_set,
+	.get_wol	= &vsc85xx_wol_get,
+	.get_tunable	= &vsc85xx_get_tunable,
+	.set_tunable	= &vsc85xx_set_tunable,
+	.read_page	= &vsc85xx_phy_read_page,
+	.write_page	= &vsc85xx_phy_write_page,
+	.get_sset_count = &vsc85xx_get_sset_count,
+	.get_strings    = &vsc85xx_get_strings,
+	.get_stats      = &vsc85xx_get_stats,
+},
 {
 	.phy_id		= PHY_ID_VSC8502,
 	.name		= "Microsemi GE VSC8502 SyncE",
@@ -2656,6 +2680,7 @@ static struct phy_driver vsc85xx_driver[] = {
 module_phy_driver(vsc85xx_driver);
 
 static struct mdio_device_id __maybe_unused vsc85xx_tbl[] = {
+	{ PHY_ID_VSC8501, 0xfffffff0, },
 	{ PHY_ID_VSC8502, 0xfffffff0, },
 	{ PHY_ID_VSC8504, 0xfffffff0, },
 	{ PHY_ID_VSC8514, 0xfffffff0, },
-- 
cgit v1.2.3


From 7df0b33d7993338a06e4039ec025bb67851ee41d Mon Sep 17 00:00:00 2001
From: David Epping <david.epping@missinglinkelectronics.com>
Date: Tue, 23 May 2023 17:31:07 +0200
Subject: net: phy: mscc: remove unnecessary phydev locking

Holding the struct phy_device (phydev) lock is unnecessary when
accessing phydev->interface in the PHY driver .config_init method,
which is the only place that vsc85xx_rgmii_set_skews() is called from.

The phy_modify_paged() function implements required MDIO bus level
locking, which can not be achieved by a phydev lock.

Signed-off-by: David Epping <david.epping@missinglinkelectronics.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/mscc/mscc_main.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index 29fc27a16805..0c39b3ecb1f2 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -528,8 +528,6 @@ static int vsc85xx_rgmii_set_skews(struct phy_device *phydev, u32 rgmii_cntl,
 	u16 reg_val = 0;
 	int rc;
 
-	mutex_lock(&phydev->lock);
-
 	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID ||
 	    phydev->interface == PHY_INTERFACE_MODE_RGMII_ID)
 		reg_val |= RGMII_CLK_DELAY_2_0_NS << rgmii_rx_delay_pos;
@@ -542,8 +540,6 @@ static int vsc85xx_rgmii_set_skews(struct phy_device *phydev, u32 rgmii_cntl,
 			      rgmii_rx_delay_mask | rgmii_tx_delay_mask,
 			      reg_val);
 
-	mutex_unlock(&phydev->lock);
-
 	return rc;
 }
 
-- 
cgit v1.2.3


From 71460c9ec5c743e9ffffca3c874d66267c36345e Mon Sep 17 00:00:00 2001
From: David Epping <david.epping@missinglinkelectronics.com>
Date: Tue, 23 May 2023 17:31:08 +0200
Subject: net: phy: mscc: enable VSC8501/2 RGMII RX clock

By default the VSC8501 and VSC8502 RGMII/GMII/MII RX_CLK output is
disabled. To allow packet forwarding towards the MAC it needs to be
enabled.

For other PHYs supported by this driver the clock output is enabled
by default.

Fixes: d3169863310d ("net: phy: mscc: add support for VSC8502")
Signed-off-by: David Epping <david.epping@missinglinkelectronics.com>
Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Reviewed-by: Vladimir Oltean <olteanv@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/phy/mscc/mscc.h      |  1 +
 drivers/net/phy/mscc/mscc_main.c | 54 +++++++++++++++++++++-------------------
 2 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'drivers')

diff --git a/drivers/net/phy/mscc/mscc.h b/drivers/net/phy/mscc/mscc.h
index 79cbb2418664..defe5cc6d4fc 100644
--- a/drivers/net/phy/mscc/mscc.h
+++ b/drivers/net/phy/mscc/mscc.h
@@ -179,6 +179,7 @@ enum rgmii_clock_delay {
 #define VSC8502_RGMII_CNTL		  20
 #define VSC8502_RGMII_RX_DELAY_MASK	  0x0070
 #define VSC8502_RGMII_TX_DELAY_MASK	  0x0007
+#define VSC8502_RGMII_RX_CLK_DISABLE	  0x0800
 
 #define MSCC_PHY_WOL_LOWER_MAC_ADDR	  21
 #define MSCC_PHY_WOL_MID_MAC_ADDR	  22
diff --git a/drivers/net/phy/mscc/mscc_main.c b/drivers/net/phy/mscc/mscc_main.c
index 0c39b3ecb1f2..28df8a2e4230 100644
--- a/drivers/net/phy/mscc/mscc_main.c
+++ b/drivers/net/phy/mscc/mscc_main.c
@@ -519,14 +519,27 @@ out_unlock:
  *  * 2.0 ns (which causes the data to be sampled at exactly half way between
  *    clock transitions at 1000 Mbps) if delays should be enabled
  */
-static int vsc85xx_rgmii_set_skews(struct phy_device *phydev, u32 rgmii_cntl,
-				   u16 rgmii_rx_delay_mask,
-				   u16 rgmii_tx_delay_mask)
+static int vsc85xx_update_rgmii_cntl(struct phy_device *phydev, u32 rgmii_cntl,
+				     u16 rgmii_rx_delay_mask,
+				     u16 rgmii_tx_delay_mask)
 {
 	u16 rgmii_rx_delay_pos = ffs(rgmii_rx_delay_mask) - 1;
 	u16 rgmii_tx_delay_pos = ffs(rgmii_tx_delay_mask) - 1;
 	u16 reg_val = 0;
-	int rc;
+	u16 mask = 0;
+	int rc = 0;
+
+	/* For traffic to pass, the VSC8502 family needs the RX_CLK disable bit
+	 * to be unset for all PHY modes, so do that as part of the paged
+	 * register modification.
+	 * For some family members (like VSC8530/31/40/41) this bit is reserved
+	 * and read-only, and the RX clock is enabled by default.
+	 */
+	if (rgmii_cntl == VSC8502_RGMII_CNTL)
+		mask |= VSC8502_RGMII_RX_CLK_DISABLE;
+
+	if (phy_interface_is_rgmii(phydev))
+		mask |= rgmii_rx_delay_mask | rgmii_tx_delay_mask;
 
 	if (phydev->interface == PHY_INTERFACE_MODE_RGMII_RXID ||
 	    phydev->interface == PHY_INTERFACE_MODE_RGMII_ID)
@@ -535,29 +548,20 @@ static int vsc85xx_rgmii_set_skews(struct phy_device *phydev, u32 rgmii_cntl,
 	    phydev->interface == PHY_INTERFACE_MODE_RGMII_ID)
 		reg_val |= RGMII_CLK_DELAY_2_0_NS << rgmii_tx_delay_pos;
 
-	rc = phy_modify_paged(phydev, MSCC_PHY_PAGE_EXTENDED_2,
-			      rgmii_cntl,
-			      rgmii_rx_delay_mask | rgmii_tx_delay_mask,
-			      reg_val);
+	if (mask)
+		rc = phy_modify_paged(phydev, MSCC_PHY_PAGE_EXTENDED_2,
+				      rgmii_cntl, mask, reg_val);
 
 	return rc;
 }
 
 static int vsc85xx_default_config(struct phy_device *phydev)
 {
-	int rc;
-
 	phydev->mdix_ctrl = ETH_TP_MDI_AUTO;
 
-	if (phy_interface_mode_is_rgmii(phydev->interface)) {
-		rc = vsc85xx_rgmii_set_skews(phydev, VSC8502_RGMII_CNTL,
-					     VSC8502_RGMII_RX_DELAY_MASK,
-					     VSC8502_RGMII_TX_DELAY_MASK);
-		if (rc)
-			return rc;
-	}
-
-	return 0;
+	return vsc85xx_update_rgmii_cntl(phydev, VSC8502_RGMII_CNTL,
+					 VSC8502_RGMII_RX_DELAY_MASK,
+					 VSC8502_RGMII_TX_DELAY_MASK);
 }
 
 static int vsc85xx_get_tunable(struct phy_device *phydev,
@@ -1754,13 +1758,11 @@ static int vsc8584_config_init(struct phy_device *phydev)
 	if (ret)
 		return ret;
 
-	if (phy_interface_is_rgmii(phydev)) {
-		ret = vsc85xx_rgmii_set_skews(phydev, VSC8572_RGMII_CNTL,
-					      VSC8572_RGMII_RX_DELAY_MASK,
-					      VSC8572_RGMII_TX_DELAY_MASK);
-		if (ret)
-			return ret;
-	}
+	ret = vsc85xx_update_rgmii_cntl(phydev, VSC8572_RGMII_CNTL,
+					VSC8572_RGMII_RX_DELAY_MASK,
+					VSC8572_RGMII_TX_DELAY_MASK);
+	if (ret)
+		return ret;
 
 	ret = genphy_soft_reset(phydev);
 	if (ret)
-- 
cgit v1.2.3