From 96f887a612e4cda89efc3f54bc10c1997e3ab0e9 Mon Sep 17 00:00:00 2001
From: Hagar Hemdan <hagarhem@amazon.com>
Date: Sat, 18 May 2024 13:04:39 +0000
Subject: net: esp: cleanup esp_output_tail_tcp() in case of unsupported
 ESPINTCP

xmit() functions should consume skb or return error codes in error
paths.
When the configuration "CONFIG_INET_ESPINTCP" is not set, the
implementation of the function "esp_output_tail_tcp" violates this rule.
The function frees the skb and returns the error code.
This change removes the kfree_skb from both functions, for both
esp4 and esp6.
WARN_ON is added because esp_output_tail_tcp() should never be called if
CONFIG_INET_ESPINTCP is not set.

This bug was discovered and resolved using Coverity Static Analysis
Security Testing (SAST) by Synopsys, Inc.

Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)")
Signed-off-by: Hagar Hemdan <hagarhem@amazon.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4.c | 3 +--
 net/ipv6/esp6.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 3968d3f98e08..619a4df7be1e 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -239,8 +239,7 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
 #else
 static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
 {
-	kfree_skb(skb);
-
+	WARN_ON(1);
 	return -EOPNOTSUPP;
 }
 #endif
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 34a9a5b9ed00..3920e8aa1031 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -256,8 +256,7 @@ static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
 #else
 static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
 {
-	kfree_skb(skb);
-
+	WARN_ON(1);
 	return -EOPNOTSUPP;
 }
 #endif
-- 
cgit v1.2.3


From 54fcc6189dfb822eea984fa2b3e477a02447279d Mon Sep 17 00:00:00 2001
From: Antony Antony <antony.antony@secunet.com>
Date: Tue, 11 Jun 2024 08:31:29 +0200
Subject: xfrm: Fix input error path memory access

When there is a misconfiguration of input state slow path
KASAN report error. Fix this error.
west login:
[   52.987278] eth1: renamed from veth11
[   53.078814] eth1: renamed from veth21
[   53.181355] eth1: renamed from veth31
[   54.921702] ==================================================================
[   54.922602] BUG: KASAN: wild-memory-access in xfrmi_rcv_cb+0x2d/0x295
[   54.923393] Read of size 8 at addr 6b6b6b6b00000000 by task ping/512
[   54.924169]
[   54.924386] CPU: 0 PID: 512 Comm: ping Not tainted 6.9.0-08574-gcd29a4313a1b #25
[   54.925290] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   54.926401] Call Trace:
[   54.926731]  <IRQ>
[   54.927009]  dump_stack_lvl+0x2a/0x3b
[   54.927478]  kasan_report+0x84/0xa6
[   54.927930]  ? xfrmi_rcv_cb+0x2d/0x295
[   54.928410]  xfrmi_rcv_cb+0x2d/0x295
[   54.928872]  ? xfrm4_rcv_cb+0x3d/0x5e
[   54.929354]  xfrm4_rcv_cb+0x46/0x5e
[   54.929804]  xfrm_rcv_cb+0x7e/0xa1
[   54.930240]  xfrm_input+0x1b3a/0x1b96
[   54.930715]  ? xfrm_offload+0x41/0x41
[   54.931182]  ? raw_rcv+0x292/0x292
[   54.931617]  ? nf_conntrack_confirm+0xa2/0xa2
[   54.932158]  ? skb_sec_path+0xd/0x3f
[   54.932610]  ? xfrmi_input+0x90/0xce
[   54.933066]  xfrm4_esp_rcv+0x33/0x54
[   54.933521]  ip_protocol_deliver_rcu+0xd7/0x1b2
[   54.934089]  ip_local_deliver_finish+0x110/0x120
[   54.934659]  ? ip_protocol_deliver_rcu+0x1b2/0x1b2
[   54.935248]  NF_HOOK.constprop.0+0xf8/0x138
[   54.935767]  ? ip_sublist_rcv_finish+0x68/0x68
[   54.936317]  ? secure_tcpv6_ts_off+0x23/0x168
[   54.936859]  ? ip_protocol_deliver_rcu+0x1b2/0x1b2
[   54.937454]  ? __xfrm_policy_check2.constprop.0+0x18d/0x18d
[   54.938135]  NF_HOOK.constprop.0+0xf8/0x138
[   54.938663]  ? ip_sublist_rcv_finish+0x68/0x68
[   54.939220]  ? __xfrm_policy_check2.constprop.0+0x18d/0x18d
[   54.939904]  ? ip_local_deliver_finish+0x120/0x120
[   54.940497]  __netif_receive_skb_one_core+0xc9/0x107
[   54.941121]  ? __netif_receive_skb_list_core+0x1c2/0x1c2
[   54.941771]  ? blk_mq_start_stopped_hw_queues+0xc7/0xf9
[   54.942413]  ? blk_mq_start_stopped_hw_queue+0x38/0x38
[   54.943044]  ? virtqueue_get_buf_ctx+0x295/0x46b
[   54.943618]  process_backlog+0xb3/0x187
[   54.944102]  __napi_poll.constprop.0+0x57/0x1a7
[   54.944669]  net_rx_action+0x1cb/0x380
[   54.945150]  ? __napi_poll.constprop.0+0x1a7/0x1a7
[   54.945744]  ? vring_new_virtqueue+0x17a/0x17a
[   54.946300]  ? note_interrupt+0x2cd/0x367
[   54.946805]  handle_softirqs+0x13c/0x2c9
[   54.947300]  do_softirq+0x5f/0x7d
[   54.947727]  </IRQ>
[   54.948014]  <TASK>
[   54.948300]  __local_bh_enable_ip+0x48/0x62
[   54.948832]  __neigh_event_send+0x3fd/0x4ca
[   54.949361]  neigh_resolve_output+0x1e/0x210
[   54.949896]  ip_finish_output2+0x4bf/0x4f0
[   54.950410]  ? __ip_finish_output+0x171/0x1b8
[   54.950956]  ip_send_skb+0x25/0x57
[   54.951390]  raw_sendmsg+0xf95/0x10c0
[   54.951850]  ? check_new_pages+0x45/0x71
[   54.952343]  ? raw_hash_sk+0x21b/0x21b
[   54.952815]  ? kernel_init_pages+0x42/0x51
[   54.953337]  ? prep_new_page+0x44/0x51
[   54.953811]  ? get_page_from_freelist+0x72b/0x915
[   54.954390]  ? signal_pending_state+0x77/0x77
[   54.954936]  ? preempt_count_sub+0x14/0xb3
[   54.955450]  ? __might_resched+0x8a/0x240
[   54.955951]  ? __might_sleep+0x25/0xa0
[   54.956424]  ? first_zones_zonelist+0x2c/0x43
[   54.956977]  ? __rcu_read_lock+0x2d/0x3a
[   54.957476]  ? __pte_offset_map+0x32/0xa4
[   54.957980]  ? __might_resched+0x8a/0x240
[   54.958483]  ? __might_sleep+0x25/0xa0
[   54.958963]  ? inet_send_prepare+0x54/0x54
[   54.959478]  ? sock_sendmsg_nosec+0x42/0x6c
[   54.960000]  sock_sendmsg_nosec+0x42/0x6c
[   54.960502]  __sys_sendto+0x15d/0x1cc
[   54.960966]  ? __x64_sys_getpeername+0x44/0x44
[   54.961522]  ? __handle_mm_fault+0x679/0xae4
[   54.962068]  ? find_vma+0x6b/0x8b
[   54.962497]  ? find_vma_intersection+0x8a/0x8a
[   54.963052]  ? handle_mm_fault+0x38/0x154
[   54.963556]  ? handle_mm_fault+0xeb/0x154
[   54.964059]  ? preempt_latency_start+0x29/0x34
[   54.964613]  ? preempt_count_sub+0x14/0xb3
[   54.965141]  ? up_read+0x4b/0x5c
[   54.965557]  __x64_sys_sendto+0x76/0x82
[   54.966041]  do_syscall_64+0x69/0xd5
[   54.966497]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
[   54.967119] RIP: 0033:0x7f2d2fec9a73
[   54.967572] Code: 8b 15 a9 83 0c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 80 3d 71 0b 0d 00 00 41 89 ca 74 14 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 75 c3 0f 1f 40 00 55 48 83 ec 30 44 89 4c 24
[   54.969747] RSP: 002b:00007ffe85756418 EFLAGS: 00000202 ORIG_RAX: 000000000000002c
[   54.970655] RAX: ffffffffffffffda RBX: 0000558bebad1340 RCX: 00007f2d2fec9a73
[   54.971511] RDX: 0000000000000040 RSI: 0000558bebad73c0 RDI: 0000000000000003
[   54.972366] RBP: 0000558bebad73c0 R08: 0000558bebad35c0 R09: 0000000000000010
[   54.973234] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000040
[   54.974091] R13: 00007ffe85757b00 R14: 0000001d00000001 R15: 0000558bebad4680
[   54.974951]  </TASK>
[   54.975244] ==================================================================
[   54.976133] Disabling lock debugging due to kernel taint
[   54.976784] Oops: stack segment: 0000 [#1] PREEMPT DEBUG_PAGEALLOC KASAN
[   54.977603] CPU: 0 PID: 512 Comm: ping Tainted: G    B              6.9.0-08574-gcd29a4313a1b #25
[   54.978654] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   54.979750] RIP: 0010:xfrmi_rcv_cb+0x2d/0x295
[   54.980293] Code: 00 00 41 57 41 56 41 89 f6 41 55 41 54 55 53 48 89 fb 51 85 f6 75 31 48 89 df e8 d7 e8 ff ff 48 89 c5 48 89 c7 e8 8b a4 4f ff <48> 8b 7d 00 48 89 ee e8 eb f3 ff ff 49 89 c5 b8 01 00 00 00 4d 85
[   54.982462] RSP: 0018:ffffc90000007990 EFLAGS: 00010282
[   54.983099] RAX: 0000000000000001 RBX: ffff8881126e9900 RCX: fffffbfff07b77cd
[   54.983948] RDX: fffffbfff07b77cd RSI: fffffbfff07b77cd RDI: ffffffff83dbbe60
[   54.984794] RBP: 6b6b6b6b00000000 R08: 0000000000000008 R09: 0000000000000001
[   54.985647] R10: ffffffff83dbbe67 R11: fffffbfff07b77cc R12: 00000000ffffffff
[   54.986512] R13: 00000000ffffffff R14: 00000000ffffffff R15: 0000000000000002
[   54.987365] FS:  00007f2d2fc0dc40(0000) GS:ffffffff82eb2000(0000) knlGS:0000000000000000
[   54.988329] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   54.989026] CR2: 00007ffe85755ff8 CR3: 0000000109941000 CR4: 0000000000350ef0
[   54.989897] Call Trace:
[   54.990223]  <IRQ>
[   54.990500]  ? __die_body+0x1a/0x56
[   54.990950]  ? die+0x30/0x49
[   54.991326]  ? do_trap+0x9b/0x132
[   54.991751]  ? do_error_trap+0x7d/0xaf
[   54.992223]  ? exc_stack_segment+0x35/0x45
[   54.992734]  ? asm_exc_stack_segment+0x22/0x30
[   54.993294]  ? xfrmi_rcv_cb+0x2d/0x295
[   54.993764]  ? xfrm4_rcv_cb+0x3d/0x5e
[   54.994228]  xfrm4_rcv_cb+0x46/0x5e
[   54.994670]  xfrm_rcv_cb+0x7e/0xa1
[   54.995106]  xfrm_input+0x1b3a/0x1b96
[   54.995572]  ? xfrm_offload+0x41/0x41
[   54.996038]  ? raw_rcv+0x292/0x292
[   54.996472]  ? nf_conntrack_confirm+0xa2/0xa2
[   54.997011]  ? skb_sec_path+0xd/0x3f
[   54.997466]  ? xfrmi_input+0x90/0xce
[   54.997925]  xfrm4_esp_rcv+0x33/0x54
[   54.998378]  ip_protocol_deliver_rcu+0xd7/0x1b2
[   54.998944]  ip_local_deliver_finish+0x110/0x120
[   54.999520]  ? ip_protocol_deliver_rcu+0x1b2/0x1b2
[   55.000111]  NF_HOOK.constprop.0+0xf8/0x138
[   55.000630]  ? ip_sublist_rcv_finish+0x68/0x68
[   55.001195]  ? secure_tcpv6_ts_off+0x23/0x168
[   55.001743]  ? ip_protocol_deliver_rcu+0x1b2/0x1b2
[   55.002331]  ? __xfrm_policy_check2.constprop.0+0x18d/0x18d
[   55.003008]  NF_HOOK.constprop.0+0xf8/0x138
[   55.003527]  ? ip_sublist_rcv_finish+0x68/0x68
[   55.004078]  ? __xfrm_policy_check2.constprop.0+0x18d/0x18d
[   55.004755]  ? ip_local_deliver_finish+0x120/0x120
[   55.005351]  __netif_receive_skb_one_core+0xc9/0x107
[   55.005972]  ? __netif_receive_skb_list_core+0x1c2/0x1c2
[   55.006626]  ? blk_mq_start_stopped_hw_queues+0xc7/0xf9
[   55.007266]  ? blk_mq_start_stopped_hw_queue+0x38/0x38
[   55.007899]  ? virtqueue_get_buf_ctx+0x295/0x46b
[   55.008476]  process_backlog+0xb3/0x187
[   55.008961]  __napi_poll.constprop.0+0x57/0x1a7
[   55.009540]  net_rx_action+0x1cb/0x380
[   55.010020]  ? __napi_poll.constprop.0+0x1a7/0x1a7
[   55.010610]  ? vring_new_virtqueue+0x17a/0x17a
[   55.011173]  ? note_interrupt+0x2cd/0x367
[   55.011675]  handle_softirqs+0x13c/0x2c9
[   55.012169]  do_softirq+0x5f/0x7d
[   55.012597]  </IRQ>
[   55.012882]  <TASK>
[   55.013179]  __local_bh_enable_ip+0x48/0x62
[   55.013704]  __neigh_event_send+0x3fd/0x4ca
[   55.014227]  neigh_resolve_output+0x1e/0x210
[   55.014761]  ip_finish_output2+0x4bf/0x4f0
[   55.015278]  ? __ip_finish_output+0x171/0x1b8
[   55.015823]  ip_send_skb+0x25/0x57
[   55.016261]  raw_sendmsg+0xf95/0x10c0
[   55.016729]  ? check_new_pages+0x45/0x71
[   55.017229]  ? raw_hash_sk+0x21b/0x21b
[   55.017708]  ? kernel_init_pages+0x42/0x51
[   55.018225]  ? prep_new_page+0x44/0x51
[   55.018704]  ? get_page_from_freelist+0x72b/0x915
[   55.019292]  ? signal_pending_state+0x77/0x77
[   55.019840]  ? preempt_count_sub+0x14/0xb3
[   55.020357]  ? __might_resched+0x8a/0x240
[   55.020860]  ? __might_sleep+0x25/0xa0
[   55.021345]  ? first_zones_zonelist+0x2c/0x43
[   55.021896]  ? __rcu_read_lock+0x2d/0x3a
[   55.022396]  ? __pte_offset_map+0x32/0xa4
[   55.022901]  ? __might_resched+0x8a/0x240
[   55.023404]  ? __might_sleep+0x25/0xa0
[   55.023879]  ? inet_send_prepare+0x54/0x54
[   55.024391]  ? sock_sendmsg_nosec+0x42/0x6c
[   55.024918]  sock_sendmsg_nosec+0x42/0x6c
[   55.025428]  __sys_sendto+0x15d/0x1cc
[   55.025892]  ? __x64_sys_getpeername+0x44/0x44
[   55.026441]  ? __handle_mm_fault+0x679/0xae4
[   55.026988]  ? find_vma+0x6b/0x8b
[   55.027414]  ? find_vma_intersection+0x8a/0x8a
[   55.027966]  ? handle_mm_fault+0x38/0x154
[   55.028470]  ? handle_mm_fault+0xeb/0x154
[   55.028972]  ? preempt_latency_start+0x29/0x34
[   55.029532]  ? preempt_count_sub+0x14/0xb3
[   55.030047]  ? up_read+0x4b/0x5c
[   55.030463]  __x64_sys_sendto+0x76/0x82
[   55.030949]  do_syscall_64+0x69/0xd5
[   55.031406]  entry_SYSCALL_64_after_hwframe+0x4b/0x53
[   55.032028] RIP: 0033:0x7f2d2fec9a73
[   55.032481] Code: 8b 15 a9 83 0c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 80 3d 71 0b 0d 00 00 41 89 ca 74 14 b8 2c 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 75 c3 0f 1f 40 00 55 48 83 ec 30 44 89 4c 24
[   55.034660] RSP: 002b:00007ffe85756418 EFLAGS: 00000202 ORIG_RAX: 000000000000002c
[   55.035567] RAX: ffffffffffffffda RBX: 0000558bebad1340 RCX: 00007f2d2fec9a73
[   55.036424] RDX: 0000000000000040 RSI: 0000558bebad73c0 RDI: 0000000000000003
[   55.037293] RBP: 0000558bebad73c0 R08: 0000558bebad35c0 R09: 0000000000000010
[   55.038153] R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000040
[   55.039012] R13: 00007ffe85757b00 R14: 0000001d00000001 R15: 0000558bebad4680
[   55.039871]  </TASK>
[   55.040167] Modules linked in:
[   55.040585] ---[ end trace 0000000000000000 ]---
[   55.041164] RIP: 0010:xfrmi_rcv_cb+0x2d/0x295
[   55.041714] Code: 00 00 41 57 41 56 41 89 f6 41 55 41 54 55 53 48 89 fb 51 85 f6 75 31 48 89 df e8 d7 e8 ff ff 48 89 c5 48 89 c7 e8 8b a4 4f ff <48> 8b 7d 00 48 89 ee e8 eb f3 ff ff 49 89 c5 b8 01 00 00 00 4d 85
[   55.043889] RSP: 0018:ffffc90000007990 EFLAGS: 00010282
[   55.044528] RAX: 0000000000000001 RBX: ffff8881126e9900 RCX: fffffbfff07b77cd
[   55.045386] RDX: fffffbfff07b77cd RSI: fffffbfff07b77cd RDI: ffffffff83dbbe60
[   55.046250] RBP: 6b6b6b6b00000000 R08: 0000000000000008 R09: 0000000000000001
[   55.047104] R10: ffffffff83dbbe67 R11: fffffbfff07b77cc R12: 00000000ffffffff
[   55.047960] R13: 00000000ffffffff R14: 00000000ffffffff R15: 0000000000000002
[   55.048820] FS:  00007f2d2fc0dc40(0000) GS:ffffffff82eb2000(0000) knlGS:0000000000000000
[   55.049805] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   55.050507] CR2: 00007ffe85755ff8 CR3: 0000000109941000 CR4: 0000000000350ef0
[   55.051366] Kernel panic - not syncing: Fatal exception in interrupt
[   55.052136] Kernel Offset: disabled
[   55.052577] ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]---

Fixes: 304b44f0d5a4 ("xfrm: Add dir validation to "in" data path lookup")
Signed-off-by: Antony Antony <antony.antony@secunet.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_input.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'net')

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index d2ea18dcb0cb..63c004103912 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -585,8 +585,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 		}
 
 		if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) {
+			secpath_reset(skb);
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR);
+			xfrm_audit_state_notfound(skb, family, spi, seq);
 			xfrm_state_put(x);
+			x = NULL;
 			goto drop;
 		}
 
-- 
cgit v1.2.3


From 15f5fe9e84839dcc9eaa69b08ced9d24cb464369 Mon Sep 17 00:00:00 2001
From: Antony Antony <antony.antony@secunet.com>
Date: Tue, 11 Jun 2024 08:32:15 +0200
Subject: xfrm: Log input direction mismatch error in one place

Previously, the offload data path decrypted the packet before checking
the direction, leading to error logging and packet dropping. However,
dropped packets wouldn't be visible in tcpdump or audit log.

With this fix, the offload path, upon noticing SA direction mismatch,
will pass the packet to the stack without decrypting it. The L3 layer
will then log the error, audit, and drop ESP without decrypting or
decapsulating it.

This also ensures that the slow path records the error and audit log,
making dropped packets visible in tcpdump.

Fixes: 304b44f0d5a4 ("xfrm: Add dir validation to "in" data path lookup")
Signed-off-by: Antony Antony <antony.antony@secunet.com>
Reviewed-by: Simon Horman <horms@kernel.org>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/esp4_offload.c | 7 +++++++
 net/ipv6/esp6_offload.c | 7 +++++++
 net/xfrm/xfrm_input.c   | 5 -----
 3 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index b3271957ad9a..3f28ecbdcaef 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -56,6 +56,13 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
 		x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
 				      (xfrm_address_t *)&ip_hdr(skb)->daddr,
 				      spi, IPPROTO_ESP, AF_INET);
+
+		if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) {
+			/* non-offload path will record the error and audit log */
+			xfrm_state_put(x);
+			x = NULL;
+		}
+
 		if (!x)
 			goto out_reset;
 
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 527b7caddbc6..919ebfabbe4e 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -83,6 +83,13 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head,
 		x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
 				      (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
 				      spi, IPPROTO_ESP, AF_INET6);
+
+		if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) {
+			/* non-offload path will record the error and audit log */
+			xfrm_state_put(x);
+			x = NULL;
+		}
+
 		if (!x)
 			goto out_reset;
 
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 63c004103912..e95462b982b0 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -474,11 +474,6 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 	if (encap_type < 0 || (xo && xo->flags & XFRM_GRO)) {
 		x = xfrm_input_state(skb);
 
-		if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) {
-			XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR);
-			goto drop;
-		}
-
 		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
 			if (x->km.state == XFRM_STATE_ACQ)
 				XFRM_INC_STATS(net, LINUX_MIB_XFRMACQUIREERROR);
-- 
cgit v1.2.3


From 07b87f9eea0c30675084d50c82532d20168da009 Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Thu, 20 Jun 2024 08:47:24 +0200
Subject: xfrm: Fix unregister netdevice hang on hardware offload.

When offloading xfrm states to hardware, the offloading
device is attached to the skbs secpath. If a skb is free
is deferred, an unregister netdevice hangs because the
netdevice is still refcounted.

Fix this by removing the netdevice from the xfrm states
when the netdevice is unregistered. To find all xfrm states
that need to be cleared we add another list where skbs
linked to that are unlinked from the lists (deleted)
but not yet freed.

Fixes: d77e38e612a0 ("xfrm: Add an IPsec hardware offloading API")
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h    | 36 +++++++++---------------------
 net/xfrm/xfrm_state.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 28 deletions(-)

(limited to 'net')

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 77ebf5bcf0b9..7d4c2235252c 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -178,7 +178,10 @@ struct xfrm_state {
 		struct hlist_node	gclist;
 		struct hlist_node	bydst;
 	};
-	struct hlist_node	bysrc;
+	union {
+		struct hlist_node	dev_gclist;
+		struct hlist_node	bysrc;
+	};
 	struct hlist_node	byspi;
 	struct hlist_node	byseq;
 
@@ -1588,7 +1591,7 @@ void xfrm_state_update_stats(struct net *net);
 static inline void xfrm_dev_state_update_stats(struct xfrm_state *x)
 {
 	struct xfrm_dev_offload *xdo = &x->xso;
-	struct net_device *dev = xdo->dev;
+	struct net_device *dev = READ_ONCE(xdo->dev);
 
 	if (dev && dev->xfrmdev_ops &&
 	    dev->xfrmdev_ops->xdo_dev_state_update_stats)
@@ -1946,13 +1949,16 @@ int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
 			struct xfrm_user_offload *xuo, u8 dir,
 			struct netlink_ext_ack *extack);
 bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x);
+void xfrm_dev_state_delete(struct xfrm_state *x);
+void xfrm_dev_state_free(struct xfrm_state *x);
 
 static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x)
 {
 	struct xfrm_dev_offload *xso = &x->xso;
+	struct net_device *dev = READ_ONCE(xso->dev);
 
-	if (xso->dev && xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn)
-		xso->dev->xfrmdev_ops->xdo_dev_state_advance_esn(x);
+	if (dev && dev->xfrmdev_ops->xdo_dev_state_advance_esn)
+		dev->xfrmdev_ops->xdo_dev_state_advance_esn(x);
 }
 
 static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
@@ -1973,28 +1979,6 @@ static inline bool xfrm_dst_offload_ok(struct dst_entry *dst)
 	return false;
 }
 
-static inline void xfrm_dev_state_delete(struct xfrm_state *x)
-{
-	struct xfrm_dev_offload *xso = &x->xso;
-
-	if (xso->dev)
-		xso->dev->xfrmdev_ops->xdo_dev_state_delete(x);
-}
-
-static inline void xfrm_dev_state_free(struct xfrm_state *x)
-{
-	struct xfrm_dev_offload *xso = &x->xso;
-	struct net_device *dev = xso->dev;
-
-	if (dev && dev->xfrmdev_ops) {
-		if (dev->xfrmdev_ops->xdo_dev_state_free)
-			dev->xfrmdev_ops->xdo_dev_state_free(x);
-		xso->dev = NULL;
-		xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
-		netdev_put(dev, &xso->dev_tracker);
-	}
-}
-
 static inline void xfrm_dev_policy_delete(struct xfrm_policy *x)
 {
 	struct xfrm_dev_offload *xdo = &x->xdo;
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 649bb739df0d..d531d2a1fae2 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -49,6 +49,7 @@ static struct kmem_cache *xfrm_state_cache __ro_after_init;
 
 static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task);
 static HLIST_HEAD(xfrm_state_gc_list);
+static HLIST_HEAD(xfrm_state_dev_gc_list);
 
 static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x)
 {
@@ -214,6 +215,7 @@ static DEFINE_SPINLOCK(xfrm_state_afinfo_lock);
 static struct xfrm_state_afinfo __rcu *xfrm_state_afinfo[NPROTO];
 
 static DEFINE_SPINLOCK(xfrm_state_gc_lock);
+static DEFINE_SPINLOCK(xfrm_state_dev_gc_lock);
 
 int __xfrm_state_delete(struct xfrm_state *x);
 
@@ -683,6 +685,40 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
 }
 EXPORT_SYMBOL(xfrm_state_alloc);
 
+#ifdef CONFIG_XFRM_OFFLOAD
+void xfrm_dev_state_delete(struct xfrm_state *x)
+{
+	struct xfrm_dev_offload *xso = &x->xso;
+	struct net_device *dev = READ_ONCE(xso->dev);
+
+	if (dev) {
+		dev->xfrmdev_ops->xdo_dev_state_delete(x);
+		spin_lock_bh(&xfrm_state_dev_gc_lock);
+		hlist_add_head(&x->dev_gclist, &xfrm_state_dev_gc_list);
+		spin_unlock_bh(&xfrm_state_dev_gc_lock);
+	}
+}
+
+void xfrm_dev_state_free(struct xfrm_state *x)
+{
+	struct xfrm_dev_offload *xso = &x->xso;
+	struct net_device *dev = READ_ONCE(xso->dev);
+
+	if (dev && dev->xfrmdev_ops) {
+		spin_lock_bh(&xfrm_state_dev_gc_lock);
+		if (!hlist_unhashed(&x->dev_gclist))
+			hlist_del(&x->dev_gclist);
+		spin_unlock_bh(&xfrm_state_dev_gc_lock);
+
+		if (dev->xfrmdev_ops->xdo_dev_state_free)
+			dev->xfrmdev_ops->xdo_dev_state_free(x);
+		WRITE_ONCE(xso->dev, NULL);
+		xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+		netdev_put(dev, &xso->dev_tracker);
+	}
+}
+#endif
+
 void __xfrm_state_destroy(struct xfrm_state *x, bool sync)
 {
 	WARN_ON(x->km.state != XFRM_STATE_DEAD);
@@ -848,6 +884,9 @@ EXPORT_SYMBOL(xfrm_state_flush);
 
 int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid)
 {
+	struct xfrm_state *x;
+	struct hlist_node *tmp;
+	struct xfrm_dev_offload *xso;
 	int i, err = 0, cnt = 0;
 
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
@@ -857,8 +896,6 @@ int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_vali
 
 	err = -ESRCH;
 	for (i = 0; i <= net->xfrm.state_hmask; i++) {
-		struct xfrm_state *x;
-		struct xfrm_dev_offload *xso;
 restart:
 		hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) {
 			xso = &x->xso;
@@ -868,6 +905,8 @@ restart:
 				spin_unlock_bh(&net->xfrm.xfrm_state_lock);
 
 				err = xfrm_state_delete(x);
+				xfrm_dev_state_free(x);
+
 				xfrm_audit_state_delete(x, err ? 0 : 1,
 							task_valid);
 				xfrm_state_put(x);
@@ -884,6 +923,24 @@ restart:
 
 out:
 	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+	spin_lock_bh(&xfrm_state_dev_gc_lock);
+restart_gc:
+	hlist_for_each_entry_safe(x, tmp, &xfrm_state_dev_gc_list, dev_gclist) {
+		xso = &x->xso;
+
+		if (xso->dev == dev) {
+			spin_unlock_bh(&xfrm_state_dev_gc_lock);
+			xfrm_dev_state_free(x);
+			spin_lock_bh(&xfrm_state_dev_gc_lock);
+			goto restart_gc;
+		}
+
+	}
+	spin_unlock_bh(&xfrm_state_dev_gc_lock);
+
+	xfrm_flush_gc();
+
 	return err;
 }
 EXPORT_SYMBOL(xfrm_dev_state_flush);
-- 
cgit v1.2.3


From 2d5317753e5f02a66e6d0afb9b25105d0beab1be Mon Sep 17 00:00:00 2001
From: Steffen Klassert <steffen.klassert@secunet.com>
Date: Fri, 28 Jun 2024 10:46:25 +0200
Subject: xfrm: Export symbol xfrm_dev_state_delete.

This fixes a build failure if xfrm_user is build as a module.

Fixes: 07b87f9eea0c ("xfrm: Fix unregister netdevice hang on hardware offload.")
Reported-by: Mark Brown <broonie@kernel.org>
Tested-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index d531d2a1fae2..936f9348e5f6 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -698,6 +698,7 @@ void xfrm_dev_state_delete(struct xfrm_state *x)
 		spin_unlock_bh(&xfrm_state_dev_gc_lock);
 	}
 }
+EXPORT_SYMBOL_GPL(xfrm_dev_state_delete);
 
 void xfrm_dev_state_free(struct xfrm_state *x)
 {
-- 
cgit v1.2.3


From 9199b915e9fad7f5eff6160d24ff6b38e970107d Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Mon, 8 Jul 2024 09:58:11 +0300
Subject: xfrm: fix netdev reference count imbalance

In cited commit, netdev_tracker_alloc() is called for the newly
allocated xfrm state, but dev_hold() is missed, which causes netdev
reference count imbalance, because netdev_put() is called when the
state is freed in xfrm_dev_state_free(). Fix the issue by replacing
netdev_tracker_alloc() with netdev_hold().

Fixes: f8a70afafc17 ("xfrm: add TX datapath support for IPsec packet offload mode")
Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_state.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 936f9348e5f6..67b2a399a48a 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -1331,8 +1331,7 @@ found:
 			xso->dev = xdo->dev;
 			xso->real_dev = xdo->real_dev;
 			xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ;
-			netdev_tracker_alloc(xso->dev, &xso->dev_tracker,
-					     GFP_ATOMIC);
+			netdev_hold(xso->dev, &xso->dev_tracker, GFP_ATOMIC);
 			error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL);
 			if (error) {
 				xso->dir = 0;
-- 
cgit v1.2.3


From 89a2aefe4b084686c2ffc1ee939585111ea4fc0f Mon Sep 17 00:00:00 2001
From: Jianbo Liu <jianbol@nvidia.com>
Date: Mon, 8 Jul 2024 09:58:12 +0300
Subject: xfrm: call xfrm_dev_policy_delete when kill policy

xfrm_policy_kill() is called at different places to delete xfrm
policy. It will call xfrm_pol_put(). But xfrm_dev_policy_delete() is
not called to free the policy offloaded to hardware.

The three commits cited here are to handle this issue by calling
xfrm_dev_policy_delete() outside xfrm_get_policy(). But they didn't
cover all the cases. An example, which is not handled for now, is
xfrm_policy_insert(). It is called when XFRM_MSG_UPDPOLICY request is
received. Old policy is replaced by new one, but the offloaded policy
is not deleted, so driver doesn't have the chance to release hardware
resources.

To resolve this issue for all cases, move xfrm_dev_policy_delete()
into xfrm_policy_kill(), so the offloaded policy can be deleted from
hardware when it is called, which avoids hardware resources leakage.

Fixes: 919e43fad516 ("xfrm: add an interface to offload policy")
Fixes: bf06fcf4be0f ("xfrm: add missed call to delete offloaded policies")
Fixes: 982c3aca8bac ("xfrm: delete offloaded policy")
Signed-off-by: Jianbo Liu <jianbol@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_policy.c | 5 ++---
 net/xfrm/xfrm_user.c   | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net')

diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 475b904fe68b..10f68d572885 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -452,6 +452,8 @@ EXPORT_SYMBOL(xfrm_policy_destroy);
 
 static void xfrm_policy_kill(struct xfrm_policy *policy)
 {
+	xfrm_dev_policy_delete(policy);
+
 	write_lock_bh(&policy->lock);
 	policy->walk.dead = 1;
 	write_unlock_bh(&policy->lock);
@@ -1850,7 +1852,6 @@ again:
 
 		__xfrm_policy_unlink(pol, dir);
 		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
-		xfrm_dev_policy_delete(pol);
 		cnt++;
 		xfrm_audit_policy_delete(pol, 1, task_valid);
 		xfrm_policy_kill(pol);
@@ -1891,7 +1892,6 @@ again:
 
 		__xfrm_policy_unlink(pol, dir);
 		spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
-		xfrm_dev_policy_delete(pol);
 		cnt++;
 		xfrm_audit_policy_delete(pol, 1, task_valid);
 		xfrm_policy_kill(pol);
@@ -2342,7 +2342,6 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir)
 	pol = __xfrm_policy_unlink(pol, dir);
 	spin_unlock_bh(&net->xfrm.xfrm_policy_lock);
 	if (pol) {
-		xfrm_dev_policy_delete(pol);
 		xfrm_policy_kill(pol);
 		return 0;
 	}
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index e83c687bd64e..77355422ce82 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2455,7 +2455,6 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
 					    NETLINK_CB(skb).portid);
 		}
 	} else {
-		xfrm_dev_policy_delete(xp);
 		xfrm_audit_policy_delete(xp, err ? 0 : 1, true);
 
 		if (err != 0)
-- 
cgit v1.2.3


From 93c3a96c301f0b1ac0bafb5e74bef58e79937648 Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Thu, 11 Jul 2024 15:55:18 +0200
Subject: net: pse-pd: Do not return EOPNOSUPP if config is null

For a PSE supporting both c33 and PoDL, setting config for one type of PoE
leaves the other type's config null. Currently, this case returns
EOPNOTSUPP, which is incorrect. Instead, we should do nothing if the
configuration is empty.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Fixes: d83e13761d5b ("net: pse-pd: Use regulator framework within PSE framework")
Link: https://patch.msgid.link/20240711-fix_pse_pd_deref-v3-1-edd78fc4fe42@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 drivers/net/pse-pd/pse_core.c | 4 ++--
 net/ethtool/pse-pd.c          | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'net')

diff --git a/drivers/net/pse-pd/pse_core.c b/drivers/net/pse-pd/pse_core.c
index 795ab264eaf2..513cd7f85933 100644
--- a/drivers/net/pse-pd/pse_core.c
+++ b/drivers/net/pse-pd/pse_core.c
@@ -719,13 +719,13 @@ int pse_ethtool_set_config(struct pse_control *psec,
 {
 	int err = 0;
 
-	if (pse_has_c33(psec)) {
+	if (pse_has_c33(psec) && config->c33_admin_control) {
 		err = pse_ethtool_c33_set_config(psec, config);
 		if (err)
 			return err;
 	}
 
-	if (pse_has_podl(psec))
+	if (pse_has_podl(psec) && config->podl_admin_control)
 		err = pse_ethtool_podl_set_config(psec, config);
 
 	return err;
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 2c981d443f27..982995ff1628 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -183,7 +183,9 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info)
 	if (pse_has_c33(phydev->psec))
 		config.c33_admin_control = nla_get_u32(tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]);
 
-	/* Return errno directly - PSE has no notification */
+	/* Return errno directly - PSE has no notification
+	 * pse_ethtool_set_config() will do nothing if the config is null
+	 */
 	return pse_ethtool_set_config(phydev->psec, info->extack, &config);
 }
 
-- 
cgit v1.2.3


From 4cddb0f15ea9c62f81b4889ea69a99368cc63a86 Mon Sep 17 00:00:00 2001
From: Kory Maincent <kory.maincent@bootlin.com>
Date: Thu, 11 Jul 2024 15:55:19 +0200
Subject: net: ethtool: pse-pd: Fix possible null-deref

Fix a possible null dereference when a PSE supports both c33 and PoDL, but
only one of the netlink attributes is specified. The c33 or PoDL PSE
capabilities are already validated in the ethnl_set_pse_validate() call.

Signed-off-by: Kory Maincent <kory.maincent@bootlin.com>
Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20240705184116.13d8235a@kernel.org/
Fixes: 4d18e3ddf427 ("net: ethtool: pse-pd: Expand pse commands with the PSE PoE interface")
Link: https://patch.msgid.link/20240711-fix_pse_pd_deref-v3-2-edd78fc4fe42@bootlin.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ethtool/pse-pd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 982995ff1628..776ac96cdadc 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -178,9 +178,9 @@ ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info)
 
 	phydev = dev->phydev;
 	/* These values are already validated by the ethnl_pse_set_policy */
-	if (pse_has_podl(phydev->psec))
+	if (tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL])
 		config.podl_admin_control = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]);
-	if (pse_has_c33(phydev->psec))
+	if (tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL])
 		config.c33_admin_control = nla_get_u32(tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL]);
 
 	/* Return errno directly - PSE has no notification
-- 
cgit v1.2.3


From 6807352353561187a718e87204458999dbcbba1b Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 10 Jul 2024 10:14:27 +0200
Subject: ipv4: fix source address selection with route leak

By default, an address assigned to the output interface is selected when
the source address is not specified. This is problematic when a route,
configured in a vrf, uses an interface from another vrf (aka route leak).
The original vrf does not own the selected source address.

Let's add a check against the output interface and call the appropriate
function to select the source address.

CC: stable@vger.kernel.org
Fixes: 8cbb512c923d ("net: Add source address lookup op for VRF")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20240710081521.3809742-2-nicolas.dichtel@6wind.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv4/fib_semantics.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f669da98d11d..8956026bc0a2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -2270,6 +2270,15 @@ void fib_select_path(struct net *net, struct fib_result *res,
 		fib_select_default(fl4, res);
 
 check_saddr:
-	if (!fl4->saddr)
-		fl4->saddr = fib_result_prefsrc(net, res);
+	if (!fl4->saddr) {
+		struct net_device *l3mdev;
+
+		l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);
+
+		if (!l3mdev ||
+		    l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
+			fl4->saddr = fib_result_prefsrc(net, res);
+		else
+			fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
+	}
 }
-- 
cgit v1.2.3


From 252442f2ae317d109ef0b4b39ce0608c09563042 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 10 Jul 2024 10:14:28 +0200
Subject: ipv6: fix source address selection with route leak

By default, an address assigned to the output interface is selected when
the source address is not specified. This is problematic when a route,
configured in a vrf, uses an interface from another vrf (aka route leak).
The original vrf does not own the selected source address.

Let's add a check against the output interface and call the appropriate
function to select the source address.

CC: stable@vger.kernel.org
Fixes: 0d240e7811c4 ("net: vrf: Implement get_saddr for IPv6")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Link: https://patch.msgid.link/20240710081521.3809742-3-nicolas.dichtel@6wind.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 include/net/ip6_route.h | 20 ++++++++++++++------
 net/ipv6/ip6_output.c   |  1 +
 net/ipv6/route.c        |  2 +-
 3 files changed, 16 insertions(+), 7 deletions(-)

(limited to 'net')

diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index a18ed24fed94..6dbdf60b342f 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -127,18 +127,26 @@ void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
 
 static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i,
 				      const struct in6_addr *daddr,
-				      unsigned int prefs,
+				      unsigned int prefs, int l3mdev_index,
 				      struct in6_addr *saddr)
 {
+	struct net_device *l3mdev;
+	struct net_device *dev;
+	bool same_vrf;
 	int err = 0;
 
-	if (f6i && f6i->fib6_prefsrc.plen) {
+	rcu_read_lock();
+
+	l3mdev = dev_get_by_index_rcu(net, l3mdev_index);
+	if (!f6i || !f6i->fib6_prefsrc.plen || l3mdev)
+		dev = f6i ? fib6_info_nh_dev(f6i) : NULL;
+	same_vrf = !l3mdev || l3mdev_master_dev_rcu(dev) == l3mdev;
+	if (f6i && f6i->fib6_prefsrc.plen && same_vrf)
 		*saddr = f6i->fib6_prefsrc.addr;
-	} else {
-		struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL;
+	else
+		err = ipv6_dev_get_saddr(net, same_vrf ? dev : l3mdev, daddr, prefs, saddr);
 
-		err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr);
-	}
+	rcu_read_unlock();
 
 	return err;
 }
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 27d8725445e3..784424ac4147 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1124,6 +1124,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 		from = rt ? rcu_dereference(rt->from) : NULL;
 		err = ip6_route_get_saddr(net, from, &fl6->daddr,
 					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
+					  fl6->flowi6_l3mdev,
 					  &fl6->saddr);
 		rcu_read_unlock();
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 8d72ca0b086d..c9a9506b714d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5689,7 +5689,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 				goto nla_put_failure;
 	} else if (dest) {
 		struct in6_addr saddr_buf;
-		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
+		if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 &&
 		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
 			goto nla_put_failure;
 	}
-- 
cgit v1.2.3


From abb9a68d2c64dd9b128ae1f2e635e4d805e7ce64 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 10 Jul 2024 10:14:29 +0200
Subject: ipv6: take care of scope when choosing the src addr

When the source address is selected, the scope must be checked. For
example, if a loopback address is assigned to the vrf device, it must not
be chosen for packets sent outside.

CC: stable@vger.kernel.org
Fixes: afbac6010aec ("net: ipv6: Address selection needs to consider L3 domains")
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Reviewed-by: David Ahern <dsahern@kernel.org>
Link: https://patch.msgid.link/20240710081521.3809742-4-nicolas.dichtel@6wind.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/ipv6/addrconf.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net')

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5c424a0e7232..4f2c5cc31015 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1873,7 +1873,8 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
 							    master, &dst,
 							    scores, hiscore_idx);
 
-			if (scores[hiscore_idx].ifa)
+			if (scores[hiscore_idx].ifa &&
+			    scores[hiscore_idx].scopedist >= 0)
 				goto out;
 		}
 
-- 
cgit v1.2.3


From 79eecf631c14e7f4057186570ac20e2cfac3802e Mon Sep 17 00:00:00 2001
From: Chengen Du <chengen.du@canonical.com>
Date: Sat, 13 Jul 2024 19:47:35 +0800
Subject: af_packet: Handle outgoing VLAN packets without hardware offloading

The issue initially stems from libpcap. The ethertype will be overwritten
as the VLAN TPID if the network interface lacks hardware VLAN offloading.
In the outbound packet path, if hardware VLAN offloading is unavailable,
the VLAN tag is inserted into the payload but then cleared from the sk_buff
struct. Consequently, this can lead to a false negative when checking for
the presence of a VLAN tag, causing the packet sniffing outcome to lack
VLAN tag information (i.e., TCI-TPID). As a result, the packet capturing
tool may be unable to parse packets as expected.

The TCI-TPID is missing because the prb_fill_vlan_info() function does not
modify the tp_vlan_tci/tp_vlan_tpid values, as the information is in the
payload and not in the sk_buff struct. The skb_vlan_tag_present() function
only checks vlan_all in the sk_buff struct. In cooked mode, the L2 header
is stripped, preventing the packet capturing tool from determining the
correct TCI-TPID value. Additionally, the protocol in SLL is incorrect,
which means the packet capturing tool cannot parse the L3 header correctly.

Link: https://github.com/the-tcpdump-group/libpcap/issues/1105
Link: https://lore.kernel.org/netdev/20240520070348.26725-1-chengen.du@canonical.com/T/#u
Fixes: 393e52e33c6c ("packet: deliver VLAN TCI to userspace")
Cc: stable@vger.kernel.org
Signed-off-by: Chengen Du <chengen.du@canonical.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Link: https://patch.msgid.link/20240713114735.62360-1-chengen.du@canonical.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/packet/af_packet.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ea3ebc160e25..4692a9ef110b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -538,6 +538,61 @@ static void *packet_current_frame(struct packet_sock *po,
 	return packet_lookup_frame(po, rb, rb->head, status);
 }
 
+static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev)
+{
+	u8 *skb_orig_data = skb->data;
+	int skb_orig_len = skb->len;
+	struct vlan_hdr vhdr, *vh;
+	unsigned int header_len;
+
+	if (!dev)
+		return 0;
+
+	/* In the SOCK_DGRAM scenario, skb data starts at the network
+	 * protocol, which is after the VLAN headers. The outer VLAN
+	 * header is at the hard_header_len offset in non-variable
+	 * length link layer headers. If it's a VLAN device, the
+	 * min_header_len should be used to exclude the VLAN header
+	 * size.
+	 */
+	if (dev->min_header_len == dev->hard_header_len)
+		header_len = dev->hard_header_len;
+	else if (is_vlan_dev(dev))
+		header_len = dev->min_header_len;
+	else
+		return 0;
+
+	skb_push(skb, skb->data - skb_mac_header(skb));
+	vh = skb_header_pointer(skb, header_len, sizeof(vhdr), &vhdr);
+	if (skb_orig_data != skb->data) {
+		skb->data = skb_orig_data;
+		skb->len = skb_orig_len;
+	}
+	if (unlikely(!vh))
+		return 0;
+
+	return ntohs(vh->h_vlan_TCI);
+}
+
+static __be16 vlan_get_protocol_dgram(struct sk_buff *skb)
+{
+	__be16 proto = skb->protocol;
+
+	if (unlikely(eth_type_vlan(proto))) {
+		u8 *skb_orig_data = skb->data;
+		int skb_orig_len = skb->len;
+
+		skb_push(skb, skb->data - skb_mac_header(skb));
+		proto = __vlan_get_protocol(skb, proto, NULL);
+		if (skb_orig_data != skb->data) {
+			skb->data = skb_orig_data;
+			skb->len = skb_orig_len;
+		}
+	}
+
+	return proto;
+}
+
 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 {
 	del_timer_sync(&pkc->retire_blk_timer);
@@ -1007,10 +1062,16 @@ static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
 static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
 			struct tpacket3_hdr *ppd)
 {
+	struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc);
+
 	if (skb_vlan_tag_present(pkc->skb)) {
 		ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
 		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
 		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+	} else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) {
+		ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev);
+		ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol);
+		ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 	} else {
 		ppd->hv1.tp_vlan_tci = 0;
 		ppd->hv1.tp_vlan_tpid = 0;
@@ -2428,6 +2489,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
 			h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
 			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+		} else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
+			h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev);
+			h.h2->tp_vlan_tpid = ntohs(skb->protocol);
+			status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
 		} else {
 			h.h2->tp_vlan_tci = 0;
 			h.h2->tp_vlan_tpid = 0;
@@ -2457,7 +2522,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
 	sll->sll_family = AF_PACKET;
 	sll->sll_hatype = dev->type;
-	sll->sll_protocol = skb->protocol;
+	sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ?
+		vlan_get_protocol_dgram(skb) : skb->protocol;
 	sll->sll_pkttype = skb->pkt_type;
 	if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
 		sll->sll_ifindex = orig_dev->ifindex;
@@ -3482,7 +3548,8 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		/* Original length was stored in sockaddr_ll fields */
 		origlen = PACKET_SKB_CB(skb)->sa.origlen;
 		sll->sll_family = AF_PACKET;
-		sll->sll_protocol = skb->protocol;
+		sll->sll_protocol = (sock->type == SOCK_DGRAM) ?
+			vlan_get_protocol_dgram(skb) : skb->protocol;
 	}
 
 	sock_recv_cmsgs(msg, sk, skb);
@@ -3539,6 +3606,21 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 			aux.tp_vlan_tci = skb_vlan_tag_get(skb);
 			aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
 			aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+		} else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
+			struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
+			struct net_device *dev;
+
+			rcu_read_lock();
+			dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex);
+			if (dev) {
+				aux.tp_vlan_tci = vlan_get_tci(skb, dev);
+				aux.tp_vlan_tpid = ntohs(skb->protocol);
+				aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
+			} else {
+				aux.tp_vlan_tci = 0;
+				aux.tp_vlan_tpid = 0;
+			}
+			rcu_read_unlock();
 		} else {
 			aux.tp_vlan_tci = 0;
 			aux.tp_vlan_tpid = 0;
-- 
cgit v1.2.3


From 59a931c5b732ca5fc2ca727f5a72aeabaafa85ec Mon Sep 17 00:00:00 2001
From: Taehee Yoo <ap420073@gmail.com>
Date: Fri, 12 Jul 2024 09:51:16 +0000
Subject: xdp: fix invalid wait context of page_pool_destroy()

If the driver uses a page pool, it creates a page pool with
page_pool_create().
The reference count of page pool is 1 as default.
A page pool will be destroyed only when a reference count reaches 0.
page_pool_destroy() is used to destroy page pool, it decreases a
reference count.
When a page pool is destroyed, ->disconnect() is called, which is
mem_allocator_disconnect().
This function internally acquires mutex_lock().

If the driver uses XDP, it registers a memory model with
xdp_rxq_info_reg_mem_model().
The xdp_rxq_info_reg_mem_model() internally increases a page pool
reference count if a memory model is a page pool.
Now the reference count is 2.

To destroy a page pool, the driver should call both page_pool_destroy()
and xdp_unreg_mem_model().
The xdp_unreg_mem_model() internally calls page_pool_destroy().
Only page_pool_destroy() decreases a reference count.

If a driver calls page_pool_destroy() then xdp_unreg_mem_model(), we
will face an invalid wait context warning.
Because xdp_unreg_mem_model() calls page_pool_destroy() with
rcu_read_lock().
The page_pool_destroy() internally acquires mutex_lock().

Splat looks like:
=============================
[ BUG: Invalid wait context ]
6.10.0-rc6+ #4 Tainted: G W
-----------------------------
ethtool/1806 is trying to lock:
ffffffff90387b90 (mem_id_lock){+.+.}-{4:4}, at: mem_allocator_disconnect+0x73/0x150
other info that might help us debug this:
context-{5:5}
3 locks held by ethtool/1806:
stack backtrace:
CPU: 0 PID: 1806 Comm: ethtool Tainted: G W 6.10.0-rc6+ #4 f916f41f172891c800f2fed
Hardware name: ASUS System Product Name/PRIME Z690-P D4, BIOS 0603 11/01/2021
Call Trace:
<TASK>
dump_stack_lvl+0x7e/0xc0
__lock_acquire+0x1681/0x4de0
? _printk+0x64/0xe0
? __pfx_mark_lock.part.0+0x10/0x10
? __pfx___lock_acquire+0x10/0x10
lock_acquire+0x1b3/0x580
? mem_allocator_disconnect+0x73/0x150
? __wake_up_klogd.part.0+0x16/0xc0
? __pfx_lock_acquire+0x10/0x10
? dump_stack_lvl+0x91/0xc0
__mutex_lock+0x15c/0x1690
? mem_allocator_disconnect+0x73/0x150
? __pfx_prb_read_valid+0x10/0x10
? mem_allocator_disconnect+0x73/0x150
? __pfx_llist_add_batch+0x10/0x10
? console_unlock+0x193/0x1b0
? lockdep_hardirqs_on+0xbe/0x140
? __pfx___mutex_lock+0x10/0x10
? tick_nohz_tick_stopped+0x16/0x90
? __irq_work_queue_local+0x1e5/0x330
? irq_work_queue+0x39/0x50
? __wake_up_klogd.part.0+0x79/0xc0
? mem_allocator_disconnect+0x73/0x150
mem_allocator_disconnect+0x73/0x150
? __pfx_mem_allocator_disconnect+0x10/0x10
? mark_held_locks+0xa5/0xf0
? rcu_is_watching+0x11/0xb0
page_pool_release+0x36e/0x6d0
page_pool_destroy+0xd7/0x440
xdp_unreg_mem_model+0x1a7/0x2a0
? __pfx_xdp_unreg_mem_model+0x10/0x10
? kfree+0x125/0x370
? bnxt_free_ring.isra.0+0x2eb/0x500
? bnxt_free_mem+0x5ac/0x2500
xdp_rxq_info_unreg+0x4a/0xd0
bnxt_free_mem+0x1356/0x2500
bnxt_close_nic+0xf0/0x3b0
? __pfx_bnxt_close_nic+0x10/0x10
? ethnl_parse_bit+0x2c6/0x6d0
? __pfx___nla_validate_parse+0x10/0x10
? __pfx_ethnl_parse_bit+0x10/0x10
bnxt_set_features+0x2a8/0x3e0
__netdev_update_features+0x4dc/0x1370
? ethnl_parse_bitset+0x4ff/0x750
? __pfx_ethnl_parse_bitset+0x10/0x10
? __pfx___netdev_update_features+0x10/0x10
? mark_held_locks+0xa5/0xf0
? _raw_spin_unlock_irqrestore+0x42/0x70
? __pm_runtime_resume+0x7d/0x110
ethnl_set_features+0x32d/0xa20

To fix this problem, it uses rhashtable_lookup_fast() instead of
rhashtable_lookup() with rcu_read_lock().
Using xa without rcu_read_lock() here is safe.
xa is freed by __xdp_mem_allocator_rcu_free() and this is called by
call_rcu() of mem_xa_remove().
The mem_xa_remove() is called by page_pool_destroy() if a reference
count reaches 0.
The xa is already protected by the reference count mechanism well in the
control plane.
So removing rcu_read_lock() for page_pool_destroy() is safe.

Fixes: c3f812cea0d7 ("page_pool: do not release pool until inflight == 0.")
Signed-off-by: Taehee Yoo <ap420073@gmail.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240712095116.3801586-1-ap420073@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
 net/core/xdp.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'net')

diff --git a/net/core/xdp.c b/net/core/xdp.c
index 022c12059cf2..bcc5551c6424 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -127,10 +127,8 @@ void xdp_unreg_mem_model(struct xdp_mem_info *mem)
 		return;
 
 	if (type == MEM_TYPE_PAGE_POOL) {
-		rcu_read_lock();
-		xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+		xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
 		page_pool_destroy(xa->page_pool);
-		rcu_read_unlock();
 	}
 }
 EXPORT_SYMBOL_GPL(xdp_unreg_mem_model);
-- 
cgit v1.2.3


From 0a1868b93fad5938dbcca77286b25bf211c49f7a Mon Sep 17 00:00:00 2001
From: Elliot Ayrey <elliot.ayrey@alliedtelesis.co.nz>
Date: Fri, 12 Jul 2024 13:31:33 +1200
Subject: net: bridge: mst: Check vlan state for egress decision

If a port is blocking in the common instance but forwarding in an MST
instance, traffic egressing the bridge will be dropped because the
state of the common instance is overriding that of the MST instance.

Fix this by skipping the port state check in MST mode to allow
checking the vlan state via br_allowed_egress(). This is similar to
what happens in br_handle_frame_finish() when checking ingress
traffic, which was introduced in the change below.

Fixes: ec7328b59176 ("net: bridge: mst: Multiple Spanning Tree (MST) mode")
Signed-off-by: Elliot Ayrey <elliot.ayrey@alliedtelesis.co.nz>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_forward.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net')

diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index d97064d460dc..e19b583ff2c6 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -25,8 +25,8 @@ static inline int should_deliver(const struct net_bridge_port *p,
 
 	vg = nbp_vlan_group_rcu(p);
 	return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
-		p->state == BR_STATE_FORWARDING && br_allowed_egress(vg, skb) &&
-		nbp_switchdev_allowed_egress(p, skb) &&
+		(br_mst_is_enabled(p->br) || p->state == BR_STATE_FORWARDING) &&
+		br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) &&
 		!br_skb_isolated(p, skb);
 }
 
-- 
cgit v1.2.3