From 943d045a6d796175e5d08f9973953b1d2c07d797 Mon Sep 17 00:00:00 2001 From: Siddharth Kawar Date: Mon, 20 Mar 2023 20:37:40 +0000 Subject: SUNRPC: fix shutdown of NFS TCP client socket NFS server Duplicate Request Cache (DRC) algorithms rely on NFS clients reconnecting using the same local TCP port. Unique NFS operations are identified by the per-TCP connection set of XIDs. This prevents file corruption when non-idempotent NFS operations are retried. Currently, NFS client TCP connections are using different local TCP ports when reconnecting to NFS servers. After an NFS server initiates shutdown of the TCP connection, the NFS client's TCP socket is set to NULL after the socket state has reached TCP_LAST_ACK(9). When reconnecting, the new socket attempts to reuse the same local port but fails with EADDRNOTAVAIL (99). This forces the socket to use a different local TCP port to reconnect to the remote NFS server. State Transition and Events: TCP_CLOSE_WAIT(8) TCP_LAST_ACK(9) connect(fail EADDRNOTAVAIL(99)) TCP_CLOSE(7) bind on new port connect success dmesg excerpts showing reconnect switching from TCP local port of 926 to 763 after commit 7c81e6a9d75b: [13354.947854] NFS call mkdir testW ... [13405.654781] RPC: xs_tcp_state_change client 00000000037d0f03... [13405.654813] RPC: state 8 conn 1 dead 0 zapped 1 sk_shutdown 1 [13405.654826] RPC: xs_data_ready... [13405.654892] RPC: xs_tcp_state_change client 00000000037d0f03... [13405.654895] RPC: state 9 conn 0 dead 0 zapped 1 sk_shutdown 3 [13405.654899] RPC: xs_tcp_state_change client 00000000037d0f03... [13405.654900] RPC: state 9 conn 0 dead 0 zapped 1 sk_shutdown 3 [13405.654950] RPC: xs_connect scheduled xprt 00000000037d0f03 [13405.654975] RPC: xs_bind 0.0.0.0:926: ok (0) [13405.654980] RPC: worker connecting xprt 00000000037d0f03 via tcp to 10.101.6.228 (port 2049) [13405.654991] RPC: 00000000037d0f03 connect status 99 connected 0 sock state 7 [13405.655001] RPC: xs_tcp_state_change client 00000000037d0f03... [13405.655002] RPC: state 7 conn 0 dead 0 zapped 1 sk_shutdown 3 [13405.655024] RPC: xs_connect scheduled xprt 00000000037d0f03 [13405.655038] RPC: xs_bind 0.0.0.0:763: ok (0) [13405.655041] RPC: worker connecting xprt 00000000037d0f03 via tcp to 10.101.6.228 (port 2049) [13405.655065] RPC: 00000000037d0f03 connect status 115 connected 0 sock state 2 State Transition and Events with patch applied: TCP_CLOSE_WAIT(8) TCP_LAST_ACK(9) TCP_CLOSE(7) connect(reuse of port succeeds) dmesg excerpts showing reconnect on same TCP local port of 936 with patch applied: [ 257.139935] NFS: mkdir(0:59/560857152), testQ [ 257.139937] NFS call mkdir testQ ... [ 307.822702] RPC: state 8 conn 1 dead 0 zapped 1 sk_shutdown 1 [ 307.822714] RPC: xs_data_ready... [ 307.822817] RPC: xs_tcp_state_change client 00000000ce702f14... [ 307.822821] RPC: state 9 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 307.822825] RPC: xs_tcp_state_change client 00000000ce702f14... [ 307.822826] RPC: state 9 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 307.823606] RPC: xs_tcp_state_change client 00000000ce702f14... [ 307.823609] RPC: state 7 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 307.823629] RPC: xs_tcp_state_change client 00000000ce702f14... [ 307.823632] RPC: state 7 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 307.823676] RPC: xs_connect scheduled xprt 00000000ce702f14 [ 307.823704] RPC: xs_bind 0.0.0.0:936: ok (0) [ 307.823709] RPC: worker connecting xprt 00000000ce702f14 via tcp to 10.101.1.30 (port 2049) [ 307.823748] RPC: 00000000ce702f14 connect status 115 connected 0 sock state 2 ... [ 314.916193] RPC: state 7 conn 0 dead 0 zapped 1 sk_shutdown 3 [ 314.916251] RPC: xs_connect scheduled xprt 00000000ce702f14 [ 314.916282] RPC: xs_bind 0.0.0.0:936: ok (0) [ 314.916292] RPC: worker connecting xprt 00000000ce702f14 via tcp to 10.101.1.30 (port 2049) [ 314.916342] RPC: 00000000ce702f14 connect status 115 connected 0 sock state 2 Fixes: 7c81e6a9d75b ("SUNRPC: Tweak TCP socket shutdown in the RPC client") Signed-off-by: Siddharth Rajendra Kawar Signed-off-by: Anna Schumaker --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index adcbedc244d6..6cacd70a15ff 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2158,6 +2158,7 @@ static void xs_tcp_shutdown(struct rpc_xprt *xprt) switch (skst) { case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: + case TCP_LAST_ACK: break; case TCP_ESTABLISHED: case TCP_CLOSE_WAIT: -- cgit v1.2.3 From f9d2b1e146e0f82f3d04629afd92698522058361 Mon Sep 17 00:00:00 2001 From: Bobby Eshleman Date: Wed, 29 Mar 2023 16:51:58 +0000 Subject: virtio/vsock: fix leaks due to missing skb owner This patch sets the skb owner in the recv and send path for virtio. For the send path, this solves the leak caused when virtio_transport_purge_skbs() finds skb->sk is always NULL and therefore never matches it with the current socket. Setting the owner upon allocation fixes this. For the recv path, this ensures correctness of accounting and also correct transfer of ownership in vsock_loopback (when skbs are sent from one socket and received by another). Fixes: 71dc9ec9ac7d ("virtio/vsock: replace virtio_vsock_pkt with sk_buff") Signed-off-by: Bobby Eshleman Reported-by: Cong Wang Link: https://lore.kernel.org/all/ZCCbATwov4U+GBUv@pop-os.localdomain/ Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/virtio_transport_common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 37934dfe72f4..ee78b4082ef9 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -94,6 +94,11 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, info->op, info->flags); + if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) { + WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n"); + goto out; + } + return skb; out: @@ -1303,6 +1308,11 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, goto free_pkt; } + if (!skb_set_owner_sk_safe(skb, sk)) { + WARN_ONCE(1, "receiving vsock socket has sk_refcnt == 0\n"); + goto free_pkt; + } + vsk = vsock_sk(sk); lock_sock(sk); -- cgit v1.2.3 From 44d807320000db0d0013372ad39b53e12d52f758 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Thu, 30 Mar 2023 09:25:32 +0800 Subject: net: qrtr: Fix a refcount bug in qrtr_recvmsg() Syzbot reported a bug as following: refcount_t: addition on 0; use-after-free. ... RIP: 0010:refcount_warn_saturate+0x17c/0x1f0 lib/refcount.c:25 ... Call Trace: __refcount_add include/linux/refcount.h:199 [inline] __refcount_inc include/linux/refcount.h:250 [inline] refcount_inc include/linux/refcount.h:267 [inline] kref_get include/linux/kref.h:45 [inline] qrtr_node_acquire net/qrtr/af_qrtr.c:202 [inline] qrtr_node_lookup net/qrtr/af_qrtr.c:398 [inline] qrtr_send_resume_tx net/qrtr/af_qrtr.c:1003 [inline] qrtr_recvmsg+0x85f/0x990 net/qrtr/af_qrtr.c:1070 sock_recvmsg_nosec net/socket.c:1017 [inline] sock_recvmsg+0xe2/0x160 net/socket.c:1038 qrtr_ns_worker+0x170/0x1700 net/qrtr/ns.c:688 process_one_work+0x991/0x15c0 kernel/workqueue.c:2390 worker_thread+0x669/0x1090 kernel/workqueue.c:2537 It occurs in the concurrent scenario of qrtr_recvmsg() and qrtr_endpoint_unregister() as following: cpu0 cpu1 qrtr_recvmsg qrtr_endpoint_unregister qrtr_send_resume_tx qrtr_node_release qrtr_node_lookup mutex_lock(&qrtr_node_lock) spin_lock_irqsave(&qrtr_nodes_lock, ) refcount_dec_and_test(&node->ref) [node->ref == 0] radix_tree_lookup [node != NULL] __qrtr_node_release qrtr_node_acquire spin_lock_irqsave(&qrtr_nodes_lock, ) kref_get(&node->ref) [WARNING] ... mutex_unlock(&qrtr_node_lock) Use qrtr_node_lock to protect qrtr_node_lookup() implementation, this is actually improving the protection of node reference. Fixes: 0a7e0d0ef054 ("net: qrtr: Migrate node lookup tree to spinlock") Reported-by: syzbot+a7492efaa5d61b51db23@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?extid=a7492efaa5d61b51db23 Signed-off-by: Ziyang Xuan Signed-off-by: David S. Miller --- net/qrtr/af_qrtr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 5c2fb992803b..3a70255c8d02 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -393,10 +393,12 @@ static struct qrtr_node *qrtr_node_lookup(unsigned int nid) struct qrtr_node *node; unsigned long flags; + mutex_lock(&qrtr_node_lock); spin_lock_irqsave(&qrtr_nodes_lock, flags); node = radix_tree_lookup(&qrtr_nodes, nid); node = qrtr_node_acquire(node); spin_unlock_irqrestore(&qrtr_nodes_lock, flags); + mutex_unlock(&qrtr_node_lock); return node; } -- cgit v1.2.3 From 154e07c164859fc90bf4e8143f2f6c1af9f3a35e Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 30 Mar 2023 11:54:42 +0200 Subject: l2tp: generate correct module alias strings Commit 65b32f801bfb ("uapi: move IPPROTO_L2TP to in.h") moved the definition of IPPROTO_L2TP from a define to an enum, but since __stringify doesn't work properly with enums, we ended up breaking the modalias strings for the l2tp modules: $ modinfo l2tp_ip l2tp_ip6 | grep alias alias: net-pf-2-proto-IPPROTO_L2TP alias: net-pf-2-proto-2-type-IPPROTO_L2TP alias: net-pf-10-proto-IPPROTO_L2TP alias: net-pf-10-proto-2-type-IPPROTO_L2TP Use the resolved number directly in MODULE_ALIAS_*() macros (as we already do with SOCK_DGRAM) to fix the alias strings: $ modinfo l2tp_ip l2tp_ip6 | grep alias alias: net-pf-2-proto-115 alias: net-pf-2-proto-115-type-2 alias: net-pf-10-proto-115 alias: net-pf-10-proto-115-type-2 Moreover, fix the ordering of the parameters passed to MODULE_ALIAS_NET_PF_PROTO_TYPE() by switching proto and type. Fixes: 65b32f801bfb ("uapi: move IPPROTO_L2TP to in.h") Link: https://lore.kernel.org/lkml/ZCQt7hmodtUaBlCP@righiandr-XPS-13-7390 Signed-off-by: Guillaume Nault Signed-off-by: Andrea Righi Reviewed-by: Wojciech Drewek Tested-by: Wojciech Drewek Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 8 ++++---- net/l2tp/l2tp_ip6.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 4db5a554bdbd..41a74fc84ca1 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -677,8 +677,8 @@ MODULE_AUTHOR("James Chapman "); MODULE_DESCRIPTION("L2TP over IP"); MODULE_VERSION("1.0"); -/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like - * enums +/* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, + * because __stringify doesn't like enums */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP); -MODULE_ALIAS_NET_PF_PROTO(PF_INET, IPPROTO_L2TP); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 115, 2); +MODULE_ALIAS_NET_PF_PROTO(PF_INET, 115); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 2478aa60145f..5137ea1861ce 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -806,8 +806,8 @@ MODULE_AUTHOR("Chris Elston "); MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6"); MODULE_VERSION("1.0"); -/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like - * enums +/* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, + * because __stringify doesn't like enums */ -MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 2, IPPROTO_L2TP); -MODULE_ALIAS_NET_PF_PROTO(PF_INET6, IPPROTO_L2TP); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 115, 2); +MODULE_ALIAS_NET_PF_PROTO(PF_INET6, 115); -- cgit v1.2.3 From ffa5395a7901e83a68d88207c4592962906641bd Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Fri, 31 Mar 2023 10:56:41 +0300 Subject: vsock/vmci: convert VMCI error code to -ENOMEM on send This adds conversion of VMCI specific error code to general -ENOMEM. It is needed, because af_vsock.c passes error value returned from transport to the user, which does not expect to get VMCI_ERROR_* values. Fixes: c43170b7e157 ("vsock: return errors other than -ENOMEM to socket") Signed-off-by: Arseniy Krasnov Reviewed-by: Vishnu Dasa Reviewed-by: Stefano Garzarella Signed-off-by: David S. Miller --- net/vmw_vsock/vmci_transport.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 36eb16a40745..95cc4d79ba29 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1842,7 +1842,13 @@ static ssize_t vmci_transport_stream_enqueue( struct msghdr *msg, size_t len) { - return vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg, len, 0); + ssize_t err; + + err = vmci_qpair_enquev(vmci_trans(vsk)->qpair, msg, len, 0); + if (err < 0) + err = -ENOMEM; + + return err; } static s64 vmci_transport_stream_has_data(struct vsock_sock *vsk) -- cgit v1.2.3 From 7d63b67125382ff0ffdfca434acbc94a38bd092b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Mar 2023 17:45:02 +0000 Subject: icmp: guard against too small mtu syzbot was able to trigger a panic [1] in icmp_glue_bits(), or more exactly in skb_copy_and_csum_bits() There is no repro yet, but I think the issue is that syzbot manages to lower device mtu to a small value, fooling __icmp_send() __icmp_send() must make sure there is enough room for the packet to include at least the headers. We might in the future refactor skb_copy_and_csum_bits() and its callers to no longer crash when something bad happens. [1] kernel BUG at net/core/skbuff.c:3343 ! invalid opcode: 0000 [#1] PREEMPT SMP KASAN CPU: 0 PID: 15766 Comm: syz-executor.0 Not tainted 6.3.0-rc4-syzkaller-00039-gffe78bbd5121 #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 RIP: 0010:skb_copy_and_csum_bits+0x798/0x860 net/core/skbuff.c:3343 Code: f0 c1 c8 08 41 89 c6 e9 73 ff ff ff e8 61 48 d4 f9 e9 41 fd ff ff 48 8b 7c 24 48 e8 52 48 d4 f9 e9 c3 fc ff ff e8 c8 27 84 f9 <0f> 0b 48 89 44 24 28 e8 3c 48 d4 f9 48 8b 44 24 28 e9 9d fb ff ff RSP: 0018:ffffc90000007620 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 00000000000001e8 RCX: 0000000000000100 RDX: ffff8880276f6280 RSI: ffffffff87fdd138 RDI: 0000000000000005 RBP: 0000000000000000 R08: 0000000000000005 R09: 0000000000000000 R10: 00000000000001e8 R11: 0000000000000001 R12: 000000000000003c R13: 0000000000000000 R14: ffff888028244868 R15: 0000000000000b0e FS: 00007fbc81f1c700(0000) GS:ffff88802ca00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000001b2df43000 CR3: 00000000744db000 CR4: 0000000000150ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: icmp_glue_bits+0x7b/0x210 net/ipv4/icmp.c:353 __ip_append_data+0x1d1b/0x39f0 net/ipv4/ip_output.c:1161 ip_append_data net/ipv4/ip_output.c:1343 [inline] ip_append_data+0x115/0x1a0 net/ipv4/ip_output.c:1322 icmp_push_reply+0xa8/0x440 net/ipv4/icmp.c:370 __icmp_send+0xb80/0x1430 net/ipv4/icmp.c:765 ipv4_send_dest_unreach net/ipv4/route.c:1239 [inline] ipv4_link_failure+0x5a9/0x9e0 net/ipv4/route.c:1246 dst_link_failure include/net/dst.h:423 [inline] arp_error_report+0xcb/0x1c0 net/ipv4/arp.c:296 neigh_invalidate+0x20d/0x560 net/core/neighbour.c:1079 neigh_timer_handler+0xc77/0xff0 net/core/neighbour.c:1166 call_timer_fn+0x1a0/0x580 kernel/time/timer.c:1700 expire_timers+0x29b/0x4b0 kernel/time/timer.c:1751 __run_timers kernel/time/timer.c:2022 [inline] Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: syzbot+d373d60fddbdc915e666@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230330174502.1915328-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 8cebb476b3ab..b8607763d113 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -749,6 +749,11 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, room = 576; room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; room -= sizeof(struct icmphdr); + /* Guard against tiny mtu. We need to include at least one + * IP network header for this message to make any sense. + */ + if (room <= (int)sizeof(struct iphdr)) + goto ende; icmp_param.data_len = skb_in->len - icmp_param.offset; if (icmp_param.data_len > room) -- cgit v1.2.3 From 275b471e3d2daf1472ae8fa70dc1b50c9e0b9e75 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 30 Mar 2023 19:21:44 -0700 Subject: net: don't let netpoll invoke NAPI if in xmit context Commit 0db3dc73f7a3 ("[NETPOLL]: tx lock deadlock fix") narrowed down the region under netif_tx_trylock() inside netpoll_send_skb(). (At that point in time netif_tx_trylock() would lock all queues of the device.) Taking the tx lock was problematic because driver's cleanup method may take the same lock. So the change made us hold the xmit lock only around xmit, and expected the driver to take care of locking within ->ndo_poll_controller(). Unfortunately this only works if netpoll isn't itself called with the xmit lock already held. Netpoll code is careful and uses trylock(). The drivers, however, may be using plain lock(). Printing while holding the xmit lock is going to result in rare deadlocks. Luckily we record the xmit lock owners, so we can scan all the queues, the same way we scan NAPI owners. If any of the xmit locks is held by the local CPU we better not attempt any polling. It would be nice if we could narrow down the check to only the NAPIs and the queue we're trying to use. I don't see a way to do that now. Reported-by: Roman Gushchin Fixes: 0db3dc73f7a3 ("[NETPOLL]: tx lock deadlock fix") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/netpoll.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index a089b704b986..e6a739b1afa9 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -137,6 +137,20 @@ static void queue_process(struct work_struct *work) } } +static int netif_local_xmit_active(struct net_device *dev) +{ + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct netdev_queue *txq = netdev_get_tx_queue(dev, i); + + if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id()) + return 1; + } + + return 0; +} + static void poll_one_napi(struct napi_struct *napi) { int work; @@ -183,7 +197,10 @@ void netpoll_poll_dev(struct net_device *dev) if (!ni || down_trylock(&ni->dev_lock)) return; - if (!netif_running(dev)) { + /* Some drivers will take the same locks in poll and xmit, + * we can't poll if local CPU is already in xmit. + */ + if (!netif_running(dev) || netif_local_xmit_active(dev)) { up(&ni->dev_lock); return; } -- cgit v1.2.3 From 2584024b23552c00d95b50255e47bd18d306d31a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 Apr 2023 19:09:57 -0400 Subject: sctp: check send stream number after wait_for_sndbuf This patch fixes a corner case where the asoc out stream count may change after wait_for_sndbuf. When the main thread in the client starts a connection, if its out stream count is set to N while the in stream count in the server is set to N - 2, another thread in the client keeps sending the msgs with stream number N - 1, and waits for sndbuf before processing INIT_ACK. However, after processing INIT_ACK, the out stream count in the client is shrunk to N - 2, the same to the in stream count in the server. The crash occurs when the thread waiting for sndbuf is awake and sends the msg in a non-existing stream(N - 1), the call trace is as below: KASAN: null-ptr-deref in range [0x0000000000000038-0x000000000000003f] Call Trace: sctp_cmd_send_msg net/sctp/sm_sideeffect.c:1114 [inline] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1777 [inline] sctp_side_effects net/sctp/sm_sideeffect.c:1199 [inline] sctp_do_sm+0x197d/0x5310 net/sctp/sm_sideeffect.c:1170 sctp_primitive_SEND+0x9f/0xc0 net/sctp/primitive.c:163 sctp_sendmsg_to_asoc+0x10eb/0x1a30 net/sctp/socket.c:1868 sctp_sendmsg+0x8d4/0x1d90 net/sctp/socket.c:2026 inet_sendmsg+0x9d/0xe0 net/ipv4/af_inet.c:825 sock_sendmsg_nosec net/socket.c:722 [inline] sock_sendmsg+0xde/0x190 net/socket.c:745 The fix is to add an unlikely check for the send stream number after the thread wakes up from the wait_for_sndbuf. Fixes: 5bbbbe32a431 ("sctp: introduce stream scheduler foundations") Reported-by: syzbot+47c24ca20a2fa01f082e@syzkaller.appspotmail.com Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/socket.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b91616f819de..218e0982c370 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1830,6 +1830,10 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc, err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); if (err) goto err; + if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) { + err = -EINVAL; + goto err; + } } if (sctp_state(asoc, CLOSED)) { -- cgit v1.2.3 From 839349d13905927d8a567ca4d21d88c82028e31d Mon Sep 17 00:00:00 2001 From: Sricharan Ramabadhran Date: Mon, 3 Apr 2023 12:28:51 +0530 Subject: net: qrtr: Do not do DEL_SERVER broadcast after DEL_CLIENT On the remote side, when QRTR socket is removed, af_qrtr will call qrtr_port_remove() which broadcasts the DEL_CLIENT packet to all neighbours including local NS. NS upon receiving the DEL_CLIENT packet, will remove the lookups associated with the node:port and broadcasts the DEL_SERVER packet. But on the host side, due to the arrival of the DEL_CLIENT packet, the NS would've already deleted the server belonging to that port. So when the remote's NS again broadcasts the DEL_SERVER for that port, it throws below error message on the host: "failed while handling packet from 2:-2" So fix this error by not broadcasting the DEL_SERVER packet when the DEL_CLIENT packet gets processed." Fixes: 0c2204a4ad71 ("net: qrtr: Migrate nameservice to kernel from userspace") Reviewed-by: Manivannan Sadhasivam Signed-off-by: Ram Kumar Dharuman Signed-off-by: Sricharan Ramabadhran Signed-off-by: David S. Miller --- net/qrtr/ns.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 722936f7dd98..0f25a386138c 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -274,7 +274,7 @@ err: return NULL; } -static int server_del(struct qrtr_node *node, unsigned int port) +static int server_del(struct qrtr_node *node, unsigned int port, bool bcast) { struct qrtr_lookup *lookup; struct qrtr_server *srv; @@ -287,7 +287,7 @@ static int server_del(struct qrtr_node *node, unsigned int port) radix_tree_delete(&node->servers, port); /* Broadcast the removal of local servers */ - if (srv->node == qrtr_ns.local_node) + if (srv->node == qrtr_ns.local_node && bcast) service_announce_del(&qrtr_ns.bcast_sq, srv); /* Announce the service's disappearance to observers */ @@ -373,7 +373,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) } slot = radix_tree_iter_resume(slot, &iter); rcu_read_unlock(); - server_del(node, srv->port); + server_del(node, srv->port, true); rcu_read_lock(); } rcu_read_unlock(); @@ -459,10 +459,13 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, kfree(lookup); } - /* Remove the server belonging to this port */ + /* Remove the server belonging to this port but don't broadcast + * DEL_SERVER. Neighbours would've already removed the server belonging + * to this port due to the DEL_CLIENT broadcast from qrtr_port_remove(). + */ node = node_get(node_id); if (node) - server_del(node, port); + server_del(node, port, false); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); @@ -567,7 +570,7 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, if (!node) return -ENOENT; - return server_del(node, port); + return server_del(node, port, true); } static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, -- cgit v1.2.3 From ea30388baebcce37fd594d425a65037ca35e59e8 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Mon, 3 Apr 2023 15:34:17 +0800 Subject: ipv6: Fix an uninit variable access bug in __ip6_make_skb() Syzbot reported a bug as following: ===================================================== BUG: KMSAN: uninit-value in arch_atomic64_inc arch/x86/include/asm/atomic64_64.h:88 [inline] BUG: KMSAN: uninit-value in arch_atomic_long_inc include/linux/atomic/atomic-long.h:161 [inline] BUG: KMSAN: uninit-value in atomic_long_inc include/linux/atomic/atomic-instrumented.h:1429 [inline] BUG: KMSAN: uninit-value in __ip6_make_skb+0x2f37/0x30f0 net/ipv6/ip6_output.c:1956 arch_atomic64_inc arch/x86/include/asm/atomic64_64.h:88 [inline] arch_atomic_long_inc include/linux/atomic/atomic-long.h:161 [inline] atomic_long_inc include/linux/atomic/atomic-instrumented.h:1429 [inline] __ip6_make_skb+0x2f37/0x30f0 net/ipv6/ip6_output.c:1956 ip6_finish_skb include/net/ipv6.h:1122 [inline] ip6_push_pending_frames+0x10e/0x550 net/ipv6/ip6_output.c:1987 rawv6_push_pending_frames+0xb12/0xb90 net/ipv6/raw.c:579 rawv6_sendmsg+0x297e/0x2e60 net/ipv6/raw.c:922 inet_sendmsg+0x101/0x180 net/ipv4/af_inet.c:827 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg net/socket.c:734 [inline] ____sys_sendmsg+0xa8e/0xe70 net/socket.c:2476 ___sys_sendmsg+0x2a1/0x3f0 net/socket.c:2530 __sys_sendmsg net/socket.c:2559 [inline] __do_sys_sendmsg net/socket.c:2568 [inline] __se_sys_sendmsg net/socket.c:2566 [inline] __x64_sys_sendmsg+0x367/0x540 net/socket.c:2566 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd Uninit was created at: slab_post_alloc_hook mm/slab.h:766 [inline] slab_alloc_node mm/slub.c:3452 [inline] __kmem_cache_alloc_node+0x71f/0xce0 mm/slub.c:3491 __do_kmalloc_node mm/slab_common.c:967 [inline] __kmalloc_node_track_caller+0x114/0x3b0 mm/slab_common.c:988 kmalloc_reserve net/core/skbuff.c:492 [inline] __alloc_skb+0x3af/0x8f0 net/core/skbuff.c:565 alloc_skb include/linux/skbuff.h:1270 [inline] __ip6_append_data+0x51c1/0x6bb0 net/ipv6/ip6_output.c:1684 ip6_append_data+0x411/0x580 net/ipv6/ip6_output.c:1854 rawv6_sendmsg+0x2882/0x2e60 net/ipv6/raw.c:915 inet_sendmsg+0x101/0x180 net/ipv4/af_inet.c:827 sock_sendmsg_nosec net/socket.c:714 [inline] sock_sendmsg net/socket.c:734 [inline] ____sys_sendmsg+0xa8e/0xe70 net/socket.c:2476 ___sys_sendmsg+0x2a1/0x3f0 net/socket.c:2530 __sys_sendmsg net/socket.c:2559 [inline] __do_sys_sendmsg net/socket.c:2568 [inline] __se_sys_sendmsg net/socket.c:2566 [inline] __x64_sys_sendmsg+0x367/0x540 net/socket.c:2566 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd It is because icmp6hdr does not in skb linear region under the scenario of SOCK_RAW socket. Access icmp6_hdr(skb)->icmp6_type directly will trigger the uninit variable access bug. Use a local variable icmp6_type to carry the correct value in different scenarios. Fixes: 14878f75abd5 ("[IPV6]: Add ICMPMsgStats MIB (RFC 4293) [rev 2]") Reported-by: syzbot+8257f4dcef79de670baf@syzkaller.appspotmail.com Link: https://syzkaller.appspot.com/bug?id=3d605ec1d0a7f2a269a1a6936ac7f2b85975ee9c Signed-off-by: Ziyang Xuan Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c314fdde0097..95a55c6630ad 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1965,8 +1965,13 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); if (proto == IPPROTO_ICMPV6) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + u8 icmp6_type; - ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type); + if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl) + icmp6_type = fl6->fl6_icmp_type; + else + icmp6_type = icmp6_hdr(skb)->icmp6_type; + ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } -- cgit v1.2.3 From 5085e41f9e83a1bec51da1f20b54f2ec3a13a3fe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 30 Mar 2023 14:24:27 -0400 Subject: sunrpc: only free unix grouplist after RCU settles While the unix_gid object is rcu-freed, the group_info list that it contains is not. Ensure that we only put the group list reference once we are really freeing the unix_gid object. Reported-by: Zhi Li Link: https://bugzilla.redhat.com/show_bug.cgi?id=2183056 Signed-off-by: Jeff Layton Fixes: fd5d2f78261b ("SUNRPC: Make server side AUTH_UNIX use lockless lookups") Signed-off-by: Chuck Lever --- net/sunrpc/svcauth_unix.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 983c5891cb56..4246363cb095 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -416,14 +416,23 @@ static int unix_gid_hash(kuid_t uid) return hash_long(from_kuid(&init_user_ns, uid), GID_HASHBITS); } -static void unix_gid_put(struct kref *kref) +static void unix_gid_free(struct rcu_head *rcu) { - struct cache_head *item = container_of(kref, struct cache_head, ref); - struct unix_gid *ug = container_of(item, struct unix_gid, h); + struct unix_gid *ug = container_of(rcu, struct unix_gid, rcu); + struct cache_head *item = &ug->h; + if (test_bit(CACHE_VALID, &item->flags) && !test_bit(CACHE_NEGATIVE, &item->flags)) put_group_info(ug->gi); - kfree_rcu(ug, rcu); + kfree(ug); +} + +static void unix_gid_put(struct kref *kref) +{ + struct cache_head *item = container_of(kref, struct cache_head, ref); + struct unix_gid *ug = container_of(item, struct unix_gid, h); + + call_rcu(&ug->rcu, unix_gid_free); } static int unix_gid_match(struct cache_head *corig, struct cache_head *cnew) -- cgit v1.2.3 From 0a78cf7264d29abeca098eae0b188a10aabc8a32 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 3 Apr 2023 12:49:58 -0700 Subject: raw: Fix NULL deref in raw_get_next(). Dae R. Jeong reported a NULL deref in raw_get_next() [0]. It seems that the repro was running these sequences in parallel so that one thread was iterating on a socket that was being freed in another netns. unshare(0x40060200) r0 = syz_open_procfs(0x0, &(0x7f0000002080)='net/raw\x00') socket$inet_icmp_raw(0x2, 0x3, 0x1) pread64(r0, &(0x7f0000000000)=""/10, 0xa, 0x10000000007f) After commit 0daf07e52709 ("raw: convert raw sockets to RCU"), we use RCU and hlist_nulls_for_each_entry() to iterate over SOCK_RAW sockets. However, we should use spinlock for slow paths to avoid the NULL deref. Also, SOCK_RAW does not use SLAB_TYPESAFE_BY_RCU, and the slab object is not reused during iteration in the grace period. In fact, the lockless readers do not check the nulls marker with get_nulls_value(). So, SOCK_RAW should use hlist instead of hlist_nulls. Instead of adding an unnecessary barrier by sk_nulls_for_each_rcu(), let's convert hlist_nulls to hlist and use sk_for_each_rcu() for fast paths and sk_for_each() and spinlock for /proc/net/raw. [0]: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 2 PID: 20952 Comm: syz-executor.0 Not tainted 6.2.0-g048ec869bafd-dirty #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_pnet include/net/net_namespace.h:383 [inline] RIP: 0010:sock_net include/net/sock.h:649 [inline] RIP: 0010:raw_get_next net/ipv4/raw.c:974 [inline] RIP: 0010:raw_get_idx net/ipv4/raw.c:986 [inline] RIP: 0010:raw_seq_start+0x431/0x800 net/ipv4/raw.c:995 Code: ef e8 33 3d 94 f7 49 8b 6d 00 4c 89 ef e8 b7 65 5f f7 49 89 ed 49 83 c5 98 0f 84 9a 00 00 00 48 83 c5 c8 48 89 e8 48 c1 e8 03 <42> 80 3c 30 00 74 08 48 89 ef e8 00 3d 94 f7 4c 8b 7d 00 48 89 ef RSP: 0018:ffffc9001154f9b0 EFLAGS: 00010206 RAX: 0000000000000005 RBX: 1ffff1100302c8fd RCX: 0000000000000000 RDX: 0000000000000028 RSI: ffffc9001154f988 RDI: ffffc9000f77a338 RBP: 0000000000000029 R08: ffffffff8a50ffb4 R09: fffffbfff24b6bd9 R10: fffffbfff24b6bd9 R11: 0000000000000000 R12: ffff88801db73b78 R13: fffffffffffffff9 R14: dffffc0000000000 R15: 0000000000000030 FS: 00007f843ae8e700(0000) GS:ffff888063700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bb9614b35f CR3: 000000003c672000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: seq_read_iter+0x4c6/0x10f0 fs/seq_file.c:225 seq_read+0x224/0x320 fs/seq_file.c:162 pde_read fs/proc/inode.c:316 [inline] proc_reg_read+0x23f/0x330 fs/proc/inode.c:328 vfs_read+0x31e/0xd30 fs/read_write.c:468 ksys_pread64 fs/read_write.c:665 [inline] __do_sys_pread64 fs/read_write.c:675 [inline] __se_sys_pread64 fs/read_write.c:672 [inline] __x64_sys_pread64+0x1e9/0x280 fs/read_write.c:672 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x4e/0xa0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x478d29 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f843ae8dbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000011 RAX: ffffffffffffffda RBX: 0000000000791408 RCX: 0000000000478d29 RDX: 000000000000000a RSI: 0000000020000000 RDI: 0000000000000003 RBP: 00000000f477909a R08: 0000000000000000 R09: 0000000000000000 R10: 000010000000007f R11: 0000000000000246 R12: 0000000000791740 R13: 0000000000791414 R14: 0000000000791408 R15: 00007ffc2eb48a50 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:read_pnet include/net/net_namespace.h:383 [inline] RIP: 0010:sock_net include/net/sock.h:649 [inline] RIP: 0010:raw_get_next net/ipv4/raw.c:974 [inline] RIP: 0010:raw_get_idx net/ipv4/raw.c:986 [inline] RIP: 0010:raw_seq_start+0x431/0x800 net/ipv4/raw.c:995 Code: ef e8 33 3d 94 f7 49 8b 6d 00 4c 89 ef e8 b7 65 5f f7 49 89 ed 49 83 c5 98 0f 84 9a 00 00 00 48 83 c5 c8 48 89 e8 48 c1 e8 03 <42> 80 3c 30 00 74 08 48 89 ef e8 00 3d 94 f7 4c 8b 7d 00 48 89 ef RSP: 0018:ffffc9001154f9b0 EFLAGS: 00010206 RAX: 0000000000000005 RBX: 1ffff1100302c8fd RCX: 0000000000000000 RDX: 0000000000000028 RSI: ffffc9001154f988 RDI: ffffc9000f77a338 RBP: 0000000000000029 R08: ffffffff8a50ffb4 R09: fffffbfff24b6bd9 R10: fffffbfff24b6bd9 R11: 0000000000000000 R12: ffff88801db73b78 R13: fffffffffffffff9 R14: dffffc0000000000 R15: 0000000000000030 FS: 00007f843ae8e700(0000) GS:ffff888063700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f92ff166000 CR3: 000000003c672000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 0daf07e52709 ("raw: convert raw sockets to RCU") Reported-by: syzbot Reported-by: Dae R. Jeong Link: https://lore.kernel.org/netdev/ZCA2mGV_cmq7lIfV@dragonet/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/raw.h | 4 ++-- net/ipv4/raw.c | 36 +++++++++++++++++++----------------- net/ipv4/raw_diag.c | 10 ++++------ net/ipv6/raw.c | 10 ++++------ 4 files changed, 29 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/include/net/raw.h b/include/net/raw.h index 2c004c20ed99..3af5289fdead 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -37,7 +37,7 @@ int raw_rcv(struct sock *, struct sk_buff *); struct raw_hashinfo { spinlock_t lock; - struct hlist_nulls_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned; + struct hlist_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned; }; static inline u32 raw_hashfunc(const struct net *net, u32 proto) @@ -51,7 +51,7 @@ static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo) spin_lock_init(&hashinfo->lock); for (i = 0; i < RAW_HTABLE_SIZE; i++) - INIT_HLIST_NULLS_HEAD(&hashinfo->ht[i], i); + INIT_HLIST_HEAD(&hashinfo->ht[i]); } #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 94df935ee0c5..8088a5011e7d 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -91,12 +91,12 @@ EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; - struct hlist_nulls_head *hlist; + struct hlist_head *hlist; hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); - __sk_nulls_add_node_rcu(sk, hlist); + sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -110,7 +110,7 @@ void raw_unhash_sk(struct sock *sk) struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; spin_lock(&h->lock); - if (__sk_nulls_del_node_init_rcu(sk)) + if (sk_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&h->lock); } @@ -163,16 +163,15 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) static int raw_v4_input(struct net *net, struct sk_buff *skb, const struct iphdr *iph, int hash) { - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; int sdif = inet_sdif(skb); + struct hlist_head *hlist; int dif = inet_iif(skb); int delivered = 0; struct sock *sk; hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; @@ -264,10 +263,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { struct net *net = dev_net(skb->dev); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); + struct hlist_head *hlist; const struct iphdr *iph; struct sock *sk; int hash; @@ -276,7 +274,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, iph->daddr, iph->saddr, dif, sdif)) @@ -950,14 +948,13 @@ static struct sock *raw_get_first(struct seq_file *seq, int bucket) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; + struct hlist_head *hlist; struct sock *sk; for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { hlist = &h->ht[state->bucket]; - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each(sk, hlist) { if (sock_net(sk) == seq_file_net(seq)) return sk; } @@ -970,7 +967,7 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) struct raw_iter_state *state = raw_seq_private(seq); do { - sk = sk_nulls_next(sk); + sk = sk_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); if (!sk) @@ -989,9 +986,12 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) } void *raw_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) + __acquires(&h->lock) { - rcu_read_lock(); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); + + spin_lock(&h->lock); + return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); @@ -1010,9 +1010,11 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) + __releases(&h->lock) { - rcu_read_unlock(); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); + + spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_seq_stop); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index 999321834b94..da3591a66a16 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -57,8 +57,7 @@ static bool raw_lookup(struct net *net, struct sock *sk, static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; + struct hlist_head *hlist; struct sock *sk; int slot; @@ -68,7 +67,7 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { hlist = &hashinfo->ht[slot]; - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill @@ -142,9 +141,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; int num, s_num, slot, s_slot; + struct hlist_head *hlist; struct sock *sk = NULL; struct nlattr *bc; @@ -161,7 +159,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, num = 0; hlist = &hashinfo->ht[slot]; - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index bac9ba747bde..a327aa481df4 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -141,10 +141,9 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; const struct in6_addr *saddr; const struct in6_addr *daddr; + struct hlist_head *hlist; struct sock *sk; bool delivered = false; __u8 hash; @@ -155,7 +154,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { int filtered; if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, @@ -333,15 +332,14 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { struct net *net = dev_net(skb->dev); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; + struct hlist_head *hlist; struct sock *sk; int hash; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; -- cgit v1.2.3 From ab5fb73ffa01072b4d8031cc05801fa1cb653bee Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 3 Apr 2023 12:49:59 -0700 Subject: ping: Fix potentail NULL deref for /proc/net/icmp. After commit dbca1596bbb0 ("ping: convert to RCU lookups, get rid of rwlock"), we use RCU for ping sockets, but we should use spinlock for /proc/net/icmp to avoid a potential NULL deref mentioned in the previous patch. Let's go back to using spinlock there. Note we can convert ping sockets to use hlist instead of hlist_nulls because we do not use SLAB_TYPESAFE_BY_RCU for ping sockets. Fixes: dbca1596bbb0 ("ping: convert to RCU lookups, get rid of rwlock") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/ping.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 409ec2a1f95b..5178a3f3cb53 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1089,13 +1089,13 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) - __acquires(RCU) + __acquires(ping_table.lock) { struct ping_iter_state *state = seq->private; state->bucket = 0; state->family = family; - rcu_read_lock(); + spin_lock(&ping_table.lock); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } @@ -1121,9 +1121,9 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) + __releases(ping_table.lock) { - rcu_read_unlock(); + spin_unlock(&ping_table.lock); } EXPORT_SYMBOL_GPL(ping_seq_stop); -- cgit v1.2.3 From e847c7675e19ef344913724dc68f83df31ad6a17 Mon Sep 17 00:00:00 2001 From: Andy Roulin Date: Mon, 3 Apr 2023 14:20:53 -0700 Subject: ethtool: reset #lanes when lanes is omitted If the number of lanes was forced and then subsequently the user omits this parameter, the ksettings->lanes is reset. The driver should then reset the number of lanes to the device's default for the specified speed. However, although the ksettings->lanes is set to 0, the mod variable is not set to true to indicate the driver and userspace should be notified of the changes. The consequence is that the same ethtool operation will produce different results based on the initial state. If the initial state is: $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 2 Duplex: Full Auto-negotiation: on then executing 'ethtool -s swp1 speed 50000 autoneg off' will yield: $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 2 Duplex: Full Auto-negotiation: off While if the initial state is: $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 1 Duplex: Full Auto-negotiation: off executing the same 'ethtool -s swp1 speed 50000 autoneg off' results in: $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 1 Duplex: Full Auto-negotiation: off This patch fixes this behavior. Omitting lanes will always results in the driver choosing the default lane width for the chosen speed. In this scenario, regardless of the initial state, the end state will be, e.g., $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 2 Duplex: Full Auto-negotiation: off Fixes: 012ce4dd3102 ("ethtool: Extend link modes settings uAPI with lanes") Signed-off-by: Andy Roulin Reviewed-by: Danielle Ratson Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/ac238d6b-8726-8156-3810-6471291dbc7f@nvidia.com Signed-off-by: Jakub Kicinski --- net/ethtool/linkmodes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index fab66c169b9f..20165e07ef90 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -270,11 +270,12 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, "lanes configuration not supported by device"); return -EOPNOTSUPP; } - } else if (!lsettings->autoneg) { - /* If autoneg is off and lanes parameter is not passed from user, - * set the lanes parameter to 0. + } else if (!lsettings->autoneg && ksettings->lanes) { + /* If autoneg is off and lanes parameter is not passed from user but + * it was defined previously then set the lanes parameter to 0. */ ksettings->lanes = 0; + *mod = true; } ret = ethnl_update_bitset(ksettings->link_modes.advertising, -- cgit v1.2.3 From a1865f2e7d10dde00d35a2122b38d2e469ae67ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 3 Apr 2023 21:46:43 +0000 Subject: netlink: annotate lockless accesses to nlk->max_recvmsg_len syzbot reported a data-race in data-race in netlink_recvmsg() [1] Indeed, netlink_recvmsg() can be run concurrently, and netlink_dump() also needs protection. [1] BUG: KCSAN: data-race in netlink_recvmsg / netlink_recvmsg read to 0xffff888141840b38 of 8 bytes by task 23057 on cpu 0: netlink_recvmsg+0xea/0x730 net/netlink/af_netlink.c:1988 sock_recvmsg_nosec net/socket.c:1017 [inline] sock_recvmsg net/socket.c:1038 [inline] __sys_recvfrom+0x1ee/0x2e0 net/socket.c:2194 __do_sys_recvfrom net/socket.c:2212 [inline] __se_sys_recvfrom net/socket.c:2208 [inline] __x64_sys_recvfrom+0x78/0x90 net/socket.c:2208 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd write to 0xffff888141840b38 of 8 bytes by task 23037 on cpu 1: netlink_recvmsg+0x114/0x730 net/netlink/af_netlink.c:1989 sock_recvmsg_nosec net/socket.c:1017 [inline] sock_recvmsg net/socket.c:1038 [inline] ____sys_recvmsg+0x156/0x310 net/socket.c:2720 ___sys_recvmsg net/socket.c:2762 [inline] do_recvmmsg+0x2e5/0x710 net/socket.c:2856 __sys_recvmmsg net/socket.c:2935 [inline] __do_sys_recvmmsg net/socket.c:2958 [inline] __se_sys_recvmmsg net/socket.c:2951 [inline] __x64_sys_recvmmsg+0xe2/0x160 net/socket.c:2951 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000000000 -> 0x0000000000001000 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 23037 Comm: syz-executor.2 Not tainted 6.3.0-rc4-syzkaller-00195-g5a57b48fdfcb #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 Fixes: 9063e21fb026 ("netlink: autosize skb lengthes") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230403214643.768555-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c64277659753..f365dfdd672d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1952,7 +1952,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - size_t copied; + size_t copied, max_recvmsg_len; struct sk_buff *skb, *data_skb; int err, ret; @@ -1985,9 +1985,10 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, #endif /* Record the max length of recvmsg() calls for future allocations */ - nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); - nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, - SKB_WITH_OVERHEAD(32768)); + max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len); + max_recvmsg_len = min_t(size_t, max_recvmsg_len, + SKB_WITH_OVERHEAD(32768)); + WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len); copied = data_skb->len; if (len < copied) { @@ -2236,6 +2237,7 @@ static int netlink_dump(struct sock *sk) struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; + size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; int alloc_min_size; @@ -2258,8 +2260,9 @@ static int netlink_dump(struct sock *sk) cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); - if (alloc_min_size < nlk->max_recvmsg_len) { - alloc_size = nlk->max_recvmsg_len; + max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len); + if (alloc_min_size < max_recvmsg_len) { + alloc_size = max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); -- cgit v1.2.3 From b45193cb4df556fe6251b285a5ce44046dd36b4a Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Tue, 4 Apr 2023 09:31:28 +0200 Subject: can: j1939: j1939_tp_tx_dat_new(): fix out-of-bounds memory access In the j1939_tp_tx_dat_new() function, an out-of-bounds memory access could occur during the memcpy() operation if the size of skb->cb is larger than the size of struct j1939_sk_buff_cb. This is because the memcpy() operation uses the size of skb->cb, leading to a read beyond the struct j1939_sk_buff_cb. Updated the memcpy() operation to use the size of struct j1939_sk_buff_cb instead of the size of skb->cb. This ensures that the memcpy() operation only reads the memory within the bounds of struct j1939_sk_buff_cb, preventing out-of-bounds memory access. Additionally, add a BUILD_BUG_ON() to check that the size of skb->cb is greater than or equal to the size of struct j1939_sk_buff_cb. This ensures that the skb->cb buffer is large enough to hold the j1939_sk_buff_cb structure. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Reported-by: Shuangpeng Bai Tested-by: Shuangpeng Bai Signed-off-by: Oleksij Rempel Link: https://groups.google.com/g/syzkaller/c/G_LL-C3plRs/m/-8xCi6dCAgAJ Link: https://lore.kernel.org/all/20230404073128.3173900-1-o.rempel@pengutronix.de Cc: stable@vger.kernel.org [mkl: rephrase commit message] Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index fb92c3609e17..fe3df23a2595 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -604,7 +604,10 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, /* reserve CAN header */ skb_reserve(skb, offsetof(struct can_frame, data)); - memcpy(skb->cb, re_skcb, sizeof(skb->cb)); + /* skb->cb must be large enough to hold a j1939_sk_buff_cb structure */ + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*re_skcb)); + + memcpy(skb->cb, re_skcb, sizeof(*re_skcb)); skcb = j1939_skb_to_cb(skb); if (swap_src_dst) j1939_skbcb_swap(skcb); -- cgit v1.2.3 From 0145462fc802cd447ef5d029758043c7f15b4b1e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Thu, 30 Mar 2023 19:02:48 +0200 Subject: can: isotp: isotp_recvmsg(): use sock_recv_cmsgs() to get SOCK_RXQ_OVFL infos isotp.c was still using sock_recv_timestamp() which does not provide control messages to detect dropped PDUs in the receive path. Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20230330170248.62342-1-socketcan@hartkopp.net Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index 9bc344851704..47c2ebad10ed 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1120,7 +1120,7 @@ static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, if (ret < 0) goto out_err; - sock_recv_timestamp(msg, sk, skb); + sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(ISOTP_MIN_NAMELEN); -- cgit v1.2.3 From 79e19fa79cb5d5f1b3bf3e3ae24989ccb93c7b7b Mon Sep 17 00:00:00 2001 From: Michal Sojka Date: Fri, 31 Mar 2023 14:55:11 +0200 Subject: can: isotp: isotp_ops: fix poll() to not report false EPOLLOUT events When using select()/poll()/epoll() with a non-blocking ISOTP socket to wait for when non-blocking write is possible, a false EPOLLOUT event is sometimes returned. This can happen at least after sending a message which must be split to multiple CAN frames. The reason is that isotp_sendmsg() returns -EAGAIN when tx.state is not equal to ISOTP_IDLE and this behavior is not reflected in datagram_poll(), which is used in isotp_ops. This is fixed by introducing ISOTP-specific poll function, which suppresses the EPOLLOUT events in that case. v2: https://lore.kernel.org/all/20230302092812.320643-1-michal.sojka@cvut.cz v1: https://lore.kernel.org/all/20230224010659.48420-1-michal.sojka@cvut.cz https://lore.kernel.org/all/b53a04a2-ba1f-3858-84c1-d3eb3301ae15@hartkopp.net Signed-off-by: Michal Sojka Reported-by: Jakub Jira Tested-by: Oliver Hartkopp Acked-by: Oliver Hartkopp Fixes: e057dd3fc20f ("can: add ISO 15765-2:2016 transport protocol") Link: https://lore.kernel.org/all/20230331125511.372783-1-michal.sojka@cvut.cz Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index 47c2ebad10ed..281b7766c54e 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1608,6 +1608,21 @@ static int isotp_init(struct sock *sk) return 0; } +static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + struct isotp_sock *so = isotp_sk(sk); + + __poll_t mask = datagram_poll(file, sock, wait); + poll_wait(file, &so->wait, wait); + + /* Check for false positives due to TX state */ + if ((mask & EPOLLWRNORM) && (so->tx.state != ISOTP_IDLE)) + mask &= ~(EPOLLOUT | EPOLLWRNORM); + + return mask; +} + static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { @@ -1623,7 +1638,7 @@ static const struct proto_ops isotp_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = isotp_getname, - .poll = datagram_poll, + .poll = isotp_poll, .ioctl = isotp_sock_no_ioctlcmd, .gettstamp = sock_gettstamp, .listen = sock_no_listen, -- cgit v1.2.3 From 051737439eaee5bdd03d3c2ef5510d54a478fd05 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Fri, 31 Mar 2023 15:19:35 +0200 Subject: can: isotp: fix race between isotp_sendsmg() and isotp_release() As discussed with Dae R. Jeong and Hillf Danton here [1] the sendmsg() function in isotp.c might get into a race condition when restoring the former tx.state from the old_state. Remove the old_state concept and implement proper locking for the ISOTP_IDLE transitions in isotp_sendmsg(), inspired by a simplification idea from Hillf Danton. Introduce a new tx.state ISOTP_SHUTDOWN and use the same locking mechanism from isotp_release() which resolves a potential race between isotp_sendsmg() and isotp_release(). [1] https://lore.kernel.org/linux-can/ZB%2F93xJxq%2FBUqAgG@dragonet v1: https://lore.kernel.org/all/20230331102114.15164-1-socketcan@hartkopp.net v2: https://lore.kernel.org/all/20230331123600.3550-1-socketcan@hartkopp.net take care of signal interrupts for wait_event_interruptible() in isotp_release() v3: https://lore.kernel.org/all/20230331130654.9886-1-socketcan@hartkopp.net take care of signal interrupts for wait_event_interruptible() in isotp_sendmsg() in the wait_tx_done case v4: https://lore.kernel.org/all/20230331131935.21465-1-socketcan@hartkopp.net take care of signal interrupts for wait_event_interruptible() in isotp_sendmsg() in ALL cases Cc: Dae R. Jeong Cc: Hillf Danton Signed-off-by: Oliver Hartkopp Fixes: 4f027cba8216 ("can: isotp: split tx timer into transmission and timeout") Link: https://lore.kernel.org/all/20230331131935.21465-1-socketcan@hartkopp.net Cc: stable@vger.kernel.org [mkl: rephrase commit message] Signed-off-by: Marc Kleine-Budde --- net/can/isotp.c | 55 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/can/isotp.c b/net/can/isotp.c index 281b7766c54e..5761d4ab839d 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -119,7 +119,8 @@ enum { ISOTP_WAIT_FIRST_FC, ISOTP_WAIT_FC, ISOTP_WAIT_DATA, - ISOTP_SENDING + ISOTP_SENDING, + ISOTP_SHUTDOWN, }; struct tpcon { @@ -880,8 +881,8 @@ static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer) txtimer); struct sock *sk = &so->sk; - /* don't handle timeouts in IDLE state */ - if (so->tx.state == ISOTP_IDLE) + /* don't handle timeouts in IDLE or SHUTDOWN state */ + if (so->tx.state == ISOTP_IDLE || so->tx.state == ISOTP_SHUTDOWN) return HRTIMER_NORESTART; /* we did not get any flow control or echo frame in time */ @@ -918,7 +919,6 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); - u32 old_state = so->tx.state; struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf; @@ -928,23 +928,24 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) int off; int err; - if (!so->bound) + if (!so->bound || so->tx.state == ISOTP_SHUTDOWN) return -EADDRNOTAVAIL; +wait_free_buffer: /* we do not support multiple buffers - for now */ - if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE || - wq_has_sleeper(&so->wait)) { - if (msg->msg_flags & MSG_DONTWAIT) { - err = -EAGAIN; - goto err_out; - } + if (wq_has_sleeper(&so->wait) && (msg->msg_flags & MSG_DONTWAIT)) + return -EAGAIN; - /* wait for complete transmission of current pdu */ - err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); - if (err) - goto err_out; + /* wait for complete transmission of current pdu */ + err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + if (err) + goto err_event_drop; - so->tx.state = ISOTP_SENDING; + if (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) { + if (so->tx.state == ISOTP_SHUTDOWN) + return -EADDRNOTAVAIL; + + goto wait_free_buffer; } if (!size || size > MAX_MSG_LENGTH) { @@ -1074,7 +1075,9 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (wait_tx_done) { /* wait for complete transmission of current pdu */ - wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + if (err) + goto err_event_drop; if (sk->sk_err) return -sk->sk_err; @@ -1082,13 +1085,15 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return size; +err_event_drop: + /* got signal: force tx state machine to be idle */ + so->tx.state = ISOTP_IDLE; + hrtimer_cancel(&so->txfrtimer); + hrtimer_cancel(&so->txtimer); err_out_drop: /* drop this PDU and unlock a potential wait queue */ - old_state = ISOTP_IDLE; -err_out: - so->tx.state = old_state; - if (so->tx.state == ISOTP_IDLE) - wake_up_interruptible(&so->wait); + so->tx.state = ISOTP_IDLE; + wake_up_interruptible(&so->wait); return err; } @@ -1150,10 +1155,12 @@ static int isotp_release(struct socket *sock) net = sock_net(sk); /* wait for complete transmission of current pdu */ - wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE); + while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 && + cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE) + ; /* force state machines to be idle also when a signal occurred */ - so->tx.state = ISOTP_IDLE; + so->tx.state = ISOTP_SHUTDOWN; so->rx.state = ISOTP_IDLE; spin_lock(&isotp_notifier_lock); -- cgit v1.2.3