summaryrefslogtreecommitdiffstats
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/af_inet.c29
-rw-r--r--net/ipv4/arp.c33
-rw-r--r--net/ipv4/esp4_offload.c1
-rw-r--r--net/ipv4/fou.c26
-rw-r--r--net/ipv4/gre_offload.c13
-rw-r--r--net/ipv4/igmp.c1
-rw-r--r--net/ipv4/inet_connection_sock.c2
-rw-r--r--net/ipv4/inet_hashtables.c8
-rw-r--r--net/ipv4/ip_output.c1
-rw-r--r--net/ipv4/ip_sockglue.c2
-rw-r--r--net/ipv4/nexthop.c8
-rw-r--r--net/ipv4/ping.c14
-rw-r--r--net/ipv4/raw.c15
-rw-r--r--net/ipv4/route.c2
-rw-r--r--net/ipv4/syncookies.c2
-rw-r--r--net/ipv4/tcp.c91
-rw-r--r--net/ipv4/tcp_input.c8
-rw-r--r--net/ipv4/tcp_ipv4.c10
-rw-r--r--net/ipv4/tcp_offload.c1
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/udp.c10
-rw-r--r--net/ipv4/udp_offload.c32
22 files changed, 161 insertions, 150 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 0189e3cd4a7d..04067b249bf3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/inet_connection_sock.h>
+#include <net/gro.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/udplite.h>
@@ -224,7 +225,7 @@ int inet_listen(struct socket *sock, int backlog)
tcp_fastopen_init_key_once(sock_net(sk));
}
- err = inet_csk_listen_start(sk, backlog);
+ err = inet_csk_listen_start(sk);
if (err)
goto out;
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
@@ -488,11 +489,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
- if (!inet_can_nonlocal_bind(net, inet) &&
- addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
- chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST &&
- chk_addr_ret != RTN_BROADCAST)
+ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+ chk_addr_ret))
goto out;
snum = ntohs(addr->sin_port);
@@ -1454,19 +1452,18 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
proto = iph->protocol;
- rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
- goto out_unlock;
+ goto out;
if (*(u8 *)iph != 0x45)
- goto out_unlock;
+ goto out;
if (ip_is_fragment(iph))
- goto out_unlock;
+ goto out;
if (unlikely(ip_fast_csum((u8 *)iph, 5)))
- goto out_unlock;
+ goto out;
id = ntohl(*(__be32 *)&iph->id);
flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
@@ -1543,9 +1540,6 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
ops->callbacks.gro_receive, head, skb);
-out_unlock:
- rcu_read_unlock();
-
out:
skb_gro_flush_final(skb, pp, flush);
@@ -1618,10 +1612,9 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
csum_replace2(&iph->check, iph->tot_len, newlen);
iph->tot_len = newlen;
- rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
/* Only need to add sizeof(*iph) to get to the next hdr below
* because any hdr with option will have been flushed in
@@ -1631,9 +1624,7 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
tcp4_gro_complete, udp4_gro_complete,
skb, nhoff + sizeof(*iph));
-out_unlock:
- rcu_read_unlock();
-
+out:
return err;
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 857a144b1ea9..4db0325f6e1a 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1299,21 +1299,6 @@ static struct packet_type arp_packet_type __read_mostly = {
.func = arp_rcv,
};
-static int arp_proc_init(void);
-
-void __init arp_init(void)
-{
- neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
-
- dev_add_pack(&arp_packet_type);
- arp_proc_init();
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
-#endif
- register_netdevice_notifier(&arp_netdev_notifier);
-}
-
-#ifdef CONFIG_PROC_FS
#if IS_ENABLED(CONFIG_AX25)
/* ------------------------------------------------------------------------ */
@@ -1451,16 +1436,14 @@ static struct pernet_operations arp_net_ops = {
.exit = arp_net_exit,
};
-static int __init arp_proc_init(void)
+void __init arp_init(void)
{
- return register_pernet_subsys(&arp_net_ops);
-}
-
-#else /* CONFIG_PROC_FS */
+ neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
-static int __init arp_proc_init(void)
-{
- return 0;
+ dev_add_pack(&arp_packet_type);
+ register_pernet_subsys(&arp_net_ops);
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
+#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
}
-
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 8e4e9aa12130..d87f02a6e934 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -16,6 +16,7 @@
#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <net/gro.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 8fcbc6258ec5..0d085cc8d96c 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -9,6 +9,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <net/genetlink.h>
+#include <net/gro.h>
#include <net/gue.h>
#include <net/fou.h>
#include <net/ip.h>
@@ -246,17 +247,14 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
- goto out_unlock;
+ goto out;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
-out_unlock:
- rcu_read_unlock();
-
+out:
return pp;
}
@@ -268,19 +266,16 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
const struct net_offload *ops;
int err = -ENOSYS;
- rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
err = ops->callbacks.gro_complete(skb, nhoff);
skb_set_inner_mac_header(skb, nhoff);
-out_unlock:
- rcu_read_unlock();
-
+out:
return err;
}
@@ -438,17 +433,14 @@ next_proto:
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
- goto out_unlock;
+ goto out;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
flush = 0;
-out_unlock:
- rcu_read_unlock();
out:
skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
@@ -485,18 +477,16 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
return err;
}
- rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
skb_set_inner_mac_header(skb, nhoff + guehlen);
-out_unlock:
- rcu_read_unlock();
+out:
return err;
}
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 1121a9d5fed9..07073fa35205 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -10,6 +10,7 @@
#include <linux/init.h>
#include <net/protocol.h>
#include <net/gre.h>
+#include <net/gro.h>
static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
netdev_features_t features)
@@ -162,10 +163,9 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
type = greh->protocol;
- rcu_read_lock();
ptype = gro_find_receive_by_type(type);
if (!ptype)
- goto out_unlock;
+ goto out;
grehlen = GRE_HEADER_SECTION;
@@ -179,13 +179,13 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
if (skb_gro_header_hard(skb, hlen)) {
greh = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!greh))
- goto out_unlock;
+ goto out;
}
/* Don't bother verifying checksum if we're going to flush anyway. */
if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
if (skb_gro_checksum_simple_validate(skb))
- goto out_unlock;
+ goto out;
skb_gro_checksum_try_convert(skb, IPPROTO_GRE,
null_compute_pseudo);
@@ -229,8 +229,6 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
flush = 0;
-out_unlock:
- rcu_read_unlock();
out:
skb_gro_flush_final(skb, pp, flush);
@@ -255,13 +253,10 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
if (greh->flags & GRE_CSUM)
grehlen += GRE_HEADER_SECTION;
- rcu_read_lock();
ptype = gro_find_complete_by_type(type);
if (ptype)
err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
- rcu_read_unlock();
-
skb_set_inner_mac_header(skb, nhoff + grehlen);
return err;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d2e2b3d18c66..2ad3c7b42d6d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2558,7 +2558,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
if (!psl) {
- len = 0;
count = 0;
} else {
count = psl->sl_count;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f7fea3a7c5e6..23da67a3fc06 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1035,7 +1035,7 @@ void inet_csk_prepare_forced_close(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
-int inet_csk_listen_start(struct sock *sk, int backlog)
+int inet_csk_listen_start(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 75737267746f..30ab717ff1b8 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -307,7 +307,7 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net,
struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
__be32 saddr, __be16 sport,
- __be32 daddr, u16 hnum)
+ __be32 daddr, u16 hnum, const int dif)
{
struct sock *sk, *reuse_sk;
bool no_reuseport;
@@ -315,8 +315,8 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net,
if (hashinfo != &tcp_hashinfo)
return NULL; /* only TCP is supported */
- no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP,
- saddr, sport, daddr, hnum, &sk);
+ no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
+ daddr, hnum, dif, &sk);
if (no_reuseport || IS_ERR_OR_NULL(sk))
return sk;
@@ -340,7 +340,7 @@ struct sock *__inet_lookup_listener(struct net *net,
/* Lookup redirect from BPF */
if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
- saddr, sport, daddr, hnum);
+ saddr, sport, daddr, hnum, dif);
if (result)
goto done;
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9bca57ef8b83..57c1d8431386 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -672,7 +672,6 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
struct sk_buff *skb2;
struct iphdr *iph;
- len = state->left;
/* IF: it doesn't fit, use 'mtu' - the data space left */
if (len > state->mtu)
len = state->mtu;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 38d29b175ca6..445a9ecaefa1 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -576,7 +576,7 @@ out:
return err;
}
-static void __ip_sock_set_tos(struct sock *sk, int val)
+void __ip_sock_set_tos(struct sock *sk, int val)
{
if (sk->sk_type == SOCK_STREAM) {
val &= ~INET_ECN_MASK;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 5dbd4b5505eb..1319d093cdda 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1918,9 +1918,6 @@ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
if (!replaced_nh->is_group)
return;
- /* new dsts must use only the new nexthop group */
- synchronize_net();
-
nhg = rtnl_dereference(replaced_nh->nh_grp);
for (i = 0; i < nhg->num_nh; i++) {
struct nh_grp_entry *nhge = &nhg->nh_entries[i];
@@ -2002,9 +1999,10 @@ static int replace_nexthop_grp(struct net *net, struct nexthop *old,
rcu_assign_pointer(old->nh_grp, newg);
+ /* Make sure concurrent readers are not using 'oldg' anymore. */
+ synchronize_net();
+
if (newg->resilient) {
- /* Make sure concurrent readers are not using 'oldg' anymore. */
- synchronize_net();
rcu_assign_pointer(oldg->res_table, tmp_table);
rcu_assign_pointer(oldg->spare->res_table, tmp_table);
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 1e44a43acfe2..e540b0dcf085 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -311,15 +311,11 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
- if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
- chk_addr_ret = RTN_LOCAL;
- else
- chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
-
- if ((!inet_can_nonlocal_bind(net, isk) &&
- chk_addr_ret != RTN_LOCAL) ||
- chk_addr_ret == RTN_MULTICAST ||
- chk_addr_ret == RTN_BROADCAST)
+ chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
+
+ if (!inet_addr_valid_or_nonlocal(net, inet_sk(sk),
+ addr->sin_addr.s_addr,
+ chk_addr_ret))
return -EADDRNOTAVAIL;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index bb446e60cf58..a53f256bf9d3 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -99,8 +99,8 @@ int raw_hash_sk(struct sock *sk)
write_lock_bh(&h->lock);
sk_add_node(sk, head);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&h->lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
}
@@ -717,6 +717,7 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ struct net *net = sock_net(sk);
u32 tb_id = RT_TABLE_LOCAL;
int ret = -EINVAL;
int chk_addr_ret;
@@ -725,16 +726,16 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto out;
if (sk->sk_bound_dev_if)
- tb_id = l3mdev_fib_table_by_index(sock_net(sk),
- sk->sk_bound_dev_if) ? : tb_id;
+ tb_id = l3mdev_fib_table_by_index(net,
+ sk->sk_bound_dev_if) ? : tb_id;
- chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
- tb_id);
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
ret = -EADDRNOTAVAIL;
- if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+ chk_addr_ret))
goto out;
+
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0b4103b1e622..243a0c52be42 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -602,7 +602,7 @@ static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
static u32 fnhe_hashfun(__be32 daddr)
{
- static siphash_key_t fnhe_hash_key __read_mostly;
+ static siphash_aligned_key_t fnhe_hash_key;
u64 hval;
net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 8696dc343ad2..2cb3b852d148 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -14,7 +14,7 @@
#include <net/tcp.h>
#include <net/route.h>
-static siphash_key_t syncookie_secret[2] __read_mostly;
+static siphash_aligned_key_t syncookie_secret[2];
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index bbb3d39c69af..6ab82e1a1d41 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -292,7 +292,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
long sysctl_tcp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_tcp_mem);
-atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
+atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */
EXPORT_SYMBOL(tcp_memory_allocated);
#if IS_ENABLED(CONFIG_SMC)
@@ -303,7 +303,7 @@ EXPORT_SYMBOL(tcp_have_smc);
/*
* Current number of TCP sockets.
*/
-struct percpu_counter tcp_sockets_allocated;
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_sockets_allocated);
/*
@@ -456,7 +456,6 @@ void tcp_init_sock(struct sock *sk)
WRITE_ONCE(sk->sk_rcvbuf, sock_net(sk)->ipv4.sysctl_tcp_rmem[1]);
sk_sockets_allocated_inc(sk);
- sk->sk_route_forced_caps = NETIF_F_GSO;
}
EXPORT_SYMBOL(tcp_init_sock);
@@ -546,10 +545,11 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (state != TCP_SYN_SENT &&
(state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
+ u16 urg_data = READ_ONCE(tp->urg_data);
- if (READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
- !sock_flag(sk, SOCK_URGINLINE) &&
- tp->urg_data)
+ if (unlikely(urg_data) &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
+ !sock_flag(sk, SOCK_URGINLINE))
target++;
if (tcp_stream_is_readable(sk, target))
@@ -574,7 +574,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
} else
mask |= EPOLLOUT | EPOLLWRNORM;
- if (tp->urg_data & TCP_URG_VALID)
+ if (urg_data & TCP_URG_VALID)
mask |= EPOLLPRI;
} else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
/* Active TCP fastopen socket with defer_connect
@@ -608,7 +608,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data &&
+ answ = READ_ONCE(tp->urg_data) &&
READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
@@ -1466,7 +1466,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- tp->urg_data = TCP_URG_READ;
+ WRITE_ONCE(tp->urg_data, TCP_URG_READ);
/* Read urgent data. */
msg->msg_flags |= MSG_OOB;
@@ -1580,6 +1580,36 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
tcp_send_ack(sk);
}
+void __sk_defer_free_flush(struct sock *sk)
+{
+ struct llist_node *head;
+ struct sk_buff *skb, *n;
+
+ head = llist_del_all(&sk->defer_list);
+ llist_for_each_entry_safe(skb, n, head, ll_node) {
+ prefetch(n);
+ skb_mark_not_on_list(skb);
+ __kfree_skb(skb);
+ }
+}
+EXPORT_SYMBOL(__sk_defer_free_flush);
+
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ if (likely(skb->destructor == sock_rfree)) {
+ sock_rfree(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ !llist_empty(&sk->defer_list)) {
+ llist_add(&skb->ll_node, &sk->defer_list);
+ return;
+ }
+ }
+ __kfree_skb(skb);
+}
+
static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1599,7 +1629,7 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
* splitted a fat GRO packet, while we released socket lock
* in skb_splice_bits()
*/
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
}
return NULL;
}
@@ -1633,7 +1663,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
len = skb->len - offset;
/* Stop reading if we hit a patch of urgent data */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - seq;
if (urg_offset < len)
len = urg_offset;
@@ -1665,11 +1695,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
continue;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
WRITE_ONCE(tp->copied_seq, seq);
@@ -2329,7 +2359,7 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
- if (tp->urg_data && tp->urg_seq == *seq) {
+ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
@@ -2372,10 +2402,10 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
break;
if (copied) {
- if (sk->sk_err ||
+ if (!timeo ||
+ sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
- !timeo ||
signal_pending(current))
break;
} else {
@@ -2409,13 +2439,12 @@ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
}
}
- tcp_cleanup_rbuf(sk, copied);
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
- release_sock(sk);
- lock_sock(sk);
+ __sk_flush_backlog(sk);
} else {
+ tcp_cleanup_rbuf(sk, copied);
+ sk_defer_free_flush(sk);
sk_wait_data(sk, &timeo, last);
}
@@ -2435,7 +2464,7 @@ found_ok_skb:
used = len;
/* Do we have urgent data here? */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
@@ -2469,8 +2498,8 @@ found_ok_skb:
tcp_rcv_space_adjust(sk);
skip_copy:
- if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
- tp->urg_data = 0;
+ if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
+ WRITE_ONCE(tp->urg_data, 0);
tcp_fast_path_check(sk);
}
@@ -2485,14 +2514,14 @@ skip_copy:
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
continue;
found_fin_ok:
/* Process the FIN. */
WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
break;
} while (len > 0);
@@ -2534,6 +2563,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
&cmsg_flags);
release_sock(sk);
+ sk_defer_free_flush(sk);
if (cmsg_flags && ret >= 0) {
if (cmsg_flags & TCP_CMSG_TS)
@@ -2964,7 +2994,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
- tp->urg_data = 0;
+ WRITE_ONCE(tp->urg_data, 0);
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
@@ -3059,7 +3089,7 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
}
-
+ sk_defer_free_flush(sk);
sk_error_report(sk);
return 0;
}
@@ -3774,10 +3804,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
tcp_get_info_chrono_stats(tp, info);
info->tcpi_segs_out = tp->segs_out;
- info->tcpi_segs_in = tp->segs_in;
+
+ /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
+ info->tcpi_segs_in = READ_ONCE(tp->segs_in);
+ info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
info->tcpi_min_rtt = tcp_min_rtt(tp);
- info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
@@ -4186,6 +4218,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
&zc, &len, err);
release_sock(sk);
+ sk_defer_free_flush(sk);
if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
goto zerocopy_rcv_cmsg;
switch (len) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 246ab7b5e857..3658b9c3dd2b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5591,7 +5591,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
}
}
- tp->urg_data = TCP_URG_NOTYET;
+ WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
@@ -5604,11 +5604,11 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
struct tcp_sock *tp = tcp_sk(sk);
/* Check if we get a new urgent pointer - normally not. */
- if (th->urg)
+ if (unlikely(th->urg))
tcp_check_urg(sk, th);
/* Do we wait for any urgent data? - normally not... */
- if (tp->urg_data == TCP_URG_NOTYET) {
+ if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
th->syn;
@@ -5617,7 +5617,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
u8 tmp;
if (skb_copy_bits(skb, ptr, &tmp, 1))
BUG();
- tp->urg_data = TCP_URG_VALID | tmp;
+ WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk);
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13d868c43284..3dd19a2bf06c 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1182,7 +1182,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
if (!md5sig)
return -ENOMEM;
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ sk_gso_disable(sk);
INIT_HLIST_HEAD(&md5sig->head);
rcu_assign_pointer(tp->md5sig_info, md5sig);
}
@@ -1620,7 +1620,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
*/
tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
key->key, key->keylen, GFP_ATOMIC);
- sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
+ sk_gso_disable(newsk);
}
#endif
@@ -1800,8 +1800,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
{
- u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
- u32 tail_gso_size, tail_gso_segs;
+ u32 limit, tail_gso_size, tail_gso_segs;
struct skb_shared_info *shinfo;
const struct tcphdr *th;
struct tcphdr *thtail;
@@ -1909,7 +1908,7 @@ no_coalesce:
* to reduce memory overhead, so add a little headroom here.
* Few sockets backlog are possibly concurrently non empty.
*/
- limit += 64*1024;
+ limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
if (unlikely(sk_add_backlog(sk, skb, limit))) {
bh_unlock_sock(sk);
@@ -2103,6 +2102,7 @@ process:
sk_incoming_cpu_update(sk);
+ sk_defer_free_flush(sk);
bh_lock_sock_nested(sk);
tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index fc61cd3fea65..30abde86db45 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -8,6 +8,7 @@
#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
+#include <net/gro.h>
#include <net/tcp.h>
#include <net/protocol.h>
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2e6e5a70168e..5079832af5c1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1359,7 +1359,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
#ifdef CONFIG_TCP_MD5SIG
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ sk_gso_disable(sk);
tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, skb);
}
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8bcecdd6aeda..799b663daf87 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(udp_table);
long sysctl_udp_mem[3] __read_mostly;
EXPORT_SYMBOL(sysctl_udp_mem);
-atomic_long_t udp_memory_allocated;
+atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(udp_memory_allocated);
#define MAX_UDP_PORTS 65536
@@ -459,7 +459,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
struct udp_table *udptable,
struct sk_buff *skb,
__be32 saddr, __be16 sport,
- __be32 daddr, u16 hnum)
+ __be32 daddr, u16 hnum, const int dif)
{
struct sock *sk, *reuse_sk;
bool no_reuseport;
@@ -467,8 +467,8 @@ static struct sock *udp4_lookup_run_bpf(struct net *net,
if (udptable != &udp_table)
return NULL; /* only UDP is supported */
- no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP,
- saddr, sport, daddr, hnum, &sk);
+ no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_UDP, saddr, sport,
+ daddr, hnum, dif, &sk);
if (no_reuseport || IS_ERR_OR_NULL(sk))
return sk;
@@ -504,7 +504,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
/* Lookup redirect from BPF */
if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
sk = udp4_lookup_run_bpf(net, udptable, skb,
- saddr, sport, daddr, hnum);
+ saddr, sport, daddr, hnum, dif);
if (sk) {
result = sk;
goto done;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 86d32a1e62ac..6d1a4bec2614 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -7,6 +7,7 @@
*/
#include <linux/skbuff.h>
+#include <net/gro.h>
#include <net/udp.h>
#include <net/protocol.h>
#include <net/inet_common.h>
@@ -424,6 +425,33 @@ out:
return segs;
}
+static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+
+ /* sk owenrship - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
+
+
#define UDP_GRO_CNT_MAX 64
static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
struct sk_buff *skb)
@@ -600,13 +628,11 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
inet_gro_compute_pseudo);
skip:
NAPI_GRO_CB(skb)->is_ipv6 = 0;
- rcu_read_lock();
if (static_branch_unlikely(&udp_encap_needed_key))
sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
pp = udp_gro_receive(head, skb, uh, sk);
- rcu_read_unlock();
return pp;
flush:
@@ -641,7 +667,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
uh->len = newlen;
- rcu_read_lock();
sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
udp4_lib_lookup_skb, skb, uh->source, uh->dest);
if (sk && udp_sk(sk)->gro_complete) {
@@ -662,7 +687,6 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
} else {
err = udp_gro_complete_segment(skb);
}
- rcu_read_unlock();
if (skb->remcsum_offload)
skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;