diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-12-27 22:04:52 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-12-27 22:04:52 +0100 |
commit | e0c38a4d1f196a4b17d2eba36afff8f656a4f1de (patch) | |
tree | b26a69fabef0160adb127416a9744217700feeb7 /net/ipv4/inet_hashtables.c | |
parent | Merge tag 'modules-for-v4.21' of git://git.kernel.org/pub/scm/linux/kernel/gi... (diff) | |
parent | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net (diff) | |
download | linux-e0c38a4d1f196a4b17d2eba36afff8f656a4f1de.tar.xz linux-e0c38a4d1f196a4b17d2eba36afff8f656a4f1de.zip |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) New ipset extensions for matching on destination MAC addresses, from
Stefano Brivio.
2) Add ipv4 ttl and tos, plus ipv6 flow label and hop limit offloads to
nfp driver. From Stefano Brivio.
3) Implement GRO for plain UDP sockets, from Paolo Abeni.
4) Lots of work from Michał Mirosław to eliminate the VLAN_TAG_PRESENT
bit so that we could support the entire vlan_tci value.
5) Rework the IPSEC policy lookups to better optimize more use cases,
from Florian Westphal.
6) Infrastructure changes eliminating direct manipulation of SKB lists
wherever possible, and to always use the appropriate SKB list
helpers. This work is still ongoing...
7) Lots of PHY driver and state machine improvements and
simplifications, from Heiner Kallweit.
8) Various TSO deferral refinements, from Eric Dumazet.
9) Add ntuple filter support to aquantia driver, from Dmitry Bogdanov.
10) Batch dropping of XDP packets in tuntap, from Jason Wang.
11) Lots of cleanups and improvements to the r8169 driver from Heiner
Kallweit, including support for ->xmit_more. This driver has been
getting some much needed love since he started working on it.
12) Lots of new forwarding selftests from Petr Machata.
13) Enable VXLAN learning in mlxsw driver, from Ido Schimmel.
14) Packed ring support for virtio, from Tiwei Bie.
15) Add new Aquantia AQtion USB driver, from Dmitry Bezrukov.
16) Add XDP support to dpaa2-eth driver, from Ioana Ciocoi Radulescu.
17) Implement coalescing on TCP backlog queue, from Eric Dumazet.
18) Implement carrier change in tun driver, from Nicolas Dichtel.
19) Support msg_zerocopy in UDP, from Willem de Bruijn.
20) Significantly improve garbage collection of neighbor objects when
the table has many PERMANENT entries, from David Ahern.
21) Remove egdev usage from nfp and mlx5, and remove the facility
completely from the tree as it no longer has any users. From Oz
Shlomo and others.
22) Add a NETDEV_PRE_CHANGEADDR so that drivers can veto the change and
therefore abort the operation before the commit phase (which is the
NETDEV_CHANGEADDR event). From Petr Machata.
23) Add indirect call wrappers to avoid retpoline overhead, and use them
in the GRO code paths. From Paolo Abeni.
24) Add support for netlink FDB get operations, from Roopa Prabhu.
25) Support bloom filter in mlxsw driver, from Nir Dotan.
26) Add SKB extension infrastructure. This consolidates the handling of
the auxiliary SKB data used by IPSEC and bridge netfilter, and is
designed to support the needs of MPTCP which could be integrated in
the future.
27) Lots of XDP TX optimizations in mlx5 from Tariq Toukan.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1845 commits)
net: dccp: fix kernel crash on module load
drivers/net: appletalk/cops: remove redundant if statement and mask
bnx2x: Fix NULL pointer dereference in bnx2x_del_all_vlans() on some hw
net/net_namespace: Check the return value of register_pernet_subsys()
net/netlink_compat: Fix a missing check of nla_parse_nested
ieee802154: lowpan_header_create check must check daddr
net/mlx4_core: drop useless LIST_HEAD
mlxsw: spectrum: drop useless LIST_HEAD
net/mlx5e: drop useless LIST_HEAD
iptunnel: Set tun_flags in the iptunnel_metadata_reply from src
net/mlx5e: fix semicolon.cocci warnings
staging: octeon: fix build failure with XFRM enabled
net: Revert recent Spectre-v1 patches.
can: af_can: Fix Spectre v1 vulnerability
packet: validate address length if non-zero
nfc: af_nfc: Fix Spectre v1 vulnerability
phonet: af_phonet: Fix Spectre v1 vulnerability
net: core: Fix Spectre v1 vulnerability
net: minor cleanup in skb_ext_add()
net: drop the unused helper skb_ext_get()
...
Diffstat (limited to 'net/ipv4/inet_hashtables.c')
-rw-r--r-- | net/ipv4/inet_hashtables.c | 117 |
1 files changed, 50 insertions, 67 deletions
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 411dd7a90046..942265d65eb3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -65,12 +65,14 @@ static u32 sk_ehashfn(const struct sock *sk) struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, - const unsigned short snum) + const unsigned short snum, + int l3mdev) { struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC); if (tb) { write_pnet(&tb->ib_net, net); + tb->l3mdev = l3mdev; tb->port = snum; tb->fastreuse = 0; tb->fastreuseport = 0; @@ -135,6 +137,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; + int l3mdev; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; @@ -143,6 +146,8 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) return -ENOENT; } if (tb->port != port) { + l3mdev = inet_sk_bound_l3mdev(sk); + /* NOTE: using tproxy and redirecting skbs to a proxy * on a different listener port breaks the assumption * that the listener socket's icsk_bind_hash is the same @@ -150,12 +155,13 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child) * create a new bind bucket for the child here. 
*/ inet_bind_bucket_for_each(tb, &head->chain) { if (net_eq(ib_net(tb), sock_net(sk)) && - tb->port == port) + tb->l3mdev == l3mdev && tb->port == port) break; } if (!tb) { tb = inet_bind_bucket_create(table->bind_bucket_cachep, - sock_net(sk), head, port); + sock_net(sk), head, port, + l3mdev); if (!tb) { spin_unlock(&head->lock); return -ENOMEM; @@ -228,26 +234,16 @@ static inline int compute_score(struct sock *sk, struct net *net, const int dif, const int sdif, bool exact_dif) { int score = -1; - struct inet_sock *inet = inet_sk(sk); - if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && + if (net_eq(sock_net(sk), net) && sk->sk_num == hnum && !ipv6_only_sock(sk)) { - __be32 rcv_saddr = inet->inet_rcv_saddr; + if (sk->sk_rcv_saddr != daddr) + return -1; + + if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) + return -1; + score = sk->sk_family == PF_INET ? 2 : 1; - if (rcv_saddr) { - if (rcv_saddr != daddr) - return -1; - score += 4; - } - if (sk->sk_bound_dev_if || exact_dif) { - bool dev_match = (sk->sk_bound_dev_if == dif || - sk->sk_bound_dev_if == sdif); - - if (!dev_match) - return -1; - if (sk->sk_bound_dev_if) - score += 4; - } if (sk->sk_incoming_cpu == raw_smp_processor_id()) score++; } @@ -303,26 +299,12 @@ struct sock *__inet_lookup_listener(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - unsigned int hash = inet_lhashfn(net, hnum); - struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; - bool exact_dif = inet_exact_dif_match(net, skb); struct inet_listen_hashbucket *ilb2; - struct sock *sk, *result = NULL; - int score, hiscore = 0; + struct sock *result = NULL; unsigned int hash2; - u32 phash = 0; - - if (ilb->count <= 10 || !hashinfo->lhash2) - goto port_lookup; - - /* Too many sk in the ilb bucket (which is hashed by port alone). - * Try lhash2 (which is hashed by port and addr) instead. 
- */ hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet_lhash2_lookup(net, ilb2, skb, doff, saddr, sport, daddr, hnum, @@ -331,34 +313,12 @@ struct sock *__inet_lookup_listener(struct net *net, goto done; /* Lookup lhash2 with INADDR_ANY */ - hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); - if (ilb2->count > ilb->count) - goto port_lookup; result = inet_lhash2_lookup(net, ilb2, skb, doff, - saddr, sport, daddr, hnum, + saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif); - goto done; - -port_lookup: - sk_for_each_rcu(sk, &ilb->head) { - score = compute_score(sk, net, hnum, daddr, - dif, sdif, exact_dif); - if (score > hiscore) { - if (sk->sk_reuseport) { - phash = inet_ehashfn(net, daddr, hnum, - saddr, sport); - result = reuseport_select_sock(sk, phash, - skb, doff); - if (result) - goto done; - } - result = sk; - hiscore = score; - } - } done: if (unlikely(IS_ERR(result))) return NULL; @@ -675,6 +635,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, u32 remaining, offset; int ret, i, low, high; static u32 hint; + int l3mdev; if (port) { head = &hinfo->bhash[inet_bhashfn(net, port, @@ -693,6 +654,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, return ret; } + l3mdev = inet_sk_bound_l3mdev(sk); + inet_get_local_port_range(net, &low, &high); high++; /* [32768, 60999] -> [32768, 61000[ */ remaining = high - low; @@ -719,7 +682,8 @@ other_parity_scan: * the established check is already unique enough. 
*/ inet_bind_bucket_for_each(tb, &head->chain) { - if (net_eq(ib_net(tb), net) && tb->port == port) { + if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev && + tb->port == port) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) goto next_port; @@ -732,7 +696,7 @@ other_parity_scan: } tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, - net, head, port); + net, head, port, l3mdev); if (!tb) { spin_unlock_bh(&head->lock); return -ENOMEM; @@ -798,13 +762,22 @@ void inet_hashinfo_init(struct inet_hashinfo *h) } EXPORT_SYMBOL_GPL(inet_hashinfo_init); +static void init_hashinfo_lhash2(struct inet_hashinfo *h) +{ + int i; + + for (i = 0; i <= h->lhash2_mask; i++) { + spin_lock_init(&h->lhash2[i].lock); + INIT_HLIST_HEAD(&h->lhash2[i].head); + h->lhash2[i].count = 0; + } +} + void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, unsigned long numentries, int scale, unsigned long low_limit, unsigned long high_limit) { - unsigned int i; - h->lhash2 = alloc_large_system_hash(name, sizeof(*h->lhash2), numentries, @@ -814,13 +787,23 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name, &h->lhash2_mask, low_limit, high_limit); + init_hashinfo_lhash2(h); +} - for (i = 0; i <= h->lhash2_mask; i++) { - spin_lock_init(&h->lhash2[i].lock); - INIT_HLIST_HEAD(&h->lhash2[i].head); - h->lhash2[i].count = 0; - } +int inet_hashinfo2_init_mod(struct inet_hashinfo *h) +{ + h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL); + if (!h->lhash2) + return -ENOMEM; + + h->lhash2_mask = INET_LHTABLE_SIZE - 1; + /* INET_LHTABLE_SIZE must be a power of 2 */ + BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask); + + init_hashinfo_lhash2(h); + return 0; } +EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod); int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { |