summaryrefslogtreecommitdiffstats
path: root/net/dccp/proto.c
diff options
context:
space:
mode:
authorJoanne Koong <joannelkoong@gmail.com>2022-08-22 20:10:21 +0200
committerJakub Kicinski <kuba@kernel.org>2022-08-25 04:30:07 +0200
commit28044fc1d4953b07acec0da4d2fc4784c57ea6fb (patch)
tree1d21356094fbdd9994989e379d3d29b384dbe967 /net/dccp/proto.c
parentnetlink: fix some kernel-doc comments (diff)
downloadlinux-28044fc1d4953b07acec0da4d2fc4784c57ea6fb.tar.xz
linux-28044fc1d4953b07acec0da4d2fc4784c57ea6fb.zip
net: Add a bhash2 table hashed by port and address
The current bind hashtable (bhash) is hashed by port only. In the socket bind path, we have to check for bind conflicts by traversing the specified port's inet_bind_bucket while holding the hashbucket's spinlock (see inet_csk_get_port() and inet_csk_bind_conflict()). In instances where there are tons of sockets hashed to the same port at different addresses, the bind conflict check is time-intensive and can cause softirq cpu lockups, as well as stops new tcp connections since __inet_inherit_port() also contests for the spinlock. This patch adds a second bind table, bhash2, that hashes by port and sk->sk_rcv_saddr (ipv4) and sk->sk_v6_rcv_saddr (ipv6). Searching the bhash2 table leads to significantly faster conflict resolution and less time holding the hashbucket spinlock. Please note a few things: * There can be the case where the a socket's address changes after it has been bound. There are two cases where this happens: 1) The case where there is a bind() call on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6) and then a connect() call. The kernel will assign the socket an address when it handles the connect() 2) In inet_sk_reselect_saddr(), which is called when rebuilding the sk header and a few pre-conditions are met (eg rerouting fails). In these two cases, we need to update the bhash2 table by removing the entry for the old address, and add a new entry reflecting the updated address. * The bhash2 table must have its own lock, even though concurrent accesses on the same port are protected by the bhash lock. Bhash2 must have its own lock to protect against cases where sockets on different ports hash to different bhash hashbuckets but to the same bhash2 hashbucket. This brings up a few stipulations: 1) When acquiring both the bhash and the bhash2 lock, the bhash2 lock will always be acquired after the bhash lock and released before the bhash lock is released. 2) There are no nested bhash2 hashbucket locks. A bhash2 lock is always acquired+released before another bhash2 lock is acquired+released. * The bhash table cannot be superseded by the bhash2 table because for bind requests on INADDR_ANY (ipv4) or IPV6_ADDR_ANY (ipv6), every socket bound to that port must be checked for a potential conflict. The bhash table is the only source of port->socket associations. Signed-off-by: Joanne Koong <joannelkoong@gmail.com> Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net/dccp/proto.c')
-rw-r--r--net/dccp/proto.c34
1 files changed, 29 insertions, 5 deletions
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index e13641c65f88..7cd4a6cc99fc 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1120,6 +1120,12 @@ static int __init dccp_init(void)
SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!dccp_hashinfo.bind_bucket_cachep)
goto out_free_hashinfo2;
+ dccp_hashinfo.bind2_bucket_cachep =
+ kmem_cache_create("dccp_bind2_bucket",
+ sizeof(struct inet_bind2_bucket), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
+ if (!dccp_hashinfo.bind2_bucket_cachep)
+ goto out_free_bind_bucket_cachep;
/*
* Size and allocate the main established and bind bucket
@@ -1150,7 +1156,7 @@ static int __init dccp_init(void)
if (!dccp_hashinfo.ehash) {
DCCP_CRIT("Failed to allocate DCCP established hash table");
- goto out_free_bind_bucket_cachep;
+ goto out_free_bind2_bucket_cachep;
}
for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
@@ -1176,14 +1182,24 @@ static int __init dccp_init(void)
goto out_free_dccp_locks;
}
+ dccp_hashinfo.bhash2 = (struct inet_bind_hashbucket *)
+ __get_free_pages(GFP_ATOMIC | __GFP_NOWARN, bhash_order);
+
+ if (!dccp_hashinfo.bhash2) {
+ DCCP_CRIT("Failed to allocate DCCP bind2 hash table");
+ goto out_free_dccp_bhash;
+ }
+
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
spin_lock_init(&dccp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
+ spin_lock_init(&dccp_hashinfo.bhash2[i].lock);
+ INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain);
}
rc = dccp_mib_init();
if (rc)
- goto out_free_dccp_bhash;
+ goto out_free_dccp_bhash2;
rc = dccp_ackvec_init();
if (rc)
@@ -1207,30 +1223,38 @@ out_ackvec_exit:
dccp_ackvec_exit();
out_free_dccp_mib:
dccp_mib_exit();
+out_free_dccp_bhash2:
+ free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
out_free_dccp_locks:
inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
+out_free_bind2_bucket_cachep:
+ kmem_cache_destroy(dccp_hashinfo.bind2_bucket_cachep);
out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
out_free_hashinfo2:
inet_hashinfo2_free_mod(&dccp_hashinfo);
out_fail:
dccp_hashinfo.bhash = NULL;
+ dccp_hashinfo.bhash2 = NULL;
dccp_hashinfo.ehash = NULL;
dccp_hashinfo.bind_bucket_cachep = NULL;
+ dccp_hashinfo.bind2_bucket_cachep = NULL;
return rc;
}
static void __exit dccp_fini(void)
{
+ int bhash_order = get_order(dccp_hashinfo.bhash_size *
+ sizeof(struct inet_bind_hashbucket));
+
ccid_cleanup_builtins();
dccp_mib_exit();
- free_pages((unsigned long)dccp_hashinfo.bhash,
- get_order(dccp_hashinfo.bhash_size *
- sizeof(struct inet_bind_hashbucket)));
+ free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
+ free_pages((unsigned long)dccp_hashinfo.bhash2, bhash_order);
free_pages((unsigned long)dccp_hashinfo.ehash,
get_order((dccp_hashinfo.ehash_mask + 1) *
sizeof(struct inet_ehash_bucket)));