From 6670e152447732ba90626f36dfc015a13fbf150e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 14 Nov 2017 08:25:49 -0800 Subject: tcp: Namespace-ify sysctl_tcp_default_congestion_control Make default TCP default congestion control to a per namespace value. This changes default congestion control to a pointer to congestion ops (rather than implicit as first element of available lsit). The congestion control setting of new namespaces is inherited from the current setting of the root namespace. Signed-off-by: Stephen Hemminger Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 4 +-- net/ipv4/sysctl_net_ipv4.c | 19 +++++++----- net/ipv4/tcp_cong.c | 76 ++++++++++++++++++++++------------------------ net/ipv4/tcp_ipv4.c | 9 ++++++ net/ipv6/route.c | 3 +- 5 files changed, 60 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 589caaa90613..f04d944f8abe 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) bool ecn_ca = false; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { val = nla_get_u32(nla); } @@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ef0ff3357a44..93e172118a94 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = container_of(ctl->data, struct net, + ipv4.tcp_congestion_control); char val[TCP_CA_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, }; int ret; - tcp_get_default_congestion_control(val); + tcp_get_default_congestion_control(net, val); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = tcp_set_default_congestion_control(val); + ret = tcp_set_default_congestion_control(net, val); return ret; } @@ -447,12 +449,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_congestion_control", - .mode = 0644, - .maxlen = TCP_CA_NAME_MAX, - .proc_handler = proc_tcp_congestion_control, - }, #ifdef CONFIG_NETLABEL { .procname = "cipso_cache_enable", @@ -763,6 +759,13 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &one }, #endif + { + .procname = "tcp_congestion_control", + .data = &init_net.ipv4.tcp_congestion_control, + .mode = 0644, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = proc_tcp_congestion_control, + }, { .procname = "tcp_keepalive_time", .data = &init_net.ipv4.sysctl_tcp_keepalive_time, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 2f26124fd160..bc6c02f16243 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) } /* Must be called with rcu lock held */ -static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) +static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net, + const char *name) { - const struct tcp_congestion_ops *ca = tcp_ca_find(name); + struct tcp_congestion_ops *ca = tcp_ca_find(name); + #ifdef CONFIG_MODULES if (!ca && capable(CAP_NET_ADMIN)) { rcu_read_unlock(); @@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) +u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; u32 key = TCP_CA_UNSPEC; @@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) might_sleep(); rcu_read_lock(); - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(net, name); if (ca) { key = ca->key; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; @@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); /* Assign choice of congestion control. */ void tcp_assign_congestion_control(struct sock *sk) { + struct net *net = sock_net(sk); struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_congestion_ops *ca; + const struct tcp_congestion_ops *ca; rcu_read_lock(); - list_for_each_entry_rcu(ca, &tcp_cong_list, list) { - if (likely(try_module_get(ca->owner))) { - icsk->icsk_ca_ops = ca; - goto out; - } - /* Fallback to next available. The last really - * guaranteed fallback is Reno from this list. - */ - } -out: + ca = rcu_dereference(net->ipv4.tcp_congestion_control); + if (unlikely(!try_module_get(ca->owner))) + ca = &tcp_reno; + icsk->icsk_ca_ops = ca; rcu_read_unlock(); - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) } /* Used by sysctl to change default congestion control */ -int tcp_set_default_congestion_control(const char *name) +int tcp_set_default_congestion_control(struct net *net, const char *name) { struct tcp_congestion_ops *ca; - int ret = -ENOENT; - - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); -#ifdef CONFIG_MODULES - if (!ca && capable(CAP_NET_ADMIN)) { - spin_unlock(&tcp_cong_list_lock); + const struct tcp_congestion_ops *prev; + int ret; - request_module("tcp_%s", name); - spin_lock(&tcp_cong_list_lock); - ca = tcp_ca_find(name); - } -#endif + rcu_read_lock(); + ca = tcp_ca_find_autoload(net, name); + if (!ca) { + ret = -ENOENT; + } else if (!try_module_get(ca->owner)) { + ret = -EBUSY; + } else { + prev = xchg(&net->ipv4.tcp_congestion_control, ca); + if (prev) + module_put(prev->owner); - if (ca) { - ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */ - list_move(&ca->list, &tcp_cong_list); + ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; } - spin_unlock(&tcp_cong_list_lock); + rcu_read_unlock(); return ret; } @@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name) /* Set default value from kernel configuration at bootup */ static int __init tcp_congestion_default(void) { - return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); + return tcp_set_default_congestion_control(&init_net, + CONFIG_DEFAULT_TCP_CONG); } late_initcall(tcp_congestion_default); @@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) } /* Get current default congestion control */ -void tcp_get_default_congestion_control(char *name) +void tcp_get_default_congestion_control(struct net *net, char *name) { - struct tcp_congestion_ops *ca; - /* We will always have reno... */ - BUG_ON(list_empty(&tcp_cong_list)); + const struct tcp_congestion_ops *ca; rcu_read_lock(); - ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + ca = rcu_dereference(net->ipv4.tcp_congestion_control); strncpy(name, ca->name, TCP_CA_NAME_MAX); rcu_read_unlock(); } @@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo if (!load) ca = tcp_ca_find(name); else - ca = __tcp_ca_find_autoload(name); + ca = tcp_ca_find_autoload(sock_net(sk), name); + /* No change asking for existing value */ if (ca == icsk->icsk_ca_ops) { icsk->icsk_ca_setsockopt = 1; goto out; } + if (!ca) { err = -ENOENT; } else if (!load) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1eac84b8044e..c6bc0c4d19c6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) { int cpu; + module_put(net->ipv4.tcp_congestion_control->owner); + for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); free_percpu(net->ipv4.tcp_sk); @@ -2522,6 +2524,13 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Reno is always built in */ + if (!net_eq(net, &init_net) && + try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; + else + net->ipv4.tcp_congestion_control = &tcp_reno; + return 0; fail: tcp_sk_exit(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 70d9659fc1e9..05eb7bc36156 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2378,6 +2378,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp, &ecn_ca); + val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { -- cgit v1.2.3