From 0e71c55c9e3eb32de08e17152ec739582afc9400 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 21 Oct 2012 20:06:56 +0000 Subject: tcp: speedup SIOCINQ ioctl SIOCINQ can use the lock_sock_fast() version to avoid double acquisition of socket lock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b7c2f439b54f..eace049da052 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -536,13 +536,14 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) { struct tcp_sock *tp = tcp_sk(sk); int answ; + bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; - lock_sock(sk); + slow = lock_sock_fast(sk); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else if (sock_flag(sk, SOCK_URGINLINE) || @@ -557,7 +558,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) answ--; } else answ = tp->urg_seq - tp->copied_seq; - release_sock(sk); + unlock_sock_fast(sk, slow); break; case SIOCATMARK: answ = tp->urg_data && tp->urg_seq == tp->copied_seq; -- cgit v1.2.3 From 52e804c6dfaa5df1e4b0e290357b82ad4e4cda2c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 16 Nov 2012 03:03:05 +0000 Subject: net: Allow userns root to control ipv4 Allow an unpriviled user who has created a user namespace, and then created a network namespace to effectively use the new network namespace, by reducing capable(CAP_NET_ADMIN) and capable(CAP_NET_RAW) calls to be ns_capable(net->user_ns, CAP_NET_ADMIN), or capable(net->user_ns, CAP_NET_RAW) calls. Settings that merely control a single network device are allowed. Either the network device is a logical network device where restrictions make no difference or the network device is hardware NIC that has been explicity moved from the initial network namespace. In general policy and network stack state changes are allowed while resource control is left unchanged. Allow creating raw sockets. Allow the SIOCSARP ioctl to control the arp cache. Allow the SIOCSIFFLAG ioctl to allow setting network device flags. Allow the SIOCSIFADDR ioctl to allow setting a netdevice ipv4 address. Allow the SIOCSIFBRDADDR ioctl to allow setting a netdevice ipv4 broadcast address. Allow the SIOCSIFDSTADDR ioctl to allow setting a netdevice ipv4 destination address. Allow the SIOCSIFNETMASK ioctl to allow setting a netdevice ipv4 netmask. Allow the SIOCADDRT and SIOCDELRT ioctls to allow adding and deleting ipv4 routes. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting gre tunnels. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting ipip tunnels. Allow the SIOCADDTUNNEL, SIOCCHGTUNNEL and SIOCDELTUNNEL ioctls for adding, changing and deleting ipsec virtual tunnel interfaces. Allow setting the MRT_INIT, MRT_DONE, MRT_ADD_VIF, MRT_DEL_VIF, MRT_ADD_MFC, MRT_DEL_MFC, MRT_ASSERT, MRT_PIM, MRT_TABLE socket options on multicast routing sockets. Allow setting and receiving IPOPT_CIPSO, IP_OPT_SEC, IP_OPT_SID and arbitrary ip options. Allow setting IP_SEC_POLICY/IP_XFRM_POLICY ipv4 socket option. Allow setting the IP_TRANSPARENT ipv4 socket option. Allow setting the TCP_REPAIR socket option. Allow setting the TCP_CONGESTION socket option. Signed-off-by: "Eric W. Biederman" Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 3 ++- net/ipv4/arp.c | 2 +- net/ipv4/devinet.c | 4 ++-- net/ipv4/fib_frontend.c | 2 +- net/ipv4/ip_gre.c | 4 ++-- net/ipv4/ip_options.c | 6 +++--- net/ipv4/ip_sockglue.c | 5 +++-- net/ipv4/ip_vti.c | 4 ++-- net/ipv4/ipip.c | 4 ++-- net/ipv4/ipmr.c | 2 +- net/ipv4/netfilter/arp_tables.c | 8 ++++---- net/ipv4/netfilter/ip_tables.c | 8 ++++---- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_cong.c | 3 ++- 14 files changed, 30 insertions(+), 27 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d5e5a054123c..4f5f22061e1c 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -346,7 +346,8 @@ lookup_protocol: } err = -EPERM; - if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) + if (sock->type == SOCK_RAW && !kern && + !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; err = -EAFNOSUPPORT; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 47800459e4cb..ce6fbdfd40b8 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1161,7 +1161,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCDARP: case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 417093538916..259622a5e690 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -730,7 +730,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFFLAGS: ret = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; break; case SIOCSIFADDR: /* Set interface address (and family) */ @@ -738,7 +738,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFDSTADDR: /* Set the destination address */ case SIOCSIFNETMASK: /* Set the netmask for the interface */ ret = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto out; ret = -EINVAL; if (sin->sin_family != AF_INET) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index bce4541c6784..784716a677ce 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -488,7 +488,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) switch (cmd) { case SIOCADDRT: /* Add a route */ case SIOCDELRT: /* Delete a route */ - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&rt, arg, sizeof(rt))) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 127f2a1e67f5..a85ae2f7a21c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1064,7 +1064,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EFAULT; @@ -1139,7 +1139,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCDELTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == ign->fb_tunnel_dev) { diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1dc01f9793d5..f6289bf6f332 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -409,7 +409,7 @@ int ip_options_compile(struct net *net, optptr[2] += 8; break; default: - if (!skb && !capable(CAP_NET_RAW)) { + if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr + 3; goto error; } @@ -445,7 +445,7 @@ int ip_options_compile(struct net *net, opt->router_alert = optptr - iph; break; case IPOPT_CIPSO: - if ((!skb && !capable(CAP_NET_RAW)) || opt->cipso) { + if ((!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) || opt->cipso) { pp_ptr = optptr; goto error; } @@ -458,7 +458,7 @@ int ip_options_compile(struct net *net, case IPOPT_SEC: case IPOPT_SID: default: - if (!skb && !capable(CAP_NET_RAW)) { + if (!skb && !ns_capable(net->user_ns, CAP_NET_RAW)) { pp_ptr = optptr; goto error; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 14bbfcf717ac..3c9d20880283 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -989,13 +989,14 @@ mc_msf_out: case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) break; err = xfrm_user_policy(sk, optname, optval, optlen); break; case IP_TRANSPARENT: - if (!!val && !capable(CAP_NET_RAW) && !capable(CAP_NET_ADMIN)) { + if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { err = -EPERM; break; } diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index f4a825d3bd7f..c3a4233c0ac2 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -488,7 +488,7 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EFAULT; @@ -553,7 +553,7 @@ vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCDELTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == ipn->fb_tunnel_dev) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index c26c1717c1db..191fc24a745a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -691,7 +691,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; err = -EFAULT; @@ -735,7 +735,7 @@ ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) case SIOCDELTUNNEL: err = -EPERM; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) goto done; if (dev == ipn->fb_tunnel_dev) { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 6168c4dc58b1..adf3d349566f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1213,7 +1213,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi if (optname != MRT_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && - !capable(CAP_NET_ADMIN)) + !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 97e61eadf580..3ea4127404d6 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1533,7 +1533,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1677,7 +1677,7 @@ static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1698,7 +1698,7 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1722,7 +1722,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 170b1fdd6b72..17c5e06da662 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1846,7 +1846,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1961,7 +1961,7 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -1983,7 +1983,7 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { @@ -2008,7 +2008,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { int ret; - if (!capable(CAP_NET_ADMIN)) + if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; switch (cmd) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4aefa0b42c2e..e6eace1c2bdb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2304,7 +2304,7 @@ void tcp_sock_destruct(struct sock *sk) static inline bool tcp_can_repair_sock(const struct sock *sk) { - return capable(CAP_NET_ADMIN) && + return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED)); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 1432cdb0644c..baf28611b334 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -259,7 +259,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) if (!ca) err = -ENOENT; - else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || capable(CAP_NET_ADMIN))) + else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || + ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) err = -EPERM; else if (!try_module_get(ca->owner)) -- cgit v1.2.3 From fd90b29d757827ab12d6669292612308ec249532 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 30 Nov 2012 10:08:52 +0000 Subject: tcp: change default tcp hash size As time passed, available memory increased faster than number of concurrent tcp sockets. As a result, a machine with 4GB of ram gets a hash table with 524288 slots, using 8388608 bytes of memory. Lets change that by a 16x factor (one slot for 128 KB of ram) Even if a small machine needs a _lot_ of sockets, tcp lookups are now very efficient, using one cache line per socket. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e6eace1c2bdb..1aca02c9911e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3590,8 +3590,7 @@ void __init tcp_init(void) alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, - (totalram_pages >= 128 * 1024) ? - 13 : 15, + 17, /* one slot per 128 KB of memory */ 0, NULL, &tcp_hashinfo.ehash_mask, @@ -3607,8 +3606,7 @@ void __init tcp_init(void) alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_mask + 1, - (totalram_pages >= 128 * 1024) ? - 13 : 15, + 17, /* one slot per 128 KB of memory */ 0, &tcp_hashinfo.bhash_size, NULL, -- cgit v1.2.3 From 02275a2ee7c0ea475b6f4a6428f5df592bc9d30b Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Sun, 2 Dec 2012 11:49:27 +0000 Subject: tcp: don't abort splice() after small transfers TCP coalescing added a regression in splice(socket->pipe) performance, for some workloads because of the way tcp_read_sock() is implemented. The reason for this is the break when (offset + 1 != skb->len). As we released the socket lock, this condition is possible if TCP stack added a fragment to the skb, which can happen with TCP coalescing. So let's go back to the beginning of the loop when this happens, to give a chance to splice more frags per system call. Doing so fixes the issue and makes GRO 10% faster than LRO on CPU-bound splice() workloads instead of the opposite. Signed-off-by: Willy Tarreau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp.c') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1aca02c9911e..8fc5b3bd6075 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1494,15 +1494,19 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, copied += used; offset += used; } - /* - * If recv_actor drops the lock (e.g. TCP splice + /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it * while aggregating skbs from the socket queue. */ - skb = tcp_recv_skb(sk, seq-1, &offset); - if (!skb || (offset+1 != skb->len)) + skb = tcp_recv_skb(sk, seq - 1, &offset); + if (!skb) break; + /* TCP coalescing might have appended data to the skb. + * Try to splice more frags + */ + if (offset + 1 != skb->len) + continue; } if (tcp_hdr(skb)->fin) { sk_eat_skb(sk, skb, false); -- cgit v1.2.3