Diffstat (limited to 'net/ipv4')
71 files changed, 1205 insertions, 932 deletions
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 24eca23c2db3..95a966dd1916 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -5,8 +5,6 @@ * * PF_INET protocol family socket handler. * - * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> @@ -1365,7 +1363,7 @@ static int __init init_ipv4_mibs(void) sizeof(struct udp_mib)) < 0) goto err_udplite_mib; - tcp_mib_init(); + tcp_mib_init(&init_net); return 0; @@ -1481,14 +1479,15 @@ static int __init inet_init(void) * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) - ip_mr_init(); + if (ip_mr_init()) + printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n"); #endif /* * Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) - printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ; + printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ipv4_proc_init(); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 9b539fa9fe18..b043eda60b04 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1,7 +1,5 @@ /* linux/net/ipv4/arp.c * - * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $ - * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), @@ -423,11 +421,12 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) struct rtable *rt; int flag = 0; /*unsigned long now; */ + struct net *net = dev_net(dev); - if (ip_route_output_key(dev_net(dev), &rt, &fl) < 0) + if (ip_route_output_key(net, &rt, &fl) < 0) return 1; if (rt->u.dst.dev != dev) { - NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + NET_INC_STATS_BH(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); @@ -1199,7 +1198,7 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, vo switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); break; default: break; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 0c0c73f368ce..5e6c5a0f3fde 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -52,7 +52,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->sport, usin->sin_port, sk, 1); if (err) { if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return err; } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 79a7ef6209ff..2e667e2f90df 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1,8 +1,6 @@ /* * NET3 IP device support routines. 
* - * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -170,6 +168,8 @@ static struct in_device *inetdev_init(struct net_device *dev) in_dev->dev = dev; if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) goto out_kfree; + if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) + dev_disable_lro(dev); /* Reference in_dev->dev */ dev_hold(dev); /* Account for reference dev->ip_ptr (below) */ @@ -1013,7 +1013,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) memcpy(old, ifa->ifa_label, IFNAMSIZ); memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); if (named++ == 0) - continue; + goto skip; dot = strchr(old, ':'); if (dot == NULL) { sprintf(old, ":%d", named); @@ -1024,6 +1024,8 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) } else { strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); } +skip: + rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } } @@ -1241,6 +1243,8 @@ static void inet_forward_change(struct net *net) read_lock(&dev_base_lock); for_each_netdev(net, dev) { struct in_device *in_dev; + if (on) + dev_disable_lro(dev); rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) @@ -1248,8 +1252,6 @@ static void inet_forward_change(struct net *net) rcu_read_unlock(); } read_unlock(&dev_base_lock); - - rt_cache_flush(0); } static int devinet_conf_proc(ctl_table *ctl, int write, @@ -1335,10 +1337,19 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, if (write && *valp != val) { struct net *net = ctl->extra2; - if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) - inet_forward_change(net); - else if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) - rt_cache_flush(0); + if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { + rtnl_lock(); + if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { + inet_forward_change(net); + } else if (*valp) { + struct ipv4_devconf *cnf = ctl->extra1; + struct in_device *idev = + container_of(cnf, struct in_device, cnf); + dev_disable_lro(idev->dev); + } + rtnl_unlock(); + rt_cache_flush(net, 0); + } } return ret; @@ -1351,9 +1362,10 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write, int *valp = ctl->data; int val = *valp; int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + struct net *net = ctl->extra2; if (write && *valp != val) - rt_cache_flush(0); + rt_cache_flush(net, 0); return ret; } @@ -1364,9 +1376,10 @@ int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, { int ret = devinet_conf_sysctl(table, name, nlen, oldval, oldlenp, newval, newlen); + struct net *net = table->extra2; if (ret == 1) - rt_cache_flush(0); + rt_cache_flush(net, 0); return ret; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0b2ac6a3d903..65c1503f8cc8 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -5,8 +5,6 @@ * * IPv4 Forwarding Information Base: FIB frontend. 
* - * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or @@ -146,7 +144,7 @@ static void fib_flush(struct net *net) } if (flushed) - rt_cache_flush(-1); + rt_cache_flush(net, -1); } /* @@ -899,21 +897,22 @@ static void fib_disable_ip(struct net_device *dev, int force) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); arp_ifdown(dev); } static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; + struct net_device *dev = ifa->ifa_dev->dev; switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); #ifdef CONFIG_IP_ROUTE_MULTIPATH - fib_sync_up(ifa->ifa_dev->dev); + fib_sync_up(dev); #endif - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_del_ifaddr(ifa); @@ -921,9 +920,9 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. Disable IP. */ - fib_disable_ip(ifa->ifa_dev->dev, 1); + fib_disable_ip(dev, 1); } else { - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); } break; } @@ -951,14 +950,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo #ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); #endif - rt_cache_flush(-1); + rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: fib_disable_ip(dev, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: - rt_cache_flush(0); + rt_cache_flush(dev_net(dev), 0); break; } return NOTIFY_DONE; diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 2e2fc3376ac9..c8cac6c7f881 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -5,8 +5,6 @@ * * IPv4 FIB: lookup engine and maintenance routines. 
* - * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or @@ -474,7 +472,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); return 0; @@ -534,7 +532,7 @@ static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) if (new_f) fz->fz_nent++; - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -616,7 +614,7 @@ static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) write_unlock_bh(&fib_hash_lock); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); fn_free_alias(fa, f); if (kill_fn) { fn_free_node(f); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 1fb56876be54..6080d7120821 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -258,9 +258,9 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(4); /* flow */ } -static void fib4_rule_flush_cache(void) +static void fib4_rule_flush_cache(struct fib_rules_ops *ops) { - rt_cache_flush(-1); + rt_cache_flush(ops->fro_net, -1); } static struct fib_rules_ops fib4_rules_ops_template = { diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0d4d72827e4b..ded2ae34eab1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -5,8 +5,6 @@ * * IPv4 Forwarding Information Base: semantics. * - * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index e1600ad8fb0e..f155a66d6ebf 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -22,8 +22,6 @@ * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * - * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $ - * * * Code from fib_hash has been reused which includes the following header: * @@ -1273,7 +1271,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); @@ -1318,7 +1316,7 @@ static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) list_add_tail_rcu(&new_fa->fa_list, (fa ? 
&fa->fa_list : fa_head)); - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, 0); succeeded: @@ -1661,7 +1659,7 @@ static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) trie_leaf_remove(t, l); if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 87397351ddac..ea60ad41008c 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -3,8 +3,6 @@ * * Alan Cox, <alan@redhat.com> * - * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $ - * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version @@ -298,10 +296,10 @@ out: /* * Maintain the counters used in the SNMP statistics for outgoing ICMP */ -void icmp_out_count(unsigned char type) +void icmp_out_count(struct net *net, unsigned char type) { - ICMPMSGOUT_INC_STATS(type); - ICMP_INC_STATS(ICMP_MIB_OUTMSGS); + ICMPMSGOUT_INC_STATS(net, type); + ICMP_INC_STATS(net, ICMP_MIB_OUTMSGS); } /* @@ -765,7 +763,7 @@ static void icmp_unreach(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto out; } @@ -805,7 +803,7 @@ static void icmp_redirect(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb->dev), ICMP_MIB_INERRORS); goto out; } @@ -876,7 +874,7 @@ static void icmp_timestamp(struct sk_buff *skb) out: return; out_err: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(dev_net(skb->dst->dev), ICMP_MIB_INERRORS); goto out; } @@ -975,6 +973,7 @@ int icmp_rcv(struct sk_buff *skb) { struct icmphdr *icmph; struct rtable *rt = skb->rtable; + struct net *net = dev_net(rt->u.dst.dev); if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { int nh; @@ -995,7 +994,7 @@ int icmp_rcv(struct sk_buff *skb) skb_set_network_header(skb, nh); } - ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INMSGS); switch (skb->ip_summed) { case CHECKSUM_COMPLETE: @@ -1013,7 +1012,7 @@ int icmp_rcv(struct sk_buff *skb) icmph = icmp_hdr(skb); - ICMPMSGIN_INC_STATS_BH(icmph->type); + ICMPMSGIN_INC_STATS_BH(net, icmph->type); /* * 18 is the highest 'known' ICMP type. Anything else is a mystery * @@ -1029,9 +1028,6 @@ int icmp_rcv(struct sk_buff *skb) */ if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { - struct net *net; - - net = dev_net(rt->u.dst.dev); /* * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be * silently ignored (we let user decide with a sysctl). @@ -1057,7 +1053,7 @@ drop: kfree_skb(skb); return 0; error: - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); goto drop; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2769dc4a4c84..68e84a933e90 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -8,8 +8,6 @@ * the older version didn't come out right using gcc 2.5.8, the newer one * seems to fall out with gcc 2.6.2. 
* - * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $ - * * Authors: * Alan Cox <Alan.Cox@linux.org> * diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ec834480abe7..bb81c958b744 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -103,7 +103,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) rover = net_random() % remaining + low; do { - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(net, rover, + hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->ib_net == net && tb->port == rover) @@ -130,7 +131,8 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(net, snum, + hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->ib_net == net && tb->port == snum) @@ -336,15 +338,16 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, .uli_u = { .ports = { .sport = inet_sk(sk)->sport, .dport = ireq->rmt_port } } }; + struct net *net = sock_net(sk); security_req_classify_flow(req, &fl); - if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + if (ip_route_output_flow(net, &rt, &fl, sk, 0)) { + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); return NULL; } return &rt->u.dst; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index da97695e7096..c10036e7a463 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1,8 +1,6 @@ /* * inet_diag.c Module for monitoring INET transport protocols sockets. 
* - * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 2023d37b2708..115f53722d20 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -70,7 +70,8 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num, + hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; @@ -95,7 +96,8 @@ EXPORT_SYMBOL(inet_put_port); void __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(inet_sk(child)->num, table->bhash_size); + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num, + table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; @@ -192,7 +194,7 @@ struct sock *__inet_lookup_listener(struct net *net, const struct hlist_head *head; read_lock(&hashinfo->lhash_lock); - head = &hashinfo->listening_hash[inet_lhashfn(hnum)]; + head = &hashinfo->listening_hash[inet_lhashfn(net, hnum)]; if (!hlist_empty(head)) { const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); @@ -225,7 +227,7 @@ struct sock * __inet_lookup_established(struct net *net, /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - unsigned int hash = inet_ehashfn(daddr, hnum, saddr, sport); + unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); struct inet_ehash_bucket *head = inet_ehash_bucket(hashinfo, hash); rwlock_t *lock = inet_ehash_lockp(hashinfo, hash); @@ -265,13 +267,13 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, int dif = sk->sk_bound_dev_if; INET_ADDR_COOKIE(acookie, saddr, daddr) const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + struct net *net = sock_net(sk); + unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); rwlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_node *node; struct inet_timewait_sock *tw; - struct net *net = sock_net(sk); prefetch(head->chain.first); write_lock(lock); @@ -310,11 +312,11 @@ unique: if (twp) { *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... 
*/ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } @@ -438,7 +440,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(net, port, + hinfo->bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -493,7 +496,7 @@ ok: goto out; } - head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)]; tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ce16e9ac24c1..75c2def8f9a0 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -32,7 +32,8 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw, write_unlock(lock); /* Disassociate with bind bucket. */ - bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); @@ -81,7 +82,8 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num, + hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; BUG_TRAP(icsk->icsk_bind_hash); @@ -158,6 +160,9 @@ rescan: __inet_twsk_del_dead_node(tw); spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); +#ifdef CONFIG_NET_NS + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); +#endif inet_twsk_put(tw); killed++; spin_lock(&twdr->death_lock); @@ -176,8 +181,9 @@ rescan: } twdr->tw_count -= killed; - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); - +#ifndef CONFIG_NET_NS + NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITED, killed); +#endif return ret; } @@ -370,6 +376,9 @@ void inet_twdr_twcal_tick(unsigned long data) &twdr->twcal_row[slot]) { __inet_twsk_del_dead_node(tw); __inet_twsk_kill(tw, twdr->hashinfo); +#ifdef CONFIG_NET_NS + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); +#endif inet_twsk_put(tw); killed++; } @@ -393,7 +402,9 @@ void inet_twdr_twcal_tick(unsigned long data) out: if ((twdr->tw_count -= killed) == 0) del_timer(&twdr->tw_timer); - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); +#ifndef CONFIG_NET_NS + NET_ADD_STATS_BH(&init_net, LINUX_MIB_TIMEWAITKILLED, killed); +#endif spin_unlock(&twdr->death_lock); } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index af995198f643..a456ceeac3f2 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -3,8 +3,6 @@ * * This source is covered by the GNU GPL, the same as all kernel sources. * - * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $ - * * Authors: Andrey V. Savochkin <saw@msu.ru> */ diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 4813c39b438b..450016b89a18 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -5,8 +5,6 @@ * * The IP forwarding functionality. 
* - * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $ - * * Authors: see ip.c * * Fixes: @@ -44,7 +42,7 @@ static int ip_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -58,6 +56,9 @@ int ip_forward(struct sk_buff *skb) struct rtable *rt; /* Route we use */ struct ip_options * opt = &(IPCB(skb)->opt); + if (skb_warn_if_lro(skb)) + goto drop; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) goto drop; @@ -87,7 +88,7 @@ int ip_forward(struct sk_buff *skb) if (unlikely(skb->len > dst_mtu(&rt->u.dst) && !skb_is_gso(skb) && (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(rt->u.dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(dst_mtu(&rt->u.dst))); goto drop; @@ -122,7 +123,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 37221f659159..38d38f058018 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,8 +5,6 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $ - * * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox <Alan.Cox@linux.org> * @@ -180,7 +178,7 @@ static void ip_evictor(struct net *net) evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); if (evicted) - IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted); + IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); } /* @@ -189,8 +187,10 @@ static void ip_evictor(struct net *net) static void ip_expire(unsigned long arg) { struct ipq *qp; + struct net *net; qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + net = container_of(qp->q.net, struct net, ipv4.frags); spin_lock(&qp->q.lock); @@ -199,14 +199,12 @@ static void ip_expire(unsigned long arg) ipq_kill(qp); - IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; - struct net *net; - net = container_of(qp->q.net, struct net, ipv4.frags); /* Send an ICMP "Fragment Reassembly Timeout" message. 
*/ if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); @@ -263,7 +261,10 @@ static inline int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments && (end - start) > max; if (rc) { - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + struct net *net; + + net = container_of(qp->q.net, struct net, ipv4.frags); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); } return rc; @@ -547,7 +548,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); - IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; return 0; @@ -562,7 +563,7 @@ out_oversize: "Oversized IP packet from " NIPQUAD_FMT ".\n", NIPQUAD(qp->saddr)); out_fail: - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMFAILS); return err; } @@ -572,9 +573,9 @@ int ip_defrag(struct sk_buff *skb, u32 user) struct ipq *qp; struct net *net; - IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); - net = skb->dev ? dev_net(skb->dev) : dev_net(skb->dst->dev); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); + /* Start by cleaning up the memory. */ if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) ip_evictor(net); @@ -592,7 +593,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) return ret; } - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } @@ -600,7 +601,7 @@ int ip_defrag(struct sk_buff *skb, u32 user) #ifdef CONFIG_SYSCTL static int zero; -static struct ctl_table ip4_frags_ctl_table[] = { +static struct ctl_table ip4_frags_ns_ctl_table[] = { { .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, .procname = "ipfrag_high_thresh", @@ -626,6 +627,10 @@ static struct ctl_table ip4_frags_ctl_table[] = { .proc_handler = &proc_dointvec_jiffies, .strategy = &sysctl_jiffies }, + { } +}; + +static struct ctl_table ip4_frags_ctl_table[] = { { .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, .procname = "ipfrag_secret_interval", @@ -646,22 +651,20 @@ static struct ctl_table ip4_frags_ctl_table[] = { { } }; -static int ip4_frags_ctl_register(struct net *net) +static int ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; - table = ip4_frags_ctl_table; + table = ip4_frags_ns_ctl_table; if (net != &init_net) { - table = kmemdup(table, sizeof(ip4_frags_ctl_table), GFP_KERNEL); + table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; table[0].data = &net->ipv4.frags.high_thresh; table[1].data = &net->ipv4.frags.low_thresh; table[2].data = &net->ipv4.frags.timeout; - table[3].mode &= ~0222; - table[4].mode &= ~0222; } hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); @@ -678,7 +681,7 @@ err_alloc: return -ENOMEM; } -static void ip4_frags_ctl_unregister(struct net *net) +static void ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; @@ -686,13 +689,22 @@ static void ip4_frags_ctl_unregister(struct net *net) unregister_net_sysctl_table(net->ipv4.frags_hdr); kfree(table); } + +static void ip4_frags_ctl_register(void) +{ + register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); +} #else -static inline int ip4_frags_ctl_register(struct net *net) +static inline int ip4_frags_ns_ctl_register(struct net *net) { return 0; } -static inline void ip4_frags_ctl_unregister(struct net *net) +static inline void 
ip4_frags_ns_ctl_unregister(struct net *net) +{ +} + +static inline void ip4_frags_ctl_register(void) { } #endif @@ -716,12 +728,12 @@ static int ipv4_frags_init_net(struct net *net) inet_frags_init_net(&net->ipv4.frags); - return ip4_frags_ctl_register(net); + return ip4_frags_ns_ctl_register(net); } static void ipv4_frags_exit_net(struct net *net) { - ip4_frags_ctl_unregister(net); + ip4_frags_ns_ctl_unregister(net); inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); } @@ -732,6 +744,7 @@ static struct pernet_operations ip4_frags_ops = { void __init ipfrag_init(void) { + ip4_frags_ctl_register(); register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4342cba4ff82..2a61158ea722 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -473,6 +473,8 @@ static int ipgre_rcv(struct sk_buff *skb) read_lock(&ipgre_lock); if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr, key)) != NULL) { + struct net_device_stats *stats = &tunnel->dev->stats; + secpath_reset(skb); skb->protocol = *(__be16*)(h + 2); @@ -497,28 +499,28 @@ static int ipgre_rcv(struct sk_buff *skb) /* Looped back packet, drop it! */ if (skb->rtable->fl.iif == 0) goto drop; - tunnel->stat.multicast++; + stats->multicast++; skb->pkt_type = PACKET_BROADCAST; } #endif if (((flags&GRE_CSUM) && csum) || (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - tunnel->stat.rx_crc_errors++; - tunnel->stat.rx_errors++; + stats->rx_crc_errors++; + stats->rx_errors++; goto drop; } if (tunnel->parms.i_flags&GRE_SEQ) { if (!(flags&GRE_SEQ) || (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { - tunnel->stat.rx_fifo_errors++; - tunnel->stat.rx_errors++; + stats->rx_fifo_errors++; + stats->rx_errors++; goto drop; } tunnel->i_seqno = seqno + 1; } - tunnel->stat.rx_packets++; - tunnel->stat.rx_bytes += skb->len; + stats->rx_packets++; + stats->rx_bytes += skb->len; skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; @@ -540,7 +542,7 @@ drop_nolock: static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->stat; + struct net_device_stats *stats = &tunnel->dev->stats; struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; @@ -554,7 +556,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int mtu; if (tunnel->recursion++) { - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -570,7 +572,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) /* NBMA tunnel */ if (skb->dst == NULL) { - tunnel->stat.tx_fifo_errors++; + stats->tx_fifo_errors++; goto tx_error; } @@ -621,7 +623,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .tos = RT_TOS(tos) } }, .proto = IPPROTO_GRE }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - tunnel->stat.tx_carrier_errors++; + stats->tx_carrier_errors++; goto tx_error; } } @@ -629,7 +631,7 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -954,11 +956,6 @@ done: return err; } -static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev) -{ - return &(((struct ip_tunnel*)netdev_priv(dev))->stat); -} - static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) { struct ip_tunnel *tunnel = 
netdev_priv(dev); @@ -1084,7 +1081,6 @@ static void ipgre_tunnel_setup(struct net_device *dev) dev->uninit = ipgre_tunnel_uninit; dev->destructor = free_netdev; dev->hard_start_xmit = ipgre_tunnel_xmit; - dev->get_stats = ipgre_tunnel_get_stats; dev->do_ioctl = ipgre_tunnel_ioctl; dev->change_mtu = ipgre_tunnel_change_mtu; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index ff77a4a7f9ec..043f640df4b7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) module. * - * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> @@ -232,16 +230,16 @@ static int ip_local_deliver_finish(struct sk_buff *skb) protocol = -ret; goto resubmit; } - IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); } else { if (!raw) { if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); } } else - IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS); kfree_skb(skb); } } @@ -283,7 +281,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) --ANK (980813) */ if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } @@ -292,7 +290,7 @@ static inline int ip_rcv_options(struct sk_buff *skb) opt->optlen = iph->ihl*4 - sizeof(struct iphdr); if (ip_options_compile(dev_net(dev), opt, skb)) { - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); goto drop; } @@ -336,9 +334,11 @@ static int ip_rcv_finish(struct sk_buff *skb) skb->dev); if (unlikely(err)) { if (err == -EHOSTUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INADDRERRORS); else if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); + IP_INC_STATS_BH(dev_net(skb->dev), + IPSTATS_MIB_INNOROUTES); goto drop; } } @@ -359,9 +359,9 @@ static int ip_rcv_finish(struct sk_buff *skb) rt = skb->rtable; if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INMCASTPKTS); else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), IPSTATS_MIB_INBCASTPKTS); return dst_input(skb); @@ -384,10 +384,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if (skb->pkt_type == PACKET_OTHERHOST) goto drop; - IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INRECEIVES); if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto out; } @@ -420,7 +420,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, len = ntohs(iph->tot_len); if (skb->len < len) { - IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); goto drop; } else if (len < (iph->ihl*4)) goto inhdr_error; @@ -430,7 +430,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, * Note this now means skb->len holds ntohs(iph->tot_len). 
*/ if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); goto drop; } @@ -441,7 +441,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, ip_rcv_finish); inhdr_error: - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); out: diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 33126ad2cfdc..be3f18a7a40e 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -5,8 +5,6 @@ * * The options processing module for ip.c * - * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $ - * * Authors: A.N.Kuznetsov * */ diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e527628f56cf..465544f6281a 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -5,8 +5,6 @@ * * The Internet Protocol (IP) output module. * - * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@super.org> @@ -184,9 +182,9 @@ static inline int ip_finish_output2(struct sk_buff *skb) unsigned int hh_len = LL_RESERVED_SPACE(dev); if (rt->rt_type == RTN_MULTICAST) - IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTMCASTPKTS); else if (rt->rt_type == RTN_BROADCAST) - IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTBCASTPKTS); /* Be paranoid, rather than too clever. */ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { @@ -246,7 +244,7 @@ int ip_mc_output(struct sk_buff *skb) /* * If the indicated interface is up and running, send the packet. */ - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -300,7 +298,7 @@ int ip_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; - IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_OUTREQUESTS); skb->dev = dev; skb->protocol = htons(ETH_P_IP); @@ -391,7 +389,7 @@ packet_routed: return ip_local_out(skb); no_route: - IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); return -EHOSTUNREACH; } @@ -453,7 +451,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) iph = ip_hdr(skb); if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(ip_skb_dst_mtu(skb))); kfree_skb(skb); @@ -544,7 +542,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) err = output(skb); if (!err) - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); if (err || !frag) break; @@ -554,7 +552,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) } if (err == 0) { - IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return 0; } @@ -563,7 +561,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) kfree_skb(frag); frag = skb; } - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -675,15 +673,15 @@ slow_path: if (err) goto fail; - IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES); } kfree_skb(skb); - 
IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS); return err; fail: kfree_skb(skb); - IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS); return err; } @@ -1049,7 +1047,7 @@ alloc_new_skb: error: inet->cork.length -= length; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1191,7 +1189,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, error: inet->cork.length -= size; - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); return err; } @@ -1213,6 +1211,7 @@ int ip_push_pending_frames(struct sock *sk) struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct ip_options *opt = NULL; struct rtable *rt = (struct rtable *)inet->cork.dst; struct iphdr *iph; @@ -1282,7 +1281,7 @@ int ip_push_pending_frames(struct sock *sk) skb->dst = dst_clone(&rt->u.dst); if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(((struct icmphdr *) + icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); /* Netfilter gets whole the not fragmented skb. */ @@ -1299,7 +1298,7 @@ out: return err; error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); goto out; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e0514e82308e..105d92a039b9 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -5,8 +5,6 @@ * * The IP to API glue. * - * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $ - * * Authors: see ip.c * * Fixes: diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index ed45037ce9be..b88aa9afa42e 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1,6 +1,4 @@ /* - * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $ - * * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or * user-supplied information to configure own IP address and routes. * diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index af5cb53da5cc..4c6d2caf9203 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -1,8 +1,6 @@ /* * Linux NET3: IP/IP protocol decoder. 
* - * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $ - * * Authors: * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 * @@ -368,8 +366,8 @@ static int ipip_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; - tunnel->stat.rx_packets++; - tunnel->stat.rx_bytes += skb->len; + tunnel->dev->stats.rx_packets++; + tunnel->dev->stats.rx_bytes += skb->len; skb->dev = tunnel->dev; dst_release(skb->dst); skb->dst = NULL; @@ -392,7 +390,7 @@ static int ipip_rcv(struct sk_buff *skb) static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->stat; + struct net_device_stats *stats = &tunnel->dev->stats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; @@ -405,7 +403,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) int mtu; if (tunnel->recursion++) { - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -418,7 +416,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (!dst) { /* NBMA tunnel */ if ((rt = skb->rtable) == NULL) { - tunnel->stat.tx_fifo_errors++; + stats->tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) @@ -433,7 +431,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) .tos = RT_TOS(tos) } }, .proto = IPPROTO_IPIP }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - tunnel->stat.tx_carrier_errors++; + stats->tx_carrier_errors++; goto tx_error_icmp; } } @@ -441,7 +439,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - tunnel->stat.collisions++; + stats->collisions++; goto tx_error; } @@ -451,7 +449,7 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; if (mtu < 68) { - tunnel->stat.collisions++; + stats->collisions++; ip_rt_put(rt); goto tx_error; } @@ -685,11 +683,6 @@ done: return err; } -static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev) -{ - return &(((struct ip_tunnel*)netdev_priv(dev))->stat); -} - static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) @@ -702,7 +695,6 @@ static void ipip_tunnel_setup(struct net_device *dev) { dev->uninit = ipip_tunnel_uninit; dev->hard_start_xmit = ipip_tunnel_xmit; - dev->get_stats = ipip_tunnel_get_stats; dev->do_ioctl = ipip_tunnel_ioctl; dev->change_mtu = ipip_tunnel_change_mtu; dev->destructor = free_netdev; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 11700a4dcd95..033c712c3a5d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -9,8 +9,6 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $ - * * Fixes: * Michael Chastain : Incorrect size of copying. 
* Alan Cox : Added the cache manager code @@ -120,6 +118,31 @@ static struct timer_list ipmr_expire_timer; /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ +static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) +{ + dev_close(dev); + + dev = __dev_get_by_name(&init_net, "tunl0"); + if (dev) { + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + dev->do_ioctl(dev, &ifr, SIOCDELTUNNEL); + set_fs(oldfs); + } +} + static struct net_device *ipmr_new_tunnel(struct vifctl *v) { @@ -161,6 +184,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v) if (dev_open(dev)) goto failure; + dev_hold(dev); } } return dev; @@ -181,26 +205,20 @@ static int reg_vif_num = -1; static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { read_lock(&mrt_lock); - ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(dev))->tx_packets++; + dev->stats.tx_bytes += skb->len; + dev->stats.tx_packets++; ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); read_unlock(&mrt_lock); kfree_skb(skb); return 0; } -static struct net_device_stats *reg_vif_get_stats(struct net_device *dev) -{ - return (struct net_device_stats*)netdev_priv(dev); -} - static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8; dev->flags = IFF_NOARP; dev->hard_start_xmit = reg_vif_xmit; - dev->get_stats = reg_vif_get_stats; dev->destructor = free_netdev; } @@ -209,8 +227,7 @@ static struct net_device *ipmr_reg_vif(void) struct net_device *dev; struct in_device *in_dev; - dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg", - reg_vif_setup); + dev = alloc_netdev(0, "pimreg", reg_vif_setup); if (dev == NULL) return NULL; @@ -234,6 +251,8 @@ static struct net_device *ipmr_reg_vif(void) if (dev_open(dev)) goto failure; + dev_hold(dev); + return dev; failure: @@ -248,9 +267,10 @@ failure: /* * Delete a VIF entry + * @notify: Set to 1, if the caller is a notifier_call */ -static int vif_delete(int vifi) +static int vif_delete(int vifi, int notify) { struct vif_device *v; struct net_device *dev; @@ -293,7 +313,7 @@ static int vif_delete(int vifi) ip_rt_multicast_event(in_dev); } - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) unregister_netdevice(dev); dev_put(dev); @@ -398,6 +418,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock) struct vif_device *v = &vif_table[vifi]; struct net_device *dev; struct in_device *in_dev; + int err; /* Is vif busy ? 
*/ if (VIF_EXISTS(vifi)) @@ -415,18 +436,34 @@ static int vif_add(struct vifctl *vifc, int mrtsock) dev = ipmr_reg_vif(); if (!dev) return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + unregister_netdevice(dev); + dev_put(dev); + return err; + } break; #endif case VIFF_TUNNEL: dev = ipmr_new_tunnel(vifc); if (!dev) return -ENOBUFS; + err = dev_set_allmulti(dev, 1); + if (err) { + ipmr_del_tunnel(dev, vifc); + dev_put(dev); + return err; + } break; case 0: dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr); if (!dev) return -EADDRNOTAVAIL; - dev_put(dev); + err = dev_set_allmulti(dev, 1); + if (err) { + dev_put(dev); + return err; + } break; default: return -EINVAL; @@ -435,7 +472,6 @@ static int vif_add(struct vifctl *vifc, int mrtsock) if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) return -EADDRNOTAVAIL; IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; - dev_set_allmulti(dev, +1); ip_rt_multicast_event(in_dev); /* @@ -458,7 +494,6 @@ static int vif_add(struct vifctl *vifc, int mrtsock) /* And finish update writing critical data */ write_lock_bh(&mrt_lock); - dev_hold(dev); v->dev=dev; #ifdef CONFIG_IP_PIMSM if (v->flags&VIFF_REGISTER) @@ -805,7 +840,7 @@ static void mroute_clean_tables(struct sock *sk) */ for (i=0; i<maxvif; i++) { if (!(vif_table[i].flags&VIFF_STATIC)) - vif_delete(i); + vif_delete(i, 0); } /* @@ -918,7 +953,7 @@ int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int opt if (optname==MRT_ADD_VIF) { ret = vif_add(&vif, sk==mroute_socket); } else { - ret = vif_delete(vif.vifc_vifi); + ret = vif_delete(vif.vifc_vifi, 0); } rtnl_unlock(); return ret; @@ -1097,7 +1132,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v v=&vif_table[0]; for (ct=0;ct<maxvif;ct++,v++) { if (v->dev==dev) - vif_delete(ct); + vif_delete(ct, 1); } return NOTIFY_DONE; } @@ -1143,7 +1178,7 @@ static inline int ipmr_forward_finish(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); if (unlikely(opt->optlen)) ip_forward_options(skb); @@ -1170,8 +1205,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) if (vif->flags & VIFF_REGISTER) { vif->pkt_out++; vif->bytes_out+=skb->len; - ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++; + vif->dev->stats.tx_bytes += skb->len; + vif->dev->stats.tx_packets++; ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); kfree_skb(skb); return; @@ -1206,7 +1241,7 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) to blackhole. */ - IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS); + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); ip_rt_put(rt); goto out_free; } @@ -1230,8 +1265,8 @@ static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) if (vif->flags & VIFF_TUNNEL) { ip_encap(skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. 
--RR */ - ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++; - ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len; + vif->dev->stats.tx_packets++; + vif->dev->stats.tx_bytes += skb->len; } IPCB(skb)->flags |= IPSKB_FORWARDED; @@ -1487,8 +1522,8 @@ int pim_rcv_v1(struct sk_buff * skb) skb->pkt_type = PACKET_HOST; dst_release(skb->dst); skb->dst = NULL; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++; + reg_dev->stats.rx_bytes += skb->len; + reg_dev->stats.rx_packets++; nf_reset(skb); netif_rx(skb); dev_put(reg_dev); @@ -1542,8 +1577,8 @@ static int pim_rcv(struct sk_buff * skb) skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; dst_release(skb->dst); - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len; - ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++; + reg_dev->stats.rx_bytes += skb->len; + reg_dev->stats.rx_packets++; skb->dst = NULL; nf_reset(skb); netif_rx(skb); @@ -1887,16 +1922,36 @@ static struct net_protocol pim_protocol = { * Setup for IP multicast routing */ -void __init ip_mr_init(void) +int __init ip_mr_init(void) { + int err; + mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!mrt_cachep) + return -ENOMEM; + setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0); - register_netdevice_notifier(&ip_mr_notifier); + err = register_netdevice_notifier(&ip_mr_notifier); + if (err) + goto reg_notif_fail; #ifdef CONFIG_PROC_FS - proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops); - proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops); + err = -ENOMEM; + if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops)) + goto proc_vif_fail; + if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops)) + goto proc_cache_fail; #endif + return 0; +reg_notif_fail: + kmem_cache_destroy(mrt_cachep); +#ifdef CONFIG_PROC_FS +proc_vif_fail: + unregister_netdevice_notifier(&ip_mr_notifier); +proc_cache_fail: + proc_net_remove(&init_net, "ip_mr_vif"); +#endif + return err; } diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index 535abe0c45e7..1f1897a1a702 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -1,8 +1,6 @@ /* * ip_vs_app.c: Application module support for IPVS * - * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 65f1ba112752..f8bdae47a77f 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. * - * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 963981a9d501..a7879eafc3b5 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. 
* - * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> @@ -993,7 +991,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, == sysctl_ip_vs_sync_threshold[0])) || ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE))))) + (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || + (cp->state == IP_VS_TCP_S_TIME_WAIT))))) ip_vs_sync_conn(cp); cp->old_state = cp->state; diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 94c5767c8e01..9a5ace0b4dd6 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. * - * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * Julian Anastasov <ja@ssi.bg> diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index dcf5d46aaa5e..8afc1503ed20 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -1,8 +1,6 @@ /* * IPVS: Destination Hashing scheduling module * - * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang <wensong@gnuchina.org> * * Inspired by the consistent hashing scheduler patch from diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index dfa0d713c801..bc04eedd6dbb 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -1,8 +1,6 @@ /* * ip_vs_est.c: simple rate estimator for IPVS * - * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c index 59aa166b7678..c1c758e4f733 100644 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -1,8 +1,6 @@ /* * ip_vs_ftp.c: IPVS ftp application module * - * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 3888642706ad..0efa3db4b180 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -1,8 +1,6 @@ /* * IPVS: Locality-Based Least-Connection scheduling module * - * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang <wensong@gnuchina.org> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index daa260eb21cf..8e3bbeb45138 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -1,8 +1,6 @@ /* * IPVS: Locality-Based Least-Connection with Replication scheduler * - * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang <wensong@gnuchina.org> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c index d88fef90a641..ac9f08e065d5 100644 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -1,8 +1,6 @@ /* * IPVS: Least-Connection Scheduling module * - * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * 
* This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c index bc2a9e5f2a7b..a46bf258d420 100644 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -1,8 +1,6 @@ /* * IPVS: Never Queue scheduling module * - * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c index 4b1c16cbb16b..876714f23d65 100644 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -1,8 +1,6 @@ /* * ip_vs_proto.c: transport protocol load balancing support for IPVS * - * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 4bf835e1d86d..73e0ea87c1f5 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -1,8 +1,6 @@ /* * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS * - * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ - * * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 * Wensong Zhang <wensong@linuxvirtualserver.org> * diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index db6a6b7b1a0b..21d70c8ffa54 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -1,8 +1,6 @@ /* * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS * - * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ - * * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 * Wensong Zhang <wensong@linuxvirtualserver.org> * diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index b83dc14b0a4d..d0ea467986a0 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -1,8 +1,6 @@ /* * ip_vs_proto_tcp.c: TCP load balancing support for IPVS * - * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 75771cb3cd6f..c6be5d56823f 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -1,8 +1,6 @@ /* * ip_vs_proto_udp.c: UDP load balancing support for IPVS * - * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c index 433f8a947924..c8db12d39e61 100644 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -1,8 +1,6 @@ /* * IPVS: Round-Robin Scheduling module * - * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c index 121a32b1b756..b64767309855 100644 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. 
* - * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c index dd7c128f9db3..2a7d31358181 100644 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -1,8 +1,6 @@ /* * IPVS: Shortest Expected Delay scheduling module * - * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 1b25b00ef1e1..b8fdfac65001 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -1,8 +1,6 @@ /* * IPVS: Source Hashing scheduling module * - * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang <wensong@gnuchina.org> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index eff54efe0351..45e9bd96c286 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -5,8 +5,6 @@ * high-performance and highly available server based on a * cluster of servers. * - * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * ip_vs_sync: sync connection info from master load balancer to backups @@ -29,10 +27,12 @@ #include <linux/in.h> #include <linux/igmp.h> /* for ip_mc_join_group */ #include <linux/udp.h> +#include <linux/err.h> +#include <linux/kthread.h> +#include <linux/wait.h> #include <net/ip.h> #include <net/sock.h> -#include <asm/uaccess.h> /* for get_fs and set_fs */ #include <net/ip_vs.h> @@ -68,8 +68,8 @@ struct ip_vs_sync_conn_options { }; struct ip_vs_sync_thread_data { - struct completion *startup; - int state; + struct socket *sock; + char *buf; }; #define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) @@ -140,18 +140,19 @@ volatile int ip_vs_backup_syncid = 0; char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +/* sync daemon tasks */ +static struct task_struct *sync_master_thread; +static struct task_struct *sync_backup_thread; + /* multicast addr */ -static struct sockaddr_in mcast_addr; +static struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = __constant_htons(IP_VS_SYNC_PORT), + .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), +}; -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) -{ - spin_lock(&ip_vs_sync_lock); - list_add_tail(&sb->list, &ip_vs_sync_queue); - spin_unlock(&ip_vs_sync_lock); -} - -static inline struct ip_vs_sync_buff * sb_dequeue(void) +static inline struct ip_vs_sync_buff *sb_dequeue(void) { struct ip_vs_sync_buff *sb; @@ -195,6 +196,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) + list_add_tail(&sb->list, &ip_vs_sync_queue); + else + ip_vs_sync_buff_release(sb); + spin_unlock(&ip_vs_sync_lock); +} + /* * Get the current sync buffer if it has been created for more * than the specified time or the specified time is zero. 
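The relocation of sb_queue_tail() below ip_vs_sync_buff_release() above is the functional part of that hunk: the IP_VS_STATE_MASTER test now happens under ip_vs_sync_lock, the same lock the stop path (stop_sync_thread(), later in this patch) holds while clearing the state bit, so a buffer can no longer slip into ip_vs_sync_queue after the master daemon has drained it for the last time; it is released instead. A minimal user-space sketch of the same check-under-the-queue-lock pattern, with pthreads standing in for the kernel spinlock and every name illustrative:

#include <pthread.h>
#include <stdlib.h>

struct buf { struct buf *next; /* ... payload ... */ };

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct buf *queue_head;   /* guarded by queue_lock */
static int master_running = 1;   /* also guarded by queue_lock */

/* Producer: queue only while the consumer still exists. */
void queue_tail(struct buf *b)
{
	pthread_mutex_lock(&queue_lock);
	if (master_running) {
		b->next = queue_head;  /* LIFO for brevity; the kernel adds at the tail */
		queue_head = b;
		b = NULL;              /* ownership transferred to the queue */
	}
	pthread_mutex_unlock(&queue_lock);
	free(b);                       /* free(NULL) is a no-op */
}

/* Stop path: flip the flag under the same lock, then drain. */
void stop_consumer(void)
{
	pthread_mutex_lock(&queue_lock);
	master_running = 0;            /* subsequent queue_tail() calls free */
	pthread_mutex_unlock(&queue_lock);
	/* ... stop the consumer and free anything still queued ... */
}

Clearing the flag outside the lock would leave a window in which a racing producer queues a buffer after the final drain; that unreclaimable buffer is exactly the leak the kernel hunk closes.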
@@ -574,14 +585,17 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) static struct socket * make_send_sock(void) { struct socket *sock; + int result; /* First create a socket */ - if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { IP_VS_ERR("Error during creation of socket; terminating\n"); - return NULL; + return ERR_PTR(result); } - if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) { + result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); + if (result < 0) { IP_VS_ERR("Error setting outbound mcast interface\n"); goto error; } @@ -589,14 +603,15 @@ static struct socket * make_send_sock(void) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); - if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) { + result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); + if (result < 0) { IP_VS_ERR("Error binding address of the mcast interface\n"); goto error; } - if (sock->ops->connect(sock, - (struct sockaddr*)&mcast_addr, - sizeof(struct sockaddr), 0) < 0) { + result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr), 0); + if (result < 0) { IP_VS_ERR("Error connecting to the multicast addr\n"); goto error; } @@ -605,7 +620,7 @@ static struct socket * make_send_sock(void) error: sock_release(sock); - return NULL; + return ERR_PTR(result); } @@ -615,27 +630,30 @@ static struct socket * make_send_sock(void) static struct socket * make_receive_sock(void) { struct socket *sock; + int result; /* First create a socket */ - if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { IP_VS_ERR("Error during creation of socket; terminating\n"); - return NULL; + return ERR_PTR(result); } /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = 1; - if (sock->ops->bind(sock, - (struct sockaddr*)&mcast_addr, - sizeof(struct sockaddr)) < 0) { + result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr)); + if (result < 0) { IP_VS_ERR("Error binding to the multicast addr\n"); goto error; } /* join the multicast group */ - if (join_mcast_group(sock->sk, - (struct in_addr*)&mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn) < 0) { + result = join_mcast_group(sock->sk, + (struct in_addr *) &mcast_addr.sin_addr, + ip_vs_backup_mcast_ifn); + if (result < 0) { IP_VS_ERR("Error joining to the multicast group\n"); goto error; } @@ -644,7 +662,7 @@ static struct socket * make_receive_sock(void) error: sock_release(sock); - return NULL; + return ERR_PTR(result); } @@ -702,44 +720,29 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) } -static DECLARE_WAIT_QUEUE_HEAD(sync_wait); -static pid_t sync_master_pid = 0; -static pid_t sync_backup_pid = 0; - -static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait); -static int stop_master_sync = 0; -static int stop_backup_sync = 0; - -static void sync_master_loop(void) +static int sync_thread_master(void *data) { - struct socket *sock; + struct ip_vs_sync_thread_data *tinfo = data; struct ip_vs_sync_buff *sb; - /* create the sending multicast socket */ - sock = make_send_sock(); - if (!sock) - return; - IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d\n", ip_vs_master_mcast_ifn, ip_vs_master_syncid); - for (;;) { - while ((sb=sb_dequeue())) { - ip_vs_send_sync_msg(sock, sb->mesg); + while (!kthread_should_stop()) { 
+ while ((sb = sb_dequeue())) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); ip_vs_sync_buff_release(sb); } /* check if entries stay in curr_sb for 2 seconds */ - if ((sb = get_curr_sync_buff(2*HZ))) { - ip_vs_send_sync_msg(sock, sb->mesg); + sb = get_curr_sync_buff(2 * HZ); + if (sb) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); ip_vs_sync_buff_release(sb); } - if (stop_master_sync) - break; - - msleep_interruptible(1000); + schedule_timeout_interruptible(HZ); } /* clean up the sync_buff queue */ @@ -753,267 +756,175 @@ static void sync_master_loop(void) } /* release the sending multicast socket */ - sock_release(sock); + sock_release(tinfo->sock); + kfree(tinfo); + + return 0; } -static void sync_backup_loop(void) +static int sync_thread_backup(void *data) { - struct socket *sock; - char *buf; + struct ip_vs_sync_thread_data *tinfo = data; int len; - if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) { - IP_VS_ERR("sync_backup_loop: kmalloc error\n"); - return; - } - - /* create the receiving multicast socket */ - sock = make_receive_sock(); - if (!sock) - goto out; - IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d\n", ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); - for (;;) { - /* do you have data now? */ - while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) { - if ((len = - ip_vs_receive(sock, buf, - sync_recv_mesg_maxlen)) <= 0) { + while (!kthread_should_stop()) { + wait_event_interruptible(*tinfo->sock->sk->sk_sleep, + !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) + || kthread_should_stop()); + + /* do we have data now? */ + while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + len = ip_vs_receive(tinfo->sock, tinfo->buf, + sync_recv_mesg_maxlen); + if (len <= 0) { IP_VS_ERR("receiving message error\n"); break; } - /* disable bottom half, because it accessed the data + + /* disable bottom half, because it accesses the data shared by softirq while getting/creating conns */ local_bh_disable(); - ip_vs_process_message(buf, len); + ip_vs_process_message(tinfo->buf, len); local_bh_enable(); } - - if (stop_backup_sync) - break; - - msleep_interruptible(1000); } /* release the sending multicast socket */ - sock_release(sock); + sock_release(tinfo->sock); + kfree(tinfo->buf); + kfree(tinfo); - out: - kfree(buf); + return 0; } -static void set_sync_pid(int sync_state, pid_t sync_pid) -{ - if (sync_state == IP_VS_STATE_MASTER) - sync_master_pid = sync_pid; - else if (sync_state == IP_VS_STATE_BACKUP) - sync_backup_pid = sync_pid; -} - -static void set_stop_sync(int sync_state, int set) +int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) { - if (sync_state == IP_VS_STATE_MASTER) - stop_master_sync = set; - else if (sync_state == IP_VS_STATE_BACKUP) - stop_backup_sync = set; - else { - stop_master_sync = set; - stop_backup_sync = set; - } -} + struct ip_vs_sync_thread_data *tinfo; + struct task_struct **realtask, *task; + struct socket *sock; + char *name, *buf = NULL; + int (*threadfn)(void *data); + int result = -ENOMEM; -static int sync_thread(void *startup) -{ - DECLARE_WAITQUEUE(wait, current); - mm_segment_t oldmm; - int state; - const char *name; - struct ip_vs_sync_thread_data *tinfo = startup; + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + sizeof(struct ip_vs_sync_conn)); - /* increase the module use count */ - ip_vs_use_count_inc(); + if (state == IP_VS_STATE_MASTER) { + if (sync_master_thread) + return -EEXIST; - if 
(ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) { - state = IP_VS_STATE_MASTER; + strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, + sizeof(ip_vs_master_mcast_ifn)); + ip_vs_master_syncid = syncid; + realtask = &sync_master_thread; name = "ipvs_syncmaster"; - } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) { - state = IP_VS_STATE_BACKUP; + threadfn = sync_thread_master; + sock = make_send_sock(); + } else if (state == IP_VS_STATE_BACKUP) { + if (sync_backup_thread) + return -EEXIST; + + strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, + sizeof(ip_vs_backup_mcast_ifn)); + ip_vs_backup_syncid = syncid; + realtask = &sync_backup_thread; name = "ipvs_syncbackup"; + threadfn = sync_thread_backup; + sock = make_receive_sock(); } else { - IP_VS_BUG(); - ip_vs_use_count_dec(); return -EINVAL; } - daemonize(name); - - oldmm = get_fs(); - set_fs(KERNEL_DS); - - /* Block all signals */ - spin_lock_irq(¤t->sighand->siglock); - siginitsetinv(¤t->blocked, 0); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto out; + } - /* set the maximum length of sync message */ set_sync_mesg_maxlen(state); + if (state == IP_VS_STATE_BACKUP) { + buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); + if (!buf) + goto outsocket; + } - /* set up multicast address */ - mcast_addr.sin_family = AF_INET; - mcast_addr.sin_port = htons(IP_VS_SYNC_PORT); - mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP); - - add_wait_queue(&sync_wait, &wait); - - set_sync_pid(state, task_pid_nr(current)); - complete(tinfo->startup); - - /* - * once we call the completion queue above, we should - * null out that reference, since its allocated on the - * stack of the creating kernel thread - */ - tinfo->startup = NULL; - - /* processing master/backup loop here */ - if (state == IP_VS_STATE_MASTER) - sync_master_loop(); - else if (state == IP_VS_STATE_BACKUP) - sync_backup_loop(); - else IP_VS_BUG(); - - remove_wait_queue(&sync_wait, &wait); - - /* thread exits */ - - /* - * If we weren't explicitly stopped, then we - * exited in error, and should undo our state - */ - if ((!stop_master_sync) && (!stop_backup_sync)) - ip_vs_sync_state -= tinfo->state; + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outbuf; - set_sync_pid(state, 0); - IP_VS_INFO("sync thread stopped!\n"); + tinfo->sock = sock; + tinfo->buf = buf; - set_fs(oldmm); + task = kthread_run(threadfn, tinfo, name); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } - /* decrease the module use count */ - ip_vs_use_count_dec(); + /* mark as active */ + *realtask = task; + ip_vs_sync_state |= state; - set_stop_sync(state, 0); - wake_up(&stop_sync_wait); + /* increase the module use count */ + ip_vs_use_count_inc(); - /* - * we need to free the structure that was allocated - * for us in start_sync_thread - */ - kfree(tinfo); return 0; -} - - -static int fork_sync_thread(void *startup) -{ - pid_t pid; - - /* fork the sync thread here, then the parent process of the - sync thread is the init process after this thread exits. */ - repeat: - if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) { - IP_VS_ERR("could not create sync_thread due to %d... 
" - "retrying.\n", pid); - msleep_interruptible(1000); - goto repeat; - } - return 0; +outtinfo: + kfree(tinfo); +outbuf: + kfree(buf); +outsocket: + sock_release(sock); +out: + return result; } -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +int stop_sync_thread(int state) { - DECLARE_COMPLETION_ONSTACK(startup); - pid_t pid; - struct ip_vs_sync_thread_data *tinfo; - - if ((state == IP_VS_STATE_MASTER && sync_master_pid) || - (state == IP_VS_STATE_BACKUP && sync_backup_pid)) - return -EEXIST; - - /* - * Note that tinfo will be freed in sync_thread on exit - */ - tinfo = kmalloc(sizeof(struct ip_vs_sync_thread_data), GFP_KERNEL); - if (!tinfo) - return -ENOMEM; - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); - ip_vs_sync_state |= state; if (state == IP_VS_STATE_MASTER) { - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - } else { - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - } - - tinfo->state = state; - tinfo->startup = &startup; - - repeat: - if ((pid = kernel_thread(fork_sync_thread, tinfo, 0)) < 0) { - IP_VS_ERR("could not create fork_sync_thread due to %d... " - "retrying.\n", pid); - msleep_interruptible(1000); - goto repeat; - } - - wait_for_completion(&startup); - - return 0; -} + if (!sync_master_thread) + return -ESRCH; + IP_VS_INFO("stopping master sync thread %d ...\n", + task_pid_nr(sync_master_thread)); -int stop_sync_thread(int state) -{ - DECLARE_WAITQUEUE(wait, current); + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. + */ - if ((state == IP_VS_STATE_MASTER && !sync_master_pid) || - (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) - return -ESRCH; + spin_lock(&ip_vs_sync_lock); + ip_vs_sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock(&ip_vs_sync_lock); + kthread_stop(sync_master_thread); + sync_master_thread = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + if (!sync_backup_thread) + return -ESRCH; + + IP_VS_INFO("stopping backup sync thread %d ...\n", + task_pid_nr(sync_backup_thread)); + + ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; + kthread_stop(sync_backup_thread); + sync_backup_thread = NULL; + } else { + return -EINVAL; + } - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_INFO("stopping sync thread %d ...\n", - (state == IP_VS_STATE_MASTER) ? 
- sync_master_pid : sync_backup_pid); - - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&stop_sync_wait, &wait); - set_stop_sync(state, 1); - ip_vs_sync_state -= state; - wake_up(&sync_wait); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&stop_sync_wait, &wait); - - /* Note: no need to reap the sync thread, because its parent - process is the init process */ - - if ((state == IP_VS_STATE_MASTER && stop_master_sync) || - (state == IP_VS_STATE_BACKUP && stop_backup_sync)) - IP_VS_BUG(); + /* decrease the module use count */ + ip_vs_use_count_dec(); return 0; } diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c index 8a9d913261d8..772c3cb4eca1 100644 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -1,8 +1,6 @@ /* * IPVS: Weighted Least-Connection Scheduling module * - * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c index 85c680add6df..1d6932d7dc97 100644 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -1,8 +1,6 @@ /* * IPVS: Weighted Round-Robin Scheduling module * - * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index f63006caea03..9892d4aca42e 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -1,8 +1,6 @@ /* * ip_vs_xmit.c: various packet transmitters for IPVS * - * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ - * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2767841a8cef..f23e60c93ef9 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -213,8 +213,7 @@ config IP_NF_TARGET_NETMAP help NETMAP is an implementation of static 1:1 NAT mapping of network addresses. It maps the network address part, while keeping the host - address part intact. It is similar to Fast NAT, except that - Netfilter's connection tracking doesn't work well with Fast NAT. + address part intact. To compile it as a module, choose M here. If unsure, say N. @@ -365,6 +364,18 @@ config IP_NF_RAW If you want to compile it as a module, say M here and read <file:Documentation/kbuild/modules.txt>. If unsure, say `N'. +# security table for MAC policy +config IP_NF_SECURITY + tristate "Security table" + depends on IP_NF_IPTABLES + depends on SECURITY + default m if NETFILTER_ADVANCED=n + help + This option adds a `security' table to iptables, for use + with Mandatory Access Control (MAC) policy. + + If unsure, say N. 
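Stepping back to the ip_vs_sync.c rewrite that finishes above: the daemonize()/double-kernel_thread() scheme is gone in favour of the kthread API, which propagates creation failures (note make_send_sock()/make_receive_sock() now returning ERR_PTR values) and makes shutdown synchronous, since kthread_stop() both signals kthread_should_stop() and waits for the thread function to return. The lifecycle, reduced to a skeleton (worker names are illustrative; the kthread_* calls are the real API):

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *worker_task;

static int worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* ... one unit of work, e.g. drain a queue ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;	/* becomes the return value of kthread_stop() */
}

static int worker_start(void)
{
	struct task_struct *task;

	task = kthread_run(worker_fn, NULL, "my_worker");
	if (IS_ERR(task))
		return PTR_ERR(task);	/* no thread was created */
	worker_task = task;
	return 0;
}

static void worker_stop(void)
{
	kthread_stop(worker_task);	/* blocks until worker_fn() returns */
	worker_task = NULL;
}

Compared with the removed code there is no completion handshake, no manual signal blocking, and no orphaned child for init to reap: the task_struct pointer is the handle for the thread's whole lifetime.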
+ # ARP tables config IP_NF_ARPTABLES tristate "ARP tables support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index d9b92fbf5579..3f31291f37ce 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o obj-$(CONFIG_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o +obj-$(CONFIG_IP_NF_SECURITY) += iptable_security.o # matches obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 26a37cedcf2e..aa33a4a7a715 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -156,7 +156,6 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) case IPQ_COPY_META: case IPQ_COPY_NONE: size = NLMSG_SPACE(sizeof(*pmsg)); - data_len = 0; break; case IPQ_COPY_PACKET: @@ -224,8 +223,6 @@ ipq_build_packet_message(struct nf_queue_entry *entry, int *errp) return skb; nlmsg_failure: - if (skb) - kfree_skb(skb); *errp = -EINVAL; printk(KERN_ERR "ip_queue: error creating packet message\n"); return NULL; diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c new file mode 100644 index 000000000000..2b472ac2263a --- /dev/null +++ b/net/ipv4/netfilter/iptable_security.c @@ -0,0 +1,180 @@ +/* + * "security" table + * + * This is for use by Mandatory Access Control (MAC) security models, + * which need to be able to manage security policy in separate context + * to DAC. + * + * Based on iptable_mangle.c + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org> + * Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/module.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <net/ip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris <at> redhat.com>"); +MODULE_DESCRIPTION("iptables security table, for MAC rules"); + +#define SECURITY_VALID_HOOKS (1 << NF_INET_LOCAL_IN) | \ + (1 << NF_INET_FORWARD) | \ + (1 << NF_INET_LOCAL_OUT) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __initdata = { + .repl = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .num_entries = 4, + .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + .hook_entry = { + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + }, + .underflow = { + [NF_INET_LOCAL_IN] = 0, + [NF_INET_FORWARD] = sizeof(struct ipt_standard), + [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, + }, + }, + .entries = { + IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ + IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ + IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ + }, + .term = IPT_ERROR_INIT, /* ERROR */ +}; + +static struct xt_table security_table = { + .name = "security", + .valid_hooks = SECURITY_VALID_HOOKS, + .lock = __RW_LOCK_UNLOCKED(security_table.lock), + .me = THIS_MODULE, + .af = AF_INET, +}; + +static unsigned int +ipt_local_in_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + nf_local_in_net(in, out)->ipv4.iptable_security); +} + +static unsigned int +ipt_forward_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(skb, hook, in, out, + nf_forward_net(in, out)->ipv4.iptable_security); +} + +static unsigned int +ipt_local_out_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* Somebody is playing with raw sockets. 
*/ + if (skb->len < sizeof(struct iphdr) + || ip_hdrlen(skb) < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk(KERN_INFO "iptable_security: ignoring short " + "SOCK_RAW packet.\n"); + return NF_ACCEPT; + } + return ipt_do_table(skb, hook, in, out, + nf_local_out_net(in, out)->ipv4.iptable_security); +} + +static struct nf_hook_ops ipt_ops[] __read_mostly = { + { + .hook = ipt_local_in_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_SECURITY, + }, + { + .hook = ipt_forward_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = NF_IP_PRI_SECURITY, + }, + { + .hook = ipt_local_out_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP_PRI_SECURITY, + }, +}; + +static int __net_init iptable_security_net_init(struct net *net) +{ + net->ipv4.iptable_security = + ipt_register_table(net, &security_table, &initial_table.repl); + + if (IS_ERR(net->ipv4.iptable_security)) + return PTR_ERR(net->ipv4.iptable_security); + + return 0; +} + +static void __net_exit iptable_security_net_exit(struct net *net) +{ + ipt_unregister_table(net->ipv4.iptable_security); +} + +static struct pernet_operations iptable_security_net_ops = { + .init = iptable_security_net_init, + .exit = iptable_security_net_exit, +}; + +static int __init iptable_security_init(void) +{ + int ret; + + ret = register_pernet_subsys(&iptable_security_net_ops); + if (ret < 0) + return ret; + + ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + if (ret < 0) + goto cleanup_table; + + return ret; + +cleanup_table: + unregister_pernet_subsys(&iptable_security_net_ops); + return ret; +} + +static void __exit iptable_security_fini(void) +{ + nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + unregister_pernet_subsys(&iptable_security_net_ops); +} + +module_init(iptable_security_init); +module_exit(iptable_security_fini); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 78ab19accace..97791048fa9b 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -87,9 +87,8 @@ static int icmp_packet(struct nf_conn *ct, means this will only run once even if count hits zero twice (theoretically possible with SMP) */ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - if (atomic_dec_and_test(&ct->proto.icmp.count) - && del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); + if (atomic_dec_and_test(&ct->proto.icmp.count)) + nf_ct_kill_acct(ct, ctinfo, skb); } else { atomic_inc(&ct->proto.icmp.count); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 552169b41b16..eb5cee279c5f 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -7,8 +7,6 @@ * PROC file system. It is mainly used for debugging and * statistics. * - * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $ - * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de> * Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de> diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 971ab9356e51..ea50da0649fd 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -5,8 +5,6 @@ * * INET protocol dispatch tables. * - * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $ - * * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 37a1ecd9d600..cd975743bcd2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -5,8 +5,6 @@ * * RAW - implementation of IP "raw" sockets. * - * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * @@ -322,6 +320,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); struct iphdr *iph; struct sk_buff *skb; unsigned int iphlen; @@ -370,7 +369,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } if (iph->protocol == IPPROTO_ICMP) - icmp_out_count(((struct icmphdr *) + icmp_out_count(net, ((struct icmphdr *) skb_transport_header(skb))->type); err = NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, rt->u.dst.dev, @@ -386,7 +385,7 @@ error_fault: err = -EFAULT; kfree_skb(skb); error: - IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); return err; } @@ -608,12 +607,11 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int raw_destroy(struct sock *sk) +static void raw_destroy(struct sock *sk) { lock_sock(sk); ip_flush_pending_frames(sk); release_sock(sk); - return 0; } /* This gets rid of all the nasties in af_inet. -DaveM */ @@ -947,7 +945,7 @@ static int raw_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_printf(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " - "inode drops\n"); + "inode ref pointer drops\n"); else raw_sock_seq_show(seq, v, raw_seq_private(seq)->bucket); return 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 96be336064fb..e4ab0ac94f92 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -5,8 +5,6 @@ * * ROUTE - implementation of the IP router. * - * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> @@ -134,7 +132,6 @@ static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; static void rt_worker_func(struct work_struct *work); static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); -static struct timer_list rt_secret_timer; /* * Interface to generic destination cache. 
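The route.c hunks that follow replace the global rt_genid with a per-namespace generation counter: each cached rtable records the counter value current at insertion, rt_is_expired() compares that snapshot against the owning namespace's counter, and a cache flush becomes a counter bump, with stale entries skipped on lookup and reaped lazily. The pattern in isolation, as a compilable C11 sketch with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>

struct cache {
	atomic_int genid;	/* plays the role of net->ipv4.rt_genid */
};

struct entry {
	int genid;		/* snapshot taken at insertion time */
	/* ... cached data ... */
};

void entry_insert(struct cache *c, struct entry *e)
{
	e->genid = atomic_load(&c->genid);
}

bool entry_is_expired(struct cache *c, const struct entry *e)
{
	return e->genid != atomic_load(&c->genid);
}

/* O(1) "flush": every live entry goes stale at once. */
void cache_invalidate(struct cache *c)
{
	/* The kernel adds a random increment instead of 1 so the next
	 * generation cannot be predicted from the previous one. */
	atomic_fetch_add(&c->genid, 1);
}

Because each struct net carries its own counter, flushing one namespace's routes no longer disturbs any other namespace, even though all namespaces keep sharing a single hash table.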
@@ -253,20 +250,25 @@ static inline void rt_hash_lock_init(void) static struct rt_hash_bucket *rt_hash_table __read_mostly; static unsigned rt_hash_mask __read_mostly; static unsigned int rt_hash_log __read_mostly; -static atomic_t rt_genid __read_mostly; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ (__raw_get_cpu_var(rt_cache_stat).field++) -static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx) +static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx, + int genid) { return jhash_3words((__force u32)(__be32)(daddr), (__force u32)(__be32)(saddr), - idx, atomic_read(&rt_genid)) + idx, genid) & rt_hash_mask; } +static inline int rt_genid(struct net *net) +{ + return atomic_read(&net->ipv4.rt_genid); +} + #ifdef CONFIG_PROC_FS struct rt_cache_iter_state { struct seq_net_private p; @@ -336,7 +338,7 @@ static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) struct rt_cache_iter_state *st = seq->private; if (*pos) return rt_cache_get_idx(seq, *pos - 1); - st->genid = atomic_read(&rt_genid); + st->genid = rt_genid(seq_file_net(seq)); return SEQ_START_TOKEN; } @@ -683,6 +685,11 @@ static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); } +static inline int rt_is_expired(struct rtable *rth) +{ + return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)); +} + /* * Perform a full scan of hash table and free all entries. * Can be called by a softirq or a process. @@ -692,6 +699,7 @@ static void rt_do_flush(int process_context) { unsigned int i; struct rtable *rth, *next; + struct rtable * tail; for (i = 0; i <= rt_hash_mask; i++) { if (process_context && need_resched()) @@ -701,11 +709,39 @@ static void rt_do_flush(int process_context) continue; spin_lock_bh(rt_hash_lock_addr(i)); +#ifdef CONFIG_NET_NS + { + struct rtable ** prev, * p; + + rth = rt_hash_table[i].chain; + + /* defer releasing the head of the list after spin_unlock */ + for (tail = rth; tail; tail = tail->u.dst.rt_next) + if (!rt_is_expired(tail)) + break; + if (rth != tail) + rt_hash_table[i].chain = tail; + + /* call rt_free on entries after the tail requiring flush */ + prev = &rt_hash_table[i].chain; + for (p = *prev; p; p = next) { + next = p->u.dst.rt_next; + if (!rt_is_expired(p)) { + prev = &p->u.dst.rt_next; + } else { + *prev = next; + rt_free(p); + } + } + } +#else rth = rt_hash_table[i].chain; rt_hash_table[i].chain = NULL; + tail = NULL; +#endif spin_unlock_bh(rt_hash_lock_addr(i)); - for (; rth; rth = next) { + for (; rth != tail; rth = next) { next = rth->u.dst.rt_next; rt_free(rth); } @@ -738,7 +774,7 @@ static void rt_check_expire(void) continue; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid != atomic_read(&rt_genid)) { + if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); continue; @@ -781,21 +817,21 @@ static void rt_worker_func(struct work_struct *work) * many times (2^24) without giving recent rt_genid. * Jenkins hash is strong enough that little changes of rt_genid are OK. 
*/ -static void rt_cache_invalidate(void) +static void rt_cache_invalidate(struct net *net) { unsigned char shuffle; get_random_bytes(&shuffle, sizeof(shuffle)); - atomic_add(shuffle + 1U, &rt_genid); + atomic_add(shuffle + 1U, &net->ipv4.rt_genid); } /* * delay < 0 : invalidate cache (fast : entries will be deleted later) * delay >= 0 : invalidate & flush cache (can be long) */ -void rt_cache_flush(int delay) +void rt_cache_flush(struct net *net, int delay) { - rt_cache_invalidate(); + rt_cache_invalidate(net); if (delay >= 0) rt_do_flush(!in_softirq()); } @@ -803,10 +839,11 @@ void rt_cache_flush(int delay) /* * We change rt_genid and let gc do the cleanup */ -static void rt_secret_rebuild(unsigned long dummy) +static void rt_secret_rebuild(unsigned long __net) { - rt_cache_invalidate(); - mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval); + struct net *net = (struct net *)__net; + rt_cache_invalidate(net); + mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); } /* @@ -882,7 +919,7 @@ static int rt_garbage_collect(struct dst_ops *ops) rthp = &rt_hash_table[k].chain; spin_lock_bh(rt_hash_lock_addr(k)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid == atomic_read(&rt_genid) && + if (!rt_is_expired(rth) && !rt_may_expire(rth, tmo, expire)) { tmo >>= 1; rthp = &rth->u.dst.rt_next; @@ -964,7 +1001,7 @@ restart: spin_lock_bh(rt_hash_lock_addr(hash)); while ((rth = *rthp) != NULL) { - if (rth->rt_genid != atomic_read(&rt_genid)) { + if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); continue; @@ -1140,7 +1177,7 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_lock_bh(rt_hash_lock_addr(hash)); ip_rt_put(rt); while ((aux = *rthp) != NULL) { - if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) { + if (aux == rt || rt_is_expired(aux)) { *rthp = aux->u.dst.rt_next; rt_free(aux); continue; @@ -1182,7 +1219,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, for (i = 0; i < 2; i++) { for (k = 0; k < 2; k++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); + unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], + rt_genid(net)); rthp=&rt_hash_table[hash].chain; @@ -1194,7 +1232,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rth->fl.fl4_src != skeys[i] || rth->fl.oif != ikeys[k] || rth->fl.iif != 0 || - rth->rt_genid != atomic_read(&rt_genid) || + rt_is_expired(rth) || !net_eq(dev_net(rth->u.dst.dev), net)) { rthp = &rth->u.dst.rt_next; continue; @@ -1233,7 +1271,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; rt->u.dst.xfrm = NULL; - rt->rt_genid = atomic_read(&rt_genid); + rt->rt_genid = rt_genid(net); rt->rt_flags |= RTCF_REDIRECTED; /* Gateway is different ... 
*/ @@ -1297,7 +1335,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) } else if ((rt->rt_flags & RTCF_REDIRECTED) || rt->u.dst.expires) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, - rt->fl.oif); + rt->fl.oif, + rt_genid(dev_net(dst->dev))); #if RT_CACHE_DEBUG >= 1 printk(KERN_DEBUG "ipv4_negative_advice: redirect to " NIPQUAD_FMT "/%02x dropped\n", @@ -1390,7 +1429,8 @@ static int ip_error(struct sk_buff *skb) break; case ENETUNREACH: code = ICMP_NET_UNREACH; - IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES); + IP_INC_STATS_BH(dev_net(rt->u.dst.dev), + IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; @@ -1446,7 +1486,8 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, for (k = 0; k < 2; k++) { for (i = 0; i < 2; i++) { - unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]); + unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], + rt_genid(net)); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -1461,7 +1502,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, rth->fl.iif != 0 || dst_metric_locked(&rth->u.dst, RTAX_MTU) || !net_eq(dev_net(rth->u.dst.dev), net) || - rth->rt_genid != atomic_read(&rt_genid)) + rt_is_expired(rth)) continue; if (new_mtu < 68 || new_mtu >= old_mtu) { @@ -1696,7 +1737,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.oif = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(dev_net(dev)); rth->rt_flags = RTCF_MULTICAST; rth->rt_type = RTN_MULTICAST; if (our) { @@ -1711,7 +1752,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, RT_CACHE_STAT_INC(in_slow_mc); in_dev_put(in_dev); - hash = rt_hash(daddr, saddr, dev->ifindex); + hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); return rt_intern_hash(hash, rth, &skb->rtable); e_nobufs: @@ -1837,7 +1878,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); rt_set_nexthop(rth, res, itag); @@ -1872,7 +1913,8 @@ static int ip_mkroute_input(struct sk_buff *skb, return err; /* put it into the cache */ - hash = rt_hash(daddr, saddr, fl->iif); + hash = rt_hash(daddr, saddr, fl->iif, + rt_genid(dev_net(rth->u.dst.dev))); return rt_intern_hash(hash, rth, &skb->rtable); } @@ -1998,7 +2040,7 @@ local_input: goto e_nobufs; rth->u.dst.output= ip_rt_bug; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(net); atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; @@ -2028,7 +2070,7 @@ local_input: rth->rt_flags &= ~RTCF_LOCAL; } rth->rt_type = res.type; - hash = rt_hash(daddr, saddr, fl.iif); + hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); err = rt_intern_hash(hash, rth, &skb->rtable); goto done; @@ -2079,7 +2121,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, net = dev_net(dev); tos &= IPTOS_RT_MASK; - hash = rt_hash(daddr, saddr, iif); + hash = rt_hash(daddr, saddr, iif, rt_genid(net)); rcu_read_lock(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -2091,7 +2133,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, (rth->fl.fl4_tos ^ tos)) == 0 && rth->fl.mark == skb->mark && net_eq(dev_net(rth->u.dst.dev), net) && - rth->rt_genid == atomic_read(&rt_genid)) { + !rt_is_expired(rth)) { 
dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); @@ -2219,7 +2261,7 @@ static int __mkroute_output(struct rtable **result, rth->rt_spec_dst= fl->fl4_src; rth->u.dst.output=ip_output; - rth->rt_genid = atomic_read(&rt_genid); + rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); @@ -2268,7 +2310,8 @@ static int ip_mkroute_output(struct rtable **rp, int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); unsigned hash; if (err == 0) { - hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif); + hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, + rt_genid(dev_net(dev_out))); err = rt_intern_hash(hash, rth, rp); } @@ -2480,7 +2523,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, unsigned hash; struct rtable *rth; - hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif); + hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); rcu_read_lock_bh(); for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; @@ -2493,7 +2536,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK)) && net_eq(dev_net(rth->u.dst.dev), net) && - rth->rt_genid == atomic_read(&rt_genid)) { + !rt_is_expired(rth)) { dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(out_hit); rcu_read_unlock_bh(); @@ -2524,7 +2567,7 @@ static struct dst_ops ipv4_dst_blackhole_ops = { }; -static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) +static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) { struct rtable *ort = *rp; struct rtable *rt = (struct rtable *) @@ -2548,7 +2591,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp) rt->idev = ort->idev; if (rt->idev) in_dev_hold(rt->idev); - rt->rt_genid = atomic_read(&rt_genid); + rt->rt_genid = rt_genid(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_dst = ort->rt_dst; @@ -2584,7 +2627,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags ? 
XFRM_LOOKUP_WAIT : 0); if (err == -EREMOTE) - err = ipv4_dst_blackhole(rp, flp); + err = ipv4_dst_blackhole(net, rp, flp); return err; } @@ -2803,7 +2846,7 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) rt = rcu_dereference(rt->u.dst.rt_next), idx++) { if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) continue; - if (rt->rt_genid != atomic_read(&rt_genid)) + if (rt_is_expired(rt)) continue; skb->dst = dst_clone(&rt->u.dst); if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, @@ -2827,19 +2870,25 @@ done: void ip_rt_multicast_event(struct in_device *in_dev) { - rt_cache_flush(0); + rt_cache_flush(dev_net(in_dev->dev), 0); } #ifdef CONFIG_SYSCTL -static int flush_delay; - -static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, +static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { - proc_dointvec(ctl, write, filp, buffer, lenp, ppos); - rt_cache_flush(flush_delay); + int flush_delay; + ctl_table ctl; + struct net *net; + + memcpy(&ctl, __ctl, sizeof(ctl)); + ctl.data = &flush_delay; + proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); + + net = (struct net *)__ctl->extra1; + rt_cache_flush(net, flush_delay); return 0; } @@ -2855,25 +2904,18 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, size_t newlen) { int delay; + struct net *net; if (newlen != sizeof(int)) return -EINVAL; if (get_user(delay, (int __user *)newval)) return -EFAULT; - rt_cache_flush(delay); + net = (struct net *)table->extra1; + rt_cache_flush(net, delay); return 0; } ctl_table ipv4_route_table[] = { { - .ctl_name = NET_IPV4_ROUTE_FLUSH, - .procname = "flush", - .data = &flush_delay, - .maxlen = sizeof(int), - .mode = 0200, - .proc_handler = &ipv4_sysctl_rtcache_flush, - .strategy = &ipv4_sysctl_rtcache_flush_strategy, - }, - { .ctl_name = NET_IPV4_ROUTE_GC_THRESH, .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, @@ -3011,8 +3053,97 @@ ctl_table ipv4_route_table[] = { }, { .ctl_name = 0 } }; + +static __net_initdata struct ctl_path ipv4_route_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, + { }, +}; + + +static struct ctl_table ipv4_route_flush_table[] = { + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = &ipv4_sysctl_rtcache_flush, + .strategy = &ipv4_sysctl_rtcache_flush_strategy, + }, + { .ctl_name = 0 }, +}; + +static __net_init int sysctl_route_net_init(struct net *net) +{ + struct ctl_table *tbl; + + tbl = ipv4_route_flush_table; + if (net != &init_net) { + tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + } + tbl[0].extra1 = net; + + net->ipv4.route_hdr = + register_net_sysctl_table(net, ipv4_route_path, tbl); + if (net->ipv4.route_hdr == NULL) + goto err_reg; + return 0; + +err_reg: + if (tbl != ipv4_route_flush_table) + kfree(tbl); +err_dup: + return -ENOMEM; +} + +static __net_exit void sysctl_route_net_exit(struct net *net) +{ + struct ctl_table *tbl; + + tbl = net->ipv4.route_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.route_hdr); + BUG_ON(tbl == ipv4_route_flush_table); + kfree(tbl); +} + +static __net_initdata struct pernet_operations sysctl_route_ops = { + .init = sysctl_route_net_init, + .exit = sysctl_route_net_exit, +}; #endif + +static __net_init int rt_secret_timer_init(struct net *net) +{ + 
atomic_set(&net->ipv4.rt_genid, + (int) ((num_physpages ^ (num_physpages>>8)) ^ + (jiffies ^ (jiffies >> 7)))); + + net->ipv4.rt_secret_timer.function = rt_secret_rebuild; + net->ipv4.rt_secret_timer.data = (unsigned long)net; + init_timer_deferrable(&net->ipv4.rt_secret_timer); + + net->ipv4.rt_secret_timer.expires = + jiffies + net_random() % ip_rt_secret_interval + + ip_rt_secret_interval; + add_timer(&net->ipv4.rt_secret_timer); + return 0; +} + +static __net_exit void rt_secret_timer_exit(struct net *net) +{ + del_timer_sync(&net->ipv4.rt_secret_timer); +} + +static __net_initdata struct pernet_operations rt_secret_timer_ops = { + .init = rt_secret_timer_init, + .exit = rt_secret_timer_exit, +}; + + #ifdef CONFIG_NET_CLS_ROUTE struct ip_rt_acct *ip_rt_acct __read_mostly; #endif /* CONFIG_NET_CLS_ROUTE */ @@ -3031,9 +3162,6 @@ int __init ip_rt_init(void) { int rc = 0; - atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^ - (jiffies ^ (jiffies >> 7)))); - #ifdef CONFIG_NET_CLS_ROUTE ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); if (!ip_rt_acct) @@ -3065,19 +3193,14 @@ int __init ip_rt_init(void) devinet_init(); ip_fib_init(); - rt_secret_timer.function = rt_secret_rebuild; - rt_secret_timer.data = 0; - init_timer_deferrable(&rt_secret_timer); - /* All the timers, started at system startup tend to synchronize. Perturb it a bit. */ schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); - rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + - ip_rt_secret_interval; - add_timer(&rt_secret_timer); + if (register_pernet_subsys(&rt_secret_timer_ops)) + printk(KERN_ERR "Unable to setup rt_secret_timer\n"); if (ip_rt_proc_init()) printk(KERN_ERR "Unable to create route proc files\n"); @@ -3087,6 +3210,9 @@ int __init ip_rt_init(void) #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); +#ifdef CONFIG_SYSCTL + register_pernet_subsys(&sysctl_route_ops); +#endif return rc; } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d182a2a26291..51bc24d3b8a7 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -8,8 +8,6 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. - * - * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $ */ #include <linux/tcp.h> @@ -175,7 +173,7 @@ __u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) ; *mssp = msstab[mssind] + 1; - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return secure_tcp_syn_cookie(iph->saddr, iph->daddr, th->source, th->dest, ntohl(th->seq), @@ -271,11 +269,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || (mss = cookie_check(skb, cookie)) == 0) { - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED); goto out; } - NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV); /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index c437f804ee38..14ef202a2254 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1,8 +1,6 @@ /* * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. 
* - * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $ - * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS] */ @@ -795,7 +793,8 @@ static struct ctl_table ipv4_net_table[] = { .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies }, { .ctl_name = NET_IPV4_ICMP_RATEMASK, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1d723de18686..eec8cf7c0247 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -318,10 +316,10 @@ int tcp_memory_pressure __read_mostly; EXPORT_SYMBOL(tcp_memory_pressure); -void tcp_enter_memory_pressure(void) +void tcp_enter_memory_pressure(struct sock *sk) { if (!tcp_memory_pressure) { - NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); tcp_memory_pressure = 1; } } @@ -346,8 +344,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. */ mask = 0; @@ -373,10 +371,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX - * solve this dilemma. I would prefer, if PULLHUP were maskable, + * solve this dilemma. I would prefer, if POLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains - * why PULLHUP is incompatible with POLLOUT. --ANK + * why POLLHUP is incompatible with POLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. 
--ANK @@ -651,7 +649,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) } __kfree_skb(skb); } else { - sk->sk_prot->enter_memory_pressure(); + sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; @@ -1155,7 +1153,7 @@ static void tcp_prequeue_process(struct sock *sk) struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); /* RX process wants to run with disabled BHs, though it is not * necessary */ @@ -1477,7 +1475,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Restore normal policy in scheduler __ */ if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; } @@ -1488,7 +1486,7 @@ do_prequeue: tcp_prequeue_process(sk); if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -1603,7 +1601,7 @@ skip_copy: tcp_prequeue_process(sk); if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -1670,12 +1668,12 @@ void tcp_set_state(struct sock *sk, int state) switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) - TCP_INC_STATS(TCP_MIB_CURRESTAB); + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) - TCP_INC_STATS(TCP_MIB_ESTABRESETS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && @@ -1684,7 +1682,7 @@ void tcp_set_state(struct sock *sk, int state) /* fall through */ default: if (oldstate==TCP_ESTABLISHED) - TCP_DEC_STATS(TCP_MIB_CURRESTAB); + TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed @@ -1795,13 +1793,13 @@ void tcp_close(struct sock *sk, long timeout) */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_KERNEL); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); - NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. 
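Most of the tcp.c churn above is mechanical: the SNMP macros gain a struct net argument and the call sites derive it from the socket, so counters accumulate in the namespace that owns the connection instead of in one global array. A self-contained sketch of the conversion's shape (the struct layouts are stand-ins, not the kernel's real mib definitions):

struct mib { unsigned long counters[64]; };

struct net { struct mib stats; };	/* one instance per namespace */

struct sock { struct net *sk_net; };	/* stand-in for the real field */

static inline struct net *sock_net(const struct sock *sk)
{
	return sk->sk_net;
}

/* Before: NET_INC_STATS(field) bumped a global array. After: */
#define NET_INC_STATS(net, field) \
	((net)->stats.counters[field]++)

static inline void account_event(struct sock *sk, int field)
{
	/* The counter lands in whichever namespace owns the socket. */
	NET_INC_STATS(sock_net(sk), field);
}

The _BH and _USER variants seen in the hunks follow the same shape; only the struct net argument threading differs from one call site to the next.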
@@ -1873,7 +1871,8 @@ adjudge_to_death: if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); @@ -1895,7 +1894,8 @@ adjudge_to_death: "sockets\n"); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONMEMORY); } } @@ -2467,6 +2467,76 @@ static unsigned long tcp_md5sig_users; static struct tcp_md5sig_pool **tcp_md5sig_pool; static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); +int tcp_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, + int bplen, + struct tcphdr *th, unsigned int tcplen, + struct tcp_md5sig_pool *hp) +{ + struct scatterlist sg[4]; + __u16 data_len; + int block = 0; + __sum16 cksum; + struct hash_desc *desc = &hp->md5_desc; + int err; + unsigned int nbytes = 0; + + sg_init_table(sg, 4); + + /* 1. The TCP pseudo-header */ + sg_set_buf(&sg[block++], &hp->md5_blk, bplen); + nbytes += bplen; + + /* 2. The TCP header, excluding options, and assuming a + * checksum of zero + */ + cksum = th->check; + th->check = 0; + sg_set_buf(&sg[block++], th, sizeof(*th)); + nbytes += sizeof(*th); + + /* 3. The TCP segment data (if any) */ + data_len = tcplen - (th->doff << 2); + if (data_len > 0) { + u8 *data = (u8 *)th + (th->doff << 2); + sg_set_buf(&sg[block++], data, data_len); + nbytes += data_len; + } + + /* 4. an independently-specified key or password, known to both + * TCPs and presumably connection-specific + */ + sg_set_buf(&sg[block++], key->key, key->keylen); + nbytes += key->keylen; + + sg_mark_end(&sg[block - 1]); + + /* Now store the hash into the packet */ + err = crypto_hash_init(desc); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_init failed\n", __func__); + return -1; + } + err = crypto_hash_update(desc, sg, nbytes); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_update failed\n", __func__); + return -1; + } + err = crypto_hash_final(desc, md5_hash); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_final failed\n", __func__); + return -1; + } + + /* Reset header */ + th->check = cksum; + + return 0; +} +EXPORT_SYMBOL(tcp_calc_md5_hash); + static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) { int cpu; @@ -2595,7 +2665,7 @@ EXPORT_SYMBOL(__tcp_put_md5sig_pool); void tcp_done(struct sock *sk) { if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 2fbcc7d1b1a0..838d491dfda7 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -1,8 +1,6 @@ /* * tcp_diag.c Module for monitoring TCP transport protocols sockets. * - * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ - * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cad73b7dfef0..fac49a6e1611 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -949,17 +947,21 @@ static void tcp_update_reordering(struct sock *sk, const int metric, { struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { + int mib_idx; + tp->reordering = min(TCP_MAX_REORDERING, metric); /* This exciting event is worth to be remembered. 8) */ if (ts) - NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); + mib_idx = LINUX_MIB_TCPTSREORDER; else if (tcp_is_reno(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); + mib_idx = LINUX_MIB_TCPRENOREORDER; else if (tcp_is_fack(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); + mib_idx = LINUX_MIB_TCPFACKREORDER; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); + mib_idx = LINUX_MIB_TCPSACKREORDER; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, @@ -1155,7 +1157,7 @@ static void tcp_mark_lost_retrans(struct sock *sk) tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; } - NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; @@ -1167,10 +1169,11 @@ static void tcp_mark_lost_retrans(struct sock *sk) tp->lost_retrans_low = new_low_seq; } -static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, +static int tcp_check_dsack(struct sock *sk, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) { + struct tcp_sock *tp = tcp_sk(sk); u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq); u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq); int dup_sack = 0; @@ -1178,7 +1181,7 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = 1; tcp_dsack_seen(tp); - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq); u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq); @@ -1187,7 +1190,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, !before(start_seq_0, start_seq_1)) { dup_sack = 1; tcp_dsack_seen(tp); - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPDSACKOFORECV); } } @@ -1432,7 +1436,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, tcp_highest_sack_reset(sk); } - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp_wire, + found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire, num_sacks, prior_snd_una); if (found_dup_sack) flag |= FLAG_DSACKING_ACK; @@ -1458,18 +1462,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, if (!tcp_is_sackblock_valid(tp, dup_sack, sp[used_sacks].start_seq, sp[used_sacks].end_seq)) { + int mib_idx; + if (dup_sack) { if (!tp->undo_marker) - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + mib_idx = LINUX_MIB_TCPDSACKIGNOREDNOUNDO; else - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + mib_idx = LINUX_MIB_TCPDSACKIGNOREDOLD; } else { /* Don't count olds caused by ACK reordering */ if ((TCP_SKB_CB(ack_skb)->ack_seq != tp->snd_una) && !after(sp[used_sacks].end_seq, tp->snd_una)) continue; - NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); + mib_idx = LINUX_MIB_TCPSACKDISCARD; } + + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (i == 0) first_sack_index = -1; continue; @@ -1962,7 +1970,7 
@@ static int tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { struct inet_connection_sock *icsk = inet_csk(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; @@ -2382,15 +2390,19 @@ static int tcp_try_undo_recovery(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tcp_may_undo(tp)) { + int mib_idx; + /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + mib_idx = LINUX_MIB_TCPLOSSUNDO; else - NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); + mib_idx = LINUX_MIB_TCPFULLUNDO; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->undo_marker = 0; } if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { @@ -2413,7 +2425,7 @@ static void tcp_try_undo_dsack(struct sock *sk) DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDSACKUNDO); } } @@ -2436,7 +2448,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); - NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. * If the first packet was delayed, the rest @@ -2465,7 +2477,7 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, "partial loss"); tp->lost_out = 0; tcp_undo_cwr(sk, 1); - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (tcp_is_sack(tp)) @@ -2562,7 +2574,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) int is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - int fast_rexmit = 0; + int fast_rexmit = 0, mib_idx; if (WARN_ON(!tp->packets_out && tp->sacked_out)) tp->sacked_out = 0; @@ -2584,7 +2596,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); - NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); } /* D. Check consistency of the current state. 
*/ @@ -2685,9 +2697,11 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) /* Otherwise enter Recovery state */ if (tcp_is_reno(tp)) - NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); + mib_idx = LINUX_MIB_TCPRENORECOVERY; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); + mib_idx = LINUX_MIB_TCPSACKRECOVERY; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->high_seq = tp->snd_nxt; tp->prior_ssthresh = 0; @@ -3198,7 +3212,7 @@ static int tcp_process_frto(struct sock *sk, int flag) } tp->frto_counter = 0; tp->undo_marker = 0; - NET_INC_STATS_BH(LINUX_MIB_TCPSPURIOUSRTOS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); } return 0; } @@ -3251,12 +3265,12 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tcp_ca_event(sk, CA_EVENT_FAST_ACK); - NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else - NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPPUREACKS); flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); @@ -3450,6 +3464,43 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, return 1; } +#ifdef CONFIG_TCP_MD5SIG +/* + * Parse MD5 Signature option + */ +u8 *tcp_parse_md5sig_option(struct tcphdr *th) +{ + int length = (th->doff << 2) - sizeof (*th); + u8 *ptr = (u8*)(th + 1); + + /* If the TCP option is too short, we can short cut */ + if (length < TCPOLEN_MD5SIG) + return NULL; + + while (length > 0) { + int opcode = *ptr++; + int opsize; + + switch(opcode) { + case TCPOPT_EOL: + return NULL; + case TCPOPT_NOP: + length--; + continue; + default: + opsize = *ptr++; + if (opsize < 2 || opsize > length) + return NULL; + if (opcode == TCPOPT_MD5SIG) + return ptr; + } + ptr += opsize - 2; + length -= opsize; + } + return NULL; +} +#endif + static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; @@ -3662,13 +3713,19 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, return 0; } -static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { + int mib_idx; + if (before(seq, tp->rcv_nxt)) - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); + mib_idx = LINUX_MIB_TCPDSACKOLDSENT; else - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT); + mib_idx = LINUX_MIB_TCPDSACKOFOSENT; + + NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->rx_opt.dsack = 1; tp->duplicate_sack[0].start_seq = seq; @@ -3678,10 +3735,12 @@ static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) } } -static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->rx_opt.dsack) - tcp_dsack_set(tp, seq, end_seq); + tcp_dsack_set(sk, seq, end_seq); else tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } @@ -3692,7 +3751,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); if (tcp_is_sack(tp) && sysctl_tcp_dsack) { @@ -3700,7 +3759,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if 
(after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) end_seq = tp->rcv_nxt; - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq); } } @@ -3853,7 +3912,7 @@ static void tcp_ofo_queue(struct sock *sk) __u32 dsack = dsack_high; if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) dsack_high = TCP_SKB_CB(skb)->end_seq; - tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack); + tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { @@ -3981,8 +4040,8 @@ queue_and_out: if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { /* A retransmit, 2nd most common case. Force an immediate ack. */ - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: tcp_enter_quickack_mode(sk); @@ -4004,7 +4063,7 @@ drop: tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); /* If window is closed, drop tail of packet. But after * remembering D-SACK for its head made in previous line. @@ -4069,12 +4128,12 @@ drop: if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { /* All the bits are present. Drop. */ __kfree_skb(skb); - tcp_dsack_set(tp, seq, end_seq); + tcp_dsack_set(sk, seq, end_seq); goto add_sack; } if (after(seq, TCP_SKB_CB(skb1)->seq)) { /* Partial overlap. */ - tcp_dsack_set(tp, seq, + tcp_dsack_set(sk, seq, TCP_SKB_CB(skb1)->end_seq); } else { skb1 = skb1->prev; @@ -4087,12 +4146,12 @@ drop: (struct sk_buff *)&tp->out_of_order_queue && after(end_seq, TCP_SKB_CB(skb1)->seq)) { if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, end_seq); break; } __skb_unlink(skb1, &tp->out_of_order_queue); - tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, + tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); __kfree_skb(skb1); } @@ -4123,7 +4182,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct sk_buff *next = skb->next; __skb_unlink(skb, list); __kfree_skb(skb); - NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); skb = next; continue; } @@ -4191,7 +4250,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct sk_buff *next = skb->next; __skb_unlink(skb, list); __kfree_skb(skb); - NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOLLAPSED); skb = next; if (skb == tail || tcp_hdr(skb)->syn || @@ -4254,7 +4313,7 @@ static int tcp_prune_ofo_queue(struct sock *sk) int res = 0; if (!skb_queue_empty(&tp->out_of_order_queue)) { - NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_OFOPRUNED); __skb_queue_purge(&tp->out_of_order_queue); /* Reset SACK state. A conforming SACK implementation will @@ -4283,7 +4342,7 @@ static int tcp_prune_queue(struct sock *sk) SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); - NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); @@ -4312,7 +4371,7 @@ static int tcp_prune_queue(struct sock *sk) * drop receive data on the floor. It will get retransmitted * and hopefully then we'll have sufficient space. 
*/ - NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ tp->pred_flags = 0; @@ -4742,7 +4801,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_data_snd_check(sk); return 0; } else { /* Header too small */ - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } } else { @@ -4779,7 +4838,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); } if (copied_early) tcp_cleanup_rbuf(sk, skb->len); @@ -4802,7 +4861,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; - NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITS); /* Bulk data transfer: receiver */ __skb_pull(skb, tcp_header_len); @@ -4846,7 +4905,7 @@ slow_path: if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); goto discard; } @@ -4881,8 +4940,8 @@ slow_path: tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - TCP_INC_STATS_BH(TCP_MIB_INERRS); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); tcp_reset(sk); return 1; } @@ -4904,7 +4963,7 @@ step5: return 0; csum_error: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); discard: __kfree_skb(skb); @@ -4938,7 +4997,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, tcp_time_stamp)) { - NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSACTIVEREJECTED); goto reset_and_undo; } @@ -5222,7 +5281,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); goto discard; } @@ -5251,7 +5310,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * Check for a SYN in window. 
*/ if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONSYN); tcp_reset(sk); return 1; } @@ -5333,7 +5392,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; } @@ -5393,7 +5452,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (sk->sk_shutdown & RCV_SHUTDOWN) { if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONDATA); tcp_reset(sk); return 1; } @@ -5422,6 +5481,9 @@ EXPORT_SYMBOL(sysctl_tcp_ecn); EXPORT_SYMBOL(sysctl_tcp_reordering); EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); EXPORT_SYMBOL(tcp_parse_options); +#ifdef CONFIG_TCP_MD5SIG +EXPORT_SYMBOL(tcp_parse_md5sig_option); +#endif EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ffe869ac1bcf..29adc668ad51 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $ - * * IPv4 specific functions * * @@ -91,8 +89,13 @@ static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr); static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, - struct tcphdr *th, int protocol, - unsigned int tcplen); + struct tcphdr *th, unsigned int tcplen); +#else +static inline +struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr) +{ + return NULL; +} #endif struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { @@ -172,7 +175,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->sport, usin->sin_port, sk, 1); if (tmp < 0) { if (tmp == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); return tmp; } @@ -340,16 +343,17 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) struct sock *sk; __u32 seq; int err; + struct net *net = dev_net(skb->dev); if (skb->len < (iph->ihl << 2) + 8) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } - sk = inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->daddr, th->dest, + sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(skb)); if (!sk) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { @@ -362,7 +366,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) * servers this needs to be solved differently. 
*/ if (sock_owned_by_user(sk)) - NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; @@ -371,7 +375,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { - NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -418,7 +422,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) BUG_TRAP(!req->sk); if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } @@ -540,6 +544,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; #endif + struct net *net; /* Never send a reset in response to a reset. */ if (th->rst) @@ -582,8 +587,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) key, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, - &rep.th, IPPROTO_TCP, - arg.iov[0].iov_len); + &rep.th, arg.iov[0].iov_len); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, @@ -591,20 +595,21 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) sizeof(struct tcphdr), IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - ip_send_reply(dev_net(skb->dst->dev)->ipv4.tcp_sock, skb, + net = dev_net(skb->dst->dev); + ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); - TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS); } /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states outside socket context is ugly, certainly. What can I do? */ -static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, - struct sk_buff *skb, u32 seq, u32 ack, - u32 win, u32 ts) +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 ts, int oif, + struct tcp_md5sig_key *key) { struct tcphdr *th = tcp_hdr(skb); struct { @@ -616,10 +621,7 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, ]; } rep; struct ip_reply_arg arg; -#ifdef CONFIG_TCP_MD5SIG - struct tcp_md5sig_key *key; - struct tcp_md5sig_key tw_key; -#endif + struct net *net = dev_net(skb->dev); memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&arg, 0, sizeof(arg)); @@ -645,23 +647,6 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, rep.th.window = htons(win); #ifdef CONFIG_TCP_MD5SIG - /* - * The SKB holds an imcoming packet, but may not have a valid ->sk - * pointer. This is especially the case when we're dealing with a - * TIME_WAIT ack, because the sk structure is long gone, and only - * the tcp_timewait_sock remains. So the md5 key is stashed in that - * structure, and we use it in preference. I believe that (twsk || - * skb->sk) holds true, but we program defensively. - */ - if (!twsk && skb->sk) { - key = tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr); - } else if (twsk && twsk->tw_md5_keylen) { - tw_key.key = twsk->tw_md5_key; - tw_key.keylen = twsk->tw_md5_keylen; - key = &tw_key; - } else - key = NULL; - if (key) { int offset = (ts) ? 
3 : 0; @@ -676,21 +661,20 @@ static void tcp_v4_send_ack(struct tcp_timewait_sock *twsk, key, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, - &rep.th, IPPROTO_TCP, - arg.iov[0].iov_len); + &rep.th, arg.iov[0].iov_len); } #endif arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - if (twsk) - arg.bound_dev_if = twsk->tw_sk.tw_bound_dev_if; + if (oif) + arg.bound_dev_if = oif; - ip_send_reply(dev_net(skb->dev)->ipv4.tcp_sock, skb, + ip_send_reply(net->ipv4.tcp_sock, skb, &arg, arg.iov[0].iov_len); - TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS); } static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) @@ -698,9 +682,12 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(tcptw, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, - tcptw->tw_ts_recent); + tcptw->tw_ts_recent, + tw->tw_bound_dev_if, + tcp_twsk_md5_key(tcptw) + ); inet_twsk_put(tw); } @@ -708,9 +695,11 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) { - tcp_v4_send_ack(NULL, skb, tcp_rsk(req)->snt_isn + 1, + tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd, - req->ts_recent); + req->ts_recent, + 0, + tcp_v4_md5_do_lookup(skb->sk, ip_hdr(skb)->daddr)); } /* @@ -1002,18 +991,12 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval, static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, __be32 daddr, - struct tcphdr *th, int protocol, + struct tcphdr *th, unsigned int tcplen) { - struct scatterlist sg[4]; - __u16 data_len; - int block = 0; - __sum16 old_checksum; struct tcp_md5sig_pool *hp; struct tcp4_pseudohdr *bp; - struct hash_desc *desc; int err; - unsigned int nbytes = 0; /* * Okay, so RFC2385 is turned on for this connection, @@ -1025,63 +1008,25 @@ static int tcp_v4_do_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, goto clear_hash_noput; bp = &hp->md5_blk.ip4; - desc = &hp->md5_desc; /* - * 1. the TCP pseudo-header (in the order: source IP address, + * The TCP pseudo-header (in the order: source IP address, * destination IP address, zero-padded protocol number, and * segment length) */ bp->saddr = saddr; bp->daddr = daddr; bp->pad = 0; - bp->protocol = protocol; + bp->protocol = IPPROTO_TCP; bp->len = htons(tcplen); - sg_init_table(sg, 4); - - sg_set_buf(&sg[block++], bp, sizeof(*bp)); - nbytes += sizeof(*bp); - - /* 2. the TCP header, excluding options, and assuming a - * checksum of zero/ - */ - old_checksum = th->check; - th->check = 0; - sg_set_buf(&sg[block++], th, sizeof(struct tcphdr)); - nbytes += sizeof(struct tcphdr); - - /* 3. the TCP segment data (if any) */ - data_len = tcplen - (th->doff << 2); - if (data_len > 0) { - unsigned char *data = (unsigned char *)th + (th->doff << 2); - sg_set_buf(&sg[block++], data, data_len); - nbytes += data_len; - } - - /* 4. 
an independently-specified key or password, known to both - * TCPs and presumably connection-specific - */ - sg_set_buf(&sg[block++], key->key, key->keylen); - nbytes += key->keylen; - - sg_mark_end(&sg[block - 1]); - - /* Now store the Hash into the packet */ - err = crypto_hash_init(desc); - if (err) - goto clear_hash; - err = crypto_hash_update(desc, sg, nbytes); - if (err) - goto clear_hash; - err = crypto_hash_final(desc, md5_hash); + err = tcp_calc_md5_hash(md5_hash, key, sizeof(*bp), + th, tcplen, hp); if (err) goto clear_hash; - /* Reset header, and free up the crypto */ + /* Free up the crypto pool */ tcp_put_md5sig_pool(); - th->check = old_checksum; - out: return 0; clear_hash: @@ -1095,7 +1040,7 @@ int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct tcphdr *th, int protocol, + struct tcphdr *th, unsigned int tcplen) { __be32 saddr, daddr; @@ -1111,7 +1056,7 @@ int tcp_v4_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, } return tcp_v4_do_calc_md5_hash(md5_hash, key, saddr, daddr, - th, protocol, tcplen); + th, tcplen); } EXPORT_SYMBOL(tcp_v4_calc_md5_hash); @@ -1130,52 +1075,12 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); struct tcphdr *th = tcp_hdr(skb); - int length = (th->doff << 2) - sizeof(struct tcphdr); int genhash; - unsigned char *ptr; unsigned char newhash[16]; hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr); + hash_location = tcp_parse_md5sig_option(th); - /* - * If the TCP option length is less than the TCP_MD5SIG - * option length, then we can shortcut - */ - if (length < TCPOLEN_MD5SIG) { - if (hash_expected) - return 1; - else - return 0; - } - - /* Okay, we can't shortcut - we have to grub through the options */ - ptr = (unsigned char *)(th + 1); - while (length > 0) { - int opcode = *ptr++; - int opsize; - - switch (opcode) { - case TCPOPT_EOL: - goto done_opts; - case TCPOPT_NOP: - length--; - continue; - default: - opsize = *ptr++; - if (opsize < 2) - goto done_opts; - if (opsize > length) - goto done_opts; - - if (opcode == TCPOPT_MD5SIG) { - hash_location = ptr; - goto done_opts; - } - } - ptr += opsize-2; - length -= opsize; - } -done_opts: /* We've parsed the options - do we have a hash? 
*/ if (!hash_expected && !hash_location) return 0; @@ -1202,8 +1107,7 @@ done_opts: genhash = tcp_v4_do_calc_md5_hash(newhash, hash_expected, iph->saddr, iph->daddr, - th, sk->sk_protocol, - skb->len); + th, skb->len); if (genhash || memcmp(hash_location, newhash, 16) != 0) { if (net_ratelimit()) { @@ -1347,7 +1251,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { - NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); goto drop_and_release; } } @@ -1461,9 +1365,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, return newsk; exit_overflow: - NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); exit: - NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); dst_release(dst); return NULL; } @@ -1590,7 +1494,7 @@ discard: return 0; csum_err: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -1604,12 +1508,13 @@ int tcp_v4_rcv(struct sk_buff *skb) struct tcphdr *th; struct sock *sk; int ret; + struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST) goto discard_it; /* Count it even if it's bad */ - TCP_INC_STATS_BH(TCP_MIB_INSEGS); + TCP_INC_STATS_BH(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; @@ -1638,7 +1543,7 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __inet_lookup(dev_net(skb->dev), &tcp_hashinfo, iph->saddr, + sk = __inet_lookup(net, &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1685,7 +1590,7 @@ no_tcp_socket: if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { bad_packet: - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb); } @@ -1706,7 +1611,7 @@ do_time_wait: } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { - TCP_INC_STATS_BH(TCP_MIB_INERRS); + TCP_INC_STATS_BH(net, TCP_MIB_INERRS); inet_twsk_put(inet_twsk(sk)); goto discard_it; } @@ -1871,7 +1776,7 @@ static int tcp_v4_init_sock(struct sock *sk) return 0; } -int tcp_v4_destroy_sock(struct sock *sk) +void tcp_v4_destroy_sock(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1915,8 +1820,6 @@ int tcp_v4_destroy_sock(struct sock *sk) } atomic_dec(&tcp_sockets_allocated); - - return 0; } EXPORT_SYMBOL(tcp_v4_destroy_sock); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8245247a6ceb..204c42162660 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -246,7 +244,7 @@ kill: } if (paws_reject) - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. 
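Stepping back to the MD5 rework in tcp_ipv4.c above: the common scatterlist walk moves into tcp_calc_md5_hash() in tcp.c, option scanning moves into the new generic tcp_parse_md5sig_option(), and the redundant protocol argument disappears because the pseudo-header is always built with IPPROTO_TCP. A sketch of how the pieces compose on the inbound path, modelled on the rewritten tcp_v4_inbound_md5_hash(); the function name is invented, and tcp_v4_md5_do_lookup() is static to tcp_ipv4.c, treated as visible here purely for illustration:

#include <net/tcp.h>

/*
 * Illustrative composition of the two helpers on the IPv4 receive
 * path (not part of the patch).
 */
static int sketch_v4_md5_check(struct sock *sk, struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct tcphdr *th = tcp_hdr(skb);
        unsigned char newhash[16];
        u8 *hash_location = tcp_parse_md5sig_option(th);
        struct tcp_md5sig_key *key = tcp_v4_md5_do_lookup(sk, iph->saddr);

        if (!key && !hash_location)
                return 0;       /* neither end expects a signature */
        if (!key || !hash_location)
                return 1;       /* one-sided: segment must be dropped */

        /* Recompute over pseudo-header, header (checksum zeroed),
         * payload and key, then compare with the digest carried in
         * the option.
         */
        if (tcp_v4_do_calc_md5_hash(newhash, key, iph->saddr, iph->daddr,
                                    th, skb->len))
                return 1;
        return memcmp(hash_location, newhash, 16) != 0;
}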
@@ -482,7 +480,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); - TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; } @@ -613,7 +611,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(skb, req); if (paws_reject) - NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } @@ -632,7 +630,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } @@ -697,7 +695,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, } embryonic_reset: - NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_reset(sk, skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ad993ecb4810..36a19707f67f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -607,7 +605,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, md5, sk, NULL, NULL, tcp_hdr(skb), - sk->sk_protocol, skb->len); } #endif @@ -621,7 +618,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_event_data_sent(tp, skb, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (likely(err <= 0)) @@ -1913,7 +1910,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (err == 0) { /* Update global TCP statistics. */ - TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); tp->total_retrans++; @@ -1988,14 +1985,17 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (sacked & TCPCB_LOST) { if (!(sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { + int mib_idx; + if (tcp_retransmit_skb(sk, skb)) { tp->retransmit_skb_hint = NULL; return; } if (icsk->icsk_ca_state != TCP_CA_Loss) - NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); + mib_idx = LINUX_MIB_TCPFASTRETRANS; else - NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; + NET_INC_STATS_BH(sock_net(sk), mib_idx); if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -2065,7 +2065,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) inet_csk(sk)->icsk_rto, TCP_RTO_MAX); - NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFORWARDRETRANS); } } @@ -2119,7 +2119,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* NOTE: No TCP options attached and we never retransmit this. 
*/ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } @@ -2130,9 +2130,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* Send it off. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); - TCP_INC_STATS(TCP_MIB_OUTRSTS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } /* WARNING: This routine must only be called when we have already sent @@ -2258,7 +2258,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, ); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ @@ -2266,7 +2266,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, tp->af_specific->calc_md5_hash(md5_hash_location, md5, NULL, dst, req, - tcp_hdr(skb), sk->sk_protocol, + tcp_hdr(skb), skb->len); } #endif @@ -2367,7 +2367,7 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 63ed9d6830e7..328e0cf42b3c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> @@ -50,7 +48,7 @@ static void tcp_write_err(struct sock *sk) sk->sk_error_report(sk); tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT); } /* Do not allow orphaned sockets to eat all our resources. @@ -91,7 +89,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (do_reset) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); return 1; } return 0; @@ -181,7 +179,7 @@ static void tcp_delack_timer(unsigned long data) if (sock_owned_by_user(sk)) { /* Try again later. 
*/ icsk->icsk_ack.blocked = 1; - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out_unlock; } @@ -200,7 +198,7 @@ static void tcp_delack_timer(unsigned long data) if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; - NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) sk->sk_backlog_rcv(sk, skb); @@ -220,7 +218,7 @@ static void tcp_delack_timer(unsigned long data) icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); - NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS); } TCP_CHECK_TIMER(sk); @@ -328,24 +326,27 @@ static void tcp_retransmit_timer(struct sock *sk) goto out; if (icsk->icsk_retransmits == 0) { + int mib_idx; + if (icsk->icsk_ca_state == TCP_CA_Disorder || icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) { if (icsk->icsk_ca_state == TCP_CA_Recovery) - NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); + mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; else - NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); + mib_idx = LINUX_MIB_TCPSACKFAILURES; } else { if (icsk->icsk_ca_state == TCP_CA_Recovery) - NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); + mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; else - NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); + mib_idx = LINUX_MIB_TCPRENOFAILURES; } } else if (icsk->icsk_ca_state == TCP_CA_Loss) { - NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); + mib_idx = LINUX_MIB_TCPLOSSFAILURES; } else { - NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); + mib_idx = LINUX_MIB_TCPTIMEOUTS; } + NET_INC_STATS_BH(sock_net(sk), mib_idx); } if (tcp_use_frto(sk)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 56fcda3694ba..1560d11ba6ef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -5,8 +5,6 @@ * * The User Datagram Protocol (UDP). * - * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. 
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> @@ -136,7 +134,7 @@ static inline int __udp_lib_lport_inuse(struct net *net, __u16 num, struct sock *sk; struct hlist_node *node; - sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)]) + sk_for_each(sk, node, &udptable[udp_hashfn(net, num)]) if (net_eq(sock_net(sk), net) && sk->sk_hash == num) return 1; return 0; @@ -176,7 +174,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, for (i = 0; i < UDP_HTABLE_SIZE; i++) { int size = 0; - head = &udptable[rover & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(net, rover)]; if (hlist_empty(head)) goto gotit; @@ -213,7 +211,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, gotit: snum = rover; } else { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(net, snum)]; sk_for_each(sk2, node, head) if (sk2->sk_hash == snum && @@ -229,7 +227,7 @@ gotit: inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { - head = &udptable[snum & (UDP_HTABLE_SIZE - 1)]; + head = &udptable[udp_hashfn(net, snum)]; sk_add_node(sk, head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } @@ -266,7 +264,7 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, int badness = -1; read_lock(&udp_hash_lock); - sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) { + sk_for_each(sk, node, &udptable[udp_hashfn(net, hnum)]) { struct inet_sock *inet = inet_sk(sk); if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && @@ -356,11 +354,12 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct hlist_head udptable[]) struct sock *sk; int harderr; int err; + struct net *net = dev_net(skb->dev); - sk = __udp4_lib_lookup(dev_net(skb->dev), iph->daddr, uh->dest, + sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, udptable); if (sk == NULL) { - ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; /* No socket for error */ } @@ -528,7 +527,8 @@ out: up->len = 0; up->pending = 0; if (!err) - UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } @@ -656,11 +656,13 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, .uli_u = { .ports = { .sport = inet->sport, .dport = dport } } }; + struct net *net = sock_net(sk); + security_sk_classify_flow(sk, &fl); - err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 1); + err = ip_route_output_flow(net, &rt, &fl, sk, 1); if (err) { if (err == -ENETUNREACH) - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); goto out; } @@ -727,7 +729,8 @@ out: * seems like overkill. 
*/ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - UDP_INC_STATS_USER(UDP_MIB_SNDBUFERRORS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_SNDBUFERRORS, is_udplite); } return err; @@ -890,7 +893,8 @@ try_again: goto out_free; if (!peeked) - UDP_INC_STATS_USER(UDP_MIB_INDATAGRAMS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); sock_recv_timestamp(msg, sk, skb); @@ -919,7 +923,7 @@ out: csum_copy_err: lock_sock(sk); if (!skb_kill_datagram(sk, skb, flags)) - UDP_INC_STATS_USER(UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INERRORS, is_udplite); release_sock(sk); if (noblock) @@ -990,7 +994,8 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) ret = (*up->encap_rcv)(sk, skb); if (ret <= 0) { - UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS, + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INDATAGRAMS, is_udplite); return -ret; } @@ -1042,15 +1047,18 @@ int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) if ((rc = sock_queue_rcv_skb(sk,skb)) < 0) { /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) - UDP_INC_STATS_BH(UDP_MIB_RCVBUFERRORS, is_udplite); + if (rc == -ENOMEM) { + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_RCVBUFERRORS, is_udplite); + atomic_inc(&sk->sk_drops); + } goto drop; } return 0; drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_udplite); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb(skb); return -1; } @@ -1061,7 +1069,7 @@ drop: * Note: called only from the BH handler context, * so we don't need to lock the hashes. */ -static int __udp4_lib_mcast_deliver(struct sk_buff *skb, +static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, struct hlist_head udptable[]) @@ -1070,7 +1078,7 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb, int dif; read_lock(&udp_hash_lock); - sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + sk = sk_head(&udptable[udp_hashfn(net, ntohs(uh->dest))]); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); if (sk) { @@ -1158,6 +1166,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], struct rtable *rt = (struct rtable*)skb->dst; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; + struct net *net = dev_net(skb->dev); /* * Validate the packet. 
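The udp.c hunks above also replace every open-coded bucket computation, num & (UDP_HTABLE_SIZE - 1), with udp_hashfn(net, num). The helper's body is defined outside this diff; the sketch below assumes it still reduces to the old mask, and exists mainly to thread struct net through to the hash choice:

#include <net/udp.h>

/*
 * Sketch of the indirection these hunks introduce; the wrapper name
 * is invented.  A plausible pre-salting body for udp_hashfn() would
 * reduce to the old mask:
 *
 *      return num & (UDP_HTABLE_SIZE - 1);
 *
 * Passing struct net at every call site is what later allows the
 * bucket choice to be perturbed per namespace without touching the
 * callers again.
 */
static struct hlist_head *sketch_udp_bucket(struct net *net,
                                            struct hlist_head udptable[],
                                            unsigned int num)
{
        return &udptable[udp_hashfn(net, num)];
}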
@@ -1180,9 +1189,10 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], goto csum_error; if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) - return __udp4_lib_mcast_deliver(skb, uh, saddr, daddr, udptable); + return __udp4_lib_mcast_deliver(net, skb, uh, + saddr, daddr, udptable); - sk = __udp4_lib_lookup(dev_net(skb->dev), saddr, uh->source, daddr, + sk = __udp4_lib_lookup(net, saddr, uh->source, daddr, uh->dest, inet_iif(skb), udptable); if (sk != NULL) { @@ -1211,7 +1221,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct hlist_head udptable[], if (udp_lib_checksum_complete(skb)) goto csum_error; - UDP_INC_STATS_BH(UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); + UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* @@ -1245,7 +1255,7 @@ csum_error: ntohs(uh->dest), ulen); drop: - UDP_INC_STATS_BH(UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); + UDP_INC_STATS_BH(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb(skb); return 0; } @@ -1255,12 +1265,11 @@ int udp_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, udp_hash, IPPROTO_UDP); } -int udp_destroy_sock(struct sock *sk) +void udp_destroy_sock(struct sock *sk) { lock_sock(sk); udp_flush_pending_frames(sk); release_sock(sk); - return 0; } /* @@ -1453,7 +1462,8 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL && udp_lib_checksum_complete(skb)) { - UDP_INC_STATS_BH(UDP_MIB_INERRORS, is_lite); + UDP_INC_STATS_BH(sock_net(sk), + UDP_MIB_INERRORS, is_lite); __skb_unlink(skb, rcvq); kfree_skb(skb); } @@ -1629,12 +1639,13 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, __u16 srcp = ntohs(inet->sport); seq_printf(f, "%4d: %08X:%04X %08X:%04X" - " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p%n", + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", bucket, src, srcp, dest, destp, sp->sk_state, atomic_read(&sp->sk_wmem_alloc), atomic_read(&sp->sk_rmem_alloc), 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), - atomic_read(&sp->sk_refcnt), sp, len); + atomic_read(&sp->sk_refcnt), sp, + atomic_read(&sp->sk_drops), len); } int udp4_seq_show(struct seq_file *seq, void *v) @@ -1643,7 +1654,7 @@ int udp4_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-127s\n", " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " - "inode"); + "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; int len; diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index 7288bf7977fb..2e9bad2fa1bc 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -26,7 +26,7 @@ extern int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, extern int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); extern int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); -extern int udp_destroy_sock(struct sock *sk); +extern void udp_destroy_sock(struct sock *sk); #ifdef CONFIG_PROC_FS extern int udp4_seq_show(struct seq_file *seq, void *v); diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 72ce26b6c4d3..4ad16b6d5138 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -1,8 +1,6 @@ /* * UDPLITE An implementation of the UDP-Lite protocol (RFC 3828). * - * Version: $Id: udplite.c,v 1.25 2006/10/19 07:22:36 gerrit Exp $ - * * Authors: Gerrit Renker <gerrit@erg.abdn.ac.uk> * * Changes:
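Finally, the udp.c hunks above start charging receive-buffer drops to sk->sk_drops and surface them as the new trailing "drops" column of /proc/net/udp (the header line gains "inode ref pointer drops"). A condensed sketch of the two ends of that accounting; the function name is invented:

#include <net/udp.h>

/*
 * Charge a datagram lost to receive-buffer exhaustion, as in the
 * ENOMEM branch of udp_queue_rcv_skb() above.  The per-socket counter
 * incremented here is what udp4_format_sock() reads back with
 * atomic_read() and prints as the "drops" column; under this patch it
 * therefore reflects receive-buffer overflow only.
 */
static void sketch_charge_rcvbuf_drop(struct sock *sk, int is_udplite)
{
        UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite);
        atomic_inc(&sk->sk_drops);
}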