diff options
author | Dmitry Torokhov <dtor_core@ameritech.net> | 2006-06-26 07:31:38 +0200 |
---|---|---|
committer | Dmitry Torokhov <dtor_core@ameritech.net> | 2006-06-26 07:31:38 +0200 |
commit | 4854c7b27f0975a2b629f35ea3996d2968eb7c4f (patch) | |
tree | 4102bdb70289764a2058aff0f907b13d7cf0e0d1 /net/ipv4 | |
parent | Input: fix accuracy of fixp-arith.h (diff) | |
parent | [PATCH] uclinux: use PER_LINUX_32BIT in binfmt_flat (diff) | |
download | linux-4854c7b27f0975a2b629f35ea3996d2968eb7c4f.tar.xz linux-4854c7b27f0975a2b629f35ea3996d2968eb7c4f.zip |
Merge rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'net/ipv4')
59 files changed, 3594 insertions, 1388 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e40f75322377..da33393be45f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -414,6 +414,24 @@ config INET_TUNNEL tristate default n +config INET_XFRM_MODE_TRANSPORT + tristate "IP: IPsec transport mode" + default y + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET_XFRM_MODE_TUNNEL + tristate "IP: IPsec tunnel mode" + default y + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + config INET_DIAG tristate "INET: socket monitoring interface" default y @@ -532,6 +550,38 @@ config TCP_CONG_SCALABLE properties, though is known to have fairness issues. See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ +config TCP_CONG_LP + tristate "TCP Low Priority" + depends on EXPERIMENTAL + default n + ---help--- + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utiliza only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ + +config TCP_CONG_VENO + tristate "TCP Veno" + depends on EXPERIMENTAL + default n + ---help--- + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + +config TCP_CONG_COMPOUND + tristate "TCP Compound" + depends on EXPERIMENTAL + default n + ---help--- + TCP Compound is a sender-side only change to TCP that uses + a mixed Reno/Vegas approach to calculate the cwnd. + For further details look here: + ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9ef50a0b9d2c..38b8039bdd55 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o +obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o +obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o @@ -34,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o @@ -41,7 +44,10 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o +obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o +obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0a277453526b..461216b47948 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -68,6 +68,7 @@ */ #include <linux/config.h> +#include <linux/err.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> @@ -1096,6 +1097,54 @@ int inet_sk_rebuild_header(struct sock *sk) EXPORT_SYMBOL(inet_sk_rebuild_header); +static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct iphdr *iph; + struct net_protocol *ops; + int proto; + int ihl; + int id; + + if (!pskb_may_pull(skb, sizeof(*iph))) + goto out; + + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (ihl < sizeof(*iph)) + goto out; + + if (!pskb_may_pull(skb, ihl)) + goto out; + + skb->h.raw = __skb_pull(skb, ihl); + iph = skb->nh.iph; + id = ntohs(iph->id); + proto = iph->protocol & (MAX_INET_PROTOS - 1); + segs = ERR_PTR(-EPROTONOSUPPORT); + + rcu_read_lock(); + ops = rcu_dereference(inet_protos[proto]); + if (ops && ops->gso_segment) + segs = ops->gso_segment(skb, sg); + rcu_read_unlock(); + + if (IS_ERR(segs)) + goto out; + + skb = segs; + do { + iph = skb->nh.iph; + iph->id = htons(id++); + iph->tot_len = htons(skb->len - skb->mac_len); + iph->check = 0; + iph->check = ip_fast_csum(skb->nh.raw, iph->ihl); + } while ((skb = skb->next)); + +out: + return segs; +} + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, @@ -1105,6 +1154,7 @@ static struct net_protocol igmp_protocol = { static struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, + .gso_segment = tcp_tso_segment, .no_policy = 1, }; @@ -1150,6 +1200,7 @@ static int ipv4_proc_init(void); static struct packet_type ip_packet_type = { .type = __constant_htons(ETH_P_IP), .func = ip_rcv, + .gso_segment = inet_gso_segment, }; static int __init inet_init(void) diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e2e4771fa4c6..c7782230080d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -119,6 +119,7 @@ error: static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; + int ihl; struct iphdr *iph; struct ip_auth_hdr *ah; struct ah_data *ahp; @@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) ah = (struct ip_auth_hdr*)skb->data; iph = skb->nh.iph; - memcpy(work_buf, iph, iph->ihl*4); + ihl = skb->data - skb->nh.raw; + memcpy(work_buf, iph, ihl); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; - if (iph->ihl != 5) { + if (ihl > sizeof(*iph)) { u32 dummy; if (ip_clear_mutable_options(iph, &dummy)) goto out; @@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) u8 auth_data[MAX_AH_AUTH_LEN]; memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, ihl); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { x->stats.integrity_failed++; @@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) } } ((struct iphdr*)work_buf)->protocol = ah->nexthdr; - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, work_buf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); - skb_pull(skb, skb->nh.iph->ihl*4); - skb->h.raw = skb->data; + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); + __skb_pull(skb, ah_hlen + ihl); return 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9d1881c07a32..9bbdd4494551 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) int alen = esp->auth.icv_trunc_len; int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; int nfrags; - int encap_len = 0; + int ihl; u8 nexthdr[2]; struct scatterlist *sg; - u8 workbuf[60]; int padlen; if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) @@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr*)skb->data; - iph = skb->nh.iph; /* Get ivec. This can be wrong, check against another impls. */ if (esp->conf.ivlen) @@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. :-) */ + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh; - - uh = (struct udphdr *)(iph + 1); - encap_len = (void*)esph - (void*)uh; + struct udphdr *uh = (void *)(skb->nh.raw + ihl); /* * 1) if the NAT-T peer's IP or port changed then @@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); - memcpy(workbuf, skb->nh.raw, iph->ihl*4); - skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, workbuf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; return 0; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cdde96390960..31387abf53a2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -666,3 +666,4 @@ void __init ip_fib_init(void) } EXPORT_SYMBOL(inet_addr_type); +EXPORT_SYMBOL(ip_dev_find); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2a0455911ee0..017900172f7d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -730,7 +730,6 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { struct iphdr *iph; - unsigned long ip; if (skb->len < sizeof(struct iphdr)) goto out_err; @@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb) goto out; iph = (struct iphdr *)skb->data; - ip = iph->daddr; switch (skb->h.icmph->code & 7) { case ICMP_REDIR_NET: @@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb) */ case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, + ip_rt_redirect(skb->nh.iph->saddr, iph->daddr, + skb->h.icmph->un.gateway, iph->saddr, skb->dev); break; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d512239a1473..ab680c851aa2 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) } seq_printf(seq, - "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_to_clock_t(im->timer.expires-jiffies) : 0, diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 0923add122b4..9f0bb529ab70 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -116,6 +116,7 @@ sr_failed: too_many_hops: /* Tell the sender its packet died... */ + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); drop: kfree_skb(skb); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cff9c3a72daf..7624fd1d8f9f 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -210,8 +210,7 @@ static inline int ip_finish_output(struct sk_buff *skb) return dst_output(skb); } #endif - if (skb->len > dst_mtu(skb->dst) && - !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size)) + if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->gso_size) return ip_fragment(skb, ip_finish_output2); else return ip_finish_output2(skb); @@ -362,7 +361,7 @@ packet_routed: } ip_select_ident_more(iph, &rt->u.dst, sk, - (skb_shinfo(skb)->tso_segs ?: 1) - 1); + (skb_shinfo(skb)->gso_segs ?: 1) - 1); /* Add an IP checksum. */ ip_send_check(iph); @@ -410,6 +409,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) nf_bridge_get(to->nf_bridge); #endif #endif + skb_copy_secmark(to, from); } /* @@ -743,7 +743,8 @@ static inline int ip_ufo_append_data(struct sock *sk, (length - transhdrlen)); if (!err) { /* specify the length of each IP datagram fragment*/ - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; __skb_queue_tail(&sk->sk_write_queue, skb); return 0; @@ -839,7 +840,7 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + rt->u.dst.dev->features & NETIF_F_ALL_CSUM && !exthdrlen) csummode = CHECKSUM_HW; @@ -1086,14 +1087,16 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, inet->cork.length += size; if ((sk->sk_protocol == IPPROTO_UDP) && - (rt->u.dst.dev->features & NETIF_F_UFO)) - skb_shinfo(skb)->ufo_size = (mtu - fragheaderlen); + (rt->u.dst.dev->features & NETIF_F_UFO)) { + skb_shinfo(skb)->gso_size = mtu - fragheaderlen; + skb_shinfo(skb)->gso_type = SKB_GSO_UDPV4; + } while (size > 0) { int i; - if (skb_shinfo(skb)->ufo_size) + if (skb_shinfo(skb)->gso_size) len = size; else { diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 95278b22b669..3ed8b57a1002 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list); static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { int err, plen, dlen; - struct iphdr *iph; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; struct crypto_tfm *tfm; @@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); - iph = skb->nh.iph; - iph->tot_len = htons(dlen + iph->ihl * 4); out: put_cpu(); return err; @@ -83,34 +80,21 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - u8 nexthdr; - int err = 0; + int err = -ENOMEM; struct iphdr *iph; - union { - struct iphdr iph; - char buf[60]; - } tmp_iph; - + struct ip_comp_hdr *ipch; - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { - err = -ENOMEM; + if (skb_linearize_cow(skb)) goto out; - } skb->ip_summed = CHECKSUM_NONE; /* Remove ipcomp header and decompress original payload */ iph = skb->nh.iph; - memcpy(&tmp_iph, iph, iph->ihl * 4); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ip_comp_hdr)); - skb->nh.raw += sizeof(struct ip_comp_hdr); - memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); - iph = skb->nh.iph; - iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); - iph->protocol = nexthdr; - skb->h.raw = skb->data; + ipch = (void *)skb->data; + iph->protocol = ipch->nexthdr; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); err = ipcomp_decompress(x, skb); out: @@ -171,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; } - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { + if (skb_linearize_cow(skb)) goto out_ok; - } err = ipcomp_compress(x, skb); iph = skb->nh.iph; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 3d560dec63ab..e1d7f5fbc526 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config IP_NF_CONNTRACK_SECMARK + bool 'Connection tracking security mark support' + depends on IP_NF_CONNTRACK && NETWORK_SECMARK + help + This option enables security markings to be applied to + connections. Typically they are copied to connections from + packets using the CONNSECMARK target and copied back from + connections to packets with the same target, with the packets + being originally labeled via SECMARK. + + If unsure, say 'N'. + config IP_NF_CONNTRACK_EVENTS bool "Connection tracking events (EXPERIMENTAL)" depends on EXPERIMENTAL && IP_NF_CONNTRACK @@ -142,6 +154,8 @@ config IP_NF_TFTP config IP_NF_AMANDA tristate "Amanda backup protocol support" depends on IP_NF_CONNTRACK + select TEXTSEARCH + select TEXTSEARCH_KMP help If you are running the Amanda backup package <http://www.amanda.org/> on this machine or machines that will be MASQUERADED through this @@ -170,8 +184,8 @@ config IP_NF_PPTP Documentation/modules.txt. If unsure, say `N'. config IP_NF_H323 - tristate 'H.323 protocol support' - depends on IP_NF_CONNTRACK + tristate 'H.323 protocol support (EXPERIMENTAL)' + depends on IP_NF_CONNTRACK && EXPERIMENTAL help H.323 is a VoIP signalling protocol from ITU-T. As one of the most important VoIP protocols, it is widely used by voice hardware and @@ -181,14 +195,26 @@ config IP_NF_H323 With this module you can support H.323 on a connection tracking/NAT firewall. - This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP - and T.120 based data and applications including audio, video, FAX, - chat, whiteboard, file transfer, etc. For more information, please - see http://nath323.sourceforge.net/. + This module supports RAS, Fast Start, H.245 Tunnelling, Call + Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, + whiteboard, file transfer, etc. For more information, please + visit http://nath323.sourceforge.net/. If you want to compile it as a module, say 'M' here and read Documentation/modules.txt. If unsure, say 'N'. +config IP_NF_SIP + tristate "SIP protocol support (EXPERIMENTAL)" + depends on IP_NF_CONNTRACK && EXPERIMENTAL + help + SIP is an application-layer control protocol that can establish, + modify, and terminate multimedia sessions (conferences) such as + Internet telephony calls. With the ip_conntrack_sip and + the ip_nat_sip modules you can support the protocol on a connection + tracking/NATing firewall. + + To compile it as a module, choose M here. If unsure, say Y. + config IP_NF_QUEUE tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help @@ -501,6 +527,12 @@ config IP_NF_NAT_H323 default IP_NF_NAT if IP_NF_H323=y default m if IP_NF_H323=m +config IP_NF_NAT_SIP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_SIP=y + default m if IP_NF_SIP=m + # mangle + specific targets config IP_NF_MANGLE tristate "Packet mangling" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 461cb1eb5de7..3ded4a3af59c 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o +obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o # NAT helpers @@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o +obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index a604b1ccfdaa..0a7bd7f04061 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -17,33 +17,29 @@ * this value. * */ - -#include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> -#include <linux/netfilter.h> -#include <linux/ip.h> #include <linux/moduleparam.h> +#include <linux/textsearch.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> #include <linux/udp.h> -#include <net/checksum.h> -#include <net/udp.h> +#include <linux/netfilter.h> #include <linux/netfilter_ipv4/ip_conntrack_helper.h> #include <linux/netfilter_ipv4/ip_conntrack_amanda.h> static unsigned int master_timeout = 300; +static char *ts_algo = "kmp"; MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); MODULE_DESCRIPTION("Amanda connection tracking module"); MODULE_LICENSE("GPL"); module_param(master_timeout, uint, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); - -static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; - -/* This is slow, but it's simple. --RR */ -static char *amanda_buffer; -static DEFINE_SPINLOCK(amanda_buffer_lock); +module_param(ts_algo, charp, 0400); +MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, @@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, struct ip_conntrack_expect *exp); EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); +enum amanda_strings { + SEARCH_CONNECT, + SEARCH_NEWLINE, + SEARCH_DATA, + SEARCH_MESG, + SEARCH_INDEX, +}; + +static struct { + char *string; + size_t len; + struct ts_config *ts; +} search[] = { + [SEARCH_CONNECT] = { + .string = "CONNECT ", + .len = 8, + }, + [SEARCH_NEWLINE] = { + .string = "\n", + .len = 1, + }, + [SEARCH_DATA] = { + .string = "DATA ", + .len = 5, + }, + [SEARCH_MESG] = { + .string = "MESG ", + .len = 5, + }, + [SEARCH_INDEX] = { + .string = "INDEX ", + .len = 6, + }, +}; + static int help(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { + struct ts_state ts; struct ip_conntrack_expect *exp; - char *data, *data_limit, *tmp; - unsigned int dataoff, i; + unsigned int dataoff, start, stop, off, i; + char pbuf[sizeof("65535")], *tmp; u_int16_t port, len; int ret = NF_ACCEPT; @@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb, return NF_ACCEPT; } - spin_lock_bh(&amanda_buffer_lock); - skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); - data = amanda_buffer; - data_limit = amanda_buffer + (*pskb)->len - dataoff; - *data_limit = '\0'; - - /* Search for the CONNECT string */ - data = strstr(data, "CONNECT "); - if (!data) + memset(&ts, 0, sizeof(ts)); + start = skb_find_text(*pskb, dataoff, (*pskb)->len, + search[SEARCH_CONNECT].ts, &ts); + if (start == UINT_MAX) goto out; - data += strlen("CONNECT "); + start += dataoff + search[SEARCH_CONNECT].len; - /* Only search first line. */ - if ((tmp = strchr(data, '\n'))) - *tmp = '\0'; + memset(&ts, 0, sizeof(ts)); + stop = skb_find_text(*pskb, start, (*pskb)->len, + search[SEARCH_NEWLINE].ts, &ts); + if (stop == UINT_MAX) + goto out; + stop += start; - for (i = 0; i < ARRAY_SIZE(conns); i++) { - char *match = strstr(data, conns[i]); - if (!match) + for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { + memset(&ts, 0, sizeof(ts)); + off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); + if (off == UINT_MAX) continue; - tmp = data = match + strlen(conns[i]); - port = simple_strtoul(data, &data, 10); - len = data - tmp; + off += start + search[i].len; + + len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); + if (skb_copy_bits(*pskb, off, pbuf, len)) + break; + pbuf[len] = '\0'; + + port = simple_strtoul(pbuf, &tmp, 10); + len = tmp - pbuf; if (port == 0 || len > 5) break; @@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb, exp->mask.dst.u.tcp.port = 0xFFFF; if (ip_nat_amanda_hook) - ret = ip_nat_amanda_hook(pskb, ctinfo, - tmp - amanda_buffer, + ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff, len, exp); else if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; @@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb, } out: - spin_unlock_bh(&amanda_buffer_lock); return ret; } static struct ip_conntrack_helper amanda_helper = { - .max_expected = ARRAY_SIZE(conns), + .max_expected = 3, .timeout = 180, .me = THIS_MODULE, .help = help, @@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = { static void __exit ip_conntrack_amanda_fini(void) { + int i; + ip_conntrack_helper_unregister(&amanda_helper); - kfree(amanda_buffer); + for (i = 0; i < ARRAY_SIZE(search); i++) + textsearch_destroy(search[i].ts); } static int __init ip_conntrack_amanda_init(void) { - int ret; - - amanda_buffer = kmalloc(65536, GFP_KERNEL); - if (!amanda_buffer) - return -ENOMEM; - - ret = ip_conntrack_helper_register(&amanda_helper); - if (ret < 0) { - kfree(amanda_buffer); - return ret; + int ret, i; + + ret = -ENOMEM; + for (i = 0; i < ARRAY_SIZE(search); i++) { + search[i].ts = textsearch_prepare(ts_algo, search[i].string, + search[i].len, + GFP_KERNEL, TS_AUTOLOAD); + if (search[i].ts == NULL) + goto err; } + ret = ip_conntrack_helper_register(&amanda_helper); + if (ret < 0) + goto err; return 0; - +err: + for (; i >= 0; i--) { + if (search[i].ts) + textsearch_destroy(search[i].ts); + } + return ret; } module_init(ip_conntrack_amanda_init); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 979a2eac6f00..7e4cf9a4d15f 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -724,6 +724,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple, /* this is ugly, but there is no other place where to put it */ conntrack->nat.masq_index = exp->master->nat.masq_index; #endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + conntrack->secmark = exp->master->secmark; +#endif nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); } else { @@ -1130,6 +1133,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct, write_lock_bh(&ip_conntrack_lock); + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + write_unlock_bh(&ip_conntrack_lock); + return; + } + /* If not in hash table, timer will not be active yet */ if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; @@ -1318,6 +1327,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u.tcp.port; sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] .tuple.dst.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 3e542bf28a9d..4dcf526c3944 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char); static int try_epsv_response(const char *, size_t, u_int32_t [], char); static const struct ftp_search { - enum ip_conntrack_dir dir; const char *pattern; size_t plen; char skip; char term; enum ip_ct_ftp_type ftptype; int (*getnum)(const char *, size_t, u_int32_t[], char); -} search[] = { - { - IP_CT_DIR_ORIGINAL, - "PORT", sizeof("PORT") - 1, ' ', '\r', - IP_CT_FTP_PORT, - try_rfc959, +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_EPRT, + .getnum = try_eprt, + }, }, - { - IP_CT_DIR_REPLY, - "227 ", sizeof("227 ") - 1, '(', ')', - IP_CT_FTP_PASV, - try_rfc959, - }, - { - IP_CT_DIR_ORIGINAL, - "EPRT", sizeof("EPRT") - 1, ' ', '\r', - IP_CT_FTP_EPRT, - try_eprt, - }, - { - IP_CT_DIR_REPLY, - "229 ", sizeof("229 ") - 1, '(', ')', - IP_CT_FTP_EPSV, - try_epsv_response, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, }, }; @@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb, array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; - for (i = 0; i < ARRAY_SIZE(search); i++) { - if (search[i].dir != dir) continue; - + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { found = find_pattern(fb_ptr, (*pskb)->len - dataoff, - search[i].pattern, - search[i].plen, - search[i].skip, - search[i].term, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, &matchoff, &matchlen, array, - search[i].getnum); + search[dir][i].getnum); if (found) break; } if (found == -1) { @@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb, this case. */ if (net_ratelimit()) printk("conntrack_ftp: partial %s %u+%u\n", - search[i].pattern, + search[dir][i].pattern, ntohl(th->seq), datalen); ret = NF_DROP; goto out; @@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb, /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ if (ip_nat_ftp_hook) - ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. */ diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 518f581d39ec..0665674218c6 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -22,6 +22,8 @@ #include <linux/netfilter_ipv4/ip_conntrack_tuple.h> #include <linux/netfilter_ipv4/ip_conntrack_h323.h> #include <linux/moduleparam.h> +#include <linux/ctype.h> +#include <linux/inet.h> #if 0 #define DEBUGP printk @@ -38,6 +40,12 @@ static int gkrouted_only = 1; module_param(gkrouted_only, int, 0600); MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); +static int callforward_filter = 1; +module_param(callforward_filter, bool, 0600); +MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " + "if both endpoints are on different sides " + "(determined by routing information)"); + /* Hooks for NAT */ int (*set_h245_addr_hook) (struct sk_buff ** pskb, unsigned char **data, int dataoff, @@ -77,6 +85,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb, unsigned char **data, int dataoff, TransportAddress * addr, u_int16_t port, struct ip_conntrack_expect * exp); +int (*nat_callforwarding_hook) (struct sk_buff ** pskb, + struct ip_conntrack * ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, u_int16_t port, + struct ip_conntrack_expect * exp); int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct, enum ip_conntrack_info ctinfo, @@ -683,6 +697,92 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct, return ret; } +/* Forwarding declaration */ +void ip_conntrack_q931_expect(struct ip_conntrack *new, + struct ip_conntrack_expect *this); + +/****************************************************************************/ +static int expect_callforwarding(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + u_int32_t ip; + u_int16_t port; + struct ip_conntrack_expect *exp = NULL; + + /* Read alternativeAddress */ + if (!get_h225_addr(*data, addr, &ip, &port) || port == 0) + return 0; + + /* If the calling party is on the same side of the forward-to party, + * we don't need to track the second call */ + if (callforward_filter) { + struct rtable *rt1, *rt2; + struct flowi fl1 = { + .fl4_dst = ip, + }; + struct flowi fl2 = { + .fl4_dst = ct->tuplehash[!dir].tuple.src.ip, + }; + + if (ip_route_output_key(&rt1, &fl1) == 0) { + if (ip_route_output_key(&rt2, &fl2) == 0) { + if (rt1->rt_gateway == rt2->rt_gateway && + rt1->u.dst.dev == rt2->u.dst.dev) + ret = 1; + dst_release(&rt2->u.dst); + } + dst_release(&rt1->u.dst); + } + if (ret) { + DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); + return 0; + } + } + + /* Create expect for the second call leg */ + if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) + return -1; + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.ip = ip; + exp->tuple.dst.u.tcp.port = htons(port); + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.tcp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.tcp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + exp->flags = 0; + + if (ct->tuplehash[dir].tuple.src.ip != + ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) { + /* Need NAT */ + ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff, + addr, port, exp); + } else { /* Conntrack only */ + exp->expectfn = ip_conntrack_q931_expect; + + if (ip_conntrack_expect_related(exp) == 0) { + DEBUGP("ip_ct_q931: expect Call Forwarding " + "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", + NIPQUAD(exp->tuple.src.ip), + ntohs(exp->tuple.src.u.tcp.port), + NIPQUAD(exp->tuple.dst.ip), + ntohs(exp->tuple.dst.u.tcp.port)); + } else + ret = -1; + } + + ip_conntrack_expect_put(exp); + + return ret; +} + /****************************************************************************/ static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, @@ -878,6 +978,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct, DEBUGP("ip_ct_q931: Facility\n"); + if (facility->reason.choice == eFacilityReason_callForwarded) { + if (facility->options & eFacility_UUIE_alternativeAddress) + return expect_callforwarding(pskb, ct, ctinfo, data, + dataoff, + &facility-> + alternativeAddress); + return 0; + } + if (facility->options & eFacility_UUIE_h245Address) { ret = expect_h245(pskb, ct, ctinfo, data, dataoff, &facility->h245Address); @@ -1677,7 +1786,6 @@ static int __init init(void) fini(); return ret; } - DEBUGP("ip_ct_h323: init success\n"); return 0; } @@ -1696,6 +1804,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook); EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); EXPORT_SYMBOL_GPL(nat_t120_hook); EXPORT_SYMBOL_GPL(nat_h245_hook); +EXPORT_SYMBOL_GPL(nat_callforwarding_hook); EXPORT_SYMBOL_GPL(nat_q931_hook); MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c index 022c47b9f6c9..4b359618bedd 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c @@ -1,4 +1,4 @@ -/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 +/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 * * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> * @@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ static field_t _Facility_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, - {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0, - _TransportAddress}, + {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, _Facility_UUIE_alternativeAliasAddress}, {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, diff --git a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c index 7d3ba4302e9e..8ccfe17bb253 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_pptp.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_pptp.c @@ -469,8 +469,8 @@ pptp_inbound_pkt(struct sk_buff **pskb, DEBUGP("%s but no session\n", pptp_msg_name[msg]); break; } - if (info->sstate != PPTP_CALL_IN_REP - && info->sstate != PPTP_CALL_IN_CONF) { + if (info->cstate != PPTP_CALL_IN_REP + && info->cstate != PPTP_CALL_IN_CONF) { DEBUGP("%s but never sent IN_CALL_REPLY\n", pptp_msg_name[msg]); break; diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 01bd7cab9367..33891bb1fde4 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -399,38 +399,54 @@ nfattr_failure: static int ctnetlink_done(struct netlink_callback *cb) { DEBUGP("entered %s\n", __FUNCTION__); + if (cb->args[1]) + ip_conntrack_put((struct ip_conntrack *)cb->args[1]); return 0; } static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct ip_conntrack *ct = NULL; + struct ip_conntrack *ct, *last; struct ip_conntrack_tuple_hash *h; struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[1]; DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, cb->args[0], *id); read_lock_bh(&ip_conntrack_lock); - for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { +restart: + last = (struct ip_conntrack *)cb->args[1]; list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = tuplehash_to_ctrack(h); - if (ct->id <= *id) - continue; + if (last != NULL) { + if (ct == last) { + ip_conntrack_put(last); + cb->args[1] = 0; + last = NULL; + } else + continue; + } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, IPCTNL_MSG_CT_NEW, - 1, ct) < 0) + 1, ct) < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; goto out; - *id = ct->id; + } + } + if (last != NULL) { + ip_conntrack_put(last); + cb->args[1] = 0; + goto restart; } } -out: +out: read_unlock_bh(&ip_conntrack_lock); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); @@ -629,7 +645,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { }; static inline int -ctnetlink_parse_nat(struct nfattr *cda[], +ctnetlink_parse_nat(struct nfattr *nat, const struct ip_conntrack *ct, struct ip_nat_range *range) { struct nfattr *tb[CTA_NAT_MAX]; @@ -639,7 +655,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + nfattr_parse_nested(tb, CTA_NAT_MAX, nat); if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) return -EINVAL; @@ -854,39 +870,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) /* ASSURED bit can only be set */ return -EINVAL; - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { #ifndef CONFIG_IP_NF_NAT_NEEDED return -EINVAL; #else - unsigned int hooknum; struct ip_nat_range range; - if (ctnetlink_parse_nat(cda, ct, &range) < 0) - return -EINVAL; - - DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", - NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), - htons(range.min.all), htons(range.max.all)); - - /* This is tricky but it works. ip_nat_setup_info needs the - * hook number as parameter, so let's do the correct - * conversion and run away */ - if (status & IPS_SRC_NAT_DONE) - hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ - else if (status & IPS_DST_NAT_DONE) - hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ - else - return -EINVAL; /* Missing NAT flags */ - - DEBUGP("NAT status: %lu\n", - status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - - if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - return -EEXIST; - ip_nat_setup_info(ct, &range, hooknum); - - DEBUGP("NAT status after setup_info: %lu\n", - ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + if (cda[CTA_NAT_DST-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_PRE_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + } + if (cda[CTA_NAT_SRC-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_POST_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + } #endif } @@ -1106,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, /* implicit 'else' */ /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { err = -EINVAL; goto out_unlock; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 56794797d55b..21ee124c0463 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, } /* look up the source key for a given tuple */ -static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) +static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t) { struct ip_ct_gre_keymap *km; - u_int32_t key = 0; + __be16 key = 0; read_lock_bh(&ip_ct_gre_lock); km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, @@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, struct ip_conntrack_tuple *tuple) { struct gre_hdr_pptp _pgrehdr, *pgrehdr; - u_int32_t srckey; + __be16 srckey; struct gre_hdr _grehdr, *grehdr; /* first only delinearize old RFC1701 GRE header */ diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index d8b14a9010a6..23f1c504586d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, } /* See ip_conntrack_proto_tcp.c */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 062b252b58ad..c5c2ce5cdeb8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb, * and moreover root might send raw packets. */ /* FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 70899868783b..9b2c16b4d2ff 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, * because the semantic of CHECKSUM_HW is different there * and moreover root might send raw packets. * FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { if (LOG_INVALID(IPPROTO_UDP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c new file mode 100644 index 000000000000..fc87ce0da40d --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_sip.c @@ -0,0 +1,471 @@ +/* SIP extension for IP connection tracking. + * + * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> + * based on RR's ip_conntrack_ftp.c and other modules. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_sip.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); +MODULE_DESCRIPTION("SIP connection tracking helper"); + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of sip servers"); + +static unsigned int sip_timeout = SIP_TIMEOUT; +module_param(sip_timeout, uint, 0600); +MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); + +unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr); +EXPORT_SYMBOL_GPL(ip_nat_sip_hook); + +unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr); +EXPORT_SYMBOL_GPL(ip_nat_sdp_hook); + +int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo); +EXPORT_SYMBOL_GPL(ct_sip_get_info); + + +static int digits_len(const char *dptr, const char *limit, int *shift); +static int epaddr_len(const char *dptr, const char *limit, int *shift); +static int skp_digits_len(const char *dptr, const char *limit, int *shift); +static int skp_epaddr_len(const char *dptr, const char *limit, int *shift); + +struct sip_header_nfo ct_sip_hdrs[] = { + { /* Via header */ + .lname = "Via:", + .lnlen = sizeof("Via:") - 1, + .sname = "\r\nv:", + .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ + .ln_str = "UDP ", + .ln_strlen = sizeof("UDP ") - 1, + .match_len = epaddr_len, + }, + { /* Contact header */ + .lname = "Contact:", + .lnlen = sizeof("Contact:") - 1, + .sname = "\r\nm:", + .snlen = sizeof("\r\nm:") - 1, + .ln_str = "sip:", + .ln_strlen = sizeof("sip:") - 1, + .match_len = skp_epaddr_len + }, + { /* Content length header */ + .lname = "Content-Length:", + .lnlen = sizeof("Content-Length:") - 1, + .sname = "\r\nl:", + .snlen = sizeof("\r\nl:") - 1, + .ln_str = ":", + .ln_strlen = sizeof(":") - 1, + .match_len = skp_digits_len + }, + { /* SDP media info */ + .lname = "\nm=", + .lnlen = sizeof("\nm=") - 1, + .sname = "\rm=", + .snlen = sizeof("\rm=") - 1, + .ln_str = "audio ", + .ln_strlen = sizeof("audio ") - 1, + .match_len = digits_len + }, + { /* SDP owner address*/ + .lname = "\no=", + .lnlen = sizeof("\no=") - 1, + .sname = "\ro=", + .snlen = sizeof("\ro=") - 1, + .ln_str = "IN IP4 ", + .ln_strlen = sizeof("IN IP4 ") - 1, + .match_len = epaddr_len + }, + { /* SDP connection info */ + .lname = "\nc=", + .lnlen = sizeof("\nc=") - 1, + .sname = "\rc=", + .snlen = sizeof("\rc=") - 1, + .ln_str = "IN IP4 ", + .ln_strlen = sizeof("IN IP4 ") - 1, + .match_len = epaddr_len + }, + { /* Requests headers */ + .lname = "sip:", + .lnlen = sizeof("sip:") - 1, + .sname = "sip:", + .snlen = sizeof("sip:") - 1, /* yes, i know.. ;) */ + .ln_str = "@", + .ln_strlen = sizeof("@") - 1, + .match_len = epaddr_len + }, + { /* SDP version header */ + .lname = "\nv=", + .lnlen = sizeof("\nv=") - 1, + .sname = "\rv=", + .snlen = sizeof("\rv=") - 1, + .ln_str = "=", + .ln_strlen = sizeof("=") - 1, + .match_len = digits_len + } +}; +EXPORT_SYMBOL_GPL(ct_sip_hdrs); + +/* get line lenght until first CR or LF seen. */ +int ct_sip_lnlen(const char *line, const char *limit) +{ + const char *k = line; + + while ((line <= limit) && (*line == '\r' || *line == '\n')) + line++; + + while (line <= limit) { + if (*line == '\r' || *line == '\n') + break; + line++; + } + return line - k; +} +EXPORT_SYMBOL_GPL(ct_sip_lnlen); + +/* Linear string search, case sensitive. */ +const char *ct_sip_search(const char *needle, const char *haystack, + size_t needle_len, size_t haystack_len) +{ + const char *limit = haystack + (haystack_len - needle_len); + + while (haystack <= limit) { + if (memcmp(haystack, needle, needle_len) == 0) + return haystack; + haystack++; + } + return NULL; +} +EXPORT_SYMBOL_GPL(ct_sip_search); + +static int digits_len(const char *dptr, const char *limit, int *shift) +{ + int len = 0; + while (dptr <= limit && isdigit(*dptr)) { + dptr++; + len++; + } + return len; +} + +/* get digits lenght, skiping blank spaces. */ +static int skp_digits_len(const char *dptr, const char *limit, int *shift) +{ + for (; dptr <= limit && *dptr == ' '; dptr++) + (*shift)++; + + return digits_len(dptr, limit, shift); +} + +/* Simple ipaddr parser.. */ +static int parse_ipaddr(const char *cp, const char **endp, + u_int32_t *ipaddr, const char *limit) +{ + unsigned long int val; + int i, digit = 0; + + for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) { + digit = 0; + if (!isdigit(*cp)) + break; + + val = simple_strtoul(cp, (char **)&cp, 10); + if (val > 0xFF) + return -1; + + ((u_int8_t *)ipaddr)[i] = val; + digit = 1; + + if (*cp != '.') + break; + cp++; + } + if (!digit) + return -1; + + if (endp) + *endp = cp; + + return 0; +} + +/* skip ip address. returns it lenght. */ +static int epaddr_len(const char *dptr, const char *limit, int *shift) +{ + const char *aux = dptr; + u_int32_t ip; + + if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) { + DEBUGP("ip: %s parse failed.!\n", dptr); + return 0; + } + + /* Port number */ + if (*dptr == ':') { + dptr++; + dptr += digits_len(dptr, limit, shift); + } + return dptr - aux; +} + +/* get address length, skiping user info. */ +static int skp_epaddr_len(const char *dptr, const char *limit, int *shift) +{ + int s = *shift; + + for (; dptr <= limit && *dptr != '@'; dptr++) + (*shift)++; + + if (*dptr == '@') { + dptr++; + (*shift)++; + } else + *shift = s; + + return epaddr_len(dptr, limit, shift); +} + +/* Returns 0 if not found, -1 error parsing. */ +int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo) +{ + const char *limit, *aux, *k = dptr; + int shift = 0; + + limit = dptr + (dlen - hnfo->lnlen); + + while (dptr <= limit) { + if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && + (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { + dptr++; + continue; + } + aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, + ct_sip_lnlen(dptr, limit)); + if (!aux) { + DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, + hnfo->lname); + return -1; + } + aux += hnfo->ln_strlen; + + *matchlen = hnfo->match_len(aux, limit, &shift); + if (!*matchlen) + return -1; + + *matchoff = (aux - k) + shift; + + DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, + *matchlen); + return 1; + } + DEBUGP("%s header not found.\n", hnfo->lname); + return 0; +} + +static int set_expected_rtp(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u_int32_t ipaddr, u_int16_t port, + const char *dptr) +{ + struct ip_conntrack_expect *exp; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + int ret; + + exp = ip_conntrack_expect_alloc(ct); + if (exp == NULL) + return NF_DROP; + + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.src.u.udp.port = 0; + exp->tuple.dst.ip = ipaddr; + exp->tuple.dst.u.udp.port = htons(port); + exp->tuple.dst.protonum = IPPROTO_UDP; + + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.udp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.udp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + + exp->expectfn = NULL; + exp->flags = 0; + + if (ip_nat_sdp_hook) + ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr); + else { + if (ip_conntrack_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + ip_conntrack_expect_put(exp); + + return ret; +} + +static int sip_help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const char *dptr; + int ret = NF_ACCEPT; + int matchoff, matchlen; + u_int32_t ipaddr; + u_int16_t port; + + /* No Data ? */ + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + if (dataoff >= (*pskb)->len) { + DEBUGP("skb->len = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + + ip_ct_refresh(ct, *pskb, sip_timeout * HZ); + + if (!skb_is_nonlinear(*pskb)) + dptr = (*pskb)->data + dataoff; + else { + DEBUGP("Copy of skbuff not supported yet.\n"); + goto out; + } + + if (ip_nat_sip_hook) { + if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) { + ret = NF_DROP; + goto out; + } + } + + /* After this point NAT, could have mangled skb, so + we need to recalculate payload lenght. */ + datalen = (*pskb)->len - dataoff; + + if (datalen < (sizeof("SIP/2.0 200") - 1)) + goto out; + + /* RTP info only in some SDP pkts */ + if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && + memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { + goto out; + } + /* Get ip and port address from SDP packet. */ + if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, + &ct_sip_hdrs[POS_CONNECTION]) > 0) { + + /* We'll drop only if there are parse problems. */ + if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr, + dptr + datalen) < 0) { + ret = NF_DROP; + goto out; + } + if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, + &ct_sip_hdrs[POS_MEDIA]) > 0) { + + port = simple_strtoul(dptr + matchoff, NULL, 10); + if (port < 1024) { + ret = NF_DROP; + goto out; + } + ret = set_expected_rtp(pskb, ct, ctinfo, + ipaddr, port, dptr); + } + } +out: + return ret; +} + +static struct ip_conntrack_helper sip[MAX_PORTS]; +static char sip_names[MAX_PORTS][10]; + +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("unregistering helper for port %d\n", ports[i]); + ip_conntrack_helper_unregister(&sip[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = SIP_PORT; + + for (i = 0; i < ports_c; i++) { + /* Create helper structure */ + memset(&sip[i], 0, sizeof(struct ip_conntrack_helper)); + + sip[i].tuple.dst.protonum = IPPROTO_UDP; + sip[i].tuple.src.u.udp.port = htons(ports[i]); + sip[i].mask.src.u.udp.port = 0xFFFF; + sip[i].mask.dst.protonum = 0xFF; + sip[i].max_expected = 1; + sip[i].timeout = 3 * 60; /* 3 minutes */ + sip[i].me = THIS_MODULE; + sip[i].help = sip_help; + + tmpname = &sip_names[i][0]; + if (ports[i] == SIP_PORT) + sprintf(tmpname, "sip"); + else + sprintf(tmpname, "sip-%d", i); + sip[i].name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret = ip_conntrack_helper_register(&sip[i]); + if (ret) { + printk("ERROR registering helper for port %d\n", + ports[i]); + fini(); + return ret; + } + } + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 929d61f7be91..88445aac3f28 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + if (seq_printf(s, "secmark=%u ", conntrack->secmark)) + return -ENOSPC; +#endif + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) return -ENOSPC; @@ -417,7 +422,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = ip_conntrack_get(*pskb, &ctinfo); - if (ct && ct->helper) { + if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { unsigned int ret; ret = ct->helper->help(pskb, ct, ctinfo); if (ret != NF_ACCEPT) @@ -564,6 +569,8 @@ extern unsigned int ip_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; +int ip_conntrack_checksum = 1; + static struct ctl_table_header *ip_ct_sysctl_header; static ctl_table ip_ct_sysctl_table[] = { @@ -592,6 +599,14 @@ static ctl_table ip_ct_sysctl_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, + .procname = "ip_conntrack_checksum", + .data = &ip_conntrack_checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "ip_conntrack_tcp_timeout_syn_sent", .data = &ip_ct_tcp_timeout_syn_sent, @@ -946,6 +961,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); +EXPORT_SYMBOL_GPL(ip_conntrack_checksum); #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c index d45663d137a7..419b878fb467 100644 --- a/net/ipv4/netfilter/ip_nat_helper_h323.c +++ b/net/ipv4/netfilter/ip_nat_helper_h323.c @@ -487,6 +487,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct, } /****************************************************************************/ +static void ip_nat_callforwarding_expect(struct ip_conntrack *new, + struct ip_conntrack_expect *this) +{ + struct ip_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(new->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; + + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = this->saved_proto; + range.min_ip = range.max_ip = this->saved_ip; + + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); + + ip_conntrack_q931_expect(new, this); +} + +/****************************************************************************/ +static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, u_int16_t port, + struct ip_conntrack_expect *exp) +{ + int dir = CTINFO2DIR(ctinfo); + u_int16_t nated_port; + + /* Set expectations for NAT */ + exp->saved_ip = exp->tuple.dst.ip; + exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->expectfn = ip_nat_callforwarding_expect; + exp->dir = !dir; + + /* Try to get same port: if not, try to change it. */ + for (nated_port = port; nated_port != 0; nated_port++) { + exp->tuple.dst.u.tcp.port = htons(nated_port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + printk("ip_nat_q931: out of TCP ports\n"); + return 0; + } + + /* Modify signal */ + if (!set_h225_addr(pskb, data, dataoff, addr, + ct->tuplehash[!dir].tuple.dst.ip, + nated_port) == 0) { + ip_conntrack_unexpect_related(exp); + return -1; + } + + /* Success */ + DEBUGP("ip_nat_q931: expect Call Forwarding " + "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", + NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), + NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); + + return 0; +} + +/****************************************************************************/ static int __init init(void) { BUG_ON(set_h245_addr_hook != NULL); @@ -496,6 +570,7 @@ static int __init init(void) BUG_ON(nat_rtp_rtcp_hook != NULL); BUG_ON(nat_t120_hook != NULL); BUG_ON(nat_h245_hook != NULL); + BUG_ON(nat_callforwarding_hook != NULL); BUG_ON(nat_q931_hook != NULL); set_h245_addr_hook = set_h245_addr; @@ -505,6 +580,7 @@ static int __init init(void) nat_rtp_rtcp_hook = nat_rtp_rtcp; nat_t120_hook = nat_t120; nat_h245_hook = nat_h245; + nat_callforwarding_hook = nat_callforwarding; nat_q931_hook = nat_q931; DEBUGP("ip_nat_h323: init success\n"); @@ -521,6 +597,7 @@ static void __exit fini(void) nat_rtp_rtcp_hook = NULL; nat_t120_hook = NULL; nat_h245_hook = NULL; + nat_callforwarding_hook = NULL; nat_q931_hook = NULL; synchronize_net(); } diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c new file mode 100644 index 000000000000..6ffba63adca2 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_sip.c @@ -0,0 +1,249 @@ +/* SIP extension for UDP NAT alteration. + * + * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> + * based on RR's ip_nat_ftp.c and other modules. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/udp.h> + +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_sip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); +MODULE_DESCRIPTION("SIP NAT helper"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +extern struct sip_header_nfo ct_sip_hdrs[]; + +static unsigned int mangle_sip_packet(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr, size_t dlen, + char *buffer, int bufflen, + struct sip_header_nfo *hnfo) +{ + unsigned int matchlen, matchoff; + + if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0) + return 0; + + if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, + matchoff, matchlen, buffer, bufflen)) + return 0; + + /* We need to reload this. Thanks Patrick. */ + *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + return 1; +} + +static unsigned int ip_nat_sip(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned int bufflen, dataoff; + u_int32_t ip; + u_int16_t port; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + ip = ct->tuplehash[!dir].tuple.dst.ip; + port = ct->tuplehash[!dir].tuple.dst.u.udp.port; + bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port)); + + /* short packet ? */ + if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1)) + return 0; + + /* Basic rules: requests and responses. */ + if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) { + const char *aux; + + if ((ctinfo) < IP_CT_IS_REPLY) { + mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, + &ct_sip_hdrs[POS_CONTACT]); + return 1; + } + + if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_VIA])) + return 0; + + /* This search should ignore case, but later.. */ + aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1, + (*pskb)->len - dataoff); + if (!aux) + return 0; + + if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"), + ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff))) + return 1; + + return mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, + &ct_sip_hdrs[POS_CONTACT]); + } + if ((ctinfo) < IP_CT_IS_REPLY) { + if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_VIA])) + return 0; + + /* Mangle Contact if exists only. - watch udp_nat_mangle()! */ + mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]); + return 1; + } + /* This mangle requests headers. */ + return mangle_sip_packet(pskb, ctinfo, ct, dptr, + ct_sip_lnlen(*dptr, + *dptr + (*pskb)->len - dataoff), + buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]); +} + +static int mangle_content_len(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char *dptr) +{ + unsigned int dataoff, matchoff, matchlen; + char buffer[sizeof("65536")]; + int bufflen; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + /* Get actual SDP lenght */ + if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, + &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) { + + /* since ct_sip_get_info() give us a pointer passing 'v=' + we need to add 2 bytes in this count. */ + int c_len = (*pskb)->len - dataoff - matchoff + 2; + + /* Now, update SDP lenght */ + if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, + &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) { + + bufflen = sprintf(buffer, "%u", c_len); + + return ip_nat_mangle_udp_packet(pskb, ct, ctinfo, + matchoff, matchlen, + buffer, bufflen); + } + } + return 0; +} + +static unsigned int mangle_sdp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + u_int32_t newip, u_int16_t port, + const char *dptr) +{ + char buffer[sizeof("nnn.nnn.nnn.nnn")]; + unsigned int dataoff, bufflen; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + /* Mangle owner and contact info. */ + bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_OWNER])) + return 0; + + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION])) + return 0; + + /* Mangle media port. */ + bufflen = sprintf(buffer, "%u", port); + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_MEDIA])) + return 0; + + return mangle_content_len(pskb, ctinfo, ct, dptr); +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int ip_nat_sdp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr) +{ + struct ip_conntrack *ct = exp->master; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + u_int32_t newip; + u_int16_t port; + + DEBUGP("ip_nat_sdp():\n"); + + /* Connection will come from reply */ + newip = ct->tuplehash[!dir].tuple.dst.ip; + + exp->tuple.dst.ip = newip; + exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same as the + this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { + exp->tuple.dst.u.udp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) + return NF_DROP; + + if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) { + ip_conntrack_unexpect_related(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_sip_hook = NULL; + ip_nat_sdp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_sip_hook); + BUG_ON(ip_nat_sdp_hook); + ip_nat_sip_hook = ip_nat_sip; + ip_nat_sdp_hook = ip_nat_sdp; + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index c33244263b90..d20d557f915a 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void) module_init(ip_nat_snmp_basic_init); module_exit(ip_nat_snmp_basic_fini); -module_param(debug, bool, 0600); +module_param(debug, int, 0600); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index cee3397ec277..706c0025ec5e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1761,7 +1761,7 @@ translate_compat_table(const char *name, goto free_newinfo; /* And one copy for every other CPU */ - for_each_cpu(i) + for_each_possible_cpu(i) if (newinfo->entries[i] && newinfo->entries[i] != entry1) memcpy(newinfo->entries[i], entry1, newinfo->size); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index aad9d28c8d71..dbc83c5d7aa6 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) struct iphdr *iph = skb->nh.iph; unsigned long hashval; u_int16_t sport, dport; - struct tcphdr *th; - struct udphdr *uh; - struct icmphdr *ih; + u_int16_t *ports; switch (iph->protocol) { case IPPROTO_TCP: - th = (void *)iph+iph->ihl*4; - sport = ntohs(th->source); - dport = ntohs(th->dest); - break; case IPPROTO_UDP: - uh = (void *)iph+iph->ihl*4; - sport = ntohs(uh->source); - dport = ntohs(uh->dest); - break; + case IPPROTO_SCTP: + case IPPROTO_DCCP: case IPPROTO_ICMP: - ih = (void *)iph+iph->ihl*4; - sport = ntohs(ih->un.echo.id); - dport = (ih->type<<8)|ih->code; + ports = (void *)iph+iph->ihl*4; + sport = ports[0]; + dport = ports[1]; break; default: if (net_ratelimit()) { diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 0bba3c2bb786..431a3ce6f7b7 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); nskb->nfmark = 0; + skb_init_secmark(nskb); tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 7c6836c4646e..92980ab8ce48 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -28,9 +28,6 @@ #include <linux/jhash.h> #include <linux/slab.h> #include <linux/vmalloc.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/sctp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/list.h> @@ -83,6 +80,7 @@ struct ipt_hashlimit_htable { /* used internally */ spinlock_t lock; /* lock for list_head */ u_int32_t rnd; /* random seed for hash */ + int rnd_initialized; struct timer_list timer; /* timer for gc */ atomic_t count; /* number entries in table */ @@ -137,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) /* initialize hash with random val at the time we allocate * the first hashtable entry */ - if (!ht->rnd) + if (!ht->rnd_initialized) { get_random_bytes(&ht->rnd, 4); + ht->rnd_initialized = 1; + } if (ht->cfg.max && atomic_read(&ht->count) >= ht->cfg.max) { @@ -217,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo) atomic_set(&hinfo->count, 0); atomic_set(&hinfo->use, 1); - hinfo->rnd = 0; + hinfo->rnd_initialized = 0; spin_lock_init(&hinfo->lock); hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); if (!hinfo->pde) { @@ -381,49 +381,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) dh->rateinfo.credit = dh->rateinfo.credit_cap; } -static inline int get_ports(const struct sk_buff *skb, int offset, - u16 ports[2]) -{ - union { - struct tcphdr th; - struct udphdr uh; - sctp_sctphdr_t sctph; - } hdr_u, *ptr_u; - - /* Must not be a fragment. */ - if (offset) - return 1; - - /* Must be big enough to read ports (both UDP and TCP have - them at the start). */ - ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); - if (!ptr_u) - return 1; - - switch (skb->nh.iph->protocol) { - case IPPROTO_TCP: - ports[0] = ptr_u->th.source; - ports[1] = ptr_u->th.dest; - break; - case IPPROTO_UDP: - ports[0] = ptr_u->uh.source; - ports[1] = ptr_u->uh.dest; - break; - case IPPROTO_SCTP: - ports[0] = ptr_u->sctph.source; - ports[1] = ptr_u->sctph.dest; - break; - default: - /* all other protocols don't supprot per-port hash - * buckets */ - ports[0] = ports[1] = 0; - break; - } - - return 0; -} - - static int hashlimit_match(const struct sk_buff *skb, const struct net_device *in, @@ -449,8 +406,22 @@ hashlimit_match(const struct sk_buff *skb, dst.src_ip = skb->nh.iph->saddr; if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { - u_int16_t ports[2]; - if (get_ports(skb, offset, ports)) { + u_int16_t _ports[2], *ports; + + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: + ports = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_ports), &_ports); + break; + default: + _ports[0] = _ports[1] = 0; + ports = _ports; + break; + } + if (!ports) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ *hotdrop = 1; @@ -561,7 +532,7 @@ static void hashlimit_destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) { - struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; + struct ipt_hashlimit_info *r = matchinfo; htable_put(r->hinfo); } diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index b847ee409efb..61a2139f9cfd 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -1,1007 +1,499 @@ -/* Kernel module to check if the source address has been seen recently. */ -/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ -/* Author: Stephen Frost <sfrost@snowman.net> */ -/* Project Page: http://snowman.net/projects/ipt_recent/ */ -/* This software is distributed under the terms of the GPL, Version 2 */ -/* This copyright does not cover user programs that use kernel services - * by normal system calls. */ - -#include <linux/module.h> -#include <linux/skbuff.h> +/* + * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost <sfrost@snowman.net> + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#include <linux/init.h> +#include <linux/moduleparam.h> #include <linux/proc_fs.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <asm/uaccess.h> +#include <linux/seq_file.h> +#include <linux/string.h> #include <linux/ctype.h> -#include <linux/ip.h> -#include <linux/vmalloc.h> -#include <linux/moduleparam.h> +#include <linux/list.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/bitops.h> +#include <linux/skbuff.h> +#include <linux/inet.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_recent.h> -#undef DEBUG -#define HASH_LOG 9 +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_DESCRIPTION("IP tables recently seen matching module"); +MODULE_LICENSE("GPL"); -/* Defaults, these can be overridden on the module command-line. */ static unsigned int ip_list_tot = 100; static unsigned int ip_pkt_list_tot = 20; static unsigned int ip_list_hash_size = 0; static unsigned int ip_list_perms = 0644; -#ifdef DEBUG -static int debug = 1; -#endif - -static char version[] = -KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n"; - -MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>"); -MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); -MODULE_LICENSE("GPL"); module_param(ip_list_tot, uint, 0400); module_param(ip_pkt_list_tot, uint, 0400); module_param(ip_list_hash_size, uint, 0400); module_param(ip_list_perms, uint, 0400); -#ifdef DEBUG -module_param(debug, bool, 0600); -MODULE_PARM_DESC(debug,"enable debugging output"); -#endif -MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); -MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); -MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); -MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); - -/* Structure of our list of recently seen addresses. */ -struct recent_ip_list { - u_int32_t addr; - u_int8_t ttl; - unsigned long last_seen; - unsigned long *last_pkts; - u_int32_t oldest_pkt; - u_int32_t hash_entry; - u_int32_t time_pos; -}; - -struct time_info_list { - u_int32_t position; - u_int32_t time; +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); + + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + u_int32_t addr; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; }; -/* Structure of our linked list of tables of recent lists. */ -struct recent_ip_tables { - char name[IPT_RECENT_NAME_LEN]; - int count; - int time_pos; - struct recent_ip_list *table; - struct recent_ip_tables *next; - spinlock_t list_lock; - int *hash_table; - struct time_info_list *time_info; +struct recent_table { + struct list_head list; + char name[IPT_RECENT_NAME_LEN]; #ifdef CONFIG_PROC_FS - struct proc_dir_entry *status_proc; -#endif /* CONFIG_PROC_FS */ + struct proc_dir_entry *proc; +#endif + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; }; -/* Our current list of addresses we have recently seen. - * Only added to on a --set, and only updated on --set || --update - */ -static struct recent_ip_tables *r_tables = NULL; - -/* We protect r_list with this spinlock so two processors are not modifying - * the list at the same time. - */ +static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS -/* Our /proc/net/ipt_recent entry */ -static struct proc_dir_entry *proc_net_ipt_recent = NULL; -#endif - -/* Function declaration for later. */ -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop); - -/* Function to hash a given address into the hash table of table_size size */ -static int hash_func(unsigned int addr, int table_size) -{ - int result = 0; - unsigned int value = addr; - do { result ^= value; } while((value >>= HASH_LOG)); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", - result & (table_size - 1), - addr, - table_size); +static struct proc_dir_entry *proc_dir; +static struct file_operations recent_fops; #endif - return(result & (table_size - 1)); -} +static u_int32_t hash_rnd; +static int hash_rnd_initted; -#ifdef CONFIG_PROC_FS -/* This is the function which produces the output for our /proc output - * interface which lists each IP address, the last seen time and the - * other recent times the address was seen. - */ - -static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) +static unsigned int recent_entry_hash(u_int32_t addr) { - int len = 0, count, last_len = 0, pkt_count; - off_t pos = 0; - off_t begin = 0; - struct recent_ip_tables *curr_table; - - curr_table = (struct recent_ip_tables*) data; - - spin_lock_bh(&curr_table->list_lock); - for(count = 0; count < ip_list_tot; count++) { - if(!curr_table->table[count].addr) continue; - last_len = len; - len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); - len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); - len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen); - len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); - len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]); - for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(!curr_table->table[count].last_pkts[pkt_count]) break; - len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]); - } - len += sprintf(buffer+len,"\n"); - pos = begin + len; - if(pos < offset) { len = 0; begin = pos; } - if(pos > offset + length) { len = last_len; break; } + if (!hash_rnd_initted) { + get_random_bytes(&hash_rnd, 4); + hash_rnd_initted = 1; } - - *start = buffer + (offset - begin); - len -= (offset - begin); - if(len > length) len = length; - - spin_unlock_bh(&curr_table->list_lock); - return len; + return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1); } -/* ip_recent_ctrl provides an interface for users to modify the table - * directly. This allows adding entries, removing entries, and - * flushing the entire table. - * This is done by opening up the appropriate table for writing and - * sending one of: - * xx.xx.xx.xx -- Add entry to table with current time - * +xx.xx.xx.xx -- Add entry to table with current time - * -xx.xx.xx.xx -- Remove entry from table - * clear -- Flush table, remove all entries - */ - -static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data) +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl) { - static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; - u_int32_t val; - int base, used = 0; - char c, *cp; - union iaddr { - uint8_t bytes[4]; - uint32_t word; - } res; - uint8_t *pp = res.bytes; - int digit; - - char buffer[20]; - int len, check_set = 0, count; - u_int32_t addr = 0; - struct sk_buff *skb; - struct ipt_recent_info *info; - struct recent_ip_tables *curr_table; - - curr_table = (struct recent_ip_tables*) data; - - if(size > 20) len = 20; else len = size; - - if(copy_from_user(buffer,input,len)) return -EFAULT; - - if(len < 20) buffer[len] = '\0'; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); -#endif + struct recent_entry *e; + unsigned int h; + + h = recent_entry_hash(addr); + list_for_each_entry(e, &table->iphash[h], list) + if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) + return e; + return NULL; +} - cp = buffer; - while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} - /* Check if we are asked to flush the entire table */ - if(!memcmp(cp,"clear",5)) { - used += 5; - spin_lock_bh(&curr_table->list_lock); - curr_table->time_pos = 0; - for(count = 0; count < ip_list_hash_size; count++) { - curr_table->hash_table[count] = -1; - } - for(count = 0; count < ip_list_tot; count++) { - curr_table->table[count].last_seen = 0; - curr_table->table[count].addr = 0; - curr_table->table[count].ttl = 0; - memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - curr_table->table[count].oldest_pkt = 0; - curr_table->table[count].time_pos = 0; - curr_table->time_info[count].position = count; - curr_table->time_info[count].time = 0; - } - spin_unlock_bh(&curr_table->list_lock); - return used; - } +static struct recent_entry * +recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl) +{ + struct recent_entry *e; - check_set = IPT_RECENT_SET; - switch(*cp) { - case '+': check_set = IPT_RECENT_SET; cp++; used++; break; - case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; - default: if(!isdigit(*cp)) return (used+1); break; + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + e->addr = addr; + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); -#endif - /* Get addr (effectively inet_aton()) */ - /* Shamelessly stolen from libc, a function in the kernel for doing - * this would, of course, be greatly preferred, but our options appear - * to be rather limited, so we will just do it ourselves here. - */ - res.word = 0; - - c = *cp; - for(;;) { - if(!isdigit(c)) return used; - val = 0; base = 10; digit = 0; - if(c == '0') { - c = *++cp; - if(c == 'x' || c == 'X') base = 16, c = *++cp; - else { base = 8; digit = 1; } - } - for(;;) { - if(isascii(c) && isdigit(c)) { - if(base == 8 && (c == '8' || c == '0')) return used; - val = (val * base) + (c - '0'); - c = *++cp; - digit = 1; - } else if(base == 16 && isascii(c) && isxdigit(c)) { - val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A')); - c = *++cp; - digit = 1; - } else break; - } - if(c == '.') { - if(pp > res.bytes + 2 || val > 0xff) return used; - *pp++ = val; - c = *++cp; - } else break; - } - used = cp - buffer; - if(c != '\0' && (!isascii(c) || !isspace(c))) return used; - if(c == '\n') used++; - if(!digit) return used; +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + e->index %= ip_pkt_list_tot; + list_move_tail(&e->lru_list, &t->lru_list); +} - if(val > max[pp - res.bytes]) return used; - addr = res.word | htonl(val); +static struct recent_table *recent_table_lookup(const char *name) +{ + struct recent_table *t; - if(!addr && check_set == IPT_RECENT_SET) return used; + list_for_each_entry(t, &tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); -#endif +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; - /* Set up and just call match */ - info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); - if(!info) { return -ENOMEM; } - info->seconds = 0; - info->hit_count = 0; - info->check_set = check_set; - info->invert = 0; - info->side = IPT_RECENT_SOURCE; - strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); - info->name[IPT_RECENT_NAME_LEN-1] = '\0'; - - skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); - if (!skb) { - used = -ENOMEM; - goto out_free_info; - } - skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); - if (!skb->nh.iph) { - used = -ENOMEM; - goto out_free_skb; + for (i = 0; i < ip_list_hash_size; i++) { + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); } - - skb->nh.iph->saddr = addr; - skb->nh.iph->daddr = 0; - /* Clear ttl since we have no way of knowing it */ - skb->nh.iph->ttl = 0; - match(skb,NULL,NULL,NULL,info,0,0,NULL); - - kfree(skb->nh.iph); -out_free_skb: - kfree(skb); -out_free_info: - kfree(info); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); -#endif - return used; } -#endif /* CONFIG_PROC_FS */ - -/* 'match' is our primary function, called by the kernel whenever a rule is - * hit with our module as an option to it. - * What this function does depends on what was specifically asked of it by - * the user: - * --set -- Add or update last seen time of the source address of the packet - * -- matchinfo->check_set == IPT_RECENT_SET - * --rcheck -- Just check if the source address is in the list - * -- matchinfo->check_set == IPT_RECENT_CHECK - * --update -- If the source address is in the list, update last_seen - * -- matchinfo->check_set == IPT_RECENT_UPDATE - * --remove -- If the source address is in the list, remove it - * -- matchinfo->check_set == IPT_RECENT_REMOVE - * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds - * -- matchinfo->seconds - * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times - * -- matchinfo->hit_count - * --seconds and --hitcount can be combined - */ static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +ipt_recent_match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, int *hotdrop) { - int pkt_count, hits_found, ans; - unsigned long now; const struct ipt_recent_info *info = matchinfo; - u_int32_t addr = 0, time_temp; - u_int8_t ttl = skb->nh.iph->ttl; - int *hash_table; - int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; - struct time_info_list *time_info; - struct recent_ip_tables *curr_table; - struct recent_ip_tables *last_table; - struct recent_ip_list *r_list; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); -#endif - - /* Default is false ^ info->invert */ - ans = info->invert; + struct recent_table *t; + struct recent_entry *e; + u_int32_t addr; + u_int8_t ttl; + int ret = info->invert; -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); -#endif + if (info->side == IPT_RECENT_DEST) + addr = skb->nh.iph->daddr; + else + addr = skb->nh.iph->saddr; - /* if out != NULL then routing has been done and TTL changed. - * We change it back here internally for match what came in before routing. */ - if(out) ttl++; + ttl = skb->nh.iph->ttl; + /* use TTL as seen before forwarding */ + if (out && !skb->sk) + ttl++; - /* Find the right table */ spin_lock_bh(&recent_lock); - curr_table = r_tables; - while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); -#endif - - spin_unlock_bh(&recent_lock); - - /* Table with this name not found, match impossible */ - if(!curr_table) { return ans; } - - /* Make sure no one is changing the list while we work with it */ - spin_lock_bh(&curr_table->list_lock); - - r_list = curr_table->table; - if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; - - if(!addr) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); -#endif - spin_unlock_bh(&curr_table->list_lock); - return ans; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); -#endif - - /* Get jiffies now in case they changed while we were waiting for a lock */ - now = jiffies; - hash_table = curr_table->hash_table; - time_info = curr_table->time_info; - - orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); - /* Hash entry at this result used */ - /* Check for TTL match if requested. If TTL is zero then a match would never - * happen, so match regardless of existing TTL in that case. Zero means the - * entry was added via the /proc interface anyway, so we will just use the - * first TTL we get for that IP address. */ - if(info->check_set & IPT_RECENT_TTL) { - while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && - (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { - /* Collision in hash table */ - hash_result = (hash_result + 1) % ip_list_hash_size; - } - } else { - while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { - /* Collision in hash table */ - hash_result = (hash_result + 1) % ip_list_hash_size; - } - } - - if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { - /* IP not in list and not asked to SET */ - spin_unlock_bh(&curr_table->list_lock); - return ans; - } - - /* Check if we need to handle the collision, do not need to on REMOVE */ - if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", - orig_hash_result, - hash_result, - r_list[hash_table[orig_hash_result]].addr, - addr); -#endif - - /* We had a collision. - * orig_hash_result is where we started, hash_result is where we ended up. - * So, swap them because we are likely to see the same guy again sooner */ -#ifdef DEBUG - if(debug) { - printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); - printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", - r_list[hash_table[orig_hash_result]].hash_entry); - } -#endif - - r_list[hash_table[orig_hash_result]].hash_entry = hash_result; - - - temp = hash_table[orig_hash_result]; -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); -#endif - hash_table[orig_hash_result] = hash_table[hash_result]; - hash_table[hash_result] = temp; - temp = hash_result; - hash_result = orig_hash_result; - orig_hash_result = temp; - time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; - if(hash_table[hash_result] != -1) { - r_list[hash_table[hash_result]].hash_entry = hash_result; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); -#endif + t = recent_table_lookup(info->name); + e = recent_entry_lookup(t, addr, + info->check_set & IPT_RECENT_TTL ? ttl : 0); + if (e == NULL) { + if (!(info->check_set & IPT_RECENT_SET)) + goto out; + e = recent_entry_init(t, addr, ttl); + if (e == NULL) + *hotdrop = 1; + ret ^= 1; + goto out; } - if(hash_table[hash_result] == -1) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", - hash_result, addr); -#endif - - /* New item found and IPT_RECENT_SET, so we need to add it */ - location = time_info[curr_table->time_pos].position; - hash_table[r_list[location].hash_entry] = -1; - hash_table[hash_result] = location; - memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - r_list[location].time_pos = curr_table->time_pos; - r_list[location].addr = addr; - r_list[location].ttl = ttl; - r_list[location].last_seen = now; - r_list[location].oldest_pkt = 1; - r_list[location].last_pkts[0] = now; - r_list[location].hash_entry = hash_result; - time_info[curr_table->time_pos].time = r_list[location].last_seen; - curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; - - ans = !info->invert; - } else { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", - hash_result, - addr); -#endif - - /* Existing item found */ - location = hash_table[hash_result]; - /* We have a match on address, now to make sure it meets all requirements for a - * full match. */ - if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { - if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; - if(info->seconds && !info->hit_count) { - if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; - } - if(info->seconds && info->hit_count) { - for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(r_list[location].last_pkts[pkt_count] == 0) break; - if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; - } - if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; - } - if(info->hit_count && !info->seconds) { - for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(r_list[location].last_pkts[pkt_count] == 0) break; - hits_found++; - } - if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; + if (info->check_set & IPT_RECENT_SET) + ret ^= 1; + else if (info->check_set & IPT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret ^= 1; + } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { + unsigned long t = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(t, e->stamps[i])) + continue; + if (++hits >= info->hit_count) { + ret ^= 1; + break; } } -#ifdef DEBUG - if(debug) { - if(ans) - printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); - else - printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); - } -#endif - - /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the - * current timestamp to the last_seen. */ - if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); -#endif - /* Have to update our time info */ - time_loc = r_list[location].time_pos; - time_info[time_loc].time = now; - time_info[time_loc].position = location; - while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { - time_temp = time_info[time_loc].time; - time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; - time_info[(time_loc+1)%ip_list_tot].time = time_temp; - time_temp = time_info[time_loc].position; - time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; - time_info[(time_loc+1)%ip_list_tot].position = time_temp; - r_list[time_info[time_loc].position].time_pos = time_loc; - r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; - time_loc = (time_loc+1) % ip_list_tot; - } - r_list[location].time_pos = time_loc; - r_list[location].ttl = ttl; - r_list[location].last_pkts[r_list[location].oldest_pkt] = now; - r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; - r_list[location].last_seen = now; - } - /* If we have been asked to remove the entry from the list, just set it to 0 */ - if(info->check_set & IPT_RECENT_REMOVE) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); -#endif - /* Check if this is part of a collision chain */ - while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { - orig_hash_result++; - if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { - /* Found collision chain, how deep does this rabbit hole go? */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); -#endif - end_collision_chain = orig_hash_result; - } - } - if(end_collision_chain != -1) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); -#endif - /* Part of a collision chain, swap it with the end of the chain - * before removing. */ - r_list[hash_table[end_collision_chain]].hash_entry = hash_result; - temp = hash_table[end_collision_chain]; - hash_table[end_collision_chain] = hash_table[hash_result]; - hash_table[hash_result] = temp; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - hash_result = end_collision_chain; - r_list[hash_table[hash_result]].hash_entry = hash_result; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - } - location = hash_table[hash_result]; - hash_table[r_list[location].hash_entry] = -1; - time_loc = r_list[location].time_pos; - time_info[time_loc].time = 0; - time_info[time_loc].position = location; - while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { - time_temp = time_info[time_loc].time; - time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; - time_info[(time_loc+1)%ip_list_tot].time = time_temp; - time_temp = time_info[time_loc].position; - time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; - time_info[(time_loc+1)%ip_list_tot].position = time_temp; - r_list[time_info[time_loc].position].time_pos = time_loc; - r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; - time_loc = (time_loc+1) % ip_list_tot; - } - r_list[location].time_pos = time_loc; - r_list[location].last_seen = 0; - r_list[location].addr = 0; - r_list[location].ttl = 0; - memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - r_list[location].oldest_pkt = 0; - ans = !info->invert; - } - spin_unlock_bh(&curr_table->list_lock); - return ans; } - spin_unlock_bh(&curr_table->list_lock); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); -#endif - return ans; + if (info->check_set & IPT_RECENT_SET || + (info->check_set & IPT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; } -/* This function is to verify that the rule given during the userspace iptables - * command is correct. - * If the command is valid then we check if the table name referred to by the - * rule exists, if not it is created. - */ static int -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) +ipt_recent_checkentry(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) { - int flag = 0, c; - unsigned long *hold; const struct ipt_recent_info *info = matchinfo; - struct recent_ip_tables *curr_table, *find_table, *last_table; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); -#endif - - /* seconds and hit_count only valid for CHECK/UPDATE */ - if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } - if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } - if(info->check_set & IPT_RECENT_CHECK) flag++; - if(info->check_set & IPT_RECENT_UPDATE) flag++; - - /* One and only one of these should ever be set */ - if(flag != 1) return 0; - - /* Name must be set to something */ - if(!info->name || !info->name[0]) return 0; + struct recent_table *t; + unsigned i; + int ret = 0; - /* Things look good, create a list for this if it does not exist */ - /* Lock the linked list while we play with it */ - spin_lock_bh(&recent_lock); - - /* Look for an entry with this name already created */ - /* Finds the end of the list and the entry before the end if current name does not exist */ - find_table = r_tables; - while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); + if (hweight8(info->check_set & + (IPT_RECENT_SET | IPT_RECENT_REMOVE | + IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) + return 0; + if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && + (info->seconds || info->hit_count)) + return 0; + if (info->name[0] == '\0' || + strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) + return 0; - /* If a table already exists just increment the count on that table and return */ - if(find_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); -#endif - find_table->count++; - spin_unlock_bh(&recent_lock); - return 1; + mutex_lock(&recent_mutex); + t = recent_table_lookup(info->name); + if (t != NULL) { + t->refcnt++; + ret = 1; + goto out; } - spin_unlock_bh(&recent_lock); - - /* Table with this name not found */ - /* Allocate memory for new linked list item */ - -#ifdef DEBUG - if(debug) { - printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); - printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); + t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, + GFP_KERNEL); + if (t == NULL) + goto out; + t->refcnt = 1; + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir); + if (t->proc == NULL) { + kfree(t); + goto out; } + t->proc->proc_fops = &recent_fops; + t->proc->data = t; #endif + spin_lock_bh(&recent_lock); + list_add_tail(&t->list, &tables); + spin_unlock_bh(&recent_lock); + ret = 1; +out: + mutex_unlock(&recent_mutex); + return ret; +} - curr_table = vmalloc(sizeof(struct recent_ip_tables)); - if(curr_table == NULL) return 0; - - spin_lock_init(&curr_table->list_lock); - curr_table->next = NULL; - curr_table->count = 1; - curr_table->time_pos = 0; - strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); - curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; - - /* Allocate memory for this table and the list of packets in each entry. */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", - sizeof(struct recent_ip_list)*ip_list_tot, - info->name); -#endif - - curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); - if(curr_table->table == NULL) { vfree(curr_table); return 0; } - memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", - sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); -#endif - - hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); -#endif - if(hold == NULL) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - for(c = 0; c < ip_list_tot; c++) { - curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; - } +static void +ipt_recent_destroy(const struct xt_match *match, void *matchinfo, + unsigned int matchsize) +{ + const struct ipt_recent_info *info = matchinfo; + struct recent_table *t; - /* Allocate memory for the hash table */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", - sizeof(int)*ip_list_hash_size); + mutex_lock(&recent_mutex); + t = recent_table_lookup(info->name); + if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); + list_del(&t->list); + spin_unlock_bh(&recent_lock); + recent_table_flush(t); +#ifdef CONFIG_PROC_FS + remove_proc_entry(t->name, proc_dir); #endif - - curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); - if(!curr_table->hash_table) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - - for(c = 0; c < ip_list_hash_size; c++) { - curr_table->hash_table[c] = -1; + kfree(t); } + mutex_unlock(&recent_mutex); +} - /* Allocate memory for the time info */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", - sizeof(struct time_info_list)*ip_list_tot); -#endif +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + struct recent_table *table; + unsigned int bucket; +}; - curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); - if(!curr_table->time_info) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); - vfree(curr_table->hash_table); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - for(c = 0; c < ip_list_tot; c++) { - curr_table->time_info[c].position = c; - curr_table->time_info[c].time = 0; - } +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; - /* Put the new table in place */ spin_lock_bh(&recent_lock); - find_table = r_tables; - while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); - - /* If a table already exists just increment the count on that table and return */ - if(find_table) { - find_table->count++; - spin_unlock_bh(&recent_lock); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); -#endif - vfree(curr_table->time_info); - vfree(curr_table->hash_table); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 1; - } - if(!last_table) r_tables = curr_table; else last_table->next = curr_table; - - spin_unlock_bh(&recent_lock); -#ifdef CONFIG_PROC_FS - /* Create our proc 'status' entry. */ - curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); - if (!curr_table->status_proc) { - vfree(hold); - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); - /* Destroy the created table */ - spin_lock_bh(&recent_lock); - last_table = NULL; - curr_table = r_tables; - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); -#endif - spin_unlock_bh(&recent_lock); - return 0; - } - while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); -#endif - spin_unlock_bh(&recent_lock); - return 0; + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) { + list_for_each_entry(e, &t->iphash[st->bucket], list) { + if (p-- == 0) + return e; } - if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; - spin_unlock_bh(&recent_lock); - vfree(curr_table->time_info); - vfree(curr_table->hash_table); - vfree(curr_table->table); - vfree(curr_table); - return 0; } - - curr_table->status_proc->owner = THIS_MODULE; - curr_table->status_proc->data = curr_table; - wmb(); - curr_table->status_proc->read_proc = ip_recent_get_info; - curr_table->status_proc->write_proc = ip_recent_ctrl; -#endif /* CONFIG_PROC_FS */ - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); -#endif + return NULL; +} - return 1; +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + struct recent_table *t = st->table; + struct recent_entry *e = v; + struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); } -/* This function is called in the event that a rule matching this module is - * removed. - * When this happens we need to check if there are no other rules matching - * the table given. If that is the case then we remove the table and clean - * up its memory. - */ -static void -destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) +static void recent_seq_stop(struct seq_file *s, void *v) { - const struct ipt_recent_info *info = matchinfo; - struct recent_ip_tables *curr_table, *last_table; + spin_unlock_bh(&recent_lock); +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); -#endif +static int recent_seq_show(struct seq_file *seq, void *v) +{ + struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", + NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} - if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; +static struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; - /* Lock the linked list while we play with it */ - spin_lock_bh(&recent_lock); +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct seq_file *seq; + struct recent_iter_state *st; + int ret; + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &recent_seq_ops); + if (ret) + kfree(st); + st->table = pde->data; + seq = file->private_data; + seq->private = st; + return ret; +} - /* Look for an entry with this name already created */ - /* Finds the end of the list and the entry before the end if current name does not exist */ - last_table = NULL; - curr_table = r_tables; - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); -#endif +static ssize_t recent_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+255.255.255.255")], *c = buf; + u_int32_t addr; + int add; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size)) + return -EFAULT; + while (isspace(*c)) + c++; + + if (size - (c - buf) < 5) + return c - buf; + if (!strncmp(c, "clear", 5)) { + c += 5; + spin_lock_bh(&recent_lock); + recent_table_flush(t); spin_unlock_bh(&recent_lock); - return; + return c - buf; } - while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); - /* If a table does not exist then do nothing and return */ - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); -#endif - spin_unlock_bh(&recent_lock); - return; + switch (*c) { + case '-': + add = 0; + c++; + break; + case '+': + c++; + default: + add = 1; + break; } + addr = in_aton(c); - curr_table->count--; - - /* If count is still non-zero then there are still rules referenceing it so we do nothing */ - if(curr_table->count) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); -#endif - spin_unlock_bh(&recent_lock); - return; + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, addr, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, addr, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); -#endif - - /* Count must be zero so we remove this table from the list */ - if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; - spin_unlock_bh(&recent_lock); + return size; +} - /* lock to make sure any late-runners still using this after we removed it from - * the list finish up then remove everything */ - spin_lock_bh(&curr_table->list_lock); - spin_unlock_bh(&curr_table->list_lock); - -#ifdef CONFIG_PROC_FS - if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); +static struct file_operations recent_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, +}; #endif /* CONFIG_PROC_FS */ - vfree(curr_table->table[0].last_pkts); - vfree(curr_table->table); - vfree(curr_table->hash_table); - vfree(curr_table->time_info); - vfree(curr_table); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); -#endif - return; -} - -/* This is the structure we pass to ipt_register to register our - * module with iptables. - */ static struct ipt_match recent_match = { .name = "recent", - .match = match, + .match = ipt_recent_match, .matchsize = sizeof(struct ipt_recent_info), - .checkentry = checkentry, - .destroy = destroy, - .me = THIS_MODULE + .checkentry = ipt_recent_checkentry, + .destroy = ipt_recent_destroy, + .me = THIS_MODULE, }; -/* Kernel module initialization. */ static int __init ipt_recent_init(void) { - int err, count; + int err; - printk(version); -#ifdef CONFIG_PROC_FS - proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); - if(!proc_net_ipt_recent) return -ENOMEM; -#endif - - if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { - printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); - ip_list_hash_size = 0; - } - - if(!ip_list_hash_size) { - ip_list_hash_size = ip_list_tot*3; - count = 2*2; - while(ip_list_hash_size > count) count = count*2; - ip_list_hash_size = count; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); -#endif + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); err = ipt_register_match(&recent_match); +#ifdef CONFIG_PROC_FS if (err) - remove_proc_entry("ipt_recent", proc_net); + return err; + proc_dir = proc_mkdir("ipt_recent", proc_net); + if (proc_dir == NULL) { + ipt_unregister_match(&recent_match); + err = -ENOMEM; + } +#endif return err; } -/* Kernel module destruction. */ -static void __exit ipt_recent_fini(void) +static void __exit ipt_recent_exit(void) { + BUG_ON(!list_empty(&tables)); ipt_unregister_match(&recent_match); - - remove_proc_entry("ipt_recent",proc_net); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ipt_recent", proc_net); +#endif } -/* Register our module with the kernel. */ module_init(ipt_recent_init); -module_exit(ipt_recent_fini); +module_exit(ipt_recent_exit); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 5bc9f64d7b5b..8cc8e1b36778 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct) + if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) return NF_ACCEPT; help = nfct_help(ct); @@ -348,6 +348,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) .tuple.dst.u.tcp.port; sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] .tuple.dst.u3.ip; + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 4b0d361cc6e6..663a73ee3f2f 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, } /* See ip_conntrack_proto_tcp.c */ - if (hooknum == NF_IP_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fc2562415555..bd221ec3f81e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk) } struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr, + __be32 raddr, __be32 laddr, int dif) { struct hlist_node *node; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cc9423de7311..60b11aece5c3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -244,7 +244,7 @@ static unsigned int rt_hash_rnd; static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #define RT_CACHE_STAT_INC(field) \ - (per_cpu(rt_cache_stat, raw_smp_processor_id()).field++) + (__raw_get_cpu_var(rt_cache_stat).field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3adfcf00..ce4cd5f35511 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -182,14 +182,6 @@ ctl_table ipv4_table[] = { .strategy = &ipv4_doint_and_flush_strategy, }, { - .ctl_name = NET_IPV4_AUTOCONFIG, - .procname = "ip_autoconfig", - .data = &ipv4_config.autoconfig, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = NET_IPV4_NO_PMTU_DISC, .procname = "ip_no_pmtu_disc", .data = &ipv4_config.no_pmtu_disc, @@ -688,6 +680,24 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#ifdef CONFIG_NET_DMA + { + .ctl_name = NET_TCP_DMA_COPYBREAK, + .procname = "tcp_dma_copybreak", + .data = &sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif + { + .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE, + .procname = "tcp_slow_start_after_idle", + .data = &sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2b7b8055037..0e029c4e2903 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -258,12 +258,13 @@ #include <linux/random.h> #include <linux/bootmem.h> #include <linux/cache.h> +#include <linux/err.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/xfrm.h> #include <net/ip.h> - +#include <net/netdma.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -571,7 +572,7 @@ new_segment: skb->ip_summed = CHECKSUM_HW; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; if (!copied) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; @@ -622,14 +623,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, ssize_t res; struct sock *sk = sock->sk; -#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) - if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) + !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) return sock_no_sendpage(sock, page, offset, size, flags); -#undef TCP_ZC_CSUM_FLAGS - lock_sock(sk); TCP_CHECK_TIMER(sk); res = do_tcp_sendpages(sk, &page, offset, size, flags); @@ -726,9 +723,7 @@ new_segment: /* * Check whether we can use HW checksum. */ - if (sk->sk_route_caps & - (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM)) + if (sk->sk_route_caps & NETIF_F_ALL_CSUM) skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); @@ -824,7 +819,7 @@ new_segment: tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; - skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->gso_segs = 0; from += copy; copied += copy; @@ -937,7 +932,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1072,11 +1067,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } if (skb->h.th->fin) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); ++seq; break; } - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); if (!desc->count) break; } @@ -1086,7 +1081,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1110,6 +1105,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1133,6 +1129,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + preempt_disable(); + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) { + preempt_enable_no_resched(); + tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); + } else + preempt_enable_no_resched(); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1220,7 +1227,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { /* Install new reader */ @@ -1274,6 +1281,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); +#ifdef CONFIG_NET_DMA + tp->ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1329,13 +1340,39 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan) { + tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp->ucopy.dma_chan, skb, offset, + msg->msg_iov, used, + tp->ucopy.pinned_list); + + if (tp->ucopy.dma_cookie < 0) { + + printk(KERN_ALERT "dma_cookie < 0\n"); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb->len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1355,15 +1392,19 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } break; } while (len > 0); @@ -1386,12 +1427,42 @@ skip_copy: tp->ucopy.len = 0; } +#ifdef CONFIG_NET_DMA + if (tp->ucopy.dma_chan) { + struct sk_buff *skb; + dma_cookie_t done, used; + + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + while (dma_async_memcpy_complete(tp->ucopy.dma_chan, + tp->ucopy.dma_cookie, &done, + &used) == DMA_IN_PROGRESS) { + /* do partial cleanup of sk_async_wait_queue */ + while ((skb = skb_peek(&sk->sk_async_wait_queue)) && + (dma_async_is_complete(skb->dma_cookie, done, + used) == DMA_SUCCESS)) { + __skb_dequeue(&sk->sk_async_wait_queue); + kfree_skb(skb); + } + } + + /* Safe to free early-copied skbs now */ + __skb_queue_purge(&sk->sk_async_wait_queue); + dma_chan_put(tp->ucopy.dma_chan); + tp->ucopy.dma_chan = NULL; + } + if (tp->ucopy.pinned_list) { + dma_unpin_iovec_pages(tp->ucopy.pinned_list); + tp->ucopy.pinned_list = NULL; + } +#endif + /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1658,6 +1729,9 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); sk_stream_writequeue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif inet->dport = 0; @@ -1858,7 +1932,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) icsk->icsk_ack.pingpong = 1; } @@ -2071,6 +2145,67 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL(compat_tcp_getsockopt); #endif +struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int sg) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct tcphdr *th; + unsigned thlen; + unsigned int seq; + unsigned int delta; + unsigned int oldlen; + unsigned int len; + + if (!pskb_may_pull(skb, sizeof(*th))) + goto out; + + th = skb->h.th; + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + if (!pskb_may_pull(skb, thlen)) + goto out; + + oldlen = ~htonl(skb->len); + __skb_pull(skb, thlen); + + segs = skb_segment(skb, sg); + if (IS_ERR(segs)) + goto out; + + len = skb_shinfo(skb)->gso_size; + delta = csum_add(oldlen, htonl(thlen + len)); + + skb = segs; + th = skb->h.th; + seq = ntohl(th->seq); + + do { + th->fin = th->psh = 0; + + if (skb->ip_summed == CHECKSUM_NONE) { + th->check = csum_fold(csum_partial( + skb->h.raw, thlen, csum_add(skb->csum, delta))); + } + + seq += len; + skb = skb->next; + th = skb->h.th; + + th->seq = htonl(seq); + th->cwr = 0; + } while (skb->next); + + if (skb->ip_summed == CHECKSUM_NONE) { + delta = csum_add(oldlen, htonl(skb->tail - skb->h.raw)); + th->check = csum_fold(csum_partial( + skb->h.raw, thlen, csum_add(skb->csum, delta))); + } + +out: + return segs; +} + extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 035f2092d73a..b2d9021ad22b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) return max(tp->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, - .min_cwnd = bictcp_min_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c new file mode 100644 index 000000000000..bc54f7e9aea9 --- /dev/null +++ b/net/ipv4/tcp_compound.c @@ -0,0 +1,448 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + * + * + * TCP Compound based on TCP Vegas + * + * further details can be found here: + * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet_diag.h> + +#include <net/tcp.h> + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 + +#define TCP_COMPOUND_ALPHA 3U +#define TCP_COMPOUND_BETA 1U +#define TCP_COMPOUND_GAMMA 30 +#define TCP_COMPOUND_ZETA 1 + +/* TCP compound variables */ +struct compound { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now; /* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + + u32 cwnd; + u32 dwnd; +}; + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static inline void vegas_enable(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + + vegas->doing_vegas_now = 0; +} + +static void tcp_compound_init(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(sk); + + vegas->dwnd = 0; + vegas->cwnd = tp->snd_cwnd; +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct compound *vegas = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_compound_state(struct sock *sk, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(sk); + else + vegas_disable(sk); +} + + +/* 64bit divisor, dividend and result. dynamic precision */ +static inline u64 div64_64(u64 dividend, u64 divisor) +{ + u32 d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (u32) dividend / d; + + return dividend; +} + +/* calculate the quartic root of "a" using Newton-Raphson */ +static u32 qroot(u64 a) +{ + u32 x, x1; + + /* Initial estimate is based on: + * qrt(x) = exp(log(x) / 4) + */ + x = 1u << (fls64(a) >> 2); + + /* + * Iteration based on: + * 3 + * x = ( 3 * x + a / x ) / 4 + * k+1 k k + */ + do { + u64 x3 = x; + + x1 = x; + x3 *= x; + x3 *= x; + + x = (3 * x + (u32) div64_64(a, x3)) / 4; + } while (abs(x1 - x) > 1); + + return x; +} + + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_compound_init(sk); +} + +static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + u8 inc = 0; + + if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) { + if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) { + vegas->cwnd = tp->snd_cwnd; + vegas->dwnd = 0; + } else + vegas->cwnd = tp->snd_cwnd - vegas->dwnd; + + } + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (vegas->cwnd <= tp->snd_ssthresh) + inc = 1; + else if (tp->snd_cwnd_cnt < tp->snd_cwnd) + tp->snd_cwnd_cnt++; + + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + inc = 1; + tp->snd_cwnd_cnt = 0; + } + + if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp) + vegas->cwnd++; + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + if (!tp->mss_cache) + return; + + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT > 2) { + u32 rtt, target_cwnd, diff; + u32 brtt, dwnd; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + if (!rtt) + return; + + brtt = vegas->baseRTT; + target_cwnd = ((old_wnd * brtt) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + dwnd = vegas->dwnd; + + if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { + u64 v; + u32 x; + + /* + * The TCP Compound paper describes the choice + * of "k" determines the agressiveness, + * ie. slope of the response function. + * + * For same value as HSTCP would be 0.8 + * but for computaional reasons, both the + * original authors and this implementation + * use 0.75. + */ + v = old_wnd; + x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA; + if (x > 1) + dwnd = x - 1; + else + dwnd = 0; + + dwnd += vegas->dwnd; + + } else if ((dwnd << V_PARAM_SHIFT) < + (diff * TCP_COMPOUND_BETA)) + dwnd = 0; + else + dwnd = + ((dwnd << V_PARAM_SHIFT) - + (diff * + TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT; + + vegas->dwnd = dwnd; + + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + tp->snd_cwnd = vegas->cwnd + vegas->dwnd; +} + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct compound *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info *info; + + info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, + sizeof(*info))); + + info->tcpv_enabled = ca->doing_vegas_now; + info->tcpv_rttcnt = ca->cntRTT; + info->tcpv_rtt = ca->baseRTT; + info->tcpv_minrtt = ca->minRTT; + rtattr_failure:; + } +} + +static struct tcp_congestion_ops tcp_compound = { + .init = tcp_compound_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_compound_cong_avoid, + .rtt_sample = tcp_compound_rtt_calc, + .set_state = tcp_compound_state, + .cwnd_event = tcp_compound_cwnd_event, + .get_info = tcp_compound_get_info, + + .owner = THIS_MODULE, + .name = "compound", +}; + +static int __init tcp_compound_register(void) +{ + BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_compound); + return 0; +} + +static void __exit tcp_compound_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_compound); +} + +module_init(tcp_compound_register); +module_exit(tcp_compound_unregister); + +MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Compound"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 91c2f41c7f58..857eefc52aab 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + if (!ca->ssthresh || !ca->cong_avoid) { printk(KERN_ERR "TCP %s does not implement required ops\n", ca->name); return -EINVAL; @@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window. */ -u32 tcp_reno_min_cwnd(struct sock *sk) +/* Lower bound on congestion window with halving. */ +u32 tcp_reno_min_cwnd(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh/2; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 31a4986dfbf7..78b7a6b9e4de 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct sock *sk) -{ - return tcp_sk(sk)->snd_ssthresh; -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, - .min_cwnd = bictcp_min_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic", diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index b72fa55dfb84..1120245b2373 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -98,6 +98,10 @@ struct hstcp { u32 ai; }; +static int max_ssthresh = 100; +module_param(max_ssthresh, int, 0644); +MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)"); + static void hstcp_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); - else { + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* RFC3742: limited slow start + * the window is increased by 1/K MSS for each arriving ACK, + * for K = int(cwnd/(0.5 max_ssthresh)) + */ + if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) { + u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U); + if (++tp->snd_cwnd_cnt >= k) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } else { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && @@ -135,7 +153,8 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, /* Do additive increase */ if (tp->snd_cwnd < tp->snd_cwnd_clamp) { - tp->snd_cwnd_cnt += ca->ai; + /* cwnd = cwnd + a(w) / cwnd */ + tp->snd_cwnd_cnt += ca->ai + 1; if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { tp->snd_cwnd_cnt -= tp->snd_cwnd; tp->snd_cwnd++; diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1b2ff53f98ed..3d92c1859267 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, } } -/* Lower bound on congestion window. */ -static u32 htcp_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - - static void htcp_init(struct sock *sk) { struct htcp *ca = inet_csk_ca(sk); @@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state) static struct tcp_congestion_ops htcp = { .init = htcp_init, .ssthresh = htcp_recalc_ssthresh, - .min_cwnd = htcp_min_cwnd, .cong_avoid = htcp_cong_avoid, .set_state = htcp_state, .undo_cwnd = htcp_cwnd_undo, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4a538bc1683d..94fe5b1f9dcb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -71,6 +71,7 @@ #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h> +#include <net/netdma.h> int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; @@ -1072,7 +1073,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ else pkt_len = (end_seq - TCP_SKB_CB(skb)->seq); - if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size)) + if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size)) break; pcount = tcp_skb_pcount(skb); } @@ -1649,7 +1650,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (tcp_head_timedout(sk, tp)) { + if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint @@ -1662,8 +1663,6 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - if (IsReno(tp)) - tcp_remove_reno_sacks(sk, tp, tcp_skb_pcount(skb) + 1); /* clear xmit_retrans hint */ if (tp->retransmit_skb_hint && @@ -1690,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } +/* Lower bound on congestion window is slow start threshold + * unless congestion avoidance choice decides to overide it. + */ +static inline u32 tcp_cwnd_min(const struct sock *sk) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + + return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; +} + /* Decrease cwnd each second ack. */ static void tcp_cwnd_down(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) + if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -3787,6 +3795,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk __tcp_checksum_complete_user(sk, skb); } +#ifdef CONFIG_NET_DMA +static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int dma_cookie; + int copied_early = 0; + + if (tp->ucopy.wakeup) + return 0; + + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + + dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, + skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); + + if (dma_cookie < 0) + goto out; + + tp->ucopy.dma_cookie = dma_cookie; + copied_early = 1; + + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + tcp_rcv_space_adjust(sk); + + if ((tp->ucopy.len == 0) || + (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } + } else if (chunk > 0) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } +out: + return copied_early; +} +#endif /* CONFIG_NET_DMA */ + /* * TCP receive function for the ESTABLISHED state. * @@ -3888,8 +3940,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(sk, skb); - /* We know that such packets are checksummed * on entry. */ @@ -3903,14 +3953,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else { int eaten = 0; + int copied_early = 0; - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); + if (tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len) { +#ifdef CONFIG_NET_DMA + if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + copied_early = 1; + eaten = 1; + } +#endif + if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) + eaten = 1; + } + if (eaten) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -3926,8 +3985,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; } + if (copied_early) + tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -3968,6 +4028,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __tcp_ack_snd_check(sk, 0); no_ack: +#ifdef CONFIG_NET_DMA + if (copied_early) + __skb_queue_tail(&sk->sk_async_wait_queue, skb); + else +#endif if (eaten) __kfree_skb(skb); else diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 672950e54c49..25ecc6e2478b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -71,6 +71,7 @@ #include <net/inet_common.h> #include <net/timewait_sock.h> #include <net/xfrm.h> +#include <net/netdma.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -1091,8 +1092,18 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. */ __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + /* Cleans up our sk_async_wait_queue */ + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif + /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c new file mode 100644 index 000000000000..1f977b6ee9a1 --- /dev/null +++ b/net/ipv4/tcp_lp.c @@ -0,0 +1,338 @@ +/* + * TCP Low Priority (TCP-LP) + * + * TCP Low Priority is a distributed algorithm whose goal is to utilize only + * the excess network bandwidth as compared to the ``fair share`` of + * bandwidth as targeted by TCP. Available from: + * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf + * + * Original Author: + * Aleksandar Kuzmanovic <akuzma@northwestern.edu> + * + * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. + * As of 2.6.13, Linux supports pluggable congestion control algorithms. + * Due to the limitation of the API, we take the following changes from + * the original TCP-LP implementation: + * o We use newReno in most core CA handling. Only add some checking + * within cong_avoid. + * o Error correcting in remote HZ, therefore remote HZ will be keeped + * on checking and updating. + * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne + * OWD have a similar meaning as RTT. Also correct the buggy formular. + * o Handle reaction for Early Congestion Indication (ECI) within + * pkts_acked, as mentioned within pseudo code. + * o OWD is handled in relative format, where local time stamp will in + * tcp_time_stamp format. + * + * Port from 2.4.19 to 2.6.16 as module by: + * Wong Hoi Sing Edison <hswong3i@gmail.com> + * Hung Hing Lun <hlhung3i@gmail.com> + * + * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $ + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/tcp.h> + +/* resolution of owd */ +#define LP_RESOL 1000 + +/** + * enum tcp_lp_state + * @LP_VALID_RHZ: is remote HZ valid? + * @LP_VALID_OWD: is OWD valid? + * @LP_WITHIN_THR: are we within threshold? + * @LP_WITHIN_INF: are we within inference? + * + * TCP-LP's state flags. + * We create this set of state flag mainly for debugging. + */ +enum tcp_lp_state { + LP_VALID_RHZ = (1 << 0), + LP_VALID_OWD = (1 << 1), + LP_WITHIN_THR = (1 << 3), + LP_WITHIN_INF = (1 << 4), +}; + +/** + * struct lp + * @flag: TCP-LP state flag + * @sowd: smoothed OWD << 3 + * @owd_min: min OWD + * @owd_max: max OWD + * @owd_max_rsv: resrved max owd + * @remote_hz: estimated remote HZ + * @remote_ref_time: remote reference time + * @local_ref_time: local reference time + * @last_drop: time for last active drop + * @inference: current inference + * + * TCP-LP's private struct. + * We get the idea from original TCP-LP implementation where only left those we + * found are really useful. + */ +struct lp { + u32 flag; + u32 sowd; + u32 owd_min; + u32 owd_max; + u32 owd_max_rsv; + u32 remote_hz; + u32 remote_ref_time; + u32 local_ref_time; + u32 last_drop; + u32 inference; +}; + +/** + * tcp_lp_init + * + * Init all required variables. + * Clone the handling from Vegas module implementation. + */ +static void tcp_lp_init(struct sock *sk) +{ + struct lp *lp = inet_csk_ca(sk); + + lp->flag = 0; + lp->sowd = 0; + lp->owd_min = 0xffffffff; + lp->owd_max = 0; + lp->owd_max_rsv = 0; + lp->remote_hz = 0; + lp->remote_ref_time = 0; + lp->local_ref_time = 0; + lp->last_drop = 0; + lp->inference = 0; +} + +/** + * tcp_lp_cong_avoid + * + * Implementation of cong_avoid. + * Will only call newReno CA when away from inference. + * From TCP-LP's paper, this will be handled in additive increasement. + */ +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + struct lp *lp = inet_csk_ca(sk); + + if (!(lp->flag & LP_WITHIN_INF)) + tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); +} + +/** + * tcp_lp_remote_hz_estimator + * + * Estimate remote HZ. + * We keep on updating the estimated value, where original TCP-LP + * implementation only guest it for once and use forever. + */ +static u32 tcp_lp_remote_hz_estimator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ + s64 m = 0; + + /* not yet record reference time + * go away!! record it before come back!! */ + if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) + goto out; + + /* we can't calc remote HZ with no different!! */ + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time + || tp->rx_opt.rcv_tsecr == lp->local_ref_time) + goto out; + + m = HZ * (tp->rx_opt.rcv_tsval - + lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - + lp->local_ref_time); + if (m < 0) + m = -m; + + if (rhz != 0) { + m -= rhz >> 6; /* m is now error in remote HZ est */ + rhz += m; /* 63/64 old + 1/64 new */ + } else + rhz = m << 6; + + /* record time for successful remote HZ calc */ + lp->flag |= LP_VALID_RHZ; + + out: + /* record reference time stamp */ + lp->remote_ref_time = tp->rx_opt.rcv_tsval; + lp->local_ref_time = tp->rx_opt.rcv_tsecr; + + return rhz >> 6; +} + +/** + * tcp_lp_owd_calculator + * + * Calculate one way delay (in relative format). + * Original implement OWD as minus of remote time difference to local time + * difference directly. As this time difference just simply equal to RTT, when + * the network status is stable, remote RTT will equal to local RTT, and result + * OWD into zero. + * It seems to be a bug and so we fixed it. + */ +static u32 tcp_lp_owd_calculator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 owd = 0; + + lp->remote_hz = tcp_lp_remote_hz_estimator(sk); + + if (lp->flag & LP_VALID_RHZ) { + owd = + tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - + tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + if (owd < 0) + owd = -owd; + } + + if (owd > 0) + lp->flag |= LP_VALID_OWD; + else + lp->flag &= ~LP_VALID_OWD; + + return owd; +} + +/** + * tcp_lp_rtt_sample + * + * Implementation or rtt_sample. + * Will take the following action, + * 1. calc OWD, + * 2. record the min/max OWD, + * 3. calc smoothed OWD (SOWD). + * Most ideas come from the original TCP-LP implementation. + */ +static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) +{ + struct lp *lp = inet_csk_ca(sk); + s64 mowd = tcp_lp_owd_calculator(sk); + + /* sorry that we don't have valid data */ + if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) + return; + + /* record the next min owd */ + if (mowd < lp->owd_min) + lp->owd_min = mowd; + + /* always forget the max of the max + * we just set owd_max as one below it */ + if (mowd > lp->owd_max) { + if (mowd > lp->owd_max_rsv) { + if (lp->owd_max_rsv == 0) + lp->owd_max = mowd; + else + lp->owd_max = lp->owd_max_rsv; + lp->owd_max_rsv = mowd; + } else + lp->owd_max = mowd; + } + + /* calc for smoothed owd */ + if (lp->sowd != 0) { + mowd -= lp->sowd >> 3; /* m is now error in owd est */ + lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ + } else + lp->sowd = mowd << 3; /* take the measured time be owd */ +} + +/** + * tcp_lp_pkts_acked + * + * Implementation of pkts_acked. + * Deal with active drop under Early Congestion Indication. + * Only drop to half and 1 will be handle, because we hope to use back + * newReno in increase case. + * We work it out by following the idea from TCP-LP's paper directly + */ +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + + /* calc inference */ + if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) + lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); + + /* test if within inference */ + if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + lp->flag |= LP_WITHIN_INF; + else + lp->flag &= ~LP_WITHIN_INF; + + /* test if within threshold */ + if (lp->sowd >> 3 < + lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) + lp->flag |= LP_WITHIN_THR; + else + lp->flag &= ~LP_WITHIN_THR; + + pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, + tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, + lp->sowd >> 3); + + if (lp->flag & LP_WITHIN_THR) + return; + + /* FIXME: try to reset owd_min and owd_max here + * so decrease the chance the min/max is no longer suitable + * and will usually within threshold when whithin inference */ + lp->owd_min = lp->sowd >> 3; + lp->owd_max = lp->sowd >> 2; + lp->owd_max_rsv = lp->sowd >> 2; + + /* happened within inference + * drop snd_cwnd into 1 */ + if (lp->flag & LP_WITHIN_INF) + tp->snd_cwnd = 1U; + + /* happened after inference + * cut snd_cwnd into half */ + else + tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); + + /* record this drop time */ + lp->last_drop = tcp_time_stamp; +} + +static struct tcp_congestion_ops tcp_lp = { + .init = tcp_lp_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_lp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_lp_rtt_sample, + .pkts_acked = tcp_lp_pkts_acked, + + .owner = THIS_MODULE, + .name = "lp" +}; + +static int __init tcp_lp_register(void) +{ + BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_lp); +} + +static void __exit tcp_lp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_lp); +} + +module_init(tcp_lp_register); +module_exit(tcp_lp_unregister); + +MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Low Priority"); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 743016baa048..bdd71db8bf90 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3; int sysctl_tcp_mtu_probing = 0; int sysctl_tcp_base_mss = 512; +/* By default, RFC2861 behavior. */ +int sysctl_tcp_slow_start_after_idle = 1; + static void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { @@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) + if (sysctl_tcp_slow_start_after_idle && + (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; @@ -511,15 +515,17 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned /* Avoid the costly divide in the normal * non-TSO case. */ - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; } else { unsigned int factor; factor = skb->len + (mss_now - 1); factor /= mss_now; - skb_shinfo(skb)->tso_segs = factor; - skb_shinfo(skb)->tso_size = mss_now; + skb_shinfo(skb)->gso_segs = factor; + skb_shinfo(skb)->gso_size = mss_now; + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; } } @@ -642,7 +648,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss * eventually). The difference is that pulled data not copied, but * immediately discarded. */ -static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len) +static void __pskb_trim_head(struct sk_buff *skb, int len) { int i, k, eat; @@ -667,7 +673,6 @@ static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len) skb->tail = skb->data; skb->data_len -= len; skb->len = skb->data_len; - return skb->tail; } int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) @@ -676,12 +681,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; - if (len <= skb_headlen(skb)) { + /* If len == headlen, we avoid __skb_pull to preserve alignment. */ + if (unlikely(len < skb_headlen(skb))) __skb_pull(skb, len); - } else { - if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL) - return -ENOMEM; - } + else + __pskb_trim_head(skb, len - skb_headlen(skb)); TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_HW; @@ -912,7 +916,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int if (!tso_segs || (tso_segs > 1 && - skb_shinfo(skb)->tso_size != mss_now)) { + tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } @@ -1722,8 +1726,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; skb->ip_summed = CHECKSUM_NONE; skb->csum = 0; } @@ -1928,8 +1933,9 @@ void tcp_send_fin(struct sock *sk) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ TCP_SKB_CB(skb)->seq = tp->write_seq; @@ -1961,8 +1967,9 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) skb->csum = 0; TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Send it off. */ TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); @@ -2045,8 +2052,9 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ @@ -2150,8 +2158,9 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; TCP_ECN_send_syn(sk, tp, buff); TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; buff->csum = 0; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; @@ -2255,8 +2264,9 @@ void tcp_send_ack(struct sock *sk) buff->csum = 0; TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->tso_segs = 1; - skb_shinfo(buff)->tso_size = 0; + skb_shinfo(buff)->gso_segs = 1; + skb_shinfo(buff)->gso_size = 0; + skb_shinfo(buff)->gso_type = 0; /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); @@ -2291,8 +2301,9 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) skb->csum = 0; TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; TCP_SKB_CB(skb)->sacked = urgent; - skb_shinfo(skb)->tso_segs = 1; - skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c new file mode 100644 index 000000000000..d7d517a3a238 --- /dev/null +++ b/net/ipv4/tcp_probe.c @@ -0,0 +1,181 @@ +/* + * tcpprobe - Observe the TCP flow with kprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/kernel.h> +#include <linux/kprobes.h> +#include <linux/socket.h> +#include <linux/tcp.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/kfifo.h> +#include <linux/vmalloc.h> + +#include <net/tcp.h> + +MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>"); +MODULE_DESCRIPTION("TCP cwnd snooper"); +MODULE_LICENSE("GPL"); + +static int port = 0; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static int bufsize = 64*1024; +MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); +module_param(bufsize, int, 0); + +static const char procname[] = "tcpprobe"; + +struct { + struct kfifo *fifo; + spinlock_t lock; + wait_queue_head_t wait; + struct timeval tstart; +} tcpw; + +static void printl(const char *fmt, ...) +{ + va_list args; + int len; + struct timeval now; + char tbuf[256]; + + va_start(args, fmt); + do_gettimeofday(&now); + + now.tv_sec -= tcpw.tstart.tv_sec; + now.tv_usec -= tcpw.tstart.tv_usec; + if (now.tv_usec < 0) { + --now.tv_sec; + now.tv_usec += 1000000; + } + + len = sprintf(tbuf, "%lu.%06lu ", + (unsigned long) now.tv_sec, + (unsigned long) now.tv_usec); + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); + va_end(args); + + kfifo_put(tcpw.fifo, tbuf, len); + wake_up(&tcpw.wait); +} + +static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + if (port == 0 || ntohs(inet->dport) == port || + ntohs(inet->sport) == port) { + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), + size, tp->snd_nxt, tp->snd_una, + tp->snd_cwnd, tcp_current_ssthresh(sk), + tp->snd_wnd); + } + + jprobe_return(); + return 0; +} + +static struct jprobe tcp_send_probe = { + .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, + .entry = (kprobe_opcode_t *) &jtcp_sendmsg, +}; + + +static int tcpprobe_open(struct inode * inode, struct file * file) +{ + kfifo_reset(tcpw.fifo); + do_gettimeofday(&tcpw.tstart); + return 0; +} + +static ssize_t tcpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0, cnt; + unsigned char *tbuf; + + if (!buf || len < 0) + return -EINVAL; + + if (len == 0) + return 0; + + tbuf = vmalloc(len); + if (!tbuf) + return -ENOMEM; + + error = wait_event_interruptible(tcpw.wait, + __kfifo_len(tcpw.fifo) != 0); + if (error) + return error; + + cnt = kfifo_get(tcpw.fifo, tbuf, len); + error = copy_to_user(buf, tbuf, cnt); + + vfree(tbuf); + + return error ? error : cnt; +} + +static struct file_operations tcpprobe_fops = { + .owner = THIS_MODULE, + .open = tcpprobe_open, + .read = tcpprobe_read, +}; + +static __init int tcpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&tcpw.wait); + spin_lock_init(&tcpw.lock); + tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock); + + if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) + goto err0; + + ret = register_jprobe(&tcp_send_probe); + if (ret) + goto err1; + + pr_info("TCP watch registered (port=%d)\n", port); + return 0; + err1: + proc_net_remove(procname); + err0: + kfifo_free(tcpw.fifo); + return ret; +} +module_init(tcpprobe_init); + +static __exit void tcpprobe_exit(void) +{ + kfifo_free(tcpw.fifo); + proc_net_remove(procname); + unregister_jprobe(&tcp_send_probe); + +} +module_exit(tcpprobe_exit); diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c new file mode 100644 index 000000000000..11b42a7135c1 --- /dev/null +++ b/net/ipv4/tcp_veno.c @@ -0,0 +1,231 @@ +/* + * TCP Veno congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * C. P. Fu, S. C. Liew. + * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." + * IEEE Journal on Selected Areas in Communication, + * Feb. 2003. + * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/inet_diag.h> + +#include <net/tcp.h> + +/* Default values of the Veno variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static const int beta = 3 << V_PARAM_SHIFT; + +/* Veno variables */ +struct veno { + u8 doing_veno_now; /* if true, do veno for this rtt */ + u16 cntrtt; /* # of rtts measured within last rtt */ + u32 minrtt; /* min of rtts measured within last rtt (in usec) */ + u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ + u32 inc; /* decide whether to increase cwnd */ + u32 diff; /* calculate the diff rate */ +}; + +/* There are several situations when we must "re-start" Veno: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + */ +static inline void veno_enable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn on Veno */ + veno->doing_veno_now = 1; + + veno->minrtt = 0x7fffffff; +} + +static inline void veno_disable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn off Veno */ + veno->doing_veno_now = 0; +} + +static void tcp_veno_init(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + veno->basertt = 0x7fffffff; + veno->inc = 1; + veno_enable(sk); +} + +/* Do rtt sampling needed for Veno. */ +static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct veno *veno = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ + + /* Filter to find propagation delay: */ + if (vrtt < veno->basertt) + veno->basertt = vrtt; + + /* Find the min rtt during the last rtt to find + * the current prop. delay + queuing delay: + */ + veno->minrtt = min(veno->minrtt, vrtt); + veno->cntrtt++; +} + +static void tcp_veno_state(struct sock *sk, u8 ca_state) +{ + if (ca_state == TCP_CA_Open) + veno_enable(sk); + else + veno_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Veno calculations + * until we get fresh rtt samples. So when we + * restart, we reset our Veno state to a clean + * state. After we get acks for this flight of + * packets, _then_ we can make Veno calculations + * again. + */ +static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_veno_init(sk); +} + +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (!veno->doing_veno_now) + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + + /* limited by applications */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* We do the Veno calculations only if we got enough rtt samples */ + if (veno->cntrtt <= 2) { + /* We don't have enough rtt samples to do the Veno + * calculation, so we'll behave like Reno. + */ + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + } else { + u32 rtt, target_cwnd; + + /* We have enough rtt samples, so, using the Veno + * algorithm, we determine the state of the network. + */ + + rtt = veno->minrtt; + + target_cwnd = ((tp->snd_cwnd * veno->basertt) + << V_PARAM_SHIFT) / rtt; + + veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* Slow start. */ + tcp_slow_start(tp); + } else { + /* Congestion avoidance. */ + if (veno->diff < beta) { + /* In the "non-congestive state", increase cwnd + * every rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } else { + /* In the "congestive state", increase cwnd + * every other rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (veno->inc + && tp->snd_cwnd < + tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + veno->inc = 0; + } else + veno->inc = 1; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + + } + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } + /* Wipe the slate clean for the next rtt. */ + /* veno->cntrtt = 0; */ + veno->minrtt = 0x7fffffff; +} + +/* Veno MD phase */ +static u32 tcp_veno_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (veno->diff < beta) + /* in "non-congestive state", cut cwnd by 1/5 */ + return max(tp->snd_cwnd * 4 / 5, 2U); + else + /* in "congestive state", cut cwnd by 1/2 */ + return max(tp->snd_cwnd >> 1U, 2U); +} + +static struct tcp_congestion_ops tcp_veno = { + .init = tcp_veno_init, + .ssthresh = tcp_veno_ssthresh, + .cong_avoid = tcp_veno_cong_avoid, + .rtt_sample = tcp_veno_rtt_calc, + .set_state = tcp_veno_state, + .cwnd_event = tcp_veno_cwnd_event, + + .owner = THIS_MODULE, + .name = "veno", +}; + +static int __init tcp_veno_register(void) +{ + BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_veno); + return 0; +} + +static void __exit tcp_veno_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_veno); +} + +module_init(tcp_veno_register); +module_exit(tcp_veno_unregister); + +MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Veno"); diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 0c340c3756c2..4247da1384bf 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -1,7 +1,24 @@ /* - * TCP Westwood+ + * TCP Westwood+: end-to-end bandwidth estimation for TCP * - * Angelo Dell'Aera: TCP Westwood+ support + * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 + * + * Support at http://c3lab.poliba.it/index.php/Westwood + * Main references in literature: + * + * - Mascolo S, Casetti, M. Gerla et al. + * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 + * + * - A. Grieco, s. Mascolo + * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer + * Comm. Review, 2004 + * + * - A. Dell'Aera, L. Grieco, S. Mascolo. + * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : + * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 + * + * Westwood+ employs end-to-end bandwidth measurement to set cwnd and + * ssthresh after packet loss. The probing phase is as the original Reno. */ #include <linux/config.h> @@ -22,6 +39,8 @@ struct westwood { u32 accounted; u32 rtt; u32 rtt_min; /* minimum observed RTT */ + u8 first_ack; /* flag which infers that this is the first ack */ + u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; @@ -49,9 +68,11 @@ static void tcp_westwood_init(struct sock *sk) w->bw_est = 0; w->accounted = 0; w->cumul_ack = 0; + w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 1; } /* @@ -63,10 +84,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b) return (((7 * a) + b) >> 3); } -static inline void westwood_filter(struct westwood *w, u32 delta) +static void westwood_filter(struct westwood *w, u32 delta) { - w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); - w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + /* If the filter is empty fill it with the first sample of bandwidth */ + if (w->bw_ns_est == 0 && w->bw_est == 0) { + w->bw_ns_est = w->bk / delta; + w->bw_est = w->bw_ns_est; + } else { + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + } } /* @@ -91,6 +118,15 @@ static void westwood_update_window(struct sock *sk) struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; + /* Initialize w->snd_una with the first acked sequence number in order + * to fix mismatch between tp->snd_una and w->snd_una for the first + * bandwidth sample + */ + if (w->first_ack) { + w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 0; + } + /* * See if a RTT-window has passed. * Be careful since if RTT is less than @@ -108,6 +144,16 @@ static void westwood_update_window(struct sock *sk) } } +static inline void update_rtt_min(struct westwood *w) +{ + if (w->reset_rtt_min) { + w->rtt_min = w->rtt; + w->reset_rtt_min = 0; + } else + w->rtt_min = min(w->rtt, w->rtt_min); +} + + /* * @westwood_fast_bw * It is called when we are in fast path. In particular it is called when @@ -123,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk) w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; - w->rtt_min = min(w->rtt, w->rtt_min); + update_rtt_min(w); } /* @@ -162,12 +208,6 @@ static inline u32 westwood_acked_count(struct sock *sk) return w->cumul_ack; } -static inline u32 westwood_bw_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct westwood *w = inet_csk_ca(sk); - return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); -} /* * TCP Westwood @@ -175,9 +215,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk) * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. */ -static u32 tcp_westwood_cwnd_min(struct sock *sk) +static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { - return westwood_bw_rttmin(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) @@ -191,17 +233,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) break; case CA_EVENT_COMPLETE_CWR: - tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); + tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; case CA_EVENT_FRTO: - tp->snd_ssthresh = westwood_bw_rttmin(sk); + tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + /* Update RTT_min when next ack arrives */ + w->reset_rtt_min = 1; break; case CA_EVENT_SLOW_ACK: westwood_update_window(sk); w->bk += westwood_acked_count(sk); - w->rtt_min = min(w->rtt, w->rtt_min); + update_rtt_min(w); break; default: @@ -235,7 +279,7 @@ static struct tcp_congestion_ops tcp_westwood = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_westwood_cwnd_min, + .min_cwnd = tcp_westwood_bw_rttmin, .cwnd_event = tcp_westwood_event, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 3e174c83bfe7..817ed84511a6 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -13,7 +13,6 @@ #include <linux/string.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> -#include <net/inet_ecn.h> #include <net/ip.h> #include <net/xfrm.h> @@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb) EXPORT_SYMBOL(xfrm4_rcv); -static inline void ipip_ecn_decapsulate(struct sk_buff *skb) -{ - struct iphdr *outer_iph = skb->nh.iph; - struct iphdr *inner_iph = skb->h.ipiph; - - if (INET_ECN_is_ce(outer_iph->tos)) - IP_ECN_set_ce(inner_iph); -} - static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) { switch (nexthdr) { @@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) xfrm_vec[xfrm_nr++] = x; - iph = skb->nh.iph; + if (x->mode->input(x, skb)) + goto drop; if (x->props.mode) { - if (iph->protocol != IPPROTO_IPIP) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, skb->h.ipiph); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); decaps = 1; break; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c new file mode 100644 index 000000000000..a9e6b3dd19c9 --- /dev/null +++ b/net/ipv4/xfrm4_mode_transport.c @@ -0,0 +1,83 @@ +/* + * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/stringify.h> +#include <net/dst.h> +#include <net/ip.h> +#include <net/xfrm.h> + +/* Add encapsulation header. + * + * The IP header will be moved forward to make space for the encapsulation + * header. + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x; + struct iphdr *iph; + int ihl; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + ihl = iph->ihl * 4; + skb->h.raw += ihl; + + x = skb->dst->xfrm; + skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); + return 0; +} + +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ +static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.iph->tot_len = htons(skb->len + ihl); + skb->h.raw = skb->data; + return 0; +} + +static struct xfrm_mode xfrm4_transport_mode = { + .input = xfrm4_transport_input, + .output = xfrm4_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm4_transport_init(void) +{ + return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); +} + +static void __exit xfrm4_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_transport_init); +module_exit(xfrm4_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c new file mode 100644 index 000000000000..f8d880beb12f --- /dev/null +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -0,0 +1,125 @@ +/* + * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/stringify.h> +#include <net/dst.h> +#include <net/inet_ecn.h> +#include <net/ip.h> +#include <net/xfrm.h> + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *outer_iph = skb->nh.iph; + struct iphdr *inner_iph = skb->h.ipiph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + int flags; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + skb->nh.raw = skb_push(skb, x->props.header_len); + top_iph = skb->nh.iph; + + top_iph->ihl = 5; + top_iph->version = 4; + + /* DS disclosed */ + top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); + + flags = x->props.flags; + if (flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? + 0 : (iph->frag_off & htons(IP_DF)); + if (!top_iph->frag_off) + __ip_select_ident(top_iph, dst->child, 0); + + top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + top_iph->protocol = IPPROTO_IPIP; + + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + return 0; +} + +static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + int err = -EINVAL; + + if (iph->protocol != IPPROTO_IPIP) + goto out; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(iph, skb->h.ipiph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm4_tunnel_mode = { + .input = xfrm4_tunnel_input, + .output = xfrm4_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm4_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); +} + +static void __exit xfrm4_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_tunnel_init); +module_exit(xfrm4_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 4ef8efaf6a67..193363e22932 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -9,70 +9,15 @@ */ #include <linux/compiler.h> +#include <linux/if_ether.h> +#include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/netfilter_ipv4.h> -#include <net/inet_ecn.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/icmp.h> -/* Add encapsulation header. - * - * In transport mode, the IP header will be moved forward to make space - * for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * tot_len - * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. - */ -static void xfrm4_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct iphdr *iph, *top_iph; - int flags; - - iph = skb->nh.iph; - skb->h.ipiph = iph; - - skb->nh.raw = skb_push(skb, x->props.header_len); - top_iph = skb->nh.iph; - - if (!x->props.mode) { - skb->h.raw += iph->ihl*4; - memmove(top_iph, iph, iph->ihl*4); - return; - } - - top_iph->ihl = 5; - top_iph->version = 4; - - /* DS disclosed */ - top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); - - flags = x->props.flags; - if (flags & XFRM_STATE_NOECN) - IP_ECN_clear(top_iph); - - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : (iph->frag_off & htons(IP_DF)); - if (!top_iph->frag_off) - __ip_select_ident(top_iph, dst->child, 0); - - top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - top_iph->protocol = IPPROTO_IPIP; - - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); -} - static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -121,7 +66,9 @@ static int xfrm4_output_one(struct sk_buff *skb) if (err) goto error; - xfrm4_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) @@ -152,16 +99,10 @@ error_nolock: goto out_exit; } -static int xfrm4_output_finish(struct sk_buff *skb) +static int xfrm4_output_finish2(struct sk_buff *skb) { int err; -#ifdef CONFIG_NETFILTER - if (!skb->dst->xfrm) { - IPCB(skb)->flags |= IPSKB_REROUTED; - return dst_output(skb); - } -#endif while (likely((err = xfrm4_output_one(skb)) == 0)) { nf_reset(skb); @@ -174,7 +115,7 @@ static int xfrm4_output_finish(struct sk_buff *skb) return dst_output(skb); err = nf_hook(PF_INET, NF_IP_POST_ROUTING, &skb, NULL, - skb->dst->dev, xfrm4_output_finish); + skb->dst->dev, xfrm4_output_finish2); if (unlikely(err != 1)) break; } @@ -182,6 +123,48 @@ static int xfrm4_output_finish(struct sk_buff *skb) return err; } +static int xfrm4_output_finish(struct sk_buff *skb) +{ + struct sk_buff *segs; + +#ifdef CONFIG_NETFILTER + if (!skb->dst->xfrm) { + IPCB(skb)->flags |= IPSKB_REROUTED; + return dst_output(skb); + } +#endif + + if (!skb_shinfo(skb)->gso_size) + return xfrm4_output_finish2(skb); + + skb->protocol = htons(ETH_P_IP); + segs = skb_gso_segment(skb, 0); + kfree_skb(skb); + if (unlikely(IS_ERR(segs))) + return PTR_ERR(segs); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = xfrm4_output_finish2(segs); + + if (unlikely(err)) { + while ((segs = nskb)) { + nskb = segs->next; + segs->next = NULL; + kfree_skb(segs); + } + return err; + } + + segs = nskb; + } while (segs); + + return 0; +} + int xfrm4_output(struct sk_buff *skb) { return NF_HOOK_COND(PF_INET, NF_IP_POST_ROUTING, skb, NULL, skb->dst->dev, diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8604c747bca5..c0465284dfac 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -17,8 +17,6 @@ static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { return __ip_route_output_key((struct rtable**)dst, fl); @@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) static inline int xfrm4_garbage_collect(void) { - read_lock(&xfrm4_policy_afinfo.lock); xfrm4_policy_afinfo.garbage_collect(); - read_unlock(&xfrm4_policy_afinfo.lock); return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); } @@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = { static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm4_type_map, .dst_ops = &xfrm4_dst_ops, .dst_lookup = xfrm4_dst_lookup, .find_bundle = __xfrm4_find_bundle, diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index dbabf81a9b7b..81e1751c966e 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, .state_lookup = __xfrm4_state_lookup, |